爬虫代码段

爬取图片

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import os
import urllib
print 'Please enter a URL like http://www.py4inf.com/cover.jpg'
urlstr = raw_input().strip()
img = urllib.urlopen(urlstr)
# Get the last "word"
words = urlstr.split('/')
fname = words[-1]
# Don't overwrite the file
if os.path.exists(fname) :
if raw_input('Replace '+fname+' (Y/n)?') != 'Y' :
print 'Data not copied'
exit()
print 'Replacing',fname
fhand = open(fname, 'w')
size = 0
while True:
info = img.read(100000)
if len(info) < 1 : break
size = size + len(info)
fhand.write(info)
print size,'characters copied to',fname
fhand.close()

http://www.jianshu.com/p/7e86f0505656

爬取某页面在本站的链接

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import requests
import re
def get_pages(link):
    """Crawl breadth-first starting at ``link`` and yield each page URL once.

    Every page is fetched with ``requests.get``. Anchors written exactly as
    ``<a href="...">`` (no further attributes) are extracted; targets that
    start with ``/`` are resolved against the page they were found on, and
    any target beginning with ``http``/``https`` is queued for a later visit.

    Args:
        link: URL of the page to start crawling from.

    Yields:
        The URL of each page, after it has been fetched and scanned.
    """
    # Function-scope imports keep this block self-contained.
    from collections import deque
    try:
        from urllib.parse import urljoin  # Python 3
    except ImportError:
        from urlparse import urljoin  # Python 2

    # Compile once instead of on every extracted link.
    anchor_re = re.compile(r'<a href="([^"]+)">')
    scheme_re = re.compile('https?')

    links_to_visit = deque([link])
    # Track every URL ever queued so pages that link to each other do not
    # make the crawl loop forever.
    seen = {link}
    while links_to_visit:
        current_link = links_to_visit.popleft()
        page = requests.get(current_link)
        # page.text is the decoded body; str(page.content) would be the
        # escaped b'...' repr on Python 3.
        for url in anchor_re.findall(page.text):
            if url[0] == '/':
                # Proper URL resolution: plain concatenation produced
                # 'http://sample.comabout' for a base without a trailing
                # slash and mis-rooted links found on nested pages.
                url = urljoin(current_link, url)
            if scheme_re.match(url) and url not in seen:
                seen.add(url)
                links_to_visit.append(url)
        yield current_link
# Start a crawl at the sample site and print each page URL as the
# generator produces it.
webpage = get_pages('http://sample.com')
for result in webpage:
    print(result)
# 正则表达式 '<a href="([^"]+)">' 的说明:
用括号(捕获组)提取匹配内容中的链接,但不包括外部网站链接。
([^"]+) 匹配一个或多个非 " 字符。
外部链接通常带有额外属性,例如 <a href="http://www.ccin.com/contact-us/" target="_blank">;由于其 href 的值之后不是紧跟 ">,因此不会被该正则匹配。
文章目录
  1. 爬取图片
  2. 爬取某页面在本站的链接