华电北风吹
Key Laboratory of Cognitive Computing and Application, Tianjin University
Date: 2015/7/1
As a first crawler program:
Request a URL and, using a regular expression on the returned HTML, extract the links to all blog posts of a particular blog user.
import re
import random
import urllib.request

# Start the crawl from a single seed article of the target user.
queue = []
url = 'http://blog.csdn.net/youname/article/details/blogID'
queue.append(url)
cnt = 0

while queue:
    # Pick one URL from the queue at random and discard the rest
    # (a simple random walk over the user's article pages).
    k = random.randint(0, len(queue) - 1)
    url = queue[k]
    queue.clear()
    try:
        print(str(cnt) + ' crawling ---> ' + url)
        cnt += 1
        req = urllib.request.Request(url, headers={
            'Connection': 'Keep-Alive',
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'})
        oper = urllib.request.urlopen(req)
        data = oper.read().decode('utf-8')
    except Exception:
        # Skip pages that fail to download or decode.
        continue
    # Collect links to the same user's other articles from the fetched page.
    linkre = re.compile(r'/youname/article/details/\d{8}')
    for x in linkre.findall(data):
        nexturl = 'http://blog.csdn.net' + x
        queue.append(nexturl)
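To see what the regular expression actually matches, here is a minimal, self-contained sketch. The HTML fragment, user name youname and the eight-digit article IDs are made-up placeholders, not real CSDN pages; only the pattern itself comes from the crawler above.

import re

# Hypothetical fragment of an article page; the links below are placeholders.
sample_html = '''
<a href="/youname/article/details/12345678">Post A</a>
<a href="/youname/article/details/87654321">Post B</a>
<a href="/otheruser/article/details/11111111">Someone else</a>
'''

linkre = re.compile(r'/youname/article/details/\d{8}')
for path in linkre.findall(sample_html):
    print('http://blog.csdn.net' + path)
# Prints the two links that belong to 'youname'; the other user's link is ignored.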