基于 Python 的网络爬虫,用于爬取新闻网站的内容。
import re
import sys
import time
from html.parser import HTMLParser
from urllib import request
class MyHTMLParser(HTMLParser):
    """HTML parser that prints the text nodes of the documents fed to it.

    A text node whose last character is one of ``} { ; ) > /`` is dropped
    instead of printed (presumably a heuristic for skipping inline
    script/style content -- confirm against the pages being crawled).
    """

    # Final characters that cause a text node to be skipped.
    _SKIPPED_ENDINGS = ('}', '{', ';', ')', '>', '/')

    def handle_data(self, data):  # text content found inside the HTML
        """Print ``data`` without surrounding whitespace, unless filtered.

        Args:
            data: Text passed in by ``HTMLParser`` for content between tags.

        Nothing is printed when the text is empty after stripping, or when
        it ends with one of ``_SKIPPED_ENDINGS``.
        """
        data = str(data).strip()
        if not data:
            return
        # The text is already stripped, so testing the final character is
        # equivalent to the former regex r'[.\s\S]*[}{;)>/]$' and avoids
        # inspecting the length of the match object's string form.
        if data.endswith(self._SKIPPED_ENDINGS):
            return
        print(data)
# Crawl script: download a seed BBC News article, collect the paths of the
# "world-*" articles referenced in its HTML, then download each of them and
# print its text through MyHTMLParser.
parser = MyHTMLParser()

# Seed page whose HTML is scanned for article links. The timeout keeps a
# stalled connection from blocking the script forever.
with request.urlopen('http://www.bbc.com/news/world-us-canada-44309961',
                     timeout=30) as f:
    # errors='replace' keeps a single malformed byte from aborting the crawl.
    data = f.read().decode('utf-8', errors='replace')

# Relative article paths such as /news/world-us-canada-44309961.
# Raw string so that \d reaches the regex engine instead of being parsed as
# an (invalid) string escape sequence.
regex = r'/news/world-.{2,6}-\d{8}'
pat = re.compile(regex)
# findall returns every occurrence, so a path that appears more than once
# would be fetched repeatedly; dict.fromkeys drops the repeats while keeping
# first-seen order.
strurl = list(dict.fromkeys(pat.findall(data)))

for strur in strurl:
    # Pause between requests to limit the load put on the server.
    time.sleep(1.5)
    try:
        with request.urlopen('http://www.bbc.com' + strur, timeout=30) as f:
            data = f.read().decode('utf-8', errors='replace')
    except OSError as err:
        # URLError, HTTPError and socket timeouts are all OSError
        # subclasses. One unreachable article should not end the crawl, so
        # report it on stderr and move on to the next link.
        print('skipped {}: {}'.format(strur, err), file=sys.stderr)
        continue
    parser.feed(data)
    # close() flushes text still buffered at the end of this document and
    # reset() clears the parser state, so nothing from this page can leak
    # into the next one.
    parser.close()
    parser.reset()