原文地址: 运用python抓取博客园首页的全部数据,并且定时持续抓取新发布的内容存入mongodb中
依赖包:
1.jieba
2.pymongo
3.HTMLParser(Python 2 标准库自带模块,无需额外安装)
# -*- coding: utf-8 -*-
"""
@author: jiangfuqiang
"""
from HTMLParser import HTMLParser
import re
import time
from datetime import date
import pymongo
import urllib2
import sys
import traceback
import jieba
# Force Python 2's default string encoding to UTF-8 so the Chinese post
# titles survive implicit str/unicode conversions.  reload(sys) is needed
# because site.py deletes sys.setdefaultencoding at startup.
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)

# Module-level stop flag: meant to be raised by FetchCnblog when it meets a
# post id that is already stored, and read by the paging loop in __main__.
isExist = False
class FetchCnblog(HTMLParser):
    """Parse one cnblogs.com list page and collect posts newer than a given id.

    Each finished post is a dict with keys: title, desc, readmoreLink, id,
    imgSrc (when present), source, source_key, fetchTime and keyword (the
    jieba-segmented title).  When a post whose id is <= the supplied
    high-water mark is seen, the module-level ``isExist`` flag is raised so
    the driver loop stops paginating.
    """

    def __init__(self, id):
        # id: largest post id already stored; items with id <= it are skipped.
        HTMLParser.__init__(self)
        self.result = []            # finished post dicts, in page order
        self.data = {}              # fields of the post being assembled
        self.isTitleLink = False    # inside <a class="titlelnk">
        self.id = id
        self.isSummary = False      # inside <p class="post_item_summary">
        self.isPostItem = False     # inside <div class="post_item">
        self.isArticleView = False  # inside <span class="article_view">

    def handle_data(self, data):
        # Text of the title link becomes the title; non-blank text inside the
        # summary paragraph becomes the description.
        if self.isTitleLink and self.isPostItem:
            self.data['title'] = data
            self.isTitleLink = False
        elif self.isSummary and self.isPostItem:
            data = data.strip()
            if data:
                self.data['desc'] = data

    def handle_starttag(self, tag, attrs):
        # BUG FIX: without this declaration the ``isExist = True`` below only
        # created a method-local name, so the driver loop could never stop on
        # already-seen posts.
        global isExist
        if tag == 'a':
            for key, value in attrs:
                if key == 'class':
                    if value == 'titlelnk':
                        self.isTitleLink = True
                    elif value == 'gray' and self.isArticleView:
                        # The gray link inside the article_view span closes one
                        # post item; its href carries the numeric post id.
                        self.isArticleView = False
                        for key, value in attrs:
                            if key == 'href':
                                self.data['readmoreLink'] = value
                                # BUG FIX: pattern was 'd+' (a literal letter
                                # "d"), which made int() blow up on most hrefs;
                                # r'\d+' extracts the numeric post id.
                                result = re.search(r'\d+', value)
                                self.isPostItem = False
                                if result:
                                    self.data['id'] = int(result.group())
                                else:
                                    # No id in the link: discard this item.
                                    self.data = {}
                                    return
                                if self.data['id'] <= self.id:
                                    # Already stored: drop it and tell the
                                    # driver to stop fetching further pages.
                                    self.data = {}
                                    isExist = True
                                    return
                                else:
                                    # BUG FIX: key was misspelled 'srouce'
                                    # (cf. 'source_key' just below).
                                    self.data['source'] = "www.cnblogs.com"
                                    self.data['source_key'] = 'cnblogs'
                                    self.data['fetchTime'] = str(date.today())
                                    # .get guards against malformed items that
                                    # never produced a title.
                                    self.data['keyword'] = ",".join(jieba.cut(self.data.get('title', '')))
                                    self.result.append(self.data)
                                    self.data = {}
        elif tag == 'p':
            for key, value in attrs:
                if key == 'class' and value == 'post_item_summary':
                    self.isSummary = True
        elif tag == 'img':
            for key, value in attrs:
                if key == 'class' and value == 'pfs':
                    # Author avatar image inside the summary.
                    for key, value in attrs:
                        if key == 'src':
                            self.data['imgSrc'] = value
        elif tag == 'div':
            for key, value in attrs:
                if key == 'class' and value == 'post_item_foot':
                    self.isSummary = False
                elif key == 'class' and value == 'post_item':
                    self.isPostItem = True
        elif tag == 'span':
            for key, value in attrs:
                if key == 'class' and value == 'article_view':
                    self.isArticleView = True

    def getResult(self):
        """Return the list of post dicts collected by feed()."""
        return self.result
if __name__ == "__main__":
    # MongoDB layout: blog.fetch_blog stores the posts, blog.record keeps a
    # per-source high-water mark (largest post id fetched so far).
    con = pymongo.Connection('localhost', 27017)
    db = con.blog
    fetchblog = db.fetch_blog
    record = db.record
    url = "http://www.cnblogs.com/sitehome/p/%d"  # list page, %d = page number
    count = 1     # current page number
    flag = False  # becomes True once the new high-water mark is recorded
    headers={
        'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    reco = record.find_one({"type":'cnblogs'})
    id = 0  # largest post id stored by a previous run (0 = first run)
    if reco:
        id = reco['maxId']
    # Page through the list until an empty page comes back or FetchCnblog
    # flips isExist on seeing an already-stored post id.
    while isExist == False:
        try:
            req = urllib2.Request(url%count,headers=headers)
            request = urllib2.urlopen(req)
            data = request.read()
            fj = FetchCnblog(id)
            fj.feed(data)
            result = fj.getResult()
            if len(result) < 1:
                isExist = True
            else:
                if flag == False:
                    # First successful page: result[0] is the newest post;
                    # upsert its id as the new high-water mark
                    # (positional args: upsert=True, multi=False).
                    flag = True
                    dic = result[0]
                    id = int(dic['id'])
                    record.update({"type":'cnblogs'},{"$set":{'maxId':id}},True,False)
                # Insert oldest-first so documents land in posting order.
                result.reverse()
                for doc in result:
                    fetchblog.insert(doc)
                print "page is %d"%count
                count += 1
                time.sleep(5)  # throttle: be polite to the server
        except Exception, e:
            # NOTE(review): a persistent error retries the same page forever;
            # errors are only logged, never fatal.
            traceback.print_exc()
            print "parse error",e
程序如果在linux、mac下执行,则可在crontab -e中设置定时任务持续抓取;如果在windows下执行,则在程序里自行加个定时器即可