What is a web crawler?
A web crawler (also known as a web spider or web robot, and in the FOAF community more often called a "web chaser") is a program or script that automatically fetches information from the World Wide Web according to a set of rules.
What do you need to know to write a Python crawler by hand?
This one crawls Douban by iterating over every book id in sequence, which is not very efficient: many ids turn out to be empty pages, so plenty of requests are wasted. I was young and naive o(>﹏<)o
The SpiderMain class
import random
import time

import html_downloader
import html_parser
import mongoDB
import url_manager


class SpiderMain(object):
    def __init__(self):
        self.urls = url_manager.UrlManager()                 # url manager
        self.downloader = html_downloader.HtmlDownloader()   # html page downloader
        self.parser = html_parser.HtmlParser()               # html parser
        self.mongodb = mongoDB.MongoDB()                     # database writer

    def craw(self, rootId):
        count = 1
        # start from the root id; the loop generates rootId+1 first
        # (the original called creat_url once before the loop as well,
        # so the first generated url was never actually downloaded)
        creatBookId = rootId
        while count < 100000:
            creatUrl, creatBookId = self.urls.creat_url(creatBookId)
            html_page = self.downloader.download(creatUrl)
            if html_page == 404:  # some pages are permanently 404 and cannot be crawled
                print creatUrl + ' 404'
            else:
                try:
                    new_urls, new_data = self.parser.parse(creatUrl, html_page)
                    if new_urls:  # only store pages that parsed successfully
                        self.mongodb.collect_data(creatBookId, new_data)
                except Exception, e:
                    print e
            time.sleep(random.uniform(0.1, 0.3))  # small random delay between requests
            count += 1
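Kicking everything off is just a matter of instantiating SpiderMain with a starting book id. A minimal sketch (the starting id and the __main__ guard below are illustrative, not from the original post):

if __name__ == '__main__':
    spider = SpiderMain()
    spider.craw(1000000)  # illustrative starting book id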
The UrlManager class
Since we are walking through every id, this just increments by 1 each time... <( ̄3 ̄)>
class UrlManager(object):
    def __init__(self):
        pass

    # generate the next single url by incrementing the book id
    # (the original comment promised de-duplication against new/old url sets,
    # but this simple version just adds 1 -- see the sketch after this class)
    def creat_url(self, id):
        book_id = id + 1
        url = "https://book.douban.com/subject/" + str(book_id) + "/"
        return url, book_id
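The original comment alludes to the textbook UrlManager design, which keeps two sets so that the same url is never crawled twice. A minimal sketch of that variant (the DedupUrlManager name and its methods are illustrative, not part of the original project):

class DedupUrlManager(object):
    def __init__(self):
        self.new_urls = set()  # urls waiting to be crawled
        self.old_urls = set()  # urls already crawled

    # add a single new url, only if it is in neither set
    def add_new_url(self, url):
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def has_new_url(self):
        return len(self.new_urls) > 0

    # hand out a pending url and move it to the crawled set
    def get_new_url(self):
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url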
The HtmlDownloader class
The exception handling here is modeled on someone else's code, and honestly feels optional: you can't sit in front of the computer listening for beeps all day o(>﹏<)o. Writing to a log would be more reliable... (see the sketch after this class).
import random
from random import Random
from time import sleep

import urllib2
import winsound  # Windows-only beeper


class HtmlDownloader(object):
    def __init__(self):
        pass

    # generate a random bid value for the cookie
    def random_str(self, randomlength=11):
        s = ''  # avoid shadowing the built-in str
        chars = 'AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz0123456789'
        length = len(chars) - 1
        rand = Random()
        for i in range(randomlength):
            s += chars[rand.randint(0, length)]
        return s

    def downloadConf(self, url):
        # the User-Agent is hard-coded here; it really should be rotated randomly
        Header = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:40.0) Gecko/20100101 Firefox/40.0'
        try:
            request = urllib2.Request(url)
            request.add_header('User-Agent', Header)
            request.add_header('Cookie', 'bid=' + self.random_str())
            response = urllib2.urlopen(request)
            page = response.read().decode('utf-8')
            return page
        except urllib2.URLError, e:
            if hasattr(e, "reason"):
                print e.reason
            if hasattr(e, "code"):
                print e.code
                if e.code == 403:
                    winsound.Beep(600, 1000)  # warning beep: 600 Hz for 1000 ms
                    return 403
                if e.code == 404:
                    winsound.Beep(600, 300)   # two short warning beeps: 600 Hz, 300 ms each
                    sleep(0.1)
                    winsound.Beep(600, 300)
                    return 404

    def download(self, url):
        times = 1
        if url is None:
            return None
        while True:
            html_page = self.downloadConf(url)
            if html_page == 404:    # permanent 404: report it to the caller
                return 404
            elif html_page == 403:  # 403 means we are being throttled: back off and retry
                sleeptime = random.randint(20, 30) * times  # wait longer on each retry
                print 'retry %d: sleeping %d seconds...' % (times, sleeptime)
                times += 1
                sleep(sleeptime)
            else:
                return html_page
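Two of the weaknesses admitted above are easy to sketch fixes for: rotating the User-Agent instead of hard-coding it, and logging failures instead of beeping at an empty chair. A minimal sketch under those assumptions (the agent pool and the spider.log file name are illustrative):

import logging
import random
import urllib2

logging.basicConfig(filename='spider.log', level=logging.INFO)  # illustrative log file

# a small illustrative pool of desktop browser User-Agent strings
USER_AGENTS = [
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:40.0) Gecko/20100101 Firefox/40.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9',
]

def download_quietly(url):
    # same request flow as downloadConf, but with a rotated header and a log entry
    request = urllib2.Request(url)
    request.add_header('User-Agent', random.choice(USER_AGENTS))
    try:
        return urllib2.urlopen(request).read().decode('utf-8')
    except urllib2.URLError, e:
        code = getattr(e, 'code', None)
        logging.warning('download failed: %s (code=%s)', url, code)
        return code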
The HtmlParser class
import re

from bs4 import BeautifulSoup


class HtmlParser(object):
    def __init__(self):
        pass

    def parse(self, url, page):
        if url is None or page is None:
            return False, False
        res_data = {}
        res_data['url'] = url
        # the pattern here was garbled in the original post; a plausible
        # reconstruction is grabbing the nav-bar link text, which on a
        # real book page is '豆瓣读书'
        test = re.findall(re.compile(r'<a href="https://book.douban.com">(.*?)</a>', re.S), page)
        if not test or test[0] != '豆瓣读书':
            return False, False
        else:
            try:  # discard urls whose pages are missing some of the fields
                soup = BeautifulSoup(page, 'html.parser', from_encoding='utf-8')
                res_data['bookName'] = soup.find('span', property='v:itemreviewed').string
                res_data['score'] = soup.find('strong', class_='ll rating_num ').string
                res_data['bg_url'] = soup.find('a', class_='nbg').attrs["href"]
                info = soup.find('div', id='info')
                res_data['author'] = info.find(text=' 作者').next_element.next_element.string
                res_data['publisher'] = info.find(text='出版社:').next_element
                res_data['time'] = info.find(text='出版年:').next_element
                res_data['price'] = info.find(text='定价:').next_element
                res_data['ISBN'] = info.find(text='ISBN:').next_element.strip()
                res_data['intro'] = soup.find('div', class_='intro').find('p').string
                return url, res_data
            except:
                return False, False
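The info.find(text=...) calls above rely on a BeautifulSoup detail that is easy to miss: find can match a text node directly, and .next_element then steps to whatever follows it in document order. A toy demonstration on hand-made HTML (not a real Douban page):

from bs4 import BeautifulSoup

# minimal snippet imitating the layout of Douban's info block
html = '<div id="info"><span class="pl">出版社:</span> Some Press<br/></div>'
info = BeautifulSoup(html, 'html.parser').find('div', id='info')

# find(text=...) matches the string itself; next_element is the node after it
print info.find(text='出版社:').next_element  # prints ' Some Press'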
The MongoDB class
Scraped records are stored in MongoDB.
import pymongo


class MongoDB(object):
    def __init__(self):
        client = pymongo.MongoClient('x.x.x.x', 27017)
        db = client.leason
        db.authenticate("leason", "xxxxxx")
        self.bookCol = db.book

    def collect_data(self, creatBookId, data):
        data['_id'] = creatBookId  # use the book id as the primary key
        self.bookCol.insert(data)
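One practical wrinkle: because _id is the book id, re-crawling the same id makes insert raise a DuplicateKeyError. A hedged alternative is an upsert, which replaces the old record instead; a drop-in replacement for collect_data, using the same old-style pymongo API as above:

    def collect_data(self, creatBookId, data):
        # upsert: insert when the id is new, replace the document when it exists
        data['_id'] = creatBookId
        self.bookCol.update({'_id': creatBookId}, data, upsert=True)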
So, what exactly is wrong with this crawler I wrote for fun back in the day? ( ⊙ o ⊙ )
Full code
leason | personal blog