爬虫练手小项目:豆瓣高分图书TOP100
import requests
import re
from requests.exceptions import RequestException
import json
import time
def get_one_page(url, timeout=10):
    """Download one page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Returns:
        The response text when the server answers with HTTP 200.
        None for any other status code, or when the request itself fails
        with a ``RequestException`` (connection error, timeout, ...).
    """
    # Send a desktop-browser User-Agent instead of the requests default.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh;Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        # Network-level failures are reported the same way as a bad status:
        # the caller only has to check for None.
        return None
    if response.status_code == 200:
        return response.text
    return None
# Three capture groups, in order: cover image URL, title text, rating.
# NOTE(review): the published pattern had lost its HTML-tag fragments and
# kept only two groups; the `<img ... src="...">`, `</a>` and `</span>`
# anchors were reconstructed around the surviving `title`, `_blank">` and
# `rating_nums">` pieces. Confirm against the live page markup.
_ITEM_PATTERN = re.compile(
    r'<img[^>]*?src="(.*?)".*?title.*?_blank">(.*?)</a>'
    r'.*?rating_nums">(.*?)</span>',
    re.S,
)


def parse_one_page(html, offset):
    """Yield one dict per book entry found in a list page.

    Args:
        html: Page source as text. ``None`` or an empty string yields
            nothing, so a failed download can be passed straight through.
        offset: Number of entries on earlier pages; the first entry of
            this page is numbered ``offset + 1``.

    Yields:
        Dicts with the keys ``index`` (int), ``image``, ``title``
        (surrounding whitespace stripped) and ``score`` (all str).
    """
    if not html:
        return
    # Each match is a 3-tuple: (image URL, raw title, score).
    for index, item in enumerate(_ITEM_PATTERN.findall(html), start=offset + 1):
        yield {
            'index': index,
            'image': item[0],
            'title': item[1].strip(),
            'score': item[2],
        }
def write_to_file(content):
    """Append ``content`` to doubandushu.txt as a single JSON line.

    The file is opened in append mode with UTF-8 encoding, and non-ASCII
    characters are written as-is rather than as ``\\uXXXX`` escapes.
    """
    with open("doubandushu.txt", mode='a', encoding='utf-8') as out:
        line = json.dumps(content, ensure_ascii=False)
        out.write(line + '\n')
def main(offset):
    """Fetch, parse, print and store one page of the book list.

    Args:
        offset: Value of the ``start`` query parameter, i.e. how many
            entries to skip; also passed to the parser so numbering
            continues across pages.
    """
    url = "https://www.douban.com/doulist/45004834/?start={}&sort=time&sub_type=".format(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page returns None when the download did not succeed;
        # skip this page instead of handing None to the regex parser.
        print("skipped page at offset {}: download failed".format(offset))
        return
    for item in parse_one_page(html, offset):
        print(item)
        write_to_file(item)
if __name__ == '__main__':
    # Four pages of 25 entries each: start offsets 0, 25, 50 and 75.
    for page_start in range(0, 100, 25):
        main(offset=page_start)
        # Pause for a second after each page.
        time.sleep(1)
* 注:1. 本项目模仿崔庆才著的《Python3网络爬虫开发实战》3.4节“抓取猫眼电影排行”;2. 博主能力有限,无法写出合适的正则表达式抓取书籍作者信息,在此请教其他网友。