# Douban's most popular book reviews: https://book.douban.com/review/best/
import requests
import csv
from pyquery import PyQuery as pq
import io
import sys
# Re-wrap stdout as UTF-8 so printing Chinese text does not crash on consoles
# whose default encoding cannot represent it
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='utf-8')
# Request headers: impersonate a desktop browser to avoid anti-scraping blocks
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0'
}
# Base listing URL; the `start` query parameter pages through results 20 at a time
url='https://book.douban.com/review/best/?start='
# Output CSV file path
filename="douban.csv"
# 定义写入表头函数
def writeHead():
    """Create (or truncate) the output CSV and write the header row."""
    header = ["昵称","评论时间","评论链接","评论标题","点赞数","砸砖数","回应数"]
    # newline='' stops Excel from showing a blank line between every row
    with open(filename, mode='w', encoding='utf-8', newline='') as out:
        csv.writer(out).writerow(header)
# 定义写入内容函数
def writeData(data):
    """Append one row (an iterable of cell values) to the output CSV.

    Uses utf-8 to match the encoding writeHead() used for the header row;
    the previous gb18030 here produced a single file in two different
    encodings, which is unreadable as a whole.
    """
    # newline='' prevents blank rows when the CSV is opened in Excel
    with open(filename, 'a', encoding='utf-8', newline='') as f:
        csv.writer(f).writerow(data)
# 爬取三页内容
def getPage(url, pages=3):
    """Scrape `pages` pages of Douban's best book reviews into the CSV.

    Args:
        url: base listing URL ending in 'start=' — the page offset is appended.
        pages: number of 20-item pages to fetch (default 3, matching the
            original hard-coded behavior).

    Side effects: prints each page URL and each parsed row; appends every
    row to the CSV via writeData(). Pages that do not return HTTP 200 are
    silently skipped.
    """
    for page in range(pages):
        page_url = url + str(page * 20)
        print(page_url)
        response = requests.get(page_url, headers=headers)
        if response.status_code != 200:
            continue
        # Strip the literal '回应' ("reply") label so the reply count reads
        # as a bare number in the parsed text
        doc = pq(response.text.replace('回应', ''))
        for item in doc('.review-list').children('div').items():
            name = item('.name').text()
            time = item('.main-meta').text()
            link = item('div.main-bd h2 a')  # review title anchor
            up = item('.up span').text()
            down = item('.down span').text()
            reply = item('.reply').text()
            row = (name, time, link.attr.href, link.html(), up, down, reply)
            print(*row)
            writeData(row)
# Script entry point: write the CSV header, then scrape and append the reviews.
writeHead()
getPage(url)