python爬取豆瓣书评实战——初级

豆瓣最受欢迎的书评网址:https://book.douban.com/review/best/

python爬取豆瓣书评实战——初级_第1张图片

import requests
import csv
from pyquery import PyQuery as pq
import io
import sys

# Switch standard output to UTF-8 so the Chinese text printed by this script
# can be encoded regardless of the console's default encoding.
# Reconfigure the existing stream in place when possible (Python 3.7+):
# wrapping sys.stdout.buffer in a second TextIOWrapper leaves two wrappers
# sharing one buffer, and fails with AttributeError when sys.stdout has no
# ``buffer`` attribute (e.g. when it was replaced by an in-memory stream).
if hasattr(sys.stdout, 'reconfigure'):
    sys.stdout.reconfigure(encoding='utf-8')
elif hasattr(sys.stdout, 'buffer'):
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='utf-8')
# Request headers: present a browser User-Agent to avoid anti-crawler rejection
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0'
}
# Base listing URL; getPage appends the numeric ``start`` offset to it
url='https://book.douban.com/review/best/?start='
# Path of the CSV file that writeHead creates and writeData appends to
filename="douban.csv"

# Write the CSV header row
def writeHead():
    """Create (or truncate) the output CSV and write its header row.

    The file named by the module-level ``filename`` is opened in write mode
    with UTF-8 encoding, so any previous content is discarded.
    """
    columns = ["昵称","评论时间","评论链接","评论标题","点赞数","砸砖数","回应数"]
    # newline='' prevents the blank lines that otherwise show up between rows
    # when the exported CSV is opened in Excel.
    with open(filename, mode='w', encoding='utf-8', newline='') as out:
        csv.writer(out).writerow(columns)

# Append one data row to the CSV
def writeData(data):
    """Append a single row to the output CSV.

    Args:
        data: An iterable of field values forming one CSV row.

    The row is encoded as UTF-8, the same encoding writeHead uses for the
    header. Appending with a different encoding (previously gb18030) produced
    a file with mixed encodings that no reader could decode consistently.
    """
    # newline='' lets the csv module control line endings (no blank rows in Excel)
    with open(filename,'a',encoding='utf-8',newline='') as f:
        writer=csv.writer(f)
        writer.writerow(data)
        
# Scrape the review listing pages (three by default)
def getPage(url, pages=3, timeout=10):
    """Fetch review listing pages and append one CSV row per review.

    Args:
        url: Base listing URL ending in ``start=``; the numeric offset of each
            page is appended to it.
        pages: Number of consecutive pages to fetch (default 3, as before).
        timeout: Seconds to wait for each HTTP request before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Each row is printed and passed to writeData as
    (name, time, link, title, up votes, down votes, reply count).
    Pages that do not answer with HTTP 200 are reported and skipped.

    Raises:
        requests.exceptions.RequestException: Propagated from ``requests.get``
            on connection errors or timeouts.
    """
    # The start offset advances in steps of 20 — presumably the number of
    # reviews the site lists per page.
    page_size = 20
    for page_index in range(pages):
        my_url = url + str(page_index * page_size)
        print(my_url)
        response = requests.get(my_url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            # Best effort: report the failed page instead of skipping it silently.
            print('skipped', my_url, 'HTTP status', response.status_code)
            continue
        doc = pq(response.text)  # parse the page
        for item in doc('.review-list').children('div').items():
            name = item('.name').text()
            posted_at = item('.main-meta').text()
            link = item('div.main-bd h2 a')
            up = item('.up span').text()
            down = item('.down span').text()
            # Strip the "回应" label from the reply counter only. Removing it
            # from the whole page (as before) also altered titles and names
            # that happen to contain the word.
            reply = item('.reply').text().replace('回应', '').strip()
            row = (name, posted_at, link.attr.href, link.html(), up, down, reply)
            print(*row)
            writeData(row)
# Run the scraper only when this file is executed as a script, so that
# importing it does not trigger network requests or overwrite the CSV.
if __name__ == "__main__":
    writeHead()   # start a fresh CSV containing only the header row
    getPage(url)  # then append the scraped review rows

运行上述代码后,会在当前目录下生成 douban.csv 文件:
python爬取豆瓣书评实战——初级_第2张图片
文件部分数据如下所示:
python爬取豆瓣书评实战——初级_第3张图片

你可能感兴趣的:(python,python)