Learned from: https://zhuanlan.zhihu.com/p/399300580
------------------------------------------------------------------------
Start from https://movie.douban.com/subject/25728006/comments and click "Next page"; the URL becomes:
https://movie.douban.com/subject/25728006/comments?start=20&limit=20&status=P&sort=new_score
Open the page, press F12 to bring up the developer tools, and inspect the request. The page is fetched with
requests.get(url, params=params, headers=headers)
which needs a url (the address), params (the query parameters), and headers (the request headers):
url=https://movie.douban.com/subject/25728006/comments
params={
'percent_type':'',
'start':'20',
'limit':'20',
'status':'P',
'sort':'new_score',
'comments_only':'1',
}
headers={
'Accept':'application/json, text/plain, */*',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8',
'Connection':'keep-alive',
'Cookie':'ll="118254"; bid=bzf7LGz3pZA....',
'Host':'movie.douban.com',
'Referer':'https://movie.douban.com/subject/25728006/comments?start=20&limit=20&status=P&sort=new_score',
'sec-ch-ua':'" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
'sec-ch-ua-mobile':'?0',
'Sec-Fetch-Dest':'empty',
'Sec-Fetch-Mode':'cors',
'Sec-Fetch-Site':'same-origin',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36',
}
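Putting these pieces together, a minimal request sketch (the Cookie value is abbreviated here and has to be copied from your own logged-in browser session):
import requests

url = 'https://movie.douban.com/subject/25728006/comments'
params = {
    'percent_type': '',
    'start': '20',           # each page holds 20 comments, so start = page * 20
    'limit': '20',
    'status': 'P',
    'sort': 'new_score',
    'comments_only': '1',    # ask for the JSON wrapper instead of a full HTML page
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36',
    'Cookie': 'll="118254"; bid=bzf7LGz3pZA....',   # copy from your own session
}
r = requests.get(url, params=params, headers=headers)
print(r.status_code)           # 200 when the request is accepted
print(list(r.json().keys()))   # the rendered comment HTML sits under the 'html' key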
The code:
import requests
import re
import pandas as pd
import os
headers={
'Cookie':'ll="118254"; bid=bzf7LGz3pZA; _vwo_uuid_v2=DB12523A0B0C7127645E914A1FB363352|3d83981785084d997d7462a2ce24a947; __utmz=223695111.1626234491.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; douban-fav-remind=1; __utmz=30149280.1629095213.3.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1629249503%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3D71zldwjBiMBa-xfexgVZ43eTQq2n8KKtTWTsWh37m72e_lfEOE1x3NuDj6egeYBLyqGE4gjSJnbxueQLcYZWsq%26wd%3D%26eqid%3Ddb6736ec000219350000000660ee5e6f%22%5D; _pk_ses.100001.4cf6=*; ap_v=0,6.0; __utma=30149280.1335619985.1616046306.1629187696.1629249503.8; __utmc=30149280; __utmt=1; __utmb=30149280.1.10.1629249503; __utma=223695111.444014824.1616046306.1629187696.1629249506.7; __utmb=223695111.0.10.1629249506; __utmc=223695111; _pk_id.100001.4cf6=fa72408676bee41c.1616046306.7.1629250023.1629187696.',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36',
}
# Fetch the page data
def get_html(tid, page, _type):
    """
    tid: subject id, e.g. Fast & Furious 9 is 25728006
    page: short-comment page number, 0-24
    headers: request headers, must carry browser and cookie information
    _type: rating filter (positive: h, neutral: m, negative: l); empty string means all
    """
    url = f'https://movie.douban.com/subject/{tid}/comments?'
    params = {
        'percent_type': _type,
        'start': page * 20,
        'limit': 20,
        'status': 'P',
        'sort': 'new_score',
        'comments_only': 1,
        'ck': 't9O9',
    }
    r = requests.get(url, params=params, headers=headers)
    # The response is JSON
    data = r.json()
    html = data['html']
    # The fields are extracted with regular expressions, so strip all whitespace first
    html = re.sub(r'\s', '', html)
    return html
print(get_html('25728006',0,''))
From the output, take the first user's block as an example.
The complete parsing of author, date, comment text, vote (useful) count, and star rating is as follows:
# Parse the data
def get_data(html):
    df = pd.DataFrame(columns=['作者', '日期', '评论', '投票数', 'star'])
    # The patterns below target the whitespace-stripped comment HTML
    df["作者"] = re.findall('class="">(.*?)</a>', html)
    # date (the space inside the timestamp is gone because all whitespace was stripped)
    df["日期"] = re.findall('"comment-time"title="(.*?)"', html)
    df["评论"] = re.findall('"short">(.*?)</span>', html)
    df["投票数"] = re.findall(r'"votesvote-count">(\d+)', html)
    # When _type == '', the star list can be shorter than the other columns and raise
    # ValueError: Length of values (19) does not match length of index (20),
    # because comments without a rating have no allstar span; only keep it when the lengths match
    xx = re.findall(r'class="allstar(\d+)rating"', html)
    if len(xx) == len(df):
        df["star"] = xx
    return df
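As with get_html, a quick sanity check of the parsed result:
df = get_data(get_html('25728006', 0, ''))
print(df.head())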
The extraction above uses regular expressions; BeautifulSoup combined with lxml works just as well (see notes part 1 for its usage, and the sketch below).
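As a rough illustration, a minimal BeautifulSoup + lxml version of the same extraction. It works on data['html'] before the whitespace-stripping step, and the selectors used here (comment-item, comment-info, comment-time, short, votes, rating) are assumptions about Douban's comment markup, not code from the original note:
from bs4 import BeautifulSoup
import pandas as pd

def get_data_bs(raw_html):
    # raw_html is data['html'] as returned by the API, without re.sub(r'\s', '', ...)
    soup = BeautifulSoup(raw_html, 'lxml')
    rows = []
    for item in soup.select('div.comment-item'):
        author = item.select_one('span.comment-info a')
        date = item.select_one('span.comment-time')
        short = item.select_one('span.short')
        votes = item.select_one('span.votes')
        star = item.select_one('span.rating')   # missing when the user did not rate
        rows.append({
            '作者': author.get_text(strip=True) if author else '',
            '日期': date['title'] if date and date.has_attr('title') else '',
            '评论': short.get_text(strip=True) if short else '',
            '投票数': votes.get_text(strip=True) if votes else '0',
            'star': star['class'][0].replace('allstar', '') if star else '',
        })
    return pd.DataFrame(rows)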
Data storage: the data is written to a CSV file here, mainly because appending is convenient; writing to a database would be better for later analysis (see the sqlite3 sketch after save_df).
# Save the data
def save_df(df):
    if os.path.exists('data.csv'):
        df.to_csv('data.csv', index=None, mode='a', header=None, encoding='utf_8_sig')
    else:
        df.to_csv('data.csv', index=None, encoding='utf_8_sig')
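A database is indeed more convenient than an ever-growing CSV for later analysis; a minimal sketch with the standard-library sqlite3 (the file name comments.db and table name comments are my own choices, not from the original):
import sqlite3

def save_sqlite(df, db_path='comments.db'):
    con = sqlite3.connect(db_path)
    # to_sql creates the table on the first call and appends on later ones
    df.to_sql('comments', con, if_exists='append', index=False)
    con.close()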
Called from the main block:
if __name__ == '__main__':
    # subject id, e.g. Fast & Furious 9 is 25728006
    tid = 25728006
    # rating filter (positive: h, neutral: m, negative: l)
    _type = 'l'
    for page in range(10):
        html = get_html(tid, page, _type)
        df = get_data(html)
        save_df(df)
        print(f'Page {page+1} of comments collected..')
Results
Complete code
import requests
import re
import pandas as pd
import os
headers={
'Cookie':'ll="118254"; bid=bzf7LGz3pZA; _vwo_uuid_v2=DB12523A0B0C7127645E914A1FB363352|3d83981785084d997d7462a2ce24a947; __utmz=223695111.1626234491.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; douban-fav-remind=1; __utmz=30149280.1629095213.3.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1629249503%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3D71zldwjBiMBa-xfexgVZ43eTQq2n8KKtTWTsWh37m72e_lfEOE1x3NuDj6egeYBLyqGE4gjSJnbxueQLcYZWsq%26wd%3D%26eqid%3Ddb6736ec000219350000000660ee5e6f%22%5D; _pk_ses.100001.4cf6=*; ap_v=0,6.0; __utma=30149280.1335619985.1616046306.1629187696.1629249503.8; __utmc=30149280; __utmt=1; __utmb=30149280.1.10.1629249503; __utma=223695111.444014824.1616046306.1629187696.1629249506.7; __utmb=223695111.0.10.1629249506; __utmc=223695111; _pk_id.100001.4cf6=fa72408676bee41c.1616046306.7.1629250023.1629187696.',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36',
}
# Fetch the page data
def get_html(tid, page, _type):
    """
    tid: subject id, e.g. Fast & Furious 9 is 25728006
    page: short-comment page number, 0-24
    headers: request headers, must carry browser and cookie information
    _type: rating filter (positive: h, neutral: m, negative: l); empty string means all
    """
    url = f'https://movie.douban.com/subject/{tid}/comments?'
    params = {
        'percent_type': _type,
        'start': page * 20,
        'limit': 20,
        'status': 'P',
        'sort': 'new_score',
        'comments_only': 1,
        'ck': 't9O9',
    }
    r = requests.get(url, params=params, headers=headers)
    # The response is JSON
    data = r.json()
    html = data['html']
    # The fields are extracted with regular expressions, so strip all whitespace first:
    # \s matches any whitespace character, equivalent to [ \t\n\r\f\v]
    html = re.sub(r'\s', '', html)
    return html
# Parse the data
def get_data(html):
    df = pd.DataFrame(columns=['作者', '日期', '评论', '投票数', 'star'])
    # The patterns below target the whitespace-stripped comment HTML
    df["作者"] = re.findall('class="">(.*?)</a>', html)
    # date (the space inside the timestamp is gone because all whitespace was stripped)
    df["日期"] = re.findall('"comment-time"title="(.*?)"', html)
    df["评论"] = re.findall('"short">(.*?)</span>', html)
    df["投票数"] = re.findall(r'"votesvote-count">(\d+)', html)
    # When _type == '', the star list can be shorter than the other columns and raise
    # ValueError: Length of values (19) does not match length of index (20),
    # because comments without a rating have no allstar span; only keep it when the lengths match
    xx = re.findall(r'class="allstar(\d+)rating"', html)
    if len(xx) == len(df):
        df["star"] = xx
    return df