http://maoyan.com/films/1217236
我们直接访问这个,在web端只能看到最热的10条短评,那怎么获取到所有短评呢?
(1) 访问上面的链接,按下F12,然后点击图片上的图标,把浏览模式(响应式设计模式,火狐快捷键Ctrl+Shift+M)改为手机模式,刷新页面。
(2)换用谷歌浏览器,F12下进行上面操作,加载完毕后下拉短评,页面继续加载,找到含有offset和startTime的加载条,发现它的Response中包含我们想要的数据,为json格式。
(1)简单分析
通过上面分析
Request URL: http://m.maoyan.com/mmdb/comments/movie/1217236.json?v=yes&offset=0&startTime=0%2021%3A09%3A31
Request Method: GET
下滑了几次次,我发现了下面规律:
分析上面数据变化,可以大致猜测出:offset表示该接口显示评论开始位置,每个页面15条,比如:15,则显示15-30这中间的15条评论; startTime表示当前评论的时间,固定格式(2018-10-06)。
另外接口最后的%2021%3A09%3A31是不变的。
(2)代码获取
import requests
from fake_useragent import UserAgent
import json
'''
小编准备的python学习资料,加群:821460695 即可免费获取!
'''
headers = {
"User-Agent": UserAgent(verify_ssl=False).random,
"Host":"m.maoyan.com",
"Referer":"http://m.maoyan.com/movie/1217236/comments?_v_=yes"
}
# 猫眼电影短评接口
offset = 0
# 电影是2018.9.21上映的
startTime = '2018-09-21'
comment_api = 'http://m.maoyan.com/mmdb/comments/movie/1217236.json?_v_=yes&offset={0}&startTime={1}%2021%3A09%3A31'.format(offset,startTime)
# 发送get请求
response_comment = requests.get(comment_api,headers = headers)
json_comment = response_comment.text
json_comment = json.loads(json_comment)
print(json_comment)
# 获取数据并存储
def get_data(self,json_comment):
json_response = json_comment["cmts"] # 列表
list_info = []
for data in json_response:
cityName = data["cityName"]
content = data["content"]
if "gender" in data:
gender = data["gender"]
else:
gender = 0
nickName = data["nickName"]
userLevel = data["userLevel"]
score = data["score"]
list_one = [self.time,nickName,gender,cityName,userLevel,score,content]
list_info.append(list_one)
self.file_do(list_info)
3.存储数据
# 存储文件
def file_do(list_info):
# 获取文件大小
file_size = os.path.getsize(r'G:\maoyan\maoyan.csv')
if file_size == 0:
# 表头
name = ['评论日期', '评论者昵称', '性别', '所在城市','猫眼等级','评分','评论内容']
# 建立DataFrame对象
file_test = pd.DataFrame(columns=name, data=list_info)
# 数据写入
file_test.to_csv(r'G:\maoyan\maoyan.csv', encoding='gbk', index=False)
else:
with open(r'G:\maoyan\maoyan.csv', 'a+', newline='') as file_test:
# 追加到文件后面
writer = csv.writer(file_test)
# 写入文件
writer.writerows(list_info)
1.提取数据
代码:
def read_csv():
content = ''
# 读取文件内容
with open(r'G:\maoyan\maoyan.csv', 'r', encoding='utf_8_sig', newline='') as file_test:
# 读文件
reader = csv.reader(file_test)
i = 0
for row in reader:
if i != 0:
time.append(row[0])
nickName.append(row[1])
gender.append(row[2])
cityName.append(row[3])
userLevel.append(row[4])
score.append(row[5])
content = content + row[6]
# print(row)
i = i + 1
print('一共有:' + str(i - 1) + '条数据')
return content
一共有:15195条数据
评论者性别分布可视化
# 评论者性别分布可视化
def sex_distribution(gender):
# print(gender)
from pyecharts import Pie
list_num = []
list_num.append(gender.count('0')) # 未知
list_num.append(gender.count('1')) # 男
list_num.append(gender.count('2')) # 女
attr = ["其他","男","女"]
pie = Pie("性别饼图")
pie.add("", attr, list_num, is_label_show=True)
pie.render("H:\PyCoding\spider_maoyan\picture\sex_pie.html")
# 评论者所在城市分布可视化
def city_distribution(cityName):
city_list = list(set(cityName))
city_dict = {city_list[i]:0 for i in range(len(city_list))}
for i in range(len(city_list)):
city_dict[city_list[i]] = cityName.count(city_list[i])
# 根据数量(字典的键值)排序
sort_dict = sorted(city_dict.items(), key=lambda d: d[1], reverse=True)
city_name = []
city_num = []
for i in range(len(sort_dict)):
city_name.append(sort_dict[i][0])
city_num.append(sort_dict[i][1])
import random
from pyecharts import Bar
bar = Bar("评论者城市分布")
bar.add("", city_name, city_num, is_label_show=True, is_datazoom_show=True)
bar.render("H:\PyCoding\spider_maoyan\picture\city_bar.html")
# 地图可视化
def render_city(cities):
def time_num_visualization(time):
from pyecharts import Line
time_list = list(set(time))
time_dict = {time_list[i]: 0 for i in range(len(time_list))}
time_num = []
for i in range(len(time_list)):
time_dict[time_list[i]] = time.count(time_list[i])
# 根据数量(字典的键值)排序
sort_dict = sorted(time_dict.items(), key=lambda d: d[0], reverse=False)
time_name = []
time_num = []
print(sort_dict)
for i in range(len(sort_dict)):
time_name.append(sort_dict[i][0])
time_num.append(sort_dict[i][1])
line = Line("评论数量日期折线图")
line.add(
"日期-评论数",
time_name,
time_num,
is_fill=True,
area_color="#000",
area_opacity=0.3,
is_smooth=True,
)
line.render("H:\PyCoding\spider_maoyan\picture\c_num_line.html")
def level_score_visualization(userLevel,score):
from pyecharts import Pie
userLevel_list = list(set(userLevel))
userLevel_num = []
for i in range(len(userLevel_list)):
userLevel_num.append(userLevel.count(userLevel_list[i]))
score_list = list(set(score))
score_num = []
for i in range(len(score_list)):
score_num.append(score.count(score_list[i]))
pie01 = Pie("等级环状饼图", title_pos='center', width=900)
pie01.add(
"等级",
userLevel_list,
userLevel_num,
radius=[40, 75],
label_text_color=None,
is_label_show=True,
legend_orient="vertical",
legend_pos="left",
)
pie01.render("H:\PyCoding\spider_maoyan\picture\level_pie.html")
pie02 = Pie("评分玫瑰饼图", title_pos='center', width=900)
pie02.add(
"评分",
score_list,
score_num,
center=[50, 50],
is_random=True,
radius=[30, 75],
rosetype="area",
is_legend_show=False,
is_label_show=True,
)
pie02.render("H:\PyCoding\spider_maoyan\picture\score_pie.html")