This article shows how to use Python to crawl and analyze the film and TV works of the beautiful AngelaBaby.
The project runs in three parts: indexing her works on Douban, collecting per-title details and overall scores, and scraping short comments.
The data analysis includes: average ratings per title, audience vs. Douban score comparison, the gap between the two, the distribution of star ratings, and ratings over time.
import requests
import pandas as pd
from fake_useragent import UserAgent  # third-party fake_useragent library, for setting a random request header
from bs4 import BeautifulSoup
from tqdm import tqdm  # progress bar
import plotly.express as px
import plotly.graph_objects as go
# keep pandas DataFrame output from wrapping
pd.set_option('display.max_rows', 500)  # show up to 500 rows
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 5000)
'''
Part 1
'''
# Build a random agent so the page doesn't refuse access
# ua = UserAgent()
# headers = {'User-Agent': ua.random}
headers = {  # this is my real User-Agent
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
}
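# A minimal sketch (my addition, not part of the original run): pairing the
# random-header idea above with a polite delay and a simple retry, which helps
# avoid the 403 Forbidden responses mentioned in the acknowledgments.
import random
import time

def polite_get(url, headers, retries=3, timeout=10):
    """GET with a short random pause between attempts."""
    for _ in range(retries):
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            return response
        time.sleep(random.uniform(1, 3))  # back off before retrying
    response.raise_for_status()  # surface the last error (e.g. 403)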
# Pull the listing pages and build an index of her works
movie_rows = []  # one dict per title, turned into a DataFrame below; DataFrame cells hold numbers, strings, etc., much like an Excel sheet, with column names (columns) and row labels (index)
for main_page_num in tqdm(range(10)):  # Angelababy's filmography spans 10 listing pages
baby_movie_url = f'https://movie.douban.com/celebrity/1033011/movies?start={main_page_num * 10}&format=pic&sortby=time&'
    # fetch the page
response = requests.get(baby_movie_url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")
    '''
    "html.parser" is only one of the parsers Beautiful Soup accepts; BeautifulSoup(markup, "lxml"),
    BeautifulSoup(markup, "lxml-xml"), BeautifulSoup(markup, "xml") and others also work
    '''
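    # Illustration (markup shape inferred from the parsing below, not copied
    # from Douban): each listing entry looks roughly like
    #   <h6><a href="https://movie.douban.com/subject/.../">Title</a>
    #       <span>...</span><span>(未上映)</span></h6>
    # so h6.a.text is the title, h6.a['href'] the link, and the second <span>
    # marks titles that have not been released yet.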
    for h6 in soup.find_all('h6'):
        baby_movies_dict = {}
        # title
        baby_movies_dict['movie_name'] = h6.a.text
        # link to the title's page (just in case)
        baby_movies_dict['movie_link'] = h6.a['href']
        # released yet? ('未上映' is the page's "not yet released" tag)
        if h6.find_all('span')[1].text == '(未上映)':
            baby_movies_dict['coming_soon'] = 'Y'
        else:
            baby_movies_dict['coming_soon'] = 'N'
        movie_rows.append(baby_movies_dict)
tbl_baby_movies = pd.DataFrame(movie_rows)  # build once from the collected rows; DataFrame.append was removed in pandas 2.0
tbl_baby_movies.to_csv('angelababy_tbl_baby_movies.csv', index=None, encoding='utf-8-sig')
print()
print(tbl_baby_movies)
print()
'''
Part 2
'''
# pick out the titles that are already released
onboard_general_links = tbl_baby_movies[tbl_baby_movies['coming_soon'] == 'N']['movie_link'].tolist()
# print('onboard_general_links -> ', onboard_general_links)
# fetch each title's page
general_rows = []  # one dict of details per title, built into a DataFrame after the loop
for link in tqdm(onboard_general_links):
general = {}
response = requests.get(link, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")
    # detailed info on the left side of the page
    general['name'] = soup.find('span', {'property': 'v:itemreviewed'}).text  # title
    # print('test -> ', general['name'])
    general['link'] = link
    # genres: a title lists up to three v:genre spans; titles with fewer simply skip the rest
    genres = soup.find_all('span', {'property': 'v:genre'})
    for i, genre in enumerate(genres[:3]):
        general[f'type{i + 1}'] = genre.text
    # print('type1 -> ', general.get('type1'))
    try:
        general['program_length'] = int(soup.find_all('span', {'property': 'v:runtime'})[0].text.replace('分钟', ''))  # strip the '分钟' ("minutes") suffix
    except (IndexError, ValueError):
        pass  # some titles list no runtime
    # the Douban rating panel on the right
    try:
        general['overall_star'] = float(soup.find_all('strong', {'class': 'll rating_num'})[0].text)
        for star in range(5):
            general['overall_star_' + str(star + 1)] = float(
                soup.find_all('span', {'class': 'rating_per'})[star].text.replace('%', ''))
    except (IndexError, ValueError):
        pass  # unrated titles have no rating panel
    # the "better than x% of <genre>" lines
    try:
        good_than = soup.find_all('div', {'class': 'rating_betterthan'})[0].find_all('a')
        general['good_than_1'] = good_than[0].text
        general['good_than_2'] = good_than[1].text
    except IndexError:
        pass
    general_rows.append(general)
# Build the DataFrame once from the collected rows (DataFrame.append was removed in
# pandas 2.0; the original also passed ignore_index='True', a string, where a boolean was meant)
general_links = pd.DataFrame(general_rows)
# Zero-fill the star-rating columns (optional). Note: the original commented one-liner
# reassigned general_links to just the star columns, dropping everything else;
# filling only those columns in place avoids that:
# star_cols = [col for col in general_links.columns if 'overall_star' in col]
# general_links[star_cols] = general_links[star_cols].fillna(0)
# save the table
general_links.to_csv('angelababy_programs_org.csv', index=None, encoding='utf-8-sig')
print()
print(general_links)
print()
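# A small sketch (my addition): before hand-cleaning the CSV, it helps to see
# which columns are missing values:
# print(general_links.isna().sum())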
angelababy_programs_data = pd.read_csv('angelababy_programs.csv')  # read the CSV back (presumably a hand-cleaned copy of the _org file); the crawler was commented out for this run, afraid Douban would wall me off for crawling too much
print(angelababy_programs_data)
print('Average score in angelababy_programs_data -> ', angelababy_programs_data.loc[:, 'overall_star'].mean())
angelababy_programs_org_data = pd.read_csv('angelababy_programs_org.csv')  # read the CSV back; crawler commented out for this run to avoid getting walled by Douban
# keep only film/TV titles: drop 真人秀 (reality shows), 脱口秀 (talk shows), 歌舞 (musicals) and 音乐 (music)
baby_movie_links = angelababy_programs_org_data[~angelababy_programs_org_data['type1'].isin(['真人秀', '脱口秀', '歌舞', '音乐'])].copy()  # .copy() avoids a SettingWithCopyWarning on the assignment below
baby_movie_links['link_id'] = baby_movie_links['link'].str.replace('https://movie.douban.com/subject/', '', regex=False).str.replace('/', '', regex=False)
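# Alternative sketch (my variant, not from the original): pull the numeric id
# with a regex, which survives small changes in the URL shape:
# baby_movie_links['link_id'] = baby_movie_links['link'].str.extract(r'subject/(\d+)', expand=False)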
print(baby_movie_links[['name', 'link', 'link_id']].head(10))  # works
# now see how the overall average score changes once those are excluded
print(baby_movie_links.loc[:, 'overall_star'].mean())
'''
Part 3
'''
comment_rows = []  # one dict per short comment, turned into a DataFrame at the end
for baby_item in tqdm(baby_movie_links['link_id'].tolist()):
    extract_page = 10  # pull up to 10 pages of comments per title
    status = 'P'   # 'P': watched, 'F': want to watch
    SORT = 'new_score'  # 'new_score': by popularity, 'time': newest first
for page in tqdm(range(extract_page)):
        url = f'https://movie.douban.com/subject/{baby_item}/comments?start={page*20}&limit=20&sort={SORT}&status={status}'
        # reuse the headers defined in Part 1 (the original re-declared the same dict here)
        try:  # try to fetch the page URL we composed
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")
            # info shared by every comment on this title; the stripped labels are the
            # page's Chinese field names: 短评 short comments, 类型 genre, 地区 region,
            # 片长 runtime, 上映 release date
            movie_name = soup.find_all('div', {'id': 'content'})[0].select('h1')[0].text.replace(' 短评', '')
            movie_staff = soup.find_all('span', {'class': 'attrs'})[0].find_all('a')
            actor = ' '.join([name.text for name in movie_staff[1:-1]])
            movie_type = soup.find_all('span', {'class': 'attrs'})[0].find_all('p')[2].text.replace(' ', '').replace('\n', '').replace('类型:', '')
            movie_region = soup.find_all('span', {'class': 'attrs'})[0].find_all('p')[3].text.replace(' ', '').replace('\n', '').replace('地区:', '')
            movie_time = soup.find_all('span', {'class': 'attrs'})[0].find_all('p')[4].text.replace(' ', '').replace('\n', '').replace('片长:', '').replace('分钟', '')
            move_onboard_time = soup.find_all('span', {'class': 'attrs'})[0].find_all('p')[-1].text.replace(' ', '').replace('\n', '').replace('上映:', '')
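            # Aside (my sketch, not tested against live pages): indexing <p> tags by
            # position is fragile; matching on the label text is steadier, e.g.:
            # info_ps = soup.find_all('span', {'class': 'attrs'})[0].find_all('p')
            # movie_type = next((p.text for p in info_ps if '类型' in p.text), '').replace('类型:', '').strip()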
comments = soup.find_all('div', {'class': 'comment'})
            for idx, comment in enumerate(comments):
                temp_comment_info = {}
                # fixed info about the title
                temp_comment_info['movie'] = movie_name
                temp_comment_info['director'] = movie_staff[0].text
                temp_comment_info['actor'] = actor
                # the comment itself
                temp_comment_info['user'] = comment.find_all('a', {'class': ''})[0].text
                temp_comment_info['user_url'] = comment.find_all('a', {'class': ''})[0]['href']
                temp_comment_info['useful'] = int(comment.find('span', {'class': 'votes'}).text)
                temp_comment_info['date'] = comment.find('span', {'class': 'comment-time'}).text.replace('\n', '').strip(' ')
                temp_comment_info['text'] = comment.find('span', {'class': 'short'}).text
                if status == 'P':
                    temp_comment_info['status'] = 'P'
                    try:
                        # the star widget's class looks like 'allstarNN', where NN is 10-50, i.e. stars x 10
                        temp_comment_info['rating'] = int(comment.find('span', {'class': 'comment-info'}).find_all('span')[1]['class'][0].replace('allstar', ''))
                    except Exception:  # the comment carries no star rating
                        print(f'no rating in page {page}, comment {idx + 1}')
                else:
                    temp_comment_info['status'] = 'F'
                comment_rows.append(temp_comment_info)
        except Exception:  # a failure here means too few comments, so the page doesn't exist; move on to the next film/show
            print(f'no page {page+1} for id {baby_item}')
            break
data = pd.DataFrame(comment_rows)  # build once from the collected rows
data.to_csv('baby_comment.csv', index=None, encoding='utf-8-sig')
data = pd.read_csv('baby_comment.csv')  # read the CSV back; crawler commented out for this run to avoid getting walled by Douban
# print(data.head())
data_1 = data.copy()
'''
Overview of average ratings
'''
# reshape the date into a 'YYYY/MM' string
data_1['year_mth'] = data_1['date'].str[:7].str.replace('-', '/').astype(str)
# data_1['date'] = pd.to_datetime(data_1['date'], errors='coerce')  # Douban comment times look like 'YYYY-MM-DD HH:MM:SS', which to_datetime parses without a format string (the original '%m/%d/%Y' format would not have matched)
# print(data_1.head())
# keep only the comments that carry a rating
data_2 = data_1[data_1['rating'].notnull()]
print(data_2)
print()
print('Audience score:', data_2['rating'].mean()/10*2)  # mean allstar value (10-50) rescaled to Douban's 10-point scale
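# Worked example of that rescaling (my illustration): an average allstar value
# of 37 is 37 / 10 = 3.7 stars, i.e. 3.7 * 2 = 7.4 on the 10-point scale.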
fig = px.bar(data_2.groupby('movie')['rating'].mean().sort_values(ascending=False).reset_index(), x='movie', y='rating', color='rating')  # mean rating per title, sorted descending
fig.update_layout(title_text='TEAM-AG团队')  # chart title (shown on the figure)
fig.show()
'''
Audience vs. Douban
'''
# join the overall Douban scores from Part 2 back on
# baby_movie_links = baby_movie_links.fillna(0)
# print('---baby_movie_links.fillna(0)---')
# print(baby_movie_links)
data_2 = data_2.merge(baby_movie_links, how='left', left_on='movie', right_on='name')
data_2.to_csv('data_2.csv', index=None, encoding='utf-8-sig')
# print(data_2)
# normalise both scores onto a 5-point scale
data_2['rating'] = data_2['rating']/10  # allstar values 10-50 -> 1-5 stars
data_2['overall_star'] = data_2['overall_star']/2  # Douban's 10-point score -> 5 points
# build the comparison table
pingfen_compare = data_2.groupby('movie')[['rating', 'overall_star']].mean().sort_values(by='rating', ascending=False).reset_index()
pingfen_compare = pingfen_compare.fillna(0)
print('------ before ------')
print(pingfen_compare)
pingfen_compare = pingfen_compare[pingfen_compare['overall_star'] != 0]  # drop the zero-filled rows (optional)
print('------ after dropping NaN rows ------')
print(pingfen_compare)
fig = go.Figure(data=[
    go.Bar(name='hot-comment score', x=pingfen_compare['movie'], y=pingfen_compare['rating'], marker_color='lightgreen'),
    go.Bar(name='Douban score', x=pingfen_compare['movie'], y=pingfen_compare['overall_star'], marker_color='lightsalmon')
])
# Change the bar mode
fig.update_layout(title_text='TEAM-AG团队')  # chart title (shown on the figure)
fig.update_layout(barmode='group')  # draw the two series side by side ('group' is plotly's default, so this line is optional)
fig.show()
'''
Difference chart
'''
pingfen_compare['diff'] = pingfen_compare['rating']-pingfen_compare['overall_star']  # hot-comment score minus Douban score
by_diff = pingfen_compare.sort_values('diff')  # sort once instead of twice
fig = go.Figure(data=[go.Bar(x=by_diff['movie'], y=by_diff['diff'])])
# Customize aspect
fig.update_traces(marker_color='rgb(158,202,225)', marker_line_color='rgb(8,48,107)',
marker_line_width=1, opacity=0.7)
fig.update_layout(title_text='TEAM-AG团队')  # the bigger the gap, the less suspicion
fig.show()
'''
Rating-share pie (how many stars people usually give)
'''
colors = ['gold', 'mediumturquoise', 'darkorange', 'lightgreen']
print('')
print('test')
print(data_2['movie'].value_counts())
print(data_2['rating'].value_counts())
rating_counts = data_2['rating'].value_counts()  # index: rating value, values: counts (more portable than reset_index()['index'], whose column names changed in pandas 2.0)
fig = go.Figure(data=[go.Pie(labels=rating_counts.index, values=rating_counts.values)])
fig.update_traces(hoverinfo='label+percent', textinfo='label+percent', textfont_size=20,
marker=dict(colors=colors, line=dict(color='#000000', width=2)))
fig.update_layout(title_text='TEAM-AG团队')
fig.show()
'''
When Baby was hottest
'''
print(data_2)
rating_by_ymh = pd.DataFrame(data_2.groupby(['year_mth', 'movie'])['rating'].mean()).reset_index(drop=False)
fig = px.bar(rating_by_ymh, x="year_mth", y="rating", color='movie',
height=400)
fig.update_layout(title_text='TEAM-AG团队')
fig.show()
Special thanks
Fixing 403 Forbidden in Python crawlers
https://www.cnblogs.com/tian-sun/p/7404429.html
Common HTTP error codes explained (404, 403, 400, 408, 305, ...)
https://jingyan.baidu.com/article/3a2f7c2ecb8fe726afd61126.html
The difference between .loc/.iloc and .at/.iat in pandas
https://blog.csdn.net/weixin_42782150/article/details/90764949
The pd.DataFrame() function explained
https://blog.csdn.net/tefuirnever/article/details/93708964
Tools and learning sites I collected over four years of self-study
https://blog.csdn.net/m0_37907797/article/details/102781027
Printing pandas DataFrames without line wrapping
https://blog.csdn.net/Gooooa/article/details/79527759
pandas pd.merge explained
https://blog.csdn.net/brucewong0516/article/details/82707492
Using set_index and reset_index in pandas
https://blog.csdn.net/cuit2016123070/article/details/83624074
The pandas .reset_index() function
https://blog.csdn.net/weixin_43655282/article/details/97889398
A hands-on Douban crawler, by 棕熊永动机
https://blog.csdn.net/johnchang0201/article/details/103506364
A plotly usage guide
https://blog.csdn.net/u012897374/article/details/77857980
Plotly, a visualization powerhouse
https://www.jianshu.com/p/e5fb1b5c0957
The astype() function
https://www.cnblogs.com/xxswkl/p/10989212.html
copy() vs deepcopy() in Python
https://blog.csdn.net/qq_32907349/article/details/52190796
Using .value_counts() in pandas
https://www.jianshu.com/p/f773b4b82c66
Converting times with pandas to_datetime
https://blog.csdn.net/qq_36523839/article/details/79746977
Plotly Python Open Source Graphing Library
https://plotly.com/python/