# 主程序
import requests
import json
from bs4 import BeautifulSoup
import pandas as pd
import time
# Work around pandas truncating printed frames: lift the row/column limits,
# measure East Asian characters by their display width, and allow very long
# output lines.
for _option, _value in (
    ('display.max_columns', None),
    ('display.max_rows', None),
    ('display.unicode.ambiguous_as_wide', True),
    ('display.unicode.east_asian_width', True),
    ('display.width', 5000),
):
    pd.set_option(_option, _value)
# Fetch the detail-page URLs of the popular movies.
def HTML(url, timeout=10):
    """Return the movie page URLs listed by a JSON ``subjects`` endpoint.

    Args:
        url: Address of an endpoint whose JSON payload has a ``subjects``
            list, each item carrying a ``url`` field.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        A list of URL strings. When the request fails or the payload does
        not have the expected shape, ``wrong`` is printed together with the
        error and an empty list is returned, so callers can always iterate
        over the result.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return [subject['url'] for subject in r.json()['subjects']]
    except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
        # Network/HTTP errors, invalid JSON (ValueError on older requests
        # versions) or an unexpected payload shape: stay best-effort like
        # before, but say what went wrong and hand back something iterable.
        print("wrong:", exc)
        return []
# Scrape the details of a single movie page.
def _info_text(info, prop):
    """Return the text of the element with ``property=prop`` inside *info*, or ''."""
    node = info.find(attrs={'property': prop}) if info is not None else None
    if node is None or node.string is None:
        return ''
    return str(node.string)


def _release_region(release):
    """Return the part in parentheses of a string like ``2020-01-25(中国大陆)``, or ''."""
    _, paren, tail = release.strip().partition('(')
    if not paren:
        return ''
    return tail[:-1] if tail.endswith(')') else tail


def each_movie(url, timeout=10):
    """Download one movie page and append its record to the global ``movie_list``.

    The record is a tuple of (name, rating, first director, list of actor
    names, genre, release region, publication date, runtime, description).
    Most fields come from the JSON metadata embedded in a ``<script>`` tag;
    release region and runtime are read from the ``<div id="info">`` block.
    Fields missing from the page are stored as empty values instead of
    aborting the scrape.

    Args:
        url: Address of the movie detail page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The module-level ``movie_list`` with the new record appended.

    Raises:
        requests.RequestException: On network failure or an HTTP error status.
        ValueError: If no embedded JSON metadata is found or it cannot be
            parsed.
        KeyError: If the metadata lacks one of the required keys.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
    }
    r = requests.get(url=url, headers=headers, timeout=timeout)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    # Look the metadata up by its type rather than by position, and read the
    # tag's *content*: str(tag) would include the <script> markup itself,
    # which json.loads cannot parse.
    # NOTE(review): assumes the page embeds its metadata as JSON-LD
    # (type="application/ld+json"); confirm against a live page.
    ld_script = soup.find('script', attrs={'type': 'application/ld+json'})
    if ld_script is None:
        # Fall back to the position the original code relied on (8th script).
        scripts = soup.find_all('script')
        ld_script = scripts[7] if len(scripts) > 7 else None
    if ld_script is None or not ld_script.string:
        raise ValueError("no embedded JSON metadata found at {}".format(url))
    # strict=False tolerates raw control characters inside JSON strings.
    data = json.loads(ld_script.string, strict=False)

    movie_info = soup.find('div', attrs={'id': 'info'})
    rate = data["aggregateRating"]['ratingValue']
    directors = data.get('director') or []
    director = directors[0]['name'] if directors else ''
    made_country = _release_region(_info_text(movie_info, 'v:initialReleaseDate'))
    runtime = _info_text(movie_info, 'v:runtime')
    actor_list = [actor['name'] for actor in data.get('actor') or []]

    movie_list.append((data['name'], rate, director, actor_list, data['genre'],
                       made_country, data['datePublished'], runtime,
                       data['description']))
    return movie_list
# Crawl the popular-movie list, scrape every movie page, and store the
# result in douban.csv.
movie_list = []
# Column labels, in the order each_movie() builds its record tuples. They are
# passed to the DataFrame as real column names; appending them to movie_list
# as a data row would leave the frame with integer column labels and break
# every later lookup such as df['主演'].
columns = ["电影名称", "评分", "导演", "主演", "类型", "制方国家", "上映时间", "片长", "介绍"]
url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&page_limit=50&page_start=0'
# HTML() yields nothing usable when the listing request fails; treat that as
# an empty list instead of iterating over None.
ls = HTML(url) or []
for item in ls:
    try:
        each_movie(item)
    except (requests.RequestException, ValueError, KeyError, IndexError,
            AttributeError, TypeError) as exc:
        # One broken page should not throw away everything scraped so far.
        print("skipped {}: {}".format(item, exc))
    # Pause between requests to stay polite to the server.
    time.sleep(1)
df = pd.DataFrame(movie_list, columns=columns)
df.to_csv(r'douban.csv', index=False)
print(df)
然后对数据做一些简单的处理,便于后面的可视化。
「主演」和「类型」传进去的是列表类型,我先把外面的方括号去掉;「上映时间」则转成日期格式。
```python
def _names_to_text(value):
    """Turn a list of names into 'a,b,c'; strip the brackets off a stringified list."""
    if isinstance(value, (list, tuple)):
        return ','.join(str(name) for name in value)
    if isinstance(value, str):
        return value.strip('[]')
    return value


# The cast and genre cells hold Python lists when df comes straight from the
# scraper; the .str accessor returns NaN for non-string cells, so convert the
# lists explicitly. Cells that are already text keep the old behaviour.
df['主演'] = df['主演'].map(_names_to_text)
df['类型'] = df['类型'].map(_names_to_text)
# NOTE(review): ratings taken straight from the page's JSON may be text;
# coerce them to numbers so the statistics further down work. Already-numeric
# columns are unaffected.
df['评分'] = pd.to_numeric(df['评分'], errors='coerce')
df['上映时间'] = pd.to_datetime(df['上映时间'], format='%Y-%m-%d')
df.info()  # summary of every column
```
做一些简单的数据清洗(写两种方法)
1. 基于正态分布的离群点检测
# Three-sigma rule: a rating further than three standard deviations from the
# mean is treated as an outlier.
rate = df['评分'].mean()
std = df['评分'].std()
has_high_outlier = bool((df['评分'] > rate + 3 * std).any())
has_low_outlier = bool((df['评分'] < rate - 3 * std).any())
# Report both results explicitly; as bare expressions they were discarded
# (a notebook only echoes the last one, a script none).
print("ratings above mean + 3*std:", has_high_outlier)
print("ratings below mean - 3*std:", has_low_outlier)
import matplotlib
import matplotlib.pyplot as plt
# IPython/Jupyter magic that renders matplotlib figures inside the notebook.
# It is not Python syntax, so this snippet only runs in IPython/Jupyter.
%matplotlib inline
# NOTE(review): disabled duplicate check. `subset` expects column labels
# (e.g. '电影名称'), not the column's values, so it would need fixing before
# being re-enabled.
# df.duplicated(subset=df['电影名称'])
# Histogram of the ratings: 30 bins, normalised so the bar areas sum to 1.
df.评分.plot(kind='hist',bins=30,density=True)
# Kernel density estimate of the same column; intended as an overlay on the
# histogram (presumably drawn on the same axes; confirm in the notebook).
df.评分.plot(kind='kde' )
plt.show()
2. 95% 盖帽法
# Capping (winsorising): ratings above the 95th percentile are pulled down to
# it, ratings below the 1st percentile are pulled up to it.
q95 = df['评分'].quantile(q=0.95)
q1 = df['评分'].quantile(q=0.01)
# clip() keeps every in-range rating unchanged. Assigning only the rows
# outside the bounds left all other rows of the new column as NaN, so the
# summary and box plot described just the capped values.
df['新评分'] = df['评分'].clip(lower=q1, upper=q95)
# Print the before/after comparison; as a bare expression it was never shown.
print(df[['评分', '新评分']].describe())
df['新评分'].plot(kind='box')
plt.show()
然后是可视化
做了制片国家(地区)和类型两个维度的统计,做法其实是一样的
制片国家(地区)
import matplotlib as mpl
import matplotlib.pyplot as plt
# Optional: select a font that contains Chinese glyphs so the labels render
# (assumes the SimHei font is installed on the machine).
# plt.rcParams['font.sans-serif'] = ['SimHei']

# A cell may list several regions separated by '/': split, flatten to one
# region per row, then count how often each region occurs. This replaces the
# per-row apply(pd.Series) expansion and the deprecated top-level
# pd.value_counts while producing the same totals.
area_count = (
    df['制方国家']
    .str.split('/')
    .explode()
    .str.strip()
    .value_counts()
)
# Ascending order puts the most frequent region at the top of the bar chart.
area_df = area_count.sort_values().to_frame('count').rename_axis(None)
area_df.plot.barh()
plt.show()
类型
mpl.style.use('ggplot')
# A cell lists several genres separated by ',': split, flatten to one genre
# per row, then count how often each genre occurs. Surrounding spaces and
# quote characters (left over when the cell is a stringified list) are
# removed so the same genre is not counted under several labels.
area_count = (
    df['类型']
    .str.split(',')
    .explode()
    .str.strip(" '\"")
    .value_counts()
)
# Ascending order puts the most frequent genre at the top of the bar chart.
area_df = area_count.sort_values().to_frame('count').rename_axis(None)
area_df.plot.barh()
plt.show()
好了,就到这啦,基础太差啦
这个大作业做了好久
还有很不满意的就是从网页代码提取信息的时候不会用正则表达式,希望大家帮忙指正!!!
要继续加油哦