python爬取豆瓣热门电影并实现数据可视化(小白学习记录)

python爬取豆瓣热门电影并实现数据可视化(小白学习记录)

# 主程序
import requests
import json
from bs4 import BeautifulSoup
import pandas as pd
import time

#pandas查看数据显示不全问题的解决
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
pd.set_option('display.width', 5000)


# 获取热门电影url
def HTML(url):
    """Fetch the Douban hot-movie JSON feed and return each movie's page URL.

    Args:
        url: Address of the ``search_subjects`` JSON endpoint.

    Returns:
        A list holding the ``url`` field of every entry under ``subjects``.
        When the request fails or the payload lacks the expected keys, an
        error line starting with ``wrong`` is printed and an empty list is
        returned, so callers can always iterate over the result.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}

    try:
        # A timeout keeps the script from hanging forever on a stalled connection.
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return [subject['url'] for subject in r.json()['subjects']]
    except (requests.RequestException, ValueError, KeyError, TypeError) as err:
        # ValueError covers a body that is not valid JSON; KeyError/TypeError
        # cover a JSON body without the expected "subjects"/"url" structure.
        print("wrong", err)
        return []
 # 获取每部电影信息

def each_movie(url):
    """Scrape one Douban movie page and append its details to ``movie_list``.

    Structured metadata is read from the page's JSON-LD block
    (``<script type="application/ld+json">``); the producing country and the
    runtime are read from the ``<div id="info">`` section.

    Args:
        url: Address of the movie's detail page.

    Returns:
        The module-level ``movie_list`` with one tuple appended:
        (name, rating, director, actors, genres, country, release date,
        runtime, description).

    Raises:
        requests.HTTPError: If the page answers with an error status.
        ValueError: If the page carries no JSON-LD block or it is not valid JSON.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
    }
    r = requests.get(url=url, headers=headers, timeout=10)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    # Select the JSON-LD block by its type rather than by a fixed position in
    # the list of <script> tags, and parse the tag's contents only (the tag
    # markup itself is not JSON).
    ld_script = soup.find('script', attrs={'type': 'application/ld+json'})
    raw_json = ld_script.string if ld_script is not None else None
    if not raw_json:
        raise ValueError(f"no JSON-LD metadata found on {url}")
    # strict=False lets json accept raw control characters inside string values.
    data = json.loads(raw_json, strict=False)

    movieInfo = soup.find('div', attrs={'id': 'info'})
    rate = data["aggregateRating"]['ratingValue']

    # Some entries list no director; fall back to an empty string.
    directors = data['director']
    director = directors[0]['name'] if directors else ''

    # The release node reads like "2020-01-25(Country)"; keep what is between
    # the parentheses instead of assuming the date prefix is 10 characters.
    release_node = movieInfo.find(attrs={'property': "v:initialReleaseDate"})
    release_text = (release_node.string or '') if release_node is not None else ''
    if '(' in release_text and ')' in release_text:
        made_country = release_text[release_text.find('(') + 1:release_text.rfind(')')]
    else:
        made_country = ''

    runtime_node = movieInfo.find(attrs={'property': "v:runtime"})
    runtime = runtime_node.string if runtime_node is not None else ''

    actor_list = [actor['name'] for actor in data['actor']]

    movie_list.append((data['name'], rate, director, actor_list, data['genre'], made_country, data['datePublished'], runtime, data['description']))
    return movie_list


# movie_list starts with the header tuple; each_movie() appends one tuple per
# movie after it.
movie_list = []
movie_list.append(("电影名称", "评分","导演", "主演", "类型", "制方国家", "上映时间", "片长", "介绍"))
url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&page_limit=50&page_start=0'
# Guard against HTML() yielding nothing when the listing request fails.
ls = HTML(url) or []
for item in ls:
    each_movie(item)
    time.sleep(1)  # pause between page requests
# Use the header tuple as column labels instead of storing it as a data row,
# so columns can be addressed by name (df['评分'], df['类型'], ...) and the CSV
# gets a single, meaningful header line.
df = pd.DataFrame(movie_list[1:], columns=movie_list[0])
df.to_csv(r'douban.csv',index=False)
print(df)

然后对数据做一些简单的处理,便于后面的可视化
主演和类型传进去的都是列表类型,我先把外面的方括号去掉;上映时间则转成日期格式。


def _list_to_text(value):
    """Turn a list cell into comma-separated text; strip brackets from text cells."""
    if isinstance(value, (list, tuple)):
        return ','.join(str(part) for part in value)
    if isinstance(value, str):
        return value.strip('[]')
    return value  # leave NaN / other values untouched


# The cast and genre cells may hold real Python lists (fresh scrape) or their
# bracketed text form (e.g. after a CSV round trip); .str.strip() alone would
# turn real lists into NaN, so both shapes are handled explicitly.
for column in ('主演', '类型'):
    df[column] = df[column].map(_list_to_text)
# The rating may arrive as text (TODO confirm against the scraped data);
# the statistics computed later need a numeric column.
df['评分'] = pd.to_numeric(df['评分'], errors='coerce')
df['上映时间']=pd.to_datetime(df['上映时间'],format='%Y-%m-%d')
df.info()  # show per-column dtype and non-null counts

做一些简单的数据清洗(写两种方法)
1.基于正态分布的离群点检测

import matplotlib
import matplotlib.pyplot as plt
# In a Jupyter notebook, run the magic `%matplotlib inline` in its own cell;
# it is not valid syntax in a plain Python script.

# Three-sigma rule: flag ratings further than 3 standard deviations from the mean.
rate = df['评分'].mean()
std = df['评分'].std()
# Print the checks; as bare expressions their results would be thrown away.
print("any rating above mean + 3*std:", (df['评分'] > rate + 3 * std).any())
print("any rating below mean - 3*std:", (df['评分'] < rate - 3 * std).any())
# Duplicate titles could be checked with df.duplicated(subset=['电影名称']).
# Histogram (normalised to a density) overlaid with a kernel density estimate.
df['评分'].plot(kind='hist', bins=30, density=True)
df['评分'].plot(kind='kde')
plt.show()

2. 盖帽法(上限取 95% 分位数,下限取 1% 分位数)

# Capping: limit ratings to the [1% quantile, 95% quantile] range.
q95 = df['评分'].quantile(q=0.95)
q1 = df['评分'].quantile(q=0.01)
# clip() fills the new column for every row: values inside the range are kept
# as they are, values outside are replaced by the nearest bound. Assigning
# only the out-of-range rows would leave all other rows as NaN.
df['新评分'] = df['评分'].clip(lower=q1, upper=q95)
print(df[['评分','新评分']].describe())
df['新评分'].plot(kind='box')
plt.show()

然后是可视化
做了制方国家和类型的,其实是一样的

制方国家

import matplotlib as mpl
import matplotlib.pyplot as plt
# plt.rcParams['font.sans-serif'] = ['SimHei']
# NOTE(review): the line above presumably selects a CJK-capable font so the
# Chinese labels render; enable it if they show up as empty boxes.

# Count how many movies list each producing country: split the "/"-separated
# cell into one row per country, trim surrounding blanks, then tally.
area_count = df['制方国家'].str.split('/').explode().str.strip().value_counts()
# Ascending order puts the largest bar at the top of the horizontal chart.
area_df = area_count.sort_values().rename_axis(None).to_frame('count')
area_df.plot.barh()
plt.show()

类型

mpl.style.use('ggplot')
# Count how many movies carry each genre: split the ","-separated cell into
# one row per genre, then tally. Blanks and quote characters around each token
# are trimmed so the same genre is not counted under several spellings.
area_count = df['类型'].str.split(',').explode().str.strip(" '\"").value_counts()
# Ascending order puts the largest bar at the top of the horizontal chart.
area_df = area_count.sort_values().rename_axis(None).to_frame('count')
area_df.plot.barh()
plt.show()

好了,就到这啦,基础太差啦
这个大作业做了好久
还有很不满意的就是从网页代码提取信息的时候不会用正则表达式,希望大家帮忙指正!!!
要继续加油哦

你可能感兴趣的:(python爬取豆瓣热门电影并实现数据可视化(小白学习记录))