Abstract: IMDB电影数据分析练习。
数据集
包含来自 MovieLens 电影推荐服务的 5 星评分和文本标签数据,以及 1950-2012 年 IMDB TOP 10000 排行榜数据
MovieLens数据集包含27278部电影的20000263份评分和465564次标签应用
实践内容
1.什么样题材的电影评分会相对较高(较低)
2.电影时长对评分是否有影响
3.不同年代什么类型电影较受欢迎
4.其他自选角度
1.构建数据框:理想情况下,把所有数据放入这个数据框中
2.清洗数据:对构建的数据框进行数据清理,它应该具有以下属性:
Each row describes a single object
Each column describes a property of that object
Columns are numeric whenever appropriate
3.探索全局特征:通过直方图,散点图,聚合函数等获得一个数据的全局的了解
4.探索分组特征。通过一些分组操作分析数据集
# Render matplotlib figures inline when running under IPython/Jupyter.
# The bare ``%matplotlib inline`` magic is only valid inside a notebook cell,
# so it is issued through the IPython API when a shell is available.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    # Plain Python interpreter: there is no IPython shell to configure.
    pass

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Let pandas print wide tables: up to 500 characters per line and up to
# 100 columns before it starts wrapping/truncating the output.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)


def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecessary plot borders and axis ticks.

    The top/right/left/bottom keywords toggle whether the corresponding plot
    border is drawn.

    Parameters
    ----------
    axes : matplotlib axes, optional
        The axes to modify. When omitted, the current axes (``plt.gca()``)
        are used.
    top, right, left, bottom : bool
        Whether the border (spine) on that side stays visible. Ticks are
        re-enabled only for sides whose flag is true.
    """
    ax = axes if axes is not None else plt.gca()

    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)

    # Turn off all ticks first ...
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')

    # ... then re-enable them on the sides that are kept.
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()
# 1. Build the data frame
data = pd.read_csv("../input/movielens/imdb10000.csv")
print(data.head())  # print the first 5 rows

# 2. Clean the data
# Problems with the raw data:
#   1. runtime is stored as a string and should be numeric;
#   2. genres is not atomic, which makes it hard to pick out a single genre;
#   3. the release year is duplicated in the title and in the year column.

# 2.1 Fix the runtime column (list comprehension)
# Only the part before the first space is converted; presumably the values
# look like "<number> <unit>" -- confirm against the CSV.
clean_runtime = [float(r.split(' ')[0]) for r in data.runtime]
data['runtime'] = clean_runtime
print(data.head())

# 2.2 Split the genre information into indicator columns: one new column per
# genre, each cell True or False.
# Determine the unique genres.
genres = set()
for m in data.genres:
    genres.update(m.split('|'))
genres = sorted(genres)

# Make a column for each genre.
for genre in genres:
    data[genre] = [genre in movie.split('|') for movie in data.genres]
print(data.head())
# NOTE: `genres` must stay populated here; the later sections index the
# frame with it (data[genres]) and iterate over it for the small multiples.

# 2.3 Remove the year information from the title
# Drop the last 6 characters, then strip any whitespace left in front of
# them (presumably the titles end in " (YYYY)" -- confirm against the CSV).
data['title'] = [t[0:-6].rstrip() for t in data.title]
print(data.head())

# 3. Explore global properties
print(data[['score', 'runtime', 'year', 'votes']].describe())

# 4. Find broken data and clean it
# How many movies have a runtime of 0?
print(len(data[data.runtime == 0]))
# Flag them as NaN. Use .loc so the assignment hits `data` itself rather
# than a temporary produced by chained indexing.
data.loc[data.runtime == 0, 'runtime'] = np.nan

# 5. Explore local properties with some basic visualizations
# 5.1 Release year distribution
# NOTE(review): the original plotted data.score under the "Release Year"
# label; the year column is plotted here to match the label -- confirm.
plt.hist(data.year, bins=30, color='#000000')
plt.xlabel('Release Year')
remove_border()
plt.show()

# 5.2 Runtime distribution
# The zero runtimes were replaced by NaN above; drop them before binning.
plt.hist(data.runtime.dropna(), bins=50, color='#000000')
plt.xlabel('Runtime distribution')
plt.show()

# 5.3 IMDB Rating
# plt.scatter(): draw a scatter plot of rating against release year.
plt.scatter(data.year, data.score, lw=0, alpha=.08, color='k')
plt.xlabel("Year")
plt.ylabel("IMDB Rating")
remove_border()
plt.show()

# 6. Look for outliers
# 6.1 Low rating but a high number of votes
# Combine the two boolean masks first, then select the columns.
print(data[(data.votes > 9e4) & (data.score < 5)][['title', 'year', 'score', 'votes', 'genres']])

# 6.2 Lowest rated movies
print(data[data.score == data.score.min()][['title', 'year', 'score', 'votes', 'genres']])

# 6.3 Highest rated movies
print(data[data.score == data.score.max()][['title', 'year', 'score', 'votes', 'genres']])

# 7. Analyse rows/columns with aggregation functions such as sum
# 7.1 Which genres occur most often?
# Sort the per-genre totals in descending order while keeping the genre
# names as the index.
genre_count = data[genres].sum().sort_values(ascending=False)
print(pd.DataFrame({'genre count': genre_count}))

# 7.2 How many genre tags does an average movie have?
# Summing the boolean genre indicator columns across each row gives the
# number of genres tagged on that movie.
genre_count = data[genres].sum(axis=1)
print("average movie has %0.2f genres" % genre_count.mean())
print(genre_count.describe())

# 8. Explore group properties
# 8.1 Assign each movie to its decade
decade = (data.year // 10) * 10
# Work on an explicit copy so adding a column does not write into a slice
# of `data`.
tyd = data[['title', 'year']].copy()
tyd['decade'] = decade
print(tyd.head())

# 8.2 Group the movies by decade (groupby)
decade_mean = data.groupby(decade).score.mean()
decade_mean.name = 'Decade Mean'
print(decade_mean)

plt.plot(decade_mean.index, decade_mean.values, 'o-',
         color='r', lw=3, label='Decade Average')
plt.scatter(data.year, data.score, alpha=.04, lw=0, color='k')
plt.xlabel("Year")
plt.ylabel("Score")
plt.legend(frameon=False)
remove_border()
plt.show()

# 8.3 Look at the spread of the scores within each decade
grouped_scores = data.groupby(decade).score
mean = grouped_scores.mean()
std = grouped_scores.std()

plt.plot(mean.index, mean.values, 'o-',
         color='r', lw=3, label='Decade Average')
# Shade one standard deviation either side of the decade mean.
plt.fill_between(mean.index, (mean + std).values, (mean - std).values,
                 color='r', alpha=.2)
plt.scatter(data.year, data.score, alpha=.04, lw=0, color='k')
plt.xlabel("Year")
plt.ylabel("Score")
plt.legend(frameon=False)
remove_border()
plt.show()

# 9. Small multiples
# 9.1 Split the data by genre and compare each genre's release-year
# distribution with the distribution over all movies.
fig, axes = plt.subplots(nrows=4, ncols=6, figsize=(12, 8), tight_layout=True)
bins = np.arange(1950, 2013, 3)
for ax, genre in zip(axes.ravel(), genres):
    # `density=True` replaces the `normed=True` keyword, which newer
    # matplotlib releases no longer accept.
    ax.hist(data[data[genre] == 1].year, bins=bins, histtype='stepfilled',
            density=True, color='r', alpha=.3, ec='none')
    # All movies, drawn behind the genre histogram (zorder=0) for reference.
    ax.hist(data.year, bins=bins, histtype='stepfilled', ec='None',
            density=True, zorder=0, color='#cccccc')
    ax.annotate(genre, xy=(1955, 3e-2), fontsize=14)
    ax.xaxis.set_ticks(np.arange(1950, 2013, 30))
    ax.set_yticks([])
    remove_border(ax, left=False)
    ax.set_xlabel('Year')
plt.show()
btw:欢迎关注 ~
Github: https://github.com/ScarlettYellow
个人博客:https://scarletthuang.cn/