数据分析:MovieLens 1M 数据集

# -*- coding: utf-8 -*-  

import pandas as pd

# Data source: www.grouplens.org/node/73
# Each .dat file is '::'-delimited and has no header row, so the column names
# are supplied explicitly. A multi-character separator is only supported by
# the Python parsing engine; requesting it explicitly avoids the ParserWarning
# pandas emits when it has to fall back from the C engine.
# NOTE(review): under Python 3 movies.dat may need encoding='latin-1' --
# confirm against the actual dataset files.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table('ml-1m/users.dat', sep='::', header=None,
                      names=unames, engine='python')
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('ml-1m/ratings.dat', sep='::', header=None,
                        names=rnames, engine='python')
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('ml-1m/movies.dat', sep='::', header=None,
                       names=mnames, engine='python')

# Combine the three tables into a single frame: ratings are joined to users
# first, then the result is joined to movies. No key is given, so each merge
# joins on the column names the two frames have in common. merge() returns a
# new DataFrame; none of the inputs is modified.
data = ratings.merge(users).merge(movies)

# Mean rating per movie title, split by gender: one row per title and one
# column per gender value. `index`/`columns` replace the legacy `rows`/`cols`
# keywords, which current pandas releases no longer accept.
mean_ratings = data.pivot_table(values='rating', index='title',
                                columns='gender', aggfunc='mean')

# Group by title; .size() yields a Series holding the number of rows
# (i.e. ratings) in each title group.
ratings_by_title = data.groupby('title').size()

# Keep only the titles that received at least 250 ratings.
active_titles = ratings_by_title[ratings_by_title >= 250].index

# Restrict the per-gender means to the active titles. Label-based selection
# uses .loc; the older .ix indexer has been removed from pandas.
mean_ratings = mean_ratings.loc[active_titles]

# Sort by the 'F' column in descending order, highest mean first.
# sort_values() replaces the removed sort_index(by=...) form.
top_female_ratings = mean_ratings.sort_values(by='F', ascending=False)

# Movies with the largest rating disagreement between genders.
# 'diff' is the 'M' mean minus the 'F' mean, so negative values are titles
# whose 'F' mean is higher and positive values are titles whose 'M' mean is
# higher.
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']

# Ascending sort puts the titles with the most negative diff first.
# sort_values() replaces the removed sort_index(by=...) form.
sorted_by_diff = mean_ratings.sort_values(by='diff')

# Reversing the order, e.g. sorted_by_diff[::-1][:15], gives the titles with
# the largest positive diff instead.

# Most divisive movies regardless of gender.
# Standard deviation of the ratings within each title group.
rating_std_by_title = data.groupby('title')['rating'].std()

# Restrict to the active titles (.loc replaces the removed .ix indexer).
rating_std_by_title = rating_std_by_title.loc[active_titles]

# Sort the Series by value and show the ten largest. ascending=False requests
# descending order. sort_values() replaces the removed Series.order(), and the
# parenthesised print works on both Python 2 and Python 3.
print(rating_std_by_title.sort_values(ascending=False)[:10])


你可能感兴趣的:(数据分析:MovieLens 1M 数据集)