电影评分数据

import pandas as pd
import os #导入os模块
encoding = 'latin1'  # text encoding handed to every read_csv call below
# Each table is read into its own pandas DataFrame.
# pandas offers two core containers: DataFrame, a 2-D labelled table, and
# Series, a 1-D array of values paired with an index of labels.
# os.path.expanduser(path) replaces a leading "~" / "~user" with the user's
# home directory; os.path.expandvars(path) substitutes "$name" / "${name}"
# from the environment.
upath = os.path.expanduser('ch02/movielens/users.dat')
rpath = os.path.expanduser('ch02/movielens/ratings.dat')
mpath = os.path.expanduser('ch02/movielens/movies.dat')

# Column names for each table (the files are read with header=None).
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
mnames = ['movie_id', 'title', 'genres']

# Fields are separated by the two-character delimiter "::". pandas treats a
# multi-character separator as a regular expression, which only the Python
# parsing engine supports, so the engine is requested explicitly instead of
# relying on the automatic fallback that emits a ParserWarning.
users = pd.read_csv(upath, sep='::', header=None, names=unames,
                    encoding=encoding, engine='python')
ratings = pd.read_csv(rpath, sep='::', header=None, names=rnames,
                      encoding=encoding, engine='python')
movies = pd.read_csv(mpath, sep='::', header=None, names=mnames,
                     encoding=encoding, engine='python')
/Users/zhongyaode/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  if __name__ == '__main__':
/Users/zhongyaode/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:2: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  from ipykernel import kernelapp as app
/Users/zhongyaode/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:3: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  app.launch_new_instance()

# Preview the first rows of each table; in a notebook the value of a bare
# expression is displayed as the cell output.
users.head(20)

ratings.head(5)

movies.head(5)

# Bare name: display the ratings DataFrame itself.
ratings

# Combine the three tables: ratings joined with users, then with movies.
# Without `on=`, pandas joins on whatever column names the two frames share;
# the keys are spelled out so the join cannot silently change if another
# shared column name ever appears. The merge is performed once.
data = pd.merge(pd.merge(ratings, users, on='user_id'), movies, on='movie_id')
data

data['rating'].mean()  # mean of the 'rating' column over every row of data
3.5815644530293169
data.loc[1]  # row whose index label is 1 (`.ix` was removed in pandas 1.0)
user_id                                            2
movie_id                                        1193
rating                                             5
timestamp                                  978298413
gender                                             M
age                                               56
occupation                                        16
zip                                            70072
title         One Flew Over the Cuckoo's Nest (1975)
genres                                         Drama
Name: 1, dtype: object

按性别计算每部电影的平均得分,可以使用pivot_table

# Average rating per title, with one column per gender value.
mean_ratings = data.pivot_table(
    values='rating',
    index='title',
    columns='gender',
    aggfunc='mean',
)
mean_ratings.head(7)

# Count the ratings each title received: group the rows by title and take the
# size() of every group, which yields a Series indexed by title. The counts
# serve to filter out titles with fewer than 250 ratings.
ratings_by_title = data.groupby(by='title').size()
ratings_by_title.head(10)
title
$1,000,000 Duck (1971)                37
'Night Mother (1986)                  70
'Til There Was You (1997)             52
'burbs, The (1989)                   303
...And Justice for All (1979)        199
1-900 (1994)                           2
10 Things I Hate About You (1999)    700
101 Dalmatians (1961)                565
101 Dalmatians (1996)                364
12 Angry Men (1957)                  616
dtype: int64
# Index of the titles that received at least 250 ratings.
active_titles = ratings_by_title[ratings_by_title >= 250].index
active_titles
Index([''burbs, The (1989)', '10 Things I Hate About You (1999)',
       '101 Dalmatians (1961)', '101 Dalmatians (1996)', '12 Angry Men (1957)',
       '13th Warrior, The (1999)', '2 Days in the Valley (1996)',
       '20,000 Leagues Under the Sea (1954)', '2001: A Space Odyssey (1968)',
       '2010 (1984)',
       ...
       'X-Men (2000)', 'Year of Living Dangerously (1982)',
       'Yellow Submarine (1968)', 'You've Got Mail (1998)',
       'Young Frankenstein (1974)', 'Young Guns (1988)',
       'Young Guns II (1990)', 'Young Sherlock Holmes (1985)',
       'Zero Effect (1998)', 'eXistenZ (1999)'],
      dtype='object', name='title', length=1216)

# Keep only the rows for the active titles. `.ix` was deprecated in pandas
# 0.20 and removed in 1.0; `.loc` performs the same label-based selection.
mean_ratings = mean_ratings.loc[active_titles]
mean_ratings

为了了解女性观众最喜欢的电影,对F列进行降序排列

# Order the titles by the 'F' column, highest mean rating first
# (sort_values is used because sort_index is deprecated for this purpose).
top_female_ratings = mean_ratings.sort_values('F', ascending=False)
top_female_ratings.head(10)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

data['age'].mean()
29.738313692438279
data['age'].max()
56
data['age'].min()
1
data['age'].var()  # var() is the variance; std() gives the standard deviation
138.10909427256377
# Draw a histogram of the 'age' column with 5 bins.
x = data['age']
numBins = 5
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.hist(x, bins=numBins, color='red', alpha=0.8, rwidth=0.5)
plt.title('age')
plt.show()

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
%matplotlib inline
%config InlineBackend.figure_format='retina'
def normfun(x, mu, sigma):
    """Evaluate the normal (Gaussian) probability density at *x*.

    Computes ``exp(-(x - mu)**2 / (2 * sigma**2)) / (sigma * sqrt(2 * pi))``.

    Args:
        x: Point(s) at which to evaluate the density; a scalar or a NumPy
            array (the arithmetic is applied element-wise).
        mu: Mean of the distribution.
        sigma: Standard deviation of the distribution.

    Returns:
        The density value(s) computed element-wise from ``x``.
    """
    exponent = -((x - mu) ** 2) / (2 * sigma ** 2)
    return np.exp(exponent) / (sigma * np.sqrt(2 * np.pi))
p = data['age']
# The curve parameters must exist before normfun is called, so they are
# computed here from the data being plotted.
mean = p.mean()
std = p.std()
# Evaluate the normal curve at the integers 1..59 (start 1, stop 60, step 1).
x = np.arange(1, 60, 1)
y = normfun(x, mean, std)
plt.plot(x, y)
# bins: number of histogram groups; rwidth: bar width relative to the bin
# width; density=True normalises the bars to a probability density so they
# share a scale with the curve (`normed` was removed in Matplotlib 3.1).
plt.hist(p, bins=6, rwidth=0.9, density=True)
plt.title("age")
# NOTE(review): the x-label text looks copied from another example; confirm.
plt.xlabel("stakes")
plt.ylabel("Probability")
plt.show()
# Evaluate the normal curve at the integers 1..59 (start 1, stop 60, step 1).
x = np.arange(1, 60, 1)
y = normfun(x, mean, std)
plt.plot(x, y)
# bins: number of histogram groups; rwidth: bar width relative to the bin
# width; density=True normalises the bars to a probability density
# (`normed` was removed in Matplotlib 3.1).
plt.hist(p, bins=2, rwidth=0.9, density=True)
# NOTE(review): title and x-label look copied from another example; confirm.
plt.title("time")
plt.xlabel("stakes")
plt.ylabel("Probability")
plt.show()
# Normal curve evaluated at the integers 1..59 (arange's default step is 1).
x = np.arange(1, 60)
y = normfun(x, mean, std)
plt.plot(x, y)
![output_34_0.png](http://upload-images.jianshu.io/upload_images/2007820-408b82f75a63a3f3.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
![](http://upload-images.jianshu.io/upload_images/2007820-cb7ffce4a59d0504.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
![](http://upload-images.jianshu.io/upload_images/2007820-a8fce3d1a47c8184.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
![](http://upload-images.jianshu.io/upload_images/2007820-9271b213266a0748.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
![](http://upload-images.jianshu.io/upload_images/2007820-0554285cc748af49.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
# bins: number of histogram groups; rwidth: bar width relative to the bin
# width; density=True normalises the bars to a probability density
# (`normed` was removed in Matplotlib 3.1).
plt.hist(p, bins=6, rwidth=0.9, density=True)
# NOTE(review): title and x-label look copied from another example; confirm.
plt.title("time")
plt.xlabel("stakes")
plt.ylabel("Probability")
plt.show()
len(p)  # number of records in p
1000209
std=p.std()  # standard deviation of the values in p
std
11.751982567744209
# Mean and standard deviation of p, used as the normal-curve parameters.
mean, std = p.mean(), p.std()
结果分析:观影用户的平均年龄为29.74岁,大部分人的年龄在20~30岁之间。
标准差是11.75,波动较小;若年龄近似服从正态分布,约有68%的人年龄在29.74-11.75到29.74+11.75之间(即约18~41.5岁)。
数据显示10岁以下的人很少,广告投放的目标年龄应该控制在20~60岁之间。
a = p[:100000]  # first 100000 values of p (the author's "first 10%" sample)
# Evaluate the normal curve at the integers 1..59 (start 1, stop 60, step 1).
x = np.arange(1, 60, 1)
y = normfun(x, mean, std)
plt.plot(x, y)
# bins: number of histogram groups; rwidth: bar width relative to the bin
# width; density=True normalises the bars to a probability density
# (`normed` was removed in Matplotlib 3.1).
plt.hist(a, bins=6, rwidth=0.9, density=True)
plt.title("age")
# NOTE(review): the x-label text looks copied from another example; confirm.
plt.xlabel("stakes")
plt.ylabel("Probability")
plt.show()
c = p[:10000]  # first 10000 values of p (the author's "first 1%" sample)
# Evaluate the normal curve at the integers 1..59 (start 1, stop 60, step 1).
x = np.arange(1, 60, 1)
y = normfun(x, mean, std)
plt.plot(x, y)
# bins: number of histogram groups; rwidth: bar width relative to the bin
# width; density=True normalises the bars to a probability density
# (`normed` was removed in Matplotlib 3.1).
plt.hist(c, bins=6, rwidth=0.9, density=True)
plt.title("age")
# NOTE(review): the x-label text looks copied from another example; confirm.
plt.xlabel("stakes")
plt.ylabel("Probability")
plt.show()
符合上面的结论:
##时间有限先到这里,相信随着深入会体会到更多的乐趣

你可能感兴趣的:(电影评分数据)