import pandas as pd
import os #导入os模块
encoding = 'latin1'  # text encoding passed to every read_csv call below

# Load each MovieLens table into its own pandas DataFrame.
# pandas has two core structures: a DataFrame is a two-dimensional table,
# and a Series is a one-dimensional array of values paired with an array
# of labels called the index.
# os.path.expanduser(path) replaces a leading "~" or "~user" with the
# user's home directory; the relative paths below contain no "~", so they
# pass through unchanged.
# os.path.expandvars(path) substitutes "$name" / "${name}" from the
# environment (not used here).
upath = os.path.expanduser('ch02/movielens/users.dat')
rpath = os.path.expanduser('ch02/movielens/ratings.dat')
mpath = os.path.expanduser('ch02/movielens/movies.dat')

# Column names for each table (the files are read with header=None).
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
mnames = ['movie_id', 'title', 'genres']

# sep='::' is the field separator. Because it is longer than one character
# pandas treats it as a regular expression, which only the Python parser
# supports; naming engine='python' explicitly avoids the ParserWarning
# that the silent fallback from the C engine would emit.
# header=None means no row of the file is used as a header, so the column
# names come from names=...; encoding= selects the text encoding.
users = pd.read_csv(upath, sep='::', header=None, names=unames,
                    encoding=encoding, engine='python')
ratings = pd.read_csv(rpath, sep='::', header=None, names=rnames,
                      encoding=encoding, engine='python')
movies = pd.read_csv(mpath, sep='::', header=None, names=mnames,
                     encoding=encoding, engine='python')
/Users/zhongyaode/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
if __name__ == '__main__':
/Users/zhongyaode/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:2: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
from ipykernel import kernelapp as app
/Users/zhongyaode/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:3: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
app.launch_new_instance()
# Peek at the first rows of each table, then display the ratings table.
users.head(20)
ratings.head(5)
movies.head(5)
ratings
# Join ratings with users, then attach the movie details, giving one wide
# table with a row per rating.
# The join keys are named explicitly instead of being inferred by pandas
# from the overlapping column names, and the merge is performed only once.
data = pd.merge(pd.merge(ratings, users, on='user_id'), movies, on='movie_id')
data
data['rating'].mean()
3.5815644530293169
data.loc[1]  # row with index label 1 (.ix has been removed from pandas)
user_id 2
movie_id 1193
rating 5
timestamp 978298413
gender M
age 56
occupation 16
zip 70072
title One Flew Over the Cuckoo's Nest (1975)
genres Drama
Name: 1, dtype: object
按性别计算每部电影的平均得分,可以使用pivot_table
# Mean rating of every title, with one column per gender value.
mean_ratings = data.pivot_table(
    values='rating',
    index='title',
    columns='gender',
    aggfunc='mean',
)
mean_ratings.head(7)
# First step of dropping titles with fewer than 250 ratings: group by
# title and use size() to get a Series holding the size of each group.
title_groups = data.groupby('title')
ratings_by_title = title_groups.size()
ratings_by_title.head(10)
title
$1,000,000 Duck (1971) 37
'Night Mother (1986) 70
'Til There Was You (1997) 52
'burbs, The (1989) 303
...And Justice for All (1979) 199
1-900 (1994) 2
10 Things I Hate About You (1999) 700
101 Dalmatians (1961) 565
101 Dalmatians (1996) 364
12 Angry Men (1957) 616
dtype: int64
# Titles that received at least 250 ratings.
has_enough_ratings = ratings_by_title >= 250
active_titles = ratings_by_title.index[has_enough_ratings]
active_titles
Index([''burbs, The (1989)', '10 Things I Hate About You (1999)',
'101 Dalmatians (1961)', '101 Dalmatians (1996)', '12 Angry Men (1957)',
'13th Warrior, The (1999)', '2 Days in the Valley (1996)',
'20,000 Leagues Under the Sea (1954)', '2001: A Space Odyssey (1968)',
'2010 (1984)',
...
'X-Men (2000)', 'Year of Living Dangerously (1982)',
'Yellow Submarine (1968)', 'You've Got Mail (1998)',
'Young Frankenstein (1974)', 'Young Guns (1988)',
'Young Guns II (1990)', 'Young Sherlock Holmes (1985)',
'Zero Effect (1998)', 'eXistenZ (1999)'],
dtype='object', name='title', length=1216)
# Keep only the rows whose title is in active_titles. Label-based
# selection uses .loc, since .ix has been removed from pandas.
mean_ratings = mean_ratings.loc[active_titles]
mean_ratings
为了了解女性观众最喜欢的电影,对F列进行降序排列
# Order the titles by the 'F' column, highest mean rating first.
# (Author's note: sort_values is used because sort_index is deprecated for this.)
top_female_ratings = mean_ratings.sort_values('F', ascending=False)
top_female_ratings.head(10)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
# Summary statistics of the age column; the bare number after each
# statement is the recorded notebook output.
data['age'].mean()
29.738313692438279
data['age'].max()
56
data['age'].min()
1
data['age'].var()# var() is the variance; std() gives the standard deviation
138.10909427256377
# Draw a histogram of the ages on a single-axes figure.
x = data['age']
numBins = 5
fig, ax = plt.subplots()
ax.hist(x, bins=numBins, color='red', alpha=0.8, rwidth=0.5)
ax.set_title(u'age')
plt.show()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'
def normfun(x, mu, sigma):
    """Return the normal (Gaussian) probability density at x.

    Evaluates exp(-(x - mu)**2 / (2 * sigma**2)) / (sigma * sqrt(2 * pi)).

    x may be a scalar or a NumPy array; the arithmetic is element-wise.
    mu is the mean and sigma the standard deviation of the distribution.
    """
    pdf = np.exp(-((x - mu) ** 2) / (2 * sigma ** 2)) / (sigma * np.sqrt(2 * np.pi))
    return pdf
p = data['age']
# The mean and standard deviation of the ages parameterise the normal
# curve; they are computed here so they exist before normfun is called.
mean = p.mean()
std = p.std()
# Evaluate the curve at the integers 1..59 (start 1, stop 60, step 1).
x = np.arange(1, 60, 1)
y = normfun(x, mean, std)
plt.plot(x, y)
# p is the data, bins the number of bars and rwidth the bar width relative
# to the bin width. density=True scales the bars to a probability density
# so they share the y-axis with the curve; it replaces normed=True, which
# matplotlib has removed.
plt.hist(p, bins=6, rwidth=0.9, density=True)
plt.title("age")
plt.xlabel("stakes")
plt.ylabel("Probability")
plt.show()
# Evaluate the normal curve at the integers 1..59 (start 1, stop 60, step 1);
# mean and std are p.mean() and p.std().
x = np.arange(1, 60, 1)
y = normfun(x, mean, std)
plt.plot(x, y)
# Same data drawn with only 2 bins. density=True scales the bars to a
# probability density and replaces normed=True, which matplotlib has removed.
plt.hist(p, bins=2, rwidth=0.9, density=True)
plt.title("time")
plt.xlabel("stakes")
plt.ylabel("Probability")
plt.show()
# Evaluate the normal curve at the integers 1..59 and draw it.
x = np.arange(1, 60)
y = normfun(x, mean, std)
plt.plot(x, y)
![output_34_0.png](http://upload-images.jianshu.io/upload_images/2007820-408b82f75a63a3f3.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)

![](http://upload-images.jianshu.io/upload_images/2007820-cb7ffce4a59d0504.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)

![](http://upload-images.jianshu.io/upload_images/2007820-a8fce3d1a47c8184.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)

![](http://upload-images.jianshu.io/upload_images/2007820-9271b213266a0748.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)

![](http://upload-images.jianshu.io/upload_images/2007820-0554285cc748af49.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
# p is the data, bins the number of bars and rwidth the bar width relative
# to the bin width. density=True scales the bars to a probability density
# and replaces normed=True, which matplotlib has removed.
plt.hist(p, bins=6, rwidth=0.9, density=True)
plt.title("time")
plt.xlabel("stakes")
plt.ylabel("Probability")
plt.show()
len(p)# number of records
1000209
std=p.std()
std
11.751982567744209
# mean and std are the two parameters passed to normfun(x, mean, std)
mean=p.mean()
std=p.std()
结果分析:观影用户的平均年龄为29.74,大部分人的年纪在20~30之间。
标准差是11.75,波动较小;若年龄近似服从正态分布,约有68%的人年纪在29.74-11.75到29.74+11.75之间
数据显示10岁以下的人很少,广告投放的目标年龄应该控制在20~60之间
a = p[:100000]  # the first 100000 ages, about 10% of the records
# Evaluate the normal curve at the integers 1..59 (start 1, stop 60, step 1);
# mean and std are p.mean() and p.std() of the full data.
x = np.arange(1, 60, 1)
y = normfun(x, mean, std)
plt.plot(x, y)
# density=True scales the bars to a probability density and replaces
# normed=True, which matplotlib has removed.
plt.hist(a, bins=6, rwidth=0.9, density=True)
plt.title("age")
plt.xlabel("stakes")
plt.ylabel("Probability")
plt.show()
c = p[:10000]  # the first 10000 ages, about 1% of the records
# Evaluate the normal curve at the integers 1..59 (start 1, stop 60, step 1);
# mean and std are p.mean() and p.std() of the full data.
x = np.arange(1, 60, 1)
y = normfun(x, mean, std)
plt.plot(x, y)
# density=True scales the bars to a probability density and replaces
# normed=True, which matplotlib has removed.
plt.hist(c, bins=6, rwidth=0.9, density=True)
plt.title("age")
plt.xlabel("stakes")
plt.ylabel("Probability")
plt.show()
符合上面的结论:
##时间有限先到这里,相信随着深入会体会到更多的乐趣