读取列表
import pandas as pd
# Load the three MovieLens tables. The files have no header row and use '::'
# as the field separator. A separator longer than one character is treated as
# a regular expression and is only supported by the Python parsing engine, so
# the engine is requested explicitly instead of relying on the fallback (which
# emits a ParserWarning).
movielens_dir = 'E:/DataAnalysis/pydata-book/pydata-book-1st-edition/ch02/movielens'

unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table(movielens_dir + '/users.dat', sep='::', header=None,
                      names=unames, engine='python')

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table(movielens_dir + '/ratings.dat', sep='::', header=None,
                        names=rnames, engine='python')

# NOTE: the third column must be named 'genres', not 'gender'. A 'gender'
# column here would collide with the users column of the same name, and the
# key-less merge of the tables would then not join as intended.
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table(movielens_dir + '/movies.dat', sep='::', header=None,
                       names=mnames, engine='python')
合并列表
# Join ratings with the user attributes, then with the movie metadata. With no
# explicit keys, merge joins on the column names the two frames have in common.
data = ratings.merge(users).merge(movies)
求各电影按性别的平均分,并只保留评分数在250条及以上的电影
# Mean rating of every title, split into one column per gender.
mean_ratings = data.pivot_table(
    values='rating',
    index='title',
    columns='gender',
    aggfunc='mean',
)
mean_ratings.head()
Out[41]:
gender F M
title
$1,000,000 Duck (1971) 3.375000 2.761905
'Night Mother (1986) 3.388889 3.352941
'Til There Was You (1997) 2.675676 2.733333
'burbs, The (1989) 2.793478 2.962085
...And Justice for All (1979) 3.828571 3.689024
# To drop movies with fewer than 250 ratings, first count the ratings per
# title: grouping by title and calling size() yields a Series holding the
# size of each movie's group.
rating_by_title = data.groupby(by='title').size()
rating_by_title.head(10)
Out[44]:
title
$1,000,000 Duck (1971) 37
'Night Mother (1986) 70
'Til There Was You (1997) 52
'burbs, The (1989) 303
...And Justice for All (1979) 199
1-900 (1994) 2
10 Things I Hate About You (1999) 700
101 Dalmatians (1961) 565
101 Dalmatians (1996) 364
12 Angry Men (1957) 616
dtype: int64
# Titles that received at least 250 ratings.
has_enough_ratings = rating_by_title >= 250
active_titles = rating_by_title.index[has_enough_ratings]
active_titles
Out[46]:
Index([''burbs, The (1989)', '10 Things I Hate About You (1999)',
'101 Dalmatians (1961)', '101 Dalmatians (1996)', '12 Angry Men (1957)',
'13th Warrior, The (1999)', '2 Days in the Valley (1996)',
'20,000 Leagues Under the Sea (1954)', '2001: A Space Odyssey (1968)',
'2010 (1984)',
...
'X-Men (2000)', 'Year of Living Dangerously (1982)',
'Yellow Submarine (1968)', 'You've Got Mail (1998)',
'Young Frankenstein (1974)', 'Young Guns (1988)',
'Young Guns II (1990)', 'Young Sherlock Holmes (1985)',
'Zero Effect (1998)', 'eXistenZ (1999)'],
dtype='object', name='title', length=1216)
# Per-gender mean ratings restricted to the titles selected in active_titles.
# NOTE: first attempt, kept as recorded. The .ix indexer is deprecated (see
# the warning printed below); the .loc version follows.
mean_ratings=mean_ratings.ix[active_titles]
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
"""Entry point for launching an IPython kernel.
# Restrict the per-gender means to the titles in active_titles, using
# label-based selection.
mean_ratings = mean_ratings.loc[active_titles]
mean_ratings
Out[50]:
gender F M
title
'burbs, The (1989) 2.793478 2.962085
10 Things I Hate About You (1999) 3.646552 3.311966
101 Dalmatians (1961) 3.791444 3.500000
101 Dalmatians (1996) 3.240000 2.911215
12 Angry Men (1957) 4.184397 4.328421
13th Warrior, The (1999) 3.112000 3.168000
2 Days in the Valley (1996) 3.488889 3.244813
20,000 Leagues Under the Sea (1954) 3.670103 3.709205
2001: A Space Odyssey (1968) 3.825581 4.129738
2010 (1984) 3.446809 3.413712
28 Days (2000) 3.209424 2.977707
39 Steps, The (1935) 3.965517 4.107692
54 (1998) 2.701754 2.782178
7th Voyage of Sinbad, The (1958) 3.409091 3.658879
8MM (1999) 2.906250 2.850962
About Last Night... (1986) 3.188679 3.140909
Absent Minded Professor, The (1961) 3.469388 3.446809
Absolute Power (1997) 3.469136 3.327759
Abyss, The (1989) 3.659236 3.689507
Ace Ventura: Pet Detective (1994) 3.000000 3.197917
Ace Ventura: When Nature Calls (1995) 2.269663 2.543333
Addams Family Values (1993) 3.000000 2.878531
Addams Family, The (1991) 3.186170 3.163498
Adventures in Babysitting (1987) 3.455782 3.208122
Adventures of Buckaroo Bonzai Across the 8th Di... 3.308511 3.402321
Adventures of Priscilla, Queen of the Desert, T... 3.989071 3.688811
Adventures of Robin Hood, The (1938) 4.166667 3.918367
African Queen, The (1951) 4.324232 4.223822
Age of Innocence, The (1993) 3.827068 3.339506
Agnes of God (1985) 3.534884 3.244898
... ...
White Men Can't Jump (1992) 3.028777 3.231061
Who Framed Roger Rabbit? (1988) 3.569378 3.713251
Who's Afraid of Virginia Woolf? (1966) 4.029703 4.096939
Whole Nine Yards, The (2000) 3.296552 3.404814
Wild Bunch, The (1969) 3.636364 4.128099
Wild Things (1998) 3.392000 3.459082
Wild Wild West (1999) 2.275449 2.131973
William Shakespeare's Romeo and Juliet (1996) 3.532609 3.318644
Willow (1988) 3.658683 3.453543
Willy Wonka and the Chocolate Factory (1971) 4.063953 3.789474
Witness (1985) 4.115854 3.941504
Wizard of Oz, The (1939) 4.355030 4.203138
Wolf (1994) 3.074074 2.899083
Women on the Verge of a Nervous Breakdown (1988) 3.934307 3.865741
Wonder Boys (2000) 4.043796 3.913649
Working Girl (1988) 3.606742 3.312500
World Is Not Enough, The (1999) 3.337500 3.388889
Wrong Trousers, The (1993) 4.588235 4.478261
Wyatt Earp (1994) 3.147059 3.283898
X-Files: Fight the Future, The (1998) 3.489474 3.493797
X-Men (2000) 3.682310 3.851702
Year of Living Dangerously (1982) 3.951220 3.869403
Yellow Submarine (1968) 3.714286 3.689286
You've Got Mail (1998) 3.542424 3.275591
Young Frankenstein (1974) 4.289963 4.239177
Young Guns (1988) 3.371795 3.425620
Young Guns II (1990) 2.934783 2.904025
Young Sherlock Holmes (1985) 3.514706 3.363344
Zero Effect (1998) 3.864407 3.723140
eXistenZ (1999) 3.098592 3.289086
求女性最喜欢的电影, 就是根据F列进行降序排列
# Movies preferred by women: sort by the F column in descending order.
# NOTE: first attempt, kept as recorded. sort_index(by=...) triggers the
# FutureWarning shown below, and ascending='False' passes a quoted string
# rather than the boolean False.
top_female_ratings=mean_ratings.sort_index(by='F',ascending='False')
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: by argument to sort_index is deprecated, please use .sort_values(by=...)
Entry point for launching an IPython kernel.
# The previous attempt had two mistakes: (1) sort_index(by=...) has been
# replaced by sort_values(by=...); (2) ascending must be the boolean False,
# not the quoted string 'False'.
top_female_ratings = mean_ratings.sort_values('F', ascending=False)
top_female_ratings.head(10)
计算评分分歧
# Rating disagreement between genders: display the filtered table before a
# difference column is added to it.
mean_ratings
Out[62]:
gender F M
title
'burbs, The (1989) 2.793478 2.962085
10 Things I Hate About You (1999) 3.646552 3.311966
101 Dalmatians (1961) 3.791444 3.500000
101 Dalmatians (1996) 3.240000 2.911215
12 Angry Men (1957) 4.184397 4.328421
13th Warrior, The (1999) 3.112000 3.168000
2 Days in the Valley (1996) 3.488889 3.244813
20,000 Leagues Under the Sea (1954) 3.670103 3.709205
2001: A Space Odyssey (1968) 3.825581 4.129738
2010 (1984) 3.446809 3.413712
28 Days (2000) 3.209424 2.977707
39 Steps, The (1935) 3.965517 4.107692
54 (1998) 2.701754 2.782178
7th Voyage of Sinbad, The (1958) 3.409091 3.658879
8MM (1999) 2.906250 2.850962
About Last Night... (1986) 3.188679 3.140909
Absent Minded Professor, The (1961) 3.469388 3.446809
Absolute Power (1997) 3.469136 3.327759
Abyss, The (1989) 3.659236 3.689507
Ace Ventura: Pet Detective (1994) 3.000000 3.197917
Ace Ventura: When Nature Calls (1995) 2.269663 2.543333
Addams Family Values (1993) 3.000000 2.878531
Addams Family, The (1991) 3.186170 3.163498
Adventures in Babysitting (1987) 3.455782 3.208122
Adventures of Buckaroo Bonzai Across the 8th Di... 3.308511 3.402321
Adventures of Priscilla, Queen of the Desert, T... 3.989071 3.688811
Adventures of Robin Hood, The (1938) 4.166667 3.918367
African Queen, The (1951) 4.324232 4.223822
Age of Innocence, The (1993) 3.827068 3.339506
Agnes of God (1985) 3.534884 3.244898
... ...
White Men Can't Jump (1992) 3.028777 3.231061
Who Framed Roger Rabbit? (1988) 3.569378 3.713251
Who's Afraid of Virginia Woolf? (1966) 4.029703 4.096939
Whole Nine Yards, The (2000) 3.296552 3.404814
Wild Bunch, The (1969) 3.636364 4.128099
Wild Things (1998) 3.392000 3.459082
Wild Wild West (1999) 2.275449 2.131973
William Shakespeare's Romeo and Juliet (1996) 3.532609 3.318644
Willow (1988) 3.658683 3.453543
Willy Wonka and the Chocolate Factory (1971) 4.063953 3.789474
Witness (1985) 4.115854 3.941504
Wizard of Oz, The (1939) 4.355030 4.203138
Wolf (1994) 3.074074 2.899083
Women on the Verge of a Nervous Breakdown (1988) 3.934307 3.865741
Wonder Boys (2000) 4.043796 3.913649
Working Girl (1988) 3.606742 3.312500
World Is Not Enough, The (1999) 3.337500 3.388889
Wrong Trousers, The (1993) 4.588235 4.478261
Wyatt Earp (1994) 3.147059 3.283898
X-Files: Fight the Future, The (1998) 3.489474 3.493797
X-Men (2000) 3.682310 3.851702
Year of Living Dangerously (1982) 3.951220 3.869403
Yellow Submarine (1968) 3.714286 3.689286
You've Got Mail (1998) 3.542424 3.275591
Young Frankenstein (1974) 4.289963 4.239177
Young Guns (1988) 3.371795 3.425620
Young Guns II (1990) 2.934783 2.904025
Young Sherlock Holmes (1985) 3.514706 3.363344
Zero Effect (1998) 3.864407 3.723140
eXistenZ (1999) 3.098592 3.289086
[1216 rows x 2 columns]
# Measure the disagreement between genders: add a column holding the
# difference of the mean ratings (male minus female).
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']
# Sort ascending by that difference, so the most negative values come first.
sorted_by_diff = mean_ratings.sort_values('diff')
sorted_by_diff.head(15)
Out[67]:
gender F M diff
title
Dirty Dancing (1987) 3.790378 2.959596 -0.830782
Jumpin' Jack Flash (1986) 3.254717 2.578358 -0.676359
Grease (1978) 3.975265 3.367041 -0.608224
Little Women (1994) 3.870588 3.321739 -0.548849
Steel Magnolias (1989) 3.901734 3.365957 -0.535777
Anastasia (1997) 3.800000 3.281609 -0.518391
Rocky Horror Picture Show, The (1975) 3.673016 3.160131 -0.512885
Color Purple, The (1985) 4.158192 3.659341 -0.498851
Age of Innocence, The (1993) 3.827068 3.339506 -0.487561
Free Willy (1993) 2.921348 2.438776 -0.482573
French Kiss (1995) 3.535714 3.056962 -0.478752
Little Shop of Horrors, The (1960) 3.650000 3.179688 -0.470312
Guys and Dolls (1955) 4.051724 3.583333 -0.468391
Mary Poppins (1964) 4.197740 3.730594 -0.467147
Patch Adams (1998) 3.473282 3.008746 -0.464536
# Movies preferred by men: reverse the diff-sorted table and take 15 rows.
# NOTE: 'sprted_by_diff' is a typo for 'sorted_by_diff'; this line raises the
# NameError shown below, and the corrected call follows.
sprted_by_diff[::-1][:15]
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
in ()
----> 1 sprted_by_diff[::-1][:15]
NameError: name 'sprted_by_diff' is not defined
# Reverse the row order of the diff-sorted table and show the first 15 rows.
sorted_by_diff[::-1].head(15)
Out[70]:
gender F M diff
title
Good, The Bad and The Ugly, The (1966) 3.494949 4.221300 0.726351
Kentucky Fried Movie, The (1977) 2.878788 3.555147 0.676359
Dumb & Dumber (1994) 2.697987 3.336595 0.638608
Longest Day, The (1962) 3.411765 4.031447 0.619682
Cable Guy, The (1996) 2.250000 2.863787 0.613787
Evil Dead II (Dead By Dawn) (1987) 3.297297 3.909283 0.611985
Hidden, The (1987) 3.137931 3.745098 0.607167
Rocky III (1982) 2.361702 2.943503 0.581801
Caddyshack (1980) 3.396135 3.969737 0.573602
For a Few Dollars More (1965) 3.409091 3.953795 0.544704
Porky's (1981) 2.296875 2.836364 0.539489
Animal House (1978) 3.628906 4.167192 0.538286
Exorcist, The (1973) 3.537634 4.067239 0.529605
Fright Night (1985) 2.973684 3.500000 0.526316
Barb Wire (1996) 1.585366 2.100386 0.515020
# Find the movies with the most disagreement among viewers, regardless of
# gender, measured by the standard deviation of each title's ratings.
rating_std_by_title = data.groupby(by='title')['rating'].std()
# Keep only the titles in active_titles.
rating_std_by_title = rating_std_by_title.loc[active_titles]
# Sort in descending order. In this session Series has neither .order nor
# .sort (both attempts below fail with AttributeError), so sort_values is
# what is used in the end.
rating_std_by_title.order(ascending=False)[:10]
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
in ()
----> 1 rating_std_by_title.order(ascending=False)[:10]
D:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in __getattr__(self, name)
4370 if self._info_axis._can_hold_identifiers_and_holds_name(name):
4371 return self[name]
-> 4372 return object.__getattribute__(self, name)
4373
4374 def __setattr__(self, name, value):
AttributeError: 'Series' object has no attribute 'order'
# Second attempt, kept as recorded: Series.sort is not available either
# (see the AttributeError below).
rating_std_by_title.sort(ascending=False)[:10]
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
in ()
----> 1 rating_std_by_title.sort(ascending=False)[:10]
D:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in __getattr__(self, name)
4370 if self._info_axis._can_hold_identifiers_and_holds_name(name):
4371 return self[name]
-> 4372 return object.__getattribute__(self, name)
4373
4374 def __setattr__(self, name, value):
AttributeError: 'Series' object has no attribute 'sort'
# First ten entries after sorting the standard deviations in descending order.
rating_std_by_title.sort_values(ascending=False).head(10)
Out[80]:
title
Dumb & Dumber (1994) 1.321333
Blair Witch Project, The (1999) 1.316368
Natural Born Killers (1994) 1.307198
Tank Girl (1995) 1.277695
Rocky Horror Picture Show, The (1975) 1.260177
Eyes Wide Shut (1999) 1.259624
Evita (1996) 1.253631
Billy Madison (1995) 1.249970
Fear and Loathing in Las Vegas (1998) 1.246408
Bicentennial Man (1999) 1.245533
Name: rating, dtype: float64