2018-11-31课程04

数据科学之3-5DataFrame的Selecting和Indexing

import numpy as np
import pandas as pd
from pandas import Series, DataFrame
imdb = pd.read_csv('movie_metadata.csv')
imdb.shape
(5043, 28)
#默认5行
imdb.head()
























































































































































color director_name num_critic_for_reviews duration director_facebook_likes actor_3_facebook_likes actor_2_name actor_1_facebook_likes gross genres ... num_user_for_reviews language country content_rating budget title_year actor_2_facebook_likes imdb_score aspect_ratio movie_facebook_likes
0 Color James Cameron 723.0 178.0 0.0 855.0 Joel David Moore 1000.0 760505847.0 Action|Adventure|Fantasy|Sci-Fi ... 3054.0 English USA PG-13 237000000.0 2009.0 936.0 7.9 1.78 33000
1 Color Gore Verbinski 302.0 169.0 563.0 1000.0 Orlando Bloom 40000.0 309404152.0 Action|Adventure|Fantasy ... 1238.0 English USA PG-13 300000000.0 2007.0 5000.0 7.1 2.35 0
2 Color Sam Mendes 602.0 148.0 0.0 161.0 Rory Kinnear 11000.0 200074175.0 Action|Adventure|Thriller ... 994.0 English UK PG-13 245000000.0 2015.0 393.0 6.8 2.35 85000
3 Color Christopher Nolan 813.0 164.0 22000.0 23000.0 Christian Bale 27000.0 448130642.0 Action|Thriller ... 2701.0 English USA PG-13 250000000.0 2012.0 23000.0 8.5 2.35 164000
4 NaN Doug Walker NaN NaN 131.0 NaN Rob Walker 131.0 NaN Documentary ... NaN NaN NaN NaN NaN NaN 12.0 7.1 NaN 0

5 rows × 28 columns


imdb.tail()
























































































































































color director_name num_critic_for_reviews duration director_facebook_likes actor_3_facebook_likes actor_2_name actor_1_facebook_likes gross genres ... num_user_for_reviews language country content_rating budget title_year actor_2_facebook_likes imdb_score aspect_ratio movie_facebook_likes
5038 Color Scott Smith 1.0 87.0 2.0 318.0 Daphne Zuniga 637.0 NaN Comedy|Drama ... 6.0 English Canada NaN NaN 2013.0 470.0 7.7 NaN 84
5039 Color NaN 43.0 43.0 NaN 319.0 Valorie Curry 841.0 NaN Crime|Drama|Mystery|Thriller ... 359.0 English USA TV-14 NaN NaN 593.0 7.5 16.00 32000
5040 Color Benjamin Roberds 13.0 76.0 0.0 0.0 Maxwell Moody 0.0 NaN Drama|Horror|Thriller ... 3.0 English USA NaN 1400.0 2013.0 0.0 6.3 NaN 16
5041 Color Daniel Hsia 14.0 100.0 0.0 489.0 Daniel Henney 946.0 10443.0 Comedy|Drama|Romance ... 9.0 English USA PG-13 NaN 2012.0 719.0 6.3 2.35 660
5042 Color Jon Gunn 43.0 90.0 16.0 16.0 Brian Herzlinger 86.0 85222.0 Documentary ... 84.0 English USA PG 1100.0 2004.0 23.0 6.6 1.85 456

5 rows × 28 columns


#imdb['color']  是一个Series
#imdb[['color','director_name']] #生成新的DataFrame
sub_df = imdb[['director_name', 'imdb_score','movie_title']] 
sub_df.iloc[10:20,:]# 返回第10到19行(iloc切片左闭右开,不包含第20行)










































































director_name imdb_score movie_title
10 Zack Snyder 6.9 Batman v Superman: Dawn of Justice
11 Bryan Singer 6.1 Superman Returns
12 Marc Forster 6.7 Quantum of Solace
13 Gore Verbinski 7.3 Pirates of the Caribbean: Dead Man's Chest
14 Gore Verbinski 6.5 The Lone Ranger
15 Zack Snyder 7.2 Man of Steel
16 Andrew Adamson 6.6 The Chronicles of Narnia: Prince Caspian
17 Joss Whedon 8.1 The Avengers
18 Rob Marshall 6.7 Pirates of the Caribbean: On Stranger Tides
19 Barry Sonnenfeld 6.8 Men in Black 3

sub_df.iloc[10:15,0:2]# 都是左闭右开






































director_name imdb_score
10 Zack Snyder 6.9
11 Bryan Singer 6.1
12 Marc Forster 6.7
13 Gore Verbinski 7.3
14 Gore Verbinski 6.5

sub_df.loc[15:17, :'imdb_score'] # loc按标签(label)而不是整数位置选取;与iloc不同,loc的切片两端都包含(闭区间)




























director_name imdb_score
15 Zack Snyder 7.2
16 Andrew Adamson 6.6
17 Joss Whedon 8.1

reindex

s1 = pd.Series([1, 2, 3, 4], index=['A', 'B', 'C', 'D'])
s1
A    1
B    2
C    3
D    4
dtype: int64
s1.reindex(index=['A', 'B', 'C', 'D','E'])
A    1.0
B    2.0
C    3.0
D    4.0
E    NaN
dtype: float64
s1
A    1
B    2
C    3
D    4
dtype: int64
s1.reindex(index=['A', 'B', 'C', 'D','E'], fill_value=10)
A     1
B     2
C     3
D     4
E    10
dtype: int64
s2 = pd.Series(['A', 'B', 'C'], index=[1, 5, 10])
s2
1     A
5     B
10    C
dtype: object
s2.reindex(index=range(15))
0     NaN
1       A
2     NaN
3     NaN
4     NaN
5       B
6     NaN
7     NaN
8     NaN
9     NaN
10      C
11    NaN
12    NaN
13    NaN
14    NaN
dtype: object
s2.reindex(index=range(15), method='ffill')# ffill(forward fill):用前一个有效值填充缺失项;索引0之前没有值,所以仍为NaN
0     NaN
1       A
2       A
3       A
4       A
5       B
6       B
7       B
8       B
9       B
10      C
11      C
12      C
13      C
14      C
dtype: object

DataFrame

df1 = pd.DataFrame(np.random.rand(25).reshape(5, 5),index=['A', 'B', 'D','E', 'F'],columns=['c1', 'c2', 'c3', 'c4','c5'])
df1 # index里没有 C
























































c1 c2 c3 c4 c5
A 0.101685 0.300087 0.169289 0.709127 0.714686
B 0.431398 0.069205 0.795478 0.515863 0.124097
D 0.518218 0.057719 0.966178 0.318853 0.304998
E 0.167965 0.414956 0.054904 0.805507 0.207914
F 0.957420 0.774384 0.090844 0.881185 0.129451

df1.reindex(index=['A', 'B', 'C', 'D','E', 'F'])
































































c1 c2 c3 c4 c5
A 0.101685 0.300087 0.169289 0.709127 0.714686
B 0.431398 0.069205 0.795478 0.515863 0.124097
C NaN NaN NaN NaN NaN
D 0.518218 0.057719 0.966178 0.318853 0.304998
E 0.167965 0.414956 0.054904 0.805507 0.207914
F 0.957420 0.774384 0.090844 0.881185 0.129451

df1.reindex(columns=['c1', 'c2', 'c3', 'c4','c5','C6'])






























































c1 c2 c3 c4 c5 C6
A 0.101685 0.300087 0.169289 0.709127 0.714686 NaN
B 0.431398 0.069205 0.795478 0.515863 0.124097 NaN
D 0.518218 0.057719 0.966178 0.318853 0.304998 NaN
E 0.167965 0.414956 0.054904 0.805507 0.207914 NaN
F 0.957420 0.774384 0.090844 0.881185 0.129451 NaN

s1
A    1
B    2
C    3
D    4
dtype: int64
# reindex也可以用来减少index:只保留列出的标签
s1.reindex(index=['A', 'B'])
A    1
B    2
dtype: int64
# 对DataFrame同样适用:reindex只保留列出的行标签
df1.reindex(index=['A', 'B'])
































c1 c2 c3 c4 c5
A 0.101685 0.300087 0.169289 0.709127 0.714686
B 0.431398 0.069205 0.795478 0.515863 0.124097

#还可以用drop
s1.drop('A')
B    2
C    3
D    4
dtype: int64
df1.drop('A', axis=0)
















































c1 c2 c3 c4 c5
B 0.431398 0.069205 0.795478 0.515863 0.124097
D 0.518218 0.057719 0.966178 0.318853 0.304998
E 0.167965 0.414956 0.054904 0.805507 0.207914
F 0.957420 0.774384 0.090844 0.881185 0.129451

# df1.drop('c1', axis=0) 会报错(KeyError):'c1'是列标签,不在行索引中,删除列要用axis=1
df1.drop('c1', axis=1)


















































c2 c3 c4 c5
A 0.300087 0.169289 0.709127 0.714686
B 0.069205 0.795478 0.515863 0.124097
D 0.057719 0.966178 0.318853 0.304998
E 0.414956 0.054904 0.805507 0.207914
F 0.774384 0.090844 0.881185 0.129451

你可能感兴趣的:(2018-11-31课程04)