数据科学之3-5DataFrame的Selecting和Indexing
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
imdb = pd.read_csv('movie_metadata.csv')
imdb.shape
(5043, 28)
# head() 默认返回前5行
imdb.head()
|
color |
director_name |
num_critic_for_reviews |
duration |
director_facebook_likes |
actor_3_facebook_likes |
actor_2_name |
actor_1_facebook_likes |
gross |
genres |
... |
num_user_for_reviews |
language |
country |
content_rating |
budget |
title_year |
actor_2_facebook_likes |
imdb_score |
aspect_ratio |
movie_facebook_likes |
---|
0 |
Color |
James Cameron |
723.0 |
178.0 |
0.0 |
855.0 |
Joel David Moore |
1000.0 |
760505847.0 |
Action|Adventure|Fantasy|Sci-Fi |
... |
3054.0 |
English |
USA |
PG-13 |
237000000.0 |
2009.0 |
936.0 |
7.9 |
1.78 |
33000 |
---|
1 |
Color |
Gore Verbinski |
302.0 |
169.0 |
563.0 |
1000.0 |
Orlando Bloom |
40000.0 |
309404152.0 |
Action|Adventure|Fantasy |
... |
1238.0 |
English |
USA |
PG-13 |
300000000.0 |
2007.0 |
5000.0 |
7.1 |
2.35 |
0 |
---|
2 |
Color |
Sam Mendes |
602.0 |
148.0 |
0.0 |
161.0 |
Rory Kinnear |
11000.0 |
200074175.0 |
Action|Adventure|Thriller |
... |
994.0 |
English |
UK |
PG-13 |
245000000.0 |
2015.0 |
393.0 |
6.8 |
2.35 |
85000 |
---|
3 |
Color |
Christopher Nolan |
813.0 |
164.0 |
22000.0 |
23000.0 |
Christian Bale |
27000.0 |
448130642.0 |
Action|Thriller |
... |
2701.0 |
English |
USA |
PG-13 |
250000000.0 |
2012.0 |
23000.0 |
8.5 |
2.35 |
164000 |
---|
4 |
NaN |
Doug Walker |
NaN |
NaN |
131.0 |
NaN |
Rob Walker |
131.0 |
NaN |
Documentary |
... |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
12.0 |
7.1 |
NaN |
0 |
---|
5 rows × 28 columns
imdb.tail()
|
color |
director_name |
num_critic_for_reviews |
duration |
director_facebook_likes |
actor_3_facebook_likes |
actor_2_name |
actor_1_facebook_likes |
gross |
genres |
... |
num_user_for_reviews |
language |
country |
content_rating |
budget |
title_year |
actor_2_facebook_likes |
imdb_score |
aspect_ratio |
movie_facebook_likes |
---|
5038 |
Color |
Scott Smith |
1.0 |
87.0 |
2.0 |
318.0 |
Daphne Zuniga |
637.0 |
NaN |
Comedy|Drama |
... |
6.0 |
English |
Canada |
NaN |
NaN |
2013.0 |
470.0 |
7.7 |
NaN |
84 |
---|
5039 |
Color |
NaN |
43.0 |
43.0 |
NaN |
319.0 |
Valorie Curry |
841.0 |
NaN |
Crime|Drama|Mystery|Thriller |
... |
359.0 |
English |
USA |
TV-14 |
NaN |
NaN |
593.0 |
7.5 |
16.00 |
32000 |
---|
5040 |
Color |
Benjamin Roberds |
13.0 |
76.0 |
0.0 |
0.0 |
Maxwell Moody |
0.0 |
NaN |
Drama|Horror|Thriller |
... |
3.0 |
English |
USA |
NaN |
1400.0 |
2013.0 |
0.0 |
6.3 |
NaN |
16 |
---|
5041 |
Color |
Daniel Hsia |
14.0 |
100.0 |
0.0 |
489.0 |
Daniel Henney |
946.0 |
10443.0 |
Comedy|Drama|Romance |
... |
9.0 |
English |
USA |
PG-13 |
NaN |
2012.0 |
719.0 |
6.3 |
2.35 |
660 |
---|
5042 |
Color |
Jon Gunn |
43.0 |
90.0 |
16.0 |
16.0 |
Brian Herzlinger |
86.0 |
85222.0 |
Documentary |
... |
84.0 |
English |
USA |
PG |
1100.0 |
2004.0 |
23.0 |
6.6 |
1.85 |
456 |
---|
5 rows × 28 columns
#imdb['color'] 是一个Series
#imdb[['color','director_name']] #生成新的DataFrame
sub_df = imdb[['director_name', 'imdb_score','movie_title']]
sub_df.iloc[10:20,:]# 返回第10到第19行(按位置切片,左闭右开,不包含第20行)
|
director_name |
imdb_score |
movie_title |
---|
10 |
Zack Snyder |
6.9 |
Batman v Superman: Dawn of Justice |
---|
11 |
Bryan Singer |
6.1 |
Superman Returns |
---|
12 |
Marc Forster |
6.7 |
Quantum of Solace |
---|
13 |
Gore Verbinski |
7.3 |
Pirates of the Caribbean: Dead Man's Chest |
---|
14 |
Gore Verbinski |
6.5 |
The Lone Ranger |
---|
15 |
Zack Snyder |
7.2 |
Man of Steel |
---|
16 |
Andrew Adamson |
6.6 |
The Chronicles of Narnia: Prince Caspian |
---|
17 |
Joss Whedon |
8.1 |
The Avengers |
---|
18 |
Rob Marshall |
6.7 |
Pirates of the Caribbean: On Stranger Tides |
---|
19 |
Barry Sonnenfeld |
6.8 |
Men in Black 3 |
---|
sub_df.iloc[10:15,0:2]# 都是左闭右开
|
director_name |
imdb_score |
---|
10 |
Zack Snyder |
6.9 |
---|
11 |
Bryan Singer |
6.1 |
---|
12 |
Marc Forster |
6.7 |
---|
13 |
Gore Verbinski |
7.3 |
---|
14 |
Gore Verbinski |
6.5 |
---|
sub_df.loc[15:17, :'imdb_score'] # loc按标签(label)而不是位置切片,首尾两端都包含(15到17行,列到'imdb_score'为止)
|
director_name |
imdb_score |
---|
15 |
Zack Snyder |
7.2 |
---|
16 |
Andrew Adamson |
6.6 |
---|
17 |
Joss Whedon |
8.1 |
---|
reindex
s1 = pd.Series([1, 2, 3, 4], index=['A', 'B', 'C', 'D'])
s1
A 1
B 2
C 3
D 4
dtype: int64
s1.reindex(index=['A', 'B', 'C', 'D','E'])
A 1.0
B 2.0
C 3.0
D 4.0
E NaN
dtype: float64
s1
A 1
B 2
C 3
D 4
dtype: int64
s1.reindex(index=['A', 'B', 'C', 'D','E'], fill_value=10)
A 1
B 2
C 3
D 4
E 10
dtype: int64
s2 = pd.Series(['A', 'B', 'C'], index=[1, 5, 10])
s2
1 A
5 B
10 C
dtype: object
s2.reindex(index=range(15))
0 NaN
1 A
2 NaN
3 NaN
4 NaN
5 B
6 NaN
7 NaN
8 NaN
9 NaN
10 C
11 NaN
12 NaN
13 NaN
14 NaN
dtype: object
s2.reindex(index=range(15), method='ffill')# ffill即forward fill:用前面最近的有效值向后填充,index 0 前面没有值所以仍为NaN
0 NaN
1 A
2 A
3 A
4 A
5 B
6 B
7 B
8 B
9 B
10 C
11 C
12 C
13 C
14 C
dtype: object
DataFrame
df1 = pd.DataFrame(np.random.rand(25).reshape(5, 5),index=['A', 'B', 'D','E', 'F'],columns=['c1', 'c2', 'c3', 'c4','c5'])
df1 # index里没有 C
|
c1 |
c2 |
c3 |
c4 |
c5 |
---|
A |
0.101685 |
0.300087 |
0.169289 |
0.709127 |
0.714686 |
---|
B |
0.431398 |
0.069205 |
0.795478 |
0.515863 |
0.124097 |
---|
D |
0.518218 |
0.057719 |
0.966178 |
0.318853 |
0.304998 |
---|
E |
0.167965 |
0.414956 |
0.054904 |
0.805507 |
0.207914 |
---|
F |
0.957420 |
0.774384 |
0.090844 |
0.881185 |
0.129451 |
---|
df1.reindex(index=['A', 'B', 'C', 'D','E', 'F'])
|
c1 |
c2 |
c3 |
c4 |
c5 |
---|
A |
0.101685 |
0.300087 |
0.169289 |
0.709127 |
0.714686 |
---|
B |
0.431398 |
0.069205 |
0.795478 |
0.515863 |
0.124097 |
---|
C |
NaN |
NaN |
NaN |
NaN |
NaN |
---|
D |
0.518218 |
0.057719 |
0.966178 |
0.318853 |
0.304998 |
---|
E |
0.167965 |
0.414956 |
0.054904 |
0.805507 |
0.207914 |
---|
F |
0.957420 |
0.774384 |
0.090844 |
0.881185 |
0.129451 |
---|
df1.reindex(columns=['c1', 'c2', 'c3', 'c4','c5','C6'])
|
c1 |
c2 |
c3 |
c4 |
c5 |
C6 |
---|
A |
0.101685 |
0.300087 |
0.169289 |
0.709127 |
0.714686 |
NaN |
---|
B |
0.431398 |
0.069205 |
0.795478 |
0.515863 |
0.124097 |
NaN |
---|
D |
0.518218 |
0.057719 |
0.966178 |
0.318853 |
0.304998 |
NaN |
---|
E |
0.167965 |
0.414956 |
0.054904 |
0.805507 |
0.207914 |
NaN |
---|
F |
0.957420 |
0.774384 |
0.090844 |
0.881185 |
0.129451 |
NaN |
---|
s1
A 1
B 2
C 3
D 4
dtype: int64
# 减少的index
s1.reindex(index=['A', 'B'])
A 1
B 2
dtype: int64
# 减少的index
df1.reindex(index=['A', 'B'])
|
c1 |
c2 |
c3 |
c4 |
c5 |
---|
A |
0.101685 |
0.300087 |
0.169289 |
0.709127 |
0.714686 |
---|
B |
0.431398 |
0.069205 |
0.795478 |
0.515863 |
0.124097 |
---|
#还可以用drop
s1.drop('A')
B 2
C 3
D 4
dtype: int64
df1.drop('A', axis=0)
|
c1 |
c2 |
c3 |
c4 |
c5 |
---|
B |
0.431398 |
0.069205 |
0.795478 |
0.515863 |
0.124097 |
---|
D |
0.518218 |
0.057719 |
0.966178 |
0.318853 |
0.304998 |
---|
E |
0.167965 |
0.414956 |
0.054904 |
0.805507 |
0.207914 |
---|
F |
0.957420 |
0.774384 |
0.090844 |
0.881185 |
0.129451 |
---|
# df1.drop('c1', axis=0) 会报错:'c1' 是列名而不是行标签,axis=0 时找不到该标签
df1.drop('c1', axis=1)
|
c2 |
c3 |
c4 |
c5 |
---|
A |
0.300087 |
0.169289 |
0.709127 |
0.714686 |
---|
B |
0.069205 |
0.795478 |
0.515863 |
0.124097 |
---|
D |
0.057719 |
0.966178 |
0.318853 |
0.304998 |
---|
E |
0.414956 |
0.054904 |
0.805507 |
0.207914 |
---|
F |
0.774384 |
0.090844 |
0.881185 |
0.129451 |
---|