数据科学之3-5DataFrame的Selecting和Indexing

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

imdb = pd.read_csv('movie_metadata.csv')

imdb.shape

(5043, 28)

#默认5行
imdb.head()

	color	director_name	num_critic_for_reviews	duration	director_facebook_likes	actor_3_facebook_likes	actor_2_name	actor_1_facebook_likes	gross	genres	...	num_user_for_reviews	language	country	content_rating	budget	title_year	actor_2_facebook_likes	imdb_score	aspect_ratio	movie_facebook_likes
0	Color	James Cameron	723.0	178.0	0.0	855.0	Joel David Moore	1000.0	760505847.0	Action\|Adventure\|Fantasy\|Sci-Fi	...	3054.0	English	USA	PG-13	237000000.0	2009.0	936.0	7.9	1.78	33000
1	Color	Gore Verbinski	302.0	169.0	563.0	1000.0	Orlando Bloom	40000.0	309404152.0	Action\|Adventure\|Fantasy	...	1238.0	English	USA	PG-13	300000000.0	2007.0	5000.0	7.1	2.35	0
2	Color	Sam Mendes	602.0	148.0	0.0	161.0	Rory Kinnear	11000.0	200074175.0	Action\|Adventure\|Thriller	...	994.0	English	UK	PG-13	245000000.0	2015.0	393.0	6.8	2.35	85000
3	Color	Christopher Nolan	813.0	164.0	22000.0	23000.0	Christian Bale	27000.0	448130642.0	Action\|Thriller	...	2701.0	English	USA	PG-13	250000000.0	2012.0	23000.0	8.5	2.35	164000
4	NaN	Doug Walker	NaN	NaN	131.0	NaN	Rob Walker	131.0	NaN	Documentary	...	NaN	NaN	NaN	NaN	NaN	NaN	12.0	7.1	NaN	0

5 rows × 28 columns

imdb.tail()

	color	director_name	num_critic_for_reviews	duration	director_facebook_likes	actor_3_facebook_likes	actor_2_name	actor_1_facebook_likes	gross	genres	...	num_user_for_reviews	language	country	content_rating	budget	title_year	actor_2_facebook_likes	imdb_score	aspect_ratio	movie_facebook_likes
5038	Color	Scott Smith	1.0	87.0	2.0	318.0	Daphne Zuniga	637.0	NaN	Comedy\|Drama	...	6.0	English	Canada	NaN	NaN	2013.0	470.0	7.7	NaN	84
5039	Color	NaN	43.0	43.0	NaN	319.0	Valorie Curry	841.0	NaN	Crime\|Drama\|Mystery\|Thriller	...	359.0	English	USA	TV-14	NaN	NaN	593.0	7.5	16.00	32000
5040	Color	Benjamin Roberds	13.0	76.0	0.0	0.0	Maxwell Moody	0.0	NaN	Drama\|Horror\|Thriller	...	3.0	English	USA	NaN	1400.0	2013.0	0.0	6.3	NaN	16
5041	Color	Daniel Hsia	14.0	100.0	0.0	489.0	Daniel Henney	946.0	10443.0	Comedy\|Drama\|Romance	...	9.0	English	USA	PG-13	NaN	2012.0	719.0	6.3	2.35	660
5042	Color	Jon Gunn	43.0	90.0	16.0	16.0	Brian Herzlinger	86.0	85222.0	Documentary	...	84.0	English	USA	PG	1100.0	2004.0	23.0	6.6	1.85	456

5 rows × 28 columns

#imdb['color']  是一个Series

#imdb[['color','director_name']] #生成新的DataFrame

sub_df = imdb[['director_name', 'imdb_score','movie_title']]

sub_df.iloc[10:20,:]# 返回第10到19行(左闭右开,不包含第20行)

	director_name	imdb_score	movie_title
10	Zack Snyder	6.9	Batman v Superman: Dawn of Justice
11	Bryan Singer	6.1	Superman Returns
12	Marc Forster	6.7	Quantum of Solace
13	Gore Verbinski	7.3	Pirates of the Caribbean: Dead Man's Chest
14	Gore Verbinski	6.5	The Lone Ranger
15	Zack Snyder	7.2	Man of Steel
16	Andrew Adamson	6.6	The Chronicles of Narnia: Prince Caspian
17	Joss Whedon	8.1	The Avengers
18	Rob Marshall	6.7	Pirates of the Caribbean: On Stranger Tides
19	Barry Sonnenfeld	6.8	Men in Black 3

sub_df.iloc[10:15,0:2]# 都是左闭右开

	director_name	imdb_score
10	Zack Snyder	6.9
11	Bryan Singer	6.1
12	Marc Forster	6.7
13	Gore Verbinski	7.3
14	Gore Verbinski	6.5

sub_df.loc[15:17, :'imdb_score'] # loc按label(标签)切片,而不是按整数位置;与iloc不同,起点和终点都包含(闭区间)

	director_name	imdb_score
15	Zack Snyder	7.2
16	Andrew Adamson	6.6
17	Joss Whedon	8.1

reindex

s1 = pd.Series([1, 2, 3, 4], index=['A', 'B', 'C', 'D'])

s1

A    1
B    2
C    3
D    4
dtype: int64

s1.reindex(index=['A', 'B', 'C', 'D','E'])

A    1.0
B    2.0
C    3.0
D    4.0
E    NaN
dtype: float64

s1

A    1
B    2
C    3
D    4
dtype: int64

s1.reindex(index=['A', 'B', 'C', 'D','E'], fill_value=10)

A     1
B     2
C     3
D     4
E    10
dtype: int64

s2 = pd.Series(['A', 'B', 'C'], index=[1, 5, 10])
s2

1     A
5     B
10    C
dtype: object

s2.reindex(index=range(15))

0     NaN
1       A
2     NaN
3     NaN
4     NaN
5       B
6     NaN
7     NaN
8     NaN
9     NaN
10      C
11    NaN
12    NaN
13    NaN
14    NaN
dtype: object

s2.reindex(index=range(15), method='ffill')# ffill(forward fill):用前一个有效值向后填充缺失项;索引0之前没有值,所以仍是NaN

0     NaN
1       A
2       A
3       A
4       A
5       B
6       B
7       B
8       B
9       B
10      C
11      C
12      C
13      C
14      C
dtype: object

DataFrame

df1 = pd.DataFrame(np.random.rand(25).reshape(5, 5),index=['A', 'B', 'D','E', 'F'],columns=['c1', 'c2', 'c3', 'c4','c5'])

df1 #没 C

	c1	c2	c3	c4	c5
A	0.101685	0.300087	0.169289	0.709127	0.714686
B	0.431398	0.069205	0.795478	0.515863	0.124097
D	0.518218	0.057719	0.966178	0.318853	0.304998
E	0.167965	0.414956	0.054904	0.805507	0.207914
F	0.957420	0.774384	0.090844	0.881185	0.129451

df1.reindex(index=['A', 'B', 'C', 'D','E', 'F'])

	c1	c2	c3	c4	c5
A	0.101685	0.300087	0.169289	0.709127	0.714686
B	0.431398	0.069205	0.795478	0.515863	0.124097
C	NaN	NaN	NaN	NaN	NaN
D	0.518218	0.057719	0.966178	0.318853	0.304998
E	0.167965	0.414956	0.054904	0.805507	0.207914
F	0.957420	0.774384	0.090844	0.881185	0.129451

df1.reindex(columns=['c1', 'c2', 'c3', 'c4','c5','C6'])

	c1	c2	c3	c4	c5	C6
A	0.101685	0.300087	0.169289	0.709127	0.714686	NaN
B	0.431398	0.069205	0.795478	0.515863	0.124097	NaN
D	0.518218	0.057719	0.966178	0.318853	0.304998	NaN
E	0.167965	0.414956	0.054904	0.805507	0.207914	NaN
F	0.957420	0.774384	0.090844	0.881185	0.129451	NaN

s1

A    1
B    2
C    3
D    4
dtype: int64

# 减少的index
s1.reindex(index=['A', 'B'])

A    1
B    2
dtype: int64

# 减少的index
df1.reindex(index=['A', 'B'])

	c1	c2	c3	c4	c5
A	0.101685	0.300087	0.169289	0.709127	0.714686
B	0.431398	0.069205	0.795478	0.515863	0.124097

#还可以用drop
s1.drop('A')

B    2
C    3
D    4
dtype: int64

df1.drop('A', axis=0)

	c1	c2	c3	c4	c5
B	0.431398	0.069205	0.795478	0.515863	0.124097
D	0.518218	0.057719	0.966178	0.318853	0.304998
E	0.167965	0.414956	0.054904	0.805507	0.207914
F	0.957420	0.774384	0.090844	0.881185	0.129451

# df1.drop('c1', axis=0)  报错(KeyError):'c1'是列名,不在行索引中,删除列要用axis=1

df1.drop('c1', axis=1)

	c2	c3	c4	c5
A	0.300087	0.169289	0.709127	0.714686
B	0.069205	0.795478	0.515863	0.124097
D	0.057719	0.966178	0.318853	0.304998
E	0.414956	0.054904	0.805507	0.207914
F	0.774384	0.090844	0.881185	0.129451

2018-11-31课程04

数据科学之3-5DataFrame的Selecting和Indexing

reindex

DataFrame

你可能感兴趣的:(2018-11-31课程04)