第四节 Pandas 入门
慕课网python数科学入门课程学习笔记
一. Series 序列
import numpy as np
import pandas as pd
s1 = pd.Series([1,2,3,4])
s1
0 1 1 2 2 3 3 4 dtype: int64
s1.values
array([1, 2, 3, 4], dtype=int64)
s1.index
RangeIndex(start=0, stop=4, step=1)
s2 = pd.Series(np.arange(10))
s2
0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 dtype: int32
s3 = pd.Series({'1':1, '2':2, '3':3})
s3
1 1 2 2 3 3 dtype: int64
s4 = pd.Series([1,2,3,4],index=['A','B','C','D'])
s4
A 1 B 2 C 3 D 4 dtype: int64
序列的访问、转换、属性
s4['A']
1
s4[s4>2]
C 3 D 4 dtype: int64
s4.to_dict()
{‘A’: 1, ‘B’: 2, ‘C’: 3, ‘D’: 4}
s4
A 1 B 2 C 3 D 4 dtype: int64
index_1 = ['A', 'B', 'C', 'D', 'E']
s5 = pd.Series(s4,index=index_1)
s5
A 1.0 B 2.0 C 3.0 D 4.0 E NaN dtype: float64
pd.isnull(s5)
A False B False C False D False E True dtype: bool
s5.name = 'demo'
s5.index.name = 'key value'
s5
key value A 1.0 B 2.0 C 3.0 D 4.0 E NaN Name: demo, dtype: float64
s5.index
Index([‘A’, ‘B’, ‘C’, ‘D’, ‘E’], dtype=’object’, name=’key value’)
s5.values
array([ 1., 2., 3., 4., nan]) ——-
二.Dataframe 入门
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import webbrowser
link = 'https://www.tiobe.com/tiobe-index/'
webbrowser.open(link)
df = pd.read_clipboard()
df
|
May |
2018 |
May.1 |
2017 |
Change |
Programming |
Language |
Ratings |
Change.1 |
0 |
1 |
1 |
Java |
16.380% |
+1.74% |
NaN |
NaN |
NaN |
NaN |
1 |
2 |
2 |
C |
14.000% |
+7.00% |
NaN |
NaN |
NaN |
NaN |
2 |
3 |
3 |
C++ |
7.668% |
+2.92% |
NaN |
NaN |
NaN |
NaN |
3 |
4 |
4 |
Python |
5.192% |
+1.64% |
NaN |
NaN |
NaN |
NaN |
4 |
5 |
5 |
C# |
4.402% |
+0.95% |
NaN |
NaN |
NaN |
NaN |
5 |
6 |
6 |
Visual |
Basic |
.NET |
4.124% |
+0.73% |
NaN |
NaN |
6 |
7 |
9 |
change |
PHP |
3.321% |
+0.63% |
NaN |
NaN |
NaN |
type(df)
pandas.core.frame.DataFrame
df.columns
Index([‘May’, ‘2018’, ‘May.1’, ‘2017’, ‘Change’, ‘Programming’, ‘Language’, ‘Ratings’, ‘Change.1’], dtype=’object’)
df.Change
0 +1.74% 1 +7.00% 2 +2.92% 3 +1.64% 4 +0.95% 5 .NET 6 3.321% Name: Change, dtype: object
df_filter = DataFrame(df,columns=['May','2018','Change'])
df_filter
|
May |
2018 |
Change |
0 |
1 |
1 |
+1.74% |
1 |
2 |
2 |
+7.00% |
2 |
3 |
3 |
+2.92% |
3 |
4 |
4 |
+1.64% |
4 |
5 |
5 |
+0.95% |
5 |
6 |
6 |
.NET |
6 |
7 |
9 |
3.321% |
df_filter['Change']
0 +1.74% 1 +7.00% 2 +2.92% 3 +1.64% 4 +0.95% 5 .NET 6 3.321% Name: Change, dtype: object
type(df_filter['Change'])
pandas.core.series.Series
添加列 和 值
df_new = DataFrame(df,columns=['Change','Sep 2019'])
df_new
|
Change |
Sep 2019 |
0 |
+1.74% |
NaN |
1 |
+7.00% |
NaN |
2 |
+2.92% |
NaN |
3 |
+1.64% |
NaN |
4 |
+0.95% |
NaN |
5 |
.NET |
NaN |
6 |
3.321% |
NaN |
df_new['Sep 2019'] = range(0,7)
df_new
|
Change |
Sep 2019 |
0 |
+1.74% |
0 |
1 |
+7.00% |
1 |
2 |
+2.92% |
2 |
3 |
+1.64% |
3 |
4 |
+0.95% |
4 |
5 |
.NET |
5 |
6 |
3.321% |
6 |
df_new['Sep 2019'] = np.arange(0,7)
df_new
|
Change |
Sep 2019 |
0 |
+1.74% |
0 |
1 |
+7.00% |
1 |
2 |
+2.92% |
2 |
3 |
+1.64% |
3 |
4 |
+0.95% |
4 |
5 |
.NET |
5 |
6 |
3.321% |
6 |
df_new['Sep 2019'] = pd.Series(np.arange(0,7))
df_new
|
Change |
Sep 2019 |
0 |
+1.74% |
0 |
1 |
+7.00% |
1 |
2 |
+2.92% |
2 |
3 |
+1.64% |
3 |
4 |
+0.95% |
4 |
5 |
.NET |
5 |
6 |
3.321% |
6 |
df_new['Sep 2019'] = pd.Series([100,200],index=[1,2])
df_new
|
Change |
Sep 2019 |
0 |
+1.74% |
NaN |
1 |
+7.00% |
100.0 |
2 |
+2.92% |
200.0 |
3 |
+1.64% |
NaN |
4 |
+0.95% |
NaN |
5 |
.NET |
NaN |
6 |
3.321% |
NaN |
三.深入理解Series和DataFrame
1.Series 和 DataFrame 对比
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
data = {'Country':['A', 'B', 'C'],
'Capital':['AA', 'BB','CC'],
'Population':[11111,22222,33333]}
Series
s1 = pd.Series(data['Country'])
s1
0 A 1 B 2 C dtype: object
Dataframe
df = pd.DataFrame(data)
df
|
Country |
Capital |
Population |
0 |
A |
AA |
11111 |
1 |
B |
BB |
22222 |
2 |
C |
CC |
33333 |
df_torows = df.iterrows()
for row in df_torows:
print(row)
print(type(row))
(0, Country A Capital AA Population 11111 Name: 0, dtype: object)
for row in df.iterrows():
print(type(row[0]),type(row[1]))
break
通过 几个 Series 创建 Dataframe
s1 = pd.Series(data['Country'])
s2 = pd.Series(data['Capital'])
s3 = pd.Series(data['Population'])
df_new = pd.DataFrame([s1,s2,s3],index=['Country', 'Capital', 'Capital'])
df_new
|
0 |
1 |
2 |
Country |
A |
B |
C |
Capital |
AA |
BB |
CC |
Capital |
11111 |
22222 |
33333 |
df
|
Country |
Capital |
Population |
0 |
A |
AA |
11111 |
1 |
B |
BB |
22222 |
2 |
C |
CC |
33333 |
df_new = df_new.T
df_new
|
Country |
Capital |
Capital |
0 |
A |
AA |
11111 |
1 |
B |
BB |
22222 |
2 |
C |
CC |
33333 |
2.Dataframe IO
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import webbrowser
link = 'http://pandas.pydata.org/pandas-docs/version/0.20/io.html'
webbrowser.open(link)
True
df = pd.read_clipboard()
df
|
Format Type |
Data Description |
Reader |
Writer |
0 |
text |
CSV |
read_csv |
to_csv |
1 |
text |
JSON |
read_json |
to_json |
2 |
text |
HTML |
read_html |
to_html |
3 |
text |
Local clipboard |
read_clipboard |
to_clipboard |
4 |
binary |
MS Excel |
read_excel |
to_excel |
5 |
binary |
HDF5 Format |
read_hdf |
to_hdf |
6 |
binary |
Feather Format |
read_feather |
to_feather |
7 |
binary |
Msgpack |
read_msgpack |
to_msgpack |
8 |
binary |
Stata |
read_stata |
to_stata |
9 |
binary |
SAS |
read_sas |
None |
10 |
binary |
Python Pickle Format |
read_pickle |
to_pickle |
11 |
SQL |
SQL |
read_sql |
to_sql |
12 |
SQL |
Google Big Query |
read_gbq |
to_gbq |
df.to_csv('df1.csv',index=False)
!ls
Dataframe IO.ipynb Dataframe.ipynb Series.ipynb df1.csv 娣卞叆鐞嗚ВSeries鍜孌ataFrame.ipynb
!more df1.csv
Format Type,Data Description,Reader,Writer text,CSV,read_csv,to_csv text,JSON,read_json,to_json text,HTML,read_html,to_html text,Local clipboard,read_clipboard,to_clipboard binary,MS Excel,read_excel,to_excel binary,HDF5 Format,read_hdf,to_hdf binary,Feather Format,read_feather,to_feather binary,Msgpack,read_msgpack,to_msgpack binary,Stata,read_stata,to_stata binary,SAS,read_sas, binary,Python Pickle Format,read_pickle,to_pickle SQL,SQL,read_sql,to_sql SQL,Google Big Query,read_gbq,to_gbq
df2 = pd.read_csv('df1.csv')
df2
|
Format Type |
Data Description |
Reader |
Writer |
0 |
text |
CSV |
read_csv |
to_csv |
1 |
text |
JSON |
read_json |
to_json |
2 |
text |
HTML |
read_html |
to_html |
3 |
text |
Local clipboard |
read_clipboard |
to_clipboard |
4 |
binary |
MS Excel |
read_excel |
to_excel |
5 |
binary |
HDF5 Format |
read_hdf |
to_hdf |
6 |
binary |
Feather Format |
read_feather |
to_feather |
7 |
binary |
Msgpack |
read_msgpack |
to_msgpack |
8 |
binary |
Stata |
read_stata |
to_stata |
9 |
binary |
SAS |
read_sas |
NaN |
10 |
binary |
Python Pickle Format |
read_pickle |
to_pickle |
11 |
SQL |
SQL |
read_sql |
to_sql |
12 |
SQL |
Google Big Query |
read_gbq |
to_gbq |
df.to_json()
'{"Format Type":{"0":"text","1":"text","2":"text","3":"text","4":"binary","5":"binary","6":"binary","7":"binary","8":"binary","9":"binary","10":"binary","11":"SQL","12":"SQL"},"Data Description":。。。
3.Selecting and Index
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
!ls
Dataframe IO.ipynb Dataframe.ipynb Selecting and Indexing.ipynb Series.ipynb data.csv df1.csv 娣卞叆鐞嗚ВSeries鍜孌ataFrame.ipynb
read_data = pd.read_csv('data.csv')
read_data.shape
(20, 8)
read_data.head(4)
|
A |
B |
C |
D |
E |
F |
G |
H |
0 |
1 |
1 |
Java |
16.38% |
1.74% |
21 |
Apex |
0.90% |
1 |
2 |
2 |
C |
14.00% |
7.00% |
22 |
PL/SQL |
0.90% |
2 |
3 |
3 |
C++ |
7.67% |
2.92% |
23 |
Transact-SQL |
0.88% |
3 |
4 |
4 |
Python |
5.19% |
1.64% |
24 |
Ada |
0.87% |
read_data.tail(3)
|
A |
B |
C |
D |
E |
F |
G |
H |
17 |
18 |
10 |
Perl |
0.91% |
-1.69% |
38 |
Alice |
0.47% |
18 |
19 |
13 |
Swift |
0.91% |
-1.37% |
39 |
Lua |
0.42% |
19 |
20 |
31 |
Scala |
0.90% |
0.18% |
40 |
Fortran |
0.42% |
sub_data = read_data[['A', 'B','C']]
sub_data.head(3)
|
A |
B |
C |
0 |
1 |
1 |
Java |
1 |
2 |
2 |
C |
2 |
3 |
3 |
C++ |
sub_data.iloc[3:6, :]
|
A |
B |
C |
3 |
4 |
4 |
Python |
4 |
5 |
5 |
C# |
5 |
6 |
6 |
Visual Basic .NET |
read_data.loc[10:13, : 'D']
|
A |
B |
C |
D |
10 |
11 |
14 |
R |
1.18% |
11 |
12 |
18 |
Delphi/Object Pascal |
1.01% |
12 |
13 |
8 |
Assembly language |
1.00% |
13 |
14 |
16 |
Go |
0.97% |
四.Reindexing Series and DataFrame
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
1.series reindex
shift + Tab 键可查看函数说明
s1 = Series([1,2,3,4],index=['A','B','C','D'])
s1
A 1 B 2 C 3 D 4 dtype: int64
s1.reindex(index=['A','B','C','D','E'],fill_value=10)
A 1 B 2 C 3 D 4 E 10 dtype: int64
s2 = Series(['A','B','C'],index=[1,3,6])
s2
1 A 3 B 6 C dtype: object
s2.reindex(index=range(8))
0 NaN 1 A 2 NaN 3 B 4 NaN 5 NaN 6 C 7 NaN dtype: object
s2.reindex(index=range(8),method='ffill')
0 NaN 1 A 2 A 3 B 4 B 5 B 6 C 7 C dtype: object
2.reindex dataframe
df1 = DataFrame(np.random.rand(25).reshape([5,5]),index=['A','B','D','E','F'], columns=['c1', 'c2', 'c3', 'c4', 'c5'])
df1
|
c1 |
c2 |
c3 |
c4 |
c5 |
A |
0.123618 |
0.348567 |
0.119156 |
0.380952 |
0.379118 |
B |
0.476492 |
0.254976 |
0.629318 |
0.728708 |
0.747153 |
D |
0.965314 |
0.424126 |
0.913850 |
0.092063 |
0.196096 |
E |
0.960760 |
0.866313 |
0.226766 |
0.865781 |
0.465341 |
F |
0.982832 |
0.340850 |
0.725084 |
0.519617 |
0.889651 |
df1.reindex(index=['A','B','C','D','E','F'])
|
c1 |
c2 |
c3 |
c4 |
c5 |
A |
0.123618 |
0.348567 |
0.119156 |
0.380952 |
0.379118 |
B |
0.476492 |
0.254976 |
0.629318 |
0.728708 |
0.747153 |
C |
NaN |
NaN |
NaN |
NaN |
NaN |
D |
0.965314 |
0.424126 |
0.913850 |
0.092063 |
0.196096 |
E |
0.960760 |
0.866313 |
0.226766 |
0.865781 |
0.465341 |
F |
0.982832 |
0.340850 |
0.725084 |
0.519617 |
0.889651 |
3.利用 reindex() 选取某些值
s1
A 1 B 2 C 3 D 4 dtype: int64
s1.reindex(index=['A','B'])
A 1 B 2 dtype: int64
df1.reindex(index=['A','B'])
|
c1 |
c2 |
c3 |
c4 |
c5 |
A |
0.123618 |
0.348567 |
0.119156 |
0.380952 |
0.379118 |
B |
0.476492 |
0.254976 |
0.629318 |
0.728708 |
0.747153 |
4.删除
s1.drop('A')
B 2 C 3 D 4 dtype: int64
df1.drop('A',axis=0)
|
c1 |
c2 |
c3 |
c4 |
c5 |
B |
0.476492 |
0.254976 |
0.629318 |
0.728708 |
0.747153 |
D |
0.965314 |
0.424126 |
0.913850 |
0.092063 |
0.196096 |
E |
0.960760 |
0.866313 |
0.226766 |
0.865781 |
0.465341 |
F |
0.982832 |
0.340850 |
0.725084 |
0.519617 |
0.889651 |
五.NaN —— Not a Number
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
Nan in Numpy
n = np.nan
type(n)
float
m = 1
m + n
nan
Nan in Series
s1 = Series([1,2,np.nan,3,4],index=['A', 'B', 'C', 'D', 'E'])
s1
A 1.0 B 2.0 C NaN D 3.0 E 4.0 dtype: float64
s1.isnull()
A False B False C True D False E False dtype: bool
s1.dropna()
A 1.0 B 2.0 D 3.0 E 4.0 dtype: float64
NaN in DataFrame
dframe = DataFrame([[1,2,3],[np.nan,5,6],[7,np.nan,9],[np.nan, np.nan, np.nan]])
dframe
|
0 |
1 |
2 |
0 |
1.0 |
2.0 |
3.0 |
1 |
NaN |
5.0 |
6.0 |
2 |
7.0 |
NaN |
9.0 |
3 |
NaN |
NaN |
NaN |
df1 = dframe.dropna(axis=0, how='any')
df1
df2 = dframe.dropna(thresh=2)
df2
|
0 |
1 |
2 |
0 |
1.0 |
2.0 |
3.0 |
1 |
NaN |
5.0 |
6.0 |
2 |
7.0 |
NaN |
9.0 |
df3 = dframe.fillna(value=1)
df3
|
0 |
1 |
2 |
0 |
1.0 |
2.0 |
3.0 |
1 |
1.0 |
5.0 |
6.0 |
2 |
7.0 |
1.0 |
9.0 |
3 |
1.0 |
1.0 |
1.0 |
df3 = dframe.fillna(value={0:0, 1:1,2:2,3:3})
df3
|
0 |
1 |
2 |
0 |
1.0 |
2.0 |
3.0 |
1 |
0.0 |
5.0 |
6.0 |
2 |
7.0 |
1.0 |
9.0 |
3 |
0.0 |
1.0 |
2.0 |
六、多级Index
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
s1 = Series(np.random.randn(6),index=[['1','1','1','2','2','2'],
['a','b','c','a','b','c']])
s1
1 a 0.301681 b -1.596626 c -0.261337 2 a 0.739900 b 0.299108 c 0.074713 dtype: float64
s1['1']
a 0.301681 b -1.596626 c -0.261337 dtype: float64
s1['1']['a']
0.3016807350048885
s1[:,'a']
1 0.301681 2 0.739900 dtype: float64
二级 index Series 与 DataFrame互相转换
df1 = s1.unstack()
df1
|
a |
b |
c |
1 |
0.301681 |
-1.596626 |
-0.261337 |
2 |
0.739900 |
0.299108 |
0.074713 |
df2 = DataFrame([s1['1'],s1['2']])
df2
|
a |
b |
c |
0 |
0.301681 |
-1.596626 |
-0.261337 |
1 |
0.739900 |
0.299108 |
0.074713 |
s2 = df1.unstack()
s2
a 1 0.301681 2 0.739900 b 1 -1.596626 2 0.299108 c 1 -0.261337 2 0.074713 dtype: float64
s2 = df1.T.unstack()
s2
1 a 0.301681 b -1.596626 c -0.261337 2 a 0.739900 b 0.299108 c 0.074713 dtype: float64
多级index DataFrame
df = DataFrame(np.arange(16).reshape(4,4),
index=[['a','a','b','b'],[1,2,1,2]],
columns=[['c','c','d','d'],[5,5,6,7]])
df
|
|
c |
d |
|
|
5 |
5 |
6 |
7 |
a |
1 |
0 |
1 |
2 |
3 |
2 |
4 |
5 |
6 |
7 |
b |
1 |
8 |
9 |
10 |
11 |
2 |
12 |
13 |
14 |
15 |
df['d']
|
|
6 |
7 |
a |
1 |
2 |
3 |
2 |
6 |
7 |
b |
1 |
10 |
11 |
2 |
14 |
15 |
df['d'][6]
a 1 2 2 6 b 1 10 2 14 Name: 6, dtype: int32 —–
七、Mapping 和 Replace
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
df1 = DataFrame({"城市":["北京","上海","广州"],"人口":[1000,2000,3000]})
df1
|
城市 |
人口 |
0 |
北京 |
1000 |
1 |
上海 |
2000 |
2 |
广州 |
3000 |
df1['GDP'] = Series([1000,2000,1500])
df1
|
城市 |
人口 |
GDP |
0 |
北京 |
1000 |
1000 |
1 |
上海 |
2000 |
2000 |
2 |
广州 |
3000 |
1500 |
map() 方法给DataFrame添加列
gdp_map = {"北京":100,"广州":300,"上海":200}
df1['GDP'] = df1['城市'].map(gdp_map)
df1
|
城市 |
人口 |
GDP |
0 |
北京 |
1000 |
100 |
1 |
上海 |
2000 |
200 |
2 |
广州 |
3000 |
300 |
df2 = DataFrame({"城市":["北京","上海","广州"],"人口":[1000,2000,3000]},
index=['A','B','C'])
df2
|
城市 |
人口 |
A |
北京 |
1000 |
B |
上海 |
2000 |
C |
广州 |
3000 |
df2['GDP'] = Series([1000,2000,1500])
df2
|
城市 |
人口 |
GDP |
A |
北京 |
1000 |
NaN |
B |
上海 |
2000 |
NaN |
C |
广州 |
3000 |
NaN |
df2['GDP'] = Series([1000,2000,1500],index=['A','B','C'])
df2
|
城市 |
人口 |
GDP |
A |
北京 |
1000 |
1000 |
B |
上海 |
2000 |
2000 |
C |
广州 |
3000 |
1500 |
replace in series
s1 = Series(np.arange(6))
s1
0 0
1 1
2 2
3 3
4 4
5 5
dtype: int32
s1.replace(1,np.nan)
0 0.0
1 NaN
2 2.0
3 3.0
4 4.0
5 5.0
dtype: float64
s1.replace([1,2,3],[10,20,30])
0 0
1 10
2 20
3 30
4 4
5 5
dtype: int64