Pandas数据操作
import pandas as pd
Series索引
# Build a five-element Series whose index labels are the letters 'a'..'e'.
index_labels = list('abcde')
ser_obj = pd.Series(range(5), index=index_labels)
# head() shows the first rows; with five elements that is the whole Series.
print(ser_obj.head())
a 0
b 1
c 2
d 3
e 4
dtype: int32
行索引
# Label lookup: fetch the element whose index label is 'a'.
print(ser_obj['a'])
# Positional lookup: use .iloc explicitly. Passing an integer to plain [] on
# a string-labelled Series is deprecated (pandas 2.1+) because integer keys
# will be treated as labels in a future version.
print(ser_obj.iloc[0])
0
0
切片索引
# Positional slice: the stop position is excluded, so this yields 'b' and 'c'.
print(ser_obj.iloc[1:3])
# Label slice: the stop label is included, so this yields 'b' through 'd'.
print(ser_obj.loc['b':'d'])
b 1
c 2
dtype: int32
b 1
c 2
d 3
dtype: int32
不连续索引
# Pick non-adjacent elements by position. .iloc is used explicitly because
# integer keys passed to plain [] on a string-labelled Series are deprecated
# as positions (pandas 2.1+).
print(ser_obj.iloc[[0, 2, 4]])
# Pick non-adjacent elements by label.
print(ser_obj[['a', 'e']])
a 0
c 2
e 4
dtype: int32
a 0
e 4
dtype: int32
布尔索引
# Element-wise comparison produces a boolean mask with the same index.
ser_bool = ser_obj.gt(2)
print(ser_bool)
# The mask keeps only the elements where it is True.
print(ser_obj[ser_bool])
# The same filter with the comparison written inline.
print(ser_obj[ser_obj.gt(2)])
a False
b False
c False
d True
e True
dtype: bool
d 3
e 4
dtype: int32
d 3
e 4
dtype: int32
DataFrame索引
import numpy as np
# 5x4 frame of random samples with columns labelled 'a'..'d'.
column_names = list('abcd')
df_obj = pd.DataFrame(np.random.randn(5, 4), columns=column_names)
print(df_obj.head())
a b c d
0 -0.255086 -1.605135 -0.491771 0.147356
1 -0.870266 -0.495241 -0.077998 1.017201
2 1.146990 -1.016143 -0.829765 3.012885
3 -0.331168 0.183293 0.898056 -0.595689
4 0.048942 1.577595 -0.980013 1.382445
列索引
print('列索引')
# A single column label returns that column as a Series.
print(df_obj['a'])
# A list of column labels returns a DataFrame, even for a single column.
# (Selecting with .loc[[0]] would pick a row, not a column.)
print(type(df_obj[['a']]))
列索引
0 -0.255086
1 -0.870266
2 1.146990
3 -0.331168
4 0.048942
Name: a, dtype: float64
不连续索引
print('不连续索引')
# Non-adjacent columns, selected by label.
wanted_columns = ['a', 'c']
print(df_obj[wanted_columns])
# Non-adjacent rows, selected by position.
wanted_rows = [1, 3]
print(df_obj.iloc[wanted_rows])
不连续索引
a c
0 -0.255086 -0.491771
1 -0.870266 -0.077998
2 1.146990 -0.829765
3 -0.331168 0.898056
4 0.048942 -0.980013
a b c d
1 -0.870266 -0.495241 -0.077998 1.017201
3 -0.331168 0.183293 0.898056 -0.595689
三种索引方式
标签索引 loc
# Series: label slices include the end label; plain [] and .loc agree here.
print(ser_obj['b':'d'])
print(ser_obj.loc['b':'d'])
# DataFrame: the whole column 'a', then only rows labelled 0 through 2 of it.
print(df_obj['a'])
print(df_obj.loc[0:2, 'a'])
b 1
c 2
d 3
dtype: int32
b 1
c 2
d 3
dtype: int32
0 -0.255086
1 -0.870266
2 1.146990
3 -0.331168
4 0.048942
Name: a, dtype: float64
0 -0.255086
1 -0.870266
2 1.146990
Name: a, dtype: float64
整型位置索引 iloc
# Series: integer slices are positional and exclude the stop position.
print(ser_obj[1:3])
print(ser_obj.iloc[1:3])
# DataFrame
print(df_obj.iloc[0:2, 0]) # Note the difference from df_obj.loc[0:2, 'a']: iloc excludes the stop row
b 1
c 2
dtype: int32
b 1
c 2
dtype: int32
0 -0.255086
1 -0.870266
Name: a, dtype: float64
混合索引 ix(注意:.ix 自 pandas 0.20 起已弃用,并在 pandas 1.0 中被移除,请改用 .loc 或 .iloc)
# NOTE: the mixed indexer `.ix` was deprecated in pandas 0.20 and removed in
# pandas 1.0, so each lookup is written with the explicit indexer that `.ix`
# resolved to for this data.
# Series
print(ser_obj.iloc[1:3])     # integer slice on a string-labelled index: positional
print(ser_obj.loc['b':'c'])  # label slice, end label included
# DataFrame
# `.ix` tried labels first and fell back to positions: the rows are
# integer-labelled, so 0:2 was a label slice (end inclusive), while the
# column key 0 is not a label and was therefore taken as a position.
print(df_obj.loc[0:2, df_obj.columns[0]])
b 1
c 2
dtype: int32
b 1
c 2
dtype: int32
0 -0.255086
1 -0.870266
2 1.146990
Name: a, dtype: float64
C:\Users\weixiao\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
"""Entry point for launching an IPython kernel.
C:\Users\weixiao\Anaconda3\lib\site-packages\ipykernel_launcher.py:5: DeprecationWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
"""
运算与对齐
# Two Series of different lengths whose integer indexes overlap on 0..4.
s1 = pd.Series(range(10, 20), index=range(10))
s2 = pd.Series(range(20, 25), index=range(5))
# Print each Series under its caption, separated by a blank line.
print('s1: ', s1, '', 's2: ', s2, sep='\n')
s1:
0 10
1 11
2 12
3 13
4 14
5 15
6 16
7 17
8 18
9 19
dtype: int32
s2:
0 20
1 21
2 22
3 23
4 24
dtype: int32
Series 对齐运算
# Addition aligns on index labels; labels present in only one operand give NaN.
s1 + s2
0 30.0
1 32.0
2 34.0
3 36.0
4 38.0
5 NaN
6 NaN
7 NaN
8 NaN
9 NaN
dtype: float64
import numpy as np
# Two all-ones frames of different shapes: 2x2 (a, b) and 3x3 (a, b, c).
df1 = pd.DataFrame(np.ones((2, 2)), columns=list('ab'))
df2 = pd.DataFrame(np.ones((3, 3)), columns=list('abc'))
# Print each frame under its caption, separated by a blank line.
print('df1: ', df1, '', 'df2: ', df2, sep='\n')
df1:
a b
0 1.0 1.0
1 1.0 1.0
df2:
a b c
0 1.0 1.0 1.0
1 1.0 1.0 1.0
2 1.0 1.0 1.0
DataFrame对齐操作
# Addition aligns on both row and column labels; cells missing from either
# operand come out as NaN.
df1 + df2
a b c
0 2.0 2.0 NaN
1 2.0 2.0 NaN
2 NaN NaN NaN
填充未对齐的数据进行运算
print(s1)
print(s2)
# fill_value stands in for the operand that is missing at an unaligned label,
# so labels 5..9 compute s1 + (-1) instead of producing NaN.
s1.add(s2, fill_value = -1)
0 10
1 11
2 12
3 13
4 14
5 15
6 16
7 17
8 18
9 19
dtype: int32
0 20
1 21
2 22
3 23
4 24
dtype: int32
0 30.0
1 32.0
2 34.0
3 36.0
4 38.0
5 14.0
6 15.0
7 16.0
8 17.0
9 18.0
dtype: float64
# Subtract with alignment; cells missing from one operand are treated as 2.0
# before subtracting.
df1.sub(df2, fill_value = 2.)
a b c
0 0.0 0.0 1.0
1 0.0 0.0 1.0
2 1.0 1.0 1.0
填充NaN
# Aligned addition; labels that exist only in s1 become NaN.
s3 = s1.add(s2)
print(s3)
0 30.0
1 32.0
2 34.0
3 36.0
4 38.0
5 NaN
6 NaN
7 NaN
8 NaN
9 NaN
dtype: float64
# Replace every NaN with -1, returning a new Series (s3 is left untouched).
s3_filled = s3.fillna(value=-1)
print(s3_filled)
0 30.0
1 32.0
2 34.0
3 36.0
4 38.0
5 -1.0
6 -1.0
7 -1.0
8 -1.0
9 -1.0
dtype: float64
# Aligned addition; cells missing from either frame become NaN.
df3 = df1.add(df2)
print(df3)
a b c
0 2.0 2.0 NaN
1 2.0 2.0 NaN
2 NaN NaN NaN
# Overwrite the NaN cells of df3 itself with 100 (inplace, so nothing is returned).
df3.fillna(value=100, inplace=True)
print(df3)
a b c
0 2.0 2.0 100.0
1 2.0 2.0 100.0
2 100.0 100.0 100.0
函数应用
Numpy ufunc 函数
# 5x4 frame of random samples shifted down by 1 so that most values are negative.
shifted_samples = np.random.randn(5, 4) - 1
df = pd.DataFrame(shifted_samples)
print(df)
# NumPy ufuncs apply element-wise to a DataFrame and return a DataFrame.
print(np.absolute(df))
0 1 2 3
0 -0.155651 -0.610114 -0.237815 -1.395167
1 0.016375 -0.782956 -1.605014 -0.596293
2 -0.749993 0.286530 -0.964842 -2.545031
3 0.754360 1.161125 -1.315330 -2.810557
4 -1.950679 -0.432384 -0.811125 -0.284081
0 1 2 3
0 0.155651 0.610114 0.237815 1.395167
1 0.016375 0.782956 1.605014 0.596293
2 0.749993 0.286530 0.964842 2.545031
3 0.754360 1.161125 1.315330 2.810557
4 1.950679 0.432384 0.811125 0.284081
使用apply应用行或列数据
# apply() hands each column to the function by default, so this prints the
# maximum of every column.
print(df.apply(lambda column: column.max()))
0 0.754360
1 1.161125
2 -0.237815
3 -0.284081
dtype: float64
指定轴方向
# With axis='columns' (the same as axis=1) the function receives each row,
# so this prints the maximum of every row.
print(df.apply(lambda row: row.max(), axis='columns'))
0 -0.155651
1 0.016375
2 0.286530
3 1.161125
4 -0.284081
dtype: float64
使用applymap应用到每个数据
def f2(x):
    """Format a single value as a string with two decimal places."""
    return '%.2f' % x


# DataFrame.applymap was deprecated in pandas 2.1 in favour of DataFrame.map.
# Prefer the new name when it exists and fall back on older pandas versions.
_apply_elementwise = df.map if hasattr(df, 'map') else df.applymap
print(_apply_elementwise(f2))
0 1 2 3
0 -0.16 -0.61 -0.24 -1.40
1 0.02 -0.78 -1.61 -0.60
2 -0.75 0.29 -0.96 -2.55
3 0.75 1.16 -1.32 -2.81
4 -1.95 -0.43 -0.81 -0.28
排序
# Random integer labels in [0, 5) so the index is unsorted and may repeat.
random_labels = np.random.randint(5, size=5)
s4 = pd.Series(range(10, 15), index=random_labels)
print(s4)
3 10
3 11
2 12
3 13
2 14
dtype: int32
索引排序
# Return a copy ordered by index label (ascending); s4 itself is unchanged.
s4.sort_index()
2 12
2 14
3 10
3 11
3 13
dtype: int32
# 3x4 frame of random samples whose row and column labels are themselves
# random integers, so both axes are unsorted and may contain duplicates.
# The three draws are kept in the original order (values, rows, columns).
cell_values = np.random.randn(3, 4)
row_labels = np.random.randint(3, size=3)
column_labels = np.random.randint(4, size=4)
df4 = pd.DataFrame(cell_values, index=row_labels, columns=column_labels)
print(df4)
2 2 1 0
0 0.578335 -1.058499 -2.050409 1.033859
2 -0.336811 0.062738 0.993909 -1.329422
1 -0.767846 -0.669763 1.197213 -0.060293
# Alternative: sort the row labels in descending order.
#df4.sort_index(ascending=False)
# Sort by column labels (axis=1); the result is returned, df4 is not modified.
df4.sort_index(axis=1)
按值排序
# Sort the rows by the values in the column labelled 1.
# NOTE(review): df4's column labels are drawn at random when it is built, so
# label 1 may be absent or duplicated, in which case this call is expected
# to raise -- confirm, or pin the labels, before relying on it.
df4.sort_values(by=1)
处理缺失数据
# The first row is random; the remaining rows carry deliberate NaN gaps.
rows = [
    np.random.randn(3),
    [1., np.nan, np.nan],
    [4., np.nan, np.nan],
    [1., np.nan, 2.],
]
df_data = pd.DataFrame(rows)
df_data.head()
isnull
# Boolean frame of the same shape: True wherever a cell is NaN.
df_data.isnull()
dropna
# Drop every row that contains at least one NaN (returns a new frame).
df_data.dropna()
# Alternative: drop every column that contains at least one NaN.
#df_data.dropna(axis=1)
fillna
# Return a copy with every NaN replaced by -100.0; df_data is unchanged.
df_data.fillna(-100.)