import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
对象创建
通过传入一些值的列表来创建一个Series, Pandas会自动创建一个默认的整数索引:
# Create a Series from a plain list of values. No index is passed, so pandas
# assigns the default integer index; the NaN entry gives a float64 Series.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s  # notebook echo: display the Series
0 1.0
1 3.0
2 5.0
3 NaN
4 6.0
5 8.0
dtype: float64
通过传递带有日期时间索引和带标签列的NumPy数组来创建DataFrame:
# Six consecutive daily timestamps starting at 2013-01-01; used as row labels.
dates = pd.date_range(start='20130101', periods=6)
dates  # notebook echo: display the DatetimeIndex
# 6x4 frame of standard-normal samples, rows labelled by `dates`, columns A-D.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
df  # notebook echo: display the frame
|
A |
B |
C |
D |
2013-01-01 |
-0.828948 |
0.281765 |
0.803692 |
0.030016 |
2013-01-02 |
0.418212 |
1.537528 |
0.407742 |
0.625449 |
2013-01-03 |
0.746757 |
-0.338140 |
-0.734583 |
-2.377116 |
2013-01-04 |
-0.507705 |
-0.409561 |
-2.596286 |
0.464993 |
2013-01-05 |
-0.154101 |
-0.675057 |
-0.747016 |
-0.192082 |
2013-01-06 |
0.892789 |
-1.848313 |
0.897434 |
0.157656 |
通过传递可以转化为类似Series的dict对象来创建DataFrame:
# Build a DataFrame from a dict whose values can each be turned into a column.
# The scalar entries (A, B, F) are repeated on every row of the 4-row frame.
df2 = pd.DataFrame(dict(
    A=1.0,
    B=pd.Timestamp('20130102'),
    C=pd.Series(1, index=list(range(4)), dtype='float32'),
    D=np.array([3, 3, 3, 3], dtype='int32'),
    E=pd.Categorical(["test", "train", "test", "train"]),
    F='foo',
))
df2  # notebook echo: display the frame
|
A |
B |
C |
D |
E |
F |
0 |
1.0 |
2013-01-02 |
1.0 |
3 |
test |
foo |
1 |
1.0 |
2013-01-02 |
1.0 |
3 |
train |
foo |
2 |
1.0 |
2013-01-02 |
1.0 |
3 |
test |
foo |
3 |
1.0 |
2013-01-02 |
1.0 |
3 |
train |
foo |
df2.dtypes
A float64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
查看数据
df.head()
|
A |
B |
C |
D |
2013-01-01 |
-0.828948 |
0.281765 |
0.803692 |
0.030016 |
2013-01-02 |
0.418212 |
1.537528 |
0.407742 |
0.625449 |
2013-01-03 |
0.746757 |
-0.338140 |
-0.734583 |
-2.377116 |
2013-01-04 |
-0.507705 |
-0.409561 |
-2.596286 |
0.464993 |
2013-01-05 |
-0.154101 |
-0.675057 |
-0.747016 |
-0.192082 |
df.tail(2)
|
A |
B |
C |
D |
2013-01-05 |
-0.154101 |
-0.675057 |
-0.747016 |
-0.192082 |
2013-01-06 |
0.892789 |
-1.848313 |
0.897434 |
0.157656 |
显示索引、列和底层NumPy数据:
df.index
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')
df.columns
Index(['A', 'B', 'C', 'D'], dtype='object')
df.values
array([[-0.82894761, 0.28176527, 0.80369199, 0.03001636],
[ 0.41821203, 1.53752828, 0.40774162, 0.62544912],
[ 0.74675688, -0.33814015, -0.73458287, -2.3771161 ],
[-0.5077046 , -0.4095612 , -2.59628619, 0.46499331],
[-0.15410053, -0.67505665, -0.74701636, -0.19208195],
[ 0.89278944, -1.84831322, 0.8974336 , 0.15765575]])
df.describe()
|
A |
B |
C |
D |
count |
6.000000 |
6.000000 |
6.000000 |
6.000000 |
mean |
0.094501 |
-0.241963 |
-0.328170 |
-0.215181 |
std |
0.699243 |
1.117690 |
1.327384 |
1.099356 |
min |
-0.828948 |
-1.848313 |
-2.596286 |
-2.377116 |
25% |
-0.419304 |
-0.608683 |
-0.743908 |
-0.136557 |
50% |
0.132056 |
-0.373851 |
-0.163421 |
0.093836 |
75% |
0.664621 |
0.126789 |
0.704704 |
0.388159 |
max |
0.892789 |
1.537528 |
0.897434 |
0.625449 |
df.T
|
2013-01-01 00:00:00 |
2013-01-02 00:00:00 |
2013-01-03 00:00:00 |
2013-01-04 00:00:00 |
2013-01-05 00:00:00 |
2013-01-06 00:00:00 |
A |
-0.828948 |
0.418212 |
0.746757 |
-0.507705 |
-0.154101 |
0.892789 |
B |
0.281765 |
1.537528 |
-0.338140 |
-0.409561 |
-0.675057 |
-1.848313 |
C |
0.803692 |
0.407742 |
-0.734583 |
-2.596286 |
-0.747016 |
0.897434 |
D |
0.030016 |
0.625449 |
-2.377116 |
0.464993 |
-0.192082 |
0.157656 |
# Sort by labels along each axis: descending first, then ascending.
print(df.sort_index(axis='columns', ascending=False))  # column labels D..A
print(df.sort_index(axis='index', ascending=False))    # latest date first
print(df.sort_index(axis='columns', ascending=True))   # column labels A..D
print(df.sort_index(axis='index', ascending=True))     # earliest date first
D C B A
2013-01-01 0.030016 0.803692 0.281765 -0.828948
2013-01-02 0.625449 0.407742 1.537528 0.418212
2013-01-03 -2.377116 -0.734583 -0.338140 0.746757
2013-01-04 0.464993 -2.596286 -0.409561 -0.507705
2013-01-05 -0.192082 -0.747016 -0.675057 -0.154101
2013-01-06 0.157656 0.897434 -1.848313 0.892789
A B C D
2013-01-06 0.892789 -1.848313 0.897434 0.157656
2013-01-05 -0.154101 -0.675057 -0.747016 -0.192082
2013-01-04 -0.507705 -0.409561 -2.596286 0.464993
2013-01-03 0.746757 -0.338140 -0.734583 -2.377116
2013-01-02 0.418212 1.537528 0.407742 0.625449
2013-01-01 -0.828948 0.281765 0.803692 0.030016
A B C D
2013-01-01 -0.828948 0.281765 0.803692 0.030016
2013-01-02 0.418212 1.537528 0.407742 0.625449
2013-01-03 0.746757 -0.338140 -0.734583 -2.377116
2013-01-04 -0.507705 -0.409561 -2.596286 0.464993
2013-01-05 -0.154101 -0.675057 -0.747016 -0.192082
2013-01-06 0.892789 -1.848313 0.897434 0.157656
A B C D
2013-01-01 -0.828948 0.281765 0.803692 0.030016
2013-01-02 0.418212 1.537528 0.407742 0.625449
2013-01-03 0.746757 -0.338140 -0.734583 -2.377116
2013-01-04 -0.507705 -0.409561 -2.596286 0.464993
2013-01-05 -0.154101 -0.675057 -0.747016 -0.192082
2013-01-06 0.892789 -1.848313 0.897434 0.157656
df.sort_values(by='B')
|
A |
B |
C |
D |
2013-01-06 |
0.892789 |
-1.848313 |
0.897434 |
0.157656 |
2013-01-05 |
-0.154101 |
-0.675057 |
-0.747016 |
-0.192082 |
2013-01-04 |
-0.507705 |
-0.409561 |
-2.596286 |
0.464993 |
2013-01-03 |
0.746757 |
-0.338140 |
-0.734583 |
-2.377116 |
2013-01-01 |
-0.828948 |
0.281765 |
0.803692 |
0.030016 |
2013-01-02 |
0.418212 |
1.537528 |
0.407742 |
0.625449 |
选择
df['A']
2013-01-01 -0.828948
2013-01-02 0.418212
2013-01-03 0.746757
2013-01-04 -0.507705
2013-01-05 -0.154101
2013-01-06 0.892789
Freq: D, Name: A, dtype: float64
df[0:3]
|
A |
B |
C |
D |
2013-01-01 |
-0.828948 |
0.281765 |
0.803692 |
0.030016 |
2013-01-02 |
0.418212 |
1.537528 |
0.407742 |
0.625449 |
2013-01-03 |
0.746757 |
-0.338140 |
-0.734583 |
-2.377116 |
df['20130102':'20130104']
|
A |
B |
C |
D |
2013-01-02 |
0.418212 |
1.537528 |
0.407742 |
0.625449 |
2013-01-03 |
0.746757 |
-0.338140 |
-0.734583 |
-2.377116 |
2013-01-04 |
-0.507705 |
-0.409561 |
-2.596286 |
0.464993 |
df.loc[dates[0]]
A -0.828948
B 0.281765
C 0.803692
D 0.030016
Name: 2013-01-01 00:00:00, dtype: float64
df.loc['20130102':'20130104',['A','B']]
|
A |
B |
2013-01-02 |
0.418212 |
1.537528 |
2013-01-03 |
0.746757 |
-0.338140 |
2013-01-04 |
-0.507705 |
-0.409561 |
df.loc['20130102',['A','B']]
A 0.418212
B 1.537528
Name: 2013-01-02 00:00:00, dtype: float64
df.at[dates[0],'A']
-0.8289476073976824
df.at[dates[0],'A']
-0.8289476073976824
df.iloc[3]
A -0.507705
B -0.409561
C -2.596286
D 0.464993
Name: 2013-01-04 00:00:00, dtype: float64
df.iloc[3:5,0:2]
|
A |
B |
2013-01-04 |
-0.507705 |
-0.409561 |
2013-01-05 |
-0.154101 |
-0.675057 |
df.iloc[[1,2,4],[0,2]]
|
A |
C |
2013-01-02 |
0.418212 |
0.407742 |
2013-01-03 |
0.746757 |
-0.734583 |
2013-01-05 |
-0.154101 |
-0.747016 |
df.iloc[1,1]
1.5375282822642125
布尔索引
df[df.A > 0]
|
A |
B |
C |
D |
2013-01-02 |
0.418212 |
1.537528 |
0.407742 |
0.625449 |
2013-01-03 |
0.746757 |
-0.338140 |
-0.734583 |
-2.377116 |
2013-01-06 |
0.892789 |
-1.848313 |
0.897434 |
0.157656 |
df[df > 0]
|
A |
B |
C |
D |
2013-01-01 |
NaN |
0.281765 |
0.803692 |
0.030016 |
2013-01-02 |
0.418212 |
1.537528 |
0.407742 |
0.625449 |
2013-01-03 |
0.746757 |
NaN |
NaN |
NaN |
2013-01-04 |
NaN |
NaN |
NaN |
0.464993 |
2013-01-05 |
NaN |
NaN |
NaN |
NaN |
2013-01-06 |
0.892789 |
NaN |
0.897434 |
0.157656 |
赋值
# Integers 1..6 indexed by six daily timestamps starting at 2013-01-02.
s1 = pd.Series(range(1, 7), index=pd.date_range('20130102', periods=6))
s1  # notebook echo: display the Series
2013-01-02 1
2013-01-03 2
2013-01-04 3
2013-01-05 4
2013-01-06 5
2013-01-07 6
Freq: D, dtype: int64
# Add column F from s1. Values are matched on the date index, so a row whose
# date is missing from s1 (2013-01-01 here) gets NaN.
df['F'] = s1
# Set a single value by label: row dates[0], column 'A'.
df.at[dates[0],'A'] = 0
# Set a single value by integer position: row 0, column 1.
df.iat[0,1] = 0
# Overwrite the whole D column with an array of 5s, one per row.
df.loc[:,'D'] = np.array([5] * len(df))
df
|
A |
B |
C |
D |
F |
2013-01-01 |
0.000000 |
0.000000 |
0.803692 |
5 |
NaN |
2013-01-02 |
0.418212 |
1.537528 |
0.407742 |
5 |
1.0 |
2013-01-03 |
0.746757 |
-0.338140 |
-0.734583 |
5 |
2.0 |
2013-01-04 |
-0.507705 |
-0.409561 |
-2.596286 |
5 |
3.0 |
2013-01-05 |
-0.154101 |
-0.675057 |
-0.747016 |
5 |
4.0 |
2013-01-06 |
0.892789 |
-1.848313 |
0.897434 |
5 |
5.0 |
# Work on a copy so that df itself is left unchanged.
df2 = df.copy()
# Boolean-mask assignment: every positive entry is replaced by its negation.
df2[df2 > 0] = -df2
df2
|
A |
B |
C |
D |
F |
2013-01-01 |
0.000000 |
0.000000 |
-0.803692 |
-5 |
NaN |
2013-01-02 |
-0.418212 |
-1.537528 |
-0.407742 |
-5 |
-1.0 |
2013-01-03 |
-0.746757 |
-0.338140 |
-0.734583 |
-5 |
-2.0 |
2013-01-04 |
-0.507705 |
-0.409561 |
-2.596286 |
-5 |
-3.0 |
2013-01-05 |
-0.154101 |
-0.675057 |
-0.747016 |
-5 |
-4.0 |
2013-01-06 |
-0.892789 |
-1.848313 |
-0.897434 |
-5 |
-5.0 |
插入
# Assigning to a row label that does not exist yet appends a new row; the list
# supplies one value per column (A, B, C, D, F).
df.loc['new']=[1,2,3,4,5]
df
|
A |
B |
C |
D |
F |
2013-01-01 00:00:00 |
0.000000 |
0.000000 |
0.803692 |
5 |
NaN |
2013-01-02 00:00:00 |
0.418212 |
1.537528 |
0.407742 |
5 |
1.0 |
2013-01-03 00:00:00 |
0.746757 |
-0.338140 |
-0.734583 |
5 |
2.0 |
2013-01-04 00:00:00 |
-0.507705 |
-0.409561 |
-2.596286 |
5 |
3.0 |
2013-01-05 00:00:00 |
-0.154101 |
-0.675057 |
-0.747016 |
5 |
4.0 |
2013-01-06 00:00:00 |
0.892789 |
-1.848313 |
0.897434 |
5 |
5.0 |
new |
1.000000 |
2.000000 |
3.000000 |
4 |
5.0 |
# One-row frame of 6s that uses the same column labels as df.
df3 = pd.DataFrame([[6, 6, 6, 6, 6]], columns=df.columns)
# Stack df3 under df; ignore_index=True renumbers the rows 0..n-1.
df_new = pd.concat([df, df3], ignore_index=True)
df_new  # notebook echo: display the combined frame
|
A |
B |
C |
D |
F |
0 |
0.000000 |
0.000000 |
0.803692 |
5 |
NaN |
1 |
0.418212 |
1.537528 |
0.407742 |
5 |
1.0 |
2 |
0.746757 |
-0.338140 |
-0.734583 |
5 |
2.0 |
3 |
-0.507705 |
-0.409561 |
-2.596286 |
5 |
3.0 |
4 |
-0.154101 |
-0.675057 |
-0.747016 |
5 |
4.0 |
5 |
0.892789 |
-1.848313 |
0.897434 |
5 |
5.0 |
6 |
1.000000 |
2.000000 |
3.000000 |
4 |
5.0 |
7 |
6.000000 |
6.000000 |
6.000000 |
6 |
6.0 |
统计
df.mean()
A 0.342279
B 0.038065
C 0.147283
D 4.857143
F 3.333333
dtype: float64
df.mean(1)
2013-01-01 00:00:00 1.450923
2013-01-02 00:00:00 1.672696
2013-01-03 00:00:00 1.334807
2013-01-04 00:00:00 0.897290
2013-01-05 00:00:00 1.484765
2013-01-06 00:00:00 1.988382
new 3.000000
dtype: float64
df.sum()
A 2.395953
B 0.266457
C 1.030982
D 34.000000
F 20.000000
dtype: float64
df.sum(1)
2013-01-01 00:00:00 5.803692
2013-01-02 00:00:00 8.363482
2013-01-03 00:00:00 6.674034
2013-01-04 00:00:00 4.486448
2013-01-05 00:00:00 7.423826
2013-01-06 00:00:00 9.941910
new 15.000000
dtype: float64
df.var()
A 0.331841
B 1.751315
C 3.050677
D 0.142857
F 2.666667
dtype: float64
df.std()
df.corr()
df.cov()
df.describe()
统计作图
import matplotlib.pyplot as plt

# Global matplotlib settings, applied in one call.
# NOTE(review): SimHei is presumably selected so Chinese text renders in
# figures, with unicode_minus disabled so minus signs still display with
# that font; confirm on the target machine.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
# Open a new figure of size 7 x 5 (matplotlib measures figsize in inches).
plt.figure(figsize=(7, 5))
plt.plot(x, y, s)
这是Matplotlib通用的绘图方式,绘制y对于x的图形(即以x为横轴的二维图形),字符串参量s指定绘制时图形的类型、样式和颜色,常用的选项有:'b'为蓝色、'r'为红色、'g'为绿色、'o'为圆圈、'+'为加号标记、'-'为实线、'--'为虚线。当x、y均为实数同维向量时,则描出点(x(i),y(i)),然后用直线依次相连。
df.plot(kind='box')
这里使用的是DataFrame或Series对象内置的方法作图,默认以Index为横坐标,每列数据为纵坐标自动作图,通过kind参数指定作图类型,支持line(线)、bar(条形)、barh(横向条形)、hist(直方图)、box(箱线图)、kde(密度图)、area(面积图)和pie(饼图)等,同时也能够接受plt.plot()中接受的参数。因此,如果数据已经被加载为Pandas中的对象,那么以这种方式作图是比较简洁的。
# Sample one full period of the sine curve at 50 evenly spaced points.
x = np.linspace(0, 2 * np.pi, num=50)
y = np.sin(x)
# Format string 'bp--': blue ('b'), pentagon markers ('p'), dashed line ('--').
plt.plot(x, y, 'bp--')
plt.show()
plt.pie(size)
使用Matplotlib绘制饼图,其中size是一个列表(下面的示例中对应变量sizes),记录各个扇形的比例。pie有丰富的参数,例如下面示例中用到的explode、labels、colors、autopct、shadow和startangle。
import matplotlib.pyplot as plt

# One entry per wedge; the four sequences below are parallel to each other.
labels = ('Frogs', 'Hogs', 'Dogs', 'Logs')
sizes = [15, 30, 45, 10]
colors = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral']
explode = (0, 0.1, 0, 0)  # only the second wedge ('Hogs') is offset

plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',  # label each wedge with its percentage, one decimal
    shadow=True,
    startangle=90,
)
# Equal axis scaling so the pie is drawn as a circle rather than an ellipse.
plt.axis('equal')
plt.show()