10分钟学习pandas

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
s = pd.Series([1,3,5,np.nan,6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64
len(s)
6
s.describe()
count    5.000000
mean     4.600000
std      2.701851
min      1.000000
25%      3.000000
50%      5.000000
75%      6.000000
max      8.000000
dtype: float64
dates = pd.date_range('20190101',periods=6)
dates
DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
               '2019-01-05', '2019-01-06'],
              dtype='datetime64[ns]', freq='D')
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))#randn返回服从标准正态分布的随机样本
df
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
A B C D
2019-01-01 0.317575 0.330549 0.055375 -1.284453
2019-01-02 -0.272201 -0.770618 0.217658 1.349425
2019-01-03 0.842686 -0.973354 0.596166 0.215889
2019-01-04 0.252868 -0.778050 -0.324255 -1.378140
2019-01-05 0.281876 1.143140 -0.781625 -1.245069
2019-01-06 -0.468258 -1.682376 1.494058 1.262588
df.describe()
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean 0.159091 -0.455118 0.209563 -0.179960
std 0.468160 1.014914 0.786212 1.293573
min -0.468258 -1.682376 -0.781625 -1.378140
25% -0.140934 -0.924528 -0.229347 -1.274607
50% 0.267372 -0.774334 0.136517 -0.514590
75% 0.308650 0.055257 0.501539 1.000913
max 0.842686 1.143140 1.494058 1.349425
df.shape
(6, 4)
df2 = pd.DataFrame({'A':1.,
                   'B':pd.Timestamp('20190101'),
                   'C':pd.Series(1,index=list(range(4)),dtype='float32'),
                   'D':np.array([3]*4,dtype='int32'),
                   'E':pd.Categorical(["test","train","test","train"]),
                   'F':'foo'})#foo被用作占位符的名字,用于实体举例,类似有foo,bar,baz
df2
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
A B C D E F
0 1.0 2019-01-01 1.0 3 test foo
1 1.0 2019-01-01 1.0 3 train foo
2 1.0 2019-01-01 1.0 3 test foo
3 1.0 2019-01-01 1.0 3 train foo
df2.dtypes
A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object
df.head()
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
A B C D
2019-01-01 0.317575 0.330549 0.055375 -1.284453
2019-01-02 -0.272201 -0.770618 0.217658 1.349425
2019-01-03 0.842686 -0.973354 0.596166 0.215889
2019-01-04 0.252868 -0.778050 -0.324255 -1.378140
2019-01-05 0.281876 1.143140 -0.781625 -1.245069
df.tail()
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
A B C D
2019-01-02 -0.272201 -0.770618 0.217658 1.349425
2019-01-03 0.842686 -0.973354 0.596166 0.215889
2019-01-04 0.252868 -0.778050 -0.324255 -1.378140
2019-01-05 0.281876 1.143140 -0.781625 -1.245069
2019-01-06 -0.468258 -1.682376 1.494058 1.262588
df.index
DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
               '2019-01-05', '2019-01-06'],
              dtype='datetime64[ns]', freq='D')
df.columns
Index(['A', 'B', 'C', 'D'], dtype='object')

df.values
array([[ 0.31757454,  0.33054893,  0.05537508, -1.28445319],
       [-0.27220143, -0.77061807,  0.21765843,  1.34942538],
       [ 0.84268621, -0.97335385,  0.59616646,  0.21588867],
       [ 0.25286828, -0.77804969, -0.32425479, -1.37813964],
       [ 0.28187609,  1.14314031, -0.78162546, -1.24506887],
       [-0.4682577 , -1.68237556,  1.49405812,  1.26258772]])

df.describe()#以列统计
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

A B C D
count 6.000000 6.000000 6.000000 6.000000
mean 0.159091 -0.455118 0.209563 -0.179960
std 0.468160 1.014914 0.786212 1.293573
min -0.468258 -1.682376 -0.781625 -1.378140
25% -0.140934 -0.924528 -0.229347 -1.274607
50% 0.267372 -0.774334 0.136517 -0.514590
75% 0.308650 0.055257 0.501539 1.000913
max 0.842686 1.143140 1.494058 1.349425
df2.describe(include='all')
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

A B C D E F
count 4.0 4 4.0 4.0 4 4
unique NaN 1 NaN NaN 2 1
top NaN 2019-01-01 00:00:00 NaN NaN train foo
freq NaN 4 NaN NaN 2 4
first NaN 2019-01-01 00:00:00 NaN NaN NaN NaN
last NaN 2019-01-01 00:00:00 NaN NaN NaN NaN
mean 1.0 NaN 1.0 3.0 NaN NaN
std 0.0 NaN 0.0 0.0 NaN NaN
min 1.0 NaN 1.0 3.0 NaN NaN
25% 1.0 NaN 1.0 3.0 NaN NaN
50% 1.0 NaN 1.0 3.0 NaN NaN
75% 1.0 NaN 1.0 3.0 NaN NaN
max 1.0 NaN 1.0 3.0 NaN NaN
df.T
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

2019-01-01 00:00:00 2019-01-02 00:00:00 2019-01-03 00:00:00 2019-01-04 00:00:00 2019-01-05 00:00:00 2019-01-06 00:00:00
A 0.317575 -0.272201 0.842686 0.252868 0.281876 -0.468258
B 0.330549 -0.770618 -0.973354 -0.778050 1.143140 -1.682376
C 0.055375 0.217658 0.596166 -0.324255 -0.781625 1.494058
D -1.284453 1.349425 0.215889 -1.378140 -1.245069 1.262588
df.sort_index(axis=1,ascending=True)#axis=1表示按列标签排序,ascending=True为升序
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

A B C D
2019-01-01 0.317575 0.330549 0.055375 -1.284453
2019-01-02 -0.272201 -0.770618 0.217658 1.349425
2019-01-03 0.842686 -0.973354 0.596166 0.215889
2019-01-04 0.252868 -0.778050 -0.324255 -1.378140
2019-01-05 0.281876 1.143140 -0.781625 -1.245069
2019-01-06 -0.468258 -1.682376 1.494058 1.262588
df.sort_values(by='B')#按B列的值对各行升序排列
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

A B C D
2019-01-06 -0.468258 -1.682376 1.494058 1.262588
2019-01-03 0.842686 -0.973354 0.596166 0.215889
2019-01-04 0.252868 -0.778050 -0.324255 -1.378140
2019-01-02 -0.272201 -0.770618 0.217658 1.349425
2019-01-01 0.317575 0.330549 0.055375 -1.284453
2019-01-05 0.281876 1.143140 -0.781625 -1.245069
df['A']#df.A 选择列
2019-01-01    0.317575
2019-01-02   -0.272201
2019-01-03    0.842686
2019-01-04    0.252868
2019-01-05    0.281876
2019-01-06   -0.468258
Freq: D, Name: A, dtype: float64

df[0:3]#只显示0,1,2,此时切割行
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

A B C D
2019-01-01 0.317575 0.330549 0.055375 -1.284453
2019-01-02 -0.272201 -0.770618 0.217658 1.349425
2019-01-03 0.842686 -0.973354 0.596166 0.215889
df['20190101':'20190103']#此时包含头尾
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

A B C D
2019-01-01 0.317575 0.330549 0.055375 -1.284453
2019-01-02 -0.272201 -0.770618 0.217658 1.349425
2019-01-03 0.842686 -0.973354 0.596166 0.215889
df.loc[dates[0]]#行
A    0.317575
B    0.330549
C    0.055375
D   -1.284453
Name: 2019-01-01 00:00:00, dtype: float64

dates[0]
Timestamp('2019-01-01 00:00:00', freq='D')

df.loc[:,['A','B']]#列
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

A B
2019-01-01 0.317575 0.330549
2019-01-02 -0.272201 -0.770618
2019-01-03 0.842686 -0.973354
2019-01-04 0.252868 -0.778050
2019-01-05 0.281876 1.143140
2019-01-06 -0.468258 -1.682376
df.loc['20190104',['A','B']].shape#选中单行时结果降维为一维Series,故形状为(2,)
(2,)

#loc,at也可用来获得标量值
df.loc[dates[0],'A']
0.3175745430028141

df.at[dates[0],'A']
0.3175745430028141

df.at[dates[0],'A']#at通过标签、iat通过整数位置访问单个标量值
df.iat[1,2]#iat使用整数行号和列号(从0开始)
0.21765843113729494

df.iloc[3]
A    0.252868
B   -0.778050
C   -0.324255
D   -1.378140
Name: 2019-01-04 00:00:00, dtype: float64

df.iloc[3,1]
-0.7780496857268889

df.iloc[[1,2,4],[0,2]]
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

A C
2019-01-02 -0.272201 0.217658
2019-01-03 0.842686 0.596166
2019-01-05 0.281876 -0.781625
df.iloc[1:3,:]
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

A B C D
2019-01-02 -0.272201 -0.770618 0.217658 1.349425
2019-01-03 0.842686 -0.973354 0.596166 0.215889
df.iloc[1:3]
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

A B C D
2019-01-02 -0.272201 -0.770618 0.217658 1.349425
2019-01-03 0.842686 -0.973354 0.596166 0.215889
df[df.A>0]
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

A B C D
2019-01-01 0.317575 0.330549 0.055375 -1.284453
2019-01-03 0.842686 -0.973354 0.596166 0.215889
2019-01-04 0.252868 -0.778050 -0.324255 -1.378140
2019-01-05 0.281876 1.143140 -0.781625 -1.245069
df[df>0]
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

A B C D
2019-01-01 0.317575 0.330549 0.055375 NaN
2019-01-02 NaN NaN 0.217658 1.349425
2019-01-03 0.842686 NaN 0.596166 0.215889
2019-01-04 0.252868 NaN NaN NaN
2019-01-05 0.281876 1.143140 NaN NaN
2019-01-06 NaN NaN 1.494058 1.262588
df3 = df.copy()
df3['E']=['one','two','three','four','three','one']
df3
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

A B C D E
2019-01-01 0.317575 0.330549 0.055375 -1.284453 one
2019-01-02 -0.272201 -0.770618 0.217658 1.349425 two
2019-01-03 0.842686 -0.973354 0.596166 0.215889 three
2019-01-04 0.252868 -0.778050 -0.324255 -1.378140 four
2019-01-05 0.281876 1.143140 -0.781625 -1.245069 three
2019-01-06 -0.468258 -1.682376 1.494058 1.262588 one
df3[df3['E'].isin(['one','two'])]
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

A B C D E
2019-01-01 0.317575 0.330549 0.055375 -1.284453 one
2019-01-02 -0.272201 -0.770618 0.217658 1.349425 two
2019-01-06 -0.468258 -1.682376 1.494058 1.262588 one
#设置新列时会自动按索引对齐
s1= pd.Series([1,2,3,4,5,6],index=pd.date_range('20190102',periods=6))
s1
2019-01-02    1
2019-01-03    2
2019-01-04    3
2019-01-05    4
2019-01-06    5
2019-01-07    6
Freq: D, dtype: int64

df['F']=s1
df
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

A B C D F
2019-01-01 0.317575 0.330549 0.055375 -1.284453 NaN
2019-01-02 -0.272201 -0.770618 0.217658 1.349425 1.0
2019-01-03 0.842686 -0.973354 0.596166 0.215889 2.0
2019-01-04 0.252868 -0.778050 -0.324255 -1.378140 3.0
2019-01-05 0.281876 1.143140 -0.781625 -1.245069 4.0
2019-01-06 -0.468258 -1.682376 1.494058 1.262588 5.0
df.at[dates[0],'A']=0
df.iat[0,1]=0
df.loc[:,'D']=np.array([5]*len(df))
df
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

A B C D F
2019-01-01 0.000000 0.000000 0.055375 5 NaN
2019-01-02 -0.272201 -0.770618 0.217658 5 1.0
2019-01-03 0.842686 -0.973354 0.596166 5 2.0
2019-01-04 0.252868 -0.778050 -0.324255 5 3.0
2019-01-05 0.281876 1.143140 -0.781625 5 4.0
2019-01-06 -0.468258 -1.682376 1.494058 5 5.0
df4 = df.copy()
df4[df4>0]=-df4
df4
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

A B C D F
2019-01-01 0.000000 0.000000 -0.055375 -5 NaN
2019-01-02 -0.272201 -0.770618 -0.217658 -5 -1.0
2019-01-03 -0.842686 -0.973354 -0.596166 -5 -2.0
2019-01-04 -0.252868 -0.778050 -0.324255 -5 -3.0
2019-01-05 -0.281876 -1.143140 -0.781625 -5 -4.0
2019-01-06 -0.468258 -1.682376 -1.494058 -5 -5.0
df1 = df.reindex(index=dates[0:4],columns=list(df.columns)+['E'])
df1.loc[dates[0]:dates[1],'E']=1
df1
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

A B C D F E
2019-01-01 0.000000 0.000000 0.055375 5 NaN 1.0
2019-01-02 -0.272201 -0.770618 0.217658 5 1.0 1.0
2019-01-03 0.842686 -0.973354 0.596166 5 2.0 NaN
2019-01-04 0.252868 -0.778050 -0.324255 5 3.0 NaN
df1.dropna(how='any')
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

A B C D F E
2019-01-02 -0.272201 -0.770618 0.217658 5 1.0 1.0
df1.fillna(value=5)
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

A B C D F E
2019-01-01 0.000000 0.000000 0.055375 5 5.0 1.0
2019-01-02 -0.272201 -0.770618 0.217658 5 1.0 1.0
2019-01-03 0.842686 -0.973354 0.596166 5 2.0 5.0
2019-01-04 0.252868 -0.778050 -0.324255 5 3.0 5.0
pd.isna(df1)
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

A B C D F E
2019-01-01 False False False False True False
2019-01-02 False False False False False False
2019-01-03 False False False False False True
2019-01-04 False False False False False True
df.mean()#列
A    0.106162
B   -0.510209
C    0.209563
D    5.000000
F    3.000000
dtype: float64

df.mean(1)#行
2019-01-01    1.263844
2019-01-02    1.034968
2019-01-03    1.493100
2019-01-04    1.430113
2019-01-05    1.928678
2019-01-06    1.868685
Freq: D, dtype: float64

#操作具有不同维度且需要对齐的对象。此外,pandas会自动沿着指定的维度广播。
s = pd.Series([1,3,5,np.nan,6,8],index=dates).shift(2)
s
2019-01-01    NaN
2019-01-02    NaN
2019-01-03    1.0
2019-01-04    3.0
2019-01-05    5.0
2019-01-06    NaN
Freq: D, dtype: float64

df.sub(s,axis='index')#按行索引对齐,每一列都减去s中对应的值(s为NaN的行结果为NaN)
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

A B C D F
2019-01-01 NaN NaN NaN NaN NaN
2019-01-02 NaN NaN NaN NaN NaN
2019-01-03 -0.157314 -1.973354 -0.403834 4.0 1.0
2019-01-04 -2.747132 -3.778050 -3.324255 2.0 0.0
2019-01-05 -4.718124 -3.856860 -5.781625 0.0 -1.0
2019-01-06 NaN NaN NaN NaN NaN
df
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

A B C D F
2019-01-01 0.000000 0.000000 0.055375 5 NaN
2019-01-02 -0.272201 -0.770618 0.217658 5 1.0
2019-01-03 0.842686 -0.973354 0.596166 5 2.0
2019-01-04 0.252868 -0.778050 -0.324255 5 3.0
2019-01-05 0.281876 1.143140 -0.781625 5 4.0
2019-01-06 -0.468258 -1.682376 1.494058 5 5.0
df.apply(np.cumsum)#对每一列自上而下逐行累加
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

A B C D F
2019-01-01 0.000000 0.000000 0.055375 5 NaN
2019-01-02 -0.272201 -0.770618 0.273034 10 1.0
2019-01-03 0.570485 -1.743972 0.869200 15 3.0
2019-01-04 0.823353 -2.522022 0.544945 20 6.0
2019-01-05 1.105229 -1.378881 -0.236680 25 10.0
2019-01-06 0.636971 -3.061257 1.257378 30 15.0
df.apply(lambda x:x.max()-x.min())#以列
A    1.310944
B    2.825516
C    2.275684
D    0.000000
F    4.000000
dtype: float64

df['F'].value_counts()
5.0    1
4.0    1
3.0    1
2.0    1
1.0    1
Name: F, dtype: int64

s = pd.Series(np.random.randint(0,7,size=10))
s
0    1
1    1
2    1
3    2
4    3
5    2
6    6
7    0
8    4
9    6
dtype: int32

s.dtype
dtype('int32')

s.shape
(10,)

s.value_counts()
1    3
6    2
2    2
4    1
3    1
0    1
dtype: int64

s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s.dtype
dtype('O')

s.str


s.str.lower()
0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

#数据合并
df = pd.DataFrame(np.random.randn(10,4))
df
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

0 1 2 3
0 -0.936960 -0.210650 1.887069 0.128005
1 0.657660 0.254801 -0.092190 -1.957160
2 -0.920141 -2.259964 0.083965 0.371865
3 -0.755403 0.633426 0.090949 -0.626964
4 0.170052 1.164537 -1.193706 1.391785
5 0.084560 -1.513814 0.069032 0.099851
6 -0.683663 0.031873 -0.062998 0.523253
7 -0.926594 0.125286 -1.894089 -0.449402
8 0.610722 0.329156 0.025149 -2.673445
9 0.336673 1.205792 -1.346179 0.214389
pieces = [df[:3],df[3:7],df[7:]]
pieces
[          0         1         2         3
 0 -0.936960 -0.210650  1.887069  0.128005
 1  0.657660  0.254801 -0.092190 -1.957160
 2 -0.920141 -2.259964  0.083965  0.371865,
           0         1         2         3
 3 -0.755403  0.633426  0.090949 -0.626964
 4  0.170052  1.164537 -1.193706  1.391785
 5  0.084560 -1.513814  0.069032  0.099851
 6 -0.683663  0.031873 -0.062998  0.523253,
           0         1         2         3
 7 -0.926594  0.125286 -1.894089 -0.449402
 8  0.610722  0.329156  0.025149 -2.673445
 9  0.336673  1.205792 -1.346179  0.214389]

pd.concat(pieces)
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

0 1 2 3
0 -0.936960 -0.210650 1.887069 0.128005
1 0.657660 0.254801 -0.092190 -1.957160
2 -0.920141 -2.259964 0.083965 0.371865
3 -0.755403 0.633426 0.090949 -0.626964
4 0.170052 1.164537 -1.193706 1.391785
5 0.084560 -1.513814 0.069032 0.099851
6 -0.683663 0.031873 -0.062998 0.523253
7 -0.926594 0.125286 -1.894089 -0.449402
8 0.610722 0.329156 0.025149 -2.673445
9 0.336673 1.205792 -1.346179 0.214389

concat

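  1. axis=1按列方向(横向)合并;axis=0按行方向(纵向)合并(默认)
  2. join='inner'只保留另一轴上各对象共有的标签(交集);join='outer'保留所有标签(并集,默认),缺失处填NaN
  3. ignore_index=True会丢弃原索引并重新生成0到n-1的连续索引;ignore_index=False(默认)则保留原有的索引值
  4. verify_integrity=True会检查合并后的轴上是否存在重复索引,若有则抛出异常
  5. keys会以传入的键作为最外层,生成层次化索引(MultiIndex),而不是多维数组
  6. names用于为生成的层次化索引的各层级命名(参数名是names,不是name)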
pd.concat(pieces,axis=1)
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

0 1 2 3 0 1 2 3 0 1 2 3
0 -0.936960 -0.210650 1.887069 0.128005 NaN NaN NaN NaN NaN NaN NaN NaN
1 0.657660 0.254801 -0.092190 -1.957160 NaN NaN NaN NaN NaN NaN NaN NaN
2 -0.920141 -2.259964 0.083965 0.371865 NaN NaN NaN NaN NaN NaN NaN NaN
3 NaN NaN NaN NaN -0.755403 0.633426 0.090949 -0.626964 NaN NaN NaN NaN
4 NaN NaN NaN NaN 0.170052 1.164537 -1.193706 1.391785 NaN NaN NaN NaN
5 NaN NaN NaN NaN 0.084560 -1.513814 0.069032 0.099851 NaN NaN NaN NaN
6 NaN NaN NaN NaN -0.683663 0.031873 -0.062998 0.523253 NaN NaN NaN NaN
7 NaN NaN NaN NaN NaN NaN NaN NaN -0.926594 0.125286 -1.894089 -0.449402
8 NaN NaN NaN NaN NaN NaN NaN NaN 0.610722 0.329156 0.025149 -2.673445
9 NaN NaN NaN NaN NaN NaN NaN NaN 0.336673 1.205792 -1.346179 0.214389
pd.concat(pieces,join='outer',axis=1)
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

0 1 2 3 0 1 2 3 0 1 2 3
0 -0.936960 -0.210650 1.887069 0.128005 NaN NaN NaN NaN NaN NaN NaN NaN
1 0.657660 0.254801 -0.092190 -1.957160 NaN NaN NaN NaN NaN NaN NaN NaN
2 -0.920141 -2.259964 0.083965 0.371865 NaN NaN NaN NaN NaN NaN NaN NaN
3 NaN NaN NaN NaN -0.755403 0.633426 0.090949 -0.626964 NaN NaN NaN NaN
4 NaN NaN NaN NaN 0.170052 1.164537 -1.193706 1.391785 NaN NaN NaN NaN
5 NaN NaN NaN NaN 0.084560 -1.513814 0.069032 0.099851 NaN NaN NaN NaN
6 NaN NaN NaN NaN -0.683663 0.031873 -0.062998 0.523253 NaN NaN NaN NaN
7 NaN NaN NaN NaN NaN NaN NaN NaN -0.926594 0.125286 -1.894089 -0.449402
8 NaN NaN NaN NaN NaN NaN NaN NaN 0.610722 0.329156 0.025149 -2.673445
9 NaN NaN NaN NaN NaN NaN NaN NaN 0.336673 1.205792 -1.346179 0.214389
#sql格式合并
left = pd.DataFrame({'key':['foo','bar'],'lval':[1,2]})
right = pd.DataFrame({'key':['foo','bar'],'rval':[4,5]})
left
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

key lval
0 foo 1
1 bar 2
right
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

key rval
0 foo 4
1 bar 5
pd.merge(left,right,on='key')
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

key lval rval
0 foo 1 4
1 bar 2 5
left = pd.DataFrame({'key':['foo','foo'],'lval':[1,2]})
right = pd.DataFrame({'key':['foo','foo'],'rval':[4,5]})
left
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

key lval
0 foo 1
1 foo 2
right
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

key rval
0 foo 4
1 foo 5
pd.merge(left,right,on='key')#以key列为键进行合并;键有重复时按笛卡尔积匹配(此处2×2=4行)
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

key lval rval
0 foo 1 4
1 foo 1 5
2 foo 2 4
3 foo 2 5
#将行追加(append)到DataFrame中;注意DataFrame.append已在pandas 1.4弃用、2.0移除,新版本请改用pd.concat([df, s.to_frame().T])
df = pd.DataFrame(np.random.randn(8,4),columns=list('ABCD'))
df
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

A B C D
0 0.862756 -1.436692 0.367396 1.033803
1 -0.732357 -0.350199 -0.116083 -2.435210
2 0.316582 0.468616 0.433001 -0.443120
3 -0.189939 -2.437137 0.126893 -2.273711
4 0.913514 -0.752727 -1.651140 1.156839
5 -0.314581 1.296585 0.579130 -0.871556
6 0.361473 0.687854 -1.044602 0.233138
7 0.045199 2.176608 -0.258569 -1.018576
s = df.iloc[3]
df.append(s,ignore_index=False)
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

A B C D
0 0.862756 -1.436692 0.367396 1.033803
1 -0.732357 -0.350199 -0.116083 -2.435210
2 0.316582 0.468616 0.433001 -0.443120
3 -0.189939 -2.437137 0.126893 -2.273711
4 0.913514 -0.752727 -1.651140 1.156839
5 -0.314581 1.296585 0.579130 -0.871556
6 0.361473 0.687854 -1.044602 0.233138
7 0.045199 2.176608 -0.258569 -1.018576
3 -0.189939 -2.437137 0.126893 -2.273711

“分组(group by)”是指包含以下一个或多个步骤的过程:
- 拆分(Splitting):根据某些标准将数据分成若干组
- 应用(Applying):对每个组独立地应用一个函数
- 合并(Combining):将各组的结果组合成一个数据结构

df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three',
                          'two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})
df
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

A B C D
0 foo one 0.982988 -0.386029
1 bar one 1.632482 -0.327520
2 foo two 0.149619 -0.138297
3 bar three -1.480397 1.105690
4 foo two 0.647044 -1.097276
5 bar two -0.675596 0.250176
6 foo one 0.437309 1.031742
7 foo three 0.434659 1.197695
df.groupby('A').sum()
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

C D
A
bar -0.523512 1.028346
foo 2.651618 0.607834
df.groupby('A').max()
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

B C D
A
bar two 1.632482 1.105690
foo two 0.982988 1.197695
df.groupby(['A','B']).sum()
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

C D
A B
bar one 1.632482 -0.327520
three -1.480397 1.105690
two -0.675596 0.250176
foo one 1.420297 0.645713
three 0.434659 1.197695
two 0.796663 -1.235574
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
                     'foo', 'foo', 'qux', 'qux'],
                    ['one', 'two', 'one', 'two',
                     'one', 'two', 'one', 'two']]))
tuples
[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

zip() 函数以若干可迭代对象为参数,将各对象中对应位置的元素打包成一个个元组。在Python 2中它直接返回由这些元组组成的列表;在Python 3中为了节省内存,它返回的是一个惰性迭代器,需要用list()转换才能得到列表(如上面的list(zip(...)))。

index = pd.MultiIndex.from_tuples(tuples,names=['first','second'])
index
MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
           labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
           names=['first', 'second'])

df = pd.DataFrame(np.random.randn(8,2),index=index,columns=['A','B'])
df
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

A B
first second
bar one -0.752364 0.982241
two -0.626690 1.358099
baz one 0.342360 -0.618870
two -0.399448 -0.500175
foo one -0.746328 -0.244752
two 1.556458 -1.340098
qux one -0.569393 -0.340625
two 0.513714 0.151477
#stack将列标签“压缩”(旋转)为行索引的最内层
stacked = df.stack()
stacked
first  second   
bar    one     A   -0.752364
               B    0.982241
       two     A   -0.626690
               B    1.358099
baz    one     A    0.342360
               B   -0.618870
       two     A   -0.399448
               B   -0.500175
foo    one     A   -0.746328
               B   -0.244752
       two     A    1.556458
               B   -1.340098
qux    one     A   -0.569393
               B   -0.340625
       two     A    0.513714
               B    0.151477
dtype: float64

stacked.unstack()
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

A B
first second
bar one -0.752364 0.982241
two -0.626690 1.358099
baz one 0.342360 -0.618870
two -0.399448 -0.500175
foo one -0.746328 -0.244752
two 1.556458 -1.340098
qux one -0.569393 -0.340625
two 0.513714 0.151477
stacked.unstack(1)
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

second one two
first
bar A -0.752364 -0.626690
B 0.982241 1.358099
baz A 0.342360 -0.399448
B -0.618870 -0.500175
foo A -0.746328 1.556458
B -0.244752 -1.340098
qux A -0.569393 0.513714
B -0.340625 0.151477
stacked.unstack(0)
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

first bar baz foo qux
second
one A -0.752364 0.342360 -0.746328 -0.569393
B 0.982241 -0.618870 -0.244752 -0.340625
two A -0.626690 -0.399448 1.556458 0.513714
B 1.358099 -0.500175 -1.340098 0.151477
df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
                   'B' : ['A', 'B', 'C'] * 4,
                   'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                   'D' : np.random.randn(12),
                   'E' : np.random.randn(12)})
df
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

A B C D E
0 one A foo 0.346892 -0.615216
1 one B foo 0.808681 0.161598
2 two C foo -1.324783 -0.088082
3 three A bar -0.227795 -0.180022
4 one B bar 1.130028 -0.361439
5 one C bar 0.510629 -1.466063
6 two A foo 0.379503 0.008279
7 three B foo 0.921087 -0.148614
8 one C foo 0.332222 -0.127428
9 one A bar -0.784876 -0.736117
10 two B bar -0.793957 1.705022
11 three C bar -0.898485 1.038166
pd.pivot_table(df,values='D',index=['A','B'],columns=['C'])
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

C bar foo
A B
one A -0.784876 0.346892
B 1.130028 0.808681
C 0.510629 0.332222
three A -0.227795 NaN
B NaN 0.921087
C -0.898485 NaN
two A NaN 0.379503
B -0.793957 NaN
C NaN -1.324783
rng = pd.date_range('1/1/2012', periods=100, freq='S')
rng
DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 00:00:01',
               '2012-01-01 00:00:02', '2012-01-01 00:00:03',
               '2012-01-01 00:00:04', '2012-01-01 00:00:05',
               '2012-01-01 00:00:06', '2012-01-01 00:00:07',
               '2012-01-01 00:00:08', '2012-01-01 00:00:09',
               '2012-01-01 00:00:10', '2012-01-01 00:00:11',
               '2012-01-01 00:00:12', '2012-01-01 00:00:13',
               '2012-01-01 00:00:14', '2012-01-01 00:00:15',
               '2012-01-01 00:00:16', '2012-01-01 00:00:17',
               '2012-01-01 00:00:18', '2012-01-01 00:00:19',
               '2012-01-01 00:00:20', '2012-01-01 00:00:21',
               '2012-01-01 00:00:22', '2012-01-01 00:00:23',
               '2012-01-01 00:00:24', '2012-01-01 00:00:25',
               '2012-01-01 00:00:26', '2012-01-01 00:00:27',
               '2012-01-01 00:00:28', '2012-01-01 00:00:29',
               '2012-01-01 00:00:30', '2012-01-01 00:00:31',
               '2012-01-01 00:00:32', '2012-01-01 00:00:33',
               '2012-01-01 00:00:34', '2012-01-01 00:00:35',
               '2012-01-01 00:00:36', '2012-01-01 00:00:37',
               '2012-01-01 00:00:38', '2012-01-01 00:00:39',
               '2012-01-01 00:00:40', '2012-01-01 00:00:41',
               '2012-01-01 00:00:42', '2012-01-01 00:00:43',
               '2012-01-01 00:00:44', '2012-01-01 00:00:45',
               '2012-01-01 00:00:46', '2012-01-01 00:00:47',
               '2012-01-01 00:00:48', '2012-01-01 00:00:49',
               '2012-01-01 00:00:50', '2012-01-01 00:00:51',
               '2012-01-01 00:00:52', '2012-01-01 00:00:53',
               '2012-01-01 00:00:54', '2012-01-01 00:00:55',
               '2012-01-01 00:00:56', '2012-01-01 00:00:57',
               '2012-01-01 00:00:58', '2012-01-01 00:00:59',
               '2012-01-01 00:01:00', '2012-01-01 00:01:01',
               '2012-01-01 00:01:02', '2012-01-01 00:01:03',
               '2012-01-01 00:01:04', '2012-01-01 00:01:05',
               '2012-01-01 00:01:06', '2012-01-01 00:01:07',
               '2012-01-01 00:01:08', '2012-01-01 00:01:09',
               '2012-01-01 00:01:10', '2012-01-01 00:01:11',
               '2012-01-01 00:01:12', '2012-01-01 00:01:13',
               '2012-01-01 00:01:14', '2012-01-01 00:01:15',
               '2012-01-01 00:01:16', '2012-01-01 00:01:17',
               '2012-01-01 00:01:18', '2012-01-01 00:01:19',
               '2012-01-01 00:01:20', '2012-01-01 00:01:21',
               '2012-01-01 00:01:22', '2012-01-01 00:01:23',
               '2012-01-01 00:01:24', '2012-01-01 00:01:25',
               '2012-01-01 00:01:26', '2012-01-01 00:01:27',
               '2012-01-01 00:01:28', '2012-01-01 00:01:29',
               '2012-01-01 00:01:30', '2012-01-01 00:01:31',
               '2012-01-01 00:01:32', '2012-01-01 00:01:33',
               '2012-01-01 00:01:34', '2012-01-01 00:01:35',
               '2012-01-01 00:01:36', '2012-01-01 00:01:37',
               '2012-01-01 00:01:38', '2012-01-01 00:01:39'],
              dtype='datetime64[ns]', freq='S')

ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts
2012-01-01 00:00:00    225
2012-01-01 00:00:01    354
2012-01-01 00:00:02    438
2012-01-01 00:00:03    440
2012-01-01 00:00:04      9
2012-01-01 00:00:05    179
2012-01-01 00:00:06    396
2012-01-01 00:00:07    200
2012-01-01 00:00:08    413
2012-01-01 00:00:09    490
2012-01-01 00:00:10     37
2012-01-01 00:00:11     57
2012-01-01 00:00:12     33
2012-01-01 00:00:13    388
2012-01-01 00:00:14     44
2012-01-01 00:00:15     95
2012-01-01 00:00:16      8
2012-01-01 00:00:17      1
2012-01-01 00:00:18    307
2012-01-01 00:00:19    332
2012-01-01 00:00:20     20
2012-01-01 00:00:21     84
2012-01-01 00:00:22    309
2012-01-01 00:00:23    308
2012-01-01 00:00:24     67
2012-01-01 00:00:25    245
2012-01-01 00:00:26    180
2012-01-01 00:00:27      9
2012-01-01 00:00:28    126
2012-01-01 00:00:29    232
                      ... 
2012-01-01 00:01:10    409
2012-01-01 00:01:11    355
2012-01-01 00:01:12     70
2012-01-01 00:01:13    266
2012-01-01 00:01:14    118
2012-01-01 00:01:15    325
2012-01-01 00:01:16    214
2012-01-01 00:01:17      3
2012-01-01 00:01:18    143
2012-01-01 00:01:19     28
2012-01-01 00:01:20     56
2012-01-01 00:01:21    120
2012-01-01 00:01:22     99
2012-01-01 00:01:23    102
2012-01-01 00:01:24     71
2012-01-01 00:01:25    464
2012-01-01 00:01:26    489
2012-01-01 00:01:27    404
2012-01-01 00:01:28    356
2012-01-01 00:01:29    197
2012-01-01 00:01:30    390
2012-01-01 00:01:31    345
2012-01-01 00:01:32    115
2012-01-01 00:01:33    377
2012-01-01 00:01:34    388
2012-01-01 00:01:35     39
2012-01-01 00:01:36    406
2012-01-01 00:01:37    408
2012-01-01 00:01:38    410
2012-01-01 00:01:39    256
Freq: S, Length: 100, dtype: int32

rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')
rng
DatetimeIndex(['2012-03-06', '2012-03-07', '2012-03-08', '2012-03-09',
               '2012-03-10'],
              dtype='datetime64[ns]', freq='D')

ts_utc = ts.tz_localize('UTC')

ts_utc
2012-01-01 00:00:00+00:00    225
2012-01-01 00:00:01+00:00    354
2012-01-01 00:00:02+00:00    438
2012-01-01 00:00:03+00:00    440
2012-01-01 00:00:04+00:00      9
2012-01-01 00:00:05+00:00    179
2012-01-01 00:00:06+00:00    396
2012-01-01 00:00:07+00:00    200
2012-01-01 00:00:08+00:00    413
2012-01-01 00:00:09+00:00    490
2012-01-01 00:00:10+00:00     37
2012-01-01 00:00:11+00:00     57
2012-01-01 00:00:12+00:00     33
2012-01-01 00:00:13+00:00    388
2012-01-01 00:00:14+00:00     44
2012-01-01 00:00:15+00:00     95
2012-01-01 00:00:16+00:00      8
2012-01-01 00:00:17+00:00      1
2012-01-01 00:00:18+00:00    307
2012-01-01 00:00:19+00:00    332
2012-01-01 00:00:20+00:00     20
2012-01-01 00:00:21+00:00     84
2012-01-01 00:00:22+00:00    309
2012-01-01 00:00:23+00:00    308
2012-01-01 00:00:24+00:00     67
2012-01-01 00:00:25+00:00    245
2012-01-01 00:00:26+00:00    180
2012-01-01 00:00:27+00:00      9
2012-01-01 00:00:28+00:00    126
2012-01-01 00:00:29+00:00    232
                            ... 
2012-01-01 00:01:10+00:00    409
2012-01-01 00:01:11+00:00    355
2012-01-01 00:01:12+00:00     70
2012-01-01 00:01:13+00:00    266
2012-01-01 00:01:14+00:00    118
2012-01-01 00:01:15+00:00    325
2012-01-01 00:01:16+00:00    214
2012-01-01 00:01:17+00:00      3
2012-01-01 00:01:18+00:00    143
2012-01-01 00:01:19+00:00     28
2012-01-01 00:01:20+00:00     56
2012-01-01 00:01:21+00:00    120
2012-01-01 00:01:22+00:00     99
2012-01-01 00:01:23+00:00    102
2012-01-01 00:01:24+00:00     71
2012-01-01 00:01:25+00:00    464
2012-01-01 00:01:26+00:00    489
2012-01-01 00:01:27+00:00    404
2012-01-01 00:01:28+00:00    356
2012-01-01 00:01:29+00:00    197
2012-01-01 00:01:30+00:00    390
2012-01-01 00:01:31+00:00    345
2012-01-01 00:01:32+00:00    115
2012-01-01 00:01:33+00:00    377
2012-01-01 00:01:34+00:00    388
2012-01-01 00:01:35+00:00     39
2012-01-01 00:01:36+00:00    406
2012-01-01 00:01:37+00:00    408
2012-01-01 00:01:38+00:00    410
2012-01-01 00:01:39+00:00    256
Freq: S, Length: 100, dtype: int32

# Express the same instants in US Eastern time (midnight UTC becomes 19:00 on
# the previous day, offset -05:00). The result is only displayed, not stored.
ts_utc.tz_convert('US/Eastern')
2011-12-31 19:00:00-05:00    225
2011-12-31 19:00:01-05:00    354
2011-12-31 19:00:02-05:00    438
2011-12-31 19:00:03-05:00    440
2011-12-31 19:00:04-05:00      9
2011-12-31 19:00:05-05:00    179
2011-12-31 19:00:06-05:00    396
2011-12-31 19:00:07-05:00    200
2011-12-31 19:00:08-05:00    413
2011-12-31 19:00:09-05:00    490
2011-12-31 19:00:10-05:00     37
2011-12-31 19:00:11-05:00     57
2011-12-31 19:00:12-05:00     33
2011-12-31 19:00:13-05:00    388
2011-12-31 19:00:14-05:00     44
2011-12-31 19:00:15-05:00     95
2011-12-31 19:00:16-05:00      8
2011-12-31 19:00:17-05:00      1
2011-12-31 19:00:18-05:00    307
2011-12-31 19:00:19-05:00    332
2011-12-31 19:00:20-05:00     20
2011-12-31 19:00:21-05:00     84
2011-12-31 19:00:22-05:00    309
2011-12-31 19:00:23-05:00    308
2011-12-31 19:00:24-05:00     67
2011-12-31 19:00:25-05:00    245
2011-12-31 19:00:26-05:00    180
2011-12-31 19:00:27-05:00      9
2011-12-31 19:00:28-05:00    126
2011-12-31 19:00:29-05:00    232
                            ... 
2011-12-31 19:01:10-05:00    409
2011-12-31 19:01:11-05:00    355
2011-12-31 19:01:12-05:00     70
2011-12-31 19:01:13-05:00    266
2011-12-31 19:01:14-05:00    118
2011-12-31 19:01:15-05:00    325
2011-12-31 19:01:16-05:00    214
2011-12-31 19:01:17-05:00      3
2011-12-31 19:01:18-05:00    143
2011-12-31 19:01:19-05:00     28
2011-12-31 19:01:20-05:00     56
2011-12-31 19:01:21-05:00    120
2011-12-31 19:01:22-05:00     99
2011-12-31 19:01:23-05:00    102
2011-12-31 19:01:24-05:00     71
2011-12-31 19:01:25-05:00    464
2011-12-31 19:01:26-05:00    489
2011-12-31 19:01:27-05:00    404
2011-12-31 19:01:28-05:00    356
2011-12-31 19:01:29-05:00    197
2011-12-31 19:01:30-05:00    390
2011-12-31 19:01:31-05:00    345
2011-12-31 19:01:32-05:00    115
2011-12-31 19:01:33-05:00    377
2011-12-31 19:01:34-05:00    388
2011-12-31 19:01:35-05:00     39
2011-12-31 19:01:36-05:00    406
2011-12-31 19:01:37-05:00    408
2011-12-31 19:01:38-05:00    410
2011-12-31 19:01:39-05:00    256
Freq: S, Length: 100, dtype: int32

# Quarterly periods from 1990Q1 through 2000Q4, with the fiscal year ending in
# November ('Q-NOV').
prng = pd.period_range(start='1990Q1', end='2000Q4', freq='Q-NOV')
prng
PeriodIndex(['1990Q1', '1990Q2', '1990Q3', '1990Q4', '1991Q1', '1991Q2',
             '1991Q3', '1991Q4', '1992Q1', '1992Q2', '1992Q3', '1992Q4',
             '1993Q1', '1993Q2', '1993Q3', '1993Q4', '1994Q1', '1994Q2',
             '1994Q3', '1994Q4', '1995Q1', '1995Q2', '1995Q3', '1995Q4',
             '1996Q1', '1996Q2', '1996Q3', '1996Q4', '1997Q1', '1997Q2',
             '1997Q3', '1997Q4', '1998Q1', '1998Q2', '1998Q3', '1998Q4',
             '1999Q1', '1999Q2', '1999Q3', '1999Q4', '2000Q1', '2000Q2',
             '2000Q3', '2000Q4'],
            dtype='period[Q-NOV]', freq='Q-NOV')

# One standard-normal sample per quarterly period, indexed by those periods.
ts = pd.Series(data=np.random.randn(len(prng)), index=prng)
ts
1990Q1    1.796304
1990Q2    0.659808
1990Q3   -0.647755
1990Q4    1.846486
1991Q1    0.488348
1991Q2    1.830351
1991Q3   -1.658804
1991Q4    0.585780
1992Q1   -0.596026
1992Q2   -1.900346
1992Q3   -0.066638
1992Q4    0.419037
1993Q1    0.055711
1993Q2   -2.103900
1993Q3    0.229944
1993Q4    0.317348
1994Q1   -0.776638
1994Q2   -0.241438
1994Q3   -0.587104
1994Q4    0.825772
1995Q1    2.444721
1995Q2    0.803142
1995Q3    0.494378
1995Q4   -0.984900
1996Q1   -0.431641
1996Q2    0.766768
1996Q3   -1.176313
1996Q4    0.339700
1997Q1   -1.523029
1997Q2    0.512173
1997Q3    1.359914
1997Q4    0.564407
1998Q1    0.354859
1998Q2   -0.493561
1998Q3    0.514986
1998Q4   -0.156142
1999Q1    1.047135
1999Q2    0.648944
1999Q3   -1.581937
1999Q4    0.261181
2000Q1   -0.809498
2000Q2    1.102175
2000Q3    0.424905
2000Q4   -0.775245
Freq: Q-NOV, dtype: float64

# Re-index from quarterly periods to hourly periods:
#   1. asfreq('M', 'e')  -> the last month of each quarter ('e' = end),
#   2. + 1               -> the month after the quarter ends,
#   3. asfreq('H', 's')  -> the first hour of that month ('s' = start),
#   4. + 9               -> 09:00 on the first day of that month.
ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9  # 'e' = end, 's' = start
ts.head()
1990-03-01 09:00    1.796304
1990-06-01 09:00    0.659808
1990-09-01 09:00   -0.647755
1990-12-01 09:00    1.846486
1991-03-01 09:00    0.488348
Freq: H, dtype: float64

# Six records: ids 1..6 paired with single-letter raw grades.
df = pd.DataFrame({"id": list(range(1, 7)), "raw_grade": list("abbaae")})
df
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

id raw_grade
0 1 a
1 2 b
2 3 b
3 4 a
4 5 a
5 6 e
# Store the raw string grades as a categorical dtype in a new "grade" column;
# the categories are the distinct raw values.
df["grade"] = df["raw_grade"].astype("category")
df["grade"]
0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): [a, b, e]

# Rename the three existing categories positionally:
# a -> "very good", b -> "good", e -> "very bad".
# Assigning to `.cat.categories` directly is no longer supported by pandas 2.x,
# so use rename_categories(), which returns a new Series, and assign it back.
df["grade"] = df["grade"].cat.rename_categories(["very good", "good", "very bad"])
df["grade"]
0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (3, object): [very good, good, very bad]

# Replace the category set with the full five-level scale, listed from worst to
# best. Levels that do not occur in the data ("bad", "medium") are still kept
# as categories.
df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])
df["grade"]  # methods under Series.cat return a new Series by default, hence the assignment above
0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (5, object): [very bad, bad, medium, good, very good]

# Sorting on a categorical column follows the order of its categories
# ("very bad" first, "very good" last), not alphabetical order.
df.sort_values(by="grade")
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

id raw_grade grade
5 6 e very bad
1 2 b good
2 3 b good
0 1 a very good
3 4 a very good
4 5 a very good
# Count rows per grade. observed=False is passed explicitly so that categories
# with no rows ("bad", "medium") are still reported with a count of 0; relying
# on the implicit default triggers a FutureWarning on recent pandas and would
# drop those rows once the default changes.
df.groupby("grade", observed=False).size()
grade
very bad     1
bad          0
medium       0
good         2
very good    3
dtype: int64

# 1000 standard-normal samples on consecutive days starting 1 January 2000.
ts = pd.Series(
    data=np.random.randn(1000),
    index=pd.date_range(start='1/1/2000', periods=1000),
)
ts
2000-01-01    0.585074
2000-01-02    0.605786
2000-01-03    0.697632
2000-01-04   -0.783338
2000-01-05    1.150780
2000-01-06   -0.718491
2000-01-07    0.696745
2000-01-08    0.270574
2000-01-09    0.657496
2000-01-10   -2.613661
2000-01-11   -1.978929
2000-01-12    0.325563
2000-01-13    0.286470
2000-01-14   -0.315502
2000-01-15    0.487703
2000-01-16   -1.830420
2000-01-17    0.847074
2000-01-18   -2.363392
2000-01-19    0.139429
2000-01-20   -0.512045
2000-01-21    0.209301
2000-01-22   -0.202987
2000-01-23   -0.605512
2000-01-24    0.113967
2000-01-25   -0.546799
2000-01-26    1.758145
2000-01-27    0.299658
2000-01-28   -0.614838
2000-01-29    0.262877
2000-01-30    0.021676
                ...   
2002-08-28    0.222110
2002-08-29   -1.846013
2002-08-30   -0.094660
2002-08-31    1.281895
2002-09-01   -1.072053
2002-09-02    0.503427
2002-09-03   -0.499512
2002-09-04   -1.080912
2002-09-05   -0.780288
2002-09-06   -0.537608
2002-09-07   -0.991904
2002-09-08    0.159327
2002-09-09    0.224638
2002-09-10    2.063388
2002-09-11    1.217366
2002-09-12    0.603689
2002-09-13    0.832689
2002-09-14   -1.788089
2002-09-15   -2.183370
2002-09-16   -0.759798
2002-09-17   -0.836241
2002-09-18    0.298536
2002-09-19    1.969939
2002-09-20   -0.688728
2002-09-21   -0.964116
2002-09-22   -1.279596
2002-09-23    0.357739
2002-09-24    1.253534
2002-09-25   -0.798673
2002-09-26   -1.023241
Freq: D, Length: 1000, dtype: float64

# Running total of the series. The result is only displayed here; `ts` itself
# is not modified because the return value is not assigned.
ts.cumsum()
2000-01-01     0.585074
2000-01-02     1.190860
2000-01-03     1.888493
2000-01-04     1.105155
2000-01-05     2.255935
2000-01-06     1.537445
2000-01-07     2.234190
2000-01-08     2.504764
2000-01-09     3.162260
2000-01-10     0.548599
2000-01-11    -1.430329
2000-01-12    -1.104767
2000-01-13    -0.818296
2000-01-14    -1.133798
2000-01-15    -0.646095
2000-01-16    -2.476516
2000-01-17    -1.629442
2000-01-18    -3.992834
2000-01-19    -3.853405
2000-01-20    -4.365450
2000-01-21    -4.156149
2000-01-22    -4.359136
2000-01-23    -4.964649
2000-01-24    -4.850682
2000-01-25    -5.397481
2000-01-26    -3.639336
2000-01-27    -3.339678
2000-01-28    -3.954516
2000-01-29    -3.691639
2000-01-30    -3.669963
                ...    
2002-08-28   -12.290664
2002-08-29   -14.136676
2002-08-30   -14.231337
2002-08-31   -12.949442
2002-09-01   -14.021495
2002-09-02   -13.518068
2002-09-03   -14.017579
2002-09-04   -15.098492
2002-09-05   -15.878779
2002-09-06   -16.416387
2002-09-07   -17.408292
2002-09-08   -17.248965
2002-09-09   -17.024327
2002-09-10   -14.960940
2002-09-11   -13.743574
2002-09-12   -13.139885
2002-09-13   -12.307196
2002-09-14   -14.095285
2002-09-15   -16.278655
2002-09-16   -17.038453
2002-09-17   -17.874694
2002-09-18   -17.576157
2002-09-19   -15.606219
2002-09-20   -16.294946
2002-09-21   -17.259062
2002-09-22   -18.538658
2002-09-23   -18.180919
2002-09-24   -16.927385
2002-09-25   -17.726058
2002-09-26   -18.749299
Freq: D, Length: 1000, dtype: float64

# Plot the Series against its date index (a Series holds a single column of
# values, so there is exactly one line).
# NOTE(review): the `ts.cumsum()` call above is not assigned back to `ts`, so
# this draws the raw random values rather than the running total.
ts.plot()


10分钟学习pandas_第1张图片

# 1000 x 4 frame of standard-normal samples that shares the daily index of `ts`.
# (The pasted IPython continuation prompt has been removed so the statement is
# valid Python again.)
df = pd.DataFrame(
    np.random.randn(1000, 4),
    index=ts.index,
    columns=['A', 'B', 'C', 'D'],
)
df
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

A B C D
2000-01-01 -0.434770 1.797170 0.354820 -0.166193
2000-01-02 0.420233 0.823251 0.468701 -0.582949
2000-01-03 0.674668 1.032230 1.134346 0.696656
2000-01-04 0.948684 0.188295 -0.604249 -0.062779
2000-01-05 -1.381247 -0.360335 -0.346491 1.072618
2000-01-06 0.492510 1.924341 0.522605 0.293788
2000-01-07 -1.129093 0.063874 0.099183 0.557496
2000-01-08 1.142263 -0.309192 1.140049 1.007656
2000-01-09 1.751273 -0.747153 0.795127 -0.480155
2000-01-10 -1.519661 -1.187734 0.417908 -0.675147
2000-01-11 -0.096192 1.095308 0.094648 1.485943
2000-01-12 0.109115 -0.213535 -0.927250 1.189941
2000-01-13 -0.787367 -0.919787 1.286709 0.894471
2000-01-14 -0.584850 0.794088 0.533716 -0.159539
2000-01-15 -1.352332 -0.880446 0.041934 0.002573
2000-01-16 0.317933 0.957925 0.813780 0.952499
2000-01-17 0.950317 0.162642 -0.018575 -0.940598
2000-01-18 -2.021125 1.592108 0.219355 -1.300103
2000-01-19 -0.673145 -1.852674 -0.492845 0.070786
2000-01-20 -0.562802 -0.504083 0.980132 -0.079636
2000-01-21 0.693927 0.276601 -0.502267 1.824789
2000-01-22 0.240543 -0.049004 0.051460 -1.093965
2000-01-23 0.159181 0.559377 0.353952 -1.750909
2000-01-24 -1.009695 -0.169914 2.214441 -1.301680
2000-01-25 0.741394 -0.206067 -1.250305 -2.021061
2000-01-26 -1.050527 -0.448726 0.744841 0.559876
2000-01-27 -0.268987 0.755171 -0.865320 -0.077159
2000-01-28 -1.445525 -0.443887 0.048399 0.295317
2000-01-29 -0.348641 -0.570866 0.446533 -0.745215
2000-01-30 -0.803883 0.719817 0.035095 -0.057671
... ... ... ... ...
2002-08-28 -1.400857 -1.993967 -0.563839 -0.553431
2002-08-29 -0.860120 -0.252746 -0.585336 0.083630
2002-08-30 0.677218 0.113083 -0.507485 -1.247440
2002-08-31 1.901913 0.124469 -0.482948 0.093981
2002-09-01 1.728861 1.909778 -1.206848 -1.324399
2002-09-02 1.419153 -1.000495 -0.117854 -0.630926
2002-09-03 0.716920 -0.831795 2.443522 -0.247801
2002-09-04 -0.886588 -0.487240 0.476527 1.273604
2002-09-05 -2.361533 -0.074533 -1.095040 0.087406
2002-09-06 -1.225924 -0.444836 0.378192 -0.785585
2002-09-07 -1.064395 0.046003 0.148525 0.393557
2002-09-08 -0.294659 0.912430 -0.795767 0.064672
2002-09-09 0.276846 0.993007 -0.493192 0.673319
2002-09-10 1.676072 0.102106 -1.286082 -1.454404
2002-09-11 2.124521 0.069451 0.495054 0.148496
2002-09-12 0.821348 -0.880714 0.933978 1.869043
2002-09-13 -0.890738 -1.263920 0.128660 -0.282550
2002-09-14 -1.097484 0.652124 0.702043 -0.552927
2002-09-15 0.161343 0.157393 0.851718 -1.265120
2002-09-16 0.865516 -1.196734 -0.985248 -1.472387
2002-09-17 -0.539248 1.388908 -0.870515 -0.671165
2002-09-18 1.154511 0.879535 -0.249820 -0.393302
2002-09-19 1.237163 0.668046 0.917817 0.300664
2002-09-20 -0.187801 0.173142 -0.225307 2.142230
2002-09-21 0.517452 -0.547158 1.587477 -0.922776
2002-09-22 0.424784 0.696831 1.340258 1.252117
2002-09-23 -0.687751 -0.006990 -0.607220 0.709964
2002-09-24 -1.811347 0.200485 2.117700 -0.468944
2002-09-25 -0.431668 -0.385997 0.303936 0.817534
2002-09-26 0.678959 1.061957 1.252870 0.735550

1000 rows × 4 columns

# Replace each column with its running total, then draw all four columns on one
# chart with the legend placed wherever matplotlib finds the most room.
df = df.cumsum()
plt.figure()
df.plot()
plt.legend(loc='best')


10分钟学习pandas_第2张图片

你可能感兴趣的:(Data,Mining,python)