pandas version: '0.19.2'
1.创建数据集
In [58]: names=['Bob','Jessica','Mary','John','Mel']
In [59]: births = [968, 155, 77, 578, 973]
In [60]: BabyDataSet = list(zip(names,births))
...: BabyDataSet
Out[60]: [('Bob', 968), ('Jessica', 155), ('Mary', 77), ('John', 578), ('Mel', 973)]
In [61]: df = pd.DataFrame(data = BabyDataSet, columns=['Names', 'Births'])
...: df
Out[61]:
Names Births
0 Bob 968
1 Jessica 155
2 Mary 77
3 John 578
4 Mel 973
2.查看数据类型
In [64]: df.dtypes
Out[64]:
Names object
Births int64
dtype: object
In [65]: df.Names.dtypes
Out[65]: dtype('O')
In [66]: df.Births.dtypes
Out[66]: dtype('int64')
3.按照指定的一列排序
In [68]: df.sort_values(['Births'])
Out[68]:
Names Births
2 Mary 77
1 Jessica 155
3 John 578
0 Bob 968
4 Mel 973
In [69]: df.sort_values(['Births'],ascending=False)
Out[69]:
Names Births
4 Mel 973
0 Bob 968
3 John 578
1 Jessica 155
2 Mary 77
4.选取前几行或者末尾几行
In [72]: df.head(3)
Out[72]:
Names Births
0 Bob 968
1 Jessica 155
2 Mary 77
In [73]: df.tail(2)
Out[73]:
Names Births
3 John 578
4 Mel 973
In [74]: df
Out[74]:
Names Births
0 Bob 968
1 Jessica 155
2 Mary 77
3 John 578
4 Mel 973
5.选取一列的最大值
In [75]: df['Births']
Out[75]:
0 968
1 155
2 77
3 578
4 973
Name: Births, dtype: int64
In [76]: df['Births'].max()
Out[76]: 973
In [77]: df['Names'][df['Births'] == df['Births'].max()]
Out[77]:
4 Mel
Name: Names, dtype: object
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
s = pd.Series([1, 3, 5, np.nan, 6, 8])
print(s)
0 1.0
1 3.0
2 5.0
3 NaN
4 6.0
5 8.0
dtype: float64
dates = pd.date_range('20130101', periods=6)
print(dates)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(df)
df2 = pd.DataFrame({'A': 1.,
'B': pd.Timestamp('20130102'),
'C': pd.Series(1, index=list(range(4)), dtype='float32'),
'D': np.array([3] * 4, dtype='int32'),
'E': pd.Categorical(["test", "train", "test", "train"]),
'F': 'foo'})
print(df2)
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')
A B C D
2013-01-01 -1.324399 -0.917851 -0.710650 0.977088
2013-01-02 0.034877 -0.417994 1.412711 -0.626197
2013-01-03 -0.559784 -0.085540 1.067182 0.649621
2013-01-04 0.849592 -1.251283 1.956991 1.189781
2013-01-05 -1.742392 0.193744 -0.570087 -0.277156
2013-01-06 -0.129934 -0.890113 -1.324529 -1.298726
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo
print "--------*---------------"
print(df2.head(1))
print(df2.tail(1))
print(df2.values)
print "--------*---------------"
print df
print df.sort_index(axis=1, ascending=False)
print df.sort_values(by='B')
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
A B C D E F
3 1.0 2013-01-02 1.0 3 train foo
[[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']
[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']]
--------*---------------
A B C D
2013-01-01 -1.324399 -0.917851 -0.710650 0.977088
2013-01-02 0.034877 -0.417994 1.412711 -0.626197
2013-01-03 -0.559784 -0.085540 1.067182 0.649621
2013-01-04 0.849592 -1.251283 1.956991 1.189781
2013-01-05 -1.742392 0.193744 -0.570087 -0.277156
2013-01-06 -0.129934 -0.890113 -1.324529 -1.298726
D C B A
2013-01-01 0.977088 -0.710650 -0.917851 -1.324399
2013-01-02 -0.626197 1.412711 -0.417994 0.034877
2013-01-03 0.649621 1.067182 -0.085540 -0.559784
2013-01-04 1.189781 1.956991 -1.251283 0.849592
2013-01-05 -0.277156 -0.570087 0.193744 -1.742392
2013-01-06 -1.298726 -1.324529 -0.890113 -0.129934
A B C D
2013-01-04 0.849592 -1.251283 1.956991 1.189781
2013-01-01 -1.324399 -0.917851 -0.710650 0.977088
2013-01-06 -0.129934 -0.890113 -1.324529 -1.298726
2013-01-02 0.034877 -0.417994 1.412711 -0.626197
2013-01-03 -0.559784 -0.085540 1.067182 0.649621
2013-01-05 -1.742392 0.193744 -0.570087 -0.277156
print df.A
print df['A']
print df[0:3]
2013-01-01 -1.324399
2013-01-02 0.034877
2013-01-03 -0.559784
2013-01-04 0.849592
2013-01-05 -1.742392
2013-01-06 -0.129934
Freq: D, Name: A, dtype: float64
2013-01-01 -1.324399
2013-01-02 0.034877
2013-01-03 -0.559784
2013-01-04 0.849592
2013-01-05 -1.742392
2013-01-06 -0.129934
Freq: D, Name: A, dtype: float64
A B C D
2013-01-01 -1.324399 -0.917851 -0.710650 0.977088
2013-01-02 0.034877 -0.417994 1.412711 -0.626197
2013-01-03 -0.559784 -0.085540 1.067182 0.649621
print df.loc[dates[0]]
A -1.324399
B -0.917851
C -0.710650
D 0.977088
Name: 2013-01-01 00:00:00, dtype: float64
df.loc[:,['A','B']]
                   A         B
2013-01-01 -1.324399 -0.917851
2013-01-02  0.034877 -0.417994
2013-01-03 -0.559784 -0.085540
2013-01-04  0.849592 -1.251283
2013-01-05 -1.742392  0.193744
2013-01-06 -0.129934 -0.890113
df.loc['20130102':'20130104',['A','B']]
df.loc[dates[0],'A']
-1.3243992801327025
df2 = df.copy()
df2['E'] = ['one', 'one','two','three','four','three']
df2
                   A         B         C         D      E
2013-01-01 -1.324399 -0.917851 -0.710650  0.977088    one
2013-01-02  0.034877 -0.417994  1.412711 -0.626197    one
2013-01-03 -0.559784 -0.085540  1.067182  0.649621    two
2013-01-04  0.849592 -1.251283  1.956991  1.189781  three
2013-01-05 -1.742392  0.193744 -0.570087 -0.277156   four
2013-01-06 -0.129934 -0.890113 -1.324529 -1.298726  three
df2[df2['E'].isin(['two','four'])]
                   A         B         C         D     E
2013-01-03 -0.559784 -0.085540  1.067182  0.649621   two
2013-01-05 -1.742392  0.193744 -0.570087 -0.277156  four
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))
s1
2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64
df['F'] = s1
df
                   A         B         C         D    F
2013-01-01 -1.324399 -0.917851 -0.710650  0.977088  NaN
2013-01-02  0.034877 -0.417994  1.412711 -0.626197  1.0
2013-01-03 -0.559784 -0.085540  1.067182  0.649621  2.0
2013-01-04  0.849592 -1.251283  1.956991  1.189781  3.0
2013-01-05 -1.742392  0.193744 -0.570087 -0.277156  4.0
2013-01-06 -0.129934 -0.890113 -1.324529 -1.298726  5.0
df.mean()
A   -0.478673
B   -0.561506
C    0.305270
D    0.102402
F    3.000000
dtype: float64
df.apply(lambda x: x.max() - x.min())
A    2.591984
B    1.445027
C    3.281519
D    2.488508
F    4.000000
dtype: float64
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s.str.lower()
0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
left
right
pd.merge(left, right, on='key')
   key  lval  rval
0  foo     1     4
1  foo     1     5
2  foo     2     4
3  foo     2     5
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
left
right
pd.merge(left, right, on='key')
   key  lval  rval
0  foo     1     4
1  bar     2     5
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'foo', 'foo'],
'B' : ['one', 'one', 'two', 'three',
'two', 'two', 'one', 'three'],
'C' : np.random.randn(8),
'D' : np.random.randn(8)})
df
     A      B         C         D
0  foo    one  0.850151 -0.095071
1  bar    one  0.252074  0.504999
2  foo    two -0.139441 -1.190568
3  bar  three -0.971856  0.340176
4  foo    two  1.546175 -0.402114
5  bar    two  0.026199  1.313452
6  foo    one -0.267510 -0.981974
7  foo  three  1.018972  1.100904
df.groupby('A').sum()
            C         D
A
bar -0.693583  2.158627
foo  3.008348 -1.568822
df.groupby(['A','B']).sum()
                  C         D
A   B
bar one    0.252074  0.504999
    three -0.971856  0.340176
    two    0.026199  1.313452
foo one    0.582641 -1.077045
    three  1.018972  1.100904
    two    1.406734 -1.592681