import numpy as np
import pandas as pd
# Sample frame: 6 rows x 4 columns of random integers drawn from [1, 24),
# indexed by six consecutive calendar days starting 1998-01-02.
d = pd.DataFrame(
    data=np.random.randint(1, 24, size=(6, 4)),
    index=pd.date_range(start="19980102", periods=6),
    columns=list("abcd"),
)
print(d)
a b c d
1998-01-02 15 21 23 7
1998-01-03 11 6 5 2
1998-01-04 13 4 23 23
1998-01-05 21 20 22 20
1998-01-06 5 23 19 22
1998-01-07 4 9 11 4
# Each dict entry becomes one column. The scalar entries ('B' and 'F') are
# repeated for every row; the array-like entries each hold four values.
d2 = pd.DataFrame(
    {
        "A": np.random.rand(4),
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(np.arange(1, 5), index=list(range(4)), dtype="float32"),
        "D": np.full(4, 3, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)
d2
          A          B    C  D      E    F
0  0.075741 2013-01-02  1.0  3   test  foo
1  0.410090 2013-01-02  2.0  3  train  foo
2  0.620653 2013-01-02  3.0  3   test  foo
3  0.061835 2013-01-02  4.0  3  train  foo
查看数据
查看数据类型,dtypes输出每一列数据类型
print(d.dtypes)
print(d2.dtypes)
a int32
b int32
c int32
d int32
dtype: object
A float64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
查看 DataFrame 头部
print(d.head())
查看 DataFrame 尾部,可选择数量
print(d.tail(3))
a b c d
1998-01-02 15 21 23 7
1998-01-03 11 6 5 2
1998-01-04 13 4 23 23
1998-01-05 21 20 22 20
1998-01-06 5 23 19 22
a b c d
1998-01-05 21 20 22 20
1998-01-06 5 23 19 22
1998-01-07 4 9 11 4
行索引列标签
print(d.index)
print(d.columns)
DatetimeIndex(['1998-01-02', '1998-01-03', '1998-01-04', '1998-01-05',
'1998-01-06', '1998-01-07'],
dtype='datetime64[ns]', freq='D')
Index(['a', 'b', 'c', 'd'], dtype='object')
输出数组对象
print(d.to_numpy())
print(d2.to_numpy())
[[15 21 23 7]
[11 6 5 2]
[13 4 23 23]
[21 20 22 20]
[ 5 23 19 22]
[ 4 9 11 4]]
[[0.07574114092524209 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
[0.41008997302466055 Timestamp('2013-01-02 00:00:00') 2.0 3 'train'
'foo']
[0.6206525618816419 Timestamp('2013-01-02 00:00:00') 3.0 3 'test' 'foo']
[0.061835078456274895 Timestamp('2013-01-02 00:00:00') 4.0 3 'train'
'foo']]
describe() 快速查看数据的统计摘要:
数字数据
DataFrame.count
计算非NA /空观测值的数量。
DataFrame.max
对象中的最大值。
DataFrame.min
对象中的最小值。
DataFrame.mean
值的平均值。
DataFrame.std
观测值的标准差。
DataFrame.select_dtypes
根据列的dtype包含或排除某些列,得到DataFrame的子集。
对象的数据(例如字符串或时间戳)
结果的索引将包括count、unique、top和freq。
该top 是最常见的值。该freq是最常见的值的频率。时间戳记还包括first和last项目。
混合数据输出数字数据
d.describe()
               a          b          c          d
count   6.000000   6.000000   6.000000   6.000000
mean   11.500000  13.833333  17.166667  13.000000
std     6.379655   8.424172   7.494442   9.674709
min     4.000000   4.000000   5.000000   2.000000
25%     6.500000   6.750000  13.000000   4.750000
50%    12.000000  14.500000  20.500000  13.500000
75%    14.500000  20.750000  22.750000  21.500000
max    21.000000  23.000000  23.000000  23.000000
25%、50%、75%为分位值
以a列为例,6个数从小到大排序为:4, 5, 11, 13, 15, 21。
一共6个数,共5个数字间隔,每个四分位跨5/4=1.25个间隔。
计算25分位:
第1个四分位数为上面6个数中的第1+1.25=2.25个数
指的是第2个数+第2个和第3个数中间的0.25位置处,即:
5+0.25*(11-5)= 6.5
计算50分位:
第2个四分位数为上面6个数中的第1+1.25*2=3.5个数
指的是第3个数和第4个数的正中间,即:11+0.5*(13-11)=12
计算75分位:
第3个四分位数为上面6个数中的第1+1.25*3=4.75个数
指的是第4个数+第4个和第5个数中间的0.75位置处,即:
13+0.75*(15-13)=14.5
d2.describe()
              A         C    D
count  4.000000  4.000000  4.0
mean   0.292080  2.500000  3.0
std    0.271846  1.290994  0.0
min    0.061835  1.000000  3.0
25%    0.072265  1.750000  3.0
50%    0.242916  2.500000  3.0
75%    0.462731  3.250000  3.0
max    0.620653  4.000000  3.0
转置
d.T
|
1998-01-02 |
1998-01-03 |
1998-01-04 |
1998-01-05 |
1998-01-06 |
1998-01-07 |
a |
15 |
11 |
13 |
21 |
5 |
4 |
b |
21 |
6 |
4 |
20 |
23 |
9 |
c |
23 |
5 |
23 |
22 |
19 |
11 |
d |
7 |
2 |
23 |
20 |
22 |
4 |
排序
索引轴排序
d.sort_index(axis=0, ascending=False)
|
a |
b |
c |
d |
1998-01-07 |
4 |
9 |
11 |
4 |
1998-01-06 |
5 |
23 |
19 |
22 |
1998-01-05 |
21 |
20 |
22 |
20 |
1998-01-04 |
13 |
4 |
23 |
23 |
1998-01-03 |
11 |
6 |
5 |
2 |
1998-01-02 |
15 |
21 |
23 |
7 |
d.sort_index(axis=0, ascending=False)
|
a |
b |
c |
d |
1998-01-07 |
4 |
9 |
11 |
4 |
1998-01-06 |
5 |
23 |
19 |
22 |
1998-01-05 |
21 |
20 |
22 |
20 |
1998-01-04 |
13 |
4 |
23 |
23 |
1998-01-03 |
11 |
6 |
5 |
2 |
1998-01-02 |
15 |
21 |
23 |
7 |
值排序
d.sort_values(by=['a'],axis=0)
|
a |
b |
c |
d |
1998-01-07 |
4 |
9 |
11 |
4 |
1998-01-06 |
5 |
23 |
19 |
22 |
1998-01-03 |
11 |
6 |
5 |
2 |
1998-01-04 |
13 |
4 |
23 |
23 |
1998-01-02 |
15 |
21 |
23 |
7 |
1998-01-05 |
21 |
20 |
22 |
20 |
d.sort_values(by='1998-01-02',axis=1)
|
d |
a |
b |
c |
1998-01-02 |
7 |
15 |
21 |
23 |
1998-01-03 |
2 |
11 |
6 |
5 |
1998-01-04 |
23 |
13 |
4 |
23 |
1998-01-05 |
20 |
21 |
20 |
22 |
1998-01-06 |
22 |
5 |
23 |
19 |
1998-01-07 |
4 |
4 |
9 |
11 |
索引和切片
[ ] 切片
print(d.a)
print(d['a'])
1998-01-02 15
1998-01-03 11
1998-01-04 13
1998-01-05 21
1998-01-06 5
1998-01-07 4
Freq: D, Name: a, dtype: int32
1998-01-02 15
1998-01-03 11
1998-01-04 13
1998-01-05 21
1998-01-06 5
1998-01-07 4
Freq: D, Name: a, dtype: int32
用 [ ] 切片选择多行:
d[:3]
|
a |
b |
c |
d |
1998-01-02 |
15 |
21 |
23 |
7 |
1998-01-03 |
11 |
6 |
5 |
2 |
1998-01-04 |
13 |
4 |
23 |
23 |
d['1998-01-03':'1998-01-05']
|
a |
b |
c |
d |
1998-01-03 |
11 |
6 |
5 |
2 |
1998-01-04 |
13 |
4 |
23 |
23 |
1998-01-05 |
21 |
20 |
22 |
20 |
按标签选择切片loc
d.loc['1998-01-03':'1998-01-05']
|
a |
b |
c |
d |
1998-01-03 |
11 |
6 |
5 |
2 |
1998-01-04 |
13 |
4 |
23 |
23 |
1998-01-05 |
21 |
20 |
22 |
20 |
d.loc['1998-01-03':'1998-01-06', 'b':'d']
|
b |
c |
d |
1998-01-03 |
6 |
5 |
2 |
1998-01-04 |
4 |
23 |
23 |
1998-01-05 |
20 |
22 |
20 |
1998-01-06 |
23 |
19 |
22 |
使用布尔数组获取值:
print(d.loc['1998-01-04'] > 0)
print(d.loc[:, d.loc['1998-01-04'] > 0])
a True
b True
c True
d True
Name: 1998-01-04 00:00:00, dtype: bool
a b c d
1998-01-02 15 21 23 7
1998-01-03 11 6 5 2
1998-01-04 13 4 23 23
1998-01-05 21 20 22 20
1998-01-06 5 23 19 22
1998-01-07 4 9 11 4
iloc属性类似NumPy 用整数切片:
d.iloc[3]
a 21
b 20
c 22
d 20
Name: 1998-01-05 00:00:00, dtype: int32
d.iloc[3:5, 0:2]
|
a |
b |
1998-01-05 |
21 |
20 |
1998-01-06 |
5 |
23 |
d.iloc[1, 1]
6
用 isin() 插入、筛选:
d['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
d
|
a |
b |
c |
d |
E |
1998-01-02 |
15 |
21 |
23 |
7 |
one |
1998-01-03 |
11 |
6 |
5 |
2 |
one |
1998-01-04 |
13 |
4 |
23 |
23 |
two |
1998-01-05 |
21 |
20 |
22 |
20 |
three |
1998-01-06 |
5 |
23 |
19 |
22 |
four |
1998-01-07 |
4 |
9 |
11 |
4 |
three |
d[d['E'].isin(['two', 'four'])]
|
a |
b |
c |
d |
E |
1998-01-04 |
13 |
4 |
23 |
23 |
two |
1998-01-06 |
5 |
23 |
19 |
22 |
four |
d
|
a |
b |
c |
d |
E |
1998-01-02 |
15 |
21 |
23 |
7 |
one |
1998-01-03 |
11 |
6 |
5 |
2 |
one |
1998-01-04 |
13 |
4 |
23 |
23 |
two |
1998-01-05 |
21 |
20 |
22 |
20 |
three |
1998-01-06 |
5 |
23 |
19 |
22 |
four |
1998-01-07 |
4 |
9 |
11 |
4 |
three |
reindex更改、添加、删除指定轴的索引,并返回数据副本,即不更改原数据。
# Source frame: 6x4 standard-normal samples indexed by 2013-01-01 .. 2013-01-06.
df = pd.DataFrame(np.random.randn(6, 4),
                  index=pd.date_range('20130101', periods=6),
                  columns=list('ABCD'))
# `dates` was used below without ever being defined, which raised NameError.
# The output shown for this cell has 1998 row labels, so define it as the
# six-day range starting 1998-01-02.
dates = pd.date_range('19980102', periods=6)
# reindex returns a new frame and leaves df untouched. Labels that do not
# exist in df -- here all four 1998 dates and the new column 'E' -- are
# filled with NaN.
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
# .loc label slices include both endpoints, so the first two rows of 'E' are set.
df1.loc[dates[0]:dates[1], 'E'] = 1
df1
             A   B   C   D    E
1998-01-02 NaN NaN NaN NaN  1.0
1998-01-03 NaN NaN NaN NaN  1.0
1998-01-04 NaN NaN NaN NaN  NaN
1998-01-05 NaN NaN NaN NaN  NaN
删除所有含缺失值的行:
df1.dropna(how='any')
填充缺失值:
df1.fillna(value=5)
|
A |
B |
C |
D |
E |
1998-01-02 |
5.0 |
5.0 |
5.0 |
5.0 |
1.0 |
1998-01-03 |
5.0 |
5.0 |
5.0 |
5.0 |
1.0 |
1998-01-04 |
5.0 |
5.0 |
5.0 |
5.0 |
5.0 |
1998-01-05 |
5.0 |
5.0 |
5.0 |
5.0 |
5.0 |
3、运算
统计mean()
print(df)
print(df.mean())
A B C D
2013-01-01 0.725530 0.304938 0.988725 0.749843
2013-01-02 -0.697489 -0.916037 0.019967 0.256584
2013-01-03 -0.590450 -0.261403 -0.414659 -0.344422
2013-01-04 -1.720918 -0.553150 -1.047237 0.222394
2013-01-05 -3.493505 1.665040 0.356288 0.953887
2013-01-06 0.936351 -1.692798 1.251221 1.718479
A -0.806747
B -0.242235
C 0.192384
D 0.592794
dtype: float64
df.mean(1)
2013-01-01 0.692259
2013-01-02 -0.334244
2013-01-03 -0.402733
2013-01-04 -0.774728
2013-01-05 -0.129572
2013-01-06 0.553313
Freq: D, dtype: float64
# Six daily values starting 2013-01-01, then shifted two rows later:
# the first two slots become NaN and the last two original values fall off.
s = pd.Series(
    data=[1, 3, 5, np.nan, 6, 8],
    index=pd.date_range(start="20130101", periods=6),
).shift(periods=2)
s
2013-01-01 NaN
2013-01-02 NaN
2013-01-03 1.0
2013-01-04 3.0
2013-01-05 5.0
2013-01-06 NaN
Freq: D, dtype: float64
df
|
A |
B |
C |
D |
2013-01-01 |
0.725530 |
0.304938 |
0.988725 |
0.749843 |
2013-01-02 |
-0.697489 |
-0.916037 |
0.019967 |
0.256584 |
2013-01-03 |
-0.590450 |
-0.261403 |
-0.414659 |
-0.344422 |
2013-01-04 |
-1.720918 |
-0.553150 |
-1.047237 |
0.222394 |
2013-01-05 |
-3.493505 |
1.665040 |
0.356288 |
0.953887 |
2013-01-06 |
0.936351 |
-1.692798 |
1.251221 |
1.718479 |
df.sub(s, axis='index')
|
A |
B |
C |
D |
2013-01-01 |
NaN |
NaN |
NaN |
NaN |
2013-01-02 |
NaN |
NaN |
NaN |
NaN |
2013-01-03 |
-1.590450 |
-1.261403 |
-1.414659 |
-1.344422 |
2013-01-04 |
-4.720918 |
-3.553150 |
-4.047237 |
-2.777606 |
2013-01-05 |
-8.493505 |
-3.334960 |
-4.643712 |
-4.046113 |
2013-01-06 |
NaN |
NaN |
NaN |
NaN |
合并(Merge)
pd.concat([df[:2],df[4:5]])
|
A |
B |
C |
D |
2013-01-01 |
0.725530 |
0.304938 |
0.988725 |
0.749843 |
2013-01-02 |
-0.697489 |
-0.916037 |
0.019967 |
0.256584 |
2013-01-05 |
-3.493505 |
1.665040 |
0.356288 |
0.953887 |
连接(join)
# Both frames carry the key 'foo' twice, so merging on 'key' pairs every
# left row with every right row (2 x 2 = 4 result rows).
left = pd.DataFrame({"key": ["foo"] * 2, "lval": [1, 2]})
right = pd.DataFrame({"key": ["foo"] * 2, "rval": [4, 5]})
left
right
left.merge(right, on="key")
   key  lval  rval
0  foo     1     4
1  foo     1     5
2  foo     2     4
3  foo     2     5
追加(Append)
# Eight rows of standard-normal samples in columns A-D.
df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# pd.concat with ignore_index=True stacks the rows the same way and
# renumbers the result 0..15.
pd.concat([df, df], ignore_index=True)
|
A |
B |
C |
D |
0 |
-0.761027 |
0.430054 |
0.452784 |
1.122863 |
1 |
-2.276889 |
-0.943561 |
1.823242 |
-0.716462 |
2 |
0.430023 |
-0.812228 |
0.938351 |
-0.839029 |
3 |
0.169974 |
0.890258 |
-0.387269 |
0.510224 |
4 |
-1.647350 |
1.135522 |
-1.064858 |
-0.303383 |
5 |
-0.382446 |
0.890663 |
-0.052855 |
-0.548905 |
6 |
-0.963716 |
-1.307239 |
1.830830 |
0.106964 |
7 |
1.481288 |
-0.022846 |
1.371338 |
-0.227230 |
8 |
-0.761027 |
0.430054 |
0.452784 |
1.122863 |
9 |
-2.276889 |
-0.943561 |
1.823242 |
-0.716462 |
10 |
0.430023 |
-0.812228 |
0.938351 |
-0.839029 |
11 |
0.169974 |
0.890258 |
-0.387269 |
0.510224 |
12 |
-1.647350 |
1.135522 |
-1.064858 |
-0.303383 |
13 |
-0.382446 |
0.890663 |
-0.052855 |
-0.548905 |
14 |
-0.963716 |
-1.307239 |
1.830830 |
0.106964 |
15 |
1.481288 |
-0.022846 |
1.371338 |
-0.227230 |
未完待续。。。