import pandas as pd
import numpy as np
1 Object Creation
通过list创建Series,使用默认的整数索引(等价于np.arange(n))作为index,如下所示
s = pd.Series([1, 3, 5, np.nan, 6, 8])
print s
0 1.0
1 3.0
2 5.0
3 NaN
4 6.0
5 8.0
dtype: float64
通过numpy array创建DataFrame
,使用datetime作为index,如下所示
dates = pd.date_range("20180101", periods=6)
print dates
DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
'2018-01-05', '2018-01-06'],
dtype='datetime64[ns]', freq='D')
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))
print df
A B C D
2018-01-01 -0.182417 -0.765569 -1.795267 0.412669
2018-01-02 2.160570 0.043086 -0.365435 -1.059866
2018-01-03 -1.245584 -1.048822 -0.549211 2.065063
2018-01-04 -1.509264 -1.069869 0.866164 0.004419
2018-01-05 1.105069 0.556120 -0.271595 -0.674373
2018-01-06 0.557255 0.206010 0.641051 -0.028821
通过字典创建DataFrame
,如下所示
df2 = pd.DataFrame({"A": 1.,
"B": pd.Timestamp("20180102"),
"C": pd.Series(1, index=list(range(4)), dtype="float32"),
"D": np.array([3] * 4, dtype="int32"),
"E": pd.Categorical(["test", "train", "test", "train"]),
"F": "foo"})
print df2
A B C D E F
0 1.0 2018-01-02 1.0 3 test foo
1 1.0 2018-01-02 1.0 3 train foo
2 1.0 2018-01-02 1.0 3 test foo
3 1.0 2018-01-02 1.0 3 train foo
查看各列的数据类型
print df2.dtypes
A float64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
2 Viewing Data
查看frame中顶部或者底部的几行
print df.head()
A B C D
2018-01-01 -0.182417 -0.765569 -1.795267 0.412669
2018-01-02 2.160570 0.043086 -0.365435 -1.059866
2018-01-03 -1.245584 -1.048822 -0.549211 2.065063
2018-01-04 -1.509264 -1.069869 0.866164 0.004419
2018-01-05 1.105069 0.556120 -0.271595 -0.674373
print df.tail(3)
A B C D
2018-01-04 -1.509264 -1.069869 0.866164 0.004419
2018-01-05 1.105069 0.556120 -0.271595 -0.674373
2018-01-06 0.557255 0.206010 0.641051 -0.028821
查看index、columns以及内部的numpy data
print df.index
DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
'2018-01-05', '2018-01-06'],
dtype='datetime64[ns]', freq='D')
print df.columns
Index([u'A', u'B', u'C', u'D'], dtype='object')
print df.values
[[-0.1824167 -0.76556885 -1.79526709 0.41266945]
[ 2.1605698 0.0430857 -0.36543471 -1.05986617]
[-1.24558386 -1.04882168 -0.54921109 2.06506281]
[-1.50926408 -1.06986922 0.86616383 0.00441866]
[ 1.10506893 0.55611973 -0.27159477 -0.67437281]
[ 0.55725508 0.20601021 0.64105132 -0.02882132]]
描述数据的统计信息
print df.describe()
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean 0.147605 -0.346507 -0.245715 0.119848
std 1.409374 0.701984 0.951930 1.088811
min -1.509264 -1.069869 -1.795267 -1.059866
25% -0.979792 -0.978008 -0.503267 -0.512985
50% 0.187419 -0.361242 -0.318515 -0.012201
75% 0.968115 0.165279 0.412890 0.310607
max 2.160570 0.556120 0.866164 2.065063
转置
print df.T
2018-01-01 2018-01-02 2018-01-03 2018-01-04 2018-01-05 2018-01-06
A -0.182417 2.160570 -1.245584 -1.509264 1.105069 0.557255
B -0.765569 0.043086 -1.048822 -1.069869 0.556120 0.206010
C -1.795267 -0.365435 -0.549211 0.866164 -0.271595 0.641051
D 0.412669 -1.059866 2.065063 0.004419 -0.674373 -0.028821
按照axis排序
print df.sort_index(axis=1, ascending=False) #按照columns逆序排序
print df.sort_index(axis=0, ascending=False) #按照index逆序排序
D C B A
2018-01-01 0.412669 -1.795267 -0.765569 -0.182417
2018-01-02 -1.059866 -0.365435 0.043086 2.160570
2018-01-03 2.065063 -0.549211 -1.048822 -1.245584
2018-01-04 0.004419 0.866164 -1.069869 -1.509264
2018-01-05 -0.674373 -0.271595 0.556120 1.105069
2018-01-06 -0.028821 0.641051 0.206010 0.557255
A B C D
2018-01-06 0.557255 0.206010 0.641051 -0.028821
2018-01-05 1.105069 0.556120 -0.271595 -0.674373
2018-01-04 -1.509264 -1.069869 0.866164 0.004419
2018-01-03 -1.245584 -1.048822 -0.549211 2.065063
2018-01-02 2.160570 0.043086 -0.365435 -1.059866
2018-01-01 -0.182417 -0.765569 -1.795267 0.412669
按照值排序
print df.sort_values(by="B") #按照B列的值升序排序
A B C D
2018-01-04 -1.509264 -1.069869 0.866164 0.004419
2018-01-03 -1.245584 -1.048822 -0.549211 2.065063
2018-01-01 -0.182417 -0.765569 -1.795267 0.412669
2018-01-02 2.160570 0.043086 -0.365435 -1.059866
2018-01-06 0.557255 0.206010 0.641051 -0.028821
2018-01-05 1.105069 0.556120 -0.271595 -0.674373
3 Selection
建议使用.at、.iat、.loc、.iloc方法来访问DataFrame中的数据(.ix自pandas 0.20起已被废弃,并在1.0中移除,不建议再使用)
3.1 Getting
获取单独的一列,返回一个Series
,等价于df.A
print df["A"]
print df.A
2018-01-01 -0.182417
2018-01-02 2.160570
2018-01-03 -1.245584
2018-01-04 -1.509264
2018-01-05 1.105069
2018-01-06 0.557255
Freq: D, Name: A, dtype: float64
2018-01-01 -0.182417
2018-01-02 2.160570
2018-01-03 -1.245584
2018-01-04 -1.509264
2018-01-05 1.105069
2018-01-06 0.557255
Freq: D, Name: A, dtype: float64
通过[begin_idx: end_idx]下标来访问某几行
print df[0:3] #左闭右开,这里不包含下标为3的行
A B C D
2018-01-01 -0.182417 -0.765569 -1.795267 0.412669
2018-01-02 2.160570 0.043086 -0.365435 -1.059866
2018-01-03 -1.245584 -1.048822 -0.549211 2.065063
print df["20180102": "20180104"] #需要注意的是这里包含"20180104"
A B C D
2018-01-02 2.160570 0.043086 -0.365435 -1.059866
2018-01-03 -1.245584 -1.048822 -0.549211 2.065063
2018-01-04 -1.509264 -1.069869 0.866164 0.004419
3.2 Selection by Label
使用.loc
和.at
来访问数据
print df.loc[dates[0]] #获取某一行
A -0.182417
B -0.765569
C -1.795267
D 0.412669
Name: 2018-01-01 00:00:00, dtype: float64
print df.loc[:, ["A", "B"]] #获取所有行中指定列的数据
A B
2018-01-01 -0.182417 -0.765569
2018-01-02 2.160570 0.043086
2018-01-03 -1.245584 -1.048822
2018-01-04 -1.509264 -1.069869
2018-01-05 1.105069 0.556120
2018-01-06 0.557255 0.206010
print df.loc["20180102": "20180105", ["A", "B"]] #获取指定行指定列的数据
A B
2018-01-02 2.160570 0.043086
2018-01-03 -1.245584 -1.048822
2018-01-04 -1.509264 -1.069869
2018-01-05 1.105069 0.556120
print df.loc["20180106", ["A", "C"]]
A 0.557255
C 0.641051
Name: 2018-01-06 00:00:00, dtype: float64
print df.loc[dates[0], "A"] #获取数据表中的某一个数据
-0.182416696336
print df.at[dates[0], "A"] #快速获取数据表中的某一个数据
-0.182416696336
3.3 Selection by Position
使用iloc
和iat
访问数据
print df.iloc[3]
A -1.509264
B -1.069869
C 0.866164
D 0.004419
Name: 2018-01-04 00:00:00, dtype: float64
print df.iloc[3:5, 0:2]
A B
2018-01-04 -1.509264 -1.069869
2018-01-05 1.105069 0.556120
print df.iloc[[1, 2, 3], [0, 2]]
A C
2018-01-02 2.160570 -0.365435
2018-01-03 -1.245584 -0.549211
2018-01-04 -1.509264 0.866164
print df.iloc[1:3, :]
A B C D
2018-01-02 2.160570 0.043086 -0.365435 -1.059866
2018-01-03 -1.245584 -1.048822 -0.549211 2.065063
print df.iloc[:, 1:3]
B C
2018-01-01 -0.765569 -1.795267
2018-01-02 0.043086 -0.365435
2018-01-03 -1.048822 -0.549211
2018-01-04 -1.069869 0.866164
2018-01-05 0.556120 -0.271595
2018-01-06 0.206010 0.641051
print df.iloc[1,1]
0.0430856959577
print df.iat[1,1]
0.0430856959577
3.4 Boolean Indexing
根据某列的值选取数据
print df[df["A"] > 0] #选取A列值大于0的所有行
A B C D
2018-01-02 2.160570 0.043086 -0.365435 -1.059866
2018-01-05 1.105069 0.556120 -0.271595 -0.674373
2018-01-06 0.557255 0.206010 0.641051 -0.028821
根据布尔值获取数据
print df[df > 0]
A B C D
2018-01-01 NaN NaN NaN 0.412669
2018-01-02 2.160570 0.043086 NaN NaN
2018-01-03 NaN NaN NaN 2.065063
2018-01-04 NaN NaN 0.866164 0.004419
2018-01-05 1.105069 0.556120 NaN NaN
2018-01-06 0.557255 0.206010 0.641051 NaN
使用isin()方法进行过滤
df2 = df.copy()
df2["E"] = ["one", "one", "two", "three", "four", "three"]
print df2
print df2[df2["E"].isin(["two", "four"])]
A B C D E
2018-01-01 -0.182417 -0.765569 -1.795267 0.412669 one
2018-01-02 2.160570 0.043086 -0.365435 -1.059866 one
2018-01-03 -1.245584 -1.048822 -0.549211 2.065063 two
2018-01-04 -1.509264 -1.069869 0.866164 0.004419 three
2018-01-05 1.105069 0.556120 -0.271595 -0.674373 four
2018-01-06 0.557255 0.206010 0.641051 -0.028821 three
A B C D E
2018-01-03 -1.245584 -1.048822 -0.549211 2.065063 two
2018-01-05 1.105069 0.556120 -0.271595 -0.674373 four
3.5 Setting
s1 = pd.Series(range(6), index=pd.date_range("20180101", periods=6))
df["F"] = s1 #使用Series新增F列(按index对齐)
print df
A B C D F
2018-01-01 -0.182417 -0.765569 -1.795267 0.412669 0
2018-01-02 2.160570 0.043086 -0.365435 -1.059866 1
2018-01-03 -1.245584 -1.048822 -0.549211 2.065063 2
2018-01-04 -1.509264 -1.069869 0.866164 0.004419 3
2018-01-05 1.105069 0.556120 -0.271595 -0.674373 4
2018-01-06 0.557255 0.206010 0.641051 -0.028821 5
df.at[dates[0], "A"] = 0 #重置某一个元素的值
print df
A B C D F
2018-01-01 0.000000 -0.765569 -1.795267 0.412669 0
2018-01-02 2.160570 0.043086 -0.365435 -1.059866 1
2018-01-03 -1.245584 -1.048822 -0.549211 2.065063 2
2018-01-04 -1.509264 -1.069869 0.866164 0.004419 3
2018-01-05 1.105069 0.556120 -0.271595 -0.674373 4
2018-01-06 0.557255 0.206010 0.641051 -0.028821 5
df.iat[0, 1] = -1 #重置某一个元素的值
print df
A B C D F
2018-01-01 0.000000 -1.000000 -1.795267 0.412669 0
2018-01-02 2.160570 0.043086 -0.365435 -1.059866 1
2018-01-03 -1.245584 -1.048822 -0.549211 2.065063 2
2018-01-04 -1.509264 -1.069869 0.866164 0.004419 3
2018-01-05 1.105069 0.556120 -0.271595 -0.674373 4
2018-01-06 0.557255 0.206010 0.641051 -0.028821 5
df.loc[:, "D"] = np.array([5] * len(df))
print df
A B C D F
2018-01-01 0.000000 -1.000000 -1.795267 5 0
2018-01-02 2.160570 0.043086 -0.365435 5 1
2018-01-03 -1.245584 -1.048822 -0.549211 5 2
2018-01-04 -1.509264 -1.069869 0.866164 5 3
2018-01-05 1.105069 0.556120 -0.271595 5 4
2018-01-06 0.557255 0.206010 0.641051 5 5
df2 = df.copy()
df2[df2 > 0] = -df2 #将df2中所有大于0的值转换成对应的相反数
print df2
A B C D F
2018-01-01 0.000000 -1.000000 -1.795267 -5 0
2018-01-02 -2.160570 -0.043086 -0.365435 -5 -1
2018-01-03 -1.245584 -1.048822 -0.549211 -5 -2
2018-01-04 -1.509264 -1.069869 -0.866164 -5 -3
2018-01-05 -1.105069 -0.556120 -0.271595 -5 -4
2018-01-06 -0.557255 -0.206010 -0.641051 -5 -5
4 Missing Data
pandas默认使用np.nan来表示数据中的缺失值。
使用重构索引reindex
从一个DataFrame中创建一个有缺失值的DataFrame,如下所示:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns)+["E"])
df1.loc[dates[0]: dates[1], "E"] = 1
print df1
A B C D F E
2018-01-01 0.000000 -1.000000 -1.795267 5 0 1.0
2018-01-02 2.160570 0.043086 -0.365435 5 1 1.0
2018-01-03 -1.245584 -1.048822 -0.549211 5 2 NaN
2018-01-04 -1.509264 -1.069869 0.866164 5 3 NaN
4.1 缺失值检查
使用df.isnull()
或者df.notnull()
方法判断缺失值
print df1.notnull()
A B C D F E
2018-01-01 True True True True True True
2018-01-02 True True True True True True
2018-01-03 True True True True True False
2018-01-04 True True True True True False
print df1.isnull()
A B C D F E
2018-01-01 False False False False False False
2018-01-02 False False False False False False
2018-01-03 False False False False False True
2018-01-04 False False False False False True
4.2 填充缺失值
使用df.fillna()
方法用标量值填充缺失值
print df1.fillna(value=4)
A B C D F E
2018-01-01 0.000000 -1.000000 -1.795267 5 0 1.0
2018-01-02 2.160570 0.043086 -0.365435 5 1 1.0
2018-01-03 -1.245584 -1.048822 -0.549211 5 2 4.0
2018-01-04 -1.509264 -1.069869 0.866164 5 3 4.0
4.3 丢弃缺失值
可以通过df.dropna()
方法丢弃掉所有包含有缺失值的行,如下所示
print df1.dropna()
A B C D F E
2018-01-01 0.00000 -1.000000 -1.795267 5 0 1.0
2018-01-02 2.16057 0.043086 -0.365435 5 1 1.0
4.4 固定值替换
可以通过df.replace()
方法替换掉某一些固定的值,如下所示
print df1.replace({1:1.1, 0: 0.1})
A B C D F E
2018-01-01 0.100000 -1.000000 -1.795267 5 0.1 1.1
2018-01-02 2.160570 0.043086 -0.365435 5 1.1 1.1
2018-01-03 -1.245584 -1.048822 -0.549211 5 2.0 NaN
2018-01-04 -1.509264 -1.069869 0.866164 5 3.0 NaN
5 Operations
5.1 统计方法
按列求平均值
print df.mean()
A 0.178008
B -0.385579
C -0.245715
D 5.000000
F 2.500000
dtype: float64
按行求平均值
print df.mean(1)
2018-01-01 0.440947
2018-01-02 1.567644
2018-01-03 0.831277
2018-01-04 1.257406
2018-01-05 2.077919
2018-01-06 2.280863
Freq: D, dtype: float64
除了求平均值,还有其他统计方法如:min
max
等
print df.min()
A -1.509264
B -1.069869
C -1.795267
D 5.000000
F 0.000000
dtype: float64
5.2 Apply
可以直接对数据施加某种操作(默认按列处理,即将每一列传入操作函数),如下所示
按列累计求和
print df
print df.apply(np.cumsum)
A B C D F
2018-01-01 0.000000 -1.000000 -1.795267 5 0
2018-01-02 2.160570 0.043086 -0.365435 5 1
2018-01-03 -1.245584 -1.048822 -0.549211 5 2
2018-01-04 -1.509264 -1.069869 0.866164 5 3
2018-01-05 1.105069 0.556120 -0.271595 5 4
2018-01-06 0.557255 0.206010 0.641051 5 5
A B C D F
2018-01-01 0.000000 -1.000000 -1.795267 5 0
2018-01-02 2.160570 -0.956914 -2.160702 10 1
2018-01-03 0.914986 -2.005736 -2.709913 15 3
2018-01-04 -0.594278 -3.075605 -1.843749 20 6
2018-01-05 0.510791 -2.519485 -2.115344 25 10
2018-01-06 1.068046 -2.313475 -1.474293 30 15
求各列最大值与最小值的差
print df.apply(lambda x: x.max() - x.min())
A 3.669834
B 1.625989
C 2.661431
D 0.000000
F 5.000000
dtype: float64
5.3 字符串方法
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
print s.str.lower()
0 a
1 b
2 c
3 aaba
4 baca
5 NaN
6 caba
7 dog
8 cat
dtype: object
print s.str.upper()
0 A
1 B
2 C
3 AABA
4 BACA
5 NaN
6 CABA
7 DOG
8 CAT
dtype: object
print s.str.len()
0 1.0
1 1.0
2 1.0
3 4.0
4 4.0
5 NaN
6 4.0
7 3.0
8 3.0
dtype: float64
print s.str.strip("a")
0 A
1 B
2 C
3 Aab
4 Bac
5 NaN
6 CABA
7 dog
8 cat
dtype: object
print s.str.lstrip("c")
0 A
1 B
2 C
3 Aaba
4 Baca
5 NaN
6 CABA
7 dog
8 at
dtype: object
6 Merge
6.1 Concat
使用concat
将两个DataFrame连接在一起,如下所示
pd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False)
参数 | 含义 |
---|---|
objs | Series、DataFrame或者它们的list |
axis | 连接的轴,默认为0,也就是按行连接在一起 |
join | 其他轴上索引的处理方式:'outer'取并集(默认),'inner'取交集 |
join_axes | 指定其他轴上要使用的索引,而不是按join取并集/交集(该参数在pandas 0.25中废弃,1.0中移除) |
ignore_index | 是否忽略objs中的index |
df = pd.DataFrame(np.random.randn(10, 4))
print df
0 1 2 3
0 -0.867021 0.605678 0.012679 -1.775585
1 -0.434919 -0.896450 0.250021 -0.353441
2 1.926973 -0.853758 -1.694442 -0.426520
3 0.451539 0.619271 0.215580 0.347851
4 0.024561 -0.272727 1.234351 1.129837
5 -2.072784 0.962564 0.945457 -1.331562
6 -1.007067 0.277316 1.338265 -0.363388
7 -0.086109 -0.131523 1.161846 1.909355
8 -0.991148 0.657311 1.150405 -0.808498
9 0.921782 -1.977269 0.368596 0.961012
pieces = [df[3:7], df[:3], df[7:], df[:]]
print pd.concat(pieces)
0 1 2 3
3 0.451539 0.619271 0.215580 0.347851
4 0.024561 -0.272727 1.234351 1.129837
5 -2.072784 0.962564 0.945457 -1.331562
6 -1.007067 0.277316 1.338265 -0.363388
0 -0.867021 0.605678 0.012679 -1.775585
1 -0.434919 -0.896450 0.250021 -0.353441
2 1.926973 -0.853758 -1.694442 -0.426520
7 -0.086109 -0.131523 1.161846 1.909355
8 -0.991148 0.657311 1.150405 -0.808498
9 0.921782 -1.977269 0.368596 0.961012
0 -0.867021 0.605678 0.012679 -1.775585
1 -0.434919 -0.896450 0.250021 -0.353441
2 1.926973 -0.853758 -1.694442 -0.426520
3 0.451539 0.619271 0.215580 0.347851
4 0.024561 -0.272727 1.234351 1.129837
5 -2.072784 0.962564 0.945457 -1.331562
6 -1.007067 0.277316 1.338265 -0.363388
7 -0.086109 -0.131523 1.161846 1.909355
8 -0.991148 0.657311 1.150405 -0.808498
9 0.921782 -1.977269 0.368596 0.961012
6.2 Join
类似SQL的连接操作
pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=True)
参数 | 含义 |
---|---|
left | 一个DataFrame |
right | 另一个DataFrame |
how | 连接方式,是left、right、outer以及inner中的一个,默认为inner |
on | 用于连接的列名,必须同时存在于两个DataFrame中;未指定时默认使用两者共有的列 |
left_on | 左侧DataFrame中用作连接键的列(左右两侧键的列名不同时使用) |
right_on | 右侧DataFrame中用作连接键的列(左右两侧键的列名不同时使用) |
left_index | 为True时,使用左侧DataFrame的index作为连接键 |
right_index | 为True时,使用右侧DataFrame的index作为连接键 |
sort | 是否按照连接键的字典序对结果进行排序 |
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
print left
print right
key lval
0 foo 1
1 foo 2
key rval
0 foo 4
1 foo 5
print pd.merge(left, right, on="key")
key lval rval
0 foo 1 4
1 foo 1 5
2 foo 2 4
3 foo 2 5
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
print left
print right
key lval
0 foo 1
1 bar 2
key rval
0 foo 4
1 bar 5
print pd.merge(left, right, on="key")
key lval rval
0 foo 1 4
1 bar 2 5
left = pd.DataFrame({'key': ['foo', 'bar', 'bar'], 'lval': [1, 2, 3]})
right = pd.DataFrame({'key': ['foo', 'foo', 'bar'], 'rval': [4, 5, 6]})
print left
print right
print pd.merge(left, right, on="key")
key lval
0 foo 1
1 bar 2
2 bar 3
key rval
0 foo 4
1 foo 5
2 bar 6
key lval rval
0 foo 1 4
1 foo 1 5
2 bar 2 6
3 bar 3 6
6.3 Append
使用append为DataFrame新增一行(注意:DataFrame.append在pandas 1.4中废弃、2.0中移除,新版本请改用pd.concat),如下所示
DataFrame.append(other, ignore_index=False, verify_integrity=False)
参数 | 含义 |
---|---|
other | 另外一个Series或DataFrame,或者由Series/DataFrame组成的list |
ignore_index | 为True时忽略原有的index,结果重新编号为0到n-1 |
verify_integrity | 为True时,若结果中出现重复的index则抛出ValueError |
df = pd.DataFrame(np.random.randn(8, 4), columns = ["A", "B", "C", "D"])
print df
A B C D
0 -2.163123 0.948758 -0.388814 0.870307
1 0.013942 -2.070907 1.344709 -0.976564
2 1.498956 2.899843 0.433546 0.288232
3 0.110854 0.106648 0.809414 -0.893506
4 1.919487 -1.024317 -0.641602 1.686787
5 1.836246 0.024001 -0.199739 1.479661
6 0.368328 0.541060 0.014246 -0.050968
7 1.374221 0.515852 -0.950572 -0.961190
s = df.iloc[3]
print df.append(s)
A B C D
0 -2.163123 0.948758 -0.388814 0.870307
1 0.013942 -2.070907 1.344709 -0.976564
2 1.498956 2.899843 0.433546 0.288232
3 0.110854 0.106648 0.809414 -0.893506
4 1.919487 -1.024317 -0.641602 1.686787
5 1.836246 0.024001 -0.199739 1.479661
6 0.368328 0.541060 0.014246 -0.050968
7 1.374221 0.515852 -0.950572 -0.961190
3 0.110854 0.106648 0.809414 -0.893506
print df.append(s, ignore_index=True)
A B C D
0 -2.163123 0.948758 -0.388814 0.870307
1 0.013942 -2.070907 1.344709 -0.976564
2 1.498956 2.899843 0.433546 0.288232
3 0.110854 0.106648 0.809414 -0.893506
4 1.919487 -1.024317 -0.641602 1.686787
5 1.836246 0.024001 -0.199739 1.479661
6 0.368328 0.541060 0.014246 -0.050968
7 1.374221 0.515852 -0.950572 -0.961190
8 0.110854 0.106648 0.809414 -0.893506
7 Grouping
group by的执行过程如下
- 将数据按照group keys分组
- 对每个分组中的数据按列执行某种操作,如按列求和
- 将结果组织成DataFrame返回
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
'C' : np.random.randn(8),
'D' : np.random.randn(8)})
print df
A B C D
0 foo one -1.303717 -0.981339
1 bar one 1.739146 -0.913236
2 foo two -0.917766 -0.183430
3 bar three -0.584993 -0.899904
4 foo two -0.515345 -0.694656
5 bar two 1.162459 -2.185869
6 foo one 0.500489 -0.292250
7 foo three 1.353060 0.643049
print df.groupby("A").sum()
C D
A
bar 2.316612 -3.999009
foo -0.883280 -1.508626
print df.groupby(["A", "B"]).mean()
C D
A B
bar one 1.739146 -0.913236
three -0.584993 -0.899904
two 1.162459 -2.185869
foo one -0.401614 -0.636794
three 1.353060 0.643049
two -0.716556 -0.439043
8 Time Series
rng = pd.date_range("1/1/2018", periods=100, freq="S")
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
print ts
2018-01-01 00:00:00 392
2018-01-01 00:00:01 26
2018-01-01 00:00:02 474
2018-01-01 00:00:03 308
2018-01-01 00:00:04 27
2018-01-01 00:00:05 337
2018-01-01 00:00:06 224
2018-01-01 00:00:07 257
2018-01-01 00:00:08 336
2018-01-01 00:00:09 79
2018-01-01 00:00:10 125
2018-01-01 00:00:11 186
2018-01-01 00:00:12 443
2018-01-01 00:00:13 304
2018-01-01 00:00:14 446
2018-01-01 00:00:15 476
2018-01-01 00:00:16 124
2018-01-01 00:00:17 466
2018-01-01 00:00:18 186
2018-01-01 00:00:19 370
2018-01-01 00:00:20 408
2018-01-01 00:00:21 243
2018-01-01 00:00:22 425
2018-01-01 00:00:23 276
2018-01-01 00:00:24 429
2018-01-01 00:00:25 339
2018-01-01 00:00:26 354
2018-01-01 00:00:27 403
2018-01-01 00:00:28 102
2018-01-01 00:00:29 122
...
2018-01-01 00:01:10 181
2018-01-01 00:01:11 70
2018-01-01 00:01:12 134
2018-01-01 00:01:13 246
2018-01-01 00:01:14 87
2018-01-01 00:01:15 313
2018-01-01 00:01:16 473
2018-01-01 00:01:17 292
2018-01-01 00:01:18 460
2018-01-01 00:01:19 293
2018-01-01 00:01:20 316
2018-01-01 00:01:21 449
2018-01-01 00:01:22 334
2018-01-01 00:01:23 327
2018-01-01 00:01:24 120
2018-01-01 00:01:25 87
2018-01-01 00:01:26 353
2018-01-01 00:01:27 401
2018-01-01 00:01:28 406
2018-01-01 00:01:29 346
2018-01-01 00:01:30 107
2018-01-01 00:01:31 128
2018-01-01 00:01:32 214
2018-01-01 00:01:33 64
2018-01-01 00:01:34 433
2018-01-01 00:01:35 264
2018-01-01 00:01:36 429
2018-01-01 00:01:37 140
2018-01-01 00:01:38 256
2018-01-01 00:01:39 291
Freq: S, dtype: int64
rng = pd.date_range('3/6/2018 00:00', periods=5, freq='D')
ts = pd.Series(np.random.randn(len(rng)), rng)
print ts
2018-03-06 1.696274
2018-03-07 1.816374
2018-03-08 2.211654
2018-03-09 -0.282086
2018-03-10 0.232116
Freq: D, dtype: float64
rng = pd.date_range('1/1/2018', periods=5, freq='M')
ts = pd.Series(np.random.randn(len(rng)), index=rng)
print ts
2018-01-31 -0.240352
2018-02-28 0.808056
2018-03-31 1.432285
2018-04-30 0.088228
2018-05-31 0.510007
Freq: M, dtype: float64
9 Getting Data In/Out
待补充