Pandas系列教程(七)
对于想要入门数据科学的朋友们来说,Python是一个很好的选择:除了语法简单之外,Python 生态中还提供了很多在数值计算方面非常优秀的库,其中Pandas不可不提。Pandas是很强大的数据集处理工具,往往和numpy、matplotlib 等库搭配使用。我也是刚刚开始学习Pandas,顺便翻译了一下官方的Pandas教程,这里使用的是jupyter notebook。因为博客不支持html直接编辑,所以只能转化为markdown 格式,如果想直接查看html版本可点击每一节下的链接。本文仅供学习和交流使用,欢迎大家交流和指正!
摘要
- groupby()函数的使用
- percentile 分位点的使用
- mean(),std()等统计函数
HTML版本点击此处
import pandas as pd
import sys
# Record the interpreter and pandas versions this notebook was run with.
print(f'Python version {sys.version}')
print(f'Pandas version {pd.__version__}')
Python version 3.6.5 |Anaconda, Inc.| (default, Apr 29 2018, 16:14:56)
[GCC 7.2.0]
Pandas version 0.23.0
# Sample data: ten month-start revenue readings for 2012 and ten for 2013.
# Both years reuse the same state label at each position.
States = ['NY', 'NY', 'NY', 'NY', 'FL', 'FL', 'GA', 'GA', 'FL', 'FL']

# First year: revenue climbs from 1 to 10.
data = [1.0, 2, 3, 4, 5, 6, 7, 8, 9, 10]
idx = pd.date_range(start='1/1/2012', periods=10, freq='MS')
df1 = pd.DataFrame({'Revenue': data, 'State': States}, index=idx)

# Second year: revenue falls from 10 to 6.
data2 = [10.0, 10.0, 9, 9, 8, 8, 7, 7, 6, 6]
idx2 = pd.date_range(start='1/1/2013', periods=10, freq='MS')
df2 = pd.DataFrame({'Revenue': data2, 'State': States}, index=idx2)

# Stack the two years into a single 20-row frame.
df = pd.concat([df1, df2])
df
|
Revenue |
State |
2012-01-01 |
1.0 |
NY |
2012-02-01 |
2.0 |
NY |
2012-03-01 |
3.0 |
NY |
2012-04-01 |
4.0 |
NY |
2012-05-01 |
5.0 |
FL |
2012-06-01 |
6.0 |
FL |
2012-07-01 |
7.0 |
GA |
2012-08-01 |
8.0 |
GA |
2012-09-01 |
9.0 |
FL |
2012-10-01 |
10.0 |
FL |
2013-01-01 |
10.0 |
NY |
2013-02-01 |
10.0 |
NY |
2013-03-01 |
9.0 |
NY |
2013-04-01 |
9.0 |
NY |
2013-05-01 |
8.0 |
FL |
2013-06-01 |
8.0 |
FL |
2013-07-01 |
7.0 |
GA |
2013-08-01 |
7.0 |
GA |
2013-09-01 |
6.0 |
FL |
2013-10-01 |
6.0 |
FL |
# Method 1: flag outliers against the statistics of the whole Revenue column.
# A row is an outlier when its distance from the overall mean exceeds
# 1.96 standard deviations.
newdf = df.copy()
newdf['x-Mean'] = (newdf['Revenue'] - newdf['Revenue'].mean()).abs()
newdf['1.96*std'] = newdf['Revenue'].std() * 1.96
# Reuse the two columns just stored instead of recomputing the deviation.
newdf['Outlier'] = newdf['x-Mean'] > newdf['1.96*std']
newdf
|
Revenue |
State |
x-Mean |
1.96*std |
Outlier |
2012-01-01 |
1.0 |
NY |
5.75 |
5.200273 |
True |
2012-02-01 |
2.0 |
NY |
4.75 |
5.200273 |
False |
2012-03-01 |
3.0 |
NY |
3.75 |
5.200273 |
False |
2012-04-01 |
4.0 |
NY |
2.75 |
5.200273 |
False |
2012-05-01 |
5.0 |
FL |
1.75 |
5.200273 |
False |
2012-06-01 |
6.0 |
FL |
0.75 |
5.200273 |
False |
2012-07-01 |
7.0 |
GA |
0.25 |
5.200273 |
False |
2012-08-01 |
8.0 |
GA |
1.25 |
5.200273 |
False |
2012-09-01 |
9.0 |
FL |
2.25 |
5.200273 |
False |
2012-10-01 |
10.0 |
FL |
3.25 |
5.200273 |
False |
2013-01-01 |
10.0 |
NY |
3.25 |
5.200273 |
False |
2013-02-01 |
10.0 |
NY |
3.25 |
5.200273 |
False |
2013-03-01 |
9.0 |
NY |
2.25 |
5.200273 |
False |
2013-04-01 |
9.0 |
NY |
2.25 |
5.200273 |
False |
2013-05-01 |
8.0 |
FL |
1.25 |
5.200273 |
False |
2013-06-01 |
8.0 |
FL |
1.25 |
5.200273 |
False |
2013-07-01 |
7.0 |
GA |
0.25 |
5.200273 |
False |
2013-08-01 |
7.0 |
GA |
0.25 |
5.200273 |
False |
2013-09-01 |
6.0 |
FL |
0.75 |
5.200273 |
False |
2013-10-01 |
6.0 |
FL |
0.75 |
5.200273 |
False |
# Method 2: same test, but mean and standard deviation are taken per state.
newdf1 = df.copy()
State = newdf1.groupby('State')
# Select 'Revenue' explicitly: transforming the whole grouped frame returns a
# DataFrame with one column per non-key column, which can only be assigned to
# a single column while 'Revenue' is the sole such column.
newdf1['x-Mean'] = State['Revenue'].transform(lambda x: (x - x.mean()).abs())
newdf1['1.96*std'] = State['Revenue'].transform(lambda x: 1.96 * x.std())
# Both columns already hold per-state values, so compare them row by row.
newdf1['Outlier'] = newdf1['x-Mean'] > newdf1['1.96*std']
newdf1
|
Revenue |
State |
x-Mean |
1.96*std |
Outlier |
2012-01-01 |
1.0 |
NY |
5.00 |
7.554813 |
False |
2012-02-01 |
2.0 |
NY |
4.00 |
7.554813 |
False |
2012-03-01 |
3.0 |
NY |
3.00 |
7.554813 |
False |
2012-04-01 |
4.0 |
NY |
2.00 |
7.554813 |
False |
2012-05-01 |
5.0 |
FL |
2.25 |
3.434996 |
False |
2012-06-01 |
6.0 |
FL |
1.25 |
3.434996 |
False |
2012-07-01 |
7.0 |
GA |
0.25 |
0.980000 |
False |
2012-08-01 |
8.0 |
GA |
0.75 |
0.980000 |
False |
2012-09-01 |
9.0 |
FL |
1.75 |
3.434996 |
False |
2012-10-01 |
10.0 |
FL |
2.75 |
3.434996 |
False |
2013-01-01 |
10.0 |
NY |
4.00 |
7.554813 |
False |
2013-02-01 |
10.0 |
NY |
4.00 |
7.554813 |
False |
2013-03-01 |
9.0 |
NY |
3.00 |
7.554813 |
False |
2013-04-01 |
9.0 |
NY |
3.00 |
7.554813 |
False |
2013-05-01 |
8.0 |
FL |
0.75 |
3.434996 |
False |
2013-06-01 |
8.0 |
FL |
0.75 |
3.434996 |
False |
2013-07-01 |
7.0 |
GA |
0.25 |
0.980000 |
False |
2013-08-01 |
7.0 |
GA |
0.25 |
0.980000 |
False |
2013-09-01 |
6.0 |
FL |
1.25 |
3.434996 |
False |
2013-10-01 |
6.0 |
FL |
1.25 |
3.434996 |
False |
"""
方法二:按多个关键字分组
按州(State)和月份同时分组后,每一行的均值和标准差都是用同一个州、各年份中相同月份的Revenue计算出来的
"""
# Group by state AND calendar month; the lambda is applied to each index
# value (a timestamp) and yields its month number.
newdf = df.copy()
StateMonth = newdf.groupby(['State', lambda x: x.month])
# Select 'Revenue' explicitly so each transform returns a Series; the
# frame-wide transform returns a DataFrame over every non-key column, which
# cannot be assigned to one column once more columns exist.
deviation = StateMonth['Revenue'].transform(lambda x: (x - x.mean()).abs())
threshold = StateMonth['Revenue'].transform(lambda x: 1.96 * x.std())
# Column order (Outlier, x-Mean, 1.96*std) is kept as in the original cell.
newdf['Outlier'] = deviation > threshold
newdf['x-Mean'] = deviation
newdf['1.96*std'] = threshold
newdf
|
Revenue |
State |
Outlier |
x-Mean |
1.96*std |
2012-01-01 |
1.0 |
NY |
False |
4.5 |
12.473364 |
2012-02-01 |
2.0 |
NY |
False |
4.0 |
11.087434 |
2012-03-01 |
3.0 |
NY |
False |
3.0 |
8.315576 |
2012-04-01 |
4.0 |
NY |
False |
2.5 |
6.929646 |
2012-05-01 |
5.0 |
FL |
False |
1.5 |
4.157788 |
2012-06-01 |
6.0 |
FL |
False |
1.0 |
2.771859 |
2012-07-01 |
7.0 |
GA |
False |
0.0 |
0.000000 |
2012-08-01 |
8.0 |
GA |
False |
0.5 |
1.385929 |
2012-09-01 |
9.0 |
FL |
False |
1.5 |
4.157788 |
2012-10-01 |
10.0 |
FL |
False |
2.0 |
5.543717 |
2013-01-01 |
10.0 |
NY |
False |
4.5 |
12.473364 |
2013-02-01 |
10.0 |
NY |
False |
4.0 |
11.087434 |
2013-03-01 |
9.0 |
NY |
False |
3.0 |
8.315576 |
2013-04-01 |
9.0 |
NY |
False |
2.5 |
6.929646 |
2013-05-01 |
8.0 |
FL |
False |
1.5 |
4.157788 |
2013-06-01 |
8.0 |
FL |
False |
1.0 |
2.771859 |
2013-07-01 |
7.0 |
GA |
False |
0.0 |
0.000000 |
2013-08-01 |
7.0 |
GA |
False |
0.5 |
1.385929 |
2013-09-01 |
6.0 |
FL |
False |
1.5 |
4.157788 |
2013-10-01 |
6.0 |
FL |
False |
2.0 |
5.543717 |
# Work on a copy and group it by state for the apply-based variant.
newdf = df.copy()
# group_keys=False keeps the result of a later `apply` indexed like the
# original frame; since pandas 2.0 the default (True) would prepend the group
# key to the index of the applied result.
State = newdf.groupby('State', group_keys=False)
def s(group):
    """Return a copy of ``group`` with outlier-detection columns added.

    Args:
        group: DataFrame holding a ``'Revenue'`` column (one groupby group).

    Returns:
        A new DataFrame with the columns of ``group`` plus ``'x-Mean'``
        (absolute deviation from the group mean), ``'1.96*std'`` (1.96 times
        the group standard deviation) and ``'Outlier'`` (True where the
        deviation exceeds that threshold).
    """
    # Work on a copy: mutating the object handed over by GroupBy.apply is
    # discouraged by pandas and can give unexpected results.
    result = group.copy()
    revenue = result['Revenue']
    deviation = (revenue - revenue.mean()).abs()
    threshold = 1.96 * revenue.std()
    result['x-Mean'] = deviation
    result['1.96*std'] = threshold
    result['Outlier'] = deviation > threshold
    return result
# Run `s` on every State group and combine the per-group results.
Newdf2 = State.apply(s)
# A bare name on the last line makes the notebook display the frame.
Newdf2
|
Revenue |
State |
x-Mean |
1.96*std |
Outlier |
2012-01-01 |
1.0 |
NY |
5.00 |
7.554813 |
False |
2012-02-01 |
2.0 |
NY |
4.00 |
7.554813 |
False |
2012-03-01 |
3.0 |
NY |
3.00 |
7.554813 |
False |
2012-04-01 |
4.0 |
NY |
2.00 |
7.554813 |
False |
2012-05-01 |
5.0 |
FL |
2.25 |
3.434996 |
False |
2012-06-01 |
6.0 |
FL |
1.25 |
3.434996 |
False |
2012-07-01 |
7.0 |
GA |
0.25 |
0.980000 |
False |
2012-08-01 |
8.0 |
GA |
0.75 |
0.980000 |
False |
2012-09-01 |
9.0 |
FL |
1.75 |
3.434996 |
False |
2012-10-01 |
10.0 |
FL |
2.75 |
3.434996 |
False |
2013-01-01 |
10.0 |
NY |
4.00 |
7.554813 |
False |
2013-02-01 |
10.0 |
NY |
4.00 |
7.554813 |
False |
2013-03-01 |
9.0 |
NY |
3.00 |
7.554813 |
False |
2013-04-01 |
9.0 |
NY |
3.00 |
7.554813 |
False |
2013-05-01 |
8.0 |
FL |
0.75 |
3.434996 |
False |
2013-06-01 |
8.0 |
FL |
0.75 |
3.434996 |
False |
2013-07-01 |
7.0 |
GA |
0.25 |
0.980000 |
False |
2013-08-01 |
7.0 |
GA |
0.25 |
0.980000 |
False |
2013-09-01 |
6.0 |
FL |
1.25 |
3.434996 |
False |
2013-10-01 |
6.0 |
FL |
1.25 |
3.434996 |
False |
# Work on a copy and group it by state and calendar month; the lambda is
# applied to each index value (a timestamp) and yields its month number.
newdf = df.copy()
# group_keys=False keeps the result of a later `apply` indexed like the
# original frame; since pandas 2.0 the default (True) would prepend the group
# keys to the index of the applied result.
StateMonth = newdf.groupby(['State', lambda x: x.month], group_keys=False)
def s(group):
    """Return a copy of ``group`` with outlier-detection columns added.

    Args:
        group: DataFrame holding a ``'Revenue'`` column (one groupby group).

    Returns:
        A new DataFrame with the columns of ``group`` plus ``'x-Mean'``
        (absolute deviation from the group mean), ``'1.96*std'`` (1.96 times
        the group standard deviation) and ``'Outlier'`` (True where the
        deviation exceeds that threshold).
    """
    # Work on a copy: mutating the object handed over by GroupBy.apply is
    # discouraged by pandas and can give unexpected results.
    result = group.copy()
    revenue = result['Revenue']
    deviation = (revenue - revenue.mean()).abs()
    threshold = 1.96 * revenue.std()
    result['x-Mean'] = deviation
    result['1.96*std'] = threshold
    result['Outlier'] = deviation > threshold
    return result
# Run `s` on every (State, month) group and combine the per-group results.
Newdf2 = StateMonth.apply(s)
# A bare name on the last line makes the notebook display the frame.
Newdf2
|
Revenue |
State |
x-Mean |
1.96*std |
Outlier |
2012-01-01 |
1.0 |
NY |
4.5 |
12.473364 |
False |
2012-02-01 |
2.0 |
NY |
4.0 |
11.087434 |
False |
2012-03-01 |
3.0 |
NY |
3.0 |
8.315576 |
False |
2012-04-01 |
4.0 |
NY |
2.5 |
6.929646 |
False |
2012-05-01 |
5.0 |
FL |
1.5 |
4.157788 |
False |
2012-06-01 |
6.0 |
FL |
1.0 |
2.771859 |
False |
2012-07-01 |
7.0 |
GA |
0.0 |
0.000000 |
False |
2012-08-01 |
8.0 |
GA |
0.5 |
1.385929 |
False |
2012-09-01 |
9.0 |
FL |
1.5 |
4.157788 |
False |
2012-10-01 |
10.0 |
FL |
2.0 |
5.543717 |
False |
2013-01-01 |
10.0 |
NY |
4.5 |
12.473364 |
False |
2013-02-01 |
10.0 |
NY |
4.0 |
11.087434 |
False |
2013-03-01 |
9.0 |
NY |
3.0 |
8.315576 |
False |
2013-04-01 |
9.0 |
NY |
2.5 |
6.929646 |
False |
2013-05-01 |
8.0 |
FL |
1.5 |
4.157788 |
False |
2013-06-01 |
8.0 |
FL |
1.0 |
2.771859 |
False |
2013-07-01 |
7.0 |
GA |
0.0 |
0.000000 |
False |
2013-08-01 |
7.0 |
GA |
0.5 |
1.385929 |
False |
2013-09-01 |
6.0 |
FL |
1.5 |
4.157788 |
False |
2013-10-01 |
6.0 |
FL |
2.0 |
5.543717 |
False |
# Quartile-based fences per state: a row is an outlier when its Revenue lies
# below Q1 - 1.5*IQR or above Q3 + 1.5*IQR of its own state.
newdf = df.copy()
State = newdf.groupby('State')
# transform() broadcasts each state's quartile back onto that state's rows.
q1 = State['Revenue'].transform(lambda x: x.quantile(q=.25))
q3 = State['Revenue'].transform(lambda x: x.quantile(q=.75))
newdf['Lower'] = q1 - (1.5 * (q3 - q1))
newdf['Upper'] = q3 + (1.5 * (q3 - q1))
newdf['Outlier'] = newdf['Revenue'].lt(newdf['Lower']) | newdf['Revenue'].gt(newdf['Upper'])
newdf
|
Revenue |
State |
Lower |
Upper |
Outlier |
2012-01-01 |
1.0 |
NY |
-7.000 |
19.000 |
False |
2012-02-01 |
2.0 |
NY |
-7.000 |
19.000 |
False |
2012-03-01 |
3.0 |
NY |
-7.000 |
19.000 |
False |
2012-04-01 |
4.0 |
NY |
-7.000 |
19.000 |
False |
2012-05-01 |
5.0 |
FL |
2.625 |
11.625 |
False |
2012-06-01 |
6.0 |
FL |
2.625 |
11.625 |
False |
2012-07-01 |
7.0 |
GA |
6.625 |
7.625 |
False |
2012-08-01 |
8.0 |
GA |
6.625 |
7.625 |
True |
2012-09-01 |
9.0 |
FL |
2.625 |
11.625 |
False |
2012-10-01 |
10.0 |
FL |
2.625 |
11.625 |
False |
2013-01-01 |
10.0 |
NY |
-7.000 |
19.000 |
False |
2013-02-01 |
10.0 |
NY |
-7.000 |
19.000 |
False |
2013-03-01 |
9.0 |
NY |
-7.000 |
19.000 |
False |
2013-04-01 |
9.0 |
NY |
-7.000 |
19.000 |
False |
2013-05-01 |
8.0 |
FL |
2.625 |
11.625 |
False |
2013-06-01 |
8.0 |
FL |
2.625 |
11.625 |
False |
2013-07-01 |
7.0 |
GA |
6.625 |
7.625 |
False |
2013-08-01 |
7.0 |
GA |
6.625 |
7.625 |
False |
2013-09-01 |
6.0 |
FL |
2.625 |
11.625 |
False |
2013-10-01 |
6.0 |
FL |
2.625 |
11.625 |
False |
# Tiny two-column frame used below to show what Series.quantile() returns.
x = list(range(1, 5))
y = [value + 1 for value in x]
# Pair the two lists row by row: (1, 2), (2, 3), ...
dataset = [*zip(x, y)]
du = pd.DataFrame(dataset, columns=['a', 'b'])
du
|
a |
b |
0 |
1 |
2 |
1 |
2 |
3 |
2 |
3 |
4 |
3 |
4 |
5 |
# Select column 'a' as a Series (displayed by the notebook).
du['a']
0 1
1 2
2 3
3 4
Name: a, dtype: int64
# With no argument quantile() uses q=0.5, i.e. the median of column 'a'.
du['a'].quantile()
2.5