import pandas as pd
import datetime
import numpy as np
# Series.rank: position of each value in sorted order (1.0 = smallest).
# With no ties, method='average' simply yields 1.0 .. n.
S = pd.Series([3, 5, 1, 9])
S.rank(method='average')
0 2.0
1 3.0
2 1.0
3 4.0
dtype: float64
# method='average': tied values share the mean of the ranks they occupy
# (the two 4s take ranks 3 and 4, so both get 3.5).
S = pd.Series([4, 5, 1, 9, 2, 5, 4])
S.rank(method='average')
0 3.5
1 5.5
2 1.0
3 7.0
4 2.0
5 5.5
6 3.5
dtype: float64
# method='min': tied values all receive the lowest rank of their group.
S = pd.Series([4, 5, 1, 9, 2, 5, 4])
S.rank(method='min')
0 3.0
1 5.0
2 1.0
3 7.0
4 2.0
5 5.0
6 3.0
dtype: float64
# method='max': tied values all receive the highest rank of their group.
S = pd.Series([4, 5, 1, 9, 2, 5, 4])
S.rank(method='max')
0 4.0
1 6.0
2 1.0
3 7.0
4 2.0
5 6.0
6 4.0
dtype: float64
# method='first': ties are broken by order of appearance, so every rank
# is distinct.
S = pd.Series([4, 5, 1, 9, 2, 5, 4])
S.rank(method='first')
0 3.0
1 5.0
2 1.0
3 7.0
4 2.0
5 6.0
6 4.0
dtype: float64
# method='dense': like 'min', but the rank only increases by 1 between
# distinct values, so there are no gaps after ties.
S = pd.Series([4, 5, 1, 9, 2, 5, 4])
S.rank(method='dense')
0 3.0
1 4.0
2 1.0
3 5.0
4 2.0
5 4.0
6 3.0
dtype: float64
# groupby: grouped aggregation
df = pd.DataFrame({
    'a': ['A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C'],
    'b': [1, 3, 5, 7, 9, 2, 4, 6, 8],
    'c': [1, 2, 3, 4, 5, 6, 7, 8, 9],
})
# Mean of 'b' per 'a' group, exposed as column 'mean_value'.
# Named aggregation (new_column='func') replaces the dict-renaming form
# .agg({'mean_value': 'mean'}), which pandas has deprecated for a
# column-selected groupby. reset_index() turns the group key back into an
# ordinary column, which is what as_index=False was used for.
mean_by_a = df.groupby('a')['b'].agg(mean_value='mean').reset_index()
mean_by_a
|
a |
mean_value |
0 |
A |
4 |
1 |
B |
6 |
2 |
C |
5 |
# Mean and max of 'b' per 'a' group in one pass, using named aggregation
# instead of the deprecated dict-renaming form of .agg().
# NOTE: the column name 'max_vale' (sic) is kept as originally written so the
# result still matches the table shown below.
df.groupby('a')['b'].agg(mean_value='mean', max_vale='max').reset_index()
|
a |
mean_value |
max_vale |
0 |
A |
4 |
7 |
1 |
B |
6 |
9 |
2 |
C |
5 |
8 |
# Rank every 'b' value inside its own 'a' group, largest value first
# (ascending=False, so the biggest 'b' of each group gets rank 1.0).
df['rank'] = df['b'].groupby(df['a']).rank(ascending=False)
df
|
a |
b |
c |
rank |
0 |
A |
1 |
1 |
3.0 |
1 |
B |
3 |
2 |
3.0 |
2 |
C |
5 |
3 |
2.0 |
3 |
A |
7 |
4 |
1.0 |
4 |
B |
9 |
5 |
1.0 |
5 |
C |
2 |
6 |
3.0 |
6 |
A |
4 |
7 |
2.0 |
7 |
B |
6 |
8 |
2.0 |
8 |
C |
8 |
9 |
1.0 |
# Iterating over groups (对分组进行迭代)
# Small demo frame: two key columns plus two columns of random integers.
# data_1 is drawn from [0, 8) and data_2 from [5, 10); no seed is set, so
# the values differ from run to run.
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data_1': [np.random.randint(0, 8) for i in range(5)],
    'data_2': [np.random.randint(5, 10) for i in range(5)],
})
df
|
data_1 |
data_2 |
key1 |
key2 |
0 |
2 |
7 |
a |
one |
1 |
3 |
5 |
a |
two |
2 |
5 |
9 |
b |
one |
3 |
1 |
6 |
b |
two |
4 |
4 |
6 |
a |
one |
# Max and mean of data_1 for every (key1, key2) pair.
# Named aggregation replaces the deprecated dict-renaming form of .agg();
# reset_index() keeps key1/key2 as ordinary columns, which is what
# as_index=False was used for.
tmp = (
    df.groupby(['key1', 'key2'])['data_1']
    .agg(max_value='max', mean_value='mean')
    .reset_index()
)
tmp
|
key1 |
key2 |
max_value |
mean_value |
0 |
a |
one |
4 |
3 |
1 |
a |
two |
3 |
3 |
2 |
b |
one |
5 |
5 |
3 |
b |
two |
1 |
1 |
# Iterating a GroupBy yields (group key, sub-DataFrame) pairs.
# The loop variable is named `group` rather than `df` so the DataFrame bound
# to `df` is not overwritten; print() calls work on Python 2 and Python 3.
for key, group in tmp.groupby('key1'):
    print(key)
    print(group)
a
key1 key2 max_value mean_value
0 a one 4 3
1 a two 3 3
b
key1 key2 max_value mean_value
2 b one 5 5
3 b two 1 1
# DataFrame.iterrows(): yields (index label, row) pairs.
# NOTE(review): the original indentation was lost; all three prints are
# assumed to belong to the loop body — confirm.
for index, row in tmp.iterrows():
    print(index)
    print(row)
    print(type(row))
0
key1 a
key2 one
max_value 4
mean_value 3
Name: 0, dtype: object
1
key1 a
key2 two
max_value 3
mean_value 3
Name: 1, dtype: object
2
key1 b
key2 one
max_value 5
mean_value 5
Name: 2, dtype: object
3
key1 b
key2 two
max_value 1
mean_value 1
Name: 3, dtype: object
# Long-format sample data: one row per (item_id, day) with its buy and cnt
# values.
tmp = pd.DataFrame({
    'item_id': ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C'],
    'day': [1, 2, 3, 1, 2, 3, 1, 2, 3],
    'buy': [5, 4, 8, 6, 4, 2, 12, 18, 10],
    'cnt': [10, 20, 32, 30, 16, 4, 16, 30, 20],
})
tmp
|
buy |
cnt |
day |
item_id |
0 |
5 |
10 |
1 |
A |
1 |
4 |
20 |
2 |
A |
2 |
8 |
32 |
3 |
A |
3 |
6 |
30 |
1 |
B |
4 |
4 |
16 |
2 |
B |
5 |
2 |
4 |
3 |
B |
6 |
12 |
16 |
1 |
C |
7 |
18 |
30 |
2 |
C |
8 |
10 |
20 |
3 |
C |
# Flatten the per-day rows of each item into one wide feature dict:
# {'item_id': key, 'item_idbuy<day>': buy, 'item_idcnt<day>': cnt, ...}.
item = 'item_id'
features = []
# `item_rows` (not `df`) so the DataFrame bound to `df` is not overwritten.
for key, item_rows in tmp.groupby(item):
    feature = {item: key}
    # Walk the three columns in lockstep instead of building a Series per
    # row with iterrows().
    for day, buy, cnt in zip(item_rows['day'], item_rows['buy'], item_rows['cnt']):
        day_suffix = str(int(day))
        feature[item + 'buy' + day_suffix] = buy
        feature[item + 'cnt' + day_suffix] = cnt
    features.append(feature)
features
[{'item_id': 'A',
'item_idbuy1': 5L,
'item_idbuy2': 4L,
'item_idbuy3': 8L,
'item_idcnt1': 10L,
'item_idcnt2': 20L,
'item_idcnt3': 32L},
{'item_id': 'B',
'item_idbuy1': 6L,
'item_idbuy2': 4L,
'item_idbuy3': 2L,
'item_idcnt1': 30L,
'item_idcnt2': 16L,
'item_idcnt3': 4L},
{'item_id': 'C',
'item_idbuy1': 12L,
'item_idbuy2': 18L,
'item_idbuy3': 10L,
'item_idcnt1': 16L,
'item_idcnt2': 30L,
'item_idcnt3': 20L}]
# One row per feature dict; the dict keys become the columns.
pd.DataFrame(data=features)
|
item_id |
item_idbuy1 |
item_idbuy2 |
item_idbuy3 |
item_idcnt1 |
item_idcnt2 |
item_idcnt3 |
0 |
A |
5 |
4 |
8 |
10 |
20 |
32 |
1 |
B |
6 |
4 |
2 |
30 |
16 |
4 |
2 |
C |
12 |
18 |
10 |
16 |
30 |
20 |