可以参考下面这个例子：按 Ann date 分组后，取每组中 Report date 最大的那条记录。
# Toy data: F is the factor value, Ann the announcement key, Report the report key.
D = pd.DataFrame(
    {
        'F': [1, 2, 3, 4],
        'Ann': ['a', 'a', 'b', 'b'],
        'Report': ['a', 'b', 'a', 'b'],
    }
)
# One group per distinct Ann value.
G = D.groupby('Ann')
# Within each group, order by Report descending and keep the top row,
# i.e. the record with the largest Report for that Ann.
G.apply(lambda grp: grp.sort_values('Report', ascending=False).iloc[0])
Out[29]:
     F  Ann  Report
Ann
a    2    a       b
b    4    b       b
import pandas as pd
# Same toy data as before, this time keyed by Ann through the index.
D = pd.DataFrame(
    {
        'F': [1, 2, 3, 4],
        'Ann': ['a', 'a', 'b', 'b'],
        'Report': ['a', 'b', 'a', 'b'],
    }
)
# Move Ann out of the columns and into the row index.
D = D.set_index('Ann')
D
Out[10]:
     F  Report
Ann
a    1       a
a    2       b
b    3       a
b    4       b
# Ann now lives in the index, so group on the index values themselves.
G = D.groupby(by=D.index)
# Per group: sort by Report descending and keep the first row (largest Report).
G.apply(lambda grp: grp.sort_values('Report', ascending=False).iloc[0])
Out[12]:
     F  Report
Ann
a    2       b
b    4       b
# Two single-row factor tables, each indexed by its ANN date string.
A = pd.DataFrame(
    {'ANN': ['2000-01-01'], 'Factor_A': [10], 'Report_A': ['1999-01-01']}
)
A = A.set_index('ANN')
B = pd.DataFrame(
    {'ANN': ['2001-01-01'], 'Factor_B': [10], 'Report_B': ['1993-01-01']}
)
B = B.set_index('ANN')
A
Out[20]:
            Factor_A    Report_A
ANN
2000-01-01        10  1999-01-01
# Display B, which is indexed by ANN.
B
Out[21]:
            Factor_B    Report_B
ANN
2001-01-01        10  1993-01-01
# Align A and B on their shared ANN index, keeping dates that appear in either
# table (missing cells become NaN), then order the rows by ANN.
# Empty suffixes mirror DataFrame.join, which refuses overlapping column names.
A.merge(
    B,
    how='outer',
    left_index=True,
    right_index=True,
    suffixes=('', ''),
).sort_index()
Out[23]:
            Factor_A    Report_A  Factor_B    Report_B
ANN
2000-01-01      10.0  1999-01-01       NaN         NaN
2001-01-01       NaN         NaN      10.0  1993-01-01