import pandas as pd
import numpy as np
salaries=pd.DataFrame({
'name':['BOSS','Lilei','Lilei','Han','BOSS','BOSS','Han','BOSS'],
'Year':[2016,2016,2016,2016,2017,2017,2017,2017],
'Salary':[999999,20000,25000,3000,9999999,999999,3500,999999],
'Bonus':[100000,20000,20000,5000,200000,300000,3000,400000]
})
print(salaries.columns)
#Index(['Bonus', 'Salary', 'Year', 'name'],dtype='object')
print(salaries.info())
RangeIndex: 8 entries, 0 to 7
Data columns (total 4 columns):
Bonus 8 non-null int64
Salary 8 non-null int64
Year 8 non-null int64
name 8 non-null object
dtypes: int64(3), object(1)
memory usage: 336.0+ bytes
None
print(salaries.describe())
Bonus Salary Year
count 8.000000 8.000000e+00 8.000000
mean 131000.000000 1.631437e+06 2016.500000
std 152851.935826 3.416521e+06 0.534522
min 3000.000000 3.000000e+03 2016.000000
25% 16250.000000 1.587500e+04 2016.000000
50% 60000.000000 5.124995e+05 2016.500000
75% 225000.000000 9.999990e+05 2017.000000
max 400000.000000 9.999999e+06 2017.000000
salaries=salaries[['name','Year','Salary','Bonus']]
#字典无序,规定下columns的顺序
print(salaries)
name Year Salary Bonus
0 BOSS 2016 999999 100000
1 Lilei 2016 20000 20000
2 Lilei 2016 25000 20000
3 Han 2016 3000 5000
4 BOSS 2017 9999999 200000
5 BOSS 2017 999999 300000
6 Han 2017 3500 3000
7 BOSS 2017 999999 400000
group_by_name=salaries.groupby('name')
print(type(group_by_name))
print(group_by_name.groups) ###groups方法
print(len(group_by_name.groups))
{'Han': Int64Index([3, 6], dtype='int64'),
'BOSS': Int64Index([0, 4, 5, 7], dtype='int64'),
'Lilei': Int64Index([1, 2], dtype='int64')}
3
for name,group in group_by_name:
print(name)
print(group)
BOSS
name Year Salary Bonus
0 BOSS 2016 999999 100000
4 BOSS 2017 9999999 200000
5 BOSS 2017 999999 300000
7 BOSS 2017 999999 400000
Han
name Year Salary Bonus
3 Han 2016 3000 5000
6 Han 2017 3500 3000
Lilei
name Year Salary Bonus
1 Lilei 2016 20000 20000
2 Lilei 2016 25000 20000
print(group_by_name.get_group('Lilei'))
name Year Salary Bonus
1 Lilei 2016 20000 20000
2 Lilei 2016 25000 20000
print(group_by_name.get_group('BOSS'))
name Year Salary Bonus
0 BOSS 2016 999999 100000
4 BOSS 2017 9999999 200000
5 BOSS 2017 999999 300000
7 BOSS 2017 999999 400000
print(group_by_name.sum()) #相同name求和
Year Salary Bonus
name
BOSS 8067 12999996 1000000
Han 4033 6500 8000
Lilei 4032 45000 40000
print(group_by_name[['Salary','Bonus']].sum())
Salary Bonus
name
BOSS 12999996 1000000
Han 6500 8000
Lilei 45000 40000
print(group_by_name[['Salary','Bonus']].agg(sum))
#也可以用agg聚合函数
Salary Bonus
name
BOSS 12999996 1000000
Han 6500 8000
Lilei 45000 40000
print(group_by_name[['Bonus','Salary']].agg(['sum','mean','std']))
Bonus Salary
sum mean std sum mean std
name
BOSS 1000000 250000 129099.444874 12999996 3249999 4.500000e+06
Han 8000 4000 1414.213562 6500 3250 3.535534e+02
Lilei 40000 20000 0.000000 45000 22500 3.535534e+03
print(group_by_name[['Bonus','Salary']].agg([np.sum,np.mean,np.std]))
Bonus Salary
sum mean std sum mean std
name
BOSS 1000000 250000 129099.444874 12999996 3249999 4.500000e+06
Han 8000 4000 1414.213562 6500 3250 3.535534e+02
Lilei 40000 20000 0.000000 45000 22500 3.535534e+03
group_by_name_year=salaries.groupby(['name','Year'])
print(group_by_name_year.sum())
Salary Bonus
name Year
BOSS 2016 999999 100000
2017 11999997 900000
Han 2016 3000 5000
2017 3500 3000
Lilei 2016 45000 40000
print(group_by_name_year.size())
name Year
BOSS 2016 1
2017 3
Han 2016 1
2017 1
Lilei 2016 2
dtype: int64
print(group_by_name_year.mean())
Salary Bonus
name Year
BOSS 2016 999999 100000
2017 3999999 300000
Han 2016 3000 5000
2017 3500 3000
Lilei 2016 22500 20000
print(group_by_name_year.median())
Salary Bonus
name Year
BOSS 2016 999999 100000
2017 999999 300000
Han 2016 3000 5000
2017 3500 3000
Lilei 2016 22500 20000
可以参考http://blog.csdn.net/cymy001/article/details/78300775
nvda=pd.read_csv(r'D:\PYTHON35\idle\database\data\NVDA.csv',index_col=0,parse_dates=['Date'])
#parse_dates参数是把Data列解析为一个单独的data列
print(nvda.head())
Open High Low Close Adj Close Volume
Date
1999-01-22 1.750000 1.953125 1.552083 1.640625 1.523430 67867200
1999-01-25 1.770833 1.833333 1.640625 1.812500 1.683028 12762000
1999-01-26 1.833333 1.869792 1.645833 1.671875 1.552448 8580000
1999-01-27 1.677083 1.718750 1.583333 1.666667 1.547611 6109200
1999-01-28 1.666667 1.677083 1.651042 1.661458 1.542776 5688000
nvda['year']=nvda.index.year #加一个columns,'year'取自index列data里的year
print(nvda.head()) #对比nvda看
Open High Low Close Adj Close Volume year
Date
1999-01-22 1.750000 1.953125 1.552083 1.640625 1.523430 67867200 1999
1999-01-25 1.770833 1.833333 1.640625 1.812500 1.683028 12762000 1999
1999-01-26 1.833333 1.869792 1.645833 1.671875 1.552448 8580000 1999
1999-01-27 1.677083 1.718750 1.583333 1.666667 1.547611 6109200 1999
1999-01-28 1.666667 1.677083 1.651042 1.661458 1.542776 5688000 1999
nvda.drop(['year'],axis=1,inplace=True) #drop去掉year列,inplace原地改变
print(nvda.head())
Open High Low Close Adj Close Volume
Date
1999-01-22 1.750000 1.953125 1.552083 1.640625 1.523430 67867200
1999-01-25 1.770833 1.833333 1.640625 1.812500 1.683028 12762000
1999-01-26 1.833333 1.869792 1.645833 1.671875 1.552448 8580000
1999-01-27 1.677083 1.718750 1.583333 1.666667 1.547611 6109200
1999-01-28 1.666667 1.677083 1.651042 1.661458 1.542776 5688000
print(nvda.index.day) #取索引中的日期day
Int64Index([22, 25, 26, 27, 28, 29, 1, 2, 3, 4,
...
7, 10, 11, 12, 13, 14, 17, 18, 19, 20],
dtype='int64', name='Date', length=4654)
key=lambda x:x.year
nvda1=nvda.groupby(key).agg(['mean','std']).head() #按index里year项分类后,用均值方差聚合
print(nvda1)
Open High Low \
mean std mean std mean std
1999 1.950782 0.588882 2.007317 0.614302 1.883559 0.571658
2000 8.781084 2.999908 9.222697 3.114186 8.360522 2.904761
2001 13.091254 3.839777 13.600750 3.829838 12.680548 3.830944
2002 9.690344 6.561287 9.955093 6.664226 9.344391 6.375212
2003 5.902434 1.461862 6.042659 1.491260 5.764960 1.423422
Close Adj Close Volume
mean std mean std mean std
1999 1.947230 0.601041 1.808134 0.558107 6.433220e+06 8.142949e+06
2000 8.778826 3.013104 8.151729 2.797869 1.104182e+07 7.985374e+06
2001 13.181552 3.833637 12.239956 3.559789 2.782387e+07 1.384318e+07
2002 9.614749 6.519053 8.927940 6.053379 3.168655e+07 1.558742e+07
2003 5.900344 1.459852 5.478865 1.355570 2.430220e+07 1.899657e+07
my_transform=lambda x:(x-x.mean())/x.std()
tsf=nvda.groupby(key).transform(my_transform) ###
print(tsf.head())
Open High Low Close Adj Close Volume
Date
1999-01-22 -0.340955 -0.088217 -0.579850 -0.510124 -0.510124 7.544438
1999-01-25 -0.305578 -0.283222 -0.424964 -0.224161 -0.224161 0.777210
1999-01-26 -0.199444 -0.223871 -0.415854 -0.458130 -0.458131 0.263637
1999-01-27 -0.464778 -0.469747 -0.525185 -0.466795 -0.466798 -0.039791
1999-01-28 -0.482465 -0.537575 -0.406741 -0.475462 -0.475461 -0.091517
price_range=lambda x:x.max()-x.min()
tsf_price_range=nvda.groupby(key).transform(price_range).head()
print(tsf_price_range)
Open High Low Close Adj Close Volume
Date
1999-01-22 2.552084 2.53125 2.510417 2.546875 2.364945 67375200.0
1999-01-25 2.552084 2.53125 2.510417 2.546875 2.364945 67375200.0
1999-01-26 2.552084 2.53125 2.510417 2.546875 2.364945 67375200.0
1999-01-27 2.552084 2.53125 2.510417 2.546875 2.364945 67375200.0
1999-01-28 2.552084 2.53125 2.510417 2.546875 2.364945 67375200.0
app=nvda.groupby(key).apply(my_transform).head() ###apply
print(app)
Open High Low Close Adj Close Volume
Date
1999-01-22 -0.340955 -0.088217 -0.579850 -0.510124 -0.510124 7.544438
1999-01-25 -0.305578 -0.283222 -0.424964 -0.224161 -0.224161 0.777210
1999-01-26 -0.199444 -0.223871 -0.415854 -0.458130 -0.458131 0.263637
1999-01-27 -0.464778 -0.469747 -0.525185 -0.466795 -0.466798 -0.039791
1999-01-28 -0.482465 -0.537575 -0.406741 -0.475462 -0.475461 -0.091517
import pandas as pd
import numpy as np
salaries=pd.DataFrame({
'name':['BOSS','Lilei','Lilei','Han','BOSS','BOSS','Han','BOSS'],
'Year':[2016,2016,2016,2016,2017,2017,2017,2017],
'Salary':[999999,20000,25000,3000,9999999,999999,3500,999999],
'Bonus':[100000,20000,20000,5000,200000,300000,3000,400000]
})
salaries=salaries[['name','Year','Salary','Bonus']]
print(salaries)
name Year Salary Bonus
0 BOSS 2016 999999 100000
1 Lilei 2016 20000 20000
2 Lilei 2016 25000 20000
3 Han 2016 3000 5000
4 BOSS 2017 9999999 200000
5 BOSS 2017 999999 300000
6 Han 2017 3500 3000
7 BOSS 2017 999999 400000
def year_trans(x):
return x-1
salaries.loc[:,'trans_year']=salaries['Year'].apply(year_trans)
salaries['tmp']=salaries['Year']-salaries['Salary']
print(salaries)
name Year Salary Bonus trans_year tmp
0 BOSS 2016 999999 100000 2015 -997983
1 Lilei 2016 20000 20000 2015 -17984
2 Lilei 2016 25000 20000 2015 -22984
3 Han 2016 3000 5000 2015 -984
4 BOSS 2017 9999999 200000 2016 -9997982
5 BOSS 2017 999999 300000 2016 -997982
6 Han 2017 3500 3000 2016 -1483
7 BOSS 2017 999999 400000 2016 -997982
salaries['Salary_shift']=salaries['Salary'].shift(1) #向下平移1
print(salaries)
name Year Salary Bonus trans_year tmp Salary_shift
0 BOSS 2016 999999 100000 2015 -997983 NaN
1 Lilei 2016 20000 20000 2015 -17984 999999.0
2 Lilei 2016 25000 20000 2015 -22984 20000.0
3 Han 2016 3000 5000 2015 -984 25000.0
4 BOSS 2017 9999999 200000 2016 -9997982 3000.0
5 BOSS 2017 999999 300000 2016 -997982 9999999.0
6 Han 2017 3500 3000 2016 -1483 999999.0
7 BOSS 2017 999999 400000 2016 -997982 3500.0
salaries['diff']=salaries['Salary']-salaries['Salary_shift'] ###shift作用效果生效
print(salaries)
print(salaries)
name Year Salary Bonus trans_year tmp Salary_shift diff
0 BOSS 2016 999999 100000 2015 -997983 NaN NaN
1 Lilei 2016 20000 20000 2015 -17984 999999.0 -979999.0
2 Lilei 2016 25000 20000 2015 -22984 20000.0 5000.0
3 Han 2016 3000 5000 2015 -984 25000.0 -22000.0
4 BOSS 2017 9999999 200000 2016 -9997982 3000.0 9996999.0
5 BOSS 2017 999999 300000 2016 -997982 9999999.0 -9000000.0
6 Han 2017 3500 3000 2016 -1483 999999.0 -996499.0
7 BOSS 2017 999999 400000 2016 -997982 3500.0 996499.0
df1=pd.DataFrame({'apts':[55000,60000,80000],
'cars':[200000,300000,400000]})
df2 = pd.DataFrame({'apts':[35000, 30000, 50000],
'cars':[210000,310000,410000]})
df3 = pd.DataFrame({'apts':[34000, 32000, 70000],
'cars':[240000,350000,440000]})
print(df1)
apts cars
0 55000 200000
1 60000 300000
2 80000 400000
print(df2)
apts cars
0 35000 210000
1 30000 310000
2 50000 410000
print(df3)
apts cars
0 34000 240000
1 32000 350000
2 70000 440000
print(pd.concat([df1,df2,df3])) #纵向同columns拼接
apts cars
0 55000 200000
1 60000 300000
2 80000 400000
0 35000 210000
1 30000 310000
2 50000 410000
0 34000 240000
1 32000 350000
2 70000 440000
df1=pd.DataFrame({'apts':[55000,60000,80000],
'cars':[200000,300000,400000]})
DF=pd.DataFrame({'cars':[34000, 32000, 70000],
'apts':[240000,350000,440000],
'add':[11,22,33]})
DF=DF[['cars','add','apts']]
print(DF)
cars add apts
0 34000 11 240000
1 32000 22 350000
2 70000 33 440000
print(pd.concat([df1,DF])) #concat可以自动查找相同columns的名字索引进行列对齐
add apts cars
0 NaN 55000 200000
1 NaN 60000 300000
2 NaN 80000 400000
0 11.0 240000 34000
1 22.0 350000 32000
2 33.0 440000 70000
print(pd.concat([df1,df2,df3],axis=1)) #axis参数1为横向操作,0为纵向操作
apts cars apts cars apts cars
0 55000 200000 35000 210000 34000 240000
1 60000 300000 30000 310000 32000 350000
2 80000 400000 50000 410000 70000 440000
df1=pd.DataFrame({'apts':[55000,60000,80000],
'cars':[200000,300000,400000]})
df2=pd.DataFrame({'apts':[35000, 30000, 50000],
'cars':[210000,310000,410000]})
df3=pd.DataFrame({'apts':[34000, 32000, 70000],
'cars':[240000,350000,440000]})# print(df1.append(df2))
apts cars
0 55000 200000
1 60000 300000
2 80000 400000
0 35000 210000
1 30000 310000
2 50000 410000
print(df1.append([df2,df3]))
apts cars
0 55000 200000
1 60000 300000
2 80000 400000
0 35000 210000
1 30000 310000
2 50000 410000
0 34000 240000
1 32000 350000
2 70000 440000
df4=pd.DataFrame({'apts':[55000,60000,80000],
'cars':[200000,300000,400000]},
index=['Beijing','Shanghai','Nanjing'])
df5=pd.DataFrame({'apts':[55000, 70000, 80000],
'cars':[100000, 330000, 260000]},
index=['Beijing','Shanghai','Nanjing'])
print(df4)
apts cars
Beijing 55000 200000
Shanghai 60000 300000
Nanjing 80000 400000
print(df5)
apts cars
Beijing 55000 100000
Shanghai 70000 330000
Nanjing 80000 260000
print(pd.merge(df4,df5,on=['apts'],how='left'))
#merge将df4和df5按apts列合并,left表示以df4为基准,注意合并DataFrame里cars列名字变化
apts cars_x cars_y
0 55000 200000 100000.0
1 60000 300000 NaN
2 80000 400000 260000.0
print(pd.merge(df4,df5,on=['apts'],how='right'))
apts cars_x cars_y
0 55000 200000.0 100000
1 80000 400000.0 260000
2 70000 NaN 330000
print(pd.merge(df4,df5,on=['apts'],how='inner'))
apts cars_x cars_y
0 55000 200000 100000
1 80000 400000 260000
print(pd.merge(df4,df5,on=['apts'],how='outer'))
apts cars_x cars_y
0 55000 200000.0 100000.0
1 60000 300000.0 NaN
2 80000 400000.0 260000.0
3 70000 NaN 330000.0
print(pd.merge(df4,df5,on=['apts','cars'],how='outer'))
apts cars
0 55000 200000
1 60000 300000
2 80000 400000
3 55000 100000
4 70000 330000
5 80000 260000
pd.DataFrame.count,
pd.DataFrame.value_counts,
pd.DataFrame.size