创建一个DataFrame的三种方法
1、用字典dict,字典值value是列表list
# Method 1: build a DataFrame from a dict whose values are lists.
# Each key becomes a column; every list must have the same length,
# otherwise the DataFrame constructor raises an error.
population = pd.DataFrame(
    {
        "city": ["beijing", "shanghai", "guangzhou", "shenzhen", "hangzhou", "chongqing"],
        "year": [2016, 2017, 2016, 2017, 2017, 2016],
        "population": [2100, 2300, 1000, 700, 500, 500],
    }
)
print(population)
city population year
0 beijing 2100 2016
1 shanghai 2300 2017
2 guangzhou 1000 2016
3 shenzhen 700 2017
4 hangzhou 500 2017
5 chongqing 500 2016
# Rebuild the frame with an explicit column order.
column_order = ["year", "city", "population"]
pdc = pd.DataFrame(population, columns=column_order)
print(pdc)
year city population
0 2016 beijing 2100
1 2017 shanghai 2300
2 2016 guangzhou 1000
3 2017 shenzhen 700
4 2017 hangzhou 500
5 2016 chongqing 500
# Same data as before, but this time both the column order and the row
# labels (index) are chosen explicitly.
row_labels = ['one', 'two', 'three', 'four', 'five', 'six']
temp = {
    "city": ["beijing", "shanghai", "guangzhou", "shenzhen", "hangzhou", "chongqing"],
    "year": [2016, 2017, 2016, 2017, 2017, 2016],
    "population": [2100, 2300, 1000, 700, 500, 500],
}
pdci = pd.DataFrame(
    temp,
    index=row_labels,
    columns=["year", "city", "population"],
)
print(pdci)
year city population
one 2016 beijing 2100
two 2017 shanghai 2300
three 2016 guangzhou 1000
four 2017 shenzhen 700
five 2017 hangzhou 500
six 2016 chongqing 500
2、用series构建DataFrame
import pandas as pd

# Method 2: build a DataFrame from Series objects.
cities = {'Beijing': 55000, 'Shanghai': 60000, 'shenzhen': 50000,
          'Hangzhou': 20000, 'Guangzhou': 45000, 'Suzhou': None}
apts = pd.Series(cities, name='income')
apts['shenzhen'] = 70000
# Boolean mask: entries below 50000 are raised to 40000.
less_than_50000 = (apts < 50000)
apts[less_than_50000] = 40000
apts2 = pd.Series({'Beijing': 10000, 'Shanghai': 8000, 'shenzhen': 6000,
                   'Tianjin': 40000, 'Guangzhou': 7000, 'Chongqing': 30000})
# Index-aligned addition: labels present in only one Series become NaN.
apts = apts + apts2
# Fill the missing values with the mean (not the median) of the remaining ones.
apts[apts.isnull()] = apts.mean()
# Combine two Series into one DataFrame: shared labels show both values,
# labels missing from one Series show NaN in that column.
df = pd.DataFrame({'apts': apts, 'apts2': apts2})
print(df)
apts apts2
Beijing 65000.0 10000.0
Chongqing 64000.0 30000.0
Guangzhou 47000.0 7000.0
Hangzhou 64000.0 NaN
Shanghai 68000.0 8000.0
Suzhou 64000.0 NaN
Tianjin 64000.0 40000.0
shenzhen 76000.0 6000.0
3、用一个字典构成的列表list of dicts来构建DataFrame
# Method 3: build a DataFrame from a list of dicts; each dict is one row and
# its keys become the columns.
# NOTE(review): the recorded output lists the columns alphabetically
# (curry, linus, lucy) -- presumably the pandas/Python version used sorted the
# dict keys; newer versions are expected to keep insertion order. Confirm.
salary_labels = ['salary1', 'salary2']
data = [
    {'lucy': 9999, 'linus': 8888, 'curry': 100000},
    {'lucy': 9998, 'linus': 8887, 'curry': 1000000},
]
pd2 = pd.DataFrame(data, index=salary_labels)
print(pd2)
curry linus lucy
salary1 100000 8888 9999
salary2 1000000 8887 9998
广播特性
import pandas as pd

# Broadcasting demo: start from the two-Series frame, then add a constant column.
cities = {'Beijing': 55000, 'Shanghai': 60000, 'shenzhen': 50000,
          'Hangzhou': 20000, 'Guangzhou': 45000, 'Suzhou': None}
apts = pd.Series(cities, name='income')
apts['shenzhen'] = 70000
# Boolean mask: entries below 50000 are raised to 40000.
less_than_50000 = (apts < 50000)
apts[less_than_50000] = 40000
apts2 = pd.Series({'Beijing': 10000, 'Shanghai': 8000, 'shenzhen': 6000,
                   'Tianjin': 40000, 'Guangzhou': 7000, 'Chongqing': 30000})
# Index-aligned addition: labels present in only one Series become NaN.
apts = apts + apts2
# Fill the missing values with the mean (not the median) of the remaining ones.
apts[apts.isnull()] = apts.mean()
# Combine two Series into one DataFrame: shared labels show both values,
# labels missing from one Series show NaN in that column.
df = pd.DataFrame({'apts': apts, 'apts2': apts2})
# Assigning a scalar creates the column and broadcasts the value to every row.
df['bonus'] = 2000
print(df)
apts apts2 bonus
Beijing 65000.0 10000.0 2000
Chongqing 64000.0 30000.0 2000
Guangzhou 47000.0 7000.0 2000
Hangzhou 64000.0 NaN 2000
Shanghai 68000.0 8000.0 2000
Suzhou 64000.0 NaN 2000
Tianjin 64000.0 40000.0 2000
shenzhen 76000.0 6000.0 2000
# Column arithmetic is element-wise; rows where apts2 is NaN yield NaN income.
weighted_apts = df['apts'] * 2
weighted_apts2 = df['apts2'] * 1.5
df['income'] = weighted_apts + weighted_apts2 + df['bonus']
print(df)
apts apts2 bonus income
Beijing 65000.0 10000.0 2000 147000.0
Chongqing 64000.0 30000.0 2000 175000.0
Guangzhou 47000.0 7000.0 2000 106500.0
Hangzhou 64000.0 NaN 2000 NaN
Shanghai 68000.0 8000.0 2000 150000.0
Suzhou 64000.0 NaN 2000 NaN
Tianjin 64000.0 40000.0 2000 190000.0
shenzhen 76000.0 6000.0 2000 163000.0
# Show the row labels of the frame.
row_index = df.index
print(row_index)
Index(['Beijing', 'Chongqing', 'Guangzhou', 'Hangzhou', 'Shanghai', 'Suzhou',
'Tianjin', 'shenzhen'],
dtype='object')
1、利用布尔表达式(boolean)定位
import pandas as pd

# Boolean-expression selection demo: rebuild the sample frame first.
cities = {'Beijing': 55000, 'Shanghai': 60000, 'shenzhen': 50000,
          'Hangzhou': 20000, 'Guangzhou': 45000, 'Suzhou': None}
apts = pd.Series(cities, name='income')
apts['shenzhen'] = 70000
# Boolean mask: entries below 50000 are raised to 40000.
less_than_50000 = (apts < 50000)
apts[less_than_50000] = 40000
apts2 = pd.Series({'Beijing': 10000, 'Shanghai': 8000, 'shenzhen': 6000,
                   'Tianjin': 40000, 'Guangzhou': 7000, 'Chongqing': 30000})
# Index-aligned addition: labels present in only one Series become NaN.
apts = apts + apts2
# Fill the missing values with the mean (not the median) of the remaining ones.
apts[apts.isnull()] = apts.mean()
# Combine two Series into one DataFrame: shared labels show both values,
# labels missing from one Series show NaN in that column.
df = pd.DataFrame({'apts': apts, 'apts2': apts2})
df['bonus'] = 2000
df['income'] = df['apts'] * 2 + df['apts2'] * 1.5 + df['bonus']
print(df)
# A comparison on a column yields a boolean Series (one True/False per row).
# Attribute access is an equivalent spelling: df.apts == 64000
print(df['apts'] == 64000)
apts apts2 bonus income
Beijing 65000.0 10000.0 2000 147000.0
Chongqing 64000.0 30000.0 2000 175000.0
Guangzhou 47000.0 7000.0 2000 106500.0
Hangzhou 64000.0 NaN 2000 NaN
Shanghai 68000.0 8000.0 2000 150000.0
Suzhou 64000.0 NaN 2000 NaN
Tianjin 64000.0 40000.0 2000 190000.0
shenzhen 76000.0 6000.0 2000 163000.0
Beijing False
Chongqing True
Guangzhou False
Hangzhou True
Shanghai False
Suzhou True
Tianjin True
shenzhen False
Name: apts, dtype: bool
# A boolean mask selects rows only: every column of each matching row is kept,
# which is why apts2, bonus and income appear alongside apts.
is_64000 = df['apts'] == 64000
print(df[is_64000])
apts apts2 bonus income
Chongqing 64000.0 30000.0 2000 175000.0
Hangzhou 64000.0 NaN 2000 NaN
Suzhou 64000.0 NaN 2000 NaN
Tianjin 64000.0 40000.0 2000 190000.0
# Chained indexing pitfall: df[df.apts==64000] produces a temporary copy, so the
# assignment below lands on that copy and the original df is left unchanged
# (pandas warns about assigning to a copy of a slice).
# To modify df itself, use a single indexer instead:
#     df.loc[df.apts==64000,'income']=200000
df[df.apts==64000]['income']=200000
2、利用loc、iloc函数定位(ix已被弃用,并在pandas 1.0中移除,请改用loc/iloc)
loc:通过“行标签”索引行数据
# .loc with a single row label returns that row as a Series.
hangzhou_row = df.loc['Hangzhou']
print(hangzhou_row)
apts 64000.0
apts2 NaN
bonus 2000.0
income NaN
Name: Hangzhou, dtype: float64
# .loc with a list of row labels returns those rows as a DataFrame.
wanted_rows = ['Hangzhou', 'Shanghai']
print(df.loc[wanted_rows])
apts apts2 bonus income
Hangzhou 64000.0 NaN 2000 NaN
Shanghai 68000.0 8000.0 2000 150000.0
# .loc accepts a boolean row mask together with a list of column labels;
# the columns come back in the order they are listed.
row_mask = df['apts'] == 64000
wanted_columns = ['apts2', 'apts', 'bonus']
print(df.loc[row_mask, wanted_columns])
apts2 apts bonus
Chongqing 30000.0 64000.0 2000
Hangzhou NaN 64000.0 2000
Suzhou NaN 64000.0 2000
Tianjin 40000.0 64000.0 2000
# .iloc slices by integer position: the first five rows.
first_five = df.iloc[:5]
print(first_five)
apts apts2 bonus income
Beijing 65000.0 10000.0 2000 147000.0
Chongqing 64000.0 30000.0 2000 175000.0
Guangzhou 47000.0 7000.0 2000 106500.0
Hangzhou 64000.0 NaN 2000 NaN
Shanghai 68000.0 8000.0 2000 150000.0
# Select by integer row and column positions (rows 1-3, columns 1-3).
# DataFrame.ix was removed in pandas 1.0; .iloc is the positional indexer.
print(df.iloc[1:4, 1:4])
apts2 bonus income
Chongqing 30000.0 2000 175000.0
Guangzhou 7000.0 2000 106500.0
Hangzhou NaN 2000 NaN
# Assignment through .loc: every row (":") of the 'income' column is set to 100000.
df.loc[:,'income']=100000
print(df)
apts apts2 bonus income
Beijing 65000.0 10000.0 2000 100000
Chongqing 64000.0 30000.0 2000 100000
Guangzhou 47000.0 7000.0 2000 100000
Hangzhou 64000.0 NaN 2000 100000
Shanghai 68000.0 8000.0 2000 100000
Suzhou 64000.0 NaN 2000 100000
Tianjin 64000.0 40000.0 2000 100000
shenzhen 76000.0 6000.0 2000 100000
# df.info() writes its summary itself and returns None, so the extra print
# shows "None" after the summary.
info_result = df.info()
print(info_result)
Index: 8 entries, Beijing to shenzhen
Data columns (total 4 columns):
apts 8 non-null float64
apts2 6 non-null float64
bonus 8 non-null int64
income 6 non-null float64
dtypes: float64(3), int64(1)
memory usage: 320.0+ bytes
None
# Per-column summary statistics (count, mean, std, min, quartiles, max).
summary = df.describe()
print(summary)
apts apts2 bonus income
count 8.000000 6.000000 8.0 6.000000
mean 64000.000000 16833.333333 2000.0 155250.000000
std 8017.837257 14483.323744 0.0 28739.780792
min 47000.000000 6000.000000 2000.0 106500.000000
25% 64000.000000 7250.000000 2000.0 147750.000000
50% 64000.000000 9000.000000 2000.0 156500.000000
75% 65750.000000 25000.000000 2000.0 172000.000000
max 76000.000000 40000.000000 2000.0 190000.000000
# First three rows of the frame.
top_rows = df.head(n=3)
print(top_rows)
apts apts2 bonus income
Beijing 65000.0 10000.0 2000 147000.0
Chongqing 64000.0 30000.0 2000 175000.0
Guangzhou 47000.0 7000.0 2000 106500.0
# Last three rows of the frame.
bottom_rows = df.tail(n=3)
print(bottom_rows)
apts apts2 bonus income
Suzhou 64000.0 NaN 2000 NaN
Tianjin 64000.0 40000.0 2000 190000.0
shenzhen 76000.0 6000.0 2000 163000.0
import pandas as pd

# Rebuild the sample frame, then demonstrate fillna returning a new object.
cities = {
    'Beijing': 55000,
    'Shanghai': 60000,
    'shenzhen': 50000,
    'Hangzhou': 20000,
    'Guangzhou': 45000,
    'Suzhou': None,
}
apts = pd.Series(cities, name='income')
apts['shenzhen'] = 70000
less_than_50000 = apts < 50000
apts.loc[less_than_50000] = 40000
apts2 = pd.Series({
    'Beijing': 10000,
    'Shanghai': 8000,
    'shenzhen': 6000,
    'Tianjin': 40000,
    'Guangzhou': 7000,
    'Chongqing': 30000,
})
# Index-aligned sum; labels found in only one Series come out as NaN.
apts = apts.add(apts2)
# Replace the NaN entries with the mean of the remaining values.
apts = apts.fillna(apts.mean())
df = pd.DataFrame({'apts': apts, 'apts2': apts2})
df['bonus'] = 2000
df['income'] = 2 * df['apts'] + 1.5 * df['apts2'] + df['bonus']
# fillna returns a new frame with NaN replaced by 0; df itself is not changed.
dff = df.fillna(0)
print(dff)
apts apts2 bonus income
Beijing 65000.0 10000.0 2000 147000.0
Chongqing 64000.0 30000.0 2000 175000.0
Guangzhou 47000.0 7000.0 2000 106500.0
Hangzhou 64000.0 0.0 2000 0.0
Shanghai 68000.0 8000.0 2000 150000.0
Suzhou 64000.0 0.0 2000 0.0
Tianjin 64000.0 40000.0 2000 190000.0
shenzhen 76000.0 6000.0 2000 163000.0
# With inplace=True, fillna modifies df itself and returns None,
# so dff ends up bound to None.
dff = df.fillna(value=0, inplace=True)
print(df)
print('\n\n')
print(dff)
apts apts2 bonus income
Beijing 65000.0 10000.0 2000 147000.0
Chongqing 64000.0 30000.0 2000 175000.0
Guangzhou 47000.0 7000.0 2000 106500.0
Hangzhou 64000.0 0.0 2000 0.0
Shanghai 68000.0 8000.0 2000 150000.0
Suzhou 64000.0 0.0 2000 0.0
Tianjin 64000.0 40000.0 2000 190000.0
shenzhen 76000.0 6000.0 2000 163000.0
None
ffill
# Forward fill: each NaN takes the last valid value above it in the same column.
# Returns a new frame; df is not changed.
# fillna(method='ffill') is deprecated since pandas 2.1; DataFrame.ffill() is
# the supported spelling.
dfff = df.ffill()
print(dfff)
apts apts2 bonus income
Beijing 65000.0 10000.0 2000 147000.0
Chongqing 64000.0 30000.0 2000 175000.0
Guangzhou 47000.0 7000.0 2000 106500.0
Hangzhou 64000.0 7000.0 2000 106500.0
Shanghai 68000.0 8000.0 2000 150000.0
Suzhou 64000.0 8000.0 2000 150000.0
Tianjin 64000.0 40000.0 2000 190000.0
shenzhen 76000.0 6000.0 2000 163000.0
bfill
# Backward fill: each NaN takes the next valid value below it in the same column.
# Returns a new frame; df is not changed.
# fillna(method='bfill') is deprecated since pandas 2.1; DataFrame.bfill() is
# the supported spelling.
dfba = df.bfill()
print(dfba)
apts apts2 bonus income
Beijing 65000.0 10000.0 2000 147000.0
Chongqing 64000.0 30000.0 2000 175000.0
Guangzhou 47000.0 7000.0 2000 106500.0
Hangzhou 64000.0 8000.0 2000 150000.0
Shanghai 68000.0 8000.0 2000 150000.0
Suzhou 64000.0 40000.0 2000 190000.0
Tianjin 64000.0 40000.0 2000 190000.0
shenzhen 76000.0 6000.0 2000 163000.0
层次化的index
import pandas as pd
import numpy as np

# Hierarchical index: passing two parallel label lists as the index produces a
# two-level MultiIndex (outer letter, inner number).
outer_labels = ['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd', 'd']
inner_labels = [1, 2, 3, 1, 2, 1, 2, 1, 2, 3]
random_values = np.random.randn(10)
data = pd.Series(random_values, index=[outer_labels, inner_labels])
print(data)
print(type(data))
a 1 -0.047278
2 -1.483728
3 -1.022545
b 1 1.031973
2 -0.198181
c 1 1.416993
2 1.656448
d 1 -0.464159
2 -0.880094
3 0.421264
dtype: float64
# Show the MultiIndex backing the Series.
multi_index = data.index
print(multi_index)
MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 0, 1, 0, 1, 2]])
# Label slice on the outer level; both end labels ('b' and 'c') are included.
outer_b_to_c = data['b':'c']
print(outer_b_to_c)
b 1 1.031973
2 -0.198181
c 1 1.416993
2 1.656448
dtype: float64
# Integer slice: the first two entries by position.
first_two = data[:2]
print(first_two)
a 1 -0.047278
2 -1.483728
dtype: float64
# Pivot the inner index level into columns; (outer, inner) combinations that
# do not exist in the Series are filled with NaN. The result is a DataFrame.
unstack = data.unstack(level=-1)
print(unstack)
print(type(unstack))
1 2 3
a -0.047278 -1.483728 -1.022545
b 1.031973 -0.198181 NaN
c 1.416993 1.656448 NaN
d -0.464159 -0.880094 0.421264