Python3 pandas库DataFrame基础用法

创建一个DataFrame的三种方法

1、用字典dict,字典值value是列表list

population = {"city":["beijing","shanghai","guangzhou","shenzhen","hangzhou","chongqing"],
              "year":[2016,2017,2016,2017,2017,2016],
              "population":[2100,2300,1000,700,500,500]}#字典中每个键对应的列表长度必须一致,否则会报错(ValueError)
population = pd.DataFrame(population)
print(population)
        city  population  year
0    beijing        2100  2016
1   shanghai        2300  2017
2  guangzhou        1000  2016
3   shenzhen         700  2017
4   hangzhou         500  2017
5  chongqing         500  2016
pdc = pd.DataFrame(population,columns=["year","city","population"])#通过columns参数指定列的顺序
print(pdc)
   year       city  population
0  2016    beijing        2100
1  2017   shanghai        2300
2  2016  guangzhou        1000
3  2017   shenzhen         700
4  2017   hangzhou         500
5  2016  chongqing         500
temp = {"city":["beijing","shanghai","guangzhou","shenzhen","hangzhou","chongqing"],
              "year":[2016,2017,2016,2017,2017,2016],
              "population":[2100,2300,1000,700,500,500]}
pdci = pd.DataFrame(temp,columns=["year","city","population"],index = ['one','two','three','four','five','six'])
#改变列的顺序和索引格式
print(pdci)
       year       city  population
one    2016    beijing        2100
two    2017   shanghai        2300
three  2016  guangzhou        1000
four   2017   shenzhen         700
five   2017   hangzhou         500
six    2016  chongqing         500

2、用Series构建DataFrame

from pandas import pandas as pd
cities={'Beijing':55000,'Shanghai':60000,'shenzhen':50000,'Hangzhou':20000,'Guangzhou':45000,'Suzhou':None}
apts=pd.Series(cities,name='income')
apts['shenzhen']=70000
less_than_50000=(apts<50000)
apts[less_than_50000]=40000

apts2=pd.Series({'Beijing':10000,'Shanghai':8000,'shenzhen':6000,'Tianjin':40000,'Guangzhou':7000,'Chongqing':30000})
#print(apts2)

apts=apts+apts2
apts[apts.isnull()]=apts.mean()#缺失值用均值填充
#print(apts)
df=pd.DataFrame({'apts':apts,'apts2':apts2})#两个series合并成一个df,共有的键显示值,非共有的显示NaN
              apts    apts2
Beijing    65000.0  10000.0
Chongqing  64000.0  30000.0
Guangzhou  47000.0   7000.0
Hangzhou   64000.0      NaN
Shanghai   68000.0   8000.0
Suzhou     64000.0      NaN
Tianjin    64000.0  40000.0
shenzhen   76000.0   6000.0

3、用由字典组成的列表(list of dicts)来构建DataFrame

data = [{'lucy':9999,'linus':8888,'curry':100000},{'lucy':9998,'linus':8887,'curry':1000000}]
pd2 = pd.DataFrame(data,index=['salary1','salary2'])#lucy排在最后,是因为旧版pandas(0.25之前)会把字典的键按字母顺序排序后作为列名(curry<linus<lucy);0.25及之后的版本则保持字典键的插入顺序
print(pd2)
           curry  linus  lucy
salary1   100000   8888  9999
salary2  1000000   8887  9998

广播特性

from pandas import pandas as pd
cities={'Beijing':55000,'Shanghai':60000,'shenzhen':50000,'Hangzhou':20000,'Guangzhou':45000,'Suzhou':None}
apts=pd.Series(cities,name='income')
apts['shenzhen']=70000
less_than_50000=(apts<50000)
apts[less_than_50000]=40000

apts2=pd.Series({'Beijing':10000,'Shanghai':8000,'shenzhen':6000,'Tianjin':40000,'Guangzhou':7000,'Chongqing':30000})
#print(apts2)

apts=apts+apts2
apts[apts.isnull()]=apts.mean()#缺失值用均值填充

#print(apts)

df=pd.DataFrame({'apts':apts,'apts2':apts2})#两个series合并成一个df,共有的键显示值,非共有的显示NaN
df['bonus']=2000#新增一列bonus,并赋值为2000
print(df)
              apts    apts2  bonus
Beijing    65000.0  10000.0   2000
Chongqing  64000.0  30000.0   2000
Guangzhou  47000.0   7000.0   2000
Hangzhou   64000.0      NaN   2000
Shanghai   68000.0   8000.0   2000
Suzhou     64000.0      NaN   2000
Tianjin    64000.0  40000.0   2000
shenzhen   76000.0   6000.0   2000
df['income']=df['apts']*2+df['apts2']*1.5+df['bonus']
print(df)
              apts    apts2  bonus    income
Beijing    65000.0  10000.0   2000  147000.0
Chongqing  64000.0  30000.0   2000  175000.0
Guangzhou  47000.0   7000.0   2000  106500.0
Hangzhou   64000.0      NaN   2000       NaN
Shanghai   68000.0   8000.0   2000  150000.0
Suzhou     64000.0      NaN   2000       NaN
Tianjin    64000.0  40000.0   2000  190000.0
shenzhen   76000.0   6000.0   2000  163000.0
print(df.index)
Index(['Beijing', 'Chongqing', 'Guangzhou', 'Hangzhou', 'Shanghai', 'Suzhou',
       'Tianjin', 'shenzhen'],
      dtype='object')

定位DataFrame里的元素

1、利用布尔(boolean)表达式定位

from pandas import pandas as pd
cities={'Beijing':55000,'Shanghai':60000,'shenzhen':50000,'Hangzhou':20000,'Guangzhou':45000,'Suzhou':None}
apts=pd.Series(cities,name='income')
apts['shenzhen']=70000
less_than_50000=(apts<50000)
apts[less_than_50000]=40000

apts2=pd.Series({'Beijing':10000,'Shanghai':8000,'shenzhen':6000,'Tianjin':40000,'Guangzhou':7000,'Chongqing':30000})
#print(apts2)


apts=apts+apts2
apts[apts.isnull()]=apts.mean()#缺失值用均值填充

#print(apts)

df=pd.DataFrame({'apts':apts,'apts2':apts2})#两个series合并成一个df,共有的键显示值,非共有的显示NaN
df['bonus']=2000
df['income']=df['apts']*2+df['apts2']*1.5+df['bonus']
print(df)
print(df['apts']==64000)#boolean条件,另一种实现方式 print(df.apts==64000)
              apts    apts2  bonus    income
Beijing    65000.0  10000.0   2000  147000.0
Chongqing  64000.0  30000.0   2000  175000.0
Guangzhou  47000.0   7000.0   2000  106500.0
Hangzhou   64000.0      NaN   2000       NaN
Shanghai   68000.0   8000.0   2000  150000.0
Suzhou     64000.0      NaN   2000       NaN
Tianjin    64000.0  40000.0   2000  190000.0
shenzhen   76000.0   6000.0   2000  163000.0
Beijing      False
Chongqing     True
Guangzhou    False
Hangzhou      True
Shanghai     False
Suzhou        True
Tianjin       True
shenzhen     False
Name: apts, dtype: bool
print(df[df['apts']==64000]) #布尔索引按行筛选:取出apts列等于64000的所有行;返回的是完整的行,所以apts2、bonus等其它列也会一并显示
              apts    apts2  bonus    income
Chongqing  64000.0  30000.0   2000  175000.0
Hangzhou   64000.0      NaN   2000       NaN
Suzhou     64000.0      NaN   2000       NaN
Tianjin    64000.0  40000.0   2000  190000.0
df[df.apts==64000]['income']=200000 #链式索引赋值:df[df.apts==64000]返回的是副本,赋值只作用于该副本,pandas会给出SettingWithCopyWarning(是警告而不是报错),原来的df不会被改变;应改用 df.loc[df.apts==64000,'income']=200000

2、利用loc、iloc、ix索引器定位

loc:通过“行标签”索引行数据

print(df.loc['Hangzhou'])
apts      64000.0
apts2         NaN
bonus      2000.0
income        NaN
Name: Hangzhou, dtype: float64
print(df.loc[['Hangzhou','Shanghai']])
             apts   apts2  bonus    income
Hangzhou  64000.0     NaN   2000       NaN
Shanghai  68000.0  8000.0   2000  150000.0
print(df.loc[df['apts']==64000,['apts2','apts','bonus']])
             apts2     apts  bonus
Chongqing  30000.0  64000.0   2000
Hangzhou       NaN  64000.0   2000
Suzhou         NaN  64000.0   2000
Tianjin    40000.0  64000.0   2000
iloc:通过“行号”索引行数据
print (df.iloc[0:5])
              apts    apts2  bonus    income
Beijing    65000.0  10000.0   2000  147000.0
Chongqing  64000.0  30000.0   2000  175000.0
Guangzhou  47000.0   7000.0   2000  106500.0
Hangzhou   64000.0      NaN   2000       NaN
Shanghai   68000.0   8000.0   2000  150000.0
ix:通过行标签或者行号索引行数据(基于loc和iloc 的混合;注意ix自pandas 0.20起已弃用,并在1.0中被移除,新版本请改用loc或iloc,例如下面的写法等价于df.iloc[1:4,1:4])
print (df.ix[1:4,1:4])#用行号和列号做数据选择
             apts2  bonus    income
Chongqing  30000.0   2000  175000.0
Guangzhou   7000.0   2000  106500.0
Hangzhou       NaN   2000       NaN
df.loc[:,'income']=100000#定位赋值
print(df)
              apts    apts2  bonus  income
Beijing    65000.0  10000.0   2000  100000
Chongqing  64000.0  30000.0   2000  100000
Guangzhou  47000.0   7000.0   2000  100000
Hangzhou   64000.0      NaN   2000  100000
Shanghai   68000.0   8000.0   2000  100000
Suzhou     64000.0      NaN   2000  100000
Tianjin    64000.0  40000.0   2000  100000
shenzhen   76000.0   6000.0   2000  100000

info()和describe(),head(),tail()

print(df.info())
<class 'pandas.core.frame.DataFrame'>
Index: 8 entries, Beijing to shenzhen
Data columns (total 4 columns):
apts      8 non-null float64
apts2     6 non-null float64
bonus     8 non-null int64
income    6 non-null float64
dtypes: float64(3), int64(1)
memory usage: 320.0+ bytes
None
print(df.describe())
               apts         apts2   bonus         income
count      8.000000      6.000000     8.0       6.000000
mean   64000.000000  16833.333333  2000.0  155250.000000
std     8017.837257  14483.323744     0.0   28739.780792
min    47000.000000   6000.000000  2000.0  106500.000000
25%    64000.000000   7250.000000  2000.0  147750.000000
50%    64000.000000   9000.000000  2000.0  156500.000000
75%    65750.000000  25000.000000  2000.0  172000.000000
max    76000.000000  40000.000000  2000.0  190000.000000
print(df.head(3))
              apts    apts2  bonus    income
Beijing    65000.0  10000.0   2000  147000.0
Chongqing  64000.0  30000.0   2000  175000.0
Guangzhou  47000.0   7000.0   2000  106500.0
print(df.tail(3))
             apts    apts2  bonus    income
Suzhou    64000.0      NaN   2000       NaN
Tianjin   64000.0  40000.0   2000  190000.0
shenzhen  76000.0   6000.0   2000  163000.0

缺失值填充fillna,ffill,bfill

import pandas as pd
cities={'Beijing':55000,'Shanghai':60000,'shenzhen':50000,'Hangzhou':20000,'Guangzhou':45000,'Suzhou':None}
apts=pd.Series(cities,name='income')
apts['shenzhen']=70000
less_than_50000=(apts<50000)
apts[less_than_50000]=40000
apts2=pd.Series({'Beijing':10000,'Shanghai':8000,'shenzhen':6000,'Tianjin':40000,'Guangzhou':7000,'Chongqing':30000})
apts=apts+apts2
apts[apts.isnull()]=apts.mean()
df=pd.DataFrame({'apts':apts,'apts2':apts2})
df['bonus']=2000 
df['income']=df['apts']*2+df['apts2']*1.5+df['bonus']
#print(df)

fillna
dff=df.fillna(value=0)#用0填充缺失值(NaN)
print(dff)#df本身不改变
              apts    apts2  bonus    income
Beijing    65000.0  10000.0   2000  147000.0
Chongqing  64000.0  30000.0   2000  175000.0
Guangzhou  47000.0   7000.0   2000  106500.0
Hangzhou   64000.0      0.0   2000       0.0
Shanghai   68000.0   8000.0   2000  150000.0
Suzhou     64000.0      0.0   2000       0.0
Tianjin    64000.0  40000.0   2000  190000.0
shenzhen   76000.0   6000.0   2000  163000.0
dff = df.fillna(value=0,inplace=True)
print(df);print('\n\n');print(dff)#inplace为True时直接修改df本身,fillna返回None,故dff为None
              apts    apts2  bonus    income
Beijing    65000.0  10000.0   2000  147000.0
Chongqing  64000.0  30000.0   2000  175000.0
Guangzhou  47000.0   7000.0   2000  106500.0
Hangzhou   64000.0      0.0   2000       0.0
Shanghai   68000.0   8000.0   2000  150000.0
Suzhou     64000.0      0.0   2000       0.0
Tianjin    64000.0  40000.0   2000  190000.0
shenzhen   76000.0   6000.0   2000  163000.0

None

ffill

dfff= df.fillna(method='ffill')#前向填充:用前一个非NaN值填充NaN,返回新对象,df本身不变(pandas 2.1起method参数已弃用,推荐写成df.ffill())
print(dfff)
              apts    apts2  bonus    income
Beijing    65000.0  10000.0   2000  147000.0
Chongqing  64000.0  30000.0   2000  175000.0
Guangzhou  47000.0   7000.0   2000  106500.0
Hangzhou   64000.0   7000.0   2000  106500.0
Shanghai   68000.0   8000.0   2000  150000.0
Suzhou     64000.0   8000.0   2000  150000.0
Tianjin    64000.0  40000.0   2000  190000.0
shenzhen   76000.0   6000.0   2000  163000.0

bfill

dfba=df.fillna(method='bfill')#后向填充:用后一个非NaN值填充NaN,返回新对象,df本身不变(pandas 2.1起method参数已弃用,推荐写成df.bfill())
print(dfba)
              apts    apts2  bonus    income
Beijing    65000.0  10000.0   2000  147000.0
Chongqing  64000.0  30000.0   2000  175000.0
Guangzhou  47000.0   7000.0   2000  106500.0
Hangzhou   64000.0   8000.0   2000  150000.0
Shanghai   68000.0   8000.0   2000  150000.0
Suzhou     64000.0  40000.0   2000  190000.0
Tianjin    64000.0  40000.0   2000  190000.0
shenzhen   76000.0   6000.0   2000  163000.0

层次化的index

import pandas as pd
import numpy as np
data = pd.Series(np.random.randn(10),index=[['a','a','a','b','b','c','c','d','d','d'],[1,2,3,1,2,1,2,1,2,3]])
print(data)
print(type(data))
a  1   -0.047278
   2   -1.483728
   3   -1.022545
b  1    1.031973
   2   -0.198181
c  1    1.416993
   2    1.656448
d  1   -0.464159
   2   -0.880094
   3    0.421264
dtype: float64
<class 'pandas.core.series.Series'>

print(data.index)
MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 0, 1, 0, 1, 2]])

print(data['b':'c'])
b  1    1.031973
   2   -0.198181
c  1    1.416993
   2    1.656448
dtype: float64

print(data[:2])
a  1   -0.047278
   2   -1.483728
dtype: float64
unstack:Series转化成DataFrame
unstack=data.unstack()   #将层级数据横向拉开,不够长的补NaN
print(unstack)
print(type(unstack))
          1         2         3
a -0.047278 -1.483728 -1.022545
b  1.031973 -0.198181       NaN
c  1.416993  1.656448       NaN
d -0.464159 -0.880094  0.421264
<class 'pandas.core.frame.DataFrame'>








你可能感兴趣的:(Python3 pandas库DataFrame基础用法)