一个DataFrame就是一张表格,Series可以理解成一维数据,DataFrame就是一个二维数据,DataFrame可以由多个Series组成(DataFrame可以理解成Series的一个集合)
population={'city':['Beijing','Shanghai','Guangzhou','Shenzhen','Hangzhou','Chongqing'],
'year':[2016,2017,2016,2017,2016,2016],
'population':[2100,2300,1000,700,500,500]
}
population=pd.DataFrame(population) ###
print(population)
city population year
0 Beijing 2100 2016
1 Shanghai 2300 2017
2 Guangzhou 1000 2016
3 Shenzhen 700 2017
4 Hangzhou 500 2016
5 Chongqing 500 2016
pdc=pd.DataFrame(population,columns=['year','city','population']) #columns参数改变列名
print(pdc)
year city population
0 2016 Beijing 2100
1 2017 Shanghai 2300
2 2016 Guangzhou 1000
3 2017 Shenzhen 700
4 2016 Hangzhou 500
5 2016 Chongqing 500
tmp={'city':['Beijing','Shanghai','Guangzhou','Shenzhen','Hangzhou','Chongqing'],
'year':[2016,2017,2016,2017,2016,2016],
'population':[2100,2300,1000,700,500,500]
}
pdci=pd.DataFrame(tmp,columns=['year','city','population'],
index=['one','two','three','four','five','six']) #改变行index索引和列名columns
print(pdci)
year city population
one 2016 Beijing 2100
two 2017 Shanghai 2300
three 2016 Guangzhou 1000
four 2017 Shenzhen 700
five 2016 Hangzhou 500
six 2016 Chongqing 500
cities={'Beijing':55000,'Shanghai':60000,'shenzhen':50000,'Hangzhou':20000,'Guangzhou':45000,'Suzhou':None}
apts=pd.Series(cities,name='income')
apts['shenzhen']=70000
less_than_50000=(apts<50000)
apts[less_than_50000]=40000
apts2=pd.Series({'Beijing':10000,'Shanghai':8000,'shenzhen':6000,'Tianjin':40000,'Guangzhou':7000,'Chongqing':30000})
# print(apts2)
apts=apts+apts2
apts[apts.isnull()]=apts.mean()
# print(apts)
df=pd.DataFrame({'apts':apts,'apts2':apts2}) ###
print(df)
apts apts2
Beijing 65000.0 10000.0
Chongqing 64000.0 30000.0
Guangzhou 47000.0 7000.0
Hangzhou 64000.0 NaN
Shanghai 68000.0 8000.0
Suzhou 64000.0 NaN
Tianjin 64000.0 40000.0
shenzhen 76000.0 6000.0
data=[{'JackMa':99999999999,'Han':5000,'David':10000},
{'JackMa':99999999998,'Han':4000,'David':11000}]
pdl=pd.DataFrame(data,index=['salary1','salary2'])
print(pdl)
David Han JackMa
salary1 10000 5000 99999999999
salary2 11000 4000 99999999998
cities={'Beijing':55000,'Shanghai':60000,'shenzhen':50000,'Hangzhou':20000,'Guangzhou':45000,'Suzhou':None}
apts=pd.Series(cities,name='income')
apts['shenzhen']=70000
less_than_50000=(apts<50000)
apts[less_than_50000]=40000
apts2=pd.Series({'Beijing':10000,'Shanghai':8000,'shenzhen':6000,'Tianjin':40000,'Guangzhou':7000,'Chongqing':30000})
apts=apts+apts2
apts[apts.isnull()]=apts.mean()
df=pd.DataFrame({'apts':apts,'apts2':apts2})
#print(df)
df['bonus']=2000 #增加一个新列bonus,并且都赋值2000
print(df)
apts apts2 bonus
Beijing 65000.0 10000.0 2000
Chongqing 64000.0 30000.0 2000
Guangzhou 47000.0 7000.0 2000
Hangzhou 64000.0 NaN 2000
Shanghai 68000.0 8000.0 2000
Suzhou 64000.0 NaN 2000
Tianjin 64000.0 40000.0 2000
shenzhen 76000.0 6000.0 2000
df['income']=df['apts']*2+df['apts2']*1.5+df['bonus']
print(df)
apts apts2 bonus income
Beijing 65000.0 10000.0 2000 147000.0
Chongqing 64000.0 30000.0 2000 175000.0
Guangzhou 47000.0 7000.0 2000 106500.0
Hangzhou 64000.0 NaN 2000 NaN
Shanghai 68000.0 8000.0 2000 150000.0
Suzhou 64000.0 NaN 2000 NaN
Tianjin 64000.0 40000.0 2000 190000.0
shenzhen 76000.0 6000.0 2000 163000.0
print(df.index)
Index(['Beijing', 'Chongqing', 'Guangzhou', 'Hangzhou', 'Shanghai', 'Suzhou',
'Tianjin', 'shenzhen'],
dtype='object')
import pandas as pd
cities={'Beijing':55000,'Shanghai':60000,'shenzhen':50000,'Hangzhou':20000,'Guangzhou':45000,'Suzhou':None}
apts=pd.Series(cities,name='income')
apts['shenzhen']=70000
less_than_50000=(apts<50000)
apts[less_than_50000]=40000
apts2=pd.Series({'Beijing':10000,'Shanghai':8000,'shenzhen':6000,'Tianjin':40000,'Guangzhou':7000,'Chongqing':30000})
apts=apts+apts2
apts[apts.isnull()]=apts.mean()
df=pd.DataFrame({'apts':apts,'apts2':apts2})
df['bonus']=2000 #增加一个新列bonus,并且都赋值2000
df['income']=df['apts']*2+df['apts2']*1.5+df['bonus']
#print(df)
# apts apts2 bonus income
#Beijing 65000.0 10000.0 2000 147000.0
#Chongqing 64000.0 30000.0 2000 175000.0
#Guangzhou 47000.0 7000.0 2000 106500.0
#Hangzhou 64000.0 NaN 2000 NaN
#Shanghai 68000.0 8000.0 2000 150000.0
#Suzhou 64000.0 NaN 2000 NaN
#Tianjin 64000.0 40000.0 2000 190000.0
#shenzhen 76000.0 6000.0 2000 163000.0
print(df.apts==64000)
print(df['apts']==64000) #boolean条件
Beijing False
Chongqing True
Guangzhou False
Hangzhou True
Shanghai False
Suzhou True
Tianjin True
shenzhen False
Name: apts, dtype: bool
print(df[df['apts']==64000]) #对行做选择,就是把apts列等于64000的行取出来
apts apts2 bonus income
Chongqing 64000.0 30000.0 2000 175000.0
Hangzhou 64000.0 NaN 2000 NaN
Suzhou 64000.0 NaN 2000 NaN
Tianjin 64000.0 40000.0 2000 190000.0
df[df.apts==64000]['income']=200000 #报错,在复制片段上赋值,原来的df没被改变
print(df.loc['Hangzhou']) #定位选某一行
apts 64000.0
apts2 NaN
bonus 2000.0
income NaN
Name: Hangzhou, dtype: float64
print(df.loc[['Hangzhou','Shanghai']])
apts apts2 bonus income
Hangzhou 64000.0 NaN 2000 NaN
Shanghai 68000.0 8000.0 2000 150000.0
print(df.loc[df['apts']==64000,['apts2','apts','bonus']])
#前面的部分是对行做选择,后面的部分是对列做选择
apts2 apts bonus
Chongqing 30000.0 64000.0 2000
Hangzhou NaN 64000.0 2000
Suzhou NaN 64000.0 2000
Tianjin 40000.0 64000.0 2000
print(df.iloc[0:5])
apts apts2 bonus income
Beijing 65000.0 10000.0 2000 147000.0
Chongqing 64000.0 30000.0 2000 175000.0
Guangzhou 47000.0 7000.0 2000 106500.0
Hangzhou 64000.0 NaN 2000 NaN
Shanghai 68000.0 8000.0 2000 150000.0
print(df.ix[1:4,1:3]) #用行号和列号做数据选择
apts2 bonus
Chongqing 30000.0 2000
Guangzhou 7000.0 2000
Hangzhou NaN 2000
df.loc[:,'income']=5000
print(df)
apts apts2 bonus income
Beijing 65000.0 10000.0 2000 5000
Chongqing 64000.0 30000.0 2000 5000
Guangzhou 47000.0 7000.0 2000 5000
Hangzhou 64000.0 NaN 2000 5000
Shanghai 68000.0 8000.0 2000 5000
Suzhou 64000.0 NaN 2000 5000
Tianjin 64000.0 40000.0 2000 5000
shenzhen 76000.0 6000.0 2000 5000
print(df.info())
Index: 8 entries, Beijing to shenzhen
Data columns (total 4 columns):
apts 8 non-null float64
apts2 6 non-null float64
bonus 8 non-null int64
income 8 non-null int64
dtypes: float64(2), int64(2)
memory usage: 320.0+ bytes
None
print(df.describe())
apts apts2 bonus income
count 8.000000 6.000000 8.0 8.0
mean 64000.000000 16833.333333 2000.0 5000.0
std 8017.837257 14483.323744 0.0 0.0
min 47000.000000 6000.000000 2000.0 5000.0
25% 64000.000000 7250.000000 2000.0 5000.0
50% 64000.000000 9000.000000 2000.0 5000.0
75% 65750.000000 25000.000000 2000.0 5000.0
max 76000.000000 40000.000000 2000.0 5000.0
print(df.head(2))
apts apts2 bonus income
Beijing 65000.0 10000.0 2000 5000
Chongqing 64000.0 30000.0 2000 5000
print(df.tail(2))
apts apts2 bonus income
Tianjin 64000.0 40000.0 2000 5000
shenzhen 76000.0 6000.0 2000 5000
#df2.loc[((df2['dow']==0)|(df2['dow']==2)|(df2['dow']==4)),:]
#df2.loc[ df2['dow'].isin([0,2,4]) , : ] #可以是一个列表,numpy array,Series
#~(df2['dow'].isin([0,2,4]))
import pandas as pd
cities={'Beijing':55000,'Shanghai':60000,'shenzhen':50000,'Hangzhou':20000,'Guangzhou':45000,'Suzhou':None}
apts=pd.Series(cities,name='income')
apts['shenzhen']=70000
less_than_50000=(apts<50000)
apts[less_than_50000]=40000
apts2=pd.Series({'Beijing':10000,'Shanghai':8000,'shenzhen':6000,'Tianjin':40000,'Guangzhou':7000,'Chongqing':30000})
apts=apts+apts2
apts[apts.isnull()]=apts.mean()
df=pd.DataFrame({'apts':apts,'apts2':apts2})
df['bonus']=2000
df['income']=df['apts']*2+df['apts2']*1.5+df['bonus']
#print(df)
# apts apts2 bonus income
#Beijing 65000.0 10000.0 2000 147000.0
#Chongqing 64000.0 30000.0 2000 175000.0
#Guangzhou 47000.0 7000.0 2000 106500.0
#Hangzhou 64000.0 NaN 2000 NaN
#Shanghai 68000.0 8000.0 2000 150000.0
#Suzhou 64000.0 NaN 2000 NaN
#Tianjin 64000.0 40000.0 2000 190000.0
#shenzhen 76000.0 6000.0 2000 163000.0
dff=df.fillna(value=0) #df没变
print(dff)
apts apts2 bonus income
Beijing 65000.0 10000.0 2000 147000.0
Chongqing 64000.0 30000.0 2000 175000.0
Guangzhou 47000.0 7000.0 2000 106500.0
Hangzhou 64000.0 0.0 2000 0.0
Shanghai 68000.0 8000.0 2000 150000.0
Suzhou 64000.0 0.0 2000 0.0
Tianjin 64000.0 40000.0 2000 190000.0
shenzhen 76000.0 6000.0 2000 163000.0
dff=df.fillna(value=0, inplace=True)
print(df);print(dff) #inplace参数True,df改变,没有新的dff拷贝
apts apts2 bonus income
Beijing 65000.0 10000.0 2000 147000.0
Chongqing 64000.0 30000.0 2000 175000.0
Guangzhou 47000.0 7000.0 2000 106500.0
Hangzhou 64000.0 0.0 2000 0.0
Shanghai 68000.0 8000.0 2000 150000.0
Suzhou 64000.0 0.0 2000 0.0
Tianjin 64000.0 40000.0 2000 190000.0
shenzhen 76000.0 6000.0 2000 163000.0
None
dffr=df.fillna(method='ffill') #新生成的补NaN前向拷贝,df没变
print(dffr)
apts apts2 bonus income
Beijing 65000.0 10000.0 2000 147000.0
Chongqing 64000.0 30000.0 2000 175000.0
Guangzhou 47000.0 7000.0 2000 106500.0
Hangzhou 64000.0 7000.0 2000 106500.0
Shanghai 68000.0 8000.0 2000 150000.0
Suzhou 64000.0 8000.0 2000 150000.0
Tianjin 64000.0 40000.0 2000 190000.0
shenzhen 76000.0 6000.0 2000 163000.0
dfba=df.fillna(method='bfill') #新生成的补NaN后向拷贝,df没变
print(dfba)
apts apts2 bonus income
Beijing 65000.0 10000.0 2000 147000.0
Chongqing 64000.0 30000.0 2000 175000.0
Guangzhou 47000.0 7000.0 2000 106500.0
Hangzhou 64000.0 8000.0 2000 150000.0
Shanghai 68000.0 8000.0 2000 150000.0
Suzhou 64000.0 40000.0 2000 190000.0
Tianjin 64000.0 40000.0 2000 190000.0
shenzhen 76000.0 6000.0 2000 163000.0
import pandas as pd
import numpy as np
data=pd.Series(np.random.randn(10),index=[['a','a','a','b','b','c','c','d','d','d'],
[1,2,3,1,2,1,2,1,2,3]])
print(data)
print(type(data))
a 1 0.346467
2 -0.043077
3 0.043878
b 1 0.107763
2 -0.175726
c 1 -1.833683
2 0.033884
d 1 -1.807021
2 0.819740
3 0.294679
dtype: float64
print(data.index)
MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 0, 1, 0, 1, 2]])
print(data['b':'c'])
b 1 0.353241
2 0.379744
c 1 -0.860706
2 -0.795483
dtype: float64
print(data[:2])
a 1 0.763116
2 0.058009
dtype: float64
unstack=data.unstack() #将层级数据横向拉开,不够长的补NaN
print(unstack)
print(type(unstack))
1 2 3
a -0.637935 -0.104897 -1.536381
b 2.448302 1.679833 NaN
c -0.845155 0.829459 NaN
d 0.597535 -0.464255 -0.898994
#对比data的类型
import pandas as pd
cities={'Beijing':55000,'Shanghai':60000,'shenzhen':50000,'Hangzhou':20000,'Guangzhou':45000,'Suzhou':None}
apts=pd.Series(cities,name='income')
apts['shenzhen']=70000
less_than_50000=(apts<50000)
apts[less_than_50000]=40000
apts2=pd.Series({'Beijing':10000,'Shanghai':8000,'shenzhen':6000,'Tianjin':40000,'Guangzhou':7000,'Chongqing':30000})
apts=apts+apts2
apts[apts.isnull()]=apts.mean()
df=pd.DataFrame({'apts':apts,'apts2':apts2})
df['bonus']=2000
df['income']=df['apts']*2+df['apts2']*1.5+df['bonus']
#print(df)
df.to_csv('df.csv')
df.to_csv('df2.csv',index=False) #去掉第一列,行索引列
import os
df2_site = r"D:\PYTHON35\idle\df2.csv"
pwd = os.getcwd() #获取当前工作目录
os.chdir(os.path.dirname(df2_site))
tmp_df = pd.read_csv(os.path.basename(df2_site)) ###
print(tmp_df)
apts apts2 bonus income
0 65000.0 10000.0 2000 147000.0
1 64000.0 30000.0 2000 175000.0
2 47000.0 7000.0 2000 106500.0
3 64000.0 NaN 2000 NaN
4 68000.0 8000.0 2000 150000.0
5 64000.0 NaN 2000 NaN
6 64000.0 40000.0 2000 190000.0
7 76000.0 6000.0 2000 163000.0
tmp_df_index = pd.Index(['Beijing','Shanghai',"Suzhou",'Hangzhou','Tianjin','Chongqing','Nanjing','Shenzhen'])
tmp_df.index=tmp_df_index #修改索引
print(tmp_df)
apts apts2 bonus income
Beijing 65000.0 10000.0 2000 147000.0
Shanghai 64000.0 30000.0 2000 175000.0
Suzhou 47000.0 7000.0 2000 106500.0
Hangzhou 64000.0 NaN 2000 NaN
Tianjin 68000.0 8000.0 2000 150000.0
Chongqing 64000.0 NaN 2000 NaN
Nanjing 64000.0 40000.0 2000 190000.0
Shenzhen 76000.0 6000.0 2000 163000.0
df.to_csv('df3.csv',sep='\t')