NumPy 和 pandas 是 Python 数据处理中不可或缺的两个库。学完 NumPy 之后,大家可能会觉得 pandas 与它大同小异。我个人的理解是:两者都是对数组和矩阵进行操作,但 pandas 中的表(DataFrame)带有行索引(index)和列名(columns),可以自己定义类似主键的标签;这一点类似于大家学过的 SQL 语言,pandas 同样包含全外连接、左外连接等较为复杂的合并操作。本文把我学习 pandas 库的历程分享出来,供大家学习参考,欢迎指正。
目录
1.pandas属性
2. pandas选择数据
3.pandas设置值
4.丢失数据处理
5.导入导出文件
6.concat
7.merge
import pandas as pd
import numpy as np
# Build a 3x4 integer frame. `index` supplies the three row labels and
# `columns` supplies the four column labels.
grid = np.arange(0, 12).reshape(3, 4)
df = pd.DataFrame(grid, index=['a', 'b', 'c'], columns=['d', 'e', 'f', 'g'])
print(df)
print('----------------')
transposed = df.T  # rows and columns swapped
print(transposed)
print('-------------------------------------------')
print(df.index)    # row labels
print(df.columns)  # column labels
print('-------------------------------------------')
summary = df.describe()  # summary statistics of the numeric columns
print(summary)
# df.values exposes the table's data without the labels.
print('----------------')
# With no labels given, rows and columns are numbered 0, 1, 2, ...
df1 = pd.DataFrame(np.arange(0, 12).reshape(3, 4))
print(df1)
print('----------------')
# Build a frame from a dict: every key becomes a column name, the scalar 1 is
# repeated down column 'a', and the row index takes the default numbering.
df2 = pd.DataFrame({
    'a': 1,
    'b': np.random.random(3),
    'c': np.array(['tree', 'forest', 'car']),
})
print(df2.dtypes)  # dtype of each column
print('----------------')
print(df2)
print('----------------')
# axis=1 orders the column labels; ascending=False yields c, b, a.
columns_descending = df2.sort_index(axis=1, ascending=False)
print(columns_descending)
# df2.sort_index(axis=0, ascending=True) would order the row labels instead.
print('----------------')
# Order the rows by the values in column 'b'. The result is not stored, so it
# is only visible as the cell output of an interactive session.
df2.sort_values(by='b')
输出: d e f g a 0 1 2 3 b 4 5 6 7 c 8 9 10 11 ---------------- a b c d 0 4 8 e 1 5 9 f 2 6 10 g 3 7 11 ------------------------------------------- Index(['a', 'b', 'c'], dtype='object') Index(['d', 'e', 'f', 'g'], dtype='object') ------------------------------------------- d e f g count 3.0 3.0 3.0 3.0 mean 4.0 5.0 6.0 7.0 std 4.0 4.0 4.0 4.0 min 0.0 1.0 2.0 3.0 25% 2.0 3.0 4.0 5.0 50% 4.0 5.0 6.0 7.0 75% 6.0 7.0 8.0 9.0 max 8.0 9.0 10.0 11.0 ---------------- 0 1 2 3 0 0 1 2 3 1 4 5 6 7 2 8 9 10 11 ---------------- a int64 b float64 c object dtype: object ---------------- a b c 0 1 0.075662 tree 1 1 0.160645 forest 2 1 0.593432 car ---------------- c b a 0 tree 0.075662 1 1 forest 0.160645 1 2 car 0.593432 1 ----------------
Out[53]:
a | b | c | |
---|---|---|---|
0 | 1 | 0.075662 | tree |
1 | 1 | 0.160645 | forest |
2 | 1 | 0.593432 | car |
import pandas as pd
import numpy as np
# A 3x4 frame with row labels a..c and column labels d..g.
df = pd.DataFrame(
    np.arange(0, 12).reshape(3, 4),
    index=['a', 'b', 'c'],
    columns=['d', 'e', 'f', 'g'],
)
print(df)

# Each entry demonstrates one way of selecting data from the frame; they are
# printed in order, each preceded by a separator line.
selections = [
    # Attribute access works for column names only; df.a fails because 'a'
    # is a row label.
    df.d,
    # A plain slice selects rows.
    df[0:2],
    # .loc selects by label: every row, column 'd'.
    df.loc[:, 'd'],
    # .loc with a list of labels: every row, columns 'd' and 'f'.
    df.loc[:, ['d', 'f']],
    # A single row label combined with a list of column labels.
    df.loc['a', ['d', 'f']],
    # .iloc selects by integer position: row 1, column 2.
    df.iloc[1:2, 2:3],
    # .iloc with a row slice and an explicit list of column positions.
    df.iloc[1:2, [1, 3]],
    # Boolean mask: keep the rows whose 'd' value is greater than 4.
    df[df.d > 4],
]
for picked in selections:
    print('----------------')
    print(picked)
输出:
d e f g a 0 1 2 3 b 4 5 6 7 c 8 9 10 11 ---------------- a 0 b 4 c 8 Name: d, dtype: int32 ---------------- d e f g a 0 1 2 3 b 4 5 6 7 ---------------- a 0 b 4 c 8 Name: d, dtype: int32 ---------------- d f a 0 2 b 4 6 c 8 10 ---------------- d 0 f 2 Name: a, dtype: int32 ---------------- f b 6 ---------------- e g b 5 7 ---------------- d e f g c 8 9 10 11
import pandas as pd
import numpy as np
# Demonstrates three ways of writing values into a DataFrame.
df = pd.DataFrame(np.arange(0, 12).reshape(3, 4), index=['a', 'b', 'c'], columns=['d', 'e', 'f', 'g'])
print(df)
print('----------------')
# Overwrite a single cell addressed by row label and column label.
df.loc['a', 'e'] = 0
print(df)
print('----------------')
# Set column 'e' to 1 on every row where column 'd' is greater than 3.
# One .loc call is used instead of chained indexing (df.e[mask] = 1): the
# chained form assigns through an intermediate object, triggers pandas'
# chained-assignment warnings, and is not guaranteed to update df itself.
df.loc[df.d > 3, 'e'] = 1
print(df)
print('----------------')
# Add a new column 'h'; the array supplies one value per row.
df['h'] = np.array([1, 2, 3])
print(df)
输出:
d e f g a 0 1 2 3 b 4 5 6 7 c 8 9 10 11 ---------------- d e f g a 0 0 2 3 b 4 5 6 7 c 8 9 10 11 ---------------- d e f g a 0 0 2 3 b 4 1 6 7 c 8 1 10 11 ---------------- d e f g h a 0 0 2 3 1 b 4 1 6 7 2 c 8 1 10 11 3
import pandas as pd
import numpy as np
# Demonstrates detecting, dropping and filling missing values.
df = pd.DataFrame(np.arange(0, 12).reshape(3, 4), index=['a', 'b', 'c'], columns=['d', 'e', 'f', 'g'])
# NaN is a float value, so convert the two columns that will receive it to
# float explicitly instead of relying on pandas silently upcasting an integer
# column during assignment (newer pandas versions deprecate that upcast).
df = df.astype({'e': float, 'f': float})
df.iloc[1, 1] = np.nan  # row 'b', column 'e'
df.iloc[2, 2] = np.nan  # row 'c', column 'f'
print(df)
print('----------------')
# axis=1 drops columns (axis=0 would drop rows).
# how='any' drops a column as soon as it contains one NaN;
# how='all' would drop it only when every value in it is NaN.
print(df.dropna(axis=1, how='any'))
print('----------------')
# Replace every missing value with 8.
print(df.fillna(value=8))
print('----------------')
# True when at least one cell of the frame is missing.
print(np.any(df.isnull()))
输出:
d e f g
a 0 1.0 2.0 3
b 4 NaN 6.0 7
c 8 9.0 NaN 11
----------------
d g
a 0 3
b 4 7
c 8 11
----------------
d e f g
a 0 1.0 2.0 3
b 4 8.0 6.0 7
c 8 9.0 8.0 11
----------------
True
import pandas as pd
# Load a CSV file into a DataFrame. The r'' raw-string prefix keeps the
# backslashes of the Windows path from being read as escape sequences.
data = pd.read_csv(r'C:\Users\28632\Desktop\math-modleing\123.csv')
# Save the table back to disk. index=False leaves the row index out of the
# file; writing it would add an extra unnamed column that shows up as
# "Unnamed: 0" the next time the file is read.
data.to_csv(r'C:\Users\28632\Desktop\math-modleing\123.csv', index=False)
# Bare expression: displays the frame as the cell output in a notebook.
data
输出:
Unnamed: 0 | x | y | angle | distance | R | |
---|---|---|---|---|---|---|
0 | 0 | 1595 | 1 | 37 | 2163 | NaN |
1 | 1 | 152 | 940 | 301 | 1211 | NaN |
2 | 2 | 444 | 522 | 88 | 1875 | NaN |
3 | 3 | 344 | 1920 | 322 | 358 | NaN |
4 | 4 | 387 | 1453 | 308 | 1011 | NaN |
... | ... | ... | ... | ... | ... | ... |
953 | 953 | 2435 | 1221 | 48 | 1806 | NaN |
954 | 954 | 1580 | 531 | 324 | 458 | NaN |
955 | 955 | 1873 | 581 | 309 | 665 | NaN |
956 | 956 | 2252 | 2118 | 83 | 1680 | NaN |
957 | 957 | 2322 | 566 | 324 | 455 | NaN |
958 rows × 6 columns
import pandas as pd
import numpy as np
# Two 3x3 frames sharing the same row and column labels: one of zeros, one of
# ones.
row_labels = ['a', 'b', 'c']
col_labels = ['d', 'e', 'f']
df1 = pd.DataFrame(np.zeros((3, 3)), index=row_labels, columns=col_labels)
df2 = pd.DataFrame(np.ones((3, 3)), index=row_labels, columns=col_labels)
for frame in (df1, df2):
    print(frame)
# axis=0 stacks df2 underneath df1. ignore_index=True discards the original
# row labels and renumbers the rows of the result from 0.
stacked_inputs = [df1, df2]
result = pd.concat(stacked_inputs, axis=0, ignore_index=True)
print(result)
输出:
d e f
a 0.0 0.0 0.0
b 0.0 0.0 0.0
c 0.0 0.0 0.0
d e f
a 1.0 1.0 1.0
b 1.0 1.0 1.0
c 1.0 1.0 1.0
d e f
0 0.0 0.0 0.0
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 1.0 1.0 1.0
4 1.0 1.0 1.0
5 1.0 1.0 1.0
import pandas as pd
import numpy as np
# Two frames whose row labels and column labels only partly overlap.
df1 = pd.DataFrame(
    np.zeros((3, 3)),
    index=['a', 'b', 'c'],
    columns=['d', 'e', 'f'],
)
df2 = pd.DataFrame(
    np.ones((3, 3)),
    index=['b', 'c', 'd'],
    columns=['e', 'f', 'g'],
)
print(df1)
print(df2)

both = [df1, df2]
# Default join='outer': the result has the union of the columns, and cells a
# frame does not provide are filled with NaN.
result = pd.concat(both, ignore_index=True)
print(result)
# join='inner': only the columns present in both frames are kept, so no NaN
# appears.
res = pd.concat(both, ignore_index=True, join='inner')
print(res)
# Side-by-side concatenation (axis=1) restricted to df1's row labels: df2 is
# reindexed first, so row 'a' (absent from df2) becomes NaN and df2's row 'd'
# is left out.
df2_on_df1_rows = df2.reindex(df1.index)
result1 = pd.concat([df1, df2_on_df1_rows], axis=1)
print(result1)
输出:
d e f a 0.0 0.0 0.0 b 0.0 0.0 0.0 c 0.0 0.0 0.0 e f g b 1.0 1.0 1.0 c 1.0 1.0 1.0 d 1.0 1.0 1.0 d e f g 0 0.0 0.0 0.0 NaN 1 0.0 0.0 0.0 NaN 2 0.0 0.0 0.0 NaN 3 NaN 1.0 1.0 1.0 4 NaN 1.0 1.0 1.0 5 NaN 1.0 1.0 1.0 e f 0 0.0 0.0 1 0.0 0.0 2 0.0 0.0 3 1.0 1.0 4 1.0 1.0 5 1.0 1.0 d e f e f g a 0.0 0.0 0.0 NaN NaN NaN b 0.0 0.0 0.0 1.0 1.0 1.0 c 0.0 0.0 0.0 1.0 1.0 1.0
import pandas as pd
# Two tables that share the column 'key' and both have a column named 'b'.
left_columns = {
    'key': ['k0', 'k1', 'k2', 'k3'],
    'a': ['a0', 'a1', 'a2', 'a3'],
    'b': ['b0', 'b1', 'b2', 'b3'],
}
right_columns = {
    'key': ['k0', 'k1', 'k2', 'k3'],
    'c': ['c0', 'c1', 'c2', 'c3'],
    'b': ['d0', 'd1', 'd2', 'd3'],
}
left = pd.DataFrame(left_columns)
right = pd.DataFrame(right_columns)
print(left)
print(right)
# Join the rows whose 'key' values match. The column 'b' exists on both sides,
# so it comes out as b_x (from left) and b_y (from right).
res = pd.merge(left, right, on='key')
print(res)
输出:
key a b 0 k0 a0 b0 1 k1 a1 b1 2 k2 a2 b2 3 k3 a3 b3 key c b 0 k0 c0 d0 1 k1 c1 d1 2 k2 c2 d2 3 k3 c3 d3 key a b_x c b_y 0 k0 a0 b0 c0 d0 1 k1 a1 b1 c1 d1 2 k2 a2 b2 c2 d2 3 k3 a3 b3 c3 d3
import pandas as pd
# Two tables to be joined on the pair of columns (key1, key2).
left_columns = {
    'key1': ['k0', 'k0', 'k1', 'k2'],
    'key2': ['k0', 'k1', 'k0', 'k1'],
    'a': ['a0', 'a1', 'a2', 'a3'],
    'b': ['b0', 'b1', 'b2', 'b3'],
}
right_columns = {
    'key1': ['k0', 'k1', 'k1', 'k2'],
    'key2': ['k0', 'k0', 'k0', 'k0'],
    'c': ['c0', 'c1', 'c2', 'c3'],
    'b': ['d0', 'd1', 'd2', 'd3'],
}
left = pd.DataFrame(left_columns)
right = pd.DataFrame(right_columns)
print(left)
print(right)

join_keys = ['key1', 'key2']
# Default inner join: a row is produced only when both key columns match.
res = pd.merge(left, right, on=join_keys)
print(res)
# Outer join keeps the unmatched rows from both sides. `indicator` adds a
# column saying where each row came from (both / left_only / right_only).
# A string value is used as that column's name, so here it is called "True";
# the boolean True would name it "_merge" instead.
res1 = pd.merge(left, right, on=join_keys, how='outer', indicator='True')
print(res1)
输出:
key1 key2 a b 0 k0 k0 a0 b0 1 k0 k1 a1 b1 2 k1 k0 a2 b2 3 k2 k1 a3 b3 key1 key2 c b 0 k0 k0 c0 d0 1 k1 k0 c1 d1 2 k1 k0 c2 d2 3 k2 k0 c3 d3 key1 key2 a b_x c b_y 0 k0 k0 a0 b0 c0 d0 1 k1 k0 a2 b2 c1 d1 2 k1 k0 a2 b2 c2 d2 key1 key2 a b_x c b_y True 0 k0 k0 a0 b0 c0 d0 both 1 k0 k1 a1 b1 NaN NaN left_only 2 k1 k0 a2 b2 c1 d1 both 3 k1 k0 a2 b2 c2 d2 both 4 k2 k1 a3 b3 NaN NaN left_only 5 k2 k0 NaN NaN c3 d3 right_only
import pandas as pd
# Two tables whose row labels overlap only on 'c'.
left = pd.DataFrame(
    {'a': ['a0', 'a1', 'a2'], 'b': ['b0', 'b1', 'b2']},
    index=['a', 'b', 'c'],
)
right = pd.DataFrame(
    {'c': ['c0', 'c1', 'c2'], 'b': ['d0', 'd1', 'd2']},
    index=['c', 'd', 'e'],
)
print(left)
print(right)
# Join on the row labels of both tables rather than on a column. The outer
# join keeps every label from either side and fills the missing half of a row
# with NaN.
res = pd.merge(left, right, how='outer', left_index=True, right_index=True)
print(res)
输出:
a b a a0 b0 b a1 b1 c a2 b2 c b c c0 d0 d c1 d1 e c2 d2 a b_x c b_y a a0 b0 NaN NaN b a1 b1 NaN NaN c a2 b2 c0 d0 d NaN NaN c1 d1 e NaN NaN c2 d2
import pandas as pd
# Two tables joined on column 'k'; both also carry a column named 'b'.
left = pd.DataFrame(
    {'k': ['a0', 'a1', 'a2'], 'b': ['b0', 'b1', 'b2']},
    index=['a', 'b', 'c'],
)
right = pd.DataFrame(
    {'k': ['a0', 'a0', 'c2'], 'b': ['d0', 'd1', 'd2']},
    index=['c', 'd', 'e'],
)
print(left)
print(right)
# `suffixes` chooses the endings appended to column names that exist on both
# sides, giving b_left and b_right here.
res = pd.merge(left, right, how='inner', on='k', suffixes=['_left', '_right'])
print(res)
# Without `suffixes` the same columns come out as b_x and b_y.
res = pd.merge(left, right, how='inner', on='k')
print(res)
输出:
k b a a0 b0 b a1 b1 c a2 b2 k b c a0 d0 d a0 d1 e c2 d2 k b_left b_right 0 a0 b0 d0 1 a0 b0 d1 k b_x b_y 0 a0 b0 d0 1 a0 b0 d1