python学习之pandas库入门教程

  numpy库和pandas库是python数据处理中不可缺少的两个库。学完numpy之后,大家可能会觉得pandas和numpy大同小异。我个人的理解是:两者都是对数组和矩阵进行操作,但pandas中的矩阵(DataFrame)带有行索引(index)和列名(columns),可以自己定义主键,类似于SQL中的表,并且同样支持全连接、左外连接等较为复杂的操作。本文将我学习pandas库的历程分享出来,供大家学习参考,欢迎指正。

目录

1.pandas属性

2. pandas选择数据

3.pandas设置值

4.丢失数据处理

5.导入导出文件

6.concat

7.merge


1.pandas属性

import pandas as pd
import numpy as np
# Build a 3x4 table: `index` labels the 3 rows, `columns` labels the 4 columns.
df = pd.DataFrame(
    np.arange(0, 12).reshape(3, 4),
    index=['a', 'b', 'c'],
    columns=['d', 'e', 'f', 'g'],
)
print(df)
print('----------------')
print(df.T)                         # transpose: rows and columns swap
print('-------------------------------------------')
print(df.index)                     # row labels
print(df.columns)                   # column labels
print('-------------------------------------------')
print(df.describe())                # summary statistics of the numeric columns
                                    # (df.values returns the underlying array)
print('----------------')
# Without index/columns arguments, rows and columns are labelled 0, 1, 2, ...
df1 = pd.DataFrame(np.arange(0, 12).reshape(3, 4))
print(df1)
print('----------------')
# Built from a dict: the keys become the COLUMN names and the rows get the
# default 0..n-1 index; the scalar 1 is repeated for every row.
df2 = pd.DataFrame({
    'a': 1,
    'b': np.random.random(3),
    'c': np.array(['tree', 'forest', 'car']),
})
print(df2.dtypes)                   # dtype of each column
print('----------------')
print(df2)
print('----------------')
# axis=1 sorts by the column labels (descending here: c, b, a).
print(df2.sort_index(axis=1, ascending=False))
# df2.sort_index(axis=0, ascending=True) would sort by the row labels instead.
print('----------------')
# sort_values returns a new, sorted DataFrame and leaves df2 untouched, so the
# result is kept and printed; a bare expression is only shown in a notebook.
sorted_by_b = df2.sort_values(by='b')
print(sorted_by_b)
输出:
   d  e   f   g
a  0  1   2   3
b  4  5   6   7
c  8  9  10  11
----------------
   a  b   c
d  0  4   8
e  1  5   9
f  2  6  10
g  3  7  11
-------------------------------------------
Index(['a', 'b', 'c'], dtype='object')
Index(['d', 'e', 'f', 'g'], dtype='object')
-------------------------------------------
         d    e     f     g
count  3.0  3.0   3.0   3.0
mean   4.0  5.0   6.0   7.0
std    4.0  4.0   4.0   4.0
min    0.0  1.0   2.0   3.0
25%    2.0  3.0   4.0   5.0
50%    4.0  5.0   6.0   7.0
75%    6.0  7.0   8.0   9.0
max    8.0  9.0  10.0  11.0
----------------
   0  1   2   3
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
----------------
a      int64
b    float64
c     object
dtype: object
----------------
   a         b       c
0  1  0.075662    tree
1  1  0.160645  forest
2  1  0.593432     car
----------------
        c         b  a
0    tree  0.075662  1
1  forest  0.160645  1
2     car  0.593432  1
----------------

Out[53]:

   a         b       c
0  1  0.075662    tree
1  1  0.160645  forest
2  1  0.593432     car

 

2. pandas选择数据

import pandas as pd
import numpy as np
# 3x4 demo table: rows labelled a-c, columns labelled d-g.
values = np.arange(0, 12).reshape(3, 4)
df = pd.DataFrame(values, index=['a', 'b', 'c'], columns=['d', 'e', 'f', 'g'])
divider = '----------------'

print(df)
print(divider)
# Attribute access only reaches columns; a row label such as df.a fails.
print(df.d)
print(divider)
# Plain [] with a slice works on rows only.
print(df[0:2])
print(divider)
# .loc selects by label on both axes: every row, column d.
print(df.loc[:, 'd'])
print(divider)
# .loc with a list of labels: every row, columns d and f.
print(df.loc[:, ['d', 'f']])
print(divider)
# A single row label combined with a list of column labels.
print(df.loc['a', ['d', 'f']])
print(divider)
# .iloc slices by integer position instead of by label.
print(df.iloc[1:2, 2:3])
print(divider)
print(df.iloc[1:2, [1, 3]])
print(divider)
# Boolean mask: keep the rows whose d value is greater than 4.
mask = df.d > 4
print(df[mask])

输出:

   d  e   f   g
a  0  1   2   3
b  4  5   6   7
c  8  9  10  11
----------------
a    0
b    4
c    8
Name: d, dtype: int32
----------------
   d  e  f  g
a  0  1  2  3
b  4  5  6  7
----------------
a    0
b    4
c    8
Name: d, dtype: int32
----------------
   d   f
a  0   2
b  4   6
c  8  10
----------------
d    0
f    2
Name: a, dtype: int32
----------------
   f
b  6
----------------
   e  g
b  5  7
----------------
   d  e   f   g
c  8  9  10  11

3.pandas设置值

import pandas as pd
import numpy as np
# 3x4 demo table: rows labelled a-c, columns labelled d-g.
df = pd.DataFrame(
    np.arange(0, 12).reshape(3, 4),
    index=['a', 'b', 'c'],
    columns=['d', 'e', 'f', 'g'],
)
print(df)
print('----------------')
df.loc['a', 'e'] = 0                # set a single cell by row/column label
print(df)
print('----------------')
# Set column e to 1 on the rows where d > 3, in ONE .loc call.
# The chained form df.e[df.d > 3] = 1 assigns through an intermediate object;
# pandas warns about it and, with Copy-on-Write enabled, df is never updated.
df.loc[df.d > 3, 'e'] = 1
print(df)
print('----------------')
df['h'] = np.array([1, 2, 3])       # add a new column
print(df)

输出:

   d  e   f   g
a  0  1   2   3
b  4  5   6   7
c  8  9  10  11
----------------
   d  e   f   g
a  0  0   2   3
b  4  5   6   7
c  8  9  10  11
----------------
   d  e   f   g
a  0  0   2   3
b  4  1   6   7
c  8  1  10  11
----------------
   d  e   f   g  h
a  0  0   2   3  1
b  4  1   6   7  2
c  8  1  10  11  3

4.丢失数据处理

import pandas as pd
import numpy as np
# 3x4 demo table with two cells blanked out to NaN.
df = pd.DataFrame(
    np.arange(0, 12).reshape(3, 4),
    index=['a', 'b', 'c'],
    columns=['d', 'e', 'f', 'g'],
)
for row, col in ((1, 1), (2, 2)):
    df.iloc[row, col] = np.nan
divider = '----------------'

print(df)
print(divider)
# axis=1 drops COLUMNS. how='any' drops a column as soon as it holds one NaN;
# how='all' would drop it only when every value in it is NaN.
without_nan_columns = df.dropna(axis=1, how='any')
print(without_nan_columns)
print(divider)
# Replace every missing value with 8.
filled = df.fillna(value=8)
print(filled)
print(divider)
# True when at least one cell in the table is missing.
has_missing = np.any(df.isnull())
print(has_missing)

输出:

   d    e    f   g
a  0  1.0  2.0   3
b  4  NaN  6.0   7
c  8  9.0  NaN  11
----------------
   d   g
a  0   3
b  4   7
c  8  11
----------------
   d    e    f   g
a  0  1.0  2.0   3
b  4  8.0  6.0   7
c  8  9.0  8.0  11
----------------
True

5.导入导出文件

import pandas as pd
# The r'...' raw string keeps the backslashes of the Windows path from being
# interpreted as escape sequences.
csv_path = r'C:\Users\28632\Desktop\math-modleing\123.csv'
data = pd.read_csv(csv_path)
# Save the data. index=False stops pandas from writing the row index as an
# extra unnamed column, which would come back as "Unnamed: 0" on the next read.
data.to_csv(csv_path, index=False)
data

输出:

Unnamed: 0 x y angle distance R
0 0 1595 1 37 2163 NaN
1 1 152 940 301 1211 NaN
2 2 444 522 88 1875 NaN
3 3 344 1920 322 358 NaN
4 4 387 1453 308 1011 NaN
... ... ... ... ... ... ...
953 953 2435 1221 48 1806 NaN
954 954 1580 531 324 458 NaN
955 955 1873 581 309 665 NaN
956 956 2252 2118 83 1680 NaN
957 957 2322 566 324 455 NaN

958 rows × 6 columns


6.concat

import pandas as pd 
import numpy as np
# Two 3x3 tables sharing the same row and column labels: zeros and ones.
row_labels = ['a', 'b', 'c']
col_labels = ['d', 'e', 'f']
df1 = pd.DataFrame(np.zeros((3, 3)), index=row_labels, columns=col_labels)
df2 = pd.DataFrame(np.ones((3, 3)), index=row_labels, columns=col_labels)
print(df1)
print(df2)
# axis=0 stacks df2 underneath df1; ignore_index=True discards the a/b/c row
# labels and renumbers the rows 0..5.
frames = [df1, df2]
result = pd.concat(frames, axis=0, ignore_index=True)
print(result)

输出:

     d    e    f
a  0.0  0.0  0.0
b  0.0  0.0  0.0
c  0.0  0.0  0.0
     d    e    f
a  1.0  1.0  1.0
b  1.0  1.0  1.0
c  1.0  1.0  1.0
     d    e    f
0  0.0  0.0  0.0
1  0.0  0.0  0.0
2  0.0  0.0  0.0
3  1.0  1.0  1.0
4  1.0  1.0  1.0
5  1.0  1.0  1.0
import pandas as pd 
import numpy as np
# Two 3x3 tables whose row and column labels only partly overlap.
df1 = pd.DataFrame(np.zeros((3, 3)), index=list('abc'), columns=list('def'))
df2 = pd.DataFrame(np.ones((3, 3)), index=list('bcd'), columns=list('efg'))
print(df1)
print(df2)
frames = [df1, df2]
# Default join='outer': keep the union of the columns, filling gaps with NaN.
result = pd.concat(frames, ignore_index=True)
print(result)
# join='inner': keep only the columns present in both tables (e and f).
res = pd.concat(frames, join='inner', ignore_index=True)
print(res)
# Side-by-side concat (axis=1) after aligning df2 to df1's row labels;
# rows df2 does not have (a) become NaN.
df2_on_df1_rows = df2.reindex(df1.index)
result1 = pd.concat([df1, df2_on_df1_rows], axis=1)
print(result1)

输出:

     d    e    f
a  0.0  0.0  0.0
b  0.0  0.0  0.0
c  0.0  0.0  0.0
     e    f    g
b  1.0  1.0  1.0
c  1.0  1.0  1.0
d  1.0  1.0  1.0
     d    e    f    g
0  0.0  0.0  0.0  NaN
1  0.0  0.0  0.0  NaN
2  0.0  0.0  0.0  NaN
3  NaN  1.0  1.0  1.0
4  NaN  1.0  1.0  1.0
5  NaN  1.0  1.0  1.0
     e    f
0  0.0  0.0
1  0.0  0.0
2  0.0  0.0
3  1.0  1.0
4  1.0  1.0
5  1.0  1.0
     d    e    f    e    f    g
a  0.0  0.0  0.0  NaN  NaN  NaN
b  0.0  0.0  0.0  1.0  1.0  1.0
c  0.0  0.0  0.0  1.0  1.0  1.0

7.merge

import pandas as pd
# Both tables carry the same 'key' column plus a clashing column named 'b'.
keys = ['k0', 'k1', 'k2', 'k3']
left = pd.DataFrame(dict(
    key=keys,
    a=['a0', 'a1', 'a2', 'a3'],
    b=['b0', 'b1', 'b2', 'b3'],
))
right = pd.DataFrame(dict(
    key=keys,
    c=['c0', 'c1', 'c2', 'c3'],
    b=['d0', 'd1', 'd2', 'd3'],
))
print(left)
print(right)
# Join the two tables on the shared key column; the clashing 'b' columns come
# out as b_x (from left) and b_y (from right).
res = pd.merge(left, right, on='key')
print(res)

输出:

  key   a   b
0  k0  a0  b0
1  k1  a1  b1
2  k2  a2  b2
3  k3  a3  b3
  key   c   b
0  k0  c0  d0
1  k1  c1  d1
2  k2  c2  d2
3  k3  c3  d3
  key   a b_x   c b_y
0  k0  a0  b0  c0  d0
1  k1  a1  b1  c1  d1
2  k2  a2  b2  c2  d2
3  k3  a3  b3  c3  d3

import pandas as pd
# Two tables that are matched on the pair (key1, key2).
left = pd.DataFrame(dict(
    key1=['k0', 'k0', 'k1', 'k2'],
    key2=['k0', 'k1', 'k0', 'k1'],
    a=['a0', 'a1', 'a2', 'a3'],
    b=['b0', 'b1', 'b2', 'b3'],
))
right = pd.DataFrame(dict(
    key1=['k0', 'k1', 'k1', 'k2'],
    key2=['k0', 'k0', 'k0', 'k0'],
    c=['c0', 'c1', 'c2', 'c3'],
    b=['d0', 'd1', 'd2', 'd3'],
))
print(left)
print(right)
join_keys = ['key1', 'key2']
# Default inner join: only rows whose (key1, key2) pair exists in both tables.
res = pd.merge(left, right, on=join_keys)
print(res)
# Outer join keeps every row from both sides. `indicator` adds a column saying
# where each row came from (both / left_only / right_only). A string value is
# used as that column's name, so the column here is literally called 'True'.
res1 = pd.merge(left, right, on=join_keys, how='outer', indicator='True')
print(res1)

输出:

  key1 key2   a   b
0   k0   k0  a0  b0
1   k0   k1  a1  b1
2   k1   k0  a2  b2
3   k2   k1  a3  b3
  key1 key2   c   b
0   k0   k0  c0  d0
1   k1   k0  c1  d1
2   k1   k0  c2  d2
3   k2   k0  c3  d3
  key1 key2   a b_x   c b_y
0   k0   k0  a0  b0  c0  d0
1   k1   k0  a2  b2  c1  d1
2   k1   k0  a2  b2  c2  d2
  key1 key2    a  b_x    c  b_y        True
0   k0   k0   a0   b0   c0   d0        both
1   k0   k1   a1   b1  NaN  NaN   left_only
2   k1   k0   a2   b2   c1   d1        both
3   k1   k0   a2   b2   c2   d2        both
4   k2   k1   a3   b3  NaN  NaN   left_only
5   k2   k0  NaN  NaN   c3   d3  right_only
import pandas as pd
# Two tables with no key column; their row labels overlap only on 'c'.
left = pd.DataFrame(
    dict(a=['a0', 'a1', 'a2'], b=['b0', 'b1', 'b2']),
    index=['a', 'b', 'c'],
)
right = pd.DataFrame(
    dict(c=['c0', 'c1', 'c2'], b=['d0', 'd1', 'd2']),
    index=['c', 'd', 'e'],
)
print(left)
print(right)
# Match rows by their index labels on both sides; how='outer' keeps labels
# found in either table and fills the missing side with NaN.
res = pd.merge(left, right, how='outer', left_index=True, right_index=True)
print(res)

输出:

    a   b
a  a0  b0
b  a1  b1
c  a2  b2
    c   b
c  c0  d0
d  c1  d1
e  c2  d2
     a  b_x    c  b_y
a   a0   b0  NaN  NaN
b   a1   b1  NaN  NaN
c   a2   b2   c0   d0
d  NaN  NaN   c1   d1
e  NaN  NaN   c2   d2
import pandas as pd
# Both tables share the key column 'k' and a clashing column 'b'.
left = pd.DataFrame(
    dict(k=['a0', 'a1', 'a2'], b=['b0', 'b1', 'b2']),
    index=['a', 'b', 'c'],
)
right = pd.DataFrame(
    dict(k=['a0', 'a0', 'c2'], b=['d0', 'd1', 'd2']),
    index=['c', 'd', 'e'],
)
print(left)
print(right)
# Inner join on 'k' with custom suffixes for the clashing 'b' columns.
res = pd.merge(left, right, on='k', how='inner', suffixes=['_left', '_right'])
print(res)
# Same join with the default suffixes, which yield b_x and b_y.
res = pd.merge(left, right, on='k', how='inner')
print(res)

输出:

    k   b
a  a0  b0
b  a1  b1
c  a2  b2
    k   b
c  a0  d0
d  a0  d1
e  c2  d2
    k b_left b_right
0  a0     b0      d0
1  a0     b0      d1
    k b_x b_y
0  a0  b0  d0
1  a0  b0  d1

你可能感兴趣的:(python,pandas,jupyter,numpy,机器学习)