Python中的DataFrame模块学习

Python中的DataFrame模块学习

本文基于Windows系统环境,学习和测试pandas库中的DataFrame数据结构:

  • Windows 10

  • PyCharm 2018.3.5 for Windows (exe)

  • python 3.6.8 Windows x86 executable installer


1. 初始化DataFrame

  • 创建一个空的DataFrame变量
import pandas as pd
import numpy as np
# A DataFrame built without arguments has no rows and no columns.
data = pd.DataFrame()
empty_shape = np.shape(data)
print(empty_shape)  # (0, 0)
  • 通过字典创建一个DataFrame
import pandas as pd
import numpy as np
# Each dictionary key becomes a column; each list holds that column's values.
dict_a = {
    'name': ['xu', 'wang'],
    'gender': ['male', 'female'],
}
data = pd.DataFrame(data=dict_a)
frame_shape = np.shape(data)
print(frame_shape)  # (2, 2)
print(data)
# Expected output:
#    name  gender
# 0    xu    male
# 1  wang  female
  • 通过numpy.array创建一个DataFrame
import pandas as pd
import numpy as np
# Wrap a 3x4 matrix of standard-normal samples and label its four columns.
mat = np.random.randn(3, 4)
df = pd.DataFrame(mat, columns=['a', 'b', 'c', 'd'])
print(df)
  • 一个DataFrame转成numpy.array
import pandas as pd
import numpy as np
# Build a labelled 3x4 frame, then convert it back to a plain ndarray.
mat = np.random.randn(3, 4)
column_labels = list('abcd')
df = pd.DataFrame(mat, columns=column_labels)
print(df)
# np.array() copies the frame's values into a new ndarray; labels are dropped.
n = np.array(df)
print(n)
  • DataFrame增加一列数据
import pandas as pd
import numpy as np
# Assigning to a new column label on an empty frame creates both the column
# and the rows (one per value).
data = pd.DataFrame()
data['ID'] = range(10)
print(np.shape(data))  # (10, 1)
  • DataFrame增加一行数据
import pandas as pd
import numpy as np
# Start from an empty frame that only declares its column labels.
df = pd.DataFrame(columns=['a', 'b', 'c'])
# DataFrame.append() was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat() is the supported way to add rows and also works on old versions.
new_row = pd.DataFrame([{'a': 10.0, 'b': 'name', 'c': 10}])
df = pd.concat([df, new_row], ignore_index=True)
  • DataFrame增加一列数据,且值相同
import pandas as pd
import numpy as np
dict_a = {
    'name': ['xu', 'wang'],
    'gender': ['male', 'female'],
}
data = pd.DataFrame(dict_a)
# A scalar value is broadcast to every row of the new column.
data = data.assign(country='China')
print(data)
# Expected output:
#    name  gender country
# 0    xu    male   China
# 1  wang  female   China
  • DataFrame删除重复的数据行
import pandas as pd
# Sample frame whose first two rows share the same (A_ID, B_ID) pair, so the
# effect of drop_duplicates is visible and the snippet runs on its own.
df = pd.DataFrame({
    'A_ID': [1, 1, 2],
    'B_ID': [10, 10, 20],
    'value': ['x', 'y', 'z'],
})
# Rows count as duplicates when they match on every column listed in `subset`.
norepeat_df = df.drop_duplicates(subset=['A_ID', 'B_ID'], keep='first')
# `subset` takes column labels, so for a frame with default integer column
# labels it would be e.g. subset=[1, 2].
# keep='first': keep the first occurrence of each duplicated row.
# keep='last':  keep the last occurrence of each duplicated row.
# keep=False:   drop every row that has a duplicate.

2. 基本操作

  • 获取DataFrame的行数和列数
len(df.index)    # number of rows
len(df.columns)  # number of columns
  • 获取DataFrame的转置
df.transpose()  # swaps rows and columns; same result as the df.T shorthand
  • 修改DataFrame的数据精度
a = np.array([[0.03, 0.05, 1.22], [0.04, 4.54, 3.68]])
df = pd.DataFrame(a.T, columns=['a', 'b'])
# round() returns a new DataFrame and leaves the original untouched, so the
# result must be assigned back for the rounding to take effect.
df = df.round(2)  # keep two decimal places
print(df)
# The sample values already have two decimals, so they are unchanged:
#       a     b
# 0  0.03  0.04
# 1  0.05  4.54
# 2  1.22  3.68
  • 使用DataFrame的apply函数
a = np.array([[3, 1, 2], [2, 4, 3]])
df = pd.DataFrame(np.transpose(a), columns=['a', 'b'])
print(df)
# Expected output:
#    a  b
# 0  3  2
# 1  1  4
# 2  2  3


def f(x):
    """Return the arithmetic mean of the values handed over by apply()."""
    return np.mean(x)


# axis=0 (the default) passes each column to f.
t1 = df.apply(f, axis=0)
print(t1)
# a    2.0
# b    3.0

# axis=1 passes each row to f.
t2 = df.apply(f, axis=1)
print(t2)
# 0    2.5
# 1    2.5
# 2    2.5
  • 选择DataFrame里面某一列等于某个值的所有行
# Boolean mask: keep only the rows whose 'columnName' equals 'the value'.
df[df['columnName'] == 'the value']
  • 去除某一列两端的指定字符
import pandas as pd
dict_a = {
    'name': ['.xu', 'wang'],
    'gender': ['male', 'female.'],
}
data = pd.DataFrame(dict_a)
print(data)
# Expected output:
#    name   gender
# 0   .xu     male
# 1  wang  female.

# str.strip('.') removes leading and trailing '.' characters; called without
# an argument, str.strip() removes surrounding whitespace instead.
stripped_names = data['name'].str.strip('.')
data['name'] = stripped_names
print(data)
# Only the 'name' column was cleaned, so 'female.' keeps its dot:
#    name   gender
# 0    xu     male
# 1  wang  female.
  • 重新调整index的值
import pandas as pd
data = pd.DataFrame()
data['ID'] = range(3)
# At this point the index is the default 0..2:
#    ID
# 0   0
# 1   1
# 2   2

# Replace the row labels with a 1-based range of the same length.
row_count = len(data)
data.index = pd.RangeIndex(start=1, stop=row_count + 1)
# Now:
#    ID
# 1   0
# 2   1
# 3   2
  • 调整DataFrame列顺序
import pandas as pd
# Populate the frame first: selecting ['name', 'ID'] from an empty DataFrame
# raises KeyError because neither column exists.
data = pd.DataFrame()
data['ID'] = range(0, 3)
data['name'] = ['xu', 'wang', 'li']
print(data)
# Expected output:
#    ID  name
# 0   0    xu
# 1   1  wang
# 2   2    li

# Indexing with a list of labels returns the columns in the listed order.
data = data[['name', 'ID']]
# Now:
#    name  ID
# 0    xu   0
# 1  wang   1
# 2    li   2
  • 获取DataFrame的列名
import pandas as pd
# Populate the frame so the printed column list matches the documented output
# (an empty DataFrame would print []).
data = pd.DataFrame()
data['ID'] = range(0, 3)
data['name'] = ['xu', 'wang', 'li']
print(data)
# Expected output:
#    ID  name
# 0   0    xu
# 1   1  wang
# 2   2    li

# Convert the column Index into a plain Python list of labels.
print(data.columns.tolist())
# ['ID', 'name']
  • 获取DataFrame的行名
import pandas as pd
# Populate the frame so the printed row labels match the documented output
# (an empty DataFrame would print []).
data = pd.DataFrame()
data['ID'] = range(0, 3)
data['name'] = ['xu', 'wang', 'li']
print(data)
# Expected output:
#    ID  name
# 0   0    xu
# 1   1  wang
# 2   2    li

# Use the public `index` attribute rather than the private `_stat_axis`.
print(data.index.tolist())
# [0, 1, 2]
  • DataFrame按列进行遍历
import pandas as pd
import numpy as np  # was aliased as `py`, which left `np` undefined below

# Case 1: default integer column labels.
data = pd.DataFrame(np.arange(6).reshape((2, 3)))
print(data)
# Expected output:
#    0  1  2
# 0  0  1  2
# 1  3  4  5

# Iterating over data.columns yields each column label in order;
# data[label] is that column as a Series.
for col in data.columns:
    print(data[col])

# Case 2: explicit string column labels.
data = pd.DataFrame(np.arange(6).reshape((2, 3)), columns=['a', 'b', 'c'])
print(data)
# Expected output:
#    a  b  c
# 0  0  1  2
# 1  3  4  5

for col in data.columns:
    print(data[col])

3. 读写操作

  • 将csv文件读入DataFrame数据
  • read_csv()函数的参数配置参考官网pandas.read_csv
import pandas as pd 
# Load the CSV file in the working directory into a DataFrame.
csv_path = 'user.csv'
data = pd.read_csv(csv_path)
print(data)
  • 将DataFrame数据写入csv文件
  • to_csv()函数的参数配置参考官网pandas.DataFrame.to_csv
import pandas as pd 
# Read one CSV file and write it back out under another name.
source_path = 'test1.csv'
data = pd.read_csv(source_path)
# index=False omits the row labels; header=True writes the column names.
data.to_csv("test2.csv", header=True, index=False)

4. 异常处理

  • 过滤所有包含NaN的行
  • dropna()函数的参数配置参考官网pandas.DataFrame.dropna
from numpy import nan as NaN
import pandas as pd 
rows = [
    [1, 2, 3],
    [NaN, NaN, 2],
    [NaN, NaN, NaN],
    [8, 8, NaN],
]
data = pd.DataFrame(rows)
print(data)
# Expected output:
#      0    1    2
# 0  1.0  2.0  3.0
# 1  NaN  NaN  2.0
# 2  NaN  NaN  NaN
# 3  8.0  8.0  NaN

# Drop every row that contains at least one NaN (these are the defaults,
# spelled out here for clarity).
data = data.dropna(axis=0, how='any')
# Parameters of DataFrame.dropna:
# axis:    0 or 'index' drops rows; 1 or 'columns' drops columns.
# how:     'any' drops a row/column containing any NaN;
#          'all' drops it only when every value is NaN.
# thresh:  integer n; keep only rows/columns with at least n non-NaN values.
# subset:  labels to inspect for NaN, e.g. ['name', 'gender'] when dropping
#          rows; pass index labels instead when using axis=1.
# inplace: if True, modify the frame in place and return None.
print(data)
# Only the first row survives:
#      0    1    2
# 0  1.0  2.0  3.0

你可能感兴趣的:(Python)