pandas Data Cleaning

Reading in the Data

import os
import sys
import pandas as pd
import numpy as np

def readpm(filename, startline=2):  # helper that reads one PM data file
    return pd.read_csv(filename, header=startline,
                       usecols=[0, 2, 3, 4, 5, 6, 7, 9, 10])

abspath = os.path.dirname(__file__)  # directory the script runs from
sys.path.append(abspath)
print(abspath)
if abspath == '':
    os.chdir(sys.path[0])
else:
    os.chdir(abspath)

pmdata_list = []
for file in os.listdir(abspath):  # collect every CSV file in the directory
    if file.endswith('.csv'):
        pmdata_list.append(readpm(file))


# merge the per-file frames; DataFrame.append was removed in pandas 2.0,
# so pd.concat is used to combine the list directly
df10 = pd.concat(pmdata_list, ignore_index=True)
print(df10)

Handling Missing Values

''' The default missing values are None and np.nan; the matching test is df.isna()
(alias: isnull), and its inverse is notna. '''
''' np.nan is the safer choice; None can cause subtle problems. '''

print(df10['column'].isna())  # missing cells show up as True ('column' is a placeholder name)
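A minimal self-contained sketch (synthetic data, hypothetical names) showing that both None and np.nan register as missing:

s_demo = pd.Series([1.0, None, np.nan, 4.0])
print(s_demo.isna())    # True at positions 1 and 2: both None and np.nan count
print(s_demo.notna())   # the inverse mask
print(s_demo.isnull())  # isnull is simply an alias of isna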

Whether inf and -inf (positive/negative infinity) count as missing values (default: False)

print(pd.options.mode.use_inf_as_na)  # configurable option; infinities are not treated as missing by default
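A short demo on synthetic data. Note this option is deprecated and removed in the newest pandas releases; on versions that still support it, pd.option_context keeps the change scoped:

s_inf = pd.Series([1.0, np.inf, -np.inf])
print(s_inf.isna())                # all False: infinities are not NaN by default
with pd.option_context('mode.use_inf_as_na', True):
    print(s_inf.isna())            # now inf and -inf are reported as missing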

Handling Custom Missing-Value Markers

''' dfnew = df.replace('custom NA marker', np.nan) '''
''' dfnew = df.replace(['marker1', 'marker2', ...], [np.nan, np.nan, ...]) '''
''' dfnew = df.replace(['marker1', 'marker2', ...], np.nan) '''  # shorthand form

print(df10.isna())  # check every cell
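A minimal sketch with hypothetical markers ('N/A' and '-') showing the shorthand form:

df_demo = pd.DataFrame({'a': ['1', 'N/A', '3'], 'b': ['-', '5', 'N/A']})
df_clean = df_demo.replace(['N/A', '-'], np.nan)  # one np.nan covers both markers
print(df_clean.isna())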

Checking whether multiple cells hold a given value

'''
any returns True if at least one element along the axis is truthy; all only if every element is.
df.any(
    axis : index (0) or columns (1)
    skipna = True : whether missing values are ignored during the check
    level = None : the level to check against when the axis is a MultiIndex
)
df.all(
    axis : index (0) or columns (1)
    skipna = True : whether missing values are ignored during the check
    level = None : the level to check against when the axis is a MultiIndex
)
'''
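Before applying this to a real file, a self-contained sketch of any/all on a boolean missing-value mask (synthetic data):

df_demo = pd.DataFrame({'a': [1.0, np.nan], 'b': [3.0, 4.0]})
mask = df_demo.isna()
print(mask.any(axis=1))  # per row: True if the row contains any NaN
print(mask.all(axis=0))  # per column: True only if the whole column is NaN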

df_nan = pd.read_excel('123.xlsx', sheet_name=0)
df_nan.replace('无', np.nan, inplace=True)  # '无' ("none") marks missing values in this file
print(df_nan[df_nan.isna().any(axis=1)])  # select the rows that contain at least one NaN

Filling Missing Values

'''
df.fillna(
    value : the value used to fill the holes;
        a dict/Series/DataFrame can also be given to specify which
        index/columns receive which fill value; a list is not allowed
    method = None : how to fill along the index, e.g. forward fill or back fill
    limit = None : with method set, the maximum number of consecutive
        NaNs to fill; gaps longer than this are left unfilled
    axis : (0 or 'index', 1 or 'columns')
    inplace = False
)
'''
'''
Fill missing values while building a new index:
df.reindex(labels=None, fill_value=np.nan)  # a special use of reindex
'''
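A quick sketch of the main variants on synthetic data; newer pandas removed fillna(method=...), so ffill() is used for the method/limit behaviour described above:

s = pd.Series([1.0, np.nan, np.nan, 4.0])
print(s.fillna(0))                          # scalar fill
print(s.ffill(limit=1))                     # forward fill at most one step
df_fill = pd.DataFrame({'a': [np.nan, 2.0], 'b': [3.0, np.nan]})
print(df_fill.fillna({'a': 0, 'b': -1}))    # dict: per-column fill values
print(s.reindex(range(6), fill_value=0))    # new labels get 0; existing NaNs stay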

df_nan.fillna('未知', inplace=True)  # fill NaNs with the placeholder '未知' ("unknown")
print(df_nan)
df_nan.replace('未知', np.nan, inplace=True)  # np.NaN was removed in NumPy 2.0; use np.nan
df_nan = df_nan.fillna(df_nan['均消费'].mean())  # fillna returns a new frame, so assign the result
print(df_nan)

Dropping Missing Values

'''
df.dropna(
    axis = 0 : index (0) or columns (1)
    how = 'any' : 'any' drops a row/column if it has any NaN;
        'all' drops it only when every value is NaN
    thresh : int, keep only rows/columns with at least this many non-NaN values
    subset : the labels along the other axis to consider when dropping
    inplace = False
)
'''
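A self-contained sketch of how='all', thresh, and subset (synthetic data):

df_drop = pd.DataFrame({'a': [1.0, np.nan, np.nan],
                        'b': [np.nan, np.nan, 3.0],
                        'c': [1.0, 2.0, np.nan]})
print(df_drop.dropna())                  # default how='any': drops every row here
print(df_drop.dropna(how='all'))         # drops nothing: no row is all-NaN
print(df_drop.dropna(thresh=2))          # keep rows with at least 2 non-NaN values
print(df_drop.dropna(subset=['a']))      # only column 'a' decides what is dropped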

df_nan_drop = df_nan.dropna()   
print(df_nan_drop)   

Checking for Duplicates

Flagging duplicate rows

''' Flagging duplicate rows lets you investigate why the duplicates occur,
so that genuinely erroneous records can be corrected:
duplicated
'''

df_duplicated = pd.read_excel('123.xlsx',sheet_name=0)
df_dup_done = df_duplicated.duplicated()
print(df_dup_done)

Flagging duplicates via the index

''' df.index.duplicated() '''

print(df_duplicated[df_duplicated.set_index(['店家','地址']).index.duplicated()])

Dropping duplicates directly

'''
drop_duplicates(subset=''): with subset, deduplicate by the given columns only
keep = 'first'/'last'/False : which duplicate to keep;
    False drops every record that has a duplicate
'''

df_delete = df_duplicated.drop_duplicates(['店家', '地址'])
print(df_delete)
print(df_duplicated[~df_duplicated.duplicated(['店家', '地址'])])  # same result: invert the mask with ~
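A small sketch (hypothetical shop/addr data) contrasting the keep options:

df_keep = pd.DataFrame({'shop': ['A', 'A', 'B'], 'addr': ['x', 'x', 'y']})
print(df_keep.drop_duplicates(keep='first'))  # keep the first of each duplicate group
print(df_keep.drop_duplicates(keep='last'))   # keep the last occurrence instead
print(df_keep.drop_duplicates(keep=False))    # drop every record that has a duplicate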
