import os
import sys
import pandas as pd
import numpy as np
def readpm(filename, startline=2):
    """Read one PM data CSV file.

    Parameters
    ----------
    filename : str
        Path to the CSV file.
    startline : int, default 2
        Zero-based row index used as the header row; rows above it are
        skipped (``pandas.read_csv`` ``header`` semantics).

    Returns
    -------
    pandas.DataFrame
        Only the columns at positions 0, 2-7, 9 and 10 are kept.
    """
    return pd.read_csv(filename, header=startline,
                       usecols=[0, 2, 3, 4, 5, 6, 7, 9, 10])
abspath = os.path.dirname(__file__)  # directory containing this script
sys.path.append(abspath)
print(abspath)
# When the script is launched from its own directory, dirname(__file__) can be
# the empty string — fall back to sys.path[0] (the script's directory) so the
# CSV files next to this script can be found by relative name.
if abspath=='':
    os.chdir(sys.path[0])
else:
    os.chdir(abspath)
pmdata_list = []
# Collect every CSV file that sits in the script directory.
for file in os.listdir(abspath):
    if file.endswith('csv'):
        pmdata_list.append(readpm(file))
df1 = pd.DataFrame()  # kept for backward compatibility with earlier code
# Merge the per-file frames into a single frame.
# FIX: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported replacement. concat raises on an empty list,
# so fall back to an empty frame when no CSV files were found.
if pmdata_list:
    df10 = pd.concat(pmdata_list, ignore_index=True)
else:
    df10 = pd.DataFrame()
print(df10)
# NOTE: pandas' default missing values are None and np.nan; detect them with
# df.isna() (alias: isnull); the inverse is df.notna().
# Prefer np.nan over None — None is more likely to cause subtle issues.
# FIX: the original notes used curly "smart quotes" (‘’‘ … ’’’) instead of
# ''', which is a SyntaxError; they are rewritten as real comments.
print(df10['column'].isna())  # True for every missing cell in that column
# NOTE(review): 'column' is a placeholder name — this raises KeyError unless
# df10 really has a column literally named 'column'; confirm against the data.
print(pd.options.mode.use_inf_as_na)  # settable option; inf is NOT NA by default
# NOTE(review): use_inf_as_na was deprecated in pandas 2.1 and removed in 3.0.
# Custom missing-value markers can be normalized with replace():
#   dfnew = df.replace('custom_na', np.nan)
#   dfnew = df.replace(['na1', 'na2'], [np.nan, np.nan])
#   dfnew = df.replace(['na1', 'na2'], np.nan)   # shorthand
print(df10.isna())  # per-cell missing-value check for the whole frame
# df.any(
#     axis: 0/'index' or 1/'columns',
#     skipna=True: whether missing values are ignored during the check,
#     level=None: specific level when the index is a MultiIndex,
# )
# df.all( ... ) — same signature; True only when ALL values are truthy,
# while any() is True when AT LEAST ONE value is truthy.
df_nan = pd.read_excel('123.xlsx', sheet_name=0)
df_nan.replace('无', np.nan, inplace=True)  # treat the marker '无' as missing
# Select the rows that contain at least one NaN.
# FIX: the axis must be passed by keyword — positional `any(1)` was
# deprecated in pandas 1.5 and removed in 2.0.
print(df_nan[df_nan.isna().any(axis=1)])
# df.fillna(
#     value: scalar used to fill missing cells; a dict/Series/DataFrame can
#            further specify which index/columns receive which value
#            (a plain list is not allowed),
#     method=None: fill strategy along the index (forward fill, back fill, ...),
#     limit=None: with method set, the maximum consecutive fill distance;
#                 gaps longer than this stay unfilled,
#     axis: 0 or 'index', 1 or 'columns',
#     inplace=False,
# )
#
# Missing values can also be filled while building a new index:
#   df.reindex(labels=None, fill_value=np.nan)   # special use of reindex
df_nan.fillna('未知', inplace=True)
print(df_nan)
# FIX: np.NaN was an alias removed in numpy 2.0 — use np.nan.
df_nan.replace('未知', np.nan, inplace=True)
# Fill the remaining NaNs with the mean of the '均消费' column.
# FIX: the original called fillna() without inplace=True or assignment, so
# the filled frame was silently discarded; keep the returned frame instead.
df_nan = df_nan.fillna(df_nan['均消费'].mean())
print(df_nan)
# df.dropna(
#     axis=0: 0/'index' drops rows, 1/'columns' drops columns,
#     how='any': 'any' drops when at least one value is missing,
#                'all' drops only when every value is missing,
#     thresh: int — keep labels with at least this many non-NA values,
#     subset: restrict the missing-value check to this subset of labels,
#     inplace=False,
# )
# FIX: the surrounding notes used curly smart quotes instead of ''' — a
# SyntaxError — and are rewritten as comments.
df_nan_drop = df_nan.dropna()
print(df_nan_drop)
# Flagging duplicated rows (rather than deleting them outright) lets you
# inspect WHY they are duplicated, so genuinely wrong records can be fixed:
# see DataFrame.duplicated.
df_duplicated = pd.read_excel('123.xlsx', sheet_name=0)
df_dup_done = df_duplicated.duplicated()
print(df_dup_done)
# An index can be checked the same way: df.index.duplicated()
print(df_duplicated[df_duplicated.set_index(['店家','地址']).index.duplicated()])
# drop_duplicates(subset=...): deduplicate on the given columns only;
# keep='first'/'last'/False — False drops every member of a duplicate group.
df_delet = df_duplicated.drop_duplicates(['店家','地址'])
print(df_delet)
print(df_duplicated[~df_duplicated.duplicated(['店家','地址'])])  # same result via inverted mask (~)