DataFrame整理

  1. dropna()去空
# Drop every row that contains at least one missing value (the defaults, spelled out).
af = af.dropna(axis=0, how='any')
  1. value_counts() 分类、遍历
# Count how often each distinct value of column 'a' occurs.
vc = af['a'].value_counts()
# Collect the values that account for fewer than 3% of the rows in af.
# Series.iteritems() was removed in pandas 2.0; Series.items() yields the
# same (index label, count) pairs and also exists in older versions.
delist = [i for i, v in vc.items() if v < len(af) * 0.03]
  1. 把value_counts()的结果转成DataFrame
# Frequency of each distinct value in column 'col'.
g = af['col'].value_counts()
# Build the column mapping step by step: the distinct values go into 'a',
# their counts into '数量'.
dic_g = {}
dic_g['a'] = g.index
dic_g['数量'] = g.values
df_g = pd.DataFrame(data=dic_g)
  1. 对value_counts()的结果中的某一列求和,并计算各项比率
# Total of the count column. The original read `df.数量`, but the frame that
# holds the '数量' column in this snippet is df_g.
s = np.sum(df_g['数量'])
# A new column can be created directly by assignment.
# Each count becomes a percentage string; the text is truncated (not rounded)
# to its first 6 characters before '%' is appended.
df_g['比率'] = df_g['数量'].apply(lambda x: str(x * 100 / s)[:6] + '%')
  1. 按条件筛选
# Keep only the rows whose value in column 'a' is NOT listed in delist.
af = af.loc[~af['a'].isin(delist)]
  1. drop_duplicates()去重
# Remove fully duplicated rows, keeping the first occurrence of each.
colist = af.drop_duplicates(keep='first')
  1. 二维数组合成一维
from itertools import chain

# One inner list per row of af.
colist = af.values.tolist()
print(colist)
# Flatten the list of rows into a single list. chain.from_iterable runs in
# linear time and returns [] for an empty input, whereas
# reduce(operator.add, ...) re-copies the accumulated list at every step and
# raises TypeError when there are no rows.
colist = list(chain.from_iterable(colist))
print(colist)
输出:
[['Z96.101'], ['H25.901'], ['H25.900'], ['H52.701']]
['Z96.101', 'H25.901', 'H25.900', 'H52.701']
  1. 注意:若把dtype改为str,空值会变成字符串'nan',此时dropna()无法将其删除
  2. 读取大量数据时,需要给read_csv()加上low_memory=False参数,例如read_csv(path, low_memory=False)
  3. 拼接多个不同文件,结构一样的DataFrame
# Read the '主码' and '其他码' columns of every CSV found under dir_path
# (searched recursively) and stack them into one DataFrame.
frames = []
for root, _, files in os.walk(dir_path):
    for file in files:
        if '.csv' in file:
            data = pd.read_csv(
                os.path.join(root, file),
                encoding='gbk',
                low_memory=False,
                usecols=['主码', '其他码'],
            )
            frames.append(data)
# DataFrame.append was removed in pandas 2.0, and calling it inside the loop
# copied all accumulated rows on every iteration. Concatenate once instead.
# pd.concat raises on an empty list, so fall back to an empty DataFrame as
# the original did when no CSV was found.
all_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
  1. 转置
# Transpose: the old column labels become the row index and the old row index
# becomes the column labels.
all_df = pd.DataFrame(
    data=all_df.values.T,
    columns=all_df.index,
    index=all_df.columns,
)
  1. 某列中某个值的数量超过规定数量时,删除此列
# Drop every column in which the string 'False' occurs more than
# btsize * 0.98 times.
# NOTE(review): btsize is defined outside this snippet; presumably the number
# of rows in af. Confirm.
# Iterate over a snapshot of the labels because af is rebound inside the loop.
colist = list(af.columns)
for col in colist:
    # .get(..., 0) treats a column that never contains 'False' as a zero count
    # instead of raising KeyError. Indexing with col itself (not str(col))
    # also works for non-string column labels.
    num = af[col].value_counts().get('False', 0)
    if num > btsize * 0.98:
        print(num, col)
        af = af.drop(columns=[col])
  1. list去重
# De-duplicate while preserving first-seen order: dict keys are unique and
# insertion-ordered. newlist is rebound to that dict, klist holds its keys.
newlist = dict.fromkeys(newlist)
klist = [*newlist]
  1. concat合并
    axis=0 纵向拼接(沿行方向,追加行)
    axis=1 横向拼接(沿列方向,追加列)
# t is a list; wrap it in a DataFrame and stack it below the rows already in out.
out = pd.concat(objs=[out, pd.DataFrame(data=t)], axis='index')
  1. 获取有特定字符的列的行
# Keep only the rows whose 'ICD' text contains a comma.
# regex=False matches the comma literally instead of compiling it as a pattern.
# na=False makes missing values count as "no match"; without it the mask
# contains NaN and boolean indexing raises.
af = af[af['ICD'].str.contains(',', regex=False, na=False)]

你可能感兴趣的:(DataFrame整理)