- dropna()去空
# Drop every row that contains at least one missing value (the defaults, spelled out).
af = af.dropna(axis=0, how='any')
- value_counts() 分类、遍历
# Count how often each distinct value of column 'a' occurs.
vc = af['a'].value_counts()
# Values whose count is below 3% of the row count are collected for removal.
threshold = len(af) * 0.03
# Series.iteritems() was removed in pandas 2.0; Series.items() yields the
# same (value, count) pairs and also works on older versions.
delist = [value for value, count in vc.items() if count < threshold]
- 把value_counts()的结果转成DataFrame
# Frequency of each distinct value in the column.
g = af['col'].value_counts()
# The distinct values go to column 'a', their counts to column '数量'.
dic_g = dict([('a', g.index), ('数量', g.values)])
df_g = pd.DataFrame(data=dic_g)
- 对value_counts()的结果按列求和,并计算每个值所占的比率
# Total of the count column. The frame holding the counts is df_g (the
# original summed `df.数量`, which refers to a different/undefined frame).
s = np.sum(df_g.数量)
# Percentage as text. NOTE: the [:6] slice truncates the decimal string
# rather than rounding it, e.g. 33.3333... -> '33.333%'.
df_g['比率'] = df_g.数量.apply(lambda x: str(x * 100 / s)[:6] + '%')
- 按条件筛选
# Keep only the rows whose 'a' value is NOT listed in delist.
af = af.loc[~af['a'].isin(delist)]
- drop_duplicates()去重
# Remove fully duplicated rows, keeping the first occurrence (the default).
colist = af.drop_duplicates(keep='first')
- 二维数组合成一维
# Each DataFrame row becomes one inner list.
colist = af.values.tolist()
print(colist)
# Flatten in a single pass. reduce(operator.add, ...) copies the growing
# list on every step (quadratic) and raises TypeError on an empty list.
colist = [item for row in colist for item in row]
print(colist)
[['Z96.101'], ['H25.901'], ['H25.900'], ['H52.701']]
['Z96.101', 'H25.901', 'H25.900', 'H52.701']
- 注意:若把dtype改为str(如 astype(str)),空值会变成字符串 'nan',此时dropna()不起作用
- 读取大量数据时,read_csv()需要加上 low_memory=False(如 pd.read_csv(path, low_memory=False)),避免分块推断类型时出现混合类型
- 拼接多个不同文件,结构一样的DataFrame
# Collect one DataFrame per CSV file and concatenate once at the end.
# DataFrame.append() was removed in pandas 2.0, and appending inside the
# loop re-copies all accumulated rows on every iteration.
frames = []
for root, _, files in os.walk(dir_path):
    for file in files:
        # Match on the extension only; a substring test would also pick up
        # names such as 'x.csv.bak'.
        if file.endswith('.csv'):
            data = pd.read_csv(
                os.path.join(root, file),
                encoding='gbk',
                low_memory=False,
                usecols=['主码', '其他码'],
            )
            frames.append(data)
# pd.concat() raises ValueError on an empty list, so fall back to an empty
# frame when no CSV file was found (same result as the original).
all_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
- 转置
# Swap rows and columns: the old column labels become the index and the old
# index becomes the column labels. DataFrame.T does this directly.
all_df = all_df.T
- 某列中某值(此处为 'False')的数量超过规定数量时,删除此列
# Snapshot the column labels first, because af is rebound inside the loop.
colist = list(af.columns)
for col in colist:
    # Number of cells equal to the string 'False' in this column.
    # .get(..., 0) avoids a KeyError when the value never occurs, and
    # indexing with col itself (not str(col)) also works for non-string labels.
    # NOTE(review): this matches the string 'False', not the boolean False —
    # presumably the data was read as text; confirm against the loader.
    num = af[col].value_counts().get('False', 0)
    # Drop the column when 'False' occurs more than 98% of btsize times.
    if num > btsize * 0.98:
        print(num, col)
        af = af.drop(columns=[col])
- list去重
# Dict keys are unique and keep insertion order, so building a dict keyed by
# the items removes duplicates while preserving first-seen order.
# newlist is rebound to that dict; klist holds the de-duplicated items.
newlist = dict.fromkeys(newlist)
klist = [*newlist]
- concat合并
axis=0 纵向拼接(沿行方向,在下方追加行)
axis=1 横向拼接(沿列方向,在右侧追加列)
# Stack a DataFrame built from t underneath out (axis 0 is the row axis).
out = pd.concat(objs=[out, pd.DataFrame(t)], axis='index')
- 获取有特定字符的列的行
# Keep the rows whose 'ICD' text contains a comma.
# na=False treats missing values as "no match"; without it the mask contains
# NaN and boolean indexing raises. regex=False makes the literal match explicit.
af = af[af['ICD'].str.contains(',', na=False, regex=False)]