使用 pandas 按照某一类别等比例划分 test 集

import pandas as pd


def _concat_parts(parts, template):
    """Concatenate the non-empty frames in *parts* with a fresh 0..n-1 index.

    Empty frames are skipped so they cannot influence the resulting dtypes.
    When nothing is left, an empty frame carrying *template*'s columns is
    returned, so the caller still gets a well-formed table.
    """
    non_empty = [part for part in parts if not part.empty]
    if not non_empty:
        return template.iloc[:0].reset_index(drop=True)
    return pd.concat(non_empty, ignore_index=True)


def stratified_split(df, label_col, flag_col, flag_values=('Y', 'N'),
                     test_frac=0.2, random_state=None):
    """Split *df* into ``(test, train)``, sampling every stratum equally.

    A stratum is the set of rows sharing one value of *label_col* and one of
    the *flag_values* in *flag_col*.  From each stratum
    ``int(test_frac * len(stratum))`` rows are drawn at random for the test
    set (so strata smaller than ``1 / test_frac`` contribute no test rows);
    the remaining rows of the stratum go to the train set.

    Rows whose *flag_col* value is not listed in *flag_values* end up in
    neither output.

    Args:
        df: Source table.
        label_col: Column whose distinct values define the categories.
        flag_col: Second column used to refine each category.
        flag_values: Values of *flag_col* to split on, in output order.
        test_frac: Fraction of each stratum sent to the test set.
        random_state: Forwarded to ``DataFrame.sample``; pass a seed to make
            the split reproducible.  ``None`` keeps the draw random.

    Returns:
        Tuple ``(test, train)`` of DataFrames, each with a fresh RangeIndex.
    """
    test_parts = []
    train_parts = []
    for label in df[label_col].unique().tolist():
        for flag in flag_values:
            stratum = df[(df[label_col] == label) & (df[flag_col] == flag)]
            picked = stratum.sample(int(test_frac * len(stratum)),
                                    random_state=random_state)
            test_parts.append(picked)
            # Drop by index label: `picked` keeps the labels of `stratum`,
            # so label-based removal yields exactly the unsampled rows.
            train_parts.append(stratum.drop(picked.index))
    # Concatenate once at the end instead of growing the frames in the loop.
    return _concat_parts(test_parts, df), _concat_parts(train_parts, df)


if __name__ == "__main__":
    df = pd.read_csv('data4000.csv')   # load the data
    # Draw 20% of every (label, Y/N) stratum as the test set.
    test, train = stratified_split(df, '标签', '是否', flag_values=('Y', 'N'))

    # Save as tab-separated text.
    test.to_csv('test.tsv', sep='\t', index=False)
    train.to_csv('train.tsv', sep='\t', index=False)

>>> len(df), len(test),len(train)
(4946, 972, 3974)

你可能感兴趣的:(pandas用法)