k折划分数据

采用 K 折交叉验证之前需要先划分好数据,这里记录一下各种 K 折划分数据的方法

文章目录

  • 0. 示例 csv
  • 1. KFold
  • 2. StratifiedKFold
  • 3. StratifiedGroupKFold

0. 示例 csv

构造一个 example.csv 为例,其中 image_name 为特征,patient_id 为分组,target 为标签。注意:下文 StratifiedGroupKFold 的实现会把 target 直接当作数组下标使用,因此 target 需要是 0 到(类别数 - 1)的整数。

1. KFold

KFold 最简单:直接使用 sklearn.model_selection.KFold 即可,用法与下文的 StratifiedKFold 类似(划分时不需要标签 y),这里不再展开。


2. StratifiedKFold

import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

# Number of folds to produce.
k = 3
df = pd.read_csv('example.csv')
# Append an empty column that will hold each row's validation-fold number.
df.insert(len(df.columns), 'StratifiedKFold', np.nan)

skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=2020)
# X is passed as an all-zero placeholder of the right length; the labels in
# `target` are what get stratified.
splits = skf.split(X=np.zeros(len(df)), y=df['target'])
for fold, (_, valid_ids) in enumerate(splits, start=1):
    # Folds are numbered from 1. Each row is tagged with the fold in which it
    # is part of the validation split.
    # NOTE(review): valid_ids come from split() while .loc is label-based;
    # this relies on df keeping the default 0..n-1 index from read_csv.
    df.loc[valid_ids, 'StratifiedKFold'] = fold

# Persist the fold assignment.
df.to_csv('example_skf.csv')

# Treat fold 1 as the validation set and print the label counts, then the
# patient_id counts, of the training rows and of the validation rows.
df = pd.read_csv('example_skf.csv', index_col=0)
in_valid = df['StratifiedKFold'] == 1
train_df = df[~in_valid]
valid_df = df[in_valid]
for column in ('target', 'patient_id'):
    print(train_df[column].value_counts())
    print(valid_df[column].value_counts())


3. StratifiedGroupKFold

参考:一则 Kaggle 的帖子

import random
import numpy as np
import pandas as pd
from collections import Counter, defaultdict


def eval_y_counts_per_fold(y_counts, fold):
    """Score how uneven the folds would be if a group were put into ``fold``.

    The group's label counts are temporarily added to ``fold``; for every
    label, the standard deviation across all folds of that label's count
    divided by its overall count is computed; then the counts are removed
    again, leaving ``y_counts_per_fold[fold]`` as it was on entry.

    Reads the module-level names ``y_counts_per_fold``, ``y_distr``,
    ``num_labels`` and ``k``.

    Params:
        y_counts: per-label counts of the candidate group
        fold: key of the fold the group is tentatively added to
    Returns:
        The mean, over all labels, of the per-label standard deviations.
    """
    # Tentatively place the group in this fold.
    y_counts_per_fold[fold] += y_counts
    spread_per_label = [
        np.std([y_counts_per_fold[idx][lbl] / y_distr[lbl] for idx in range(k)])
        for lbl in range(num_labels)
    ]
    # Undo the tentative placement.
    y_counts_per_fold[fold] -= y_counts

    return np.mean(spread_per_label)


def get_ids(all_groups, groups_per_fold):
    """
    Generator yielding, for each fold in turn, the training and test
    (validation) row ids.

    Reads the module-level names ``k`` (number of folds) and ``groups``
    (an object whose ``items()`` yields ``(row id, group)`` pairs).

    Params:
        all_groups (set): set of all group labels
        groups_per_fold (defaultdict): maps an integer fold number to the set
            of groups assigned to that fold
    Returns:
        train_ids: ids of rows whose group is in ``all_groups`` but not in the fold
        test_ids: ids of rows whose group is in the fold
    """
    for fold_idx in range(k):
        held_out = groups_per_fold[fold_idx]
        kept = all_groups - held_out

        train_ids, test_ids = [], []
        for row_id, row_group in groups.items():
            if row_group in kept:
                train_ids.append(row_id)
            if row_group in held_out:
                test_ids.append(row_id)

        yield train_ids, test_ids
        
        
if __name__ == '__main__':
    # NOTE: k, groups, num_labels, y_distr and y_counts_per_fold are read as
    # module-level names by eval_y_counts_per_fold / get_ids, so they must
    # keep exactly these names.
    k = 3  # number of folds
    df = pd.read_csv('example.csv')

    # Split the table into features (ids included), labels and group keys.
    X = df.drop(columns='target')
    y = df['target']
    groups = df['patient_id']

    # Number of distinct classes.
    num_labels = len(y.value_counts())

    # Tally the labels overall (y_distr) and per group (group_label_counts).
    # Labels are used directly as indices into a length-num_labels array.
    group_label_counts = defaultdict(lambda: np.zeros(num_labels))
    y_distr = Counter()
    for row_label, row_group in zip(y, groups):
        group_label_counts[row_group][row_label] += 1
        y_distr[row_label] += 1

    # Shuffle reproducibly, then visit the groups whose label counts are the
    # most spread out first. The sort is stable, so groups with equal spread
    # stay in their shuffled order.
    seed = 2020
    ordered_groups = list(group_label_counts.items())  # list of (group, counts) pairs
    random.Random(seed).shuffle(ordered_groups)
    ordered_groups.sort(key=lambda item: np.std(item[1]), reverse=True)

    y_counts_per_fold = defaultdict(lambda: np.zeros(num_labels))
    groups_per_fold = defaultdict(set)

    # Greedily put each group into the fold with the lowest score; min()
    # keeps the earliest fold when scores are equal.
    for group, group_counts in ordered_groups:
        best_fold = min(
            range(k),
            key=lambda fold_idx: eval_y_counts_per_fold(group_counts, fold_idx),
        )
        y_counts_per_fold[best_fold] += group_counts
        groups_per_fold[best_fold].add(group)

    # Append an empty column that will hold the fold number of every row.
    fold_column = 'StratifiedGroupKFold'
    df.insert(len(df.columns), fold_column, value=np.nan)

    # Tag every row with the (1-based) fold in which it is validation data.
    all_groups = set(groups)
    for fold, (_, valid_ids) in enumerate(get_ids(all_groups, groups_per_fold), start=1):
        df.loc[valid_ids, fold_column] = fold

    # Persist the fold assignment.
    df.to_csv('example_sgkf.csv')

    # Treat fold 1 as the validation set and print the label counts and the
    # patient_id counts of the training rows and of the validation rows.
    df = pd.read_csv('example_sgkf.csv', index_col=0)
    in_valid = df[fold_column] == 1
    train_df = df[~in_valid]
    valid_df = df[in_valid]
    print(train_df['target'].value_counts())
    print(valid_df['target'].value_counts())
    print()
    print(train_df['patient_id'].value_counts())
    print(valid_df['patient_id'].value_counts())


你可能感兴趣的:(杂七杂八)