Python: randomly split a dataset into training, validation, and test sets by a given ratio

sklearn provides a built-in function for splitting datasets, for example:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
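
Note that a single call only produces a train/test split. A minimal sketch of a 60/20/20 three-way split (assuming X and y are already loaded; the random_state value is an arbitrary choice) is to call train_test_split twice:

from sklearn.model_selection import train_test_split

# First hold out 20% of the data as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Then take 25% of the remaining 80% as validation (0.25 * 0.8 = 0.2 overall),
# leaving 60% of the original data for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42)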

If you also want to save the splits to files, you can use the code below:

import random

def split(fname, train_ratio, val_ratio):
    lines = fname.readlines()
    n_total = len(lines)          # total number of lines in the dataset

    train_offset = int(n_total * train_ratio)
    val_offset = int(n_total * (train_ratio + val_ratio))
    random.shuffle(lines)         # shuffle the lines in place

    train_data = open('train.txt.bio', 'wb')
    val_data = open('val.txt.bio', 'wb')
    test_data = open('test.txt.bio', 'wb')


    for i, line in enumerate(lines):
        if i < train_offset:
            train_data.write(line)
        elif i < val_offset:
            val_data.write(line)
        else:
            test_data.write(line)

    train_data.close()
    val_data.close()
    test_data.close()


if __name__ == "__main__":
   fname = open('en/en_total.txt.bio', "rb")
   split(fname, train_ratio = 0.6, var_ratio = 0.2)
   fname.close()
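
If you need the same split on every run, you can seed Python's random module before calling split; a short usage sketch (the seed value 42 is an arbitrary choice):

import random

random.seed(42)  # fixes the shuffle order across runs
with open('en/en_total.txt.bio', 'rb') as fname:
    split(fname, train_ratio=0.6, val_ratio=0.2)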
