sklearn has a built-in function for splitting a dataset, for example:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
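If you also need a validation set, one common approach (a sketch, not part of the original snippet) is to call train_test_split twice; the ratios below are assumptions chosen to match the 60/20/20 split used further down:
from sklearn.model_selection import train_test_split
# First hold out 20% of the samples as the test set.
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Then split the remaining 80% so that 0.25 * 0.8 = 0.2 of the original data becomes the validation set.
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.25, random_state=42)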
If you want to save the splits to files instead, you can use the code below:
import random

def split(fname, train_ratio, val_ratio):
    lines = fname.readlines()
    n_total = len(lines)  # total number of lines in the dataset
    train_offset = int(n_total * train_ratio)
    val_offset = int(n_total * (train_ratio + val_ratio))
    random.shuffle(lines)  # shuffle the lines in place
    train_data = open('train.txt.bio', 'wb')
    val_data = open('val.txt.bio', 'wb')
    test_data = open('test.txt.bio', 'wb')
    for i, line in enumerate(lines):
        if i < train_offset:
            train_data.write(line)
        elif i < val_offset:
            val_data.write(line)
        else:
            test_data.write(line)
    train_data.close()
    val_data.close()
    test_data.close()

if __name__ == "__main__":
    fname = open('en/en_total.txt.bio', "rb")
    split(fname, train_ratio=0.6, val_ratio=0.2)
    fname.close()
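As a quick sanity check (a small sketch assuming the three output files produced above), you can confirm that the line counts of the splits add up to the original file:
for path in ('train.txt.bio', 'val.txt.bio', 'test.txt.bio'):
    with open(path, 'rb') as f:
        print(path, sum(1 for _ in f))  # number of lines written to each split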