Splitting Large Excel Data into Training and Test Sets (TensorFlow / PyTorch)

Splitting Excel data into a training set and a test set

The script below is fast and stays reasonably quick even on large spreadsheets. If you find it useful, please give it a like!

Code

import random
import xlrd

import numpy as np
from xlutils.copy import copy


data_path = 'C:\\Users\\gj7520\\Desktop\\pythob_files\\file_select\\data2\\train10000_2.xls'
train_file = 'C:\\Users\\gj7520\\Desktop\\pythob_files\\file_select\\data2\\train_split14000.xls'
test_file = 'C:\\Users\\gj7520\\Desktop\\pythob_files\\file_select\\data2\\test_split14000.xls'


def write_excel_xls_append(path, value):
    index = len(value)  # number of rows to append
    workbook = xlrd.open_workbook(path)  # open the existing workbook
    sheets = workbook.sheet_names()  # list all sheet names in the workbook
    worksheet = workbook.sheet_by_name(sheets[0])  # take the first sheet
    rows_old = worksheet.nrows  # number of rows already present in the sheet
    new_workbook = copy(workbook)  # convert the read-only xlrd object into a writable xlwt object
    new_worksheet = new_workbook.get_sheet(0)  # first sheet of the converted workbook

    for i in range(0, index):
        for j in range(0, len(value[i])):
            new_worksheet.write(i + rows_old, j, value[i][j])  # append below the existing rows, starting at row i + rows_old
    new_workbook.save(path)  # save the workbook


data = xlrd.open_workbook(data_path)
sheet = data.sheet_by_index(0)

# Randomly pick 10% of the row indices as the test set; the remaining rows form the training set.
# Using a set for test_rows keeps the membership checks below fast even with many rows.
all_rows = list(range(sheet.nrows))
test_rows = set(random.sample(all_rows, int(sheet.nrows * 0.1)))
train_rows = [row for row in all_rows if row not in test_rows]

mat_train = np.zeros([len(train_rows), sheet.ncols])
mat_test = np.zeros([len(test_rows), sheet.ncols])
print(mat_train.shape)
print(mat_test.shape)
num_test = 0
num_train = 0

# Copy every row of the source sheet into either the test matrix or the training matrix.
for row in range(sheet.nrows):
    if row in test_rows:
        mat_test[num_test] = sheet.row_values(row)
        num_test += 1
    else:
        mat_train[num_train] = sheet.row_values(row)
        num_train += 1

# Append the two matrices to the (already existing) output workbooks.
write_excel_xls_append(train_file, mat_train)
write_excel_xls_append(test_file, mat_test)
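
Note that write_excel_xls_append only appends to workbooks that already exist, so train_file and test_file must be valid .xls files with at least one sheet before the script runs. A minimal sketch for preparing two empty workbooks with xlwt (the helper name create_empty_xls and the sheet name 'data' are just examples, not part of the script above):

import xlwt

def create_empty_xls(path, sheet_name='data'):
    # Create a new .xls workbook with a single empty sheet so the append function can open it.
    workbook = xlwt.Workbook()
    workbook.add_sheet(sheet_name)
    workbook.save(path)

create_empty_xls(train_file)
create_empty_xls(test_file)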

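Once the split files are written, they can be read back and fed to TensorFlow or PyTorch. Below is a minimal PyTorch sketch, assuming every cell is numeric and the last column is the label (adjust the slicing to your own layout; load_xls_as_tensors is a hypothetical helper, not part of the script above):

import numpy as np
import torch
import xlrd
from torch.utils.data import DataLoader, TensorDataset

def load_xls_as_tensors(path):
    # Read the whole first sheet into a float32 numpy array, then wrap it as tensors.
    sheet = xlrd.open_workbook(path).sheet_by_index(0)
    rows = np.array([sheet.row_values(r) for r in range(sheet.nrows)], dtype=np.float32)
    features = torch.from_numpy(rows[:, :-1])  # all columns except the last
    labels = torch.from_numpy(rows[:, -1])     # last column as the label
    return TensorDataset(features, labels)

train_loader = DataLoader(load_xls_as_tensors(train_file), batch_size=64, shuffle=True)
test_loader = DataLoader(load_xls_as_tensors(test_file), batch_size=64)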