Generating the training and test sets for a text classification problem

import os
import random

import jieba

# Text processing, i.e. the sample-generation step
def text_processing(folder_path, test_size=0.2):
    folder_list = os.listdir(folder_path)
    data_list = []
    class_list = []

    # Walk through each category folder
    for folder in folder_list:
        new_folder_path = os.path.join(folder_path, folder)
        files = os.listdir(new_folder_path)
        # Read the files in this folder
        j = 1
        for file in files:
            if j > 100:  # only take 100 sample files per folder to keep memory usage down; remove this check to use them all
                break
            with open(os.path.join(new_folder_path, file), 'r',encoding='UTF-8') as fp:
                raw = fp.read()
            ## jieba Chinese word segmentation, as seen everywhere
            #jieba.enable_parallel(4)  # enable parallel segmentation; the argument is the number of processes (not supported on Windows)
            word_cut = jieba.cut(raw, cut_all=False)  # accurate mode; returns an iterable generator
            word_list = list(word_cut)  # convert the generator to a list; each word is a unicode string

            #jieba.disable_parallel()  # disable parallel segmentation

            data_list.append(word_list)  # accumulate the segmented documents

            # Look up this folder's category label in ClassList.txt
            with open(os.path.join(os.path.dirname(folder_path), 'ClassList.txt'), 'r', encoding='UTF-8') as aa:
                for aaa in aa.readlines():
                    u1, u2 = aaa.strip().split()
                    if u1 == folder:
                        class_list.append(u2)
                        break
            j += 1
    ## Crude split into a training set and a test set
    data_class_list = list(zip(data_list, class_list))
    random.shuffle(data_class_list)
    index = int(len(data_class_list) * test_size) + 1
    train_list = data_class_list[index:]
    test_list = data_class_list[:index]
    train_data_list, train_class_list = zip(*train_list)
    test_data_list, test_class_list = zip(*test_list)

    # Alternatively, scikit-learn can do the split for you:
    # train_data_list, test_data_list, train_class_list, test_class_list = \
    #     sklearn.model_selection.train_test_split(data_list, class_list, test_size=test_size)

    return train_data_list, train_class_list, test_data_list, test_class_list
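For orientation, the function assumes a directory layout along the following lines; every folder and file name shown here is purely illustrative. Each subfolder of folder_path holds the raw text files of one category, and ClassList.txt lives in the parent directory of folder_path, with one whitespace-separated line per folder mapping the folder name to its category label.

data/
├── ClassList.txt      # each line: "<folder name> <category label>", e.g. "C000008 Finance"
└── Sample/            # pass this directory as folder_path
    ├── C000008/
    │   ├── 10.txt
    │   └── 11.txt
    └── C000010/
        ├── 10.txt
        └── 11.txt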

class_list and data_list must contain the same number of entries, so every time a file is processed (all of its text segmented into words), ClassList.txt is scanned once to find the category of the folder the file belongs to, and that label is appended to class_list. The two lists are then zipped together into data_class_list, which is shuffled and sliced to yield the training set train_list and the test set test_list.
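As a minimal, purely illustrative sketch of how the function might be called: the corpus path below is an assumption, and the unpacking relies on the return statement added at the end of text_processing.

if __name__ == '__main__':
    # hypothetical corpus directory; replace with the location of your own data
    folder_path = './data/Sample'
    train_data, train_classes, test_data, test_classes = text_processing(folder_path, test_size=0.2)
    print('training samples:', len(train_data))
    print('test samples:', len(test_data))
    print('label of first training sample:', train_classes[0])
    print('first 20 tokens of first training sample:', train_data[0][:20])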
