基于Xgboost的文本分类——Python实现

1、背景
数据集大小为1万,其中训练集5000,测试集5000,共十类。
开发环境:Python3.6+Windows+PyCharm

2、前期准备
获取1万数据集,分词,去停用词并存入MySQL数据库。

3、从数据库读取数据

def train_corpus_generator():
    """Yield one batch of training rows per topic.

    Each yielded item is a list of ``[topic, item[1], item[2]]`` rows read
    from the module-level ``db`` (first ``config["trainset_num"]`` records
    of that topic).

    Bug fixed: the original special-cased the "11th" topic with
    ``return <value>`` — that branch was unreachable (there are 10 topics,
    so the counter peaked at 10), and even if reached, a generator's return
    value travels inside ``StopIteration.value`` and is silently discarded
    by consumers such as ``tuple(gen)``, losing the last batch. Yielding
    every batch and letting the generator finish naturally is the correct,
    equivalent behavior.
    """
    global db
    for topic in topics:
        print(topic)
        yield [[topic, item[1], item[2]]
               for item in db.query(topic, 0, config["trainset_num"])]

4、获取训练集与测试集数据并转换为要求格式

    # Load the training corpus: materialize the generator, strip stop
    # words, and encode the string labels as integers for XGBoost.
    train = train_corpus_generator()
    train = tuple(train)
    train_opinion, train_content = stop_words_ch(train)
    train_opinion = transLabel(train_opinion)  # convert class names to integer labels
    train_opinion = np.array(train_opinion)
    print("train data load finished")

    # Load the test corpus the same way.
    # NOTE(review): assumes test_corpus_generator yields the same
    # [topic, field1, field2] row shape as the training generator — confirm.
    test = test_corpus_generator()
    test = tuple(test)
    test_opinion, test_content = stop_words_ch(test)
    test_opinion = transLabel(test_opinion)  # convert class names to integer labels
    test_opinion = np.array(test_opinion)
    print("test data load finished")

转换字符串类别为数字

def transLabel(labels, topic_list=None):
    """Map string class labels to their integer index, in place.

    Args:
        labels: list of label strings; entries found in *topic_list* are
            replaced by their index, unknown entries are left untouched
            (same behavior as the original nested-loop version).
        topic_list: ordered class names; defaults to the module-level
            ``topics`` list, so existing single-argument call sites are
            unaffected.

    Returns:
        The same ``labels`` list, mutated in place.
    """
    if topic_list is None:
        topic_list = topics
    # One O(1) dict lookup per label replaces the original
    # O(len(labels) * len(topics)) nested scan.
    index = {name: pos for pos, name in enumerate(topic_list)}
    for i, lab in enumerate(labels):
        if lab in index:
            labels[i] = index[lab]
    return labels

5、使用sklearn计算TF-IDF

    # Compute train-set TF-IDF: fit the vocabulary (CountVectorizer) and
    # the IDF weights (TfidfTransformer) on the training corpus only.
    vectorizer = CountVectorizer()
    tf_idf_transformer = TfidfTransformer()
    tf_idf = tf_idf_transformer.fit_transform(vectorizer.fit_transform(train_content))
    # NOTE(review): .toarray() densifies the sparse matrix — memory-heavy
    # for large vocabularies; xgb.DMatrix also accepts scipy sparse input.
    weight = tf_idf.toarray()
    wordss = vectorizer.get_feature_names()  # feature (vocabulary) terms
    # NOTE(review): get_feature_names is removed in newer scikit-learn in
    # favor of get_feature_names_out — confirm the pinned sklearn version.
    print(tf_idf.shape)

    # Wrap the dense TF-IDF matrix and labels as an XGBoost DMatrix.
    dtrain = xgb.DMatrix(weight, label=train_opinion)



    # Compute test-set TF-IDF using transform (not fit_transform), so the
    # test data is projected into the vocabulary/IDF fitted on train data.
    test_tf_idf = tf_idf_transformer.transform(vectorizer.transform(test_content))
    test_weight = test_tf_idf.toarray()
    print(test_weight.shape)
    dtest = xgb.DMatrix(test_weight, label=test_opinion)

    # Persist the test DMatrix so a trained model can be evaluated later
    # without recomputing TF-IDF.
    dtest.save_binary('dtest.buffer')

6、设置参数+开始训练

def xgboo_train():
    """Train a 10-class XGBoost softmax model and persist it.

    Loads the train/test DMatrix pair via ``Get_data()``, trains with
    early stopping on the watchlist, saves the booster to 'train.model',
    and returns ``(dtest, bst)`` for evaluation.
    """
    # Hyper-parameters for a 10-class softmax objective.
    params = dict(
        booster='gbtree',
        objective='multi:softmax',
        num_class=10,       # number of classes, used with multi:softmax
        max_depth=20,
        eta=0.4,
        eval_metric='merror',
        silent=1,
    )

    dtrain, dtest = Get_data()
    watchlist = [(dtrain, 'train'), (dtest, 'test')]

    # Up to 200 boosting rounds, stopping early after 10 rounds without
    # improvement on the watchlist metric.
    print("开始训练!")
    bst = xgb.train(params, dtrain, 200, watchlist, early_stopping_rounds=10)

    # Persist the trained booster for later reuse.
    bst.save_model('train.model')
    return dtest, bst

7、测试数据,并计算准确度,召回率和F测度

def xgboo_test(dtest, bst):
    """Evaluate the booster on the test set and print per-class precision,
    recall and F-measure, plus the average precision across classes.

    Assumes the test set is ordered by class in contiguous blocks of
    ``config["testset_num"]`` samples, so the true label is derived from
    the sample index — TODO confirm against the data-loading code.

    Bugs fixed vs. the original:
      * FP/FN counters were swapped (``FP[real_type]`` / ``FN[pre_type]``):
        a misclassified sample is a false NEGATIVE for its true class and a
        false POSITIVE for the predicted class.
      * F-measure used TN in the denominator; F1 = 2TP / (2TP + FP + FN).
      * The average-precision line printed a partial average on every loop
        iteration; it now prints once, after all classes.
      * Divisions are guarded against empty classes (zero denominators).
    """
    print("开始测试!")
    preds = bst.predict(dtest)

    n_classes = 10
    TP = [0] * n_classes
    FP = [0] * n_classes
    FN = [0] * n_classes

    for i, pre in enumerate(preds):
        real_type = int(i / config["testset_num"])  # true label from block position
        pre_type = int(pre)
        print("real,pre", real_type, pre_type)
        if real_type == pre_type:
            TP[real_type] += 1   # correct prediction for this class
        else:
            FN[real_type] += 1   # the true class missed this sample
            FP[pre_type] += 1    # the predicted class gained a false alarm

    ACC_SUM = 0
    for i in range(n_classes):
        ACC = TP[i] / (TP[i] + FP[i]) if TP[i] + FP[i] else 0.0  # precision
        REC = TP[i] / (TP[i] + FN[i]) if TP[i] + FN[i] else 0.0  # recall
        f_denom = 2 * TP[i] + FP[i] + FN[i]
        FM = 2 * TP[i] / f_denom if f_denom else 0.0             # F1 measure
        ACC_SUM += ACC
        print(i, " 准确率:", ACC, " 召回率:", REC, " F-测度:", FM)
    # Print the macro-average precision once, after all classes.
    print("各类平均准确率为:", float('%.3f' % (ACC_SUM / n_classes)))

8、利用训练完毕的模型直接测试

    # Reload a previously trained booster and the cached test DMatrix,
    # then run the evaluation without retraining.
    bst = xgb.Booster(model_file='T_30.model')  # init model from saved file
    dtest = xgb.DMatrix('dtest.buffer')         # DMatrix saved by save_binary

    xgboo_test(dtest, bst)

半年前的草稿竟然一直忘记发出来,失策失策。

你可能感兴趣的:(数据挖掘)