1. Background
The dataset contains 10,000 documents in ten classes, split evenly into 5,000 for training and 5,000 for testing.
Development environment: Python 3.6 + Windows + PyCharm.
2. Preparation
Collect the 10,000 documents, run word segmentation, remove stop words, and store the results in a MySQL database.
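The preprocessing code is not part of this post. As a rough sketch only, assuming jieba for word segmentation, a plain-text stop-word list and pymysql for storage (the table and column names below are made up for illustration):

import jieba
import pymysql

# hypothetical stop-word list, one word per line
with open("stopwords.txt", encoding="utf-8") as f:
    stopwords = set(line.strip() for line in f)

conn = pymysql.connect(host="localhost", user="root", password="***",
                       db="corpus", charset="utf8mb4")

def preprocess_and_store(topic, doc_id, raw_text):
    # segment the text, drop stop words, and store a space-separated string
    words = [w for w in jieba.cut(raw_text) if w.strip() and w not in stopwords]
    with conn.cursor() as cur:
        cur.execute("INSERT INTO corpus (topic, doc_id, content) VALUES (%s, %s, %s)",
                    (topic, doc_id, " ".join(words)))
    conn.commit()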
3. Reading the data from the database
def train_corpus_generator():
    global db
    # yield the training documents one class at a time;
    # the generator stops by itself after the last class, so no manual termination is needed
    for topic in topics:
        print(topic)
        # each record: [class label, document id, preprocessed content]
        yield [[topic, item[1], item[2]]
               for item in db.query(topic, 0, config["trainset_num"])]
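Here db, topics and config are globals defined elsewhere. db.query(topic, start, count) is assumed to return rows whose second and third columns are the document id and the preprocessed content; a minimal wrapper around pymysql could look like this (table and column names are again assumptions):

import pymysql

class DB:
    def __init__(self):
        self.conn = pymysql.connect(host="localhost", user="root", password="***",
                                    db="corpus", charset="utf8mb4")

    def query(self, topic, start, count):
        # fetch `count` preprocessed documents of one class, starting at offset `start`
        with self.conn.cursor() as cur:
            cur.execute("SELECT topic, doc_id, content FROM corpus "
                        "WHERE topic = %s LIMIT %s, %s", (topic, start, count))
            return cur.fetchall()

db = DB()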
4. Loading the training and test data and converting it to the required format
import numpy as np

# load the training set
train = train_corpus_generator()
train = tuple(train)
train_opinion, train_content = stop_words_ch(train)
train_opinion = transLabel(train_opinion)   # convert class names to numeric labels
train_opinion = np.array(train_opinion)
print("train data load finished")
# load the test set (test_corpus_generator mirrors train_corpus_generator)
test = test_corpus_generator()
test = tuple(test)
test_opinion, test_content = stop_words_ch(test)
test_opinion = transLabel(test_opinion)     # convert class names to numeric labels
test_opinion = np.array(test_opinion)
print("test data load finished")
Converting the string class names to numeric labels:
def transLabel(labels):
    # replace each class name by its index in the global `topics` list
    for i in range(len(labels)):
        for j in range(len(topics)):
            if labels[i] == topics[j]:
                labels[i] = j
    return labels
5. Computing TF-IDF with sklearn
import xgboost as xgb
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# compute TF-IDF for the training set
vectorizer = CountVectorizer()
tf_idf_transformer = TfidfTransformer()
tf_idf = tf_idf_transformer.fit_transform(vectorizer.fit_transform(train_content))
weight = tf_idf.toarray()
words = vectorizer.get_feature_names()   # the vocabulary (feature names)
print(tf_idf.shape)
# wrap the training data in XGBoost's DMatrix format
dtrain = xgb.DMatrix(weight, label=train_opinion)
# compute TF-IDF for the test set, reusing the vocabulary and IDF fitted on the training set
test_tf_idf = tf_idf_transformer.transform(vectorizer.transform(test_content))
test_weight = test_tf_idf.toarray()
print(test_weight.shape)
dtest = xgb.DMatrix(test_weight, label=test_opinion)
# save the test DMatrix so a trained model can be evaluated later without rebuilding it
dtest.save_binary('dtest.buffer')
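toarray() materialises a dense 5000 x vocabulary matrix, which can get large. xgb.DMatrix also accepts scipy.sparse matrices directly, so the densification step could be skipped if memory becomes a problem:

# the TF-IDF matrices are already sparse; DMatrix can consume them as-is
dtrain = xgb.DMatrix(tf_idf, label=train_opinion)
dtest = xgb.DMatrix(test_tf_idf, label=test_opinion)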
6. Setting the parameters and training
def xgboo_train():
    # training parameters
    param = {
        'booster': 'gbtree',
        'objective': 'multi:softmax',
        'num_class': 10,        # number of classes, required with multi:softmax
        'max_depth': 20,
        'eta': 0.4,
        'eval_metric': 'merror',
        'silent': 1,
    }
    dtrain, dtest = Get_data()
    evallist = [(dtrain, 'train'), (dtest, 'test')]
    num_round = 200
    # start training
    print("Start training!")
    bst = xgb.train(param, dtrain, num_round, evallist, early_stopping_rounds=10)
    # save the trained model
    bst.save_model('train.model')
    return dtest, bst
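Get_data() is not shown in the post; it is assumed to simply hand back the two DMatrix objects built in section 5, e.g.:

def Get_data():
    # assumed helper: returns the DMatrix objects built in section 5
    return dtrain, dtest

With early_stopping_rounds=10, training stops once the test merror has not improved for 10 consecutive rounds, and the returned booster carries the best round in bst.best_iteration.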
7. Evaluating on the test set: precision, recall and F-measure
def xgboo_test(dtest, bst):
    print("Start testing!")
    preds = bst.predict(dtest)
    TP = [0 for n in range(10)]
    FP = [0 for n in range(10)]
    FN = [0 for n in range(10)]
    for i, pre in enumerate(preds):
        # the test samples are stored class by class, so the true label
        # can be recovered from the sample index
        real_type = int(i / config["testset_num"])
        pre_type = int(pre)
        print("real,pre", real_type, pre_type)
        if real_type == pre_type:
            TP[real_type] += 1   # predicted class equals the true class
        else:
            FN[real_type] += 1   # a sample of this class was predicted as another class
            FP[pre_type] += 1    # a sample of another class was predicted as this class
    PREC_SUM = 0
    for i in range(0, 10):
        PREC = TP[i] / (TP[i] + FP[i])                  # precision
        REC = TP[i] / (TP[i] + FN[i])                   # recall
        FM = 2 * TP[i] / (2 * TP[i] + FP[i] + FN[i])    # F1-measure
        PREC_SUM += PREC
        print(i, " precision:", PREC, " recall:", REC, " F-measure:", FM)
    print("Average precision over all classes:", float('%.3f' % (PREC_SUM / 10)))
8. Testing directly with a previously trained model
bst = xgb.Booster(model_file='train.model')   # load the model saved during training
dtest = xgb.DMatrix('dtest.buffer')           # reload the saved test DMatrix
xgboo_test(dtest, bst)
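If the model is trained in the same run instead of loaded from disk, the whole pipeline reduces to:

dtest, bst = xgboo_train()
xgboo_test(dtest, bst)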
This draft has been sitting around for half a year and I somehow forgot to publish it... my mistake.