Contents
1. Keyword extraction based on TF
2. Converting text to vectors by term frequency
3. Important-feature selection based on a tree model
4. Building the LightGBM model
5. Complete implementation
6. Classification results
1. Keyword extraction based on TF

Use TF (term frequency) to extract keywords from the training set clean_data_train, and keep the topK highest-frequency words as feature words, here topK = 10000.
# Count term frequencies over the training set and compute each word's TF value
def words_tf():
    train_data = pd.read_csv('data/clean_data_train.csv', sep=',', names=['contents', 'labels']).astype(str)
    sentence_list = []
    length = len(train_data)
    for i in range(length):
        sentence_list.append(str(train_data['contents'][i]).split())
    # Overall term-frequency count
    doc_frequency = defaultdict(int)
    for word_list in sentence_list:
        for word in word_list:
            doc_frequency[word] += 1
    # Compute each word's TF value (hoist the total out of the loop: re-summing inside it would be quadratic)
    total = sum(doc_frequency.values())
    word_tf = {}  # stores each word's TF value
    for word in doc_frequency:
        word_tf[word] = doc_frequency[word] / total
    words_tf = sorted(word_tf.items(), key=lambda x: x[1], reverse=True)
    return words_tf[:10000]
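As a quick sanity check, TF here is simply a word's count divided by the total token count of the corpus. A tiny sketch with made-up sentences (not from the dataset):

# Toy example: TF of each word over a two-sentence corpus
from collections import defaultdict

toy = [['good', 'price', 'good'], ['bad', 'price']]
freq = defaultdict(int)
for sent in toy:
    for w in sent:
        freq[w] += 1
total = sum(freq.values())  # 5 tokens in total
tf = {w: c / total for w, c in freq.items()}
print(tf)  # {'good': 0.4, 'price': 0.4, 'bad': 0.2}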
2. Converting text to vectors by term frequency

Using the extracted keywords and their TF weights, convert the training set clean_data_train and the test set clean_data_test into vectors that serve as the model's input.
# Convert tokenized documents to vectors according to term frequency
def word2vec(keywords_tf, doc_sentence):
    keyword_weight = dict(keywords_tf)                            # keyword -> TF weight
    keyword_index = {w: i for i, w in enumerate(keyword_weight)}  # keyword -> vector position
    docvec_list = []
    for sentence in doc_sentence:
        docvec = [0] * len(keywords_tf)
        for word in sentence:
            if word in keyword_index:  # dict lookup instead of an O(n) list scan per token
                docvec[keyword_index[word]] = keyword_weight[word]
        docvec_list.append(docvec)
    return docvec_list
# Convert the training set and the test set to document vectors
def doc_vec(x_train, x_test):
    keywords_tf = words_tf()  # TF-ranked keywords
    # Vectorize the training set
    train_data_list = [str(doc).split() for doc in x_train]
    train_docvec_list = word2vec(keywords_tf, train_data_list)
    # Vectorize the test set
    test_data_list = [str(doc).split() for doc in x_test]
    test_docvec_list = word2vec(keywords_tf, test_data_list)
    return train_docvec_list, test_docvec_list
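The dense lists above grow as n_documents x 10000, which can strain memory on large corpora. As an alternative (not the original method), the same presence-times-TF weighting can be built sparsely with scikit-learn's CountVectorizer pinned to the same vocabulary; lgb.Dataset and the sklearn estimators used below both accept scipy sparse matrices:

# Sketch: sparse equivalent of word2vec(), assuming scikit-learn and scipy are available
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

def doc_vec_sparse(keywords_tf, docs):
    keywords = [w for w, _ in keywords_tf]
    weights = np.array([tf for _, tf in keywords_tf])
    # binary=True marks presence/absence, matching the original 0-or-weight encoding
    vec = CountVectorizer(vocabulary=keywords, binary=True, lowercase=False,
                          tokenizer=str.split, token_pattern=None)
    presence = vec.transform(docs)             # sparse 0/1 matrix, columns in keyword order
    return presence.multiply(weights).tocsr()  # scale each column by its TF weight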
3. Important-feature selection based on a tree model

Train a tree model on the samples and select the important features out of the 10000, keeping those whose importance exceeds 1.5 times the mean importance, which yields features = 986.
# Use SelectFromModel together with ExtraTreesClassifier to compute feature
# importances and select features above an importance threshold
clf_model = ExtraTreesClassifier(n_estimators=250, random_state=0)
# clf_model = RandomForestClassifier(n_estimators=250, random_state=0)
clf_model.fit(x_train, y_train)
# Feature weight of each word; the higher the value, the more important the feature
importances = clf_model.feature_importances_
'''
# Store each word and its weight in a dict and write it to a file
# (words_list is assumed to be the (word, tf) list returned by words_tf())
feature_words_dic = {}
for i in range(len(words_list)):
    feature_words_dic[words_list[i][0]] = importances[i]
# Sort the dict by weight in descending order
words_info_dic_sort = sorted(feature_words_dic.items(), key=lambda x: x[1], reverse=True)
# Write the weights of the top 2000 words to a file
key_words_importance = dict(words_info_dic_sort[:2000])
with open('data/key_words_importance', 'w') as f:
    f.write(str(key_words_importance))
'''
# Keep features whose importance exceeds 1.5 times the mean importance
model = SelectFromModel(clf_model, threshold='1.5*mean', prefit=True)
x_train_new = model.transform(x_train)  # selected features of the training set
x_test_new = model.transform(x_test)    # selected features of the test set
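The '1.5*mean' threshold string is evaluated by SelectFromModel against the mean of feature_importances_. A minimal equivalent check (the figure of 986 comes from the author's data and will vary with the data and random_state):

import numpy as np

threshold = 1.5 * np.mean(importances)
print(int(np.sum(importances >= threshold)))  # number of selected features; 986 on the author's data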
4. Building the LightGBM model

LightGBM parameter settings: num_boost_round = 2000, max_depth = 6, learning_rate = 0.1; the remaining parameters keep the model defaults.
# Build Dataset objects in LightGBM's native format
lgb_train = lgb.Dataset(x_train_new, y_train)
lgb_val = lgb.Dataset(x_test_new, y_test, reference=lgb_train)
# Build the LightGBM model
params = {'max_depth': 6, 'min_data_in_leaf': 20, 'num_leaves': 35, 'learning_rate': 0.1, 'lambda_l1': 0.1,
          'lambda_l2': 0.2, 'objective': 'multiclass', 'num_class': 3, 'verbose': -1}
# Number of boosting rounds; the default is 100, usually set higher
num_boost_round = 2000
# Train the LightGBM model (the verbose_eval argument requires LightGBM < 4.0; newer versions use callbacks, see below)
gbm = lgb.train(params, lgb_train, num_boost_round, verbose_eval=100, valid_sets=lgb_val)
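With 2000 boosting rounds against a held-out validation set, early stopping pairs naturally with this setup. A minimal sketch using the callback API of LightGBM 3.3+ (not part of the original script; in LightGBM 4.x, where verbose_eval no longer exists, this is the required form):

# Sketch: same training call with early stopping and periodic logging via callbacks
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=2000,
    valid_sets=[lgb_val],
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),  # stop if the validation loss hasn't improved in 100 rounds
        lgb.log_evaluation(period=100),           # log the validation metric every 100 rounds
    ],
)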
5. Complete implementation

# coding=utf-8
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
import lightgbm as lgb
from sklearn import metrics
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
# Count term frequencies over the training set and compute each word's TF value
def words_tf():
    train_data = pd.read_csv('data/clean_data_train.csv', sep=',', names=['contents', 'labels']).astype(str)
    sentence_list = []
    length = len(train_data)
    for i in range(length):
        sentence_list.append(str(train_data['contents'][i]).split())
    # Overall term-frequency count
    doc_frequency = defaultdict(int)
    for word_list in sentence_list:
        for word in word_list:
            doc_frequency[word] += 1
    # Compute each word's TF value (hoist the total out of the loop: re-summing inside it would be quadratic)
    total = sum(doc_frequency.values())
    word_tf = {}  # stores each word's TF value
    for word in doc_frequency:
        word_tf[word] = doc_frequency[word] / total
    words_tf = sorted(word_tf.items(), key=lambda x: x[1], reverse=True)
    return words_tf[:10000]
# Convert tokenized documents to vectors according to term frequency
def word2vec(keywords_tf, doc_sentence):
    keyword_weight = dict(keywords_tf)                            # keyword -> TF weight
    keyword_index = {w: i for i, w in enumerate(keyword_weight)}  # keyword -> vector position
    docvec_list = []
    for sentence in doc_sentence:
        docvec = [0] * len(keywords_tf)
        for word in sentence:
            if word in keyword_index:  # dict lookup instead of an O(n) list scan per token
                docvec[keyword_index[word]] = keyword_weight[word]
        docvec_list.append(docvec)
    return docvec_list
# Convert the training set and the test set to document vectors
def doc_vec(x_train, x_test):
    keywords_tf = words_tf()  # TF-ranked keywords
    # Vectorize the training set
    train_data_list = [str(doc).split() for doc in x_train]
    train_docvec_list = word2vec(keywords_tf, train_data_list)
    # Vectorize the test set
    test_data_list = [str(doc).split() for doc in x_test]
    test_docvec_list = word2vec(keywords_tf, test_data_list)
    return train_docvec_list, test_docvec_list
if __name__ == '__main__':
    train_data = pd.read_csv('data/clean_data_train.csv', sep=',', names=['contents', 'labels']).astype(str)
    x_train, x_test, y_train, y_test = train_test_split(train_data['contents'], train_data['labels'], test_size=0.05)
    x_train = np.array(x_train)
    x_test = np.array(x_test)
    y_train = np.array(y_train.astype(int))
    y_test = np.array(y_test.astype(int))
    x_train, x_test = doc_vec(x_train, x_test)  # vectorize the training and test sets
    x_train, y_train = shuffle(x_train, y_train, random_state=0)  # shuffle the training order
    # Use SelectFromModel together with ExtraTreesClassifier to compute feature
    # importances and select features above an importance threshold
    clf_model = ExtraTreesClassifier(n_estimators=250, random_state=0)
    # clf_model = RandomForestClassifier(n_estimators=250, random_state=0)
    clf_model.fit(x_train, y_train)
    # Feature weight of each word; the higher the value, the more important the feature
    importances = clf_model.feature_importances_
    '''
    # Store each word and its weight in a dict and write it to a file
    # (words_list is assumed to be the (word, tf) list returned by words_tf())
    feature_words_dic = {}
    for i in range(len(words_list)):
        feature_words_dic[words_list[i][0]] = importances[i]
    # Sort the dict by weight in descending order
    words_info_dic_sort = sorted(feature_words_dic.items(), key=lambda x: x[1], reverse=True)
    # Write the weights of the top 2000 words to a file
    key_words_importance = dict(words_info_dic_sort[:2000])
    with open('data/key_words_importance', 'w') as f:
        f.write(str(key_words_importance))
    '''
    # Keep features whose importance exceeds 1.5 times the mean importance
    model = SelectFromModel(clf_model, threshold='1.5*mean', prefit=True)
    x_train_new = model.transform(x_train)  # selected features of the training set
    x_test_new = model.transform(x_test)    # selected features of the test set
    print(x_train_new.shape)
    print(x_test_new.shape)
    # Build Dataset objects in LightGBM's native format
    lgb_train = lgb.Dataset(x_train_new, y_train)
    lgb_val = lgb.Dataset(x_test_new, y_test, reference=lgb_train)
    # Build the LightGBM model
    params = {'max_depth': 6, 'min_data_in_leaf': 20, 'num_leaves': 35, 'learning_rate': 0.1, 'lambda_l1': 0.1,
              'lambda_l2': 0.2, 'objective': 'multiclass', 'num_class': 3, 'verbose': -1}
    # Number of boosting rounds; the default is 100, usually set higher
    num_boost_round = 2000
    # Train the LightGBM model (the verbose_eval argument requires LightGBM < 4.0)
    gbm = lgb.train(params, lgb_train, num_boost_round, verbose_eval=100, valid_sets=lgb_val)
    # Save the model to a file
    # gbm.save_model('data/lightGBM_model')
    # Predict on the test set
    result = gbm.predict(x_test_new, num_iteration=gbm.best_iteration)
    y_predict = np.argmax(result, axis=1)  # take the label with the highest probability
    label_all = ['negative', 'neutral', 'positive']
    confusion_mat = metrics.confusion_matrix(y_test, y_predict)
    df = pd.DataFrame(confusion_mat, columns=label_all)
    df.index = label_all
    print('accuracy:', metrics.accuracy_score(y_test, y_predict))
    print('confusion_matrix:', df)
    print('classification report:', metrics.classification_report(y_test, y_predict))
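If the commented-out save_model call is enabled, the trained booster can be reloaded later without retraining, provided new text goes through the identical keyword and feature-selection pipeline. A minimal sketch (the file path mirrors the commented line above):

# Sketch: persist the booster and reload it for inference
gbm.save_model('data/lightGBM_model')                   # write the trained model to disk
loaded = lgb.Booster(model_file='data/lightGBM_model')  # reload without retraining
probs = loaded.predict(x_test_new)                      # probability matrix of shape (n_samples, 3)
labels = np.argmax(probs, axis=1)                       # back to integer labels 0/1/2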