In this post I summarize what I have learned and use Sklearn, gensim, jieba, and related tools to run a text classification experiment. Corrections are welcome if you spot any mistakes.
import joblib
import jieba
import gensim
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix
# Load stop words
def stop_words(path='/Users/apple/PycharmProjects/机器学习项目实训/外卖/stopwords.txt'):
    with open(path, 'r', encoding='gbk', errors='ignore') as f:
        return [l.strip() for l in f]
# Text preprocessing
def text_preprocessing():
    # Load stop words
    stopwords = stop_words()
    # Read the raw data file
    df = pd.read_csv("/Users/apple/PycharmProjects/机器学习项目实训/外卖/waimai_10k.csv")
    # Tokenize with jieba and filter out stop words
    df["review"] = df["review"].map(lambda x: " ".join([i for i in jieba.cut(x) if i not in stopwords]))
    # Save the processed text
    df.to_csv("/Users/apple/PycharmProjects/机器学习项目实训/外卖/waimai.csv", index=False, header=False,
              columns=["label", "review"])
# Split the dataset
def partition_data_set():
    data = pd.read_csv("/Users/apple/PycharmProjects/机器学习项目实训/外卖/waimai.csv", header=None)  # load the data
    data: pd.DataFrame = data.sample(frac=1.0)  # shuffle the rows
    rows, cols = data.shape
    # train : val : test = 7 : 1 : 2
    split_index_1 = int(rows * 0.2)
    split_index_2 = int(rows * 0.3)
    # split the data
    data_test: pd.DataFrame = data.iloc[0:split_index_1, :]
    data_validate: pd.DataFrame = data.iloc[split_index_1:split_index_2, :]
    data_train: pd.DataFrame = data.iloc[split_index_2:rows, :]
    # save the splits
    data_test.to_csv("test.csv", header=None, index=False)
    data_validate.to_csv("validate.csv", header=None, index=False)
    data_train.to_csv("train.csv", header=None, index=False)
    print("Dataset split finished")
# Get the labels and the corresponding reviews from a data file (as ndarrays)
def get_label_And_review(file):
    data = pd.read_csv(file, header=None)
    return data[0].values, data[1].values
'''
=============================================================
'''
# Train word vectors
def get_wordvec(data_review, filedesignation):
    # Word2Vec expects an iterable of token lists, so split the space-separated reviews back into tokens
    sentences = [str(review).split() for review in data_review]
    model = gensim.models.Word2Vec(sentences, vector_size=128, workers=4, min_count=0)
    model.wv.save_word2vec_format(f"word_vec_{filedesignation}.txt", binary=False)
    return model
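# Side note (optional sketch, not part of the original pipeline): because the vectors are saved in the
# plain word2vec text format, they can later be reloaded without retraining, e.g. with gensim 4.x:
#     kv = gensim.models.KeyedVectors.load_word2vec_format("word_vec_train_review.txt", binary=False)
#     print(kv.most_similar("好吃", topn=5))  # "好吃" is only an illustrative query word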
# Sentence vector: adding word vectors also (roughly) adds their semantics, so we sum the vectors of
# all words in a sentence to represent the whole sentence. The summing function is defined as follows:
def total_vector(words, model):
    vec = np.zeros(128).reshape((1, 128))
    for word in words:
        try:
            vec += model.wv[word].reshape((1, 128))
        except KeyError:  # skip words that are not in the Word2Vec vocabulary
            continue
    return vec
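# Usage sketch (hypothetical sentence, assuming model_train below has been trained): total_vector takes
# a list of tokens and returns a (1, 128) array, e.g.
#     sent_vec = total_vector("味道 不错 送餐 很快".split(), model_train)   # sent_vec.shape == (1, 128)
# Dividing the sum by len(words) would give an averaged sentence vector instead, a common variant.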
if __name__ == '__main__':
    text_preprocessing()
    partition_data_set()
    all_label, all_review = get_label_And_review("waimai.csv")
    train_label, train_review = get_label_And_review("train.csv")
    test_label, test_review = get_label_And_review("test.csv")
    validate_label, validate_review = get_label_And_review("validate.csv")
    model_all = get_wordvec(all_review, "all_review")
    model_train = get_wordvec(train_review, "train_review")
    # sentence vectors for every split are built with the model trained on the training reviews,
    # so all three splits live in the same feature space
    train_vec = np.concatenate([total_vector(str(words).split(), model_train) for words in train_review])
    model_test = get_wordvec(test_review, "test_review")
    test_vec = np.concatenate([total_vector(str(words).split(), model_train) for words in test_review])
    model_validate = get_wordvec(validate_review, "validate_review")
    validate_vec = np.concatenate([total_vector(str(words).split(), model_train) for words in validate_review])
    clf = svm.SVC()  # renamed from "svm" so the variable does not shadow the sklearn.svm module
    clf.fit(train_vec, train_label)
    pre_test_label = clf.predict(test_vec)  # predict on the test split
    ac_test = accuracy_score(test_label, pre_test_label)
    print(f"accuracy:{ac_test}")
    clf.score(test_vec, test_label)
    joblib.dump(clf, 'svm_waimai11.pkl')
    # F1 score
    f1 = f1_score(test_label, pre_test_label, average='micro')
    print(f"f1:{f1}")
    # Confusion matrix: rows are true labels, columns are predicted labels (the order can be fixed with the labels argument)
    metrics_out = confusion_matrix(test_label, pre_test_label)
    # With label 1 treated as the positive class, the entries are:
    print(f"TN: {metrics_out[0][0]}")
    print(f"FP: {metrics_out[0][1]}")
    print(f"FN: {metrics_out[1][0]}")
    print(f"TP: {metrics_out[1][1]}")