使用SVM+Word2Vec 解决外卖的好评坏评分类问题

0. 声明

本文是作者总结所学知识后,使用 Sklearn、gensim、jieba 等工具完成的分类实验。若有错误,欢迎指正。

1. 整体代码

import joblib
import jieba
import gensim
from numpy import *
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix

# 加载停用词
# Load the stop-word list used to filter segmented review text.
def stop_words(path='/Users/apple/PycharmProjects/机器学习项目实训/外卖/stopwords.txt'):
    """Return the stop words read from *path* (GBK-encoded, one word per line)."""
    with open(path, 'r', encoding='gbk', errors='ignore') as handle:
        lines = handle.readlines()
    return [line.strip() for line in lines]

# 文本预处理
# 文本预处理
def text_preprocessing(
        stopwords_path='/Users/apple/PycharmProjects/机器学习项目实训/外卖/stopwords.txt',
        input_path="/Users/apple/PycharmProjects/机器学习项目实训/外卖/waimai_10k.csv",
        output_path="/Users/apple/PycharmProjects/机器学习项目实训/外卖/waimai.csv"):
    """Segment each review with jieba, drop stop words, and save the result.

    Paths default to the original hard-coded locations so existing callers
    (``text_preprocessing()``) keep working, but can now be overridden.

    The output CSV is written without a header, columns (label, review).
    """
    # Build a set once: membership tests run for every token of every review,
    # and the original list made each test O(len(stopwords)).
    stopwords = set(stop_words(stopwords_path))
    # 读取文件 — assumes columns "label" and "review" exist in the input CSV.
    df = pd.read_csv(input_path)
    # 切词并过滤掉停用词 (space-joined tokens, as Word2Vec training expects)
    df["review"] = df["review"].map(
        lambda x: " ".join(i for i in jieba.cut(x) if i not in stopwords))
    # 保存处理好的文本
    df.to_csv(output_path, index=False, header=False, columns=["label", "review"])

# 数据集划分
# 数据集划分
def partition_data_set(
        input_path="/Users/apple/PycharmProjects/机器学习项目实训/外卖/waimai.csv",
        test_path="test.csv",
        validate_path="validate.csv",
        train_path="train.csv"):
    """Shuffle the preprocessed corpus and split it into train/validate/test.

    Split ratio is train:validate:test = 7:1:2.  All paths default to the
    original hard-coded values, so ``partition_data_set()`` behaves as before.
    The three output CSVs are written without headers or an index column.
    """
    data = pd.read_csv(input_path, header=None)  # 加载数据
    data: pd.DataFrame = data.sample(frac=1.0)   # 将数据打乱
    rows, cols = data.shape

    # train:val:test = 7:1:2 — boundaries at 20% and 30% of the shuffled rows.
    split_index_1 = int(rows * 0.2)
    split_index_2 = int(rows * 0.3)
    # 数据分割
    data_test: pd.DataFrame = data.iloc[0:split_index_1, :]
    data_validate: pd.DataFrame = data.iloc[split_index_1:split_index_2, :]
    data_train: pd.DataFrame = data.iloc[split_index_2:rows, :]
    # 数据保存
    data_test.to_csv(test_path, header=None, index=False)
    data_validate.to_csv(validate_path, header=None, index=False)
    data_train.to_csv(train_path, header=None, index=False)
    print("划分完毕")

# 获取数据集的标签和相应的数据(darry格式)
# 获取数据集的标签和相应的数据 (ndarray 格式)
def get_label_And_review(file):
    """Read a header-less two-column CSV and return (labels, reviews) as ndarrays."""
    frame = pd.read_csv(file, header=None)
    labels = frame[0].values
    reviews = frame[1].values
    return labels, reviews

'''
=============================================================
'''

# 获取词向量
# 获取词向量
def get_wordvec(data_review, filedesignation):
    """Train a 128-dim Word2Vec model on *data_review* and return it.

    As a side effect, the learned vectors are dumped (text format) to
    ``word_vec_<filedesignation>.txt`` for later inspection.
    """
    w2v = gensim.models.Word2Vec(data_review, vector_size=128, workers=4, min_count=0)
    out_file = f"word_vec_{filedesignation}.txt"
    w2v.wv.save_word2vec_format(out_file, binary=False)
    return w2v

# 整句话的词向量,我们知道词向量的叠加同时也会将语义进行叠加,这里我们将每句话中的所有词进行词向量的相加,我们就可以定义词向量相加的方法:
# 整句话的词向量:词向量的叠加同时也会叠加语义,因此把句中所有词的向量相加
# 作为整句的表示。
def total_vector(words, model):
    """Sum the 128-dim vectors of every in-vocabulary word in *words*.

    Out-of-vocabulary words (KeyError from ``model.wv``) contribute nothing.
    Returns an array of shape (1, 128).
    """
    sentence_vec = np.zeros((1, 128))
    for token in words:
        try:
            token_vec = model.wv[token]
        except KeyError:
            # token was not seen during Word2Vec training — skip it
            continue
        sentence_vec += token_vec.reshape((1, 128))
    return sentence_vec


if __name__ == '__main__':
    # Pipeline: preprocess -> split -> vectorize -> train SVM -> evaluate.
    text_preprocessing()
    partition_data_set()
    all_label, all_review = get_label_And_review("waimai.csv")
    train_label, train_review = get_label_And_review("train.csv")
    test_label, test_review = get_label_And_review("test.csv")
    validate_label, validate_review = get_label_And_review("validate.csv")

    model_all = get_wordvec(all_review, "all_review")

    model_train = get_wordvec(train_review, "train_review")
    train_vec = np.concatenate([total_vector(words, model_train) for words in train_review])

    # NOTE(review): model_test / model_validate are trained only for the side
    # effect of dumping their vector files — the sentence vectors below are
    # (correctly) built with model_train so every split shares one vector space.
    model_test = get_wordvec(test_review, "test_review")
    test_vec = np.concatenate([total_vector(words, model_train) for words in test_review])

    model_validate = get_wordvec(validate_review, "validate_review")
    validate_vec = np.concatenate([total_vector(words, model_train) for words in validate_review])

    # Fix: the original `svm = svm.SVC()` shadowed the sklearn `svm` module,
    # making any later reference to the module impossible.
    clf = svm.SVC()
    clf.fit(train_vec, train_label)

    pre_test_label = clf.predict(test_vec)  # 预测

    # sklearn metrics take (y_true, y_pred) in that order.
    ac_test = accuracy_score(test_label, pre_test_label)
    print(f"accuracy:{ac_test}")
    clf.score(test_vec, test_label)

    joblib.dump(clf, 'svm_waimai11.pkl')

    # f1 (micro-averaged — for single-label classification this equals accuracy)
    f1 = f1_score(test_label, pre_test_label, average='micro')
    print(f"f1:{f1}")

    # 混淆矩阵: sklearn's C[i][j] counts samples with true label i predicted
    # as j, labels sorted ascending ([0, 1]).  Assuming label 1 is the
    # positive (good-review) class — TODO confirm against the dataset — the
    # cells are TN/FP/FN/TP.  The original printout labelled them
    # TP/FN/FP/TN, which misreported every cell.
    metrics_out = confusion_matrix(test_label, pre_test_label)
    print(f"TN: {metrics_out[0][0]}")
    print(f"FP: {metrics_out[0][1]}")
    print(f"FN: {metrics_out[1][0]}")
    print(f"TP: {metrics_out[1][1]}")

2. 结论(见下图:使用 SVM+Word2Vec 解决外卖好评/坏评分类问题的结果截图)

3. 附件(数据集)

Dataset

你可能感兴趣的:(机器学习,Python,python,svm,中文分词)