[机器学习-实战篇]Imdb数据集情感分析之贝叶斯

一, 前言

1.Imdb影评的数据集介绍与下载
2.贝叶斯原理介绍
3.TF-IDF是什么

二,代码介绍

preprocess_data() 方法包含以下步骤:

  1. 加载Imdb影评数据。
  2. 用数据训练CountVectorizer或者TfidfVectorizer。
  3. 保存CountVectorizer与TfidfVectorizer模型,因为预测的时候要用这两个模型把要预测的词转化为词向量。
  4. 保存转换为词向量的数据集。
def preprocess_data():
    """Load the IMDb reviews, fit Count/Tfidf vectorizers on the full corpus,
    and persist both the fitted vectorizers and the vectorized data sets.

    Side effects: writes four files under data/ — two .joblib vectorizers
    (needed later to vectorize unseen reviews) and two .npz feature arrays.
    """
    # Merge train and test splits so both vectorizers see the full vocabulary.
    X_orig, Y_orig = get_data(aclImdb_train_dir)
    X_orig_test, Y_orig_test = get_data(aclImdb_test_dir)
    X_orig = np.concatenate([X_orig, X_orig_test])
    Y_orig = np.concatenate([Y_orig, Y_orig_test])

    cv = CountVectorizer(max_features=vocab_size)
    tfidf = TfidfVectorizer(max_features=vocab_size)

    cv.fit(X_orig)
    tfidf.fit(X_orig)

    print(cv.vocabulary_)
    # Transform the corpus into document-term matrices (sparse).
    train_data = cv.transform(X_orig)
    tfidf_train_data = tfidf.transform(X_orig)

    # NOTE(review): get_feature_names() was removed in scikit-learn 1.2;
    # on modern versions use get_feature_names_out() — confirm installed version.
    print(cv.get_feature_names())
    print(train_data)
    # NOTE(review): densifying a corpus-sized document-term matrix is very
    # memory hungry; scipy.sparse.save_npz would avoid it, but that changes
    # the on-disk format consumed by train_my_module — TODO evaluate.
    train_data = train_data.toarray()

    tfidf_train_data = tfidf_train_data.toarray()

    print(train_data)

    # Persist the fitted vectorizers so prediction reuses the same vocabulary.
    joblib.dump(cv, "data/CountVectorizer.joblib")
    joblib.dump(tfidf, "data/TfidfVectorizer.joblib")
    # Forward slashes: the raw-backslash paths only worked on Windows and did
    # not match the forward-slash paths that train_my_module() loads.
    np.savez('data/CountVectorizer_trainData', x=train_data, y=Y_orig)
    np.savez('data/TfidfVectorizer_trainData', x=tfidf_train_data, y=Y_orig)

运行后会生成下面四个文件
[机器学习-实战篇]Imdb数据集情感分析之贝叶斯_第1张图片

train_my_module()方法:

  1. 加载已经生成的词向量。
  2. 训练MultinomialNB贝叶斯模型。
  3. 测试你的模型
def train_my_module(is_tfidf):
    """Train and evaluate a MultinomialNB classifier on the saved vectors.

    Args:
        is_tfidf: when True use the TF-IDF features, otherwise the raw counts.

    Side effects: saves the fitted model under data/ and prints test accuracy.
    """
    if is_tfidf:
        trainDataNew = np.load(r'data/TfidfVectorizer_trainData.npz')
    else:
        trainDataNew = np.load('data/CountVectorizer_trainData.npz')
    x = trainDataNew['x']
    y = trainDataNew['y']

    # 70/30 random split. train_test_split already returns ndarrays, so the
    # original extra np.array(...) re-wrapping was redundant and is dropped.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

    print(x_train.shape, x_test.shape)
    # Fit the naive-Bayes model on the training portion.
    module = MultinomialNB()
    module.fit(x_train, y_train)
    # Predict on the held-out portion.
    y_pred = module.predict(x_test)
    if is_tfidf:
        joblib.dump(module, r'data/Tfidf_bayes_module.joblib')
    else:
        joblib.dump(module, r'data/bayes_module.joblib')

    # Report ground truth, predictions and accuracy.
    print("正确值:{0}".format(y_test))
    print("预测值:{0}".format(y_pred))
    print("准确率:%f%%"%(accuracy_score(y_test, y_pred)*100))

predict_my_module()方法

  1. 加载训练好的贝叶斯模型
  2. 加载训练好的词向量模型
  3. 把要预测的数据转为词向量
  4. 预测
def predict_my_module(is_tfidf):
    """Classify a few hand-written sample reviews with the persisted model.

    Args:
        is_tfidf: when True load the TF-IDF model/vectorizer pair, otherwise
            the plain-count pair.

    Prints the predicted labels (0 = negative, 1 = positive).
    """
    # Pick the model file that matches the requested feature type.
    model_path = r'data/Tfidf_bayes_module.joblib' if is_tfidf else r'data/bayes_module.joblib'
    model = joblib.load(model_path)

    # Expected sentiment: negative -> 0, positive -> 1
    review = [
        "the character is so poorly written.",
        "this is bad movie ",
        "I'm not very disappoint for this movie",
        "I'm very happy for this movie",
    ]

    # The vectorizer must be the one fitted alongside the chosen model.
    vectorizer_path = r'data/TfidfVectorizer.joblib' if is_tfidf else r'data/CountVectorizer.joblib'
    vectorizer = joblib.load(vectorizer_path)

    features = vectorizer.transform(review)
    features = features.toarray()
    predictions = model.predict(features)
    print(predictions)

三,CountVectorizer与TfidfVectorizer的结果对比

如果is_tfidf为False,那么就是用CountVectorizer生成的词向量训练的模型
如果is_tfidf为True,那么就是用TfidfVectorizer生成的词向量训练的模型

if __name__ == '__main__':
    # Build and persist the vectorizers plus the vectorized data sets.
    preprocess_data()
    # Toggle between TF-IDF features (True) and raw count features (False).
    is_tfidf =True
    train_my_module(is_tfidf)
    #predict_my_module(is_tfidf)

运行结果如下,可知TfidfVectorizer 生成的词向量准确率更高

Count Vectorizer Tfidf Vectorizer
准确率:84.326667% 准确率: 85.893333%

四,情感分析的完整代码如下

import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import os as os
import joblib

vocab_size = 30000
aclImdb_train_dir = r'D:\train_data\aclImdb\aclImdb\train'
aclImdb_test_dir = r'D:\train_data\aclImdb\aclImdb\test'
# remove html tags like '<br />'
def rm_tags(text):
    """Replace HTML tags such as '<br />' with a single space."""
    re_tag = re.compile(r'<[^>]+>')
    return re_tag.sub(' ', text)


def clean_str(string):
    """Normalize a review: strip surrounding whitespace and lower-case it."""
    return string.strip().lower()


def process(text):
    """Full cleanup: normalize case/whitespace, then drop HTML tags."""
    text = clean_str(text)
    text = rm_tags(text)
    return text


def get_data(datapath):
    """Read the pos/ and neg/ review files under *datapath*.

    Returns:
        (X_orig, Y_orig): cleaned review strings and the matching labels
        (1 = positive, 0 = negative).
    """
    pos_files = os.listdir(datapath + '/pos')
    neg_files = os.listdir(datapath + '/neg')
    print(len(pos_files))
    print(len(neg_files))
    pos_all = []
    neg_all = []
    for pf, nf in zip(pos_files, neg_files):
        with open(datapath + '/pos' + '/' + pf, encoding='utf-8') as f:
            s = f.read()
            s = process(s)
            pos_all.append(s)
        with open(datapath + '/neg' + '/' + nf, encoding='utf-8') as f:
            s = f.read()
            s = process(s)
            neg_all.append(s)
    X_orig = np.array(pos_all + neg_all)
    Y_orig = np.array([1 for _ in range(len(pos_all))] + [0 for _ in range(len(neg_all))])
    return X_orig, Y_orig


def preprocess_data():
    """Fit Count/Tfidf vectorizers on the full corpus and persist the
    vectorizers plus the vectorized data sets under data/."""
    X_orig, Y_orig = get_data(aclImdb_train_dir)
    X_orig_test, Y_orig_test = get_data(aclImdb_test_dir)
    X_orig = np.concatenate([X_orig, X_orig_test])
    Y_orig = np.concatenate([Y_orig, Y_orig_test])

    cv = CountVectorizer(max_features=vocab_size)
    tfidf = TfidfVectorizer(max_features=vocab_size)
    cv.fit(X_orig)
    tfidf.fit(X_orig)

    print(cv.vocabulary_)
    # Transform the corpus into (sparse) document-term matrices.
    train_data = cv.transform(X_orig)
    tfidf_train_data = tfidf.transform(X_orig)

    print(cv.get_feature_names())
    print(train_data)
    train_data = train_data.toarray()
    tfidf_train_data = tfidf_train_data.toarray()
    print(train_data)

    joblib.dump(cv, "data/CountVectorizer.joblib")
    joblib.dump(tfidf, "data/TfidfVectorizer.joblib")
    # Forward slashes so the saved paths work on every OS and match the
    # forward-slash paths loaded in train_my_module().
    np.savez('data/CountVectorizer_trainData', x=train_data, y=Y_orig)
    np.savez('data/TfidfVectorizer_trainData', x=tfidf_train_data, y=Y_orig)


def train_my_module(is_tfidf):
    """Train MultinomialNB on the saved vectors, persist it, print accuracy."""
    if is_tfidf:
        trainDataNew = np.load(r'data/TfidfVectorizer_trainData.npz')
    else:
        trainDataNew = np.load('data/CountVectorizer_trainData.npz')
    x = trainDataNew['x']
    y = trainDataNew['y']

    # train_test_split already returns ndarrays; no re-wrapping needed.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

    print(x_train.shape, x_test.shape)
    module = MultinomialNB()
    module.fit(x_train, y_train)
    y_pred = module.predict(x_test)
    if is_tfidf:
        joblib.dump(module, r'data/Tfidf_bayes_module.joblib')
    else:
        joblib.dump(module, r'data/bayes_module.joblib')

    print("正确值:{0}".format(y_test))
    print("预测值:{0}".format(y_pred))
    print("准确率:%f%%"%(accuracy_score(y_test, y_pred)*100))


def predict_my_module(is_tfidf):
    """Predict sentiment (0 = negative, 1 = positive) for sample reviews."""
    if is_tfidf:
        model = joblib.load(r'data/Tfidf_bayes_module.joblib')
    else:
        model = joblib.load(r'data/bayes_module.joblib')

    # Expected sentiment: negative -> 0, positive -> 1
    review = ["the character is so poorly written.", "this is bad movie ", "I'm not very disappoint for this movie", "I'm very happy for this movie"]
    if is_tfidf:
        cv = joblib.load(r'data/TfidfVectorizer.joblib')
    else:
        cv = joblib.load(r'data/CountVectorizer.joblib')

    train_data = cv.transform(review)
    train_data = train_data.toarray()
    s = model.predict(train_data)
    print(s)


if __name__ == '__main__':
    preprocess_data()
    is_tfidf = True
    train_my_module(is_tfidf)
    # predict_my_module(is_tfidf)

你可能感兴趣的:(Sklearn,机器学习)