1. IMDb 影评数据集的介绍与下载
2. 贝叶斯原理介绍
3. TF-IDF 是什么
def preprocess_data():
    """Vectorize the IMDb reviews and cache the results under ``data/``.

    Loads the reviews from ``aclImdb_train_dir`` and ``aclImdb_test_dir``,
    pools them, fits a ``CountVectorizer`` and a ``TfidfVectorizer`` (both
    capped at ``vocab_size`` features) and writes four files:

    * ``data/CountVectorizer.joblib`` / ``data/TfidfVectorizer.joblib`` -
      the fitted vectorizers.
    * ``data/CountVectorizer_trainData.npz`` /
      ``data/TfidfVectorizer_trainData.npz`` - dense feature matrix ``x``
      and label vector ``y``.
    """
    X_orig, Y_orig = get_data(aclImdb_train_dir)
    X_orig_test, Y_orig_test = get_data(aclImdb_test_dir)
    # Pool both splits into one corpus.
    # NOTE(review): the vectorizers are fitted on the pooled data, so any
    # later hold-out evaluation has already seen the test vocabulary.
    X_orig = np.concatenate([X_orig, X_orig_test])
    Y_orig = np.concatenate([Y_orig, Y_orig_test])

    cv = CountVectorizer(max_features=vocab_size)
    tfidf = TfidfVectorizer(max_features=vocab_size)
    cv.fit(X_orig)
    tfidf.fit(X_orig)
    print(cv.vocabulary_)

    # transform() yields a sparse document-term matrix.
    train_data = cv.transform(X_orig)
    tfidf_train_data = tfidf.transform(X_orig)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on old installations.
    if hasattr(cv, 'get_feature_names_out'):
        print(cv.get_feature_names_out())
    else:
        print(cv.get_feature_names())
    # Sparse print: (document index, vocabulary index) -> term count.
    print(train_data)

    # NOTE(review): densifying documents x vocab_size can need a lot of
    # memory; kept because the saved .npz files store dense 'x' arrays.
    train_data = train_data.toarray()
    tfidf_train_data = tfidf_train_data.toarray()
    print(train_data)

    # Make sure the output folder exists before writing into it.
    os.makedirs('data', exist_ok=True)
    joblib.dump(cv, "data/CountVectorizer.joblib")
    joblib.dump(tfidf, "data/TfidfVectorizer.joblib")
    # Forward slashes work on every platform; a backslash would become part
    # of the file name on POSIX and the later 'data/...npz' load would fail.
    np.savez('data/CountVectorizer_trainData', x=train_data, y=Y_orig)
    np.savez('data/TfidfVectorizer_trainData', x=tfidf_train_data, y=Y_orig)
def train_my_module(is_tfidf):
    """Train a multinomial naive Bayes model on the cached features.

    Args:
        is_tfidf: When true, use the TF-IDF features and save the model to
            ``data/Tfidf_bayes_module.joblib``; otherwise use the raw count
            features and save to ``data/bayes_module.joblib``.

    The cached data is split 70/30 at random, the model is fitted on the
    larger part, and the labels, predictions and accuracy for the smaller
    part are printed.
    """
    if is_tfidf:
        data_path = r'data/TfidfVectorizer_trainData.npz'
        model_path = r'data/Tfidf_bayes_module.joblib'
    else:
        data_path = 'data/CountVectorizer_trainData.npz'
        model_path = r'data/bayes_module.joblib'

    # np.load on an .npz returns an open archive; the context manager closes
    # it once both arrays have been read into memory.
    with np.load(data_path) as trainDataNew:
        x = trainDataNew['x']
        y = trainDataNew['y']

    # The inputs are ndarrays, so the split parts are ndarrays already and
    # need no extra np.array() copy.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
    print(x_train.shape, x_test.shape)

    # Fit on the training part.
    module = MultinomialNB()
    module.fit(x_train, y_train)

    # Predict the held-out part.
    y_pred = module.predict(x_test)
    joblib.dump(module, model_path)

    # Report: true labels, predicted labels, accuracy in percent.
    print("正确值:{0}".format(y_test))
    print("预测值:{0}".format(y_pred))
    print("准确率:%f%%"%(accuracy_score(y_test, y_pred)*100))
def predict_my_module(is_tfidf):
    """Classify a few hard-coded sample reviews with a saved model.

    Args:
        is_tfidf: When true, load the TF-IDF model and vectorizer;
            otherwise load the count-based model and vectorizer.

    Prints the array of predicted labels.
    """
    if is_tfidf:
        model_file = r'data/Tfidf_bayes_module.joblib'
        vectorizer_file = r'data/TfidfVectorizer.joblib'
    else:
        model_file = r'data/bayes_module.joblib'
        vectorizer_file = r'data/CountVectorizer.joblib'

    model = joblib.load(model_file)
    # Labels: neg -> 0, positive -> 1
    review = [
        "the character is so poorly written.",
        "this is bad movie ",
        "I'm not very disappoint for this movie",
        "I'm very happy for this movie",
    ]
    vectorizer = joblib.load(vectorizer_file)
    # Vectorize with the fitted vocabulary, then densify for the model.
    features = vectorizer.transform(review).toarray()
    predictions = model.predict(features)
    print(predictions)
如果 is_tfidf 为 False，则使用 CountVectorizer 生成的词向量来训练模型；
如果 is_tfidf 为 True，则使用 TfidfVectorizer 生成的词向量来训练模型。
if __name__ == '__main__':
    # Step 1: build and cache the vectorizers and feature matrices.
    preprocess_data()
    # Step 2: choose the feature set (True -> TF-IDF, False -> raw counts)
    # and train the classifier on it.
    is_tfidf = True
    train_my_module(is_tfidf)
    # Optional step 3: classify the sample reviews with the saved model.
    # predict_my_module(is_tfidf)
运行结果如下，可知使用 TfidfVectorizer 生成的词向量训练出的模型准确率更高。

| Count Vectorizer | Tfidf Vectorizer |
|---|---|
| 准确率：84.326667% | 准确率：85.893333% |
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import os as os
import joblib
# Upper bound on the vocabulary; passed as max_features to both vectorizers.
vocab_size = 30000
# Root folders of the two IMDb splits; get_data reads the 'pos' and 'neg'
# subfolders of each.
aclImdb_train_dir = r'D:\train_data\aclImdb\aclImdb\train'
aclImdb_test_dir = r'D:\train_data\aclImdb\aclImdb\test'
# remove html tags like '<br />'
def rm_tags(text):
    """Return *text* with every ``<...>`` tag replaced by a single space."""
    # '<', one or more non-'>' characters, then '>'.
    return re.sub(r'<[^>]+>', ' ', text)
def clean_str(string):
    """Return *string* lower-cased, without leading or trailing whitespace."""
    trimmed = string.strip()
    return trimmed.lower()
def process(text):
    """Normalize one raw review: trim and lower-case it, then strip tags."""
    # Order matters: cleaning runs first, tag removal second.
    return rm_tags(clean_str(text))
def _read_reviews(directory):
    """Return the processed text of every file in *directory*, by sorted name."""
    texts = []
    # Sorting makes the order reproducible; os.listdir order is arbitrary.
    for filename in sorted(os.listdir(directory)):
        with open(os.path.join(directory, filename), encoding='utf-8') as f:
            texts.append(process(f.read()))
    return texts


def get_data(datapath):
    """Load the labelled reviews stored under *datapath*.

    Args:
        datapath: Folder containing a ``pos`` and a ``neg`` subfolder, each
            holding one UTF-8 text file per review.

    Returns:
        A tuple ``(X_orig, Y_orig)`` of NumPy arrays: the processed review
        texts (all positive reviews first, then all negative ones) and the
        matching labels, 1 for positive and 0 for negative.

    Each folder is read on its own, so every file is kept even when the two
    folders hold different numbers of reviews. Pairing the listings with
    zip() would silently drop the surplus of the larger folder.
    """
    pos_all = _read_reviews(os.path.join(datapath, 'pos'))
    neg_all = _read_reviews(os.path.join(datapath, 'neg'))
    print(len(pos_all))
    print(len(neg_all))
    X_orig = np.array(pos_all + neg_all)
    Y_orig = np.array([1] * len(pos_all) + [0] * len(neg_all))
    return X_orig, Y_orig
def preprocess_data():
    """Vectorize the IMDb reviews and cache the results under ``data/``.

    Loads the reviews from ``aclImdb_train_dir`` and ``aclImdb_test_dir``,
    pools them, fits a ``CountVectorizer`` and a ``TfidfVectorizer`` (both
    capped at ``vocab_size`` features) and writes four files:

    * ``data/CountVectorizer.joblib`` / ``data/TfidfVectorizer.joblib`` -
      the fitted vectorizers.
    * ``data/CountVectorizer_trainData.npz`` /
      ``data/TfidfVectorizer_trainData.npz`` - dense feature matrix ``x``
      and label vector ``y``.
    """
    X_orig, Y_orig = get_data(aclImdb_train_dir)
    X_orig_test, Y_orig_test = get_data(aclImdb_test_dir)
    # Pool both splits into one corpus.
    # NOTE(review): the vectorizers are fitted on the pooled data, so any
    # later hold-out evaluation has already seen the test vocabulary.
    X_orig = np.concatenate([X_orig, X_orig_test])
    Y_orig = np.concatenate([Y_orig, Y_orig_test])

    cv = CountVectorizer(max_features=vocab_size)
    tfidf = TfidfVectorizer(max_features=vocab_size)
    cv.fit(X_orig)
    tfidf.fit(X_orig)
    print(cv.vocabulary_)

    # transform() yields a sparse document-term matrix.
    train_data = cv.transform(X_orig)
    tfidf_train_data = tfidf.transform(X_orig)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on old installations.
    if hasattr(cv, 'get_feature_names_out'):
        print(cv.get_feature_names_out())
    else:
        print(cv.get_feature_names())
    # Sparse print: (document index, vocabulary index) -> term count.
    print(train_data)

    # NOTE(review): densifying documents x vocab_size can need a lot of
    # memory; kept because the saved .npz files store dense 'x' arrays.
    train_data = train_data.toarray()
    tfidf_train_data = tfidf_train_data.toarray()
    print(train_data)

    # Make sure the output folder exists before writing into it.
    os.makedirs('data', exist_ok=True)
    joblib.dump(cv, "data/CountVectorizer.joblib")
    joblib.dump(tfidf, "data/TfidfVectorizer.joblib")
    # Forward slashes work on every platform; a backslash would become part
    # of the file name on POSIX and the later 'data/...npz' load would fail.
    np.savez('data/CountVectorizer_trainData', x=train_data, y=Y_orig)
    np.savez('data/TfidfVectorizer_trainData', x=tfidf_train_data, y=Y_orig)
def train_my_module(is_tfidf):
    """Train a multinomial naive Bayes model on the cached features.

    Args:
        is_tfidf: When true, use the TF-IDF features and save the model to
            ``data/Tfidf_bayes_module.joblib``; otherwise use the raw count
            features and save to ``data/bayes_module.joblib``.

    The cached data is split 70/30 at random, the model is fitted on the
    larger part, and the labels, predictions and accuracy for the smaller
    part are printed.
    """
    if is_tfidf:
        data_path = r'data/TfidfVectorizer_trainData.npz'
        model_path = r'data/Tfidf_bayes_module.joblib'
    else:
        data_path = 'data/CountVectorizer_trainData.npz'
        model_path = r'data/bayes_module.joblib'

    # np.load on an .npz returns an open archive; the context manager closes
    # it once both arrays have been read into memory.
    with np.load(data_path) as trainDataNew:
        x = trainDataNew['x']
        y = trainDataNew['y']

    # The inputs are ndarrays, so the split parts are ndarrays already and
    # need no extra np.array() copy.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
    print(x_train.shape, x_test.shape)

    # Fit on the training part.
    module = MultinomialNB()
    module.fit(x_train, y_train)

    # Predict the held-out part.
    y_pred = module.predict(x_test)
    joblib.dump(module, model_path)

    # Report: true labels, predicted labels, accuracy in percent.
    print("正确值:{0}".format(y_test))
    print("预测值:{0}".format(y_pred))
    print("准确率:%f%%"%(accuracy_score(y_test, y_pred)*100))
def predict_my_module(is_tfidf):
    """Classify a few hard-coded sample reviews with a saved model.

    Args:
        is_tfidf: When true, load the TF-IDF model and vectorizer;
            otherwise load the count-based model and vectorizer.

    Prints the array of predicted labels.
    """
    if is_tfidf:
        model_file = r'data/Tfidf_bayes_module.joblib'
        vectorizer_file = r'data/TfidfVectorizer.joblib'
    else:
        model_file = r'data/bayes_module.joblib'
        vectorizer_file = r'data/CountVectorizer.joblib'

    model = joblib.load(model_file)
    # Labels: neg -> 0, positive -> 1
    review = [
        "the character is so poorly written.",
        "this is bad movie ",
        "I'm not very disappoint for this movie",
        "I'm very happy for this movie",
    ]
    vectorizer = joblib.load(vectorizer_file)
    # Vectorize with the fitted vocabulary, then densify for the model.
    features = vectorizer.transform(review).toarray()
    predictions = model.predict(features)
    print(predictions)
if __name__ == '__main__':
    # Step 1: build and cache the vectorizers and feature matrices.
    preprocess_data()
    # Step 2: choose the feature set (True -> TF-IDF, False -> raw counts)
    # and train the classifier on it.
    is_tfidf = True
    train_my_module(is_tfidf)
    # Optional step 3: classify the sample reviews with the saved model.
    # predict_my_module(is_tfidf)