import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
def review_to_wordlist(review):
    """Turn one raw review into a list of lowercase word tokens.

    The text is first run through BeautifulSoup's ``html.parser`` to drop any
    markup, then every character that is not an ASCII letter is replaced by a
    space, and finally the text is lower-cased and split on whitespace.

    Reference: http://blog.csdn.net/longxinchen_ml/article/details/50629613
    """
    plain_text = BeautifulSoup(review, "html.parser").get_text()
    # Digits, punctuation and non-ASCII characters all become separators.
    letters_only = re.sub("[^a-zA-Z]", " ", plain_text)
    return letters_only.lower().split()
载入数据集
# Read both CSV files, taking the column names from the first row of each.
train = pd.read_csv('data/new_train.csv', header=0)
test = pd.read_csv('data/new_test.csv', header=0)

# Preview the first rows of each frame, training set first.
print(train.head(), test.head(), sep='\n')
ID sentiment review
0 1 1 Jo bhi ap se tou behtar hoon
1 2 0 ya Allah meri sister Affia ki madad farma
2 3 1 Yeh khud chahta a is umar main shadi krna ha...
3 4 1 Tc Apky mun xe exe alfax achy nae lgty
4 5 0 Good
id review
0 1 Jis ke aiteraf mien inhe behtareen muaawin ac...
1 2 Thank you same to you
2 3 ALLAH ki marzi hai Beshak wohi ata karne wala ...
3 4 Asal masla yehi hei k wo iss umar mein bhi sha...
4 5 Chaudhry Rehmat Ali ne January ko Ab...
预处理数据
# Target column used later to fit and score the classifier.
label = train['sentiment']

# Normalise every review into a single space-joined string of tokens.
# Iterate over the column values directly instead of `train['review'][i]`:
# that form looks rows up by index *label*, so it only works while the frame
# still has the default 0..n-1 index (it fails after filtering or shuffling).
train_data = [' '.join(review_to_wordlist(text)) for text in train['review']]
test_data = [' '.join(review_to_wordlist(text)) for text in test['review']]

# Show the first cleaned review of each set as a sanity check.
print(train_data[0], '\n')
print(test_data[0])
jo bhi ap se tou behtar hoon
jis ke aiteraf mien inhe behtareen muaawin actor ke national film award se nawaza gaya
特征处理
from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF

# Word-level TF-IDF over unigrams, bigrams and trigrams. Terms seen in fewer
# than two documents are dropped (min_df=2); the vocabulary size is uncapped.
# use_idf / smooth_idf / sublinear_tf are boolean switches and are passed as
# real booleans: recent scikit-learn releases validate parameter types and no
# longer accept the integer 1 in place of True.
tfidf = TFIDF(
    min_df=2,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    # NOTE(review): this removes English stop words; confirm that is intended
    # if the reviews are not written in English.
    stop_words='english',
)

# The vectorizer is fitted on the training and test text together, so the
# vocabulary and IDF weights reflect both sets.
data_all = train_data + test_data
len_train = len(train_data)

# fit_transform learns the vocabulary and produces the matrix in one pass;
# it is equivalent to calling fit() and then transform() on the same data.
data_all = tfidf.fit_transform(data_all)

# Rows keep the concatenation order, so the first len_train rows are the
# training reviews and the remainder are the test reviews.
train_x = data_all[:len_train]
test_x = data_all[len_train:]
print('TF-IDF处理结束.')
TF-IDF处理结束.
D:\anaconda\lib\site-packages\sklearn\feature_extraction\text.py:1059: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
朴素贝叶斯训练
from sklearn.naive_bayes import MultinomialNB as MNB

# Multinomial Naive Bayes with its default settings, trained on the TF-IDF
# matrix of the training reviews; fit() returns the fitted estimator itself.
model_NB = MNB().fit(train_x, label)
MNB(alpha=1.0, class_prior=None, fit_prior=True)
# `sklearn.cross_validation` has been removed from scikit-learn;
# cross_val_score is provided by `sklearn.model_selection`.
from sklearn.model_selection import cross_val_score
import numpy as np

# Score the classifier with 10-fold cross-validation on the training data,
# using ROC-AUC per fold, and report the mean across folds.
cv_scores = cross_val_score(model_NB, train_x, label, cv=10, scoring='roc_auc')
print("多项式贝叶斯分类器10折交叉验证得分: ", np.mean(cv_scores))
多项式贝叶斯分类器10折交叉验证得分: 0.8631634970590059
# Per-class probability estimates for every test review, one row per review
# and one column per class.
class_probabilities = model_NB.predict_proba(test_x)
test_predicted = np.array(class_probabilities)
# Bare expression: displays the array when run as a notebook cell.
test_predicted
array([[0.88318156, 0.11681844],
[0.87972973, 0.12027027],
[0.68929881, 0.31070119],
...,
[0.5871227 , 0.4128773 ],
[0.38977763, 0.61022237],
[0.46662657, 0.53337343]])