【预选赛】2019中国高校计算机大赛——大数据挑战赛

# -*- coding: UTF-8 -*-
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup

def review_to_wordlist(review):
    """Turn one raw review string into a list of lowercase words.

    The text is first parsed with BeautifulSoup's ``html.parser`` to drop
    any HTML markup, then every character that is not an ASCII letter is
    replaced by a space, and finally the result is lowercased and split on
    whitespace.

    Reference: http://blog.csdn.net/longxinchen_ml/article/details/50629613
    """
    # Strip HTML tags and keep only the text content.
    plain_text = BeautifulSoup(review, "html.parser").get_text()
    # Blank out everything except ASCII letters (digits, punctuation, ...).
    letters_only = re.sub("[^a-zA-Z]", " ", plain_text)
    # Lowercase, then split on whitespace into the word list.
    return letters_only.lower().split()

载入数据集

# Load the training and test sets (the first CSV row is the header),
# then print the first rows of each frame as a quick sanity check.
train, test = (
    pd.read_csv(csv_path, header=0)
    for csv_path in ('data/new_train.csv', 'data/new_test.csv')
)
print(train.head(), test.head(), sep='\n')
   ID  sentiment                                             review
0   1          1                       Jo bhi ap se tou behtar hoon
1   2          0          ya Allah meri sister Affia ki madad farma
2   3          1  Yeh khud chahta a is umar main shadi krna   ha...
3   4          1      Tc   Apky mun xe exe alfax achy nae lgty     
4   5          0                                               Good
   id                                             review
0   1   Jis ke aiteraf mien inhe behtareen muaawin ac...
1   2                           Thank you   same to you 
2   3  ALLAH ki marzi hai Beshak wohi ata karne wala ...
3   4  Asal masla yehi hei k wo iss umar mein bhi sha...
4   5  Chaudhry Rehmat Ali  ne    January      ko  Ab...

预处理数据

# Preprocess the data
# The sentiment column is the training target.
label = train['sentiment']
# Clean every review and re-join its words with single spaces so each sample
# is one normalized string. Iterating over the column values directly (rather
# than range(len(...)) with train['review'][i]) avoids label-based lookups,
# which fail when the frame does not carry the default 0..n-1 index.
train_data = [' '.join(review_to_wordlist(review)) for review in train['review']]
test_data = [' '.join(review_to_wordlist(review)) for review in test['review']]
# Preview the first cleaned sample of each set
print(train_data[0], '\n')
print(test_data[0])
jo bhi ap se tou behtar hoon 

jis ke aiteraf mien inhe behtareen muaawin actor ke national film award se nawaza gaya

特征处理

# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF
# Reference: http://blog.csdn.net/longxinchen_ml/article/details/50629613
tfidf = TFIDF(
    min_df=2,  # ignore terms that occur in fewer than 2 documents
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),  # unigrams, bigrams and trigrams
    # These three flags are booleans; passing the int 1 is rejected by the
    # parameter validation in recent scikit-learn releases.
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english',  # drop English stop words
)

# Concatenate the training and test texts so that both are vectorized with
# one shared vocabulary and one set of IDF weights.
data_all = train_data + test_data
len_train = len(train_data)

# fit_transform learns the vocabulary and vectorizes in a single pass; it is
# equivalent to fit() followed by transform() on the same data.
data_all = tfidf.fit_transform(data_all)
# Split the matrix back into its training and test parts
train_x = data_all[:len_train]
test_x = data_all[len_train:]
print ('TF-IDF处理结束.')
TF-IDF处理结束.


D:\anaconda\lib\site-packages\sklearn\feature_extraction\text.py:1059: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):

朴素贝叶斯训练

from sklearn.naive_bayes import MultinomialNB as MNB

# Train a multinomial naive Bayes classifier (constructor defaults) on the
# TF-IDF training matrix `train_x` against the sentiment labels in `label`.
model_NB = MNB()
# fit() is left as a bare expression statement, so an interactive session
# echoes the fitted estimator's repr after this cell.
model_NB.fit(train_x, label)
MNB(alpha=1.0, class_prior=None, fit_prior=True)

# cross_val_score lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import cross_val_score
import numpy as np

# Report the mean ROC AUC of the naive Bayes model over 10-fold cross-validation.
print ("多项式贝叶斯分类器10折交叉验证得分: ",
       np.mean(cross_val_score(model_NB, train_x, label, cv=10, scoring='roc_auc')))
# Class probabilities for the test set: one row per sample, one column per class.
test_predicted = np.array(model_NB.predict_proba(test_x))
# Disabled export of the predictions, kept for reference. The probability
# array has two columns, so a single 'sentiment' column has to select one of
# them -- presumably column 1, the probability of label 1 (confirm against
# model_NB.classes_ before enabling).
# print('Saving results...')
# nb_output = pd.DataFrame(data=test_predicted[:, 1], columns=['sentiment'])
# nb_output['id'] = test['id']
# nb_output = nb_output[['id', 'sentiment']]
# nb_output.to_csv('nb_output.csv', index=False)
# print('Done.')
# Bare expression so an interactive session displays the probability array.
test_predicted
array([[0.88318156, 0.11681844],
       [0.87972973, 0.12027027],
       [0.68929881, 0.31070119],
       ...,
       [0.5871227 , 0.4128773 ],
       [0.38977763, 0.61022237],
       [0.46662657, 0.53337343]])

你可能感兴趣的:(比赛)