import pandas as pd
import warnings
warnings.filterwarnings('ignore')
“”"
功能:分析互联网新闻情感,并贴上标签
0:正面情感
1:中性情感
2:负面情感
“”"
# Load the training set
data_train = pd.read_csv('./Train_DataSet.csv')
# Load the test set
data_test = pd.read_csv('./Test_DataSet.csv')
# Build the corpus: pull the text content out of both the training and test sets
train_sentences = data_train['content']
test_sentences = data_test['content']
# Concatenate the two text columns into one corpus
sentences = pd.concat([train_sentences, test_sentences])
# Extract the sentiment labels from the training set
label = data_train['label']
# Load the stop-word list; filler/function words are excluded because they carry no sentiment signal
stopwords = open('./stopwords.txt', encoding='utf-8').read().splitlines()
# Build a bag-of-words model with sklearn's CountVectorizer
# analyzer='word' tokenizes by word; for Latin-script languages it is sometimes better to analyze at the character level instead
# ngram_range controls how many adjacent words are grouped together, which partly compensates for the word-order information a plain bag-of-words model discards
# max_features keeps only the most frequent terms in the corpus (here the top 7000) in the final document-term matrix
from sklearn.feature_extraction.text import CountVectorizer
co = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 4),
    stop_words=stopwords,
    max_features=7000
)
# Fit the bag-of-words vocabulary on the full corpus
co.fit(sentences)
# Randomly split the original training set into a new training set and a validation set (default 3:1), then do the word-frequency counting
# Both splits come from the original labeled training data, so both carry labels
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(train_sentences, label, random_state=1234)
# Use the fitted bag-of-words model to turn every document in the training and validation sets into a feature vector
x_train = co.transform(x_train)
x_test = co.transform(x_test)
from sklearn.linear_model import LogisticRegression
lgl = LogisticRegression()
lgl.fit(x_train,y_train)
print("Bag-of-words features with sklearn's default logistic regression classifier, accuracy on the validation set:", lgl.score(x_test, y_test))
Error:
Traceback (most recent call last):
  File "D:/Python/梁花/qing_gan_fen_xi/nature_language_V1.py", line 41, in <module>
    co.fit(sentences)
  File "C:\Users\花花\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\feature_extraction\text.py", line 997, in fit
    self.fit_transform(raw_documents)
  File "C:\Users\花花\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\feature_extraction\text.py", line 1031, in fit_transform
    self.fixed_vocabulary_)
  File "C:\Users\花花\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\feature_extraction\text.py", line 943, in _count_vocab
    for feature in analyze(doc):
  File "C:\Users\花花\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\feature_extraction\text.py", line 329, in <lambda>
    tokenize(preprocess(self.decode(doc))), stop_words)
  File "C:\Users\花花\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\feature_extraction\text.py", line 144, in decode
    raise ValueError("np.nan is an invalid document, expected byte or "
ValueError: np.nan is an invalid document, expected byte or unicode string.
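The last line of the traceback says CountVectorizer received np.nan instead of a string: pd.read_csv turns empty 'content' cells into NaN, and those NaN rows survive the pd.concat. A minimal sketch of one way to clean the corpus before calling co.fit, assuming missing content can simply be treated as an empty string (the fillna/notna handling below is an illustrative workaround, not part of the original script):

import pandas as pd

data_train = pd.read_csv('./Train_DataSet.csv')
data_test = pd.read_csv('./Test_DataSet.csv')

# Option 1: replace missing article content with an empty string so every document is a str
train_sentences = data_train['content'].fillna('')
test_sentences = data_test['content'].fillna('')
sentences = pd.concat([train_sentences, test_sentences])

# Option 2: drop rows with missing content from the training set, keeping the labels aligned
mask = data_train['content'].notna()
train_sentences = data_train.loc[mask, 'content']
label = data_train.loc[mask, 'label']

Either way, every element passed to co.fit and co.transform is then a plain string, which is what the vectorizer expects.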