Text sentiment analysis steps:
1. Data crawling:
Use the browser's developer tools to capture the network traffic and find the real request URL, then crawl the returned data with a scraper.
import requests
import json
import csv

class comments_spider(object):
    def __init__(self, filename='y'):
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36 Edg/92.0.902.84"
        }
        # Open the output file (newline='' avoids blank rows in the CSV on Windows)
        self.fp = open(f'./{filename}.csv', 'w', encoding='utf-8', newline='')
        self.csv_writer = csv.writer(self.fp)
        print('Spider started!')

    def parse_one_page(self, url):
        response = requests.get(url, headers=self.header)  # send the request to the server as a client
        js_data = json.loads(response.text)  # parse the JSON response into a dict
        # Extract the fields we need
        comment_list = js_data['comments']
        for comment in comment_list:
            star = comment.get('score')
            userid = comment.get('id')
            content = comment.get('content')
            content = ' '.join(content.split('\n'))  # flatten multi-line comments into one line
            self.csv_writer.writerow([userid, content, star])

    def parse_max_page(self):
        for i in range(30):
            print(f'Fetching page {i}!')
            # Build the URL for page i
            url = f'https://club.jd.com/comment/productPageComments.action?&productId=100011151038&score=3&sortType=5&page={i}&pageSize=10&isShadowSku=0&fold=1'
            # Parse the page
            print(f'Parsing page {i}!')
            self.parse_one_page(url)

    def close(self):
        self.fp.close()
        print('Done!')

if __name__ == '__main__':
    js_spider = comments_spider()
    js_spider.parse_max_page()
    js_spider.close()
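The loop above fires 30 requests back to back, which makes it easy to get rate-limited or blocked. A short pause between pages usually helps; a minimal sketch (parse_max_page_politely is a helper name invented here, and the 1-second interval is an arbitrary choice, not something the API documents):

import time

def parse_max_page_politely(spider, max_page=30, delay=1.0):
    # Fetch pages with a fixed pause between requests to reduce the
    # chance of being rate-limited or blocked by the server.
    for i in range(max_page):
        url = (f'https://club.jd.com/comment/productPageComments.action?'
               f'&productId=100011151038&score=3&sortType=5&page={i}'
               f'&pageSize=10&isShadowSku=0&fold=1')
        spider.parse_one_page(url)
        time.sleep(delay)  # assumed polite interval; tune as needed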
2. Data preprocessing: clean the crawled comments, segment them with jieba, and remove stop words.
import pandas as pd
import re
import jieba.posseg as psg
import numpy as np
# Data cleaning
data = pd.read_csv('1600train.csv', encoding='utf-8', header=None)
l1 = len(data)
# Deduplicate: drop rows that are exact duplicates
data = data.drop_duplicates()
l2 = len(data)
print('Removed %s duplicate comments' % (l1 - l2))
# Reset the index (reassign, since reset_index does not modify in place)
data = data.reset_index(drop=True)
# Drop rows with missing values
data = data.dropna()
# Strip digits, store/brand names ("京东", "京东商城", "美的"), product words
# ("电热水器", "热水器"), and punctuation from the comment text. Longer
# alternatives come first so that e.g. "京东商城" is removed whole rather
# than having only its "京东" prefix matched.
content = data[1]  # column 1 holds the comment text (columns are userid, content, star)
useless = re.compile(r'[0-9]|京东商城|京东|美的|电热水器|热水器|[!,!。,~、?]+')  # noise pattern
content = content.apply(lambda x: useless.sub('', x))
data[1] = content
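To sanity-check the cleaning pattern before applying it to the whole column, you can run it on one made-up comment (the sample text here is invented for illustration):

sample = '京东商城买的电热水器,第2天就到了!'  # invented example comment
print(useless.sub('', sample))  # -> '买的第天就到了'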
# Word segmentation
worker = lambda s: [(x.word, x.flag) for x in psg.cut(s)]  # simple tokenizer: (word, POS tag) pairs
seg_word = content.apply(worker)  # segment every comment
# Turn the tokens into a data frame: one column for the word, one for the ID of
# the comment it came from, and (added later) one for its position in that comment
n_word = seg_word.apply(lambda x: len(x))  # number of tokens in each comment
n_content = [[x + 1] * y for x, y in zip(list(seg_word.index), list(n_word))]
index_content = sum(n_content, [])  # flatten the nested list: comment ID for each token
seg_word = sum(seg_word, [])
word = [x[0] for x in seg_word]  # the word itself
nature = [x[1] for x in seg_word]  # its POS tag
content_userid = [[x] * y for x, y in zip(list(data[0]), list(n_word))]
content_userid = sum(content_userid, [])  # user ID, repeated once per token
content_star = [[x] * y for x, y in zip(list(data[2]), list(n_word))]
content_star = sum(content_star, [])  # comment rating (the label), repeated once per token
result = pd.DataFrame({"index_content": index_content,
                       "word": word,
                       "nature": nature,
                       "content_userid": content_userid,
                       "content_star": content_star})
# Drop punctuation tokens
result = result[result['nature'] != 'x']  # POS tag 'x' marks punctuation/non-words
# Drop stop words
with open('stoplist.csv', 'r', encoding='utf-8') as stop_file:
    stop = [x.replace('\n', '') for x in stop_file.readlines()]
word = list(set(word) - set(stop))
result = result[result['word'].isin(word)]
# Build a column giving each word's position within its comment
n_word = list(result.groupby(by=['index_content'])['index_content'].count())
index_word = [list(np.arange(0, y)) for y in n_word]
index_word = sum(index_word, [])  # position of each word within its comment
# Combine comment ID, word position, word, POS tag, and rating
result['index_word'] = index_word
# Keep only comments that contain at least one noun (POS tag containing 'n')
ind = result[['n' in x for x in result['nature']]]['index_content'].unique()
result = result[[x in ind for x in result['index_content']]]
# Write out the result
result.to_csv("test2.csv", index=False, encoding='utf-8')
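As written, each row of the output file is a single word, so the training step below vectorizes one word at a time. If you would rather train on whole comments, the tokens can be grouped back into space-joined documents first; a sketch under that assumption (grouped.csv is a file name invented here):

import pandas as pd

result = pd.read_csv('test2.csv', encoding='utf-8')
# Rebuild one space-joined document per comment, keeping its rating as the label
docs = result.groupby('index_content').agg(
    text=('word', lambda w: ' '.join(map(str, w))),
    star=('content_star', 'first'),
)
docs.to_csv('grouped.csv', index=False, encoding='utf-8')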
3. Model training: read in the segmented text, train several models, compare their strengths and weaknesses, and pick the best-suited one for classification.
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn import metrics
df = pd.read_csv("train.csv", encoding='utf-8')
x = df['word']  # words
y = df['content_star']  # labels
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=1)  # train/test split
# TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_df=0.80, min_df=2)
# SVM classifier (SGDClassifier with its default hinge loss is a linear SVM trained by SGD)
svm = SGDClassifier()
pipe = make_pipeline(tfidf_vectorizer, svm)
pipe.fit(x_train, y_train)
y_pred = pipe.predict(x_test)  # predict
print('SVM')
print(metrics.classification_report(y_test, y_pred))
# KNN classifier (k-nearest neighbors)
Knn = KNeighborsClassifier()
pipe1 = make_pipeline(tfidf_vectorizer, Knn)
pipe1.fit(x_train, y_train)
y_pred1 = pipe1.predict(x_test)  # predict
print('KNN')
print(metrics.classification_report(y_test, y_pred1))
# Logistic regression classifier
Lr = LogisticRegression()
pipe2 = make_pipeline(tfidf_vectorizer, Lr)
pipe2.fit(x_train, y_train)
y_pred2 = pipe2.predict(x_test)  # predict
print('LRC')
print(metrics.classification_report(y_test, y_pred2))
# Naive Bayes classifier
Nb = MultinomialNB()
pipe3 = make_pipeline(tfidf_vectorizer, Nb)
pipe3.fit(x_train, y_train)
y_pred3 = pipe3.predict(x_test)  # predict
print('bayes')
print(metrics.classification_report(y_test, y_pred3))
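A single 90/10 split can be noisy, so the comparison between the four models is steadier under cross-validation. A minimal sketch reusing the estimators above (5 folds and macro-F1 are arbitrary choices here, not part of the original setup):

from sklearn.model_selection import cross_val_score

models = {'SVM': svm, 'KNN': Knn, 'LRC': Lr, 'bayes': Nb}
for name, model in models.items():
    # cross_val_score clones the pipeline, so each fold is fit from scratch
    pipe_cv = make_pipeline(TfidfVectorizer(max_df=0.80, min_df=2), model)
    scores = cross_val_score(pipe_cv, x, y, cv=5, scoring='f1_macro')
    print(f'{name}: mean macro-F1 = {scores.mean():.3f} (+/- {scores.std():.3f})')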