Goal: use the snownlp package to run sentiment analysis on JD.com comments.
Involves: MySQL, snownlp, pandas, and related packages.
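Note: before the full script, a minimal sketch of the SnowNLP sentiment API it relies on (the example sentence is made up):

from snownlp import SnowNLP

s = SnowNLP('这个手机很好用')  # any piece of Chinese text
print(s.sentiments)           # float in [0, 1]; closer to 1 means more positive

The full script is structured as follows: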
from snownlp import SnowNLP
from snownlp import sentiment
import pandas as pd
import os
import random

# jd_comments.csv is produced by jd_crawler.py (below) and has no header row;
# columns: 0 = good (product tag), 1 = content, 2 = eval (3 = good, 1 = poor), 3 = created_at
df_train = pd.read_csv('jd_comments.csv', header=None)
if __name__ == '__main__':
    # 1. Data collection -- only needs to run once.
    # df_train = pd.read_csv('jd_comments.csv', header=None)
    # df_train[df_train[2] == 3].iloc[:, 1].to_csv('pos.txt', sep='\t', index=False)
    # df_train[df_train[2] == 1].iloc[:, 1].to_csv('neg.txt', sep='\t', index=False)

    # 2. Training.
    # neg_path = os.path.abspath(os.path.join(os.getcwd(), 'neg.txt'))
    # pos_path = os.path.abspath(os.path.join(os.getcwd(), 'pos.txt'))
    # mod_path = os.path.abspath(os.path.join(os.getcwd(), 'sentiment.marshal'))
    # print(mod_path)
    # sentiment.train(neg_path, pos_path)
    # sentiment.save(mod_path)
    # print(pos_path)
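    # Note (assumption, based on snownlp's sentiment module source): train()
    # replaces the in-process classifier, so predictions made afterwards in the
    # same run already use the new model; to reuse it in a later run, call
    # sentiment.load(mod_path). On Python 3 the saved file gets a '.3' suffix.
    # sentiment.load(mod_path)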
    # 3. Spot-check a random comment.
    # rand = random.randint(0, df_train.shape[0] - 1)  # randint is inclusive on both ends
    # print(list(df_train.iloc[rand]))
    #
    # df_test_text = df_train.iloc[rand, 1]
    # s = SnowNLP(df_test_text)
    # print(s.sentiments)
    # print(df_train.shape[0])
    # 4. Evaluate recognition accuracy on the training set itself.
    prob_list = []
    for i in range(0, df_train.shape[0]):
        s = SnowNLP(df_train.iloc[i, 1])
        prob = round(s.sentiments)  # s.sentiments is a float in [0, 1]; round to a 0/1 label
        prob_list.append(prob)
    df_train[df_train.shape[1]] = prob_list  # append predictions as a new column
    df_train.columns = ['good', 'content', 'eval', 'created_at', 'prob']
    df_train_result = df_train.loc[:, ['content', 'eval', 'prob']].copy()
    df_train_result['eval'] = df_train_result['eval'].map({3: 1, 1: 0})  # 3 = good -> 1, 1 = poor -> 0
    accurate = df_train_result[df_train_result['eval'] == df_train_result['prob']].shape[0] / df_train_result.shape[0]
    # df_train_result.to_csv('new_result.csv', index=False)
    # print('Accuracy with the newly trained model:', accurate)
    df_train_result.to_csv('origin_result.csv', index=False)
    print('Accuracy with the original model:', accurate)
Note:
Accuracy comparison between the newly trained model and the original bundled model:
Accuracy with the original model: 0.6932203389830508
Accuracy with the new model: 0.9050847457627119
Note: the figures above were measured on the training set itself; normally the model should be evaluated on new, held-out data.
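A minimal sketch of such a held-out evaluation (assuming the same jd_comments.csv layout as above, and assuming snownlp's behavior that sentiment.train() swaps in the new classifier in-process; the 80/20 split and random_state are arbitrary choices):

import pandas as pd
from snownlp import SnowNLP, sentiment

df = pd.read_csv('jd_comments.csv', header=None)
df.columns = ['good', 'content', 'eval', 'created_at']

# Hold out 20% of the rows for testing; train only on the remaining 80%.
test = df.sample(frac=0.2, random_state=42)
train = df.drop(test.index)

train.loc[train['eval'] == 3, 'content'].to_csv('pos.txt', sep='\t', index=False, header=False)
train.loc[train['eval'] == 1, 'content'].to_csv('neg.txt', sep='\t', index=False, header=False)
sentiment.train('neg.txt', 'pos.txt')
sentiment.save('sentiment.marshal')  # optional: persist the model for later runs

# Score only the held-out rows with the freshly trained model.
pred = test['content'].apply(lambda text: round(SnowNLP(text).sentiments))
truth = test['eval'].map({3: 1, 1: 0})
print('Held-out accuracy:', (pred == truth).mean())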
Use Python to crawl the comments on second-hand Apple phones from JD.com and store them in a MySQL database (this turned out to be unnecessary) and in a CSV file. The code of jd_crawler.py is as follows:
import requests
from config import *    # project-local module (not shown); provides the Request proxy helper
import re
import json
import math
from retrying import retry
from sql import jd_sql  # project-local module (not shown); MySQL insert helper
import csv
class Jd_Comments(object):
    def __init__(self):
        self.req = Request()
        # Summary endpoint: returns total / good / poor comment counts as JSONP.
        self.start_url = "https://club.jd.com/comment/productCommentSummaries.action?referenceIds=37245978364&callback=jQuery2257914&_=1567839796282"
        # Paged comment endpoint: score=%s (3 = good, 1 = poor), page=%s, 10 comments per page.
        self.base_url = "https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv1182&productId=37245978364&score=%s&sortType=5&page=%s" \
                        "&pageSize=10&isShadowSku=0&fold=1"
        self.headers = {
            'Referer': 'https://item.jd.com/37245978364.html'
        }
        self.sql = jd_sql
    def get_num(self):
        proxies, headers = self.req.proxy(type2=0, headers=self.headers)
        response = requests.get(url=self.start_url, headers=headers, proxies=proxies)
        if response.status_code in [200, 201]:
            # Strip the JSONP wrapper jQuery2257914(...); to get at the raw JSON.
            data = re.findall(r'jQuery2257914\((.*?)\);', response.text, re.S)[0]
            data_json = json.loads(data)
            CommentsCount = data_json.get('CommentsCount')[0]
            PoorCount = CommentsCount.get('PoorCount')
            GoodCount = CommentsCount.get('GoodCount')
            CommentCount = CommentsCount.get('CommentCount')
            return GoodCount, PoorCount, CommentCount
        else:
            print("Failed to fetch the good/poor comment counts!!")
    def create_urls(self, rank, num):
        urls = []
        # 10 comments per page, so ceil(num / 10) pages in total.
        for i in range(0, math.ceil(num / 10)):
            url = self.base_url % (rank, i)
            print(url)
            urls.append(url)
        return urls
    @retry(stop_max_attempt_number=3, wait_random_min=1000, wait_random_max=2000)
    def spider_one(self, rank, url):
        proxies, headers = self.req.proxy(type2=0, headers=self.headers)
        response = requests.get(url=url, headers=headers, proxies=proxies)
        if response.status_code in [200, 201] and response.text:
            # Strip the JSONP wrapper fetchJSON_comment98vv1182(...);.
            data = re.findall(r'fetchJSON_comment98vv1182\((.*?)\);', response.text, re.S)[0]
            data_json = json.loads(data)
            comments = data_json.get('comments')
            if comments:
                items = []
                for one in comments:
                    item = {}
                    item['good'] = '二手apple ' + one.get('referenceId')
                    item['content'] = one.get('content').replace('\n', '')
                    item['eval'] = rank
                    item['created_at'] = one.get('creationTime')
                    # Store into the MySQL database.
                    # self.sql.insert_one(item)
                    # Save to a local CSV file.
                    self.save_to_csv(filename='jd_comments', item=item.values())
                    print(item)
                    items.append(item)
                return items
            else:
                return False
        else:
            print("Request failed!!")
    def spider_many(self):
        GoodCount, PoorCount, CommentCount = self.get_num()
        print(GoodCount, PoorCount, CommentCount)
        # Crawl good comments (score=3) page by page; stop once a page comes back empty.
        good_urls = self.create_urls(rank=3, num=GoodCount)
        for g_url in good_urls:
            result = self.spider_one(rank=3, url=g_url)
            if not result:
                break
        # Crawl poor comments (score=1) the same way.
        poor_urls = self.create_urls(rank=1, num=PoorCount)
        page = 0
        for p_url in poor_urls:
            print('page: ', page)
            page += 1
            result = self.spider_one(rank=1, url=p_url)
            if not result:
                break
    def save_to_csv(self, filename=None, item=None):
        name = '{}.csv'.format(filename)
        # newline='' keeps csv.writer from emitting blank rows on Windows;
        # utf-8-sig lets Excel open the Chinese text correctly.
        with open(name, 'a', encoding='utf-8-sig', newline='') as f:
            f_csv = csv.writer(f)
            f_csv.writerow(item or [])
if __name__ == '__main__':
    jd_comment = Jd_Comments()
    jd_comment.spider_many()
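Note: config.py and sql.py are project-local modules that are not shown here. A minimal, hypothetical stub of each (no real proxy pool, no real MySQL table) so the crawler can run standalone; Request.proxy, its type2 parameter, and jd_sql.insert_one are inferred purely from how they are called above:

# config.py (hypothetical stub)
class Request(object):
    def proxy(self, type2=0, headers=None):
        # The real project presumably rotates proxies and user agents here.
        headers = dict(headers or {})
        headers.setdefault('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)')
        return None, headers  # proxies=None tells requests to use no proxy

# sql.py (hypothetical stub)
class _JdSql(object):
    def insert_one(self, item):
        # The real project inserts item into a MySQL table instead.
        print('would INSERT:', item)

jd_sql = _JdSql()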