Sentiment Analysis with Machine Learning

Steps for text sentiment analysis:
1. Data scraping:
Use the browser's developer tools to capture the network requests and find the real comment API URL, then crawl the data from that URL with a scraper.
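The crawler below assumes the captured URL returns JSON shaped roughly as follows; the field names are inferred from the parsing code further down, and the values are placeholders rather than real data:

# Rough shape of the JSON the comment API is expected to return
# (field names inferred from parse_one_page below; the values are placeholders)
example_response = {
    "comments": [
        {"id": 12345, "content": "sample comment text", "score": 5},
    ]
}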

import requests
import json
import csv
class comments_spider(object):

    def __init__(self, filename='y'):
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36 Edg/92.0.902.84"
        }
        # open the output file; newline='' avoids blank lines from csv.writer on Windows
        self.fp = open(f'./{filename}.csv', 'w', encoding='utf-8', newline='')
        self.csv_writer = csv.writer(self.fp)
        print('Spider started!')

    def parse_one_page(self, url):
        response = requests.get(url, headers=self.header)  # send the request as if from a browser client
        js_data = json.loads(response.text)  # parse the JSON response into a dict
        # extract the fields we need from the comment list
        comment_list = js_data['comments']
        for comment in comment_list:
            star = comment.get('score')
            userid = comment.get('id')
            content = comment.get('content')
            content = ' '.join(content.split('\n'))  # flatten multi-line comments into one line
            self.csv_writer.writerow([userid, content, star])

    def parse_max_page(self):
        for i in range(30):
            print(f'Fetching page {i}!')
            # build the URL for this page
            url = f'https://club.jd.com/comment/productPageComments.action?&productId=100011151038&score=3&sortType=5&page={i}&pageSize=10&isShadowSku=0&fold=1'
            # parse this page of comments
            print(f'Parsing page {i}!')
            self.parse_one_page(url)

    def close(self):
        self.fp.close()
        print('Done!')
if __name__ == '__main__':
    js_spider = comments_spider()
    js_spider.parse_max_page()
    js_spider.close()
2. Data preprocessing: first deduplicate the scraped data and drop missing values, then tokenize the comments with the jieba library and write the result out. Each row of the crawled CSV holds (user id, comment text, star rating), which the code below reads as columns 0, 1 and 2.
import pandas as pd
import re
import jieba.posseg as psg
import numpy as np


# Data cleaning
data = pd.read_csv('1600train.csv', encoding='utf-8', header=None)
l1 = len(data)
# drop exact duplicate rows
data = data.drop_duplicates()
l2 = len(data)
print('%s duplicate comments removed' % (l1 - l2))
# reset the index (assign the result back, otherwise the old index is kept)
data = data.reset_index(drop=True)
# drop rows with missing values
data = data.dropna()
# strip digits, the brand/product words "京东", "京东商城", "美的", "热水器", "电热水器" and punctuation from the comments
content = data[1]  # column 1 holds the comment text (column 0 is the user id)
useless = re.compile(r'[0-9]|京东|美的|电热水器|热水器|京东商城|[!,!。,~、?]+')  # regex for tokens to remove
content = content.apply(lambda x: useless.sub('', x))
data[1] = content

# Tokenization
worker = lambda s: [(x.word, x.flag) for x in psg.cut(s)]  # simple tokenizer returning (word, POS flag) pairs
seg_word = content.apply(worker)  # tokenize every comment
# Reshape the tokens into a data frame: one row per word, with the id of the comment it belongs to and its position in that comment
n_word = seg_word.apply(lambda x: len(x))  # number of tokens in each comment
n_content = [[x+1]*y for x, y in zip(list(seg_word.index), list(n_word))]
index_content = sum(n_content, [])  # flatten the nested lists: id of the comment each word belongs to
seg_word = sum(seg_word, [])
word = [x[0] for x in seg_word]    # the word itself
nature = [x[1] for x in seg_word]  # its part-of-speech flag
content_userid = [[x] * y for x, y in zip(list(data[0]), list(n_word))]
content_userid = sum(content_userid, [])  # user id repeated for every word of the comment
content_star = [[x] * y for x, y in zip(list(data[2]), list(n_word))]
content_star = sum(content_star, [])  # star rating (the sentiment label) repeated likewise

result = pd.DataFrame({"index_content": index_content,
                       "word": word,
                       "nature": nature,
                       "content_userid": content_userid,
                       "content_star":content_star})
# remove punctuation tokens ('x' is jieba's flag for punctuation)
result = result[result['nature'] != 'x']
# remove stopwords
stop_path = open('stoplist.csv', 'r', encoding='utf-8')
stop = stop_path.readlines()
stop_path.close()
stop = [x.replace('\n', '') for x in stop]
word = list(set(word) - set(stop))
result = result[result['word'].isin(word)]
# build a column giving each word's position inside its comment
n_word = list(result.groupby(by=['index_content'])['index_content'].count())
index_word = [list(np.arange(0, y)) for y in n_word]
index_word = sum(index_word, [])  # position of each word within its comment
result['index_word'] = index_word
# keep only comments that contain at least one noun (POS flag containing 'n')
ind = result[['n' in x for x in result['nature']]]['index_content'].unique()
result = result[[x in ind for x in result['index_content']]]
# write out the result: comment id, word, POS flag, user id, star rating, word position
result.to_csv("test2.csv", index=False, encoding='utf-8')
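The training script below reads a file named train.csv with a 'word' column and a 'content_star' column. A minimal sketch of how the per-word output above could be turned into that format, by joining each comment's tokens back into one space-separated string, might look like this (the exact layout of train.csv is an assumption, not taken from the original post):

# Sketch only: rebuild one space-separated document per comment so that
# TfidfVectorizer later sees whole comments rather than single tokens.
# Column names follow the DataFrame built above; the train.csv layout is assumed.
docs = result.groupby('index_content').agg(
    word=('word', ' '.join),                 # join the comment's tokens with spaces
    content_star=('content_star', 'first'))  # one star rating per comment
docs.to_csv('train.csv', index=False, encoding='utf-8')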

3. Model training: read in the tokenized text, train several models, compare their strengths and weaknesses, and pick the most suitable one for classification.

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn import metrics

df = pd.read_csv("train.csv", encoding='utf-8')
x = df['word']          # text (the tokenized comments)
y = df['content_star']  # labels (star ratings)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=1)  # split into training and test sets

# TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_df=0.80, min_df=2)

# Linear SVM (SGDClassifier with the default hinge loss is a linear SVM trained by SGD)
svm = SGDClassifier()
pipe = make_pipeline(tfidf_vectorizer, svm)
pipe.fit(x_train, y_train)
y_pred = pipe.predict(x_test)  # predict
print('SVM')
print(metrics.classification_report(y_test, y_pred))

# K-nearest neighbours classifier
Knn = KNeighborsClassifier()
pipe1 = make_pipeline(tfidf_vectorizer, Knn)
pipe1.fit(x_train, y_train)
y_pred1 = pipe1.predict(x_test)  # predict
print('KNN')
print(metrics.classification_report(y_test, y_pred1))

# Logistic regression classifier
Lr = LogisticRegression()
pipe2 = make_pipeline(tfidf_vectorizer, Lr)
pipe2.fit(x_train, y_train)
y_pred2 = pipe2.predict(x_test)  # predict
print('LRC')
print(metrics.classification_report(y_test, y_pred2))

# Multinomial naive Bayes classifier
Nb = MultinomialNB()
pipe3 = make_pipeline(tfidf_vectorizer, Nb)
pipe3.fit(x_train, y_train)
y_pred3 = pipe3.predict(x_test)  # predict
print('bayes')
print(metrics.classification_report(y_test, y_pred3))
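To make the comparison the section calls for more concrete, a small sketch like the following could score all four fitted pipelines on the held-out set and classify a new comment. The sample comment string is made up for illustration, and jieba is used only to tokenize it the same way the training data was prepared:

import jieba

# Compare the four fitted pipelines on the test set (accuracy only, as a rough summary)
for name, model in [('SVM', pipe), ('KNN', pipe1), ('LRC', pipe2), ('bayes', pipe3)]:
    acc = metrics.accuracy_score(y_test, model.predict(x_test))
    print(f'{name}: accuracy = {acc:.3f}')

# Classify a new comment with one of the pipelines; the text below is a made-up example
new_comment = '加热很快,安装师傅也很专业'
new_doc = ' '.join(jieba.lcut(new_comment))  # tokenize and space-join, matching the training format
print(pipe2.predict([new_doc]))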
