Sentiment analysis with TextBlob and keyword extraction (LDA) for positive and negative reviews

# Imports carried over from the previous post; not every module is used in this section
import numpy as np
import pandas as pd
import re
import collections
import string

import matplotlib as mb
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import plotnine as p9

import pymongo
from bson import ObjectId

from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from dateutil import parser

from ggplot import *
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

Read in the data

pcfr = pd.read_excel('microwave.xlsx')
# In the line below, the part after '==' is the product name you want; just replace it
#df = pcfr[pcfr['product_title']=='andis 1875-watt fold-n-go ionic hair dryer , silver/black (80020)']
df = pcfr
df.head(3)

Select the relevant columns

c = df[['review_date','review_headline','review_body']].copy()  # .copy() avoids a SettingWithCopyWarning in the next step
#c.info()
#c.review_body

A very important step: to_datetime

# c['review_date'] = c.review_date.apply(lambda x:parser.parse(x))
c['review_date'] = pd.to_datetime(c['review_date'])
c=c.set_index('review_date')


Extract the reviews by year. Note how the DatetimeIndex is used together with loc.

tmp=c.loc['2015']
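
For reference (not part of the original walkthrough), a DatetimeIndex also supports month-level selection and date-range slicing with loc; the dates below are only illustrative:

c_sorted = c.sort_index()                      # range slicing is safest on a sorted index
aug_2015 = c_sorted.loc['2015-08']             # all reviews from August 2015
h1_2015  = c_sorted.loc['2015-01':'2015-06']   # reviews from January through June 2015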


Sentiment analysis

Same as in the previous blog post.
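
As a quick reference before applying it to the whole column: TextBlob's default analyzer returns polarity in [-1, 1] and subjectivity in [0, 1]. The sample sentence below is made up:

blob = TextBlob("This microwave heats quickly, but the door latch feels flimsy.")
print(blob.sentiment)             # Sentiment(polarity=..., subjectivity=...)
print(blob.sentiment.polarity)    # > 0 indicates an overall positive tone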

a = tmp.copy()   # copy the slice to avoid a SettingWithCopyWarning when adding columns below
# compute polarity
def sentiment_calc(text):
    try:
        return TextBlob(text).sentiment.polarity
    except:        # non-string / missing review bodies return None
        return None

# compute subjectivity
def sentiment_calc_sub(text):
    try:
        return TextBlob(text).sentiment.subjectivity
    except:        # non-string / missing review bodies return None
        return None

# a['sentiment'] = a['review_body'].apply(sentiment_calc)
a['polarity'] = a['review_body'].apply(sentiment_calc)

a['subjectivity'] = a['review_body'].apply(sentiment_calc_sub)

A SettingWithCopyWarning may appear here; it can safely be ignored (using .copy() as above avoids it entirely).

a.head(30)


LDA

References for this section: link 1, link 2.

Split into positive and negative reviews

# note: a[a.polarity>0] also works here
data_post = a[a['polarity']>0]
#data_post
postry = data_post['review_body']
#postry
data_neg = a[a['polarity']<0]
#data_neg
negtry = data_neg['review_body']
#negtry
postry.to_csv('comments_正.txt')
negtry.to_csv('comments_负.txt')

A warning may appear from to_csv here as well; it can be ignored.

Remove characters that would interfere with the analysis
Strip useless symbols and short words (words that are too short are treated as meaningless).

Remove numbers if they are not relevant to your analyses. Usually, regular expressions are used to remove numbers.

# To extract keywords for the other polarity, change the filename in open()
# (comments_正 for positive, comments_负 for negative); the rest of the code runs unchanged

with open('comments_负.txt') as fn1:
    textpos = fn1.read()    # read the whole file as one string

# remove numbers
pattern = re.compile(r'\d+')            # compile the regex pattern
textpos = re.sub(pattern, '', textpos)  # replace every match with an empty string

# remove words shorter than 4 characters
pattern2 = re.compile(r'\W*\b\w{1,3}\b')
textpos = re.sub(pattern2, '', textpos)

#print(textpos)
Remove punctuation

The following code removes this set of symbols: [!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~]:

textpos = textpos.translate(str.maketrans('','',string.punctuation))
#print(textpos)
Remove stopwords and count word frequencies

stop_words = set(stopwords.words('english'))
tokens = word_tokenize(textpos)
# note: tokens are not lowercased here, so capitalized stopwords (e.g. 'The') will remain
result = [i for i in tokens if i not in stop_words]
word_counts = collections.Counter(result)
word_counts_top10 = word_counts.most_common(100)  # take the 100 most frequent words (despite the variable name)

# f = open('nz_2005_正.txt','a')
for w, cnt in word_counts_top10:   # loop variable renamed so it does not shadow the DataFrame c
    print(w)
#     f.write(w)
#     f.write('\t')
#     f.write(str(cnt))
#     f.write('\n')

# f.close()
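
The section heading mentions LDA, but the code above only ranks words by raw frequency. Below is a minimal sketch of topic-based keyword extraction on the same negative reviews, assuming scikit-learn is available; negtry is the Series created in the splitting step, and the parameter choices (5 topics, 10 words per topic, 1000 features) are arbitrary illustrations rather than tuned values.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# each review body is treated as one document
docs = negtry.dropna().astype(str).tolist()

vectorizer = CountVectorizer(stop_words='english', max_features=1000)
dtm = vectorizer.fit_transform(docs)                 # document-term count matrix

lda = LatentDirichletAllocation(n_components=5, random_state=0)  # 5 topics is an arbitrary choice
lda.fit(dtm)

terms = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    top_words = [terms[i] for i in topic.argsort()[-10:][::-1]]  # 10 highest-weight words per topic
    print('Topic %d: %s' % (topic_idx, ', '.join(top_words)))

The same sketch works for the positive reviews by swapping negtry for postry.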

