import numpy as np
import pandas as pd
import matplotlib as mb
import matplotlib.pyplot as plt
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
import re
import collections
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords
import numpy as np
import pymongo,pandas as pd
from bson import ObjectId
import matplotlib as mb
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import plotnine as p9
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
from dateutil import parser
from ggplot import *
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
# Load the full review table from the Excel export.
pcfr = pd.read_excel('microwave.xlsx')
# To analyse a single product, put its title after '==' and use the filtered
# frame instead of the whole table:
# df = pcfr[pcfr['product_title']=='andis 1875-watt fold-n-go ionic hair dryer , silver/black (80020)']
df = pcfr
df.head(3)  # notebook preview; has no visible effect when run as a script

# Keep only the columns used below. The explicit copy makes `c` independent of
# `pcfr`, so assigning the parsed dates does not write into a slice of the
# source frame (pandas' SettingWithCopyWarning case).
c = df[['review_date', 'review_headline', 'review_body']].copy()

# Very important step: convert the dates with to_datetime so the frame can be
# indexed and sliced by date.
# Alternative kept for reference:
# c['review_date'] = c.review_date.apply(lambda x:parser.parse(x))
c['review_date'] = pd.to_datetime(c['review_date'])
c = c.set_index('review_date')

# Extract the reviews of one year: .loc with a year string on the date index
# selects every row dated in that year. Copy the selection so new columns can
# be added to it later without touching `c`.
tmp = c.loc['2015'].copy()

# `a` is the working frame for the sentiment steps that follow.
a = tmp
# Compute polarity
def sentiment_calc(text):
    """Return the TextBlob polarity score of *text*, or None if it is not a string.

    Args:
        text: The review text to score. Anything that is not a ``str`` (for
            example a missing cell or a numeric cell) is treated as
            unscorable.

    Returns:
        ``TextBlob(text).sentiment.polarity`` for string input, otherwise
        ``None``.
    """
    # Explicit guard instead of a bare ``except``: non-text cells still map to
    # None, but real failures (interrupts, a broken TextBlob setup) are no
    # longer silently turned into None.
    if not isinstance(text, str):
        return None
    return TextBlob(text).sentiment.polarity
# Compute subjectivity
def sentiment_calc_sub(text):
    """Return the TextBlob subjectivity score of *text*, or None if it is not a string.

    Args:
        text: The review text to score. Anything that is not a ``str`` (for
            example a missing cell or a numeric cell) is treated as
            unscorable.

    Returns:
        ``TextBlob(text).sentiment.subjectivity`` for string input, otherwise
        ``None``.
    """
    # Explicit guard instead of a bare ``except``: non-text cells still map to
    # None, but real failures (interrupts, a broken TextBlob setup) are no
    # longer silently turned into None.
    if not isinstance(text, str):
        return None
    return TextBlob(text).sentiment.subjectivity
# Score every review body. assign() returns a new frame with the two score
# columns added, so nothing is written in place into `a`, which may be a slice
# of a larger frame (avoids pandas' SettingWithCopyWarning).
a = a.assign(
    polarity=a['review_body'].apply(sentiment_calc),
    subjectivity=a['review_body'].apply(sentiment_calc_sub),
)
a.head(30)  # notebook preview; has no visible effect when run as a script

# The original notebook listed two reference links for this section; the link
# targets were not preserved in this export.

# Split the reviews by the sign of their polarity (a[a.polarity > 0] is an
# equivalent spelling). Rows whose polarity is exactly 0 or missing fall into
# neither group, because both comparisons are False for them.
data_post = a[a['polarity'] > 0]
postry = data_post['review_body']
data_neg = a[a['polarity'] < 0]
negtry = data_neg['review_body']

# Save each group's review text. The encoding is spelled out so whoever reads
# these files back can open them with the same one.
postry.to_csv('comments_正.txt', encoding='utf-8')
negtry.to_csv('comments_负.txt', encoding='utf-8')
# Remove numbers if they are not relevant to your analysis. Regular expressions are the usual way to strip them.
# This reads the negative reviews. To analyse the positive ones instead,
# change 'comments_负' to 'comments_正' in open() below; everything after it
# runs unchanged.
# The file was written by pandas, whose to_csv default encoding is UTF-8, so
# read it as UTF-8 rather than with the platform's locale encoding.
with open('comments_负.txt', encoding='utf-8') as fn1:
    textpos = fn1.read()  # read the whole file as one string

# Remove numbers.
pattern = re.compile(r'\d+')
textpos = re.sub(pattern, '', textpos)

# Remove words of fewer than 4 characters, together with any non-word
# characters immediately before them.
pattern2 = re.compile(r'\W*\b\w{1,3}\b')
textpos = re.sub(pattern2, '', textpos)

# Remove every character in string.punctuation:
# !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
textpos = textpos.translate(str.maketrans('', '', string.punctuation))

stop_words = set(stopwords.words('english'))
tokens = word_tokenize(textpos)
# The NLTK stop-word list is lowercase, so compare case-insensitively;
# otherwise capitalised stop words such as "This" or "They" slip through.
result = [token for token in tokens if token.lower() not in stop_words]

word_counts = collections.Counter(result)
# Despite the name, this holds the 100 most frequent words.
word_counts_top10 = word_counts.most_common(100)

# Print the words only. The count is deliberately not bound to `c`, which
# would overwrite the module-level DataFrame of the same name.
# To save the results instead, write "<word>\t<count>\n" per entry to a file
# such as 'nz_2005_正.txt' opened in append mode.
for word, _count in word_counts_top10:
    print(word)