# 重要提示!!!!!本文中的部分代码都已经公开发表在论文中,不建议直接引用,可能重复比例比较大,任何导致学术造假的后果请浏览者自行承担!!!
# #!-*- coding:utf-8 -*-
import pandas as pd
import numpy as np
import xlrd
import vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.preprocessing import StandardScaler
from openpyxl import load_workbook # 写入excel
from wordcloud import WordCloud as wc
import jieba # 结巴分词
import matplotlib.pyplot as plt # 绘图
from collections import defaultdict # 字典,用于词频统计
from PIL import Image # 打开图片,用于词云背景层
import cv2
from pyecharts.charts import Bar
import datetime
from pandas import Series
# #######################################################################################
# 1. Load the raw hair-dryer review data and run basic preprocessing.
hair_dryer = pd.read_csv("C:\\Users\\thous\\PycharmProjects\\untitled4\\hair_dryer.csv")
print('hair_dryer删除重复数据前大小:', hair_dryer.shape)
# BUG FIX: DataFrame.drop_duplicates() is not in-place -- the previous bare
# call discarded its result, so no rows were ever removed. Assign it back and
# reset the index so later positional joins stay aligned.
hair_dryer = hair_dryer.drop_duplicates().reset_index(drop=True)
print('hair_dryer删除重复数据后大小:', hair_dryer.shape)
# Take the review text only after deduplication so its rows match the frame.
hair_dryer_review = hair_dryer['review_body']
# Report missing values per column (previously computed but discarded).
print(hair_dryer.isnull().sum())
# #######################################################################################
# 2. Score every review with VADER and merge the four sentiment columns
#    (neg / neu / pos / compound) back into the main frame.
analyzer = SentimentIntensityAnalyzer()
hair_dryer_sentiments = []
for review_body in hair_dryer_review:
    hair_dryer_sentiments.append(analyzer.polarity_scores(review_body))
# polarity_scores returns a dict per review; a DataFrame built from the list
# lines up row-for-row with hair_dryer, so a plain index join attaches it.
hair_dryer = hair_dryer.join(pd.DataFrame(hair_dryer_sentiments))
# #######################################################################
# 4. Parse the review_date column into datetime for time-series analysis.
hair_dryer['review_date'] = pd.to_datetime(hair_dryer['review_date'])
# 5. Use the parsed dates as the DataFrame index (the column itself is kept).
hair_dryer.set_index(hair_dryer['review_date'], inplace=True)
# Dead exploration code preserved below as a string literal, together with a
# sample of its weekday-count output.
"""
hair_dryer['weekday'] = hair_dryer['review_date'].dt.weekday
df1 = hair_dryer.set_index('review_date')
df1.resample('D').size().sort_values(ascending=False).head(100)
df2 = df1.resample('M').size().to_period()
# df2 = df2.reset_index(df2['review_date'], inplace=True)
print(hair_dryer['weekday'])
weekday
0 1777
1 1702
2 1641
3 1849
4 1531
5 1536
6 1434
"""
# ###################################################################
# Map a datetime-like value to its English weekday name (e.g. 'Monday').
def getWeek(x):
    """Return the English weekday name for *x*.

    x: any object exposing .weekday() (datetime.date, datetime.datetime,
       pandas.Timestamp). Monday maps to 'Monday', ..., Sunday to 'Sunday'.
    """
    # The original also built a 'MM-DD' string via strftime/split but never
    # used it; that dead computation is removed.
    weekDict = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday',
                4: 'Friday', 5: 'Saturday', 6: 'Sunday'}
    return weekDict[x.weekday()]
# Derive a weekday-name column and a numeric month column per review.
hair_dryer['weekday'] = hair_dryer['review_date'].map(getWeek)
weekday_hair_dryer = hair_dryer['weekday']
# review_date is already datetime (parsed above), so the second
# pd.to_datetime round-trip was redundant; and the previous code created a
# 'month' column, copied it to 'months', then dropped 'month' -- writing
# 'months' directly has the same end result.
hair_dryer['months'] = hair_dryer['review_date'].dt.month
month_hair_dryer = hair_dryer['months']
# #######################################################################################
# Growth of purchase volume: count reviews per calendar day.
hair_dryer['date'] = pd.to_datetime(hair_dryer['review_date']).dt.strftime('%Y-%m-%d')
date_num = hair_dryer['date']
# BUG FIX: the previous `date_num.apply(pd.value_counts)` applied value_counts
# to every scalar element and discarded the result -- a no-op. value_counts on
# the Series itself is what was intended (and is what data_counts holds).
data_counts = date_num.value_counts()
# print(data_counts)
date_num_describe = date_num.describe()
# Sample of date_num_describe output:
#   count 11470
#   unique 2307
#   top 2010-08-05
#   freq 146
hair_dryer_describe = hair_dryer.describe()
# print(hair_dryer_describe)
hair_dryer_corr = hair_dryer.corr()
# print(hair_dryer_corr)
# data_counts.to_csv('hair_dryer_data_counts.csv')
# hair_dryer.to_csv('hair_dryer_new.csv')
# Manual preprocessing of the exported CSV was done by hand (Excel).
# Preprocessing was finished by hand; load the processed data set.
# #################################################################################
hair_dryer_new = pd.read_csv("C:\\Users\\thous\\PycharmProjects\\untitled4\\hair_dryer_new.csv")
# #################################################################################
# Review count per product id, largest first -- the top/bottom 10 products
# fall straight out of this ranking (their mean ratings were cross-checked
# in Excel via a Boston matrix).
product_counts = hair_dryer_new['product_id'].value_counts()
hair_dryer_productid = product_counts.sort_values(ascending=False)
print("hair_dryer_productid", hair_dryer_productid)
# hair_dryer_productid.to_csv('hair_dryer_productid.csv')
# Note: Series.plot(kind='scatter') raises
# "ValueError: plot kind scatter can only be used for data frames".
# #################################################################################
# Count how many times a given word appears in a text file.
# Dead exploration code preserved below as a string literal.
"""
import sys
File_tuple1 = open(r'english.txt') #打开目标文件
File_tuple2 = File_tuple1.read()
File_tuple1.close()
File_list = File_tuple2.split(' ') #以空格来划分文件中的单词
#print(File_list)
x = input('请输入要查询的单词:')
a = 0
i = 0
for i in range(len(File_list)):
if File_list[i]==x:
a+=1
print (x,'在english.txt中出现的次数为',a, '次。')
"""
# ################################################################################################
# Prediction: inspect the distribution of the target variable (star rating).
star_rating_hairdryer = hair_dryer_new['star_rating']
# print("star_rating_hairdryer", hair_dryer_new['star_rating'].value_counts())
# BUG FIX: `sns` (seaborn) was never imported, so sns.countplot raised a
# NameError. A pandas bar plot of the per-rating counts shows the same thing
# using only libraries this file already imports.
star_rating_hairdryer.value_counts().sort_index().plot(kind='bar')
# plt.tick_params(axis='x', labelsize=6)
plt.show()
# 词云1 ###############################################
# Build a word cloud from the full review corpus.
# BUG FIX: the file handle was never closed -- use a with-block.
with open(r"C:\\Users\\thous\\PycharmProjects\\untitled4\\hair_dryer_review.txt",
          'r', encoding='UTF-8') as hair_dryer_review_text:
    hair_dryer_text = hair_dryer_review_text.read()
# Tokenize and re-join with spaces so WordCloud can split on whitespace.
cut_hair_dryer_text = " ".join(jieba.cut(hair_dryer_text))
# Background shape image for the cloud (cv2.imread returns an ndarray mask).
color_mask_hair_dryer = cv2.imread('mask.jpg')
cloud = wc(
    # Set an explicit font to avoid garbled glyphs.
    font_path="C:\\Windows\\Fonts\\Times New Roman.TTF",
    background_color='white',
    mask=color_mask_hair_dryer,
    max_words=2000,
    max_font_size=40
)
# BUG FIX: the configured `cloud` instance above was built and then discarded
# (a bare `wc().generate(...)` was used instead), so the font / mask / limit
# settings never applied. Also removed an orphaned, uncommented continuation
# line of a commented-out statement that made the file a SyntaxError.
wordcloud = cloud.generate(cut_hair_dryer_text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # hide the axes around the image
plt.show()