2020 MCM (Mathematical Contest in Modeling) Problem C: Code Walkthrough

Important note!!! Parts of the code in this article have already been published in a paper, so copying it directly is not recommended: the overlap could be substantial, and readers bear full responsibility for any resulting academic misconduct!!!

# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
import xlrd
import vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.preprocessing import StandardScaler
from openpyxl import load_workbook  # write to Excel
from wordcloud import WordCloud as wc
import jieba  # jieba word segmentation
import matplotlib.pyplot as plt  # plotting
import seaborn as sns  # statistical plots, needed by sns.countplot below
from collections import defaultdict  # dict with default values, used for word-frequency counting
from PIL import Image  # open images, used for the word-cloud background layer
import cv2
from pyecharts.charts import Bar
import datetime
from pandas import Series


# #######################################################################################
# 1. Read the data and preprocess it
# data = pd.read_csv("C:\\Users\\thous\\PycharmProjects\\untitled4\\2018Q3.csv")
hair_dryer = pd.read_csv("C:\\Users\\thous\\PycharmProjects\\untitled4\\hair_dryer.csv")
hair_dryer_review = hair_dryer['review_body']


# Drop duplicate rows; drop_duplicates() returns a new DataFrame, so assign the result back
print('hair_dryer shape before dropping duplicates:', hair_dryer.shape)
hair_dryer = hair_dryer.drop_duplicates().reset_index(drop=True)
print('hair_dryer shape after dropping duplicates:', hair_dryer.shape)
hair_dryer_review = hair_dryer['review_body']  # re-extract so the review Series matches the deduplicated rows

# Check for missing values
hair_dryer.isnull().sum()
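# Optional (a minimal sketch, not in the original workflow): if the check above reports
# missing review_body values, they could be dropped before sentiment scoring, e.g.
# hair_dryer = hair_dryer.dropna(subset=['review_body']).reset_index(drop=True)
# hair_dryer_review = hair_dryer['review_body']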


# #######################################################################################
# 2. Compute sentiment scores for the product reviews (only hair_dryer is shown here)
analyzer = SentimentIntensityAnalyzer()
# Cast each review to str: missing reviews are read in as float NaN and otherwise break VADER
hair_dryer_sentiments = [analyzer.polarity_scores(str(review_body)) for review_body in hair_dryer_review]
# Note: polarity_scores() returns a dict, so do not feed the result straight into float()
# ("float() argument must be a string or a number, not 'dict'")
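
# Quick sanity check (an added illustration, not in the original script): each polarity_scores()
# call returns a dict with neg/neu/pos/compound keys, which is why pd.DataFrame() can later
# expand the list of dicts into four numeric columns.
print(analyzer.polarity_scores("This hair dryer works great and dries my hair quickly"))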


# 'float' object has no attribute 'split': float values need to be converted to strings first
# Feature format conversion (reference snippet from another dataset, kept for later use)
# data['term'] = data['term'].str.replace(' months', '').astype('float')
# col = data.select_dtypes(include=['int64', 'float64']).columns
# col = col.drop('loan_status')  # drop the target variable


# Join the sentiment scores (neg/neu/pos/compound) onto hair_dryer as new columns;
hair_dryer = hair_dryer.join(pd.DataFrame(hair_dryer_sentiments))


# #######################################################################
# 4. Convert the review_date column to datetime; time-based analysis
hair_dryer['review_date'] = pd.to_datetime(hair_dryer['review_date'])

# 5. Set the new datetime column as the index;
hair_dryer.set_index(hair_dryer['review_date'], inplace=True)


"""
hair_dryer['weekday'] = hair_dryer['review_date'].dt.weekday
df1 = hair_dryer.set_index('review_date')
df1.resample('D').size().sort_values(ascending=False).head(100)
df2 = df1.resample('M').size().to_period()
# df2 = df2.reset_index(df2['review_date'], inplace=True)
print(hair_dryer['weekday'])

weekday
0    1777
1    1702
2    1641
3    1849
4    1531
5    1536
6    1434
"""

# ###################################################################
# Map each review date to its weekday name, e.g. Saturday
def getWeek(x):
    weekDict = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}
    return weekDict[x.weekday()]

hair_dryer['weekday'] = hair_dryer['review_date'].map(getWeek)
# print("weekday_hair_dryer", hair_dryer['weekday'])
weekday_hair_dryer = hair_dryer['weekday']
# hair_dryer = hair_dryer.join(pd.DataFrame(weekday_hair_dryer))
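
# A minimal visualization sketch (added, not in the original script): bar-plot the weekday
# distribution of reviews with plain pandas/matplotlib, matching the counts in the docstring above.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
hair_dryer['weekday'].value_counts().reindex(weekday_order).plot(kind='bar', rot=45)
plt.title('hair_dryer reviews per weekday')
plt.tight_layout()
plt.show()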


hair_dryer['month'] = pd.to_datetime(hair_dryer['review_date']).dt.month
# print("hair_dryer['month']", hair_dryer['month'])
month_hair_dryer = hair_dryer['month']
hair_dryer['months'] = hair_dryer['month']

hair_dryer = hair_dryer.drop(["month"], axis=1)


# #######################################################################################
# Growth of purchase (review) volume over time

hair_dryer['date'] = pd.to_datetime(hair_dryer['review_date']).dt.strftime('%Y-%m-%d')

date_num = hair_dryer['date']
# date_num.apply(pd.value_counts) applied value_counts to each string element, which is not
# what we want; the per-day counts come from value_counts() on the whole column below

data_counts = hair_dryer['date'].value_counts()
# print(data_counts)
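
# A minimal sketch (added, not in the original script): with review_date as the index,
# resampling by month shows how review volume (a proxy for purchases) grows over time.
monthly_counts = hair_dryer.resample('M').size()
monthly_counts.plot()
plt.title('hair_dryer reviews per month')
plt.xlabel('month')
plt.ylabel('number of reviews')
plt.show()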


date_num_describe = date_num.describe()
"""
print(date_num_describe)
count          11470
unique          2307
top       2010-08-05
freq             146
"""

hair_dryer_describe = hair_dryer.describe()
# print(hair_dryer_describe)

hair_dryer_corr = hair_dryer.corr()  # on pandas >= 2.0 pass numeric_only=True so text columns are skipped
# print(hair_dryer_corr)
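
# A minimal sketch (added, not in the original script): a heatmap makes the correlation matrix
# easier to read, e.g. how star_rating relates to the VADER neg/neu/pos/compound scores.
sns.heatmap(hair_dryer_corr, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('hair_dryer correlation matrix')
plt.show()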


# data_counts.to_csv('hair_dryer_data_counts.csv')
# hair_dryer.to_csv('hair_dryer_new.csv')
# Export to CSV, preprocess by hand, then reload the cleaned file below


# #################################################################################
# Preprocessing is done; load the cleaned data
hair_dryer_new = pd.read_csv("C:\\Users\\thous\\PycharmProjects\\untitled4\\hair_dryer_new.csv")
# hair_dryer = pd.read_csv("C:\\Users\\thous\\PycharmProjects\\untitled4\\hair_dryer.csv")
# hair_dryer_review = hair_dryer['review_body']


# #################################################################################

# Count reviews per product_id and plot them as a scatter chart
hair_dryer_productid = hair_dryer_new['product_id'].value_counts().sort_values(ascending=False)
print("hair_dryer_productid", hair_dryer_productid)

# The top-10 and bottom-10 products are easy to read off here
# To match them with their average star ratings: originally done in Excel (BCG matrix); a pandas version is sketched below
# hair_dryer_productid.to_csv('hair_dryer_productid.csv')
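
# A minimal pandas sketch (added; the original did this step by hand in Excel): review count
# and mean star rating per product_id, the two axes of the BCG-style matrix.
product_summary = hair_dryer_new.groupby('product_id')['star_rating'].agg(['count', 'mean'])
product_summary = product_summary.sort_values('count', ascending=False)
print(product_summary.head(10))  # top-10 products by review volume
print(product_summary.tail(10))  # bottom-10 products by review volume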


# hair_dryer_productid.plot(kind='scatter')
# ValueError: plot kind scatter can only be used for data frames
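
# A minimal fix sketch (added, not in the original script): scatter plots need a DataFrame
# with numeric x/y columns, so turn the value_counts Series into one and plot rank vs. count.
productid_df = hair_dryer_productid.reset_index()
productid_df.columns = ['product_id', 'review_count']
productid_df['rank'] = range(1, len(productid_df) + 1)
productid_df.plot(kind='scatter', x='rank', y='review_count')
plt.title('Reviews per hair_dryer product (ranked)')
plt.show()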

# #################################################################################
# Count how many times a given word appears in a text file (reference snippet, kept for later use)
"""
import sys

File_tuple1 = open(r'english.txt')  # open the target file
File_tuple2 = File_tuple1.read()
File_tuple1.close()
File_list = File_tuple2.split(' ')  # split the file contents into words on spaces
# print(File_list)


x = input('Enter the word to look up: ')
a = 0
for i in range(len(File_list)):
    if File_list[i] == x:
        a += 1
print(x, 'appears in english.txt', a, 'times.')
"""

# ################################################################################################
# Prediction
# hair_dryer_new = pd.read_csv("C:\\Users\\thous\\PycharmProjects\\untitled4\\hair_dryer_new.csv")


# Inspect the target variable (star_rating distribution)
star_rating_hairdryer = hair_dryer_new['star_rating']
# print("star_rating_hairdryer", hair_dryer_new['star_rating'].value_counts())
sns.countplot(x=hair_dryer_new['star_rating'])  # pass x= explicitly so this also works on newer seaborn
# plt.tick_params(axis='x', labelsize=6)
plt.show()
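
# A minimal sketch (added; an assumption, not the model from the paper): if the manually cleaned
# hair_dryer_new.csv kept the VADER 'compound' column, a simple linear regression gives a first
# look at how well sentiment alone predicts star_rating.
from sklearn.linear_model import LinearRegression

pred_df = hair_dryer_new[['compound', 'star_rating']].dropna()
X = pred_df[['compound']].values
y = pred_df['star_rating'].values
lr = LinearRegression().fit(X, y)
print('coefficient:', lr.coef_[0], 'intercept:', lr.intercept_)
print('R^2 on the training data:', lr.score(X, y))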



# ############################# Word cloud 1 ###############################

hair_dryer_review_text = open("C:\\Users\\thous\\PycharmProjects\\untitled4\\hair_dryer_review.txt", 'r', encoding='UTF-8')
hair_dryer_text = hair_dryer_review_text.read()
cut_hair_dryer_text = " ".join(jieba.cut(hair_dryer_text))
color_mask_hair_dryer = cv2.imread('mask.jpg')
# coloring=np.array(Image.open("cat_new.jpg"))
# load the background image used as the word-cloud mask
# alice_mask = np.array(Image.open(path.join(d, "alice_mask.png")))

cloud = wc(
    # set a font; without one non-ASCII words render as boxes
    font_path="C:\\Windows\\Fonts\\Times New Roman.TTF",
    # font_path=path.join(d,'simsun.ttc'),
    # background color
    background_color='white',
    # word-cloud shape (mask image)
    mask=color_mask_hair_dryer,
    # maximum number of words
    max_words=2000,
    # maximum font size
    max_font_size=40
)

# Generate a simple word cloud directly from the segmented text (default settings)
wordcloud = wc().generate(cut_hair_dryer_text)
# wc = WordCloud(background_color="white", max_words=2000, mask=alice_mask, stopwords=stopwords)
# wordcloud = wd(width=1000, height=860, margin=2, font_path="simsun.ttf", background_color="white", max_font_size=180,
#                mask=myimg).fit_words(wordfrequency)  # generate the cloud from a word-frequency dict
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # hide the axes
plt.show()
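
# A minimal sketch (added, not in the original script): use the configured `cloud` object with a
# word-frequency dict, which is what the commented-out fit_words(...) line above hints at.
from collections import Counter

wordfrequency = Counter(cut_hair_dryer_text.split())
wordcloud_freq = cloud.fit_words(wordfrequency)
plt.imshow(wordcloud_freq, interpolation='bilinear')
plt.axis('off')
plt.show()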
