对获取的天猫商品-智能手机评价
进行数据分析
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import time
import pandas as pd
# ---------------------------------------------- load data ----------------------------------------------
# Read the review CSV; only the '评价内容' column is used in this section.
n = '../file/CSV/手机_评价.csv'
data = pd.read_csv(n)

# ---------------------------------------------- tokenise the review text ----------------------------------------------
import jieba

# Treat every review as text, then split each one into words with jieba.
title = data['评价内容'].astype(str)
title_s = [word for line in title for word in jieba.cut(line)]

# Load the stop-word list (one entry per line).
# `with` guarantees the file is closed, and a set gives O(1) membership
# tests instead of scanning a list once per token.
with open('../file/TXT/StopWords.txt', 'r', encoding='utf-8') as stop_file:
    stopwords = {line.strip() for line in stop_file}

# Drop the stop words, keeping the remaining tokens in their original order.
title_clean = [word for word in title_s if word not in stopwords]

# Put the cleaned tokens into a one-column data frame ...
df_allwords_clean_dist = pd.DataFrame({
    'allwords': title_clean
})

# ... and count how often each word occurs (most frequent first).
word_count = df_allwords_clean_dist.allwords.value_counts().reset_index()
word_count.columns = ['word', 'count']
#
#----------------------------------------------词云可视化----------------------------------------------
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import imageio as im
# Image handed to WordCloud as its mask.
mask_image = im.imread("../file/PNG/猫.PNG")

# {word: count} for the first 100 rows of word_count.
top_words = word_count.head(100)
frequencies = {word: count for word, count in top_words.values}

cloud = WordCloud(
    font_path="simhei.ttf",
    background_color="black",
    mask=mask_image,
    max_font_size=100,
    margin=1,
)
wc = cloud.fit_words(frequencies)

# Show the cloud on an 8x8 figure with the axes hidden.
plt.figure(figsize=(8, 8))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

# Write the rendered cloud to disk.
wc.to_file('../file/PNG/17评价词云图.PNG')
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pandas as pd
# Sales figures by date; the CSV has no header row, so the two columns are
# named here.
df = pd.read_csv('../file/CSV/data.csv', header=None, names=['date', 'num'])
# Parse the date strings and use them as the index so rows can be cut by date.
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date')
# TODO 1: filter the data by date
# Sort chronologically before truncating.
df = df.sort_values('date')
# Keep the rows dated 2020-03-01 or later.
# NOTE(review): only a lower bound is applied. If the data ever extends past
# March, add after='2020-03-31' so the selection matches the chart title.
sales_by_date = df.truncate('2020-03-01')['num'].to_dict()
# X labels: the YYYY-MM-DD prefix of each timestamp. Slicing from the left
# stays correct even if a timestamp carries fractional seconds, unlike
# chopping a fixed-length suffix off the right.
names = [str(day)[:10] for day in sales_by_date]
nums = list(sales_by_date.values())
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the Chinese labels
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
# Line chart of sales per date.
plt.figure(figsize=(10, 4))
plt.plot(names, nums)
plt.xticks(rotation=-90)  # rotate the X tick labels so they do not overlap
plt.grid(linestyle='-.')  # one call is enough; repeating it changes nothing
plt.ylabel('销售量')
plt.title('三月份手机销量图')
# Annotate every point with its value.
for a, b in zip(names, nums):
    # x, y, text, horizontally centred, anchored at the bottom, font size
    plt.text(a, b, b, ha='center', va='bottom', size=10)
plt.savefig('../file/PNG/19日期销售图.png')
plt.show()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
__title__ = ''
__author__ = 'jia666666'
__time__ = '2020/3/27'
"""
import pandas as pd
import matplotlib.pyplot as plt
# Read the review CSV; only the '存储容量' column is used in this section.
n = '../file/CSV/手机_评价.csv'
data = pd.read_csv(n)

# RAM size prepended when a value states only the storage size.
default_ram = {
    '128GB': '8',
    '64GB': '6',
    '32GB': '4',
    '512GB': '12',
    '256GB': '8',
}

# Count rows per normalised "RAM+storage" key, e.g. "8+128G".
total_data = {}
for item in data['存储容量']:
    # Keep only the text before the first full-width '('.
    item = str(item).split('(')[0]
    if item in default_ram:
        item = default_ram[item] + '+' + item
    k = item.split('+')
    if len(k) < 2:
        # No "RAM+storage" pair (e.g. a missing value read as "nan").
        # Indexing k[1] would raise IndexError, so the row is skipped.
        continue
    # Normalise the units: RAM loses "GB"/"G", storage loses the "B".
    ram = k[0].replace('GB', '').replace('G', '')
    storage = k[1].replace('B', '')
    item = ram + '+' + storage
    total_data[item] = total_data.get(item, 0) + 1
print(total_data.keys())

# Aggregate the per-variant counts by each component of the key.
# NOTE: the names are swapped relative to their contents: ROM_data is keyed
# by the first component (RAM, e.g. "8GB") and RAM_data by the second
# (storage, e.g. "128G"). The names are kept so existing readers still work.
ROM_data = {}
RAM_data = {}
for spec, count in total_data.items():
    ram, storage = spec.split('+')
    ram = ram.replace('G', '').replace('B', '') + 'GB'
    ROM_data[ram] = ROM_data.get(ram, 0) + count
    RAM_data[storage] = RAM_data.get(storage, 0) + count
#------------------------------------------------------------------------------
# 绘制柱状图
#------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import numpy as np
plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the Chinese labels
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
# One bar per "RAM+storage" key counted in total_data.
names = list(total_data.keys())
nums = list(total_data.values())
plt.figure(figsize=(10, 4))
plt.bar(names, nums)
plt.xticks(rotation=-90)  # rotate the X tick labels so they do not overlap
plt.grid(linestyle='-.')  # one call is enough; repeating it changes nothing
plt.ylabel('销售量')
plt.title('机身内存销量关系图')
# Annotate every bar with its value.
for a, b in zip(names, nums):
    # x, y, text, horizontally centred, anchored at the bottom, font size
    plt.text(a, b, b, ha='center', va='bottom', size=10)
plt.savefig('../file/PNG/13机身内存销量关系图.png')
plt.show()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
__title__ = ''
__author__ = 'jia666666'
__time__ = '2020/3/27'
"""
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
__title__ = ''
__author__ = 'jia666666'
__time__ = '2020/3/27'
"""
import pandas as pd
import matplotlib.pyplot as plt
# Read the review CSV; only the '存储容量' column is used in this section.
n = '../file/CSV/手机_评价.csv'
data = pd.read_csv(n)

# RAM size prepended when a value states only the storage size.
default_ram = {
    '128GB': '8',
    '64GB': '6',
    '32GB': '4',
    '512GB': '12',
    '256GB': '8',
}

# Count rows per normalised "RAM+storage" key, e.g. "8+128G".
total_data = {}
for item in data['存储容量']:
    # Keep only the text before the first full-width '('.
    item = str(item).split('(')[0]
    if item in default_ram:
        item = default_ram[item] + '+' + item
    k = item.split('+')
    if len(k) < 2:
        # No "RAM+storage" pair (e.g. a missing value read as "nan").
        # Indexing k[1] would raise IndexError, so the row is skipped.
        continue
    # Normalise the units: RAM loses "GB"/"G", storage loses the "B".
    ram = k[0].replace('GB', '').replace('G', '')
    storage = k[1].replace('B', '')
    item = ram + '+' + storage
    total_data[item] = total_data.get(item, 0) + 1
print(total_data)

# Aggregate the per-variant counts by each component of the key.
# NOTE: the names are swapped relative to their contents: ROM_data is keyed
# by the first component (RAM, e.g. "8GB") and RAM_data by the second
# (storage, e.g. "128G"). The names are kept because the plotting code that
# follows reads ROM_data.
ROM_data = {}
RAM_data = {}
for spec, count in total_data.items():
    ram, storage = spec.split('+')
    ram = ram.replace('G', '').replace('B', '') + 'GB'
    ROM_data[ram] = ROM_data.get(ram, 0) + count
    RAM_data[storage] = RAM_data.get(storage, 0) + count
import matplotlib.pyplot as plt
import numpy as np
# Font able to render the Chinese labels, and a correctly drawn minus sign.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Slice labels and sizes come straight from ROM_data (key -> count).
names = list(ROM_data)
nums = [ROM_data[label] for label in names]

# Pie chart with each slice's percentage printed to two decimals.
plt.figure(figsize=[10, 6])
plt.pie(nums, labels=names, autopct='%.2f%%')
plt.title("运行内存销量比重", fontproperties='SimHei', size=12)
plt.axis('equal')
plt.legend()

plt.savefig('../file/PNG/14运行内存比重.png')
plt.show()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pandas as pd
import matplotlib.pyplot as plt
# Read the review CSV; only the '存储容量' column is used in this section.
n = '../file/CSV/手机_评价.csv'
data = pd.read_csv(n)

# RAM size prepended when a value states only the storage size.
default_ram = {
    '128GB': '8',
    '64GB': '6',
    '32GB': '4',
    '512GB': '12',
    '256GB': '8',
}

# Count rows per normalised "RAM+storage" key, e.g. "8+128G".
total_data = {}
for item in data['存储容量']:
    # Keep only the text before the first full-width '('.
    item = str(item).split('(')[0]
    if item in default_ram:
        item = default_ram[item] + '+' + item
    k = item.split('+')
    if len(k) < 2:
        # No "RAM+storage" pair (e.g. a missing value read as "nan").
        # Indexing k[1] would raise IndexError, so the row is skipped.
        continue
    # Normalise the units: RAM loses "GB"/"G", storage loses the "B".
    ram = k[0].replace('GB', '').replace('G', '')
    storage = k[1].replace('B', '')
    item = ram + '+' + storage
    total_data[item] = total_data.get(item, 0) + 1
print(total_data)

# Aggregate the per-variant counts by each component of the key.
# NOTE: the names are swapped relative to their contents: ROM_data is keyed
# by the first component (RAM, e.g. "8GB") and RAM_data by the second
# (storage, e.g. "128G"). The names are kept because the plotting code that
# follows reads RAM_data.
ROM_data = {}
RAM_data = {}
for spec, count in total_data.items():
    ram, storage = spec.split('+')
    ram = ram.replace('G', '').replace('B', '') + 'GB'
    ROM_data[ram] = ROM_data.get(ram, 0) + count
    RAM_data[storage] = RAM_data.get(storage, 0) + count
#
import matplotlib.pyplot as plt
import numpy as np
plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the Chinese labels
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
# One bar per key of RAM_data (key -> count).
names = list(RAM_data.keys())
nums = list(RAM_data.values())
plt.figure(figsize=(10, 4))
plt.bar(names, nums)
plt.xticks(rotation=-90)  # rotate the X tick labels so they do not overlap
plt.grid(linestyle='-.')  # one call is enough; repeating it changes nothing
plt.ylabel('销售量')
plt.title('存储内存销量关系图')
# Annotate every bar with its value.
for a, b in zip(names, nums):
    # x, y, text, horizontally centred, anchored at the bottom, font size
    plt.text(a, b, b, ha='center', va='bottom', size=10)
plt.savefig('../file/PNG/15机身内存销量关系图.png')
plt.show()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pandas as pd
import matplotlib.pyplot as plt
# Read the review CSV; only the '存储容量' column is used in this section.
n = '../file/CSV/手机_评价.csv'
data = pd.read_csv(n)

# RAM size prepended when a value states only the storage size.
default_ram = {
    '128GB': '8',
    '64GB': '6',
    '32GB': '4',
    '512GB': '12',
    '256GB': '8',
}

# Count rows per normalised "RAM+storage" key, e.g. "8+128G".
total_data = {}
for item in data['存储容量']:
    # Keep only the text before the first full-width '('.
    item = str(item).split('(')[0]
    if item in default_ram:
        item = default_ram[item] + '+' + item
    k = item.split('+')
    if len(k) < 2:
        # No "RAM+storage" pair (e.g. a missing value read as "nan").
        # Indexing k[1] would raise IndexError, so the row is skipped.
        continue
    # Normalise the units: RAM loses "GB"/"G", storage loses the "B".
    ram = k[0].replace('GB', '').replace('G', '')
    storage = k[1].replace('B', '')
    item = ram + '+' + storage
    total_data[item] = total_data.get(item, 0) + 1

# Aggregate the per-variant counts by each component of the key.
# NOTE: the names are swapped relative to their contents: ROM_data is keyed
# by the first component (RAM, e.g. "8GB") and RAM_data by the second
# (storage, e.g. "128G"). The names are kept because the plotting code that
# follows reads both dictionaries.
ROM_data = {}
RAM_data = {}
for spec, count in total_data.items():
    ram, storage = spec.split('+')
    ram = ram.replace('G', '').replace('B', '') + 'GB'
    ROM_data[ram] = ROM_data.get(ram, 0) + count
    RAM_data[storage] = RAM_data.get(storage, 0) + count
#
#
import matplotlib.pyplot as plt
import numpy as np
# One figure with three side-by-side panels (1 row x 3 columns).
plt.figure(figsize=[20, 16])
plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the Chinese labels
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly

# plt.subplot() makes the new axes current, so the pyplot calls after it
# draw into that panel without an explicit plt.sca().

# Right panel: pie chart of ROM_data (key -> count).
names = list(ROM_data.keys())
nums = list(ROM_data.values())
plt.subplot(133)
plt.pie(nums, labels=names, autopct='%.2f%%')
plt.title("运行内存饼图", fontproperties='SimHei', size=12)
plt.axis('equal')
plt.legend()

# Middle panel: bar chart of RAM_data (key -> count).
plt.subplot(132)
names = list(RAM_data.keys())
nums = list(RAM_data.values())
plt.bar(names, nums)
plt.xticks(rotation=-90)  # rotate the X tick labels so they do not overlap
plt.grid(linestyle='-.')
plt.ylabel('销售量')
plt.title('存储内存销量图')
# Annotate every bar with its value.
for a, b in zip(names, nums):
    # x, y, text, horizontally centred, anchored at the bottom, font size
    plt.text(a, b, b, ha='center', va='bottom', size=10)

# Left panel: line chart of total_data ("RAM+storage" key -> count).
plt.subplot(131)
names = list(total_data.keys())
nums = list(total_data.values())
plt.plot(names, nums)
plt.xticks(rotation=-90)  # rotate the X tick labels so they do not overlap
plt.grid(linestyle='-.')
plt.ylabel('销售量')
plt.title('机身内存销量关系图')
# Annotate every point with its value.
for a, b in zip(names, nums):
    # x, y, text, horizontally centred, anchored at the bottom, font size
    plt.text(a, b, b, ha='center', va='bottom', size=10)

plt.savefig('../file/PNG/16存储容量影响关系图.png')
plt.show()