咳咳~不要怀疑,这是一个正经的可视化项目,而且附带一点科普
数据来源
数据来自爬虫获取,淘宝约50个文胸商品的20W条评论数据~
数据源来自chenjiandongx/cup-size
前言
对于很多只知道A/B/C的绅士们,我们在看数据之前可能先得了解点知识~
首先我们得先了解两个概念——上胸围 & 下胸围,具体看示意图:
通过上胸围与下胸围的差值,我们就可以确定罩杯的大小了,具体的对应关系可参考下图:
有了下胸围 & 罩杯就能确定文胸对应的尺码了~
当然这又有分为英式尺码和国际尺码,具体参考下图:
好了,接下来就可以开始我们的可视化了~
依赖模块
from pyecharts.charts import *
from pyecharts import options as opts
from pyecharts.commons.utils import JsCode
from collections import Counter
import re
import pandas as pd
import jieba
import jieba.posseg as psg
from stylecloud import gen_stylecloud
from IPython.display import Image
数据处理
原始数据是txt格式,为了方便处理,这边转为DataFrame~
尺码部分通过正则表达式提取出对应的下胸围和罩杯,具体代码如下:
# Load the raw review dump and turn it into a DataFrame.
#
# Each raw line is parsed into four named fields that become DataFrame
# columns: datetime, color, size, comment.
# NOTE(review): the order "datetime first, comment last" is inferred from the
# column names the later cells use -- confirm against the raw file.
patterns = re.compile(
    r'(?P<datetime>.*),颜色分类:(?P<color>.*?);尺码:(?P<size>.*?),(?P<comment>.*)'
)

obj_list = []
skipped_lines = 0  # lines that do not follow the expected layout
# Explicit encoding so the Chinese field markers decode the same way on any
# locale. NOTE(review): assumes the dump is UTF-8 -- confirm.
with open('/home/kesci/input/cup6439/cup_all.txt', 'r', encoding='utf-8') as f:
    for item in f:
        obj = patterns.search(item)
        if obj is None:
            # Blank or malformed line: skip it instead of failing on
            # `None.groupdict()`.
            skipped_lines += 1
            continue
        obj_list.append(obj.groupdict())

data = pd.DataFrame(obj_list)

# Split the size label (e.g. "75B") into the under-bust circumference
# (70-95, in steps of 5) and the cup letter; both are appended as new columns.
size_parts = data['size'].str.extract(
    r'(?P<circumference>[7-9][05]).*(?P<cup>[a-zA-Z])',
    expand=True,
)
data = pd.concat([data, size_parts], axis=1)
data.head()
商品类别
我们通过jieba分词来看看商品分类中最常出现的是哪些关键词~
- 代码:
# Tokenise every colour/category label with jieba's POS tagger and keep the
# words that are longer than one character and flagged 'n' or 'nr'.
w_all = []
for label in data.color:
    w_all.extend(
        word
        for word, flag in psg.cut(label)
        if len(word) > 1 and flag in ('n', 'nr')
    )

# The 50 most frequent keywords, most common first.
c = Counter(w_all)
counter = c.most_common(50)

# Feed the chart in reverse (least common first).
ranked = counter[::-1]
keywords = [word for word, _ in ranked]
counts = [count for _, count in ranked]

bar_title = opts.TitleOpts(
    title="出现最多的关键词",
    pos_left="center",
    title_textstyle_opts=opts.TextStyleOpts(font_size=20),
)
# Vertical zoom slider that initially shows the 70%-100% slice of the axis.
bar_zoom = opts.DataZoomOpts(range_start=70, range_end=100, orient='vertical')
# Hidden visual map that colours the bars by value.
bar_colours = opts.VisualMapOpts(
    is_show=False,
    max_=6e4,
    min_=3000,
    dimension=0,
    range_color=['#f5d69f', '#f5898b', '#ef5055'],
)
bar_yaxis = opts.AxisOpts(
    axistick_opts=opts.AxisTickOpts(is_show=False),
    axisline_opts=opts.AxisLineOpts(is_show=False),
)
# Rounded bar corners plus a soft drop shadow.
bar_item_style = {
    "normal": {
        "barBorderRadius": [30, 30, 30, 30],
        'shadowBlur': 10,
        'shadowColor': 'rgba(120, 36, 50, 0.5)',
        'shadowOffsetY': 5,
    }
}
bar_labels = opts.LabelOpts(is_show=True, position='right', font_style='italic')

bar = Bar(
    init_opts=opts.InitOpts(theme='purple-passion', width='1000px', height='800px')
)
bar.add_xaxis(keywords)
bar.add_yaxis('出现次数', counts, category_gap='30%')
bar.set_global_opts(
    title_opts=bar_title,
    datazoom_opts=bar_zoom,
    visualmap_opts=bar_colours,
    legend_opts=opts.LegendOpts(is_show=False),
    xaxis_opts=opts.AxisOpts(is_show=False,),
    yaxis_opts=bar_yaxis,
)
bar.set_series_opts(label_opts=bar_labels, itemstyle_opts=bar_item_style)
# Swap the axes so the bars are drawn horizontally.
bar.reversal_axis()
bar.render_notebook()
- 颜色:肤色 > 黑色 > 粉色 > 白色;
- 薄款 > 厚款;
- 钢圈似乎是个比较重要的卖点;
尺码分布
- 代码:
# Number of reviews per (under-bust circumference, cup) pair; 'datetime' is
# only used as the column being counted.
t_data = data.groupby(['circumference', 'cup'])['datetime'].count().reset_index()
t_data.columns = ['circumference', 'cup', 'num']
# To plot percentages instead of raw counts:
# t_data.num = round(t_data.num.div(t_data.num.sum(axis=0), axis=0) * 100, 1)

# Inner ring of the sunburst: one node per cup letter. Only A, B and C show
# their label; D and E stay unlabelled.
ring_by_cup = {
    cup: {"name": cup, "label": {"show": cup in ('A', 'B', 'C')}, "children": []}
    for cup in ('A', 'B', 'C', 'D', 'E')
}
data_pair = list(ring_by_cup.values())

# Outer ring: one leaf per concrete size (e.g. "75-B"), attached to its cup.
for _, row in t_data.iterrows():
    ring = ring_by_cup.get(row.cup)
    if ring is None:
        # Cups other than upper-case A-E are left out of the chart.
        continue
    ring['children'].append({
        "name": '{}-{}'.format(row.circumference, row.cup),
        "value": row.num,
        # Only sizes with more than 3000 reviews get a visible label.
        "label": {"show": bool(row.num > 3000)},
    })

c = (
    Sunburst(
        init_opts=opts.InitOpts(
            theme='purple-passion',
            width="1000px",
            height="1000px"))
    .add(
        "",
        data_pair=data_pair,
        highlight_policy="ancestor",
        radius=[0, "100%"],
        sort_='null',
        levels=[
            {},
            # Level 1: cup ring.
            {
                "r0": "20%",
                "r": "48%",
                "itemStyle": {"borderColor": 'rgb(220,220,220)', "borderWidth": 2},
            },
            # Level 2: concrete-size ring.
            {
                "r0": "50%",
                "r": "80%",
                "label": {"align": "right"},
                "itemStyle": {"borderColor": 'rgb(220,220,220)', "borderWidth": 1},
            },
        ],
    )
    .set_global_opts(
        visualmap_opts=opts.VisualMapOpts(
            is_show=False, max_=90000, min_=3000,
            range_color=['#f5d69f', '#f5898b', '#ef5055']),
        # The title sits in the empty centre of the sunburst.
        title_opts=opts.TitleOpts(
            title="文 胸\n\n尺 码 分 布",
            pos_left="center",
            pos_top="center",
            title_textstyle_opts=opts.TextStyleOpts(font_style='oblique', font_size=30),))
    .set_series_opts(label_opts=opts.LabelOpts(font_size=18, formatter="{b}: {c}"))
)
c.render_notebook()
- 单看罩杯的话:B > A > C
- 细分到具体尺码:75B > 80B > 75A > 70A
罩杯分布
我们通过不同的胸围来看看罩杯的比例:
- 代码:
# One donut chart per under-bust circumference, laid out as a 3x2 grid.
# The loop variable is named `girth` rather than `c` so it does not overwrite
# a chart object that is also called `c`.
grid = Grid(init_opts=opts.InitOpts(theme='purple-passion', width='1000px', height='1000px'))
for idx, girth in enumerate(['70', '75', '80', '85', '90', '95']):
    # Two charts per row: even indices go in the left column (30%), odd ones
    # in the right column (70%); each new row sits 30% lower, starting at 20%.
    row_no, col_no = divmod(idx, 2)
    x = 30 if col_no == 0 else 70
    y = row_no * 30 + 20
    pos_x = '{}%'.format(x)
    pos_y = '{}%'.format(y)

    # [cup, count] pairs for this circumference.
    subset = t_data[t_data.circumference == girth]
    pie = Pie(init_opts=opts.InitOpts())
    pie.add(
        girth,
        [[row.cup, row.num] for _, row in subset.iterrows()],
        center=[pos_x, pos_y],
        radius=[70, 100],
        label_opts=opts.LabelOpts(formatter='{b}:{d}%'),
    )
    pie.set_global_opts(
        # The title is nudged up and left of the donut centre so it sits
        # inside the ring.
        title_opts=opts.TitleOpts(
            title="下胸围={}".format(girth),
            pos_top='{}%'.format(y - 1),
            pos_left='{}%'.format(x - 4),
            title_textstyle_opts=opts.TextStyleOpts(font_size=15)),
        legend_opts=opts.LegendOpts(is_show=True))
    grid.add(pie, grid_opts=opts.GridOpts(pos_left='20%'))
grid.render_notebook()
- 下胸围=70:A > B > C
- 下胸围=75:B > A > C
- 下胸围=80:B > A > C
- 下胸围=85:B > C > A
- 下胸围=90:C > B > A
- 下胸围=95:C > B > D
评论词云
最后我们来看看评论中经常说到的是什么词语吧~
- 代码:
# Segment every review comment with jieba and flatten the tokens into one list.
w_all = [token for text in data.comment for token in jieba.lcut(text)]
c = Counter(w_all)

# stylecloud receives a single space-separated string of tokens.
comment_text = ' '.join(w_all)
cloud_kwargs = dict(
    size=1000,
    font_path='/home/kesci/work/font/simhei.ttf',
    icon_name='fas fa-heartbeat',
    output_name='comment.png',
    custom_stopwords=['没有', '用户', '填写', '评论'],
    # Optional settings left disabled:
    # max_words=1000,
    # palette='palettable.tableau.TableauMedium_10',
)
gen_stylecloud(comment_text, **cloud_kwargs)

# Display the image that was just written.
Image(filename='comment.png')
文章内只能上传没有交互效果的图片,更好的阅读体验欢迎访问我的KLab ——【Pyecharts】20W条淘宝文胸商品评论数据可视化~
欢迎点赞支持~