python爬虫天猫商品数据及分析(5)

目的

对获取的天猫商品(智能手机)评价数据进行数据分析。

实现


一 评价词云
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Render a word cloud from the review text in the phone review CSV.

Steps: read the reviews, segment each one with jieba, drop stop words,
count how often each remaining token occurs, and draw the 100 most
frequent tokens as a word cloud shaped by a mask image.
"""

import time  # not used below; kept from the original script

import pandas as pd

REVIEW_CSV = '../file/CSV/手机_评价.csv'
STOPWORDS_TXT = '../file/TXT/StopWords.txt'
MASK_IMAGE = "../file/PNG/猫.PNG"
OUTPUT_IMAGE = '../file/PNG/17评价词云图.PNG'


def load_stopwords(path):
    """Return the stop words listed one per line in ``path`` as a set.

    The file is read as UTF-8 and closed as soon as it has been consumed.
    A set is returned so that the per-token membership test is O(1).
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return {line.strip() for line in handle}


def segment(texts):
    """Return the jieba tokens of every string in ``texts`` as one flat list."""
    # jieba is imported here so the pure helpers in this file can be used
    # without the segmenter being installed.
    import jieba

    return [token for text in texts for token in jieba.cut(text)]


def remove_stopwords(tokens, stopwords):
    """Return ``tokens`` in order, without those contained in ``stopwords``."""
    return [token for token in tokens if token not in stopwords]


def count_words(tokens):
    """Return a DataFrame with columns ``word`` and ``count``.

    Rows are ordered by descending count, as produced by ``value_counts``.
    """
    frame = pd.DataFrame({'allwords': tokens})
    word_count = frame.allwords.value_counts().reset_index()
    # Column names produced by reset_index differ between pandas versions,
    # so they are set explicitly.
    word_count.columns = ['word', 'count']
    return word_count


def top_frequencies(word_count, limit=100):
    """Return ``{word: count}`` for the first ``limit`` rows of ``word_count``."""
    return {row[0]: row[1] for row in word_count.head(limit).values}


def main():
    """Read the inputs, build the word cloud, show it and save it to disk."""
    from wordcloud import WordCloud
    import matplotlib.pyplot as plt
    import imageio as im

    data = pd.read_csv(REVIEW_CSV)
    # Cast to str so that non-string cells can still be segmented.
    reviews = data['评价内容'].astype(str)

    stopwords = load_stopwords(STOPWORDS_TXT)
    word_count = count_words(remove_stopwords(segment(reviews), stopwords))

    plt.figure(figsize=(8, 8))

    # The image is passed to WordCloud as its mask.
    pic = im.imread(MASK_IMAGE)
    w_c = WordCloud(font_path="simhei.ttf", background_color="black",
                    mask=pic, max_font_size=100, margin=1)
    wc = w_c.fit_words(top_frequencies(word_count))

    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")  # hide the axis ticks
    plt.show()

    wc.to_file(OUTPUT_IMAGE)


if __name__ == '__main__':
    main()


python爬虫天猫商品数据及分析(5)_第1张图片

二 日期销售
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Plot the daily sales numbers from 2020-03-01 onwards as a line chart."""

import pandas as pd

SALES_CSV = '../file/CSV/data.csv'
START_DATE = '2020-03-01'
OUTPUT_IMAGE = '../file/PNG/19日期销售图.png'


def load_sales(path):
    """Read the header-less two-column CSV at ``path``.

    The columns are named ``date`` and ``num``; ``date`` is parsed to
    datetimes and becomes the index, which is returned sorted ascending
    (``truncate`` requires a sorted index).
    """
    df = pd.read_csv(path, header=None, names=['date', 'num'])
    df['date'] = pd.to_datetime(df['date'])
    return df.set_index('date').sort_index()


def sales_from(df, start):
    """Return ``(names, nums)`` for the rows of ``df`` dated ``start`` or later.

    ``names`` are the index timestamps formatted as ``YYYY-MM-DD`` and
    ``nums`` the matching ``num`` values, both in index order.
    """
    selected = df.truncate(before=start)['num']
    names = [stamp.strftime('%Y-%m-%d') for stamp in selected.index]
    nums = selected.tolist()
    return names, nums


def main():
    """Load the sales data, draw the chart, save it and show it."""
    import matplotlib.pyplot as plt

    # NOTE(review): only a lower date bound is applied although the title
    # says "March" -- confirm the CSV holds no rows after 2020-03-31.
    names, nums = sales_from(load_sales(SALES_CSV), START_DATE)

    plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the Chinese labels
    plt.rcParams['axes.unicode_minus'] = False    # render the minus sign correctly

    plt.figure(figsize=(10, 4))
    plt.plot(names, nums)
    plt.xticks(rotation=-90)  # rotate the x labels so they do not overlap
    plt.grid(linestyle='-.')
    plt.ylabel('销售量')
    plt.title('三月份手机销量图')

    # Write each value just above its data point.
    for a, b in zip(names, nums):
        plt.text(a, b, b, ha='center', va='bottom', size=10)

    plt.savefig(OUTPUT_IMAGE)
    plt.show()


if __name__ == '__main__':
    main()

python爬虫天猫商品数据及分析(5)_第2张图片
类似的图表还有:
python爬虫天猫商品数据及分析(5)_第3张图片

三 机身内存
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Bar chart of how many review rows exist per "RAM+storage" configuration.

__title__ = ''
__author__ = 'jia666666'
__time__ = '2020/3/27'
"""
from collections import Counter

import numpy as np  # not used below; kept from the original script
import pandas as pd

REVIEW_CSV = '../file/CSV/手机_评价.csv'
OUTPUT_IMAGE = '../file/PNG/13机身内存销量关系图.png'

# RAM figure prefixed to a capacity text that is exactly one of these
# storage sizes (mapping carried over from the original script).
ASSUMED_RAM = {'128GB': '8', '64GB': '6', '32GB': '4', '512GB': '12', '256GB': '8'}


def normalize_capacity(raw):
    """Normalise one raw capacity value to ``'<ram>+<storage>G'``.

    Everything from the first ``(`` on is dropped, a bare storage size
    listed in ``ASSUMED_RAM`` gets its RAM prefix, and the unit suffixes
    are unified, e.g. ``'8GB+128GB(...)'`` -> ``'8+128G'``.

    Returns None when the value has no ``+`` separated pair (for example
    a missing value read as ``nan``); the original code raised IndexError
    in that case.
    """
    item = str(raw).split('(')[0]
    if item in ASSUMED_RAM:
        item = ASSUMED_RAM[item] + '+' + item
    parts = item.split('+')
    if len(parts) < 2:
        return None
    ram = parts[0].replace('GB', '').replace('G', '')
    storage = parts[1].replace('B', '')
    return ram + '+' + storage


def count_capacities(values):
    """Return ``{configuration: occurrences}`` in first-seen order.

    Values that ``normalize_capacity`` cannot parse are skipped.
    """
    keys = (normalize_capacity(value) for value in values)
    return dict(Counter(key for key in keys if key is not None))


def main():
    """Count the configurations in the review CSV and draw the bar chart."""
    import matplotlib.pyplot as plt

    data = pd.read_csv(REVIEW_CSV)
    total_data = count_capacities(data['存储容量'])
    print(total_data.keys())

    plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the Chinese labels
    plt.rcParams['axes.unicode_minus'] = False    # render the minus sign correctly

    names = list(total_data.keys())
    nums = list(total_data.values())

    plt.figure(figsize=(10, 4))
    plt.bar(names, nums)
    plt.xticks(rotation=-90)  # rotate the x labels so they do not overlap
    plt.grid(linestyle='-.')
    plt.ylabel('销售量')
    plt.title('机身内存销量关系图')

    # Write each count just above its bar.
    for a, b in zip(names, nums):
        plt.text(a, b, b, ha='center', va='bottom', size=10)

    plt.savefig(OUTPUT_IMAGE)
    plt.show()


if __name__ == '__main__':
    main()


python爬虫天猫商品数据及分析(5)_第4张图片

四 运行内存
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Pie chart of the share of review rows per RAM size.

__title__ = ''
__author__ = 'jia666666'
__time__ = '2020/3/27'
"""
from collections import Counter

import numpy as np  # not used below; kept from the original script
import pandas as pd

REVIEW_CSV = '../file/CSV/手机_评价.csv'
OUTPUT_IMAGE = '../file/PNG/14运行内存比重.png'

# RAM figure prefixed to a capacity text that is exactly one of these
# storage sizes (mapping carried over from the original script).
ASSUMED_RAM = {'128GB': '8', '64GB': '6', '32GB': '4', '512GB': '12', '256GB': '8'}


def normalize_capacity(raw):
    """Normalise one raw capacity value to ``'<ram>+<storage>G'``.

    Everything from the first ``(`` on is dropped, a bare storage size
    listed in ``ASSUMED_RAM`` gets its RAM prefix, and the unit suffixes
    are unified, e.g. ``'8GB+128GB(...)'`` -> ``'8+128G'``.

    Returns None when the value has no ``+`` separated pair (for example
    a missing value read as ``nan``); the original code raised IndexError
    in that case.
    """
    item = str(raw).split('(')[0]
    if item in ASSUMED_RAM:
        item = ASSUMED_RAM[item] + '+' + item
    parts = item.split('+')
    if len(parts) < 2:
        return None
    ram = parts[0].replace('GB', '').replace('G', '')
    storage = parts[1].replace('B', '')
    return ram + '+' + storage


def count_capacities(values):
    """Return ``{configuration: occurrences}`` in first-seen order.

    Values that ``normalize_capacity`` cannot parse are skipped.
    """
    keys = (normalize_capacity(value) for value in values)
    return dict(Counter(key for key in keys if key is not None))


def ram_totals(total_data):
    """Sum the configuration counts per RAM size.

    ``total_data`` maps ``'<ram>+<storage>G'`` keys to counts; the result
    maps ``'<ram>GB'`` to the summed count, in first-seen order.
    """
    totals = Counter()
    for combo, count in total_data.items():
        ram = combo.split('+')[0].replace('G', '').replace('B', '') + 'GB'
        totals[ram] += count
    return dict(totals)


def main():
    """Count RAM sizes in the review CSV and draw the pie chart."""
    import matplotlib.pyplot as plt

    data = pd.read_csv(REVIEW_CSV)
    total_data = count_capacities(data['存储容量'])
    print(total_data)

    ram_counts = ram_totals(total_data)

    plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the Chinese labels
    plt.rcParams['axes.unicode_minus'] = False    # render the minus sign correctly

    names = list(ram_counts.keys())
    nums = list(ram_counts.values())

    plt.figure(figsize=[10, 6])
    plt.pie(nums, labels=names, autopct='%.2f%%')
    plt.title("运行内存销量比重", fontproperties='SimHei', size=12)
    plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
    plt.legend()

    plt.savefig(OUTPUT_IMAGE)
    plt.show()


if __name__ == '__main__':
    main()

python爬虫天猫商品数据及分析(5)_第5张图片

五 存储内存
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Bar chart of how many review rows exist per storage size."""
from collections import Counter

import numpy as np  # not used below; kept from the original script
import pandas as pd

REVIEW_CSV = '../file/CSV/手机_评价.csv'
OUTPUT_IMAGE = '../file/PNG/15机身内存销量关系图.png'

# RAM figure prefixed to a capacity text that is exactly one of these
# storage sizes (mapping carried over from the original script).
ASSUMED_RAM = {'128GB': '8', '64GB': '6', '32GB': '4', '512GB': '12', '256GB': '8'}


def normalize_capacity(raw):
    """Normalise one raw capacity value to ``'<ram>+<storage>G'``.

    Everything from the first ``(`` on is dropped, a bare storage size
    listed in ``ASSUMED_RAM`` gets its RAM prefix, and the unit suffixes
    are unified, e.g. ``'8GB+128GB(...)'`` -> ``'8+128G'``.

    Returns None when the value has no ``+`` separated pair (for example
    a missing value read as ``nan``); the original code raised IndexError
    in that case.
    """
    item = str(raw).split('(')[0]
    if item in ASSUMED_RAM:
        item = ASSUMED_RAM[item] + '+' + item
    parts = item.split('+')
    if len(parts) < 2:
        return None
    ram = parts[0].replace('GB', '').replace('G', '')
    storage = parts[1].replace('B', '')
    return ram + '+' + storage


def count_capacities(values):
    """Return ``{configuration: occurrences}`` in first-seen order.

    Values that ``normalize_capacity`` cannot parse are skipped.
    """
    keys = (normalize_capacity(value) for value in values)
    return dict(Counter(key for key in keys if key is not None))


def storage_totals(total_data):
    """Sum the configuration counts per storage size.

    ``total_data`` maps ``'<ram>+<storage>G'`` keys to counts; the result
    maps the ``'<storage>G'`` part to the summed count, in first-seen order.
    """
    totals = Counter()
    for combo, count in total_data.items():
        totals[combo.split('+')[1]] += count
    return dict(totals)


def main():
    """Count storage sizes in the review CSV and draw the bar chart."""
    import matplotlib.pyplot as plt

    data = pd.read_csv(REVIEW_CSV)
    total_data = count_capacities(data['存储容量'])
    print(total_data)

    storage_counts = storage_totals(total_data)

    plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the Chinese labels
    plt.rcParams['axes.unicode_minus'] = False    # render the minus sign correctly

    names = list(storage_counts.keys())
    nums = list(storage_counts.values())

    plt.figure(figsize=(10, 4))
    plt.bar(names, nums)
    plt.xticks(rotation=-90)  # rotate the x labels so they do not overlap
    plt.grid(linestyle='-.')
    plt.ylabel('销售量')
    plt.title('存储内存销量关系图')

    # Write each count just above its bar.
    for a, b in zip(names, nums):
        plt.text(a, b, b, ha='center', va='bottom', size=10)

    plt.savefig(OUTPUT_IMAGE)
    plt.show()


if __name__ == '__main__':
    main()

python爬虫天猫商品数据及分析(5)_第6张图片

六 存储容量影响
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Three-panel figure of review-row counts by configuration, storage and RAM.

Left: line chart per "RAM+storage" configuration.
Middle: bar chart per storage size.
Right: pie chart per RAM size.
"""
from collections import Counter

import numpy as np  # not used below; kept from the original script
import pandas as pd

REVIEW_CSV = '../file/CSV/手机_评价.csv'
OUTPUT_IMAGE = '../file/PNG/16存储容量影响关系图.png'

# RAM figure prefixed to a capacity text that is exactly one of these
# storage sizes (mapping carried over from the original script).
ASSUMED_RAM = {'128GB': '8', '64GB': '6', '32GB': '4', '512GB': '12', '256GB': '8'}


def normalize_capacity(raw):
    """Normalise one raw capacity value to ``'<ram>+<storage>G'``.

    Everything from the first ``(`` on is dropped, a bare storage size
    listed in ``ASSUMED_RAM`` gets its RAM prefix, and the unit suffixes
    are unified, e.g. ``'8GB+128GB(...)'`` -> ``'8+128G'``.

    Returns None when the value has no ``+`` separated pair (for example
    a missing value read as ``nan``); the original code raised IndexError
    in that case.
    """
    item = str(raw).split('(')[0]
    if item in ASSUMED_RAM:
        item = ASSUMED_RAM[item] + '+' + item
    parts = item.split('+')
    if len(parts) < 2:
        return None
    ram = parts[0].replace('GB', '').replace('G', '')
    storage = parts[1].replace('B', '')
    return ram + '+' + storage


def count_capacities(values):
    """Return ``{configuration: occurrences}`` in first-seen order.

    Values that ``normalize_capacity`` cannot parse are skipped.
    """
    keys = (normalize_capacity(value) for value in values)
    return dict(Counter(key for key in keys if key is not None))


def ram_totals(total_data):
    """Sum the configuration counts per RAM size (keys like ``'8GB'``)."""
    totals = Counter()
    for combo, count in total_data.items():
        ram = combo.split('+')[0].replace('G', '').replace('B', '') + 'GB'
        totals[ram] += count
    return dict(totals)


def storage_totals(total_data):
    """Sum the configuration counts per storage size (keys like ``'128G'``)."""
    totals = Counter()
    for combo, count in total_data.items():
        totals[combo.split('+')[1]] += count
    return dict(totals)


def main():
    """Count the capacities in the review CSV and draw the three panels."""
    import matplotlib.pyplot as plt

    data = pd.read_csv(REVIEW_CSV)
    total_data = count_capacities(data['存储容量'])
    ram_counts = ram_totals(total_data)
    storage_counts = storage_totals(total_data)

    plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the Chinese labels
    plt.rcParams['axes.unicode_minus'] = False    # render the minus sign correctly
    plt.figure(figsize=[20, 16])

    # Left panel: line chart per RAM+storage configuration.
    plt.subplot(131)
    names = list(total_data.keys())
    nums = list(total_data.values())
    plt.plot(names, nums)
    plt.xticks(rotation=-90)  # rotate the x labels so they do not overlap
    plt.grid(linestyle='-.')
    plt.ylabel('销售量')
    plt.title('机身内存销量关系图')
    for a, b in zip(names, nums):
        plt.text(a, b, b, ha='center', va='bottom', size=10)

    # Middle panel: bar chart per storage size.
    plt.subplot(132)
    names = list(storage_counts.keys())
    nums = list(storage_counts.values())
    plt.bar(names, nums)
    plt.xticks(rotation=-90)
    plt.grid(linestyle='-.')
    plt.ylabel('销售量')
    plt.title('存储内存销量图')
    for a, b in zip(names, nums):
        plt.text(a, b, b, ha='center', va='bottom', size=10)

    # Right panel: pie chart per RAM size.
    plt.subplot(133)
    names = list(ram_counts.keys())
    nums = list(ram_counts.values())
    plt.pie(nums, labels=names, autopct='%.2f%%')
    plt.title("运行内存饼图", fontproperties='SimHei', size=12)
    plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
    plt.legend()

    plt.savefig(OUTPUT_IMAGE)
    plt.show()


if __name__ == '__main__':
    main()

python爬虫天猫商品数据及分析(5)_第7张图片

你可能感兴趣的:(python爬虫实战,python爬虫,天猫商品,数据)