Python 爬取天猫 iPhone8plus 销售数据

流程:

一、爬取数据,保存到 MySQL 数据库。

二、读取数据,分析三种颜色的占比。

 

1,爬取数据保存到mysql数据库:

# -*- coding: utf-8 -*-
"""
Created on Mon Mar  4 11:09:45 2019

@author: Lenovo
"""""
import json
import re
import time as t
import urllib
import urllib.error
import urllib.request

import mysql.connector

# Request header attached to every opener built in this script.
# ``OpenerDirector.addheaders`` expects (name, value) pairs, so the
# User-Agent string is paired with its header name instead of being stored
# as a bare string.
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]  # attach the header
urllib.request.install_opener(opener)  # make this opener the process-wide default


def use_proxy_1(url, proxy_add):
    """Fetch *url* through the proxy *proxy_add* and return the body as text.

    Args:
        url: Address to request.
        proxy_add: Proxy address in ``host:port`` form.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.

    Side effect: the proxy-enabled opener is installed as the global default,
    replacing whichever opener was installed before.
    """
    # Register the proxy for both schemes; with only 'http' registered an
    # https:// URL would bypass the proxy entirely.
    proxy = urllib.request.ProxyHandler({'http': proxy_add, 'https': proxy_add})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    # A freshly built opener carries urllib's default User-Agent, so the
    # browser-like header has to be attached again here.
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)
    # The context manager closes the response even if read/decode raises.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')

# Review-list endpoint; the page number is appended for each request.
RATE_URL = 'https://rate.tmall.com/list_detail_rate.htm?itemId=558760911386&spuId=877095771&sellerId=2616970884&order=3&currentPage='
# Proxy server used for every page request.
PROXY_ADDRESS = "182.44.224.198:9999"
# Captures the text between the first "(" and the last ")" of the response,
# i.e. the argument of a ``callback( ... )`` style wrapper.
JSONP_BODY = re.compile(r'^[^(]*?\((.*)\)[^)]*$')
# Placeholders are filled in by the driver, so quotes inside review text can
# neither break the statement nor inject SQL.
INSERT_SQL = "replace into iphone(name,content,time,color,rom,net) values (%s,%s,%s,%s,%s,%s)"


def _extract_rate_list(raw_text):
    """Return the ``['rateDetail']['rateList']`` value of a wrapped response.

    Raises:
        ValueError: If the text has no ``( ... )`` wrapper or the wrapped
            text is not valid JSON.
        KeyError, TypeError: If the decoded JSON lacks the expected keys.
    """
    matched = JSONP_BODY.search(raw_text)
    if matched is None:
        raise ValueError('response has no "( ... )" wrapper')
    return json.loads(matched.group(1))['rateDetail']['rateList']


def _split_sku(sku_text):
    """Split an ``auctionSku`` string and return ``(color, rom, net)``.

    The string is split on ":" and ";" and fields 3, 7 and 1 are taken.
    NOTE(review): this presumably relies on a "label:value;label:value;..."
    layout with at least four pairs -- confirm against live data.

    Raises:
        IndexError: If the string has fewer than eight fields.
    """
    fields = re.split('[:;]', sku_text)
    return fields[3], fields[7], fields[1]


# NOTE(review): credentials are hard-coded here; consider loading them from
# configuration instead.
conn = mysql.connector.connect(host='localhost', port=3306, user='root', passwd='password', db='tianmao', charset='utf8mb4')
try:
    cur = conn.cursor()
    print('连接成功!!!!')
    for currentPage in range(25, 100):
        commt_url = RATE_URL + str(currentPage)
        try:
            # Download the review page for this page number.
            commt_data = use_proxy_1(commt_url, PROXY_ADDRESS)
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
            continue

        try:
            rate_list = _extract_rate_list(commt_data)
        except (ValueError, KeyError, TypeError) as e:
            # Skip a page whose body is not the expected wrapped JSON rather
            # than aborting the whole crawl.
            print('第' + str(currentPage) + '页响应无法解析,已跳过: ' + str(e))
            continue

        for review in rate_list:
            try:
                name = review['displayUserNick']
                content = review['rateContent']
                rate_date = review['rateDate']
                color, rom, net = _split_sku(review['auctionSku'])
            except (KeyError, IndexError, TypeError) as e:
                print('第' + str(currentPage) + '页有一条评论格式异常,已跳过: ' + repr(e))
                continue

            cur.execute(
                INSERT_SQL,
                (str(name), str(content), str(rate_date), str(color), str(rom), str(net)),
            )
            # Commit each row so an interrupted crawl keeps what it stored.
            conn.commit()
            # Pause after every stored row; this also spaces out page requests.
            t.sleep(2)
        print('第' + str(currentPage) + '页数据保存完毕!')
finally:
    # Release the connection on success and on any unexpected error.
    conn.close()

效果:

Python 爬取天猫 iPhone8plus 销售数据_第1张图片

 

2,读取数据 并分析颜色占比

# -*- coding: utf-8 -*-
"""
Created on Mon Mar  4 16:55:25 2019

@author: Lenovo
"""
import mysql.connector
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']  # font used so Chinese labels render
plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign rendering correctly

# Colour values to count; they double as the pie-chart labels.
xlabels = ['银色', '金色', '深空灰色']

conn = mysql.connector.connect(host='localhost', port=3306, user='root', passwd='password', db='tianmao', charset='utf8mb4')
try:
    cur = conn.cursor()

    # fetchall() returns a list of row tuples; [0][0] unwraps the single
    # count(*) value.
    cur.execute("select count(*) from iphone;")
    alldata = cur.fetchall()[0][0]
    print("总数: " + str(alldata))

    # Count the rows stored for each colour.
    xValues = []
    for label in xlabels:
        cur.execute("select count(*) from iphone where color=%s;", (label,))
        xValues.append(cur.fetchall()[0][0])
finally:
    # Close the connection whether or not the queries succeeded.
    conn.close()

fig = plt.figure()
# plt.pie needs a flat sequence of numbers, hence the unwrapped counts above.
plt.pie(xValues, labels=xlabels, autopct='%.2f%%')
# The second positional argument of plt.title is ``fontdict``; the font size
# has to be passed by keyword.
plt.title("颜色比例图", fontsize=14)
plt.show()


    

    
    

效果:

由图可见,最受欢迎的颜色为深空灰色。

Python 爬取天猫 iPhone8plus 销售数据_第2张图片

 

词云图:

# -*- coding: utf-8 -*-
"""
Created on Mon Mar  4 16:55:25 2019

@author: Lenovo
"""
import mysql.connector
import matplotlib.pyplot as plt

from wordcloud import WordCloud,STOPWORDS,ImageColorGenerator
import jieba

# Word cloud
# Read the review text, one review per line.
with open('content.txt', mode='r', encoding='utf-8') as f:
    comments = f.readlines()
print('readlines:' + str(len(comments)))

# Segment the text itself. Joining the lines (instead of calling str() on the
# list) keeps the list's brackets, quotes and escaped "\n" sequences out of
# the tokenizer input.
comment_after_split = jieba.cut(''.join(comments), cut_all=False)  # precise (non-full) mode
words = ' '.join(comment_after_split)  # space-separated tokens

# Words to leave out of the cloud, added on top of the library defaults.
stopwords = STOPWORDS.copy()
stopwords.update({
    '此用户没有填写评论!',
    '儿子',
    '第一次',
    '手机',
    '苏宁',
    '苹果',
    '还是',
    '不错',
    '问题',
    '收到',
    '用户没有',
    '那天',
    '非常',
})

# Background image used as the mask.
bg_image = plt.imread('bg.jpg')
# Word cloud parameters: canvas width/height, background colour, mask image,
# font file, stop words, maximum font size, and a fixed random seed.
wc = WordCloud(width=1024, height=768, background_color='white', mask=bg_image, font_path='STKAITI.TTF',
               stopwords=stopwords, max_font_size=400, random_state=50)
# Feed the segmented text to the word cloud.
wc.generate_from_text(words)
plt.imshow(wc)
plt.axis('off')  # hide the axes
plt.show()
# Save the result locally.
wc.to_file('词云图.jpg')

    

    
    

Python 爬取天猫 iPhone8plus 销售数据_第3张图片

你可能感兴趣的:(其它,爬虫,python)