Scraping JD.com product data with Python and visualizing it with Flask

No more preamble; here is the code, explained in detail so that even beginners can follow along.

Crawler part (including the price, sales volume, positive-rating rate, and reviews returned by JSON requests)

It crawls 920 pages of JD laptop listings, 60 items per page; do the math yourself.
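
Before the full class, here is a minimal sketch of the JSON part of the idea: JD exposes a comment-summary endpoint that returns the comment count (used here as a sales proxy) and the positive-rating rate for a SKU. The SKU id below is only an illustrative value taken from the href example in the code comments; the endpoint and field names mirror what get_xiaoliang() in the crawler reads, but JD may change them at any time.

import requests

sku_id = 8674557  # hypothetical SKU id, borrowed from the href example in the comments below
url = 'https://club.jd.com/comment/productCommentSummaries.action?referenceIds=' + str(sku_id)
resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
summary = resp.json()['CommentsCount'][0]             # same keys the crawler reads in get_xiaoliang()
print(summary['CommentCount'], summary['GoodRate'])   # comment count (sales proxy) and positive-rating rate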

# _*_ coding: utf-8 _*_
__author__ = 'qyg'
__date__ = '2019/3/28 8:45'
import requests
import json
import urllib3.contrib.pyopenssl
from lxml  import etree
import random
import csv
import time
pages = 920

class Craw_JD():
    def __init__(self,pages):
        urllib3.contrib.pyopenssl.inject_into_urllib3()
        self.start_url = 'https://item.jd.com/'
        self.pages = pages
        self.url = 'https://list.jd.com/list.html?cat=670,671,672&page='  # JD list-page URLs follow a fixed pattern; cat identifies the product category (laptops here)
        self.USER_AGENTS = [
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
            "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
            "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
            "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
            "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
            "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
        ]
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',  # default User-Agent, helps dodge basic bot checks
            'Connection': 'Keep-Alive',
        }
    def response_handler(self):
        for page in range(1, self.pages + 1):  # iterate over every page number; JD list pages are numbered from 1
            url_ = self.url + str(page)  # append the page number
            prox = self.pro_x()  # get a working proxy
            self.headers['User-Agent'] = random.choice(self.USER_AGENTS)  # pick a random User-Agent
            res = requests.get(url=url_, proxies=prox, headers=self.headers, verify=False)  # verify=False disables SSL certificate verification
            res = etree.HTML(res.text)  # parse the page so it can be queried with XPath
            items = res.xpath('//*[ @id = "plist"]/ul/li/div/div[1]/a/@href')  # every product-detail href on the page, e.g. //item.jd.com/8674557.html
            responses = [requests.get(url='https:' + str(item), proxies=prox, headers=self.headers, verify=False) for item in items if item]  # list comprehension: fetch every detail page
            print('Fetched 60 items from this page -----')
            self.parse(responses, items)  # hand the responses to parse() to extract the fields

    def pro_x(self):
        '''Pick a random proxy from ip.txt and return it once a test request succeeds.'''
        prox = dict()   # a proxy is a k-v mapping: 'http': 'ip:port'
        ips = []
        with open('ip.txt', 'r') as fp:  # read the proxy addresses from ip.txt into a list
            while True:
                line = fp.readline()  # read one line at a time
                if not line:
                    break
                ips.append(line.rstrip())  # strip the trailing newline before appending
        prox['http'] = random.choice(ips)
        try:  # proxies=prox routes the request through the proxy; timeout=2 means give up after 2 seconds
            res = requests.get(url='https://item.jd.com/', proxies=prox, headers=self.headers, timeout=2)
            if res.status_code == 200:
                return prox  # the proxy works, return it
        except:
            return self.pro_x()  # otherwise pick another random proxy

    def parse(self, responses, items):
        data = []
        if responses:
            for index, res in enumerate(responses):
                html = etree.HTML(res.text)
                url = items[index]
                d_name = html.xpath('//*[@id="crumb-wrap"]/div/div[2]/div[2]/div[1]/div/a/text()')  # shop name taken from the breadcrumb area
                if d_name:
                    d_name = d_name[0]
                else:
                    continue
                strId = url.split("/")
                ids = strId[3].split(".")
                sku_id = ids[0]  # the numeric SKU id from //item.jd.com/8674557.html
                name = html.xpath('//*[@id="parameter-brand"]/li/@title')   # brand name; alternative XPath: //*[@id="parameter-brand"]/li/a
                if name:
                    name = name[0]  # take the first match as a string
                else:
                    continue  # skip this item if the brand is missing
                XL, GoodRade = self.get_xiaoliang(sku_id)  # sales volume and positive-rating rate
                price = self.get_price(sku_id)  # price

                # comms = self.get_comm(sku_id,page=1)
                # if comms:
                #     comms = '|'.join(comms)
                # else:             # fetching reviews is unreliable over free proxies, so this block is commented out;
                #     comms = '此用户无评价或者没有抓到'  # uncomment it and add a comments field to data if you also want review text
                good_info = html.xpath('//*[@id="detail"]/div[2]/div[1]/div[1]/ul[2]/li/text()')  # the product-introduction bullet list
                good_info = ','.join(good_info)
                data.append(['sku_id:'+str(sku_id),'sku_name:'+name,'d_name:'+d_name,'price:'+str(price),'销量:'+str(XL),'GoodRade:'+str(GoodRade),good_info])
            self.save_data(data)
        else:
            print('No responses for this page')

    def get_price(self, sku_id):
        '''Fetch the price from JD's price JSON API; it works just like get_xiaoliang() below, so the details are explained there.'''
        url = 'https://p.3.cn/prices/mgets?skuIds=' + str(sku_id)
        while True:  # keep retrying until the request succeeds
            try:
                # the except clauses below catch request failures and wait for the
                # network to recover, so the crawler keeps running uninterrupted
                prox = self.pro_x()
                req = requests.get(url, proxies=prox, headers=self.headers, timeout=20)
                break
            except requests.exceptions.ConnectionError:
                print('ConnectionError -- please wait 3 seconds')
                time.sleep(3)
            except requests.exceptions.ChunkedEncodingError:
                print('ChunkedEncodingError -- please wait 3 seconds')
                time.sleep(3)
            except:
                print('Unfortunately -- an unknown error happened, please wait 3 seconds')
                time.sleep(3)
        dd = json.loads(req.text)  # the endpoint returns a JSON list; 'p' holds the current price
        price = dd[0].get('p')
        return price

    # def get_comm(self,sku_id,page):
    #     '''Fetch one page (10 reviews) per product.'''
    #     comms = []
    #     proc = self.pro_x()
    #     commUrl = "https://sclub.jd.com/comment/productPageComments.action?&productId={0}&score=0&sortType=5&pageSize=10&page={1}&isShadowSku=0&fold=1" .format(sku_id,page)
    #     while True:  # keep retrying until the request succeeds
    #         try:
    #             # the except clauses below catch request failures and wait for the
    #             # network to recover, so the crawler keeps running uninterrupted
    #             req = requests.get(commUrl,proxies=proc, headers=self.headers, timeout=20)
    #             if req.content:
    #                 dd = json.loads(req.text)
    #                 break
    #         except requests.exceptions.ConnectionError:
    #             print('ConnectionError -- please wait 3 seconds')
    #             time.sleep(3)
    #         except requests.exceptions.ChunkedEncodingError:
    #             print('ChunkedEncodingError -- please wait 3 seconds')
    #             time.sleep(3)
    #         except:
    #             print('Unfortunately -- an unknown error happened, please wait 3 seconds')
    #             time.sleep(3)
    # 
    #     tags = dd.get("comments")
    #     for i in tags:
    #         con = i.get('content')
    #         comms.append(con)
    #     return comms



    def get_xiaoliang(self, sku_id):
        '''Fetch the sales volume (total comment count) and the positive-rating rate from the JSON API requested below.'''
        url = 'https://club.jd.com/comment/productCommentSummaries.action?referenceIds=' + str(sku_id)
        while True:  # keep retrying until the request succeeds
            try:
                # the except clauses below catch request failures and wait for the
                # network to recover, so the crawler keeps running uninterrupted
                prox = self.pro_x()
                req = requests.get(url, proxies=prox, headers=self.headers, timeout=20)
                if req.status_code == 200:
                    break
            except requests.exceptions.ConnectionError:
                print('ConnectionError -- please wait 3 seconds')
                time.sleep(3)
            except requests.exceptions.ChunkedEncodingError:
                print('ChunkedEncodingError -- please wait 3 seconds')
                time.sleep(3)
            except:
                print('Unfortunately -- an unknown error happened, please wait 3 seconds')
                time.sleep(3)
        dd = json.loads(req.text)  # the response is JSON, so deserialize it to read the values
        comment = dd.get("CommentsCount")
        ss = comment[0]
        comment_sum = ss.get('CommentCount')  # total number of comments, used as the sales figure
        goodrate = ss.get("GoodRate")   # positive-rating rate
        return comment_sum, goodrate

    def save_data(self, data):  # newline='' prevents blank rows from appearing
        with open('jd_data.csv', 'a+', encoding='utf-8', newline='') as fp:
            writer = csv.writer(fp)   # write with the csv module
            for row in data:
                writer.writerow(row)  # one row per item; each call writes one page of 60 items
        print('60 items saved')



def main():
    craw = Craw_JD(pages)
    urllib3.disable_warnings()  # suppress the SSL warnings caused by verify=False
    urllib3.contrib.pyopenssl.inject_into_urllib3()  # use pyOpenSSL for the TLS layer
    # monkey.patch_ssl()
    craw.response_handler()
if __name__ == '__main__':
    main()
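
A note on running it: the crawler expects an ip.txt file in its working directory with one proxy per line (the maintenance script in the next section keeps it populated), and it appends its output to jd_data.csv, one page of 60 rows at a time.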

IP proxy maintenance part, using the Xici (xicidaili) free-proxy site

# !/usr/bin/env python
# -*- coding: utf-8 -*-

import requests
from lxml import etree

# target URL: the Xici free-proxy listing site
url = 'https://www.xicidaili.com/?tdsourcetag=s_pctim_aiomsg'
# request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1'}
# fetch the listing page with requests.get and keep the response object
response = requests.get(url=url, headers=headers)
print(response.text)
# build an etree object from response.text, the whole Xici listing page
etree_obj = etree.HTML(response.text)
# filter response.text down to the table rows that contain proxy information
ip_list = etree_obj.xpath("//tr[@class='odd']")
item = []
# walk the rows, join each IP with its port, and collect the results in item
for ip in ip_list:
    ip_num = ip.xpath('./td[2]/text()')[0]
    port_num = ip.xpath('./td[3]/text()')[0]
    http = ip_num + ':' + port_num
    item.append(http)

ips = []
with open('ip.txt', 'r') as fp:  # load the proxies that are already stored
    while True:
        line = fp.readline()
        if not line:
            break
        ips.append(line.rstrip())
print(ips)  # print the proxies that already exist

# test every scraped proxy to check whether it is alive
for it in item:
    # not every proxy works, so wrap the request in exception handling
    try:
        proxy = {
            'http': it
        }
        url1 = 'https://www.baidu.com/'
        # request Baidu through the proxy; timeout=1 drops the connection if no response arrives within 1 second
        res = requests.get(url=url1, proxies=proxy, headers=headers, timeout=1)
        if res.status_code == 200:
            if it not in ips:  # skip proxies that are already stored
                with open('ip.txt', 'a+') as fp:
                    fp.write(it + '\n')  # append new working proxies
            else:
                print(it, 'already stored')
        # log the check result; elapsed.total_seconds() gives the response time
        print(it + '--', res.elapsed.total_seconds())
    except BaseException as e:
        print(e)
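
The script above only appends working proxies, so over time ip.txt accumulates dead entries. Below is a minimal optional sketch, not part of the original script, that re-tests everything already in the file and rewrites it with only the live proxies; the function name prune_proxies and its defaults are my own.

import requests

def prune_proxies(path='ip.txt', test_url='https://www.baidu.com/', timeout=1):
    """Re-test every saved proxy and keep only the ones that still respond."""
    with open(path, 'r') as fp:
        proxies = [line.strip() for line in fp if line.strip()]
    alive = []
    for p in proxies:
        try:
            res = requests.get(test_url, proxies={'http': p}, timeout=timeout)
            if res.status_code == 200:
                alive.append(p)
        except requests.RequestException:
            pass  # dead proxy, drop it
    with open(path, 'w') as fp:
        fp.writelines(p + '\n' for p in alive)

if __name__ == '__main__':
    prune_proxies()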

Data cleaning and processing part

Hadoop was used for the heavy processing and Python for cleaning the fields. I later found that the product-name key inside the product-introduction field was awkward to handle, so I simply dropped it.

# Only the Python regex cleaning part is shown here.
# Clean the last column: strip the irregular leading key-value pair "商品名称:" up to "商品编号"
import re
import csv
a = []
with open('jd_data2.csv','r',encoding='utf-8') as fp:
    reader = csv.reader(fp)
    for i in reader:
        # print(i[6])
        str5 = re.sub(r'商品名称.*商品编号', '商品编号', i[6])  # drop everything from 商品名称 up to 商品编号, keeping 商品编号 onward
        i[6] = str5
        a.append(i)
        print(i)
    with open('jd_clear.csv','a',encoding='utf-8',newline="") as f:
        writer=csv.writer(f)
        for one in a:
            writer.writerow(one)
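
The brand/cpu/price counts that feed the Flask models below were produced with Hadoop in the original workflow. For readers without a Hadoop setup, here is a rough pure-Python sketch of the same idea for the brand counts, assuming the column layout written by save_data() above (column 1 holds 'sku_name:<brand>'):

import csv
from collections import Counter

brand_count = Counter()
with open('jd_clear.csv', 'r', encoding='utf-8') as fp:
    for row in csv.reader(fp):
        if len(row) > 1 and row[1].startswith('sku_name:'):
            brand_count[row[1].split(':', 1)[1]] += 1  # count laptops per brand

for brand, count in brand_count.most_common():
    print(brand, count)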

Flask + ECharts visualization part

  1. Flask uses a structure with the models split into their own module, and the circular-import problem around db is solved.
    Contents of the main file, app.py:
# coding=utf8
"""
Migrate pyecharts and Flask with custom template functions.
"""
from __future__ import unicode_literals


from flask import Flask, render_template
from flask.templating import Environment

from pyecharts import HeatMap, Map, Pie, Line, Bar,WordCloud,TreeMap
from pyecharts.engine import ECHAERTS_TEMPLATE_FUNCTIONS
from pyecharts.conf import PyEchartsConfig
from models import Computer_Cpu,Computer_Price,Computer_Brand

from exts import db

# ----- Adapter ---------
class FlaskEchartsEnvironment(Environment):  # extend Flask's Jinja2 environment
    def __init__(self, *args, **kwargs):
        super(FlaskEchartsEnvironment, self).__init__(*args, **kwargs)
        self.pyecharts_config = PyEchartsConfig(jshost='/static/js')  # serve the ECharts JS files locally from /static/js
        self.globals.update(ECHAERTS_TEMPLATE_FUNCTIONS)    # register the pyecharts template functions as Jinja2 globals


# ---User Code ----

class MyFlask(Flask):  # subclass Flask
    jinja_environment = FlaskEchartsEnvironment  # make FlaskEchartsEnvironment the default template environment

app = MyFlask(__name__)

# configure the database URL
app.config['SQLALCHEMY_DATABASE_URI'] = 'mysql://root:12345@localhost:3306/*****?' \
                                        'charset=utf8'

db.init_app(app)

@app.route("/")
def index():
    return render_template('index.html')


@app.route("/brand/")
def computer_brand():
    '''Laptop sales by brand.'''

    data_list=Computer_Brand.query.all()
    brands=[]
    count=[]
    for item in data_list:
        brands.append(item.brand)
        count.append(item.count)
    pie = Pie('笔记本-销量/牌子Info',title_pos='center',width=1000)

    pie.add('',brands,count,center=[25,50],is_random=True,radius=[30,65],rosetype='area',is_legend_show=False,
            is_label_show=True)

    # Computer_Count.query.all().order_by
    pie.add('',brands,count,center=[80,50],is_random=True,radius=[30,65],rosetype='radius',is_legend_show=False)

    return render_template('Com_brand.html', hm=pie)

@app.route("/cpu/")
def Compu_cpu():
    '''Laptop sales by CPU model.'''
    cpus = []
    count = []
    data_list = Computer_Cpu.query.all()
    for item in data_list:
        # print(item.cpu_version)
        cpus.append(item.cpu_version)
        count.append(item.count)
    bar = Bar("笔记本销量-CPU关系柱状图图")
    # bar.use_theme('dark')
    bar.add("cpu型号", cpus, count, mark_point=["min", "average","max",])
    # return 'hello world'
    return render_template('Com_cpu.html',hm=bar)

@app.route("/price/")
def Compu_Price():
    '''Laptop price distribution.'''

    prices = []
    count = []
    data_list = Computer_Price.query.all()
    for item in data_list:

        prices.append(item.price)
        count.append(item.count)
    line = Line("笔记本-价格需求分析折线图")
    line.add("哈哈", prices, count, mark_point=["average"])

    return render_template('Com_price.html', hm=line)

if __name__ == '__main__':
    app.run(debug=True)
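
With the three tables populated, running python app.py starts the Flask development server (debug=True), and the charts are served at /brand/, /cpu/ and /price/.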

models.py, the model classes

# _*_ coding: utf-8 _*_
__author__ = 'qyg'
__date__ = '2019/3/29 15:47'

from exts import db


class Computer_Brand(db.Model):
    '''Fields backing the brand visualization.'''
    __tablename__ = 'computer_brand'  # table name
    id = db.Column(db.Integer, primary_key=True)
    brand = db.Column(db.String(255))
    count = db.Column(db.Integer)


class Computer_Cpu(db.Model):
    '''Fields backing the CPU visualization.'''
    __tablename__ = 'computer_cpu'
    id = db.Column(db.Integer, primary_key=True)
    cpu_version = db.Column(db.String(255))
    count = db.Column(db.BigInteger)


class Computer_Price(db.Model):
    '''Fields backing the price visualization.'''
    __tablename__ = 'computer_price'
    id = db.Column(db.Integer, primary_key=True)
    price = db.Column(db.String(255))
    count = db.Column(db.Integer)

  2. The intermediate exts.py module, which creates db, exists to break the circular import (a sketch of a one-off data loader follows the code below).
# _*_ coding: utf-8 _*_
__author__ = 'qyg'
__date__ = '2019/4/2 13:24'

from flask_sqlalchemy import SQLAlchemy

db = SQLAlchemy()
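
As a rough sketch of how the three tables might be created and filled from the aggregated counts (this loading step is not shown in the original project, and the row values below are placeholders):

# Hypothetical one-off loader: create the tables and insert aggregated rows.
from app import app
from exts import db
from models import Computer_Brand

with app.app_context():
    db.create_all()  # create computer_brand / computer_cpu / computer_price if they do not exist
    # placeholder example row; in practice loop over the aggregated brand counts
    db.session.add(Computer_Brand(brand='ThinkPad', count=100))
    db.session.commit()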

(Screenshot 1)

  3. Web pages
    (Screenshots 2-5: the rendered chart pages)

If anyone needs the front-end code, let me know and I will paste it.
