Scraping every product listing from the Huawei Vmall with Scrapy -- tech, one step ahead

Huawei Vmall: https://www.vmall.com/index.html

Goal: product information for everything in the Huawei Vmall

  • Follow the homepage's left-hand categories: phones, laptops & tablets, wearables, and so on
  • Every subcategory under each top-level category
    • Product title
    • Product price
  • Specifications
    • Main parameters
    • Body
    • ……
    • Product code
  • Write everything to Excel (an items.py sketch for these fields follows this list)
  • Lay out the Excel sheet so the data is ready for analysis
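
The spider imports HuaweiItem from items.py, which the post does not show. Here is a minimal sketch of what it must contain, reconstructed from the fields the spider assigns (the field names come from the code below; the file layout itself is an assumption):

# items.py -- reconstructed from the fields used by the spider
import scrapy


class HuaweiItem(scrapy.Item):
    classify_A = scrapy.Field()      # top-level category
    classify_B = scrapy.Field()      # subcategory
    title = scrapy.Field()           # product title
    price = scrapy.Field()           # list price
    comments = scrapy.Field()        # number of reviews
    img = scrapy.Field()             # product image URL
    href = scrapy.Field()            # detail-page URL
    coding = scrapy.Field()          # product code
    content = scrapy.Field()         # product name on the detail page
    explain = scrapy.Field()         # promotional blurb
    server_explain = scrapy.Field()  # service description
    promotion = scrapy.Field()       # promotion details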

The spider code is as follows (Scrapy):

# -*- coding: utf-8 -*-
import os
import re
from copy import deepcopy

import scrapy
import xlrd
import xlwt

from ..items import HuaweiItem


class HuaWei(scrapy.Spider):
    name = 'huawei'
    allowed_domains = ['vmall.com', 'vmallres.com']
    start_urls = ['https://www.vmall.com/']

    def parse(self, response):
        self.new_xls()
        # Homepage
        print("----------------------- homepage -----------------------")
        classify_list_A = response.xpath('//div[@id="category-block"]/div/ol/li')
        print("number of top-level categories:", len(classify_list_A))
        for cate in classify_list_A:
            item = HuaweiItem()
            item['classify_A'] = cate.xpath('.//input[2]/@value').extract_first()
            classify_list = cate.xpath('.//div[2]//li[not(@class="subcate-btn")]')
            for sub in classify_list:
                item['classify_B'] = sub.xpath('.//input[1]/@value').extract_first()
                # list pages follow the pattern <base>-<page>-3-0; start at page 1
                href = "https://www.vmall.com" + str(sub.xpath('.//a/@href').extract_first()) + '-1-3-0'
                yield scrapy.Request(
                    href,
                    callback=self.parse_A,
                    meta={"item": deepcopy(item)}
                )
        rb = xlrd.open_workbook('华为商城.xls')
        # grab the first sheet via sheet_by_index()
        rs = rb.sheet_by_index(0)
        print("products already scraped:", rs.nrows - 1)

    def parse_A(self, response):
        # Category list page
        print("----------------------- list page -----------------------")
        li_list = response.xpath('//div[@class="layout"]/div[@class="channel-list"]/div[@class="pro-list clearfix"]/ul/li')
        if li_list:
            print("scraping page:", response.request.url)
            print("products on this page:", len(li_list))
            for li in li_list:
                item = response.meta["item"]
                rb = xlrd.open_workbook('华为商城.xls')
                # column 0 of the first sheet holds the already-scraped product codes
                rs = rb.sheet_by_index(0)
                cods = rs.col_values(0, start_rowx=0, end_rowx=None)
                item['title'] = li.xpath('./div[1]/p[2]/a/span[1]/text()').extract_first()
                price = li.xpath('./div[1]/p[3]/b/text()')
                item['price'] = round(float(price.extract_first().split("¥")[1]), 2) if price else 0
                item['comments'] = int(li.xpath('./div[1]/div[@class="p-button clearfix"]//label//text()').extract_first().split("人")[0])
                item['img'] = li.xpath('./div[1]/p[1]/a/img/@src').extract_first()
                item['href'] = "https://www.vmall.com" + li.xpath('./div[1]/p[1]/a/@href').extract_first()
                item['coding'] = re.findall(r"[(]'(.*?)'[)]", li.xpath('./div[1]/p[1]/a/@onclick').extract_first())
                # only follow products whose code is not in the spreadsheet yet
                if item['coding'][0] not in cods:
                    yield scrapy.Request(
                        item['href'],
                        callback=self.parse_B,
                        meta={"item": deepcopy(item)}
                    )
            # pagination: list URLs end in -<page>-3-0, so bump the page number
            # until the count of pagination links is exhausted
            next_url_len = len(response.xpath('//ul[@id="page_ul"]/a'))
            parts = response.request.url.split("-")
            if int(parts[2]) < next_url_len:
                href = parts[0] + "-" + parts[1] + "-" + str(int(parts[2]) + 1) + '-3-0'
                print("next_href:", href)
                yield scrapy.Request(
                    href,
                    callback=self.parse_A,
                    meta={"item": deepcopy(item)}
                )


    def parse_B(self, response):
        # Product detail page
        print("----------------------- detail page -----------------------")
        item = response.meta["item"]
        print("current position: %s/%s" % (item["classify_A"], item["classify_B"]))
        print("scraping:", item['title'])
        content = response.xpath('//div[@id="product-property-recommand"]')
        if content:
            item['promotion'] = self.get_cx(response)
            item['coding'] = content.xpath(
                './div[@class="product-description clearfix"]/div[@class="fl"]/text()').extract_first().strip()
            item['explain'] = content.xpath('.//div[@id="skuPromWord"]//span/text()').extract_first()
            server_explain = content.xpath(
                './/div[@id="product-pulldown1"]/div[1]/div[@class="product-description-list clearfix"]/ul/li')
            item['server_explain'] = self.get_cm(server_explain)
            item['content'] = content.xpath('.//h1[@id="pro-name"]/text()').extract_first()
            # NOTE: the regex for the promo-script URL is missing here, so this
            # findall cannot match anything as written
            cu_1 = re.findall(r'', response.text)[1]
            yield scrapy.Request(
                cu_1,
                callback=self.get_cu_1,
                meta={"item": deepcopy(item)},
                dont_filter=True
            )
        else:
            content = response.xpath('//div[@class="pro-meta-area"]')
            item['content'] = content.xpath('.//h1[@id="pro-name"]/text()').extract_first()
            item['explain'] = content.xpath('.//div[@id="skuPromWord"]//span/text()').extract_first()
            item['server_explain'] = content.xpath('.//div[@class="pro-service"]/text()').extract_first()
            item['promotion'] = "暂无活动"
            yield item

    def get_cx(self, response):
        """Extract the promotion data embedded in the page's JavaScript."""
        print("extracting promotions")
        result = ""
        cu = re.findall(r'_groupPhotoList.push[(]{name:.*?}[)]; (_promotionsList+.*?); _prolongLst.push', response.text)
        if cu:
            # the promotions blob is usually the second match; fall back to the first
            try:
                cs = re.findall(r'"(.*?)"', cu[1])
            except IndexError:
                cs = re.findall(r'"(.*?)"', cu[0])
            index = 0
            pop_list = []
            for i in cs:
                # walk the promotion strings, dropping entries with no entity-encoded text
                i = i.replace("\\/", "/")
                if i.find('&#x') != -1:
                    # HTML hex entities -> \uXXXX escapes -> real characters
                    i = i.replace("&#x", "\\u")
                    i = i.replace(";", "")
                    i = i.replace("\n", "")
                    i = i.replace("\t", "")
                    i = i.replace(" ", "")
                    i = i.encode().decode('unicode-escape')
                    cs[index] = i
                else:
                    pop_list.append(index)
                index += 1
            # pop from the back so the remaining indices stay valid
            for i in pop_list[::-1]:
                cs.pop(i)
            # entries alternate name/value: stitch them into "name:value;" pairs
            ins = 0
            for i in cs:
                result += i
                ins += 1
                if ins % 2 == 0:
                    result += ";"
                else:
                    result += ":"
        return result
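    # Side note on the decoding trick above: every &#xXXXX; HTML entity maps
    # one-to-one onto a Python \uXXXX escape, so a pair of replaces plus a
    # unicode-escape round trip recovers the text. A standalone illustration
    # (the sample string is assumed, not taken from the site):
    #
    #   s = "&#x6EE1;&#x51CF;"                        # "满减" as HTML entities
    #   s = s.replace("&#x", "\\u").replace(";", "")  # -> "\u6EE1\u51CF"
    #   s.encode().decode('unicode-escape')           # -> "满减"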

    def get_cu_1(self, response):
        # Promotion extras: the bonus points granted for the purchase
        print("entering get_cu_1")
        item = response.meta["item"]
        cu1 = re.findall(r' \\x3e\'[)],a.push[(](.*?")[)],', response.text)[0]
        cul_1 = re.findall(r'\\x3e(.*?)\\x3c', cu1)[0].encode().decode('unicode-escape')
        cul_2 = re.findall(r'a.push[(]"(.*?)"', cu1)[0].encode().decode('unicode-escape')
        item['promotion'] += cul_1 + ":" + cul_2 + ";"
        if item['promotion'] == '':
            item['promotion'] = "暂无活动"
        yield item

    def new_xls(self):
        """Create the spreadsheet on first run."""
        if not os.path.exists("华为商城.xls"):
            print("creating spreadsheet...")
            wb = xlwt.Workbook(encoding='utf-8')
            # add_sheet() takes the sheet name
            ws = wb.add_sheet('商品数据')
            # ws.write(row, col, value) -- both indices are 0-based
            ws.write(0, 0, label='商品编码')
            ws.write(0, 1, label='祖分类')
            ws.write(0, 2, label='父分类')
            ws.write(0, 3, label='标题')
            ws.write(0, 4, label='图片')
            ws.write(0, 5, label='链接')
            ws.write(0, 6, label='价格')
            ws.write(0, 7, label='评价数量')
            ws.write(0, 8, label='内容')
            ws.write(0, 9, label='说明')
            ws.write(0, 10, label='服务说明')
            ws.write(0, 11, label="促销")
            wb.save('华为商城.xls')

    def get_cm(self, server_explain):
        # Build the service-description string
        cm = ""
        for li in server_explain:
            text = li.xpath('./text()')
            if len(text) > 1:
                mm = ""
                str_1 = re.findall(r'data=\'(.+?)\'>', str(text))
                if li.xpath('./span/text()'):
                    # splice the <span> text in after the first fragment
                    for pos, k in enumerate(str_1):
                        mm += k
                        if pos == 0:
                            mm += li.xpath('./span/text()').extract_first()
                cm += mm
            else:
                cm += str(text.extract_first()) + ';'

        return cm
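
The spider only yields items; actually writing them into 华为商城.xls has to happen in an item pipeline, which the post does not include. Below is a minimal sketch of what such a pipeline could look like, assuming the xlutils package for appending rows (the class name HuaweiPipeline and the xlutils dependency are illustrative choices, not from the post); the column order mirrors the header row created in new_xls():

# pipelines.py -- illustrative sketch, not part of the original post
import xlrd
from xlutils.copy import copy


class HuaweiPipeline(object):
    FILE = '华为商城.xls'
    # item fields in spreadsheet column order (matches new_xls)
    COLUMNS = ['coding', 'classify_A', 'classify_B', 'title', 'img', 'href',
               'price', 'comments', 'content', 'explain', 'server_explain',
               'promotion']

    def process_item(self, item, spider):
        rb = xlrd.open_workbook(self.FILE)
        row = rb.sheet_by_index(0).nrows  # next empty row
        wb = copy(rb)                     # xlrd workbooks are read-only
        ws = wb.get_sheet(0)
        for col, field in enumerate(self.COLUMNS):
            value = item.get(field, '')
            if isinstance(value, list):   # 'coding' can arrive as a one-element list
                value = value[0]
            ws.write(row, col, value)
        wb.save(self.FILE)
        return item

The pipeline would then be enabled in settings.py (assuming the project is named huawei):

ITEM_PIPELINES = {
    'huawei.pipelines.HuaweiPipeline': 300,
}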

Git repository
