# -*- coding: utf-8 -*-
import os
import re
import urllib.request
from copy import deepcopy
import scrapy
import xlrd
import xlwt
from ..items import HuaweiItem
class HuaWei(scrapy.Spider):
    """Spider for the Huawei Vmall store (vmall.com).

    Crawls category pages -> listing pages -> product detail pages and
    collects product data into items; an xls workbook (华为商城.xls) is used
    both as output (via new_xls / the pipeline) and for de-duplication.
    """
    name = 'huawei'
    # Domains the spider is allowed to follow links into.
    allowed_domains = ['vmall.com', 'vmallres.com']
    start_urls = ['http://vmall.com/']
def parse(self, response):
self.new_xls()
# 主页
print("分割线-----------------------主页------------------------分割线")
classify_list_A = response.xpath('//div[@id="category-block"]/div/ol/li')
print("大分类长度:", len(classify_list_A))
for i in classify_list_A:
# print("现在位置:", classify_list_A)
item = HuaweiItem()
item['classify_A'] = i.xpath('.//input[2]/@value').extract_first()
classify_list = i.xpath('.//div[2]//li[not(@class="subcate-btn")]')
# classify_list = i.xpath('.//div[2]//li[last()]')
for i in classify_list:
item['classify_B'] = i.xpath('.//input[1]/@value').extract_first()
href = "https://www.vmall.com" + str(i.xpath('.//a/@href').extract_first()) + '-1-3-0'
# print("href:", href)
yield scrapy.Request(
href,
callback=self.parse_A,
meta={
"item": deepcopy(item)}
)
rb = xlrd.open_workbook('华为商城.xls')
# 通过sheet_by_index()获取的sheet
rs = rb.sheet_by_index(0)
print("已爬取的商品数量:", rs.nrows - 1)
    def parse_A(self, response):
        """Parse one category listing ("middle") page.

        For every product tile: fill in title/price/comments/img/href/coding,
        skip products whose code is already present in column 0 of
        华为商城.xls, and request the detail page (parse_B) for the rest.
        Finally follow the next listing page while the page index embedded in
        the URL is below the number of pagination anchors.
        """
        # Listing ("middle") page.
        print("分割线-----------------------中间页------------------------分割线")
        li_list = response.xpath('//div[@class="layout"]/div[@class="channel-list"]/div[@class="pro-list clearfix"]/ul/li')
        if li_list:
            print("正在爬取页面链接:", response.request.url)
            print("此页面商品数量:", len(li_list))
            for i in li_list:
                item = response.meta["item"]
                # NOTE(review): the workbook is re-opened for every tile —
                # presumably to pick up rows written by the pipeline in the
                # meantime, but this is expensive; consider hoisting.
                rb = xlrd.open_workbook('华为商城.xls')
                rs = rb.sheet_by_index(0)
                # Column 0 holds the product codes already scraped.
                cods = rs.col_values(0, start_rowx=0, end_rowx=None)
                item['title'] = i.xpath('./div[1]/p[2]/a/span[1]/text()').extract_first()
                # Price text looks like "¥1234.00"; default to 0 when absent.
                item['price'] = round(float(i.xpath('./div[1]/p[3]/b/text()').extract_first().split("¥")[1]) if i.xpath('./div[1]/p[3]/b/text()') else 0, 2)
                # Comment count label looks like "<n>人..." — take the number.
                item['comments'] = int(i.xpath('./div[1]/div[@class="p-button clearfix"]//label//text()').extract_first().split("人")[0])
                item['img'] = i.xpath('./div[1]/p[1]/a/img/@src').extract_first()
                item['href'] = "https://www.vmall.com" + i.xpath('./div[1]/p[1]/a/@href').extract_first()
                # Product code = quoted argument(s) of the tile's onclick handler.
                item['coding'] = re.findall('[(]\'(.*?)\'[)]', i.xpath('./div[1]/p[1]/a/@onclick').extract_first())
                # De-duplicate: only fetch detail pages for unseen codes.
                if item['coding'][0] not in cods:
                    yield scrapy.Request(
                        item['href'],
                        callback=self.parse_B,
                        meta={
                            "item": deepcopy(item)}
                    )
            # Pagination: listing URLs end in "-<page>-3-0"; split("-")[2] is
            # the current page number. Follow the next page while it is below
            # the number of pagination anchors on this page.
            next_url_len = len(response.xpath('//ul[@id="page_ul"]/a'))
            if int(response.request.url.split("-")[2]) < next_url_len:
                href = response.request.url.split("-")[0] + "-" + response.request.url.split("-")[1] + "-" + str(
                    int(response.request.url.split("-")[2]) + 1) + '-3-0'
                print("next_href:", href)
                yield scrapy.Request(
                    href,
                    callback=self.parse_A,
                    meta={
                        "item": deepcopy(item)}
                )
    def parse_B(self, response):
        """Parse a product detail page.

        Two page layouts exist: pages with a "product-property-recommand"
        block get promotion data via get_cx and are completed asynchronously
        by get_cu_1; plain "pro-meta-area" pages are yielded directly with
        promotion set to "暂无活动" (no promotion).
        """
        # Detail page.
        print("分割线-----------------------详情页------------------------分割线")
        item = response.meta["item"]
        print("现在位置%s/%s" % (item["classify_A"], item["classify_B"]))
        print("正在爬取:", item['title'])
        # Layout A: page carries a recommand/promotion block.
        content = response.xpath('//div[@id="product-property-recommand"]')
        if content:
            item['promotion'] = self.get_cx(response)
            # NOTE(review): extract_first() may return None here, making
            # .strip() raise AttributeError — confirm this node always exists.
            item['coding'] = content.xpath(
                './div[@class="product-description clearfix"]/div[@class="fl"]/text()').extract_first().strip()
            item['explain'] = content.xpath('.//div[@id="skuPromWord"]//span/text()').extract_first()
            server_explain = content.xpath(
                './/div[@id="product-pulldown1"]/div[1]/div[@class="product-description-list clearfix"]/ul/li')
            item['server_explain'] = self.get_cm(server_explain)
            item['content'] = content.xpath('.//h1[@id="pro-name"]/text()').extract_first()
            # NOTE(review): EMPTY regex pattern — re.findall(r'', ...) returns a
            # list of empty strings, so cu_1 is always ''. The original URL
            # pattern appears to have been lost; this Request cannot work as-is.
            cu_1 = re.findall(r'', response.text)[1]
            yield scrapy.Request(
                cu_1,
                callback=self.get_cu_1,
                meta={
                    "item": deepcopy(item)},
                dont_filter=True
            )
        else:
            # Layout B: plain meta area; no promotion data available.
            content = response.xpath('//div[@class="pro-meta-area"]')
            item['content'] = content.xpath('.//h1[@id="pro-name"]/text()').extract_first()
            item['explain'] = content.xpath('.//div[@id="skuPromWord"]//span/text()').extract_first()
            item['server_explain'] = content.xpath('.//div[@class="pro-service"]/text()').extract_first()
            item['promotion'] = "暂无活动"
            yield item
def get_cx(self, response):
print("获取促销")
"""获取促销数据"""
str = ""
cu = re.findall(r'_groupPhotoList.push[(]{name:.*?}[)]; (_promotionsList+.*?); _prolongLst.push', response.text)
# print(cu)
if cu:
try:
cs = re.findall(r'"(.*?)"', cu[1])
except:
cs = re.findall(r'"(.*?)"', cu[0])
print(cu)
print(len(cu))
# print(cs)
index = 0
pop_list = []
for i in cs:
# 遍历促销,去掉没用的数据
# print("开始", index)
i = i.replace(r'/', "/")
if i.find('') != -1:
i = i.replace("", "\\u")
i = i.replace(";", "")
i = i.replace("\n", "")
i = i.replace("\t", "")
i = i.replace(" ", "")
i = i.encode().decode('unicode-escape')
cs[index] = i
else:
# print("添加了:", index)
pop_list.append(index)
index += 1
# print("结束")
for i in pop_list[::-1]:
cs.pop(i)
ins = 0
for i in cs:
# print("index:", cs.index(i))
str += i
ins += 1
if ins % 2 is 0:
str += ";"
elif ins % 2 is 1:
str += ":"
return str
# cu_1 = re.findall(r'', response.text)[1]
# print(cu_1)
# yield scrapy.Request(
# cu_1,
# callback=self.get_cu_1,
# meta={"item": item, "str": str}
# )
# req = urllib.request.urlopen(cu_1)
# req = req.read()
# req = req.decode("utf-8")
# str += self.get_cu_1(req)
# print("str_s", str)
# return str
# item['promotion'] = str
# print("cu_1", cu_1)
# scrapy.Request(
# cu_1,
# callback=self.get_cu_1,
# meta={"item": item}
# )
# print(rs)
#
# return str
# yield scrapy.Request(
# cu_1,
# callback=self.get_cu_1,
# meta={"item": item, "str": str}
# )
def get_cu_1(self, response):
#获取促销(购买可的积分)
print("进入GET_CU_1")
item = response.meta["item"]
print(item)
cu1 = re.findall(r' \\x3e\'[)],a.push[(](.*?")[)],', response.text)[0]
cul_1 = re.findall(r'\\x3e(.*?)\\x3c', cu1)[0].encode().decode('unicode-escape')
cul_2 = re.findall(r'a.push[(]"(.*?)"', cu1)[0].encode().decode('unicode-escape')
str = cul_1 + ":" + cul_2 + ";"
print("--------------------------str----------------------------------")
item['promotion'] += str
if item['promotion'] is '':
item['promotion'] = "暂无活动"
yield item
# self.get_cu_1(self)
def new_xls(self):
"""创建表格"""
if not os.path.exists("华为商城.xls"):
print("正在创建。。。")
wb = xlwt.Workbook(encoding='utf-8')
# 括号内参数为表名
ws = wb.add_sheet('商品数据')
# 参数1:行数
# 参数2:列数 从0开始计数
# 参数3:值 即单元格的内容
ws.write(0, 0, label='商品编码')
ws.write(0, 1, label='祖分类')
ws.write(0, 2, label='父分类')
ws.write(0, 3, label='标题')
ws.write(0, 4, label='图片')
ws.write(0, 5, label='链接')
ws.write(0, 6, label='价格')
ws.write(0, 7, label='评价数量')
ws.write(0, 8, label='内容')
ws.write(0, 9, label='说明')
ws.write(0, 10, label='服务说明')
ws.write(0, 11, label="促销")
wb.save('华为商城.xls')
def get_cm(self, server_explain):
# 获取服务说明
cm = ""
for i in server_explain:
text = i.xpath('./text()')
if len(text) > 1:
mm = ""
str_1 = re.findall(r'data=\'(.+?)\'>', str(text))
if i.xpath('./span/text()'):
for k in str_1:
mm += k
if str_1.index(k) == 0:
mm += i.xpath('./span/text()').extract_first()
cm += mm
else:
cm += str(text.extract_first()) + ';'
return cm
# git地址 (git repository URL — missing from source)