Python爬虫系列之爬取某优选微信小程序全国店铺商品数据

Python爬虫系列之爬取某优选微信小程序全国商品数据

小程序爬虫接单、app爬虫接单、网页爬虫接单、接口定制、网站开发、小程序开发 > 点击这里联系我们 <

微信请扫描下方二维码

在这里插入图片描述

代码仅供学习交流，请勿用于非法用途

  • 数据库仅用于去重，数据主要存储于 Excel 文件

一、准备数据库

-- Recreate the `shop` database from scratch. WARNING: this drops any existing
-- database with the same name.
drop database if exists shop;

create database shop default charset utf8;

use shop;

-- One row per (area, store) pair already handled by the crawler. The composite
-- unique key makes a repeated insert fail, which the crawler treats as
-- "already seen".
create table `store`(
	`id` int primary key auto_increment,
	`store_id` varchar(18) not null comment 'store_id',
	`area_id` varchar(18) not null comment 'area_id',
	UNIQUE KEY `area_id` (`area_id`, `store_id`)
)engine=INNODB charset=utf8;

-- One row per product id already exported. The unique constraint on `goods_id`
-- is what de-duplicates products across stores.
create table `goods`(
	`id` int primary key auto_increment,
	`goods_id` varchar(18) not null unique comment 'goods_id'
)engine=INNODB charset=utf8;

二、代码实现

import datetime
import json
import threading
import time
from queue import Empty, Queue

import MySQLdb
import requests
import xlrd
import xlwt
from xlutils.copy import copy

'''
    @Author     :王磊
    @Date       :2019/9/20
    @Description:优选微信小程序全国店铺商品数据爬取
'''

# -----------------------------------------------------------
# User-tunable settings.
# Number of spider threads started by main().
threadNum = 1
# Directory prefix under which the per-area .xls workbooks are written.
excelPath = "c:/users/it1002/Desktop/data/excel"
# Directory prefix under which downloaded product images are written.
imgPath = "c:/users/it1002/Desktop/data/img"
# Database account name
mysql_user = "root"
# Database password
mysql_password = "root"
# Database name
mysql_database = "shop"
# -----------------------------------------------------------

# Headers sent with form-encoded POST requests (used by postHtml).
headers = {
    "content-type": "application/x-www-form-urlencoded",
    "User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; OPPO R11 Build/NMF26X; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/74.0.3729.136 Mobile Safari/537.36 MicroMessenger/7.0.6.1460(0x27000634) Process/appbrand0 NetType/WIFI Language/zh_CN",
    "Host": "mall-store.xsyxsc.com",
}

# Headers sent with GET requests (used by getHtml); no Host/content-type here
# because the GET endpoints live on several different hosts.
headers_ = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; OPPO R11 Build/NMF26X; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/74.0.3729.136 Mobile Safari/537.36 MicroMessenger/7.0.6.1460(0x27000634) Process/appbrand0 NetType/WIFI Language/zh_CN",
}

# Value passed as the `userKey` parameter on every API call.
# NOTE(review): hard-coded credential-like value; presumably tied to one
# mini-program session — confirm before reuse.
userKey = "8be3d5dd-23e1-4b04-bf12-efca98d69d12"

# Area ids already visited; shared by all spider threads and mutated in run().
areaList = []


class xsyxSpider(threading.Thread):
    """Worker thread that exports product details for queued locations.

    Each queue item is ``[area, mapX, mapY]``. For every item the worker
    creates a dated ``.xls`` workbook, fetches the nearby stores and, for the
    first store of each not-yet-visited area, writes one row per new product.
    The MySQL ``store``/``goods`` tables are used only for de-duplication: an
    insert that fails (typically on a unique-key violation) means "skip".
    """

    def __init__(self, loaQueue, *args, **kwargs):
        """Store the shared location queue and define the workbook columns."""
        super(xsyxSpider, self).__init__(*args, **kwargs)
        # Path of the workbook currently being written; set by initExcel().
        self.excelPath = ""
        self.loaQueue = loaQueue
        # Column order of the workbook. getGoodsDetail() builds its rows from
        # this same list, so header and data columns always line up.
        self.excelTitle = [
            'prId', 'acId', 'preId', 'sku', 'dailySaleTime', 'tmBuyStart',
            'tmBuyEnd', 'tmShowStart', 'tmShowEnd', 'adUrl', 'prName',
            'tmPickUp', 'limitQty', 'ulimitQty', 'marketAmt', 'saleAmt',
            'prType', 'areaId', 'shelfLife', 'folQty', 'daySaleQty', 'saleQty',
            'vesName', 'attrs', 'prDetail', 'shTitle', 'prBrief',
            'primaryUrls', 'detailUrls', 'consumerNum', 'hasImgTxt', 'prTitle',
            'yieldly', 'brName', 'specialSale', 'brId', 'status',
        ]

    def getDate(self):
        """Return today's date as an ISO ``YYYY-MM-DD`` string."""
        return str(datetime.date.today())

    def initExcel(self, areaName):
        """Create ``<excelPath>/<date>-<areaName>.xls`` containing only the header row."""
        self.excelPath = excelPath + "/" + self.getDate() + "-" + str(areaName) + ".xls"
        book = xlwt.Workbook()
        sheet = book.add_sheet(u'double', cell_overwrite_ok=True)
        for col, title in enumerate(self.excelTitle):
            sheet.write(0, col, title)
        book.save(self.excelPath)

    def writeExcel(self, data):
        """Append ``data`` (a sequence of cell values) as a new row of the current workbook.

        Does nothing when ``data`` is empty or ``None``.
        """
        if not data:
            return
        workbook = xlrd.open_workbook(self.excelPath)
        next_row = workbook.sheet_by_index(0).nrows
        # xlrd workbooks are read-only; xlutils' copy() yields a writable one.
        new_workbook = copy(workbook)
        new_worksheet = new_workbook.get_sheet(0)
        for col, value in enumerate(data):
            try:
                new_worksheet.write(next_row, col, value)
            except Exception:
                # Deliberately best-effort: a cell the writer rejects is left
                # blank instead of losing the whole row.
                continue
        new_workbook.save(self.excelPath)

    def getShopList(self, mapX, mapY):
        """Return the ``data`` payload of the nearby-store API, or ``None`` if unavailable."""
        url = "https://mall-store.xsyxsc.com/mall-store/store/getNearStoreList"
        data = {
            "mapX": str(mapX),
            "mapY": str(mapY),
            "userKey": userKey,
        }
        shopListResp = postHtml(url, data)
        try:
            return shopListResp['data']
        except (KeyError, TypeError):
            return None

    def updateStore(self, storeId):
        """Switch the session's current store; return True only on ``rspCode == 'success'``."""
        url = (
            "https://user.xsyxsc.com/api/user/user/updateCurrStoreId?userKey="
            + userKey + "&storeId=" + str(storeId)
        )
        updateStoreResp = getHtml(url)
        print("切换店铺响应:" + str(updateStoreResp))
        try:
            return updateStoreResp['rspCode'] == 'success'
        except (KeyError, TypeError):
            return False

    def getTs(self):
        """Return the current Unix time in milliseconds as a digit string.

        Computed arithmetically rather than by slicing ``str(time.time())``,
        whose number of decimal places is not fixed.
        """
        return str(int(time.time() * 1000))

    def getGoodsList(self, storeId, areaId):
        """Return the ``products`` list of a store's index page, or ``None`` if unavailable."""
        url = (
            "https://mall.xsyxsc.com/user/product/indexData?storeId=" + str(storeId)
            + "&areaId=" + str(areaId)
            + "&ts=" + str(self.getTs())
            + "&userKey=" + userKey
        )
        goodsListResp = getHtml(url)
        try:
            return goodsListResp['data']['products']
        except (KeyError, TypeError):
            return None

    def saveImg(self, url, productId):
        """Download ``url`` to ``<imgPath>/<productId>.jpg``.

        Returns the saved path, or ``None`` if the download or the write fails.
        """
        try:
            # A timeout keeps one stalled image from blocking the thread forever.
            img = requests.get(url, timeout=10)
            # Do not save an HTTP error page under a .jpg name.
            img.raise_for_status()
            path = imgPath + "/" + str(productId) + ".jpg"
            with open(path, 'wb') as f:
                f.write(img.content)
            return path
        except (requests.RequestException, OSError, ValueError):
            return None

    def getGoodsDetail(self, productId, activityId, storeId, areaId):
        """Fetch one product's detail and return it as a workbook row.

        Returns a list of strings ordered like ``self.excelTitle``, or ``None``
        when the response carries no usable ``data`` object.
        """
        url = (
            "https://mall.xsyxsc.com/user/product/productInfo?productId=" + str(productId)
            + "&activityId=" + str(activityId)
            + "&storeId=" + str(storeId)
            + "&productType=CHOICE&areaId=" + str(areaId)
            + "&userKey=" + userKey
        )
        goodsDetailResp = getHtml(url)
        try:
            detail = goodsDetailResp['data']
        except (KeyError, TypeError):
            return None
        if not isinstance(detail, dict):
            # A null/odd payload would otherwise produce an all-blank row.
            return None
        return self._buildGoodsRow(detail, productId)

    def _buildGoodsRow(self, detail, productId):
        """Map a product-detail dict onto the ``self.excelTitle`` column order.

        Missing keys become ``""``. The ``primaryUrls`` column holds the local
        path of the first image (downloaded via saveImg) instead of the URLs.
        """
        row = []
        for field in self.excelTitle:
            if field == 'primaryUrls':
                try:
                    imgFile = self.saveImg(detail[field][0], productId)
                except (KeyError, IndexError, TypeError):
                    imgFile = None
                row.append(imgFile or "")
            elif field in detail:
                row.append(str(detail[field]))
            else:
                row.append("")
        return row

    def _runSql(self, statements):
        """Run ``(sql, params)`` pairs on a fresh connection, committing after each.

        Returns True if every statement succeeded, False on any MySQL error
        (including unique-key violations). The connection is always closed.
        """
        conn = None
        try:
            conn = MySQLdb.connect(
                user=mysql_user,
                password=mysql_password,
                database=mysql_database,
                charset='utf8',
            )
            cursor = conn.cursor()
            for sql, params in statements:
                cursor.execute(sql, params)
                conn.commit()
            return True
        except MySQLdb.Error:
            return False
        finally:
            if conn is not None:
                conn.close()

    def addStore(self, store):
        """Insert ``store['store_id']``/``store['area_id']``; False means duplicate or DB error."""
        params = (str(store['store_id']), str(store['area_id']))
        # Placeholders let the driver escape values that come from the remote API.
        return self._runSql([
            ("insert into store(store_id, area_id) values(%s, %s)", params),
        ])

    def addGoods(self, goods):
        """Insert ``goods['goods_id']``; False means duplicate or DB error."""
        params = (str(goods['goods_id']),)
        return self._runSql([
            ("insert into goods(goods_id) values(%s)", params),
        ])

    def initDb(self):
        """Empty the ``goods`` and ``store`` de-duplication tables; return success.

        NOTE(review): every thread calls this from run(), so with
        threadNum > 1 a late-starting thread wipes entries added by others.
        """
        return self._runSql([
            ("delete from goods", None),
            ("delete from store", None),
        ])

    def run(self):
        """Drain the location queue, exporting each location's products."""
        if not self.initDb():
            print("数据库初始化失败")
            return
        while True:
            try:
                # Non-blocking get: an empty()-then-get() pair can block forever
                # when another thread takes the last item in between.
                loa = self.loaQueue.get_nowait()
            except Empty:
                break
            area, mapX, mapY = loa[0], loa[1], loa[2]
            self.initExcel(area)
            print("当前选中区域:" + area + ", 经纬度:mapX : " + str(mapX) + ", mapY : " + str(mapY))
            shopList = self.getShopList(mapX, mapY)
            if not shopList:
                print("该区域没有任何数据")
                continue
            for shop in shopList:
                self._crawlShop(shop)

    def _crawlShop(self, shop):
        """Export the products of ``shop`` if its area has not been visited yet."""
        storeId = shop['storeId']
        areaId = shop['areaId']
        # Only the first store seen in each area is crawled.
        if areaId in areaList:
            return
        areaList.append(areaId)
        if not self.addStore({'store_id': storeId, 'area_id': areaId}):
            return
        print("当前区域id:" + str(areaId))
        if not self.updateStore(storeId):
            return
        goodsList = self.getGoodsList(storeId, areaId)
        if not goodsList:
            return
        for goods in goodsList:
            productId = goods['prId']
            activityId = goods['acId']
            if not self.addGoods({'goods_id': productId}):
                continue
            row = self.getGoodsDetail(productId, activityId, storeId, areaId)
            if row:
                self.writeExcel(row)
            # Throttle detail requests to one per second.
            time.sleep(1)


def getLoaQueue(path="loas_.txt"):
    """Load ``area,lng,lat`` lines from ``path`` into an unbounded queue.

    All spaces are removed from each line before it is split on commas.
    Blank lines and lines with fewer than three non-empty fields are skipped
    rather than raising; fields beyond the third are ignored.

    Args:
        path: UTF-8 text file to read; defaults to ``loas_.txt`` in the
            current working directory.

    Returns:
        A ``Queue`` of ``[area, lng, lat]`` string lists in file order.
    """
    loaQueue = Queue(0)
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.strip().replace(" ", "").split(",")
            if len(fields) < 3 or not all(fields[:3]):
                continue
            area, lng, lat = fields[:3]
            loaQueue.put([area, lng, lat])
    return loaQueue


def postHtml(url, data, max_retries=5, retry_delay=1):
    """POST form ``data`` to ``url`` and return the decoded JSON body.

    The request is retried on network errors and on undecodable or non-JSON
    bodies, but only ``max_retries`` times and with a pause between attempts,
    so a permanently failing endpoint cannot spin the thread forever.

    Args:
        url: Endpoint to call.
        data: Form fields to send.
        max_retries: Maximum number of attempts.
        retry_delay: Seconds to wait between attempts.

    Returns:
        The parsed JSON value, or ``None`` if every attempt failed.
    """
    for attempt in range(max_retries):
        try:
            resp = requests.post(url, data=data, headers=headers, timeout=10)
            return json.loads(resp.content.decode("utf-8"))
        except (requests.RequestException, ValueError):
            # ValueError covers both JSON and UTF-8 decoding failures.
            if attempt + 1 < max_retries:
                time.sleep(retry_delay)
    return None


def getHtml(url, max_retries=5, retry_delay=1):
    """GET ``url`` and return the decoded JSON body.

    The request is retried on network errors and on undecodable or non-JSON
    bodies, but only ``max_retries`` times and with a pause between attempts,
    so a permanently failing endpoint cannot spin the thread forever.

    Args:
        url: Full URL including the query string.
        max_retries: Maximum number of attempts.
        retry_delay: Seconds to wait between attempts.

    Returns:
        The parsed JSON value, or ``None`` if every attempt failed.
    """
    for attempt in range(max_retries):
        try:
            resp = requests.get(url, headers=headers_, timeout=10)
            return json.loads(resp.content.decode("utf-8"))
        except (requests.RequestException, ValueError):
            # ValueError covers both JSON and UTF-8 decoding failures.
            if attempt + 1 < max_retries:
                time.sleep(retry_delay)
    return None


def main():
    """Build the location queue and start ``threadNum`` spider threads on it."""
    pending = getLoaQueue()
    for _ in range(threadNum):
        # All workers share one queue, so each location is handled once.
        worker = xsyxSpider(pending)
        worker.start()


# Start the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

小程序爬虫接单、app爬虫接单、网页爬虫接单、接口定制、网站开发、小程序开发 > 点击这里联系我们 <

你可能感兴趣的:(Python)