淘宝爬虫爬取商品详情和销量

废话不多说，直接上代码。由于获取销量的接口需要登录后的 cookies，并且需要具备相应的访问权限，所以需要先在浏览器（Chrome）中登录一次淘宝，然后再通过代码获取销量字段。

#!/usr/bin/python
# -*- coding: utf-8 -*-
import json
import re
import urllib
import urllib.request

import requests
from bs4 import BeautifulSoup
from pycookiecheat import chrome_cookies
from selenium import webdriver

from module.CookiesUtil import get_cookie_path
from module.TaobaoItem import TaobaoItem

class TaobaoProcessor(object):

    def process(self, url):
        """Scrape one Taobao item page through a Selenium-driven Chrome.

        Loads ``url`` in a fresh Chrome instance and reads, from the rendered
        page: the title, the list price under ``#J_StrPrice``, the thumbnail
        image URLs under ``#J_UlThumb``, the first two property groups under
        ``#J_isku`` (called ``chima`` and ``color`` in this code), the inner
        HTML of ``#attributes`` and ``#description``, and the sale / comment
        counters.  The values are copied onto a ``TaobaoItem`` whose
        ``__dict__`` is printed.

        The promotional price is not scraped here; ``get_price`` queries the
        separate endpoint for it.

        Args:
            url: Address of the item detail page.

        Returns:
            The populated ``TaobaoItem``.

        Raises:
            Any Selenium error raised while loading the page or locating one
            of the elements above.  The browser is closed in that case too.
        """
        browser = webdriver.Chrome()
        try:
            browser.get(url)
            # Element lookups below keep retrying for up to 10 seconds.
            browser.implicitly_wait(10)

            # Locators are passed as (strategy, value) string pairs: the
            # find_element_by_* shortcuts no longer exist in Selenium >= 4.3,
            # while this form is accepted by old and new releases alike.
            title = browser.find_element("class name", "tb-main-title").text
            origin_price = (
                browser.find_element("id", "J_StrPrice")
                .find_element("class name", "tb-rmb-num")
                .text
            )

            img_list = []
            for img in browser.find_elements(
                    "xpath", '//ul[@id="J_UlThumb"]/li/div/a/img'):
                img_url = img.get_attribute('src')
                if not img_url:
                    continue
                # Drop everything after the last "_" (presumably the
                # thumbnail size suffix -- TODO confirm).  rsplit leaves a
                # URL without any "_" untouched instead of emptying it.
                img_list.append(img_url.rsplit("_", 1)[0])

            chima = browser.find_element(
                "xpath", '//*[@id="J_isku"]/div/dl[1]/dt').text
            chima_element = browser.find_element(
                "xpath", '//*[@id="J_isku"]/div/dl[1]/dd/ul').text
            # The option list renders as one option per text line.
            span_list = chima_element.split("\n")

            color_pro = browser.find_element(
                "xpath", '//*[@id="J_isku"]/div/dl[2]/dt').text
            color_list = [
                color.get_attribute('innerHTML')
                for color in browser.find_elements(
                    "xpath", '//*[@id="J_isku"]/div/dl[2]/dd/ul/li/a/span')
            ]

            attr_val = browser.find_element(
                "id", "attributes").get_attribute("innerHTML")
            description_val = browser.find_element(
                "id", "description").get_attribute("innerHTML")

            sale_count = browser.find_element(
                "id", "J_SellCounter").get_attribute('innerHTML')
            comment_count = browser.find_element("id", "J_RateCounter").text
        finally:
            # Always shut the browser down, otherwise every call leaves a
            # Chrome / chromedriver process behind.
            browser.quit()

        item = TaobaoItem()
        item.title = title
        item.origin_price = origin_price
        item.img_list = img_list
        item.chima = chima
        item.color_pro = color_pro
        item.span_list = span_list
        item.color_list = color_list
        item.attr_val = attr_val
        item.description_val = description_val
        item.sale_count = sale_count
        item.comment_count = comment_count

        print(item.__dict__)
        return item

    def get_price(self, userid, itemid):
        """Fetch and print the default promotional price of an item.

        Requests the ``sib.htm`` detail endpoint on ``detailskip.taobao.com``
        with the ``onSibRequestSuccess`` JSONP callback, unwraps the JSON
        payload and reads ``data -> promotion -> promoData -> def``.

        Args:
            userid: Seller id, sent as the ``sellerId`` query parameter.
            itemid: Item id, sent as the ``itemId`` query parameter.

        Returns:
            The ``price`` value of the default promotion entry.

        Raises:
            ValueError: If the response body is not wrapped in
                ``onSibRequestSuccess(...)``.
            KeyError: If the payload lacks the promotion path read above.
            IndexError: If the default promotion list is empty.
            requests.RequestException: If the HTTP request fails.
        """
        url = (
            "https://detailskip.taobao.com/service/getData/1/p1/item/detail/sib.htm"
            "?itemId={}&sellerId={}"
            "&modules=dynStock,qrcode,viewer,price,duty,xmpPromotion,delivery,upp,"
            "activity,fqg,zjys,amountRestriction,couponActivity,soldQuantity,"
            "originalPrice,tradeContract"
            "&callback=onSibRequestSuccess"
        ).format(itemid, userid)
        _headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
            'authority': 'detailskip.taobao.com',
            'method': 'GET',
        }
        # The context manager releases the session's connections; the
        # timeout keeps a stalled server from blocking the call forever.
        with requests.session() as req:
            resp = req.get(url, headers=_headers, timeout=10)
        content = resp.content.decode()

        # Strip the JSONP wrapper: onSibRequestSuccess(<json>);
        wrapped = re.search(r"onSibRequestSuccess\((.+)\)", content, re.S)
        if wrapped is None:
            raise ValueError(
                "response is not an onSibRequestSuccess(...) JSONP payload")
        data = json.loads(wrapped.group(1))

        default_promo = data['data']['promotion']['promoData']['def']
        # The endpoint delivers "def" as a list of promotion entries, so
        # indexing it with 'price' directly fails; a bare dict is still
        # accepted in case the shape differs.
        if isinstance(default_promo, list):
            default_promo = default_promo[0]
        new_price = default_promo['price']
        print(new_price)
        return new_price

    def process_html(self, url):
        """Download ``url`` and print its pretty-printed HTML.

        Args:
            url: Address of the page to download.

        Returns:
            The parsed ``BeautifulSoup`` document.

        Raises:
            requests.RequestException: If the HTTP request fails.
        """
        _headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        }
        # Close the session once the body is downloaded; the timeout keeps a
        # stalled server from blocking the call forever.
        with requests.session() as req:
            resp = req.get(url, headers=_headers, timeout=10)
        # Name the parser explicitly: without it BeautifulSoup picks whatever
        # parser happens to be installed (and warns), so the resulting tree
        # can differ between machines.
        soap = BeautifulSoup(resp.content, 'html.parser')
        print(soap.prettify())
        return soap

    def taobao_spider(self,url):
        headers = {
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'zh-CN,zh;q=0.3',
            'Referer': 'https://item.taobao.com/item.htm',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
            'Connection': 'keep-alive',
        }
        goods_id = re.findall('id=(\d+)', url)[0]
        try:
            req = urllib.request.Request(url=url, headers=headers)
            res = urllib.request.urlopen(req).read().decode('gbk', 'ignore')
        except Exception as e:
            print('无法打开网页:', e.reason)
        try:
            title = re.findall('

(.*?)', res)[0] des_url = re.findall(r"descUrl\s+:(.+)", res) sellerid = re.findall(r"sellerId\s+:\s\'(.+)\'",res)[0] des_url = des_url[0].split(":")[2] des_url = re.findall(r"\'(.+)\'", des_url) des_url = 'https:' + des_url[0] des_request = urllib.request.Request(url=des_url, headers=headers) des_resp = urllib.request.urlopen(des_request).read() des_val = des_resp.decode('gbk', 'ignore') des_val = re.findall(r"var desc=((.+\s)+)",des_val) des_val = des_val[0][0] des_val = des_val.replace("';",'') des_val = des_val.replace("'", '') des_val = des_val.replace("\\",'') soap = BeautifulSoup(res) desciption = str(soap.find_all('div',id='attributes')[0]) # 30-42行为抓取淘宝商品真实价格,该数据是动态加载的 # purl = "https://detailskip.taobao.com/service/getData/1/p1/item/detail/sib.htm?itemId={}&sellerId={}&modules=dynStock,qrcode,viewer,price,duty,xmpPromotion,delivery,upp,activity,fqg,zjys,amountRestriction,couponActivity,soldQuantity,originalPrice,tradeContract&callback=onSibRequestSuccess".format(goods_id,sellerid) purl = "https://detailskip.taobao.com/service/getData/1/p1/item/detail/sib.htm?itemId={}&sellerId={}&modules=price,soldQuantity".format( goods_id,sellerid) headers['Referer'] = url cookie_path = get_cookie_path() cookies = chrome_cookies('https://item.taobao.com/', cookie_path) cookie_str = '' for k in cookies: cookie_str = cookie_str + k+"="+cookies[k]+";" headers['Cookie'] = cookie_str price_req = urllib.request.Request(url=purl, headers=headers) price_res = urllib.request.urlopen(price_req).read() resp_data = price_res.decode() data = list(set(re.findall('"price":"(.*?)"', resp_data))) confirm_sell = re.findall(r'"confirmGoodsCount":"(.*?)"', resp_data)[0] sell_total = re.findall(r'"soldTotalCount":"(.*?)"', resp_data)[0] # data列表中的价格可能是定值与区间的组合,也可能只是定值,而且不一定有序 real_price = "" for t in data: if '-' in t: real_price = t break if not real_price: real_price = sorted(map(float, data))[0] # 45-53行为抓取评论数据,该数据也是动态加载的 # comment_url = 
"https://rate.tmall.com/list_detail_rate.htm?itemId={}&sellerId=880734502Page=1".format( # goods_id) # comment_data = urllib.request.urlopen(comment_url).read().decode("gbk", "ignore") # print(comment_data) # temp_data = re.findall('("commentTime":.*?),"days"', comment_data) # temp_data = temp_data if temp_data else re.findall('("rateContent":.*?),"reply"', comment_data) # comment = "" # for data in temp_data: # comment += data.encode('utf-8') # comment = comment if comment else "暂无评论" except Exception as e: print('数据抽取失败!!!') print('商品名:', title) print('划线价格:', line_price) print('真实价格:', real_price) print('商品链接:', url) # print('部分评论内容:', comment) print('确认订单:',confirm_sell) print('30天内销售订单:', sell_total) print("描述:",desciption,des_val) if __name__ == '__main__': # text = """ # onSibRequestSuccess({"code":{"code":0,"message":"SUCCESS"},"data":{"viewer":{"admin":false,"bs":"","buyDomain":"buy.taobao.com","buyerId":"","cartDomain":"cart.taobao.com","cc":false,"countryCode":"CN","ctUser":false,"lgin":false,"serviceTab":"ITEM","tkn":"5759fa35b33bb"},"deliveryFee":{"data":{"areaId":440100,"areaName":"\u5E7F\u4E1C\u5E7F\u5DDE","sendCity":"\u6D59\u6C5F\u676D\u5DDE","serviceInfo":{"list":[{"id":"100_-4","info":"\u5FEB\u9012 \u514D\u8FD0\u8D39","isDefault":true,"markInfo":"7\u5929\u5185\u53D1\u8D27"}]}},"dataUrl":"\/\/detailskip.taobao.com\/json\/deliveryFee.htm","message":"ok","success":true},"upp":{"3790159085892":"192<\/strong>\u6DD8\u91D1\u5E01<\/em> \u62B5\uFFE51.92<\/em> \uFFE594.08<\/em>","3790159085893":"192<\/strong>\u6DD8\u91D1\u5E01<\/em> \u62B5\uFFE51.92<\/em> \uFFE594.08<\/em>","-2":"\u6DD8\u91D1\u5E01\u6700\u9AD8\u53EF\u62B5\u5546\u54C1\u4EF7 2%<\/em>","3790159085894":"192<\/strong>\u6DD8\u91D1\u5E01<\/em> \u62B5\uFFE51.92<\/em> \uFFE594.08<\/em>","3790159085895":"192<\/strong>\u6DD8\u91D1\u5E01<\/em> \u62B5\uFFE51.92<\/em> \uFFE594.08<\/em>","-5":"192<\/strong>\u6DD8\u91D1\u5E01<\/em> \u62B5\uFFE51.92<\/em> 
\uFFE594.08<\/em>","3790159085890":"192<\/strong>\u6DD8\u91D1\u5E01<\/em> \u62B5\uFFE51.92<\/em> \uFFE594.08<\/em>","3790159085891":"192<\/strong>\u6DD8\u91D1\u5E01<\/em> \u62B5\uFFE51.92<\/em> \uFFE594.08<\/em>"},"originalPrice":{";20509:28315;1627207:28320;":{"price":"99.00"},";20509:28314;1627207:28338;":{"price":"99.00"},";20509:28315;1627207:28341;":{"price":"99.00"},"def":{"price":"99.00"},";20509:28314;1627207:28341;":{"price":"99.00"},";20509:28315;1627207:28338;":{"price":"99.00"},";20509:28314;1627207:28320;":{"price":"99.00"}},"activity":{"bigpromotion":[{"bg":"\/\/img.alicdn.com\/tfs\/TB1yqnZr0knBKNjSZKPXXX6OFXa-480-40.png","img":["\/\/img.alicdn.com\/tfs\/TB1yqnZr0knBKNjSZKPXXX6OFXa-480-40.png","\/\/img.alicdn.com\/tfs\/TB18j50sk7mBKNjSZFyXXbydFXa-330-40.png"],"time":1536508800000,"type":"pre"},{"bg":"\/\/img.alicdn.com\/tfs\/TB172yRsiAnBKNjSZFvXXaTKXXa-480-40.png","img":["\/\/img.alicdn.com\/tfs\/TB172yRsiAnBKNjSZFvXXaTKXXa-480-40.png","\/\/img.alicdn.com\/tfs\/TB1NQGysXkoBKNjSZFkXXb4tFXa-330-40.png"],"time":1536836400000,"type":"start"},{"time":1537113599000,"type":"end"}]},"price":"99.00","tradeContract":{"pay":[{"icons":["\/\/img.alicdn.com\/tfs\/TB1KTHfQFXXXXbnXFXXXXXXXXXX-16-16.png","\/\/img.alicdn.com\/tfs\/TB1XeDvQFXXXXc5XXXXXXXXXXXX-32-32.png"],"title":"\u8682\u8681\u82B1\u5457","url":"\/\/payservice.alipay.com\/intro\/index.htm?c=hb"},{"icons":["\/\/img.alicdn.com\/tfs\/TB1w6O3QFXXXXX4aXXXXXXXXXXX-16-16.png","\/\/img.alicdn.com\/tfs\/TB1c7HAQFXXXXakXXXXXXXXXXXX-32-32.png"],"title":"\u4FE1\u7528\u5361\u652F\u4ED8","url":"\/\/payservice.alipay.com\/intro\/index.htm?c=xyk"},{"icons":["\/\/img.alicdn.com\/tfs\/TB1dvGWQFXXXXcFaXXXXXXXXXXX-16-16.png","\/\/img.alicdn.com\/tfs\/TB1FdDlQFXXXXa5XpXXXXXXXXXX-32-32.png"],"title":"\u96C6\u5206\u5B9D","url":"\/\/jf.alipay.com"}],"service":[{"desc":"\u6EE1\u8DB37\u5929\u65E0\u7406\u7531\u9000\u6362\u8D27\u7533\u8BF7\u7684\u524D\u63D0\u4E0B\uFF0C\u5305\u90AE\u5546\u54C1\u9700\u8981\u4E70\u5BB6\u627F\u62C5\u90
00\u8D27\u90AE\u8D39\uFF0C\u975E\u5305\u90AE\u5546\u54C1\u9700\u8981\u4E70\u5BB6\u627F\u62C5\u53D1\u8D27\u548C\u9000\u8D27\u90AE\u8D39\u3002","icons":["\/\/img.alicdn.com\/tps\/i1\/T1EQA5FpVgXXceOP_X-16-16.jpg",null],"linkType":1,"title":"7\u5929\u65E0\u7406\u7531"},{"icons":["\/\/img.alicdn.com\/tfs\/TB1CgB6QVXXXXbwXXXXXXXXXXXX-16-16.png",null],"linkType":2,"title":"\u65B0\u54C1","url":"\/\/service.taobao.com\/support\/knowledge-1138476.htm?spm=2013.1.1000372.17.3wGlNf"}]},"dynStock":{"holdQuantity":0,"sellableQuantity":911,"sku":{";20509:28315;1627207:28320;":{"holdQuantity":0,"oversold":false,"sellableQuantity":557,"stock":557},";20509:28314;1627207:28338;":{"holdQuantity":0,"oversold":false,"sellableQuantity":542,"stock":542},";20509:28315;1627207:28341;":{"holdQuantity":0,"oversold":false,"sellableQuantity":911,"stock":911},";20509:28314;1627207:28341;":{"holdQuantity":0,"oversold":false,"sellableQuantity":911,"stock":911},";20509:28315;1627207:28338;":{"holdQuantity":0,"oversold":false,"sellableQuantity":552,"stock":552},";20509:28314;1627207:28320;":{"holdQuantity":0,"oversold":false,"sellableQuantity":425,"stock":425}},"stock":911,"stockType":"channel"},"qrcodeImgUrl":"\/\/gcodex.alicdn.com\/qrcode.do?biz_code=xcode&short_name=a.ZRs8&cmd=createSub¶m=id:576081757954;scm:20140619.pc_detail.itemId.0","couponActivity":{"buyerHasMianxi":false,"coupon":{"couponList":[{"activityId":"d71180f1c5d14d18aa2dca099dc7c46c","sellerId":"2448721589","icon":["\/\/img.alicdn.com\/tps\/TB1xlnONpXXXXa9aXXXXXXXXXXX-80-16.png","\/\/img.alicdn.com\/tps\/TB1HZofNpXXXXacXpXXXXXXXXXX-155-32.png"],"type":"shopcoupon","title":"50\u5143\u5E97\u94FA\u4F18\u60E0\u5238\uFF0C\u6EE1499\u5143\u53EF\u7528","isGot":false},{"activityId":"3c89bb542b3d49cd9fe942f102961bd0","sellerId":"2448721589","icon":["\/\/img.alicdn.com\/tps\/TB1xlnONpXXXXa9aXXXXXXXXXXX-80-16.png","\/\/img.alicdn.com\/tps\/TB1HZofNpXXXXacXpXXXXXXXXXX-155-32.png"],"type":"shopcoupon","title":"30\u5143\u5E97\u94FA\u4F18\u60E0\u52
38\uFF0C\u6EE1299\u5143\u53EF\u7528","isGot":false}]},"shopProm":[{"icon":["\/\/img.alicdn.com\/tfs\/TB1ZrfnRFXXXXXgXXXXXXXXXXXX-57-16.png","\/\/img.alicdn.com\/tfs\/TB1qX5SRFXXXXciXFXXXXXXXXXX-116-32.png"],"type":"kdmnajian","title":"9\/13-9\/16\u6BCF\u6EE1199\u51CF10,\u4E0A\u4E0D\u5C01\u9876"},{"icon":["\/\/img.alicdn.com\/tfs\/TB1Kz8VQFXXXXa6XFXXXXXXXXXX-56-16.png","\/\/img.alicdn.com\/tfs\/TB1CDp8QFXXXXakXpXXXXXXXXXX-112-32.png"],"title":"\u6EE1299,\u4EAB\u90E8\u5206\u5730\u533A\u5305\u90AE"}],"showMianxiTips":false},"soldQuantity":{"confirmGoodsCount":"1452","soldTotalCount":"8863"},"promotion":{"promoData":{";20509:28315;1627207:28320;":[{"cart":true,"icon":"\/\/img.alicdn.com\/tfs\/TB1.KehskomBKNjSZFqXXXtqVXa-78-16.png","loginPromotion":false,"price":"96.00","start":false,"type":"\u6DD8\u62A2\u8D2D"}],";20509:28314;1627207:28338;":[{"cart":true,"icon":"\/\/img.alicdn.com\/tfs\/TB1.KehskomBKNjSZFqXXXtqVXa-78-16.png","loginPromotion":false,"price":"96.00","start":false,"type":"\u6DD8\u62A2\u8D2D"}],";20509:28315;1627207:28341;":[{"cart":true,"icon":"\/\/img.alicdn.com\/tfs\/TB1.KehskomBKNjSZFqXXXtqVXa-78-16.png","loginPromotion":false,"price":"96.00","start":false,"type":"\u6DD8\u62A2\u8D2D"}],"def":[{"cart":true,"icon":"\/\/img.alicdn.com\/tfs\/TB1.KehskomBKNjSZFqXXXtqVXa-78-16.png","loginPromotion":false,"price":"96.00","start":false,"type":"\u6DD8\u62A2\u8D2D"}],";20509:28314;1627207:28341;":[{"cart":true,"icon":"\/\/img.alicdn.com\/tfs\/TB1.KehskomBKNjSZFqXXXtqVXa-78-16.png","loginPromotion":false,"price":"96.00","start":false,"type":"\u6DD8\u62A2\u8D2D"}],";20509:28315;1627207:28338;":[{"cart":true,"icon":"\/\/img.alicdn.com\/tfs\/TB1.KehskomBKNjSZFqXXXtqVXa-78-16.png","loginPromotion":false,"price":"96.00","start":false,"type":"\u6DD8\u62A2\u8D2D"}],";20509:28314;1627207:28320;":[{"cart":true,"icon":"\/\/img.alicdn.com\/tfs\/TB1.KehskomBKNjSZFqXXXtqVXa-78-16.png","loginPromotion":false,"price":"96.00","start":false,"type":"\u6DD8\u62A2\u8D2D"}]},"saleDeta
ilMap":{"result":"tqg","status":"online"}}}}); # """ # des_url = re.findall(r'"confirmGoodsCount":"(.*?)"',text) # total = re.findall(r'"soldTotalCount":"(.*?)"', text) # print(des_url[0],total[0]) taobao = TaobaoProcessor() url = 'https://item.taobao.com/item.htm?spm=2013.1.w16867253-18554788179.1.7aa41c3dXWLKMm&id=556805373975' # taobao.process(url) # taobao.get_price(2448721589,573379814923) # taobao.process_html(url) taobao.taobao_spider(url)

你可能感兴趣的:(淘宝爬虫爬取商品详情和销量)