1688网数据抓取-js抓取篇

简单明了的接口分析,轻而易举的js数据抓取,你值得了解

最近写爬虫抓取了一下1688网的数据,特写此博客记录一下此次抓取的过程分析

进入1688首页后,输入关键字后进行商品的搜索(这里搜索的是手机)。显示页面如下
1688网数据抓取-js抓取篇_第1张图片
检查源码的时候却发现无该商品的信息,查询ajax接口时发现了该数据是如何传输过来的
1688网数据抓取-js抓取篇_第2张图片
对响应的数据进行分析发现,接口一次性返回二十个商品的数据,存放在offerList中,包含了商品图片信息、厂家信息、起批量及起批价格等信息。也就是说,商品详情页的大部分信息在该数据对象中都能够找到。但是当我们想抓取评论和成交详情时,却发现其中没有相关的信息。
1688网数据抓取-js抓取篇_第3张图片
于是我进一步分析js接口,终于找到了相关的信息,原来是将信息隐藏在了js中
先找到了成交的详情信息,如下
1688网数据抓取-js抓取篇_第4张图片
评论的详情页也抓取到了呢,就在rates.json文件中呢,如下
1688网数据抓取-js抓取篇_第5张图片
接下来就模拟js发送请求,获得评论的详情信息。
请求url如下:
https://rate.1688.com/remark/offerDetail/rates.json?callback=jQuery17205288144649342565_1596117170400&_input_charset=GBK&offerId=622311787712&page=1&pageSize=15&starLevel=&orderBy=&semanticId=&showStat=1&content=1&t=1596117179370&memberId=b2b-2208236930474a47ee&isNeedInitRate=false

经过自己试验发现,必要参数只有offerId,page,和memberId三个,这里的offerId容易获取,就是该商品url的最后一段数字 如下:
https://detail.1688.com/offer/622311787712.html?spm=a26352.13672862.offerlist.296.68526458UjZwS3
上面url中offer/后面的那段数字(622311787712)就是商品的offerId,page就是需要爬取的页数。剩下的memberId起初不知从何下手,但经过自己的努力查找,终于让我捕获到了它,就在主页面的json数据中,如下图所示
1688网数据抓取-js抓取篇_第6张图片
成交信息与评论同样获取即可,接下来便可以拼凑自己的url发起请求了,思路如下:
1.获取总页面数,自己拼凑url发送请求,获取商品数据
2.根据商品数据,向每一个商品发送js请求,获取评论信息(这里需要注意一下,除了url外请求头还需要referer参数,该参数为当前页面的url,在json中也可以获取到,即为detailUrl)
3.开启多线程进行爬取
4.解析出自己需要的商品信息,以及每个商品对应页数的评论
5.该方法为js接口抓取,实测有效,但需要很多高可用的IP进行切换爬取,不然会被防爬识别,进而需要登录验证

代码如下(代码并未爬取完全,需要爬取完全需要自己更改需要爬取的商品数据,代码中有做注释哦!)

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
from urllib.parse import urlencode
import requests
import json


def open_html(url, send_msgs_dict):
    """Run a keyword search in Chrome and return the browser's URL afterwards.

    Opens ``url``, types ``send_msgs_dict['send_key']`` into the element with
    id ``home-header-searchbox``, clicks the search button and reads
    ``current_url`` from the driver.

    Args:
        url: Page to open first.
        send_msgs_dict: Mapping that must contain the key ``'send_key'``
            (the text typed into the search box).

    Returns:
        The driver's ``current_url`` read right after the button click.

    Raises:
        selenium.common.exceptions.TimeoutException: If the search box or the
            button does not become available within 50 seconds.
    """
    send_key = send_msgs_dict['send_key']
    broswer = webdriver.Chrome()
    # The driver is released in ``finally`` so a timeout or a failed click no
    # longer leaves a Chrome/chromedriver process running.
    try:
        wait = WebDriverWait(broswer, 50)
        broswer.get(url)
        search_box = wait.until(
            EC.presence_of_element_located((By.ID, 'home-header-searchbox'))
        )
        search_box.clear()
        search_box.send_keys(send_key)
        button = wait.until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="app"]/div/div[3]/section/div[2]/div/div/div/form/fieldset/div/div[2]/button'))
        )
        button.click()
        result_url = broswer.current_url

        # NOTE(review): the pause happens after the URL is read, so it does
        # not influence the returned value; kept as in the original.
        time.sleep(2)
    finally:
        broswer.quit()

    return result_url

def ajax_get(r_url,send_msgs_dict,pages,ua):
    """Call the search-result AJAX endpoint and collect the raw responses.

    The query string of ``r_url`` is reused: every parameter found there is
    forwarded, merged with the paging/sorting parameters built here (the
    latter win on a name clash).

    Args:
        r_url: URL whose query string is reused. It must contain ``?`` and a
            ``keywords`` parameter.
        send_msgs_dict: Mapping providing ``'send_type'`` (sent as
            ``sortType``) and ``'send_order'`` (sent as ``descendOrder``).
        pages: Number of result pages to request. Each page is fetched with
            three requests (``startIndex`` 0, 20 and 40).
        ua: Value for the ``user-agent`` request header.

    Returns:
        list: The objects returned by ``requests.get``, in request order.

    Raises:
        IndexError: If ``r_url`` contains no ``?``.
        KeyError: If the query string has no ``keywords`` parameter.
    """
    print('正在获取页面接口......')
    r_text_obj_list = []
    # The query string is split by hand so values stay exactly as they appear
    # in r_url. partition() keeps everything after the first '=' and tolerates
    # parameters without a value; empty segments (e.g. a trailing '&') are
    # skipped.
    dic = {}
    for pair in r_url.split('?')[1].split('&'):
        if not pair:
            continue
        name, _, value = pair.partition('=')
        dic[name] = value
    # keywords is appended to the URL verbatim instead of going through
    # urlencode() -- presumably because it is already percent-encoded.
    # NOTE(review): the other forwarded values DO go through urlencode(), so
    # any '%' they contain is encoded again; confirm that is acceptable.
    keywords = dic.pop('keywords')
    url = "https://search.1688.com/service/marketOfferResultViewService?"
    parmeter = {
        'sortType' : send_msgs_dict['send_type'],  # (overall, turnover, price)
        'descendOrder' : send_msgs_dict['send_order'],  # (ascending, descending)
        'uniqfield' : 'userid',   # fixed
        'async' : 'true',   # fixed
        'asyncCount' : 20,   # fixed
        'beginPage' : 1,     # page number, first page by default
        'pageSize' : 60,    # fixed
        'startIndex' : ''  # (start offset)
        }
    headers = {
        'user-agent': ua
    }
    for page in range(1,pages+1):
        for i in range(3):
            if i == 0:
                # Added on the first request and never removed, so it is sent
                # with every request from then on.
                parmeter['offset'] = 7
            parmeter['beginPage'] = page
            parmeter['startIndex'] = i*20
            # Forwarded parameters first, then ours, so ours override.
            new_parmeter = {**dic, **parmeter}
            new_url = url + 'keywords=' + keywords +'&' + urlencode(new_parmeter)
            r = requests.get(new_url,headers=headers)
            r_text_obj_list.append(r)
        # Pause between pages.
        time.sleep(2)
    print('页面接口获取完毕!')
    return r_text_obj_list

def js_success_month_get_total_page(url,refer_url,ua):
    """Return ``(totalPage, totalCount)`` from a sale-records JSON response.

    Args:
        url: Sale-records endpoint URL to request.
        refer_url: Value sent as the ``referer`` header.
        ua: Value sent as the ``user-agent`` header.

    Returns:
        tuple: ``data.totalPage`` and ``data.totalCount`` of the decoded body.
    """
    response = requests.get(url, headers={'referer': refer_url, 'user-agent': ua})
    # Echo the requested URL for progress/debugging.
    print(url)
    paging = json.loads(response.text)['data']
    return paging['totalPage'], paging['totalCount']

def js_comment_get_total_page(url,refer_url,ua):
    """Return ``(totalPage, totalCount)`` from a comment (rates) JSON response.

    Args:
        url: Rates endpoint URL to request.
        refer_url: Value sent as the ``referer`` header.
        ua: Value sent as the ``user-agent`` header.

    Returns:
        tuple: ``data.pageParam.totalPage`` and ``data.pageParam.totalCount``
        of the decoded body.
    """
    body = requests.get(url, headers={'referer': refer_url, 'user-agent': ua}).text
    page_param = json.loads(body)['data']['pageParam']
    return page_param['totalPage'], page_param['totalCount']

def js_success_month_get(url,refer_url,ua):
    """Fetch one page of sale records and return them as a list of dicts.

    Args:
        url: Sale-records endpoint URL to request.
        refer_url: Value sent as the ``referer`` header.
        ua: Value sent as the ``user-agent`` header.

    Returns:
        list[dict]: One dict per entry of ``data.orderDisplayEntryList`` with
        the keys listed in ``fields`` below. The list is empty when the
        response carries no records; if an entry lacks a field, the entries
        collected before it are returned.
    """
    headers = {
    'referer': refer_url,
    'user-agent': ua
    }
    fields = (
        'buyerName',           # purchaser
        'quantity',            # purchased quantity
        'countBuyerQuantity',  # total purchased
        'unit',                # unit
        'buyerPayTime',        # purchase time
        'price',               # purchase price
        'specInfo',            # specification
    )
    res_list = []
    res = requests.get(url,headers=headers).text
    res_json = json.loads(res)
    try:
        for order_one in res_json['data']['orderDisplayEntryList']:
            res_list.append({field: order_one[field] for field in fields})
    except (KeyError, TypeError):
        # No (or malformed) monthly sales data. Only lookup failures are
        # treated as "empty"; a bare except here would also swallow
        # KeyboardInterrupt/SystemExit.
        pass
    return res_list

def js_comment_get(url,refer_url,ua):
    """Fetch one page of comments and return them as a list of dicts.

    Entries whose comment text is the system-generated default praise are
    skipped.

    Args:
        url: Rates endpoint URL to request.
        refer_url: Value sent as the ``referer`` header.
        ua: Value sent as the ``user-agent`` header.

    Returns:
        list[dict]: One dict per kept entry of ``data.rates`` with the keys
        ``remarkContent``, ``quantity``, ``countQuantity``, ``member``,
        ``specInfo`` and ``remarkTime``. The list is empty when the response
        carries no comments; if an entry is malformed, the entries collected
        before it are returned.
    """
    headers = {
    'referer': refer_url,
    'user-agent': ua
    }
    res_list = []
    res = requests.get(url,headers=headers).text
    res_json = json.loads(res)
    try:
        for rates_one in res_json['data']['rates']:
            first_item = rates_one['rateItem'][0]
            remark = first_item['remarkContent']
            # Skip the automatic default praise added when the buyer did not
            # leave a comment in time.
            if remark == '评价方未及时做出评价,系统默认好评!':
                continue
            res_list.append({
                'remarkContent': remark,                      # comment text
                'quantity': rates_one['quantity'],            # purchased quantity
                'countQuantity': rates_one['countQuantity'],  # total purchased
                'member': rates_one['member'],                # user name
                'specInfo': rates_one['specInfo'],            # specification
                'remarkTime': first_item['remarkTime'],       # comment time
            })
    except (KeyError, IndexError, TypeError):
        # No (or malformed) comments. Only lookup failures are treated as
        # "empty"; a bare except here would also swallow
        # KeyboardInterrupt/SystemExit.
        pass
    return res_list

def _detail_api_url(endpoint, offer_id, page, member_id):
    """Build the URL of a rate.1688.com offer-detail JSON endpoint."""
    # The separator must be a literal '&currentPage='; the text '¤tPage' is
    # what remains when '&curren' is decoded as an HTML entity.
    # NOTE(review): the article's sample rates.json URL uses 'page' rather
    # than 'currentPage' -- confirm which name the comment endpoint expects.
    return ('https://rate.1688.com/remark/offerDetail/' + endpoint
            + '?offerId=' + offer_id
            + '&currentPage=' + str(page)
            + '&memberId=' + member_id)


def parse_r_text(r_text_obj_list,data_all,ua):
    """Parse search responses and append one record per product to data_all.

    For every product in ``data.data.offerList`` of each response, the sale
    records and comments are downloaded page by page and combined with the
    product's listing fields.

    Args:
        r_text_obj_list: Objects with a ``text`` attribute holding the JSON
            body of a search response.
        data_all: List that receives the product dicts (mutated in place).
            Each dict has the keys ``title``, ``img_url``, ``repurchaseRate``,
            ``priceStart``, ``success_month``, ``success_month_info_list``,
            ``comment``, ``comment_info_list``, ``addr`` and ``brief``.
        ua: Value sent as the ``user-agent`` header of the detail requests.

    Returns:
        None.
    """
    for obj_no, r_text_obj in enumerate(r_text_obj_list, 1):
        print('-------------------------------------------')
        print('正在分析第  '+str(obj_no)+'  个ajax接口对象...')
        r_text_json = json.loads(r_text_obj.text)
        all_list = r_text_json['data']['data']['offerList']
        print('商品数据长度=',len(all_list))
        n = 0
        for item_no, one in enumerate(all_list, 1):
            data = {}
            current_num_str = str(item_no)
            member_id = one['company']['memberId']
            # The product page URL is sent as the referer of the detail calls.
            detail_url = one['information']['detailUrl']
            offerId = str(one['aliTalk']['infoId'])

            # Sale records: page 1 gives the totals, then every page is read.
            success_month_js_url = _detail_api_url('saleRecords.json', offerId, 1, member_id)
            success_month_total_pages,success_month_total_count = js_success_month_get_total_page(success_month_js_url,detail_url,ua)
            success_month_info_list = []
            print('正在获取第'+current_num_str+'个商品月销量页面数据......')
            for i in range(1,success_month_total_pages+1):
                new_js_url = _detail_api_url('saleRecords.json', offerId, i, member_id)
                success_month_info_list += js_success_month_get(new_js_url,detail_url,ua)
                time.sleep(0.5)
            print('获取第'+current_num_str+'个商品月销量页面数据完毕!')

            # Comments: same two-step procedure.
            comment_js_url = _detail_api_url('rates.json', offerId, 1, member_id)
            comment_total_pages,comment_total_count = js_comment_get_total_page(comment_js_url,detail_url,ua)
            comment_info_list = []
            print('正在获取第'+current_num_str+'个商品评论页面数据......')
            for i in range(1,comment_total_pages+1):
                new_js_url = _detail_api_url('rates.json', offerId, i, member_id)
                comment_info_list += js_comment_get(new_js_url,detail_url,ua)
                time.sleep(0.5)
            print('获取第'+current_num_str+'个商品评论页面数据完毕!')

            # title
            data['title'] = one['information']['simpleSubject']
            # image URL
            data['img_url'] = one['image']['imgUrlOf290x290']
            # repurchase rate
            data['repurchaseRate'] = one['information']['rePurchaseRate']
            # price -> minimum order quantity, one entry per price tier
            # (previously every tier was written to the literal key 'price'
            # and the mapping was never stored in the record)
            priceStart_dic = {}
            for one_priceStart in one['tradePrice']['offerPrice']['quantityPrices']:
                priceStart_dic[one_priceStart['valueString']] = one_priceStart['quantity']
            data['priceStart'] = priceStart_dic
            # monthly deal count
            data['success_month'] = success_month_total_count
            # monthly deal details
            data['success_month_info_list'] = success_month_info_list
            # comment count
            data['comment'] = comment_total_count
            # comment details
            data['comment_info_list'] = comment_info_list
            # address
            data['addr'] = one['company']['province']+one['company']['city']
            # specification
            data['brief'] = one['information']['brief']

            data_all.append(data)
            # n counts the products scraped from this response; comment out
            # the next three lines to scrape all of them.
            n += 1
            if n > 2:
                break
        print('第  '+str(obj_no)+'  个ajax接口对象分析完毕!')
    print('-------------------------------------------')
    print('全部对象分析完毕!')

def main():
    """Run the whole crawl for one hard-coded search and print the result.

    Steps: perform the search in a browser to obtain the result URL, request
    the AJAX search endpoint, parse the responses into product records and
    print them together with their count.
    """
    # Human-readable option labels mapped to the values the endpoint expects.
    sort_type_by_label = {
        '综合': 'normal',
        '成交额': 'va_rmdarkgmv30rt',
        '价格': 'price',
    }
    descend_by_label = {
        '升序': 'false',
        '降序': 'true',
    }

    user_agent = 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36'
    home_url = 'https://www.1688.com/'

    # Number of result pages to crawl; values above 50 are capped at 50.
    requested_pages = 1
    page_count = min(requested_pages, 50)

    search_options = {
        'send_key': '手机',
        'send_type': sort_type_by_label['综合'],
        'send_order': descend_by_label['升序'],
    }

    collected = []
    search_url = open_html(home_url, search_options)
    responses = ajax_get(search_url, search_options, page_count, user_agent)
    print('页面ajax接口对象个数------>', len(responses))
    parse_r_text(responses, collected, user_agent)
    print('正在打印全部数据......')
    print(collected)
    print('数据长度为=', len(collected))
    print('爬取完成!')


# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

此文章为个人探索得出,记录此文章仅为学习交流使用,希望能够帮助到学习中的你,一起努力一起加油吧!

你可能感兴趣的:(_spider)