最近写爬虫抓取了一下1688网的数据,特写此博客记录一下此次抓取的过程分析
进入1688首页后,输入关键字后进行商品的搜索(这里搜索的是手机)。显示页面如下
检查源码的时候却发现无该商品的信息,查询ajax接口时发现了该数据是如何传输过来的
对响应的数据进行分析发现数据一次性传递二十个商品的数据,在offerList中,包含了商品图片信息,厂家信息,起批量及其起批价格等信息,在此我们就已经获取了商品详情页的大部分信息,在该数据对象中都能够找到。但是我们想抓取评论和成交详情时,却发现没有相关的信息
于是我进一步分析js接口,终于找到了相关的信息,原来是将信息隐藏在了js中
先找到了成交的详情信息,如下
评论的详情页也抓取到了呢,就在rates.json文件中呢,如下
接下来就模拟js发送请求,获得评论的详情信息。
请求url如下:
https://rate.1688.com/remark/offerDetail/rates.json?callback=jQuery17205288144649342565_1596117170400&_input_charset=GBK&offerId=622311787712&page=1&pageSize=15&starLevel=&orderBy=&semanticId=&showStat=1&content=1&t=1596117179370&memberId=b2b-2208236930474a47ee&isNeedInitRate=false
经过自己试验发现,必要参数只有offerId,page,和memberId三个,这里的offerId容易获取,就是该商品url的最后一段数字 如下:
https://detail.1688.com/offer/622311787712.html?spm=a26352.13672862.offerlist.296.68526458UjZwS3
上面url中offer/后面的那段数字(622311787712)就是商品的offerId,page就是需要爬取的页数,剩下的就是这个memberId不知从何下手,但经过自己的努力查找,终于让我捕获到了这个memberId,就在主页面的json数据中,如下图所示
成交信息与评论同样获取即可,接下来便可以拼凑自己的url发起请求了,思路如下:
1.获取总页面数,自己拼凑url发送请求,获取商品数据
2.根据商品数据,向每一个商品发送js请求,获取评论信息(这里需要注意一下,除了url外请求头还需要referer参数,该参数为当前页面的url,在json中也可以获取到,即为detailUrl)
3.开启多线程进行爬取
4.解析出自己需要的商品信息和对应多少页码的评论
5.该方法为js接口抓取,实测有效,但需要很多高可用的IP进行切换爬取,不然会被防爬识别,进而需要登录验证
代码如下(代码默认并未爬取全部商品,如需爬取全部,请按代码中的注释自行修改需要爬取的商品数量,代码中有做注释哦!)
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
from urllib.parse import urlencode
import requests
import json
def open_html(url, send_msgs_dict):
    """Run a keyword search in a Chrome window and return the resulting URL.

    Opens ``url``, types ``send_msgs_dict['send_key']`` into the element with
    id ``home-header-searchbox``, clicks the search button and returns the
    browser's current URL.

    Args:
        url: Page to open (the caller passes the site home page).
        send_msgs_dict: Mapping that must contain the key ``'send_key'``.

    Returns:
        The value of ``current_url`` read right after the click.

    Raises:
        KeyError: If ``'send_key'`` is missing from ``send_msgs_dict``.
        Whatever ``WebDriverWait.until`` raises when an element does not show
        up within 50 seconds.
    """
    send_key = send_msgs_dict['send_key']
    broswer = webdriver.Chrome()
    try:
        wait = WebDriverWait(broswer, 50)
        broswer.get(url)
        # Named search_box rather than ``input`` so the builtin is not shadowed.
        search_box = wait.until(
            EC.presence_of_element_located((By.ID, 'home-header-searchbox'))
        )
        search_box.clear()
        search_box.send_keys(send_key)
        button = wait.until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="app"]/div/div[3]/section/div[2]/div/div/div/form/fieldset/div/div[2]/button'))
        )
        button.click()
        result_url = broswer.current_url
        # NOTE(review): the URL is read before this pause, as in the original;
        # confirm whether it was meant to be read after the page settles.
        time.sleep(2)
        return result_url
    finally:
        # Always end the driver session, even when a wait or click fails;
        # quit() shuts the whole session down instead of only closing the window.
        broswer.quit()
def ajax_get(r_url, send_msgs_dict, pages, ua):
    """Request the search-result AJAX endpoint for ``pages`` result pages.

    The query string of ``r_url`` is carried over to every request. For each
    page three requests are sent, with ``startIndex`` 0, 20 and 40, pausing
    two seconds after each one.

    Args:
        r_url: Search-result URL; must contain a query string with a
            ``keywords`` parameter.
        send_msgs_dict: Mapping providing ``'send_type'`` (sent as
            ``sortType``) and ``'send_order'`` (sent as ``descendOrder``).
        pages: Number of result pages to request, starting at page 1.
        ua: Value for the ``user-agent`` request header.

    Returns:
        List of the response objects returned by ``requests.get``, in request
        order.

    Raises:
        IndexError: If ``r_url`` has no ``?``.
        KeyError: If the query string has no ``keywords`` parameter.
    """
    print('正在获取页面接口......')
    r_text_obj_list = []

    # Split each pair on the first '=' only, so values containing '=' and
    # pairs without any '=' no longer break the parsing.
    passthrough = {}
    for pair in r_url.split('?', 1)[1].split('&'):
        if not pair:
            continue
        key, _, value = pair.partition('=')
        passthrough[key] = value
    keywords = passthrough.pop('keywords')

    url = "https://search.1688.com/service/marketOfferResultViewService?"
    parmeter = {
        'sortType': send_msgs_dict['send_type'],      # comprehensive / turnover / price
        'descendOrder': send_msgs_dict['send_order'],  # ascending / descending
        'uniqfield': 'userid',   # constant
        'async': 'true',         # constant
        'asyncCount': 20,        # constant
        'beginPage': 1,          # page number, overwritten per request
        'pageSize': 60,          # constant
        'startIndex': '',        # start index, overwritten per request
    }
    headers = {
        'user-agent': ua
    }
    for page in range(1, pages + 1):
        for i in range(3):
            if i == 0:
                # NOTE(review): once set, 'offset' stays in the dict for all
                # later requests as well — confirm that this is intended.
                parmeter['offset'] = 7
            parmeter['beginPage'] = page
            parmeter['startIndex'] = i * 20
            # Values copied from r_url are already percent-encoded, so they
            # are appended verbatim; running them through urlencode() again
            # would turn every '%' into '%25'. Keys also present in parmeter
            # are dropped here because parmeter takes precedence.
            raw_pairs = [k + '=' + v for k, v in passthrough.items() if k not in parmeter]
            new_url = url + '&'.join(['keywords=' + keywords] + raw_pairs + [urlencode(parmeter)])
            r = requests.get(new_url, headers=headers)
            r_text_obj_list.append(r)
            time.sleep(2)
    print('页面接口获取完毕!')
    return r_text_obj_list
def js_success_month_get_total_page(url, refer_url, ua):
    """Return ``(totalPage, totalCount)`` from the sale-records JSON at ``url``.

    Args:
        url: Endpoint to request.
        refer_url: Value for the ``referer`` request header.
        ua: Value for the ``user-agent`` request header.

    Returns:
        Tuple of the ``totalPage`` and ``totalCount`` entries found under the
        ``data`` object of the response.
    """
    response = requests.get(
        url,
        headers={'referer': refer_url, 'user-agent': ua},
    )
    print(url)
    paging = json.loads(response.text)['data']
    return paging['totalPage'], paging['totalCount']
def js_comment_get_total_page(url, refer_url, ua):
    """Return ``(totalPage, totalCount)`` from the comments JSON at ``url``.

    Args:
        url: Endpoint to request.
        refer_url: Value for the ``referer`` request header.
        ua: Value for the ``user-agent`` request header.

    Returns:
        Tuple of the ``totalPage`` and ``totalCount`` entries found under
        ``data.pageParam`` in the response.
    """
    request_headers = {'referer': refer_url, 'user-agent': ua}
    body = requests.get(url, headers=request_headers).text
    paging = json.loads(body)['data']['pageParam']
    return paging['totalPage'], paging['totalCount']
def js_success_month_get(url, refer_url, ua):
    """Fetch one page of sale records and return them as a list of dicts.

    Args:
        url: Sale-records endpoint for one page.
        refer_url: Value for the ``referer`` request header.
        ua: Value for the ``user-agent`` request header.

    Returns:
        One dict per entry of ``data.orderDisplayEntryList``, holding the
        fields listed in ``wanted_fields``. If the list, or a field of an
        entry, is missing, the records collected up to that point are
        returned (an empty list when there are no sale records).
    """
    headers = {
        'referer': refer_url,
        'user-agent': ua
    }
    # buyer name, quantity bought, buyer's total quantity, unit,
    # payment time, price, specification
    wanted_fields = (
        'buyerName',
        'quantity',
        'countBuyerQuantity',
        'unit',
        'buyerPayTime',
        'price',
        'specInfo',
    )
    res_list = []
    res = requests.get(url, headers=headers).text
    res_json = json.loads(res)
    try:
        for order_one in res_json['data']['orderDisplayEntryList']:
            res_list.append({field: order_one[field] for field in wanted_fields})
    except (KeyError, TypeError):
        # No (or incomplete) monthly sales data. Only lookup failures are
        # absorbed, so KeyboardInterrupt and real bugs are no longer hidden
        # by a bare except.
        pass
    return res_list
def js_comment_get(url, refer_url, ua):
    """Fetch one page of comments and return them as a list of dicts.

    Entries whose comment text equals the system default-praise message are
    skipped.

    Args:
        url: Comments endpoint for one page.
        refer_url: Value for the ``referer`` request header.
        ua: Value for the ``user-agent`` request header.

    Returns:
        One dict per kept entry of ``data.rates`` with the keys
        ``remarkContent``, ``quantity``, ``countQuantity``, ``member``,
        ``specInfo`` and ``remarkTime``. If the list, or a field of an entry,
        is missing, the comments collected up to that point are returned (an
        empty list when there are no comments).
    """
    headers = {
        'referer': refer_url,
        'user-agent': ua
    }
    res_list = []
    res = requests.get(url, headers=headers).text
    res_json = json.loads(res)
    try:
        for rates_one in res_json['data']['rates']:
            first_item = rates_one['rateItem'][0]
            content = first_item['remarkContent']
            # Skip the text the system inserts when the buyer did not review.
            # NOTE(review): the comparison is exact — confirm the punctuation
            # matches what the API really returns.
            if content == '评价方未及时做出评价,系统默认好评!':
                continue
            res_list.append({
                'remarkContent': content,                      # comment text
                'quantity': rates_one['quantity'],             # quantity bought
                'countQuantity': rates_one['countQuantity'],   # total quantity
                'member': rates_one['member'],                 # user name
                'specInfo': rates_one['specInfo'],             # specification
                'remarkTime': first_item['remarkTime'],        # comment time
            })
    except (KeyError, IndexError, TypeError):
        # No (or incomplete) comment data. Only lookup failures are absorbed,
        # so KeyboardInterrupt and real bugs are no longer hidden by a bare
        # except.
        pass
    return res_list
def parse_r_text(r_text_obj_list,data_all,ua):
for r_text_obj in r_text_obj_list:
print('-------------------------------------------')
print('正在分析第 '+str(r_text_obj_list.index(r_text_obj)+1)+' 个ajax接口对象...')
r_text_json = json.loads(r_text_obj.text)
all_list = r_text_json['data']['data']['offerList']
print('商品数据长度=',len(all_list))
n = 0
for one in all_list:
data = {}
current_num_str = str(all_list.index(one)+1)
member_id = one['company']['memberId']
detail_url = one['information']['detailUrl']
offerId = str(one['aliTalk']['infoId'])
success_month_js_url = 'https://rate.1688.com/remark/offerDetail/saleRecords.json?offerId='+ offerId +'¤tPage=1&memberId=' + member_id
success_month_total_pages,success_month_total_count = js_success_month_get_total_page(success_month_js_url,detail_url,ua)
success_month_info_list = []
print('正在获取第'+current_num_str+'个商品月销量页面数据......')
for i in range(1,success_month_total_pages+1):
new_js_url = 'https://rate.1688.com/remark/offerDetail/saleRecords.json?offerId='+ offerId +'¤tPage='+str(i)+'&memberId=' + member_id
success_month_info_list += js_success_month_get(new_js_url,detail_url,ua)
time.sleep(0.5)
print('获取第'+current_num_str+'个商品月销量页面数据完毕!')
comment_js_url = 'https://rate.1688.com/remark/offerDetail/rates.json?offerId='+ offerId +'¤tPage=1&memberId=' + member_id
comment_total_pages,comment_total_count = js_comment_get_total_page(comment_js_url,detail_url,ua)
comment_info_list = []
print('正在获取第'+current_num_str+'个商品评论页面数据......')
for i in range(1,comment_total_pages+1):
new_js_url = 'https://rate.1688.com/remark/offerDetail/rates.json?offerId='+ offerId +'¤tPage='+str(i)+'&memberId=' + member_id
comment_info_list += js_comment_get(new_js_url,detail_url,ua)
time.sleep(0.5)
print('获取第'+current_num_str+'个商品评论页面数据完毕!')
#名称
data['title'] = one['information']['simpleSubject']
#图片地址
data['img_url'] = one['image']['imgUrlOf290x290']
#回购率
data['repurchaseRate'] = one['information']['rePurchaseRate']
#价格-起批量
priceStart_list = one['tradePrice']['offerPrice']['quantityPrices']
priceStart_dic = {}
for one_priceStart in priceStart_list:
price = one_priceStart['valueString']
numStart = one_priceStart['quantity']
priceStart_dic['price'] = numStart
#月成交量
data['success_month'] = success_month_total_count
#月成交详情
data['success_month_info_list'] = success_month_info_list
#评论数
data['comment'] = comment_total_count
#评论详情
data['comment_info_list'] = comment_info_list
#地址
data['addr'] = one['company']['province']+one['company']['city']
#规格
data['brief'] = one['information']['brief']
data_all.append(data)
#用n变量记录每页爬取的商品数量,爬取全部可将下列三行代码进行注释
n += 1
if n > 2:
break
print('第 '+str(r_text_obj_list.index(r_text_obj)+1)+' 个ajax接口对象分析完毕!')
print('-------------------------------------------')
print('全部对象分析完毕!')
def main():
    """Search for a keyword, fetch the result pages, and print the parsed data.

    The search keyword, sort type, sort order and page count are set in the
    body; the page count is capped at 50.
    """
    ua = 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36'
    url = 'https://www.1688.com/'

    # Search settings, written as the labels that the lookup tables below
    # translate into request values.
    send_key = '手机'
    send_type = '综合'
    send_order = '升序'

    # Number of result pages to crawl, never more than 50.
    pages = min(1, 50)

    sortTypeDict = {
        '综合': 'normal',
        '成交额': 'va_rmdarkgmv30rt',
        '价格': 'price',
    }
    descendOrderDict = {
        '升序': 'false',
        '降序': 'true',
    }
    send_msgs_dict = dict(
        send_key=send_key,
        send_type=sortTypeDict[send_type],
        send_order=descendOrderDict[send_order],
    )

    r_url = open_html(url, send_msgs_dict)
    r_text_obj_list = ajax_get(r_url, send_msgs_dict, pages, ua)
    print('页面ajax接口对象个数------>',len(r_text_obj_list))

    data_all = []
    parse_r_text(r_text_obj_list, data_all, ua)
    print('正在打印全部数据......')
    print(data_all)
    print('数据长度为=',len(data_all))
    print('爬取完成!')


if __name__ == '__main__':
    main()
此文章为个人探索得出,记录此文章仅为学习交流使用,希望能够帮助到学习中的你,一起努力一起加油吧!