在这里我就不再一一介绍每个步骤的具体操作了,因为在爬取老版今日头条数据的时候都已经讲得非常清楚了,所以在这里我只会讲述重点部分是怎么实现的。如果想要看具体步骤,请先去看我写今日头条的那篇文章,里面非常详细地介绍了怎么找到加密的JS代码和API接口。
Python3爬取今日头条文章视频数据,完美解决as、cp、_signature的加密方法
WAP端跟APP端完全没啥区别,所以能用WAP端就用WAP端爬取数据,APP端涉及逆向APP比较复杂,所以推荐爬取WAP端的数据。
855262907
因为后面还要继续获取后续的数据,请求里肯定会有一些值被修改,所以我们需要知道哪些值被修改了,哪些值是固定的。因此对比每次请求的 `Form Data` 尤为重要,看下面的内容就知道问题所在。
下面Name为 `app` 的请求就是返回广告数据的链接地址。
数据取出来后先进行JSON格式化(可以百度搜索"JSON格式化"找到在线工具),这样方便我们进行对比。
第一次请求的Form Data:
{
"adReqData": {
"chid": 6,
"ipv4": "你自己的ip地址",
"adtype": 0,
"pf": "other",
"uin": "",
"qq_openid": "",
"ams_openid": "",
"netstatus": "unknown",
"slot": [{
"cur": 0, // 第一次为0,第二次为11,后续每次加10
"channel": "24h",
"loid": "1",
"orders_info": [], //后面的文章有讲解
"current_rot": "", //第二次为"1,2",第三次为"1,2,3,4",第四次为"1,2,3,4,5,6",后续以此类推
"article_id": "",
"refresh_type": 1, //第一次为1,后续均为2
"seq": "", //第二次为"5,10",第三次为"5,10,15,20",第四次为"5,10,15,20,25,30",后续以此类推
"seq_loid": "" //全部为1;因为每次返回两条数据,所以每次追加两个1:第二次为"1,1",第三次为"1,1,1,1"
}],
"appversion": "190125",
"plugin_news_cnt": 10,
"plugin_page_type": "",
"plugin_tbs_version": 0,
"plugin_text_ad": false,
"plugin_bucket_id": "",
"plugin_osv": "",
"wap_source": "default"
}
}
第二次请求的Form Data:
{
"adReqData": {
"chid": 6,
"ipv4": "你自己的ip地址",
"adtype": 0,
"pf": "other",
"uin": "",
"qq_openid": "",
"ams_openid": "",
"netstatus": "unknown",
"slot": [{
"cur": 11,
"channel": "24h",
"loid": "1",
"orders_info": ["272163938,15437425,3123255098,1000,505,110,2", "273302998,14922017,1731713627,1000,4109,110,2"],
"current_rot": "1,2",
"article_id": "",
"refresh_type": 2,
"seq": "5,10",
"seq_loid": "1,1"
}],
"appversion": "190125",
"plugin_news_cnt": 10,
"plugin_page_type": "",
"plugin_tbs_version": 0,
"plugin_text_ad": false,
"plugin_bucket_id": "",
"plugin_osv": "",
"wap_source": "default"
}
}
第三次请求的Form Data:
{
"adReqData": {
"chid": 6,
"ipv4": "你自己的ip地址",
"adtype": 0,
"pf": "other",
"uin": "",
"qq_openid": "",
"ams_openid": "",
"netstatus": "unknown",
"slot": [{
"cur": 21,
"channel": "24h",
"loid": "1",
"orders_info": ["272163938,15437425,3123255098,1000,505,110,2", "273302998,14922017,1731713627,1000,4109,110,2", "273124923,16877408,2641249431,1000,4109,110,2", "273311058,17099839,3340342053,1000,808,110,2"],
"current_rot": "1,2,3,4",
"article_id": "",
"refresh_type": 2,
"seq": "5,10,15,20",
"seq_loid": "1,1,1,1"
}],
"appversion": "190125",
"plugin_news_cnt": 10,
"plugin_page_type": "",
"plugin_tbs_version": 0,
"plugin_text_ad": false,
"plugin_bucket_id": "",
"plugin_osv": "",
"wap_source": "default"
}
}
我们请求了三次,发现广告数据链接的 `Form Data` 中有变化的只有 `slot` 里面的 `cur`、`orders_info`、`current_rot`、`refresh_type`、`seq`、`seq_loid` 这几个字段。
搜索 `orders_info`,得到构造请求 `Form Data` 的JS函数,直接开始读源码、打断点。
发现我们的构造请求Form Data已经真相大白了,简单吧。
这下我们只需要知道 `window.SSPAd` 是怎么生成的即可,直接搜索 `window.SSPAd`。
发现 `window.SSPAd` 是通过 `new s` 生成的,于是搜索 `var s =`;在这里面又看到了我们之前搜索 `orders_info` 时找到的内容:`orders_info` 就是在 `requestOrder` 里面的。
然后通过打断点发现 `window.orders_info` 是由 `getOrderInfo` 返回的,它用的是每次请求返回结果里面的数据。再观察其他参数,发现它们都是有规律的,只有 `orders_info` 没有规律,所以这一切都联系到一起了,简单吧。
// Build the comma-separated string describing one ad order.
// Field order: oid, advertiser_id, product_id, product_type, industry_id,
// order_source, act_type.
// Note: the parameter `e` (the order object) shadows the outer `e` whose
// prototype is being extended here.
e.prototype.getOrderInfo = function(e) {
return e.oid + "," + e.advertiser_id + "," + e.product_id + "," + e.product_type + "," + e.industry_id + "," + e.order_source + "," + e.act_type
}
import requests
import json
requests.packages.urllib3.disable_warnings()
'''
腾讯新闻广告数据爬取
'''
class news_qq():
    """Crawler for the ad feed of the Tencent News WAP "24h" channel.

    Constructing an instance immediately performs ``number`` consecutive
    requests against ``https://news.ssp.qq.com/app`` and prints each decoded
    JSON response. Between requests the pagination fields of the form data
    (``cur``, ``orders_info``, ``current_rot``, ``refresh_type``, ``seq`` and
    ``seq_loid``) are advanced from the previous response by ``set_params``.

    Attributes:
        session: ``requests.Session`` shared by all HTTP calls.
        cur: Cursor sent as ``slot[0].cur``; 0, then 11, then +10 per page.
        orders_info: Accumulated per-order strings from all earlier pages.
        current_rot_tmp: Running count of orders seen so far.
        current_rot_list: ``["1", "2", ...]`` up to ``current_rot_tmp``.
        current_rot: ``current_rot_list`` joined with commas.
        refresh_type: 1 for the first request, 2 afterwards.
        seq: Comma-joined ``seq`` values of every page received so far.
        seq_loid: Comma-joined ``1`` entries, one per order received so far.
        payload: The form data of the most recently built request.
    """

    # Seconds to wait on connect/read before a request is abandoned, so a
    # stalled connection cannot hang the whole crawl.
    REQUEST_TIMEOUT = 10

    USER_AGENT = 'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Mobile Safari/537.36'

    def __init__(self, number):
        """Run the crawl.

        Args:
            number: How many consecutive ad pages to request.
        """
        self.session = requests.Session()
        self.cur = 0
        self.orders_info = []
        self.current_rot_tmp = 0
        self.current_rot_list = []
        self.current_rot = ''
        self.refresh_type = 1
        self.seq = ''
        self.seq_loid = ''
        # The client IP is looked up once, on the first iteration, instead of
        # once per page; nothing is requested when ``number`` is 0.
        client_ip = None
        for _ in range(number):
            if client_ip is None:
                client_ip = self.get_client_ip()
            self.payload = self._build_payload(client_ip)
            js = self.app()  # decoded JSON of the ad response
            print(js)

    def _build_payload(self, ipv4):
        """Return the request form data for the current pagination state."""
        return {
            "adReqData": {
                "chid": 6,
                "ipv4": ipv4,
                "adtype": 0,
                "pf": "aphone",
                "uin": "",
                "qq_openid": "",
                "ams_openid": "",
                "netstatus": "unknown",
                "slot": [
                    {
                        "cur": self.cur,
                        "channel": "24h",
                        "loid": "1",
                        "orders_info": self.orders_info,
                        "current_rot": self.current_rot,
                        "article_id": "",
                        "refresh_type": self.refresh_type,
                        "seq": self.seq,
                        "seq_loid": self.seq_loid
                    }
                ],
                "appversion": "190125",
                "plugin_news_cnt": 10,
                "plugin_page_type": "",
                "plugin_tbs_version": 0,
                "plugin_text_ad": False,
                "plugin_bucket_id": "",
                "plugin_osv": "5.0.0",
                "wap_source": "default"
            }
        }

    def _headers(self, host, extra=None):
        """Return the browser-like request headers for ``host``.

        ``extra`` entries are inserted after ``Connection`` so the header
        order matches the two hand-written dicts this helper replaces.
        """
        # NOTE(review): 'br' is advertised here; confirm the runtime can
        # decode brotli responses, otherwise drop it from Accept-Encoding.
        headers = {
            'Accept': 'application/json, text/plain, */*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
        }
        if extra:
            headers.update(extra)
        headers.update({
            'Host': host,
            'Origin': 'https://xw.qq.com',
            'Referer': 'https://xw.qq.com/m/24h',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-site',
            'User-Agent': self.USER_AGENT,
        })
        return headers

    @staticmethod
    def _append_csv(existing, addition):
        """Append ``addition`` to a comma-separated string, skipping empties."""
        addition = str(addition)
        if not addition:
            return existing
        if not existing:
            return addition
        return existing + ',' + addition

    def get_client_ip(self):
        """Return this machine's public IP as the response body text."""
        url = 'https://ipv4.gdt.qq.com/get_client_ip'
        response = self.session.get(
            url,
            headers=self._headers('ipv4.gdt.qq.com'),
            timeout=self.REQUEST_TIMEOUT,
        )
        return response.text

    def set_params(self, js):
        """Advance the pagination state from one decoded ad response.

        Each entry appended to ``orders_info`` is
        ``oid,advertiser_id,product_id,product_type,industry_id,order_source,act_type``.

        Args:
            js: Decoded response; ``js['adList']`` is itself a JSON string
                holding the ``index`` and ``order`` lists.

        Raises:
            KeyError, IndexError, ValueError: If the response lacks the
                expected structure. State is left untouched in that case.
        """
        adlist = json.loads(js['adList'])
        stream = adlist['index'][0]['stream']
        order_source = stream['order_source'].split(',')

        # Build every new entry before mutating state, so that a malformed
        # response cannot leave the cursor advanced without matching orders.
        new_infos = []
        for position, order in enumerate(adlist['order']):
            fields = (
                order['oid'],
                order['advertiser_id'],
                order['product_id'],
                order['product_type'],
                order['industry_id'],
                order_source[position],
                order['act_type'],
            )
            # str() on every field: ids may arrive as numbers or strings.
            new_infos.append(','.join(str(field) for field in fields))
        page_seq = stream['seq']

        self.cur += 11 if self.cur == 0 else 10
        self.refresh_type = 2
        self.orders_info.extend(new_infos)
        for _ in new_infos:
            self.current_rot_tmp += 1
            self.current_rot_list.append(str(self.current_rot_tmp))
        self.current_rot = ','.join(self.current_rot_list)
        self.seq = self._append_csv(self.seq, page_seq)
        # One "1" per order actually returned (two on a normal page).
        self.seq_loid = self._append_csv(
            self.seq_loid, ','.join('1' for _ in new_infos)
        )

    def app(self):
        """POST the current payload, update state from the reply, return it.

        Returns:
            The decoded JSON response.
        """
        url = 'https://news.ssp.qq.com/app'
        headers = self._headers('news.ssp.qq.com', {
            'Cache-Control': 'no-cache',
            'Content-Type': 'application/x-www-form-urlencoded',
        })
        # NOTE(review): verify=False disables TLS certificate checks; kept
        # as in the original script, but confirm it is really required.
        response = self.session.post(
            url,
            headers=headers,
            data=json.dumps(self.payload),
            verify=False,
            timeout=self.REQUEST_TIMEOUT,
        )
        js = response.json()
        self.set_params(js)
        return js
if __name__ == '__main__':
    # Constructing the crawler runs it: request 1000 consecutive ad pages.
    news_qq(1000)