cloudflare 5s后跳转的网页怎么爬取

很多网站使用了 Cloudflare 的反爬虫服务:第一次打开任何页面时,都要先在验证页等待约 5 秒,才会跳转到真正的页面。要通过这个验证页,需要执行页面里的 JS 计算出跳转参数(jschl_answer),带着这些参数完成跳转之后,才能拿到有效的 cookie。

不喜欢bb,直接上代码了。

# -*- coding: utf-8 -*-
# @Time    : 2019/8/21 20:48
# @Author  : meng_zhihao
# @Email   : [email protected]
# @File    : five_seconds_redirect.py

# Reference only: a captured curl command for the challenge-answer request
# (/cdn-cgi/l/chk_jschl) showing the query parameters s, jschl_vc, pass and
# jschl_answer. The "^" characters look like Windows cmd escaping.
# This bare string literal is an expression statement and has no effect.
'curl "https://steamdb.info/cdn-cgi/l/chk_jschl?s=a008fbe38534ed25da1fcfeee8818c71088155e2-1566391545-1800-AS6fBv4Md5hbFH5KuOu3rUO53K8YLifU6bByW039xKgE^%^2BB^%^2Fl3rJNXQjLqvAq^%^2FCNSWqfrbNCiBprNC4fTtXfmasWS20yWx2vBKGjya^%^2BTVhU8PsS8myK8ty1gUsqY7iuvZmw^%^3D^%^3D^&jschl_vc=bd263529a25342dfc2bf2d06ec6a32f9^&pass=1566391549.871-VHEZj8fTNM^&jschl_answer=16.6665600261" -H "authority: steamdb.info" -H "pragma: no-cache" -H "cache-control: no-cache" -H "upgrade-insecure-requests: 1" -H "user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36" -H "accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3" -H "referer: https://steamdb.info/" -H "accept-encoding: gzip, deflate, br" -H "accept-language: zh-CN,zh;q=0.9" -H "cookie: __cfduid=d1117a0185a634e26f5f076daff94a2c01566391545" --compressed'

import requests
from lxml import etree
import re
from js2py import eval_js
import time
import urllib
# Request headers carrying a desktop-Chrome User-Agent string.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'
    ),
}

def getXpath(xpath, content):
    """Evaluate *xpath* against the HTML in *content* and return a list.

    String results (e.g. from ``/text()`` or ``/@attr``) are returned as-is.
    Every other item is serialized with ``etree.tostring``, which yields
    ``bytes``; the original author's note says Chinese text then appears as
    ``&#xxxx;`` character references, while ``/text()`` results stay unicode.

    Expressions with a scalar result (``count(...)``, ``string(...)``,
    ``boolean(...)``) produce a one-element list holding that scalar.
    An empty list is returned when the parser yields no document.
    """
    tree = etree.HTML(content)
    if tree is None:
        # etree.HTML can return None for empty/blank input.
        return []

    results = tree.xpath(xpath)
    if not isinstance(results, list):
        # Scalar XPath results are not lists; normalize so callers always
        # receive a list.
        return [results]

    out = []
    for result in results:
        # lxml's "smart string" results subclass str/bytes, so isinstance
        # covers them as well as plain strings (smart_strings=False).
        if isinstance(result, (str, bytes)):
            out.append(result)
        else:
            out.append(etree.tostring(result))
    return out

# JavaScript that reproduces how the challenge script derives ``t`` (the bare
# host name) from the site root URL. ``%s`` receives a JS string literal.
# Raw string: the regex literal contains ``\/``, which is not a valid Python
# escape sequence in a normal string.
_HOST_NAME_JS = r"""
    t = %s;
    r = t.match(/https?:\/\//)[0];
    t = t.substr(r.length); t = t.substr(0,t.length-1);
"""


def _first_group(pattern, text, what):
    """Return group 1 of the first *pattern* match in *text*.

    Raises:
        ValueError: if *pattern* does not match; *what* names the missing
            piece in the error message.
    """
    found = re.search(pattern, text)
    if found is None:
        raise ValueError('challenge page is missing %s' % what)
    return found.group(1)


def get_js_return(content, base_url='https://steamdb.info/'):
    """Extract the challenge fields from *content* and compute the answer.

    Example values (taken from a captured request):
        jschl_vc = "bd263529a25342dfc2bf2d06ec6a32f9"
        passwd = "1566391549.871-VHEZj8fTNM"
        jschl_answer = "16.6665600261"

    Args:
        content: HTML of the interstitial challenge page.
        base_url: URL of the protected site; only its scheme and host are
            used, to rebuild the ``t`` variable the challenge script relies on.

    Returns:
        A ``(jschl_vc, passwd, jschl_answer)`` tuple.

    Raises:
        ValueError: if a form field or the expected script lines are absent.
    """
    import json
    from urllib.parse import urlsplit

    jschl_vc = _first_group(
        r'name="jschl_vc" value="(.*?)"', content, 'the jschl_vc field')
    passwd = _first_group(
        r'name="pass" value="(.*?)"', content, 'the pass field')
    script = _first_group(
        r'setTimeout\(function\(\)\{((?:.|\n)*?)f\.submit\(\)', content,
        'the setTimeout challenge script')

    # Keep only non-blank lines: the first one declares the challenge
    # variable, the second-to-last one assigns the answer to ``a.value``.
    lines = [line for line in script.split("\n") if line.split()]
    if len(lines) < 2:
        raise ValueError('challenge script has fewer lines than expected')
    first = lines[0]
    answer_line = lines[-2]

    answer_match = re.search(
        r"(.*?)a\.value\s+=\s+((.*?)121')", answer_line)
    if answer_match is None:
        raise ValueError('challenge script has no "a.value = ..." assignment')
    prelude = answer_match.group(1)
    answer_expr = answer_match.group(2)

    # The script reads the root URL of the site (scheme + host + "/").
    parts = urlsplit(base_url)
    site_root = '%s://%s/' % (parts.scheme, parts.netloc)

    js = "function f(){ %s %s %s return %s }" % (
        first, _HOST_NAME_JS % json.dumps(site_root), prelude, answer_expr)
    # NOTE(security): this executes JavaScript taken from a remote page
    # inside js2py; only point it at sites you are prepared to trust.
    jschl_answer = eval_js(js)()
    print(jschl_answer)
    return jschl_vc, passwd, jschl_answer


def get_cookie(base_url='https://steamdb.info/'):
    """Pass the JavaScript challenge of *base_url* and return its cookies.

    Fetches the challenge page, solves it with ``get_js_return``, waits,
    submits the answer to ``/cdn-cgi/l/chk_jschl`` and then requests the
    site again.

    Returns:
        dict: the session cookies after the challenge has been submitted.

    Raises:
        ValueError: if the challenge page lacks an expected field.
    """
    from urllib.parse import urlencode, urljoin

    # The context manager closes the session's connections on every path.
    with requests.session() as se:
        page = se.get(base_url, timeout=10, headers=HEADERS).content.decode('utf8')
        print(page)

        s = _first_group(r'name="s" value="(.*?)"', page, 'the s field')
        jschl_vc, passwd, jschl_answer = get_js_return(page, base_url)
        print(s, jschl_vc, passwd, jschl_answer)

        # Wait before answering; the article states the page enforces a
        # delay of about five seconds.
        time.sleep(4)

        # The browser submits these as a GET form, so encode every field
        # the way a form submission does.
        query = urlencode([
            ('s', s),
            ('jschl_vc', jschl_vc),
            ('pass', passwd),
            ('jschl_answer', jschl_answer),
        ])
        url = urljoin(base_url, '/cdn-cgi/l/chk_jschl') + '?' + query
        print(url)
        se.get(url, timeout=10, headers=HEADERS)
        time.sleep(2)

        new_page = se.get(base_url, timeout=10, headers=HEADERS).content.decode('utf8')
        print(new_page)  # the full page
        cookie = se.cookies.get_dict()  # the complete cookie set
        print(cookie)
        return cookie


if __name__ == '__main__':
    get_cookie()

你可能感兴趣的:(cloudflare 5s后跳转的网页怎么爬取)