python 爬虫返回521

今天爬取网站返回状态码521,经过分析发现是JS混淆加密,以下是具体破解代码:

import execjs
import re
import requests
import time

# Page that answers with the 521 JS challenge discussed in this post.
url = 'http://www.mps.gov.cn/n2254536/n2254544/n2254552/n6636622/n6636639/c6641737/content.html'
# Browser-like User-Agent sent with the request.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/73.0.3683.103 Safari/537.36'
    ),
}
# Plain request without any cookies; status code and body are printed as-is.
req = requests.get(url=url, headers=headers, timeout=5)
print(req.status_code, req.text)

直接请求返回的状态码是521,内容是一串看不懂的js字符串

用 Python 执行 JS 代码,获取我们需要的 __jsl_clearance(下面的 js_con 是从 521 响应正文中提取出的 script 标签内容):

# SECURITY NOTE: js_con is JavaScript received from the remote server and is
# executed locally through execjs; only run this against hosts you trust.

# Swap eval( for return( so that calling the wrapper function hands back the
# generated JS source instead of evaluating it.
func_return = js_con.replace('eval(', 'return(')
content = execjs.compile(func_return)
fn = js_con.split('=')[0].split(' ')[1]  # name of the JS function
evaled_func = content.call(fn)
fn = evaled_func.split('=')[0].split(' ')[1]  # name of the generated function

# NOTE(review): the published listing had empty strings as the two delimiters
# below, which makes str.split raise "ValueError: empty separator". They are
# reconstructed here as the anchor markup the generated JS presumably assigns
# to innerHTML -- confirm against a live 521 response.
anchor_open = "<a href=\\'/\\'>"
anchor_close = "</a>"
aa = evaled_func.split(anchor_open)  # text inside the anchor tag
aa = aa[1].split(anchor_close)[0] if len(aa) >= 2 else ''

# Shared tail of the two event-listener registrations that are removed below.
listener = (
    "document.addEventListener('DOMContentLoaded'," + fn + ",false)}"
    + "else{document.attachEvent('onreadystatechange'," + fn + ")"
)

# Rewrite the generated JS so it runs without a browser. The pairs are applied
# in this exact order; each is (text to find, replacement).
replacements = [
    # Return the cookie string instead of assigning it to document.cookie.
    (
        "setTimeout('location.href=location.pathname+location.search.replace(/[\\?|&]captcha-challenge/,\\'\\')',1500);document.cookie=",
        'return',
    ),
    # Drop the DOM event-listener registration (two textual variants).
    (';if((function(){try{return !!window.addEventListener;}', ''),
    ("}catch(e){return false;}})()){" + listener, ''),
    (
        "if((function(){try{return !!window.addEventListener;}catch(e){return false;}})()){" + listener,
        '',
    ),
    # Provide a stub window object for the generated code.
    ("return'__jsl_clearance", "var window={};return '__jsl_clearance"),
    # Replace the <div>/<a> href lookup with the page URL.
    (
        "var " + fn + "=document.createElement('div');" + fn
        + ".innerHTML='" + anchor_open + aa + anchor_close + "';"
        + fn + "=" + fn + ".firstChild.href",
        "var " + fn + "='" + url + "'",
    ),
]
mode_func = evaled_func
for old, new in replacements:
    mode_func = mode_func.replace(old, new)

content = execjs.compile(mode_func)
cookies = content.call(fn)
# The returned cookie string is ';'-separated; the first field is the
# __jsl_clearance=<value> pair.
__jsl_clearance = cookies.split(';')[0]
print(__jsl_clearance)  # the resulting __jsl_clearance pair
        

获取到cookies 的__jsl_clearance

以下是完整代码

import execjs
import re
import requests
import time

# Detail page fetched by the complete example below.
url = (
    'http://www.mps.gov.cn'
    '/n2254536/n2254544/n2254552/n6636622/n6636639/c6641737/content.html'
)
# Browser-like User-Agent shared by both requests.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/73.0.3683.103 Safari/537.36'
    ),
}

def get_521_content(url):
    """Request *url* and, on a 521 response, build the cookies that pass the challenge.

    The response's own cookies are combined with the ``__jsl_clearance``
    pair computed by ``fixed_fun`` from the inline script of the 521 page.

    Args:
        url: Address of the page to fetch.

    Returns:
        A dict of cookie names to values, or ``None`` when the status code
        is not 521, no script is found, or the JS could not be evaluated.
    """
    req = requests.get(url, headers=headers, timeout=5)
    print(req.status_code, req.text)
    if req.status_code != 521:
        return None

    cookies = dict(req.cookies.items())
    print(cookies)

    # NOTE(review): the published listing had an empty pattern here, which
    # only ever matches empty strings (so js_con was always ''). The
    # <script> pattern is reconstructed -- confirm against a live response.
    # re.S lets the script body span several lines.
    js_con = ''.join(re.findall(r'<script>(.*?)</script>', req.text, re.S))
    if not js_con:
        return None

    __jsl_clearance = fixed_fun(js_con, url)
    if not __jsl_clearance:
        return None

    # Split on the first '=' only, so a value that itself contains '=' does
    # not break the unpacking.
    key, sep, value = __jsl_clearance.partition('=')
    if not sep:
        return None
    cookies[key] = value
    return cookies



# Run the challenge JS to obtain the __jsl_clearance key/value pair.
def fixed_fun(js_con, url):
    """Evaluate the 521 challenge script and return the ``__jsl_clearance`` pair.

    SECURITY NOTE: ``js_con`` is JavaScript received from the remote server
    and is executed locally through execjs; only use this against hosts you
    trust.

    Args:
        js_con: JS source taken from the first (521) response.
        url: Page URL, substituted for the ``firstChild.href`` lookup in the
            generated JS.

    Returns:
        The first ``;``-separated field of the cookie string produced by the
        JS (``__jsl_clearance=<value>``), or ``None`` if running the
        rewritten JS fails.
    """
    # Swap eval( for return( so that calling the wrapper function hands back
    # the generated JS source instead of evaluating it.
    func_return = js_con.replace('eval(', 'return(')
    print('第一次替换eval==》return后:  ', func_return)
    content = execjs.compile(func_return)
    fn = js_con.split('=')[0].split(' ')[1]
    evaled_func = content.call(fn)
    print('第一次执行js代码后: ', evaled_func)
    fn = evaled_func.split('=')[0].split(' ')[1]  # name of the generated function

    # NOTE(review): the published listing had empty strings as the two
    # delimiters below, which makes str.split raise "ValueError: empty
    # separator". They are reconstructed here as the anchor markup the
    # generated JS presumably assigns to innerHTML -- confirm against a live
    # 521 response.
    anchor_open = "<a href=\\'/\\'>"
    anchor_close = "</a>"
    aa = evaled_func.split(anchor_open)  # text inside the anchor tag
    aa = aa[1].split(anchor_close)[0] if len(aa) >= 2 else ''

    # Shared tail of the two event-listener registrations removed below.
    listener = (
        "document.addEventListener('DOMContentLoaded'," + fn + ",false)}"
        + "else{document.attachEvent('onreadystatechange'," + fn + ")"
    )

    # Rewrite the generated JS so it runs without a browser. The pairs are
    # applied in this exact order; each is (text to find, replacement).
    replacements = [
        # Return the cookie string instead of assigning it to document.cookie.
        (
            "setTimeout('location.href=location.pathname+location.search.replace(/[\\?|&]captcha-challenge/,\\'\\')',1500);document.cookie=",
            'return',
        ),
        # Drop the DOM event-listener registration (two textual variants).
        (';if((function(){try{return !!window.addEventListener;}', ''),
        ("}catch(e){return false;}})()){" + listener, ''),
        (
            "if((function(){try{return !!window.addEventListener;}catch(e){return false;}})()){" + listener,
            '',
        ),
        # Provide a stub window object for the generated code.
        ("return'__jsl_clearance", "var window={};return '__jsl_clearance"),
        # Replace the <div>/<a> href lookup with the page URL.
        (
            "var " + fn + "=document.createElement('div');" + fn
            + ".innerHTML='" + anchor_open + aa + anchor_close + "';"
            + fn + "=" + fn + ".firstChild.href",
            "var " + fn + "='" + url + "'",
        ),
    ]
    mode_func = evaled_func
    for old, new in replacements:
        mode_func = mode_func.replace(old, new)
    print('第二次替换后的js代码:', mode_func)

    try:
        content = execjs.compile(mode_func)
        cookies = content.call(fn)
        __jsl_clearance = cookies.split(';')[0]
        print(__jsl_clearance)
        return __jsl_clearance
    except Exception:
        # Best effort: report the JS that failed and let the caller continue.
        # Narrowed from a bare except so Ctrl-C / SystemExit still propagate.
        print('js执行错误:', mode_func)
        return None

# Second request for the detail page, carrying the computed cookies.
def con_spider(cookies, url):
    """Fetch *url* with *cookies* and return the response on HTTP 200.

    Args:
        cookies: Cookies to send with the request.
        url: Address of the page to fetch.

    Returns:
        The ``requests`` response when the status code is 200 (its encoding
        is set to UTF-8 and status/body are printed); otherwise ``None``
        after printing the status code.
    """
    response = requests.get(url, headers=headers, cookies=cookies, timeout=5)

    if response.status_code != 200:
        print('第二次爬取错误状态码:', response.status_code)
        return None

    # Decode the body as UTF-8 before printing it.
    response.encoding = 'utf-8'
    print(response.status_code)
    print(response.text)
    return response

if __name__ == "__main__":
    # Step 1: request the page once and derive the challenge cookies.
    cookies = get_521_content(url)
    # Step 2: request the same page again with those cookies.
    con_spider(cookies=cookies, url=url)

最终返回200

python 爬虫返回521_第1张图片

你可能感兴趣的:(爬虫)