今天爬取某网站时,服务器返回了状态码521。经过分析,发现这是一种基于JS混淆的反爬机制:需要先执行响应中返回的JS代码,计算出名为__jsl_clearance的cookie,再携带该cookie重新请求。以下是具体的破解代码:
import execjs
import re
import requests
import time
# Page to fetch; according to the surrounding write-up a plain GET is answered
# with status 521 and a JavaScript payload instead of the article.
url = 'http://www.mps.gov.cn/n2254536/n2254544/n2254552/n6636622/n6636639/c6641737/content.html'
# Send a desktop Chrome User-Agent with the request.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/73.0.3683.103 Safari/537.36'
    ),
}
req = requests.get(url, headers=headers, timeout=5)
# Dump the raw status code and body of this first response for inspection.
print(req.status_code, req.text)
直接请求返回的状态码是521,内容是一串看不懂的js字符串
接下来用Python(借助execjs)执行这段js代码,得到我们需要的__jsl_clearance。下面代码中的js_con是从第一次响应内容中提取出来的js代码:
# NOTE: `js_con` must already hold the JavaScript taken from the 521 response
# body, and `url` the page address; this fragment does not define `js_con`.

# Turn eval( into return( so that calling the outer JS function hands back the
# code it would otherwise have evaluated.
func_return = js_con.replace('eval(', 'return(')
content = execjs.compile(func_return)
fn = js_con.split('=')[0].split(' ')[1]  # name declared at the start of the JS
evaled_func = content.call(fn)
fn = evaled_func.split('=')[0].split(' ')[1]  # name declared by the generated code

# Literal fragments of the generated code that only work inside a browser.
redirect_and_cookie = "setTimeout('location.href=location.pathname+location.search.replace(/[\\?|&]captcha-challenge/,\\'\\')',1500);document.cookie="
listener_tail = "}catch(e){return false;}})()){document.addEventListener('DOMContentLoaded'," + fn + ",false)}else{document.attachEvent('onreadystatechange'," + fn + ")"
listener_full = "if((function(){try{return !!window.addEventListener;}catch(e){return false;}})()){document.addEventListener('DOMContentLoaded'," + fn + ",false)}else{document.attachEvent('onreadystatechange'," + fn + ")"

mode_func = (
    evaled_func
    # Return the cookie string instead of redirecting and assigning document.cookie.
    .replace(redirect_and_cookie, 'return')
    # Drop the DOMContentLoaded / onreadystatechange registration.
    .replace(';if((function(){try{return !!window.addEventListener;}', '')
    .replace(listener_tail, '')
    .replace(listener_full, '')
    # The generated code refers to `window`; give it an empty object.
    .replace("return'__jsl_clearance", "var window={};return '__jsl_clearance")
)

# The generated code builds a <div>, assigns markup to innerHTML and reads
# firstChild.href. Replace that whole statement with the page URL as a string.
# The previous version located the markup with str.split(""), which always
# raises "ValueError: empty separator"; matching the statement by pattern
# avoids depending on the exact markup.
fn_pattern = re.escape(fn)
dom_lookup = re.compile(
    "var " + fn_pattern + r"=document\.createElement\('div'\);"
    + fn_pattern + r"\.innerHTML='(?:\\.|[^'\\])*';"
    + fn_pattern + "=" + fn_pattern + r"\.firstChild\.href"
)
# A callable replacement keeps any backslashes in `url` from being
# interpreted as regex escapes.
mode_func = dom_lookup.sub(lambda _match: "var " + fn + "='" + url + "'", mode_func)

content = execjs.compile(mode_func)
cookies = content.call(fn)
# The first ';'-separated part of the returned cookie string is the name=value pair.
__jsl_clearance = cookies.split(';')[0]
print(__jsl_clearance)  # the resulting __jsl_clearance pair
获取到cookies 的__jsl_clearance
以下是完整代码
import execjs
import re
import requests
import time
# Page to fetch; used as the default target by the __main__ entry point.
url = 'http://www.mps.gov.cn/n2254536/n2254544/n2254552/n6636622/n6636639/c6641737/content.html'
# Request headers shared by every request in this script (desktop Chrome User-Agent).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/73.0.3683.103 Safari/537.36'
    ),
}
def get_521_content(url):
    """Request *url* once and, if it answers with status 521, solve the JS challenge.

    The cookies set by the 521 response are collected, the inline JavaScript
    of the response body is handed to ``fixed_fun`` and the resulting
    ``name=value`` pair is added to the cookie dict.

    Args:
        url: Address of the page to fetch.

    Returns:
        A dict of cookies including the computed clearance entry, or ``None``
        when the response is not a 521, carries no script, or the script
        could not be solved.
    """
    req = requests.get(url, headers=headers, timeout=5)
    print(req.status_code, req.text)
    if req.status_code != 521:
        return None

    cookies = dict(req.cookies.items())
    print(cookies)

    # Collect the contents of every inline <script> element. The previous
    # pattern was the empty string, which only ever matches empty strings, so
    # js_con was always '' and the challenge was never solved.
    js_con = ''.join(
        re.findall(r'<script[^>]*>(.*?)</script>', req.text, re.S | re.I)
    )
    if not js_con:
        return None

    clearance = fixed_fun(js_con, url)
    if not clearance:
        return None

    # Split on the first '=' only, so a value that itself contains '=' is kept whole.
    key, sep, value = clearance.partition('=')
    if not sep:
        return None
    cookies[key] = value
    return cookies
# Run the challenge JavaScript to obtain the "__jsl_clearance" name=value pair.
def _replace_dom_lookup(js_code, fn, url):
    """Replace the DOM-based href lookup in *js_code* with *url* as a string literal.

    The targeted statement has the form
    ``var FN=document.createElement('div');FN.innerHTML='...';FN=FN.firstChild.href``
    where the single-quoted literal may contain backslash-escaped characters.
    Every occurrence is replaced by ``var FN='<url>'``.
    """
    fn_pattern = re.escape(fn)
    dom_lookup = re.compile(
        "var " + fn_pattern + r"=document\.createElement\('div'\);"
        + fn_pattern + r"\.innerHTML='(?:\\.|[^'\\])*';"
        + fn_pattern + "=" + fn_pattern + r"\.firstChild\.href"
    )
    # A callable replacement keeps backslashes in `url` from being treated as
    # regex escapes.
    return dom_lookup.sub(lambda _match: "var " + fn + "='" + url + "'", js_code)


def fixed_fun(js_con, url):
    """Execute the challenge script and return its ``name=value`` cookie pair.

    Args:
        js_con: JavaScript taken from the first (521) response.
        url: Page address, substituted for the value the script would read
            from a DOM element's ``href``.

    Returns:
        The first ``;``-separated part of the cookie string produced by the
        script, or ``None`` if the rewritten script fails to run.

    Exceptions from the first compile/call and from extracting the function
    names are not caught here.
    """
    # Turn eval( into return( so that calling the outer JS function hands back
    # the code it would otherwise have evaluated.
    func_return = js_con.replace('eval(', 'return(')
    print('第一次替换eval==》return后: ', func_return)
    content = execjs.compile(func_return)
    fn = js_con.split('=')[0].split(' ')[1]  # name declared at the start of the JS
    evaled_func = content.call(fn)
    print('第一次执行js代码后: ', evaled_func)
    fn = evaled_func.split('=')[0].split(' ')[1]  # name declared by the generated code

    # Literal fragments of the generated code that only work inside a browser.
    redirect_and_cookie = "setTimeout('location.href=location.pathname+location.search.replace(/[\\?|&]captcha-challenge/,\\'\\')',1500);document.cookie="
    listener_tail = "}catch(e){return false;}})()){document.addEventListener('DOMContentLoaded'," + fn + ",false)}else{document.attachEvent('onreadystatechange'," + fn + ")"
    listener_full = "if((function(){try{return !!window.addEventListener;}catch(e){return false;}})()){document.addEventListener('DOMContentLoaded'," + fn + ",false)}else{document.attachEvent('onreadystatechange'," + fn + ")"

    mode_func = (
        evaled_func
        # Return the cookie string instead of redirecting and assigning document.cookie.
        .replace(redirect_and_cookie, 'return')
        # Drop the DOMContentLoaded / onreadystatechange registration.
        .replace(';if((function(){try{return !!window.addEventListener;}', '')
        .replace(listener_tail, '')
        .replace(listener_full, '')
        # The generated code refers to `window`; give it an empty object.
        .replace("return'__jsl_clearance", "var window={};return '__jsl_clearance")
    )
    # The previous version located the innerHTML markup with str.split(""),
    # which always raises "ValueError: empty separator"; match the whole
    # statement by pattern instead.
    mode_func = _replace_dom_lookup(mode_func, fn, url)
    print('第二次替换后的js代码:', mode_func)

    try:
        content = execjs.compile(mode_func)
        cookies = content.call(fn)
        __jsl_clearance = cookies.split(';')[0]
    except Exception as exc:  # best effort: report the failure and let the caller see None
        print('js执行错误:', mode_func)
        print(repr(exc))
        return None
    print(__jsl_clearance)
    return __jsl_clearance
# Second request: fetch the detail page again, this time sending the solved cookies.
def con_spider(cookies, url):
    """Fetch *url* with *cookies* attached and return the response on HTTP 200.

    Args:
        cookies: Cookies to send with the request.
        url: Address of the page to fetch.

    Returns:
        The response object, with its encoding set to UTF-8, when the status
        code is 200; otherwise ``None`` after printing the status code.
    """
    response = requests.get(url, headers=headers, cookies=cookies, timeout=5)
    status = response.status_code
    if status != 200:
        print('第二次爬取错误状态码:', status)
        return None

    # Decode the body as UTF-8 before printing it.
    response.encoding = 'utf-8'
    print(status)
    print(response.text)
    return response
if __name__ == "__main__":
    # Step 1: solve the challenge and collect the cookies it yields.
    cookies = get_521_content(url)
    # Step 2: repeat the request with those cookies attached.
    con_spider(cookies=cookies, url=url)
携带解密得到的cookies再次请求后,最终返回状态码200。