When writing a crawler, it is easy to assume that as long as every request goes out through a different proxy IP, and the headers are written to look exactly like a browser's, the website will never notice you.

But in practice there is something else, the browser fingerprint, and it does not change when you swap IPs or User-Agents. Even if you skip browser automation entirely and send requests directly with Golang or Python, each of those stacks has a fingerprint of its own, and that fingerprint is identical on every request. As soon as a site notices that a client with one particular fingerprint keeps requesting it at high frequency, it can block you.
The detection algorithm is called JA3. It concatenates fields from the TLS ClientHello (TLS version, offered cipher suites, extensions, elliptic curves, and EC point formats) into a string and takes its MD5 hash; because an HTTP library offers the same values on every handshake, the hash never changes.
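As a rough illustration (not the detection code any real site runs), here is how a JA3 string and hash are assembled; the field values below are made-up examples, not captured from a real handshake:

```python
import hashlib

# JA3 string layout: TLSVersion,Ciphers,Extensions,EllipticCurves,ECPointFormats
# Values within a field are joined with '-', fields with ','.
tls_version = "771"                      # TLS 1.2 as it appears in the ClientHello
ciphers = [4865, 4866, 4867, 49195]      # cipher suite IDs, in offer order
extensions = [0, 23, 65281, 10, 11]      # extension IDs, in offer order
curves = [29, 23, 24]                    # supported elliptic curves
point_formats = [0]                      # EC point formats

ja3_string = ",".join([
    tls_version,
    "-".join(map(str, ciphers)),
    "-".join(map(str, extensions)),
    "-".join(map(str, curves)),
    "-".join(map(str, point_formats)),
])
ja3_hash = hashlib.md5(ja3_string.encode()).hexdigest()
print(ja3_string)  # 771,4865-4866-4867-49195,0-23-65281-10-11,29-23-24,0
print(ja3_hash)    # stable as long as the ClientHello is stable
```

This is why merely reordering or changing the offered cipher suites, as the first examples below do, is already enough to change the hash.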
The countermeasures below are ordered by the strictness of the TLS check they can defeat, from weakest to strongest:
First, the simplest option: give requests a fixed, non-default cipher list through a custom HTTPAdapter:

```python
# -*- coding: utf-8 -*-
# @Time : 2022/1/14 9:35
# @Author : Cocktail_py
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.ssl_ import create_urllib3_context

# A fixed, non-default cipher list: offering different cipher suites
# changes the ClientHello and therefore the JA3 hash.
CIPHERS = (
    'ECDH+AESGCM:DH+AESGCM:ECDH+AES256:DH+AES256:RSA+3DES:!aNULL:'
    '!eNULL:!MD5'
)

headers = {
    'accept': 'application/json, text/javascript, */*; q=0.01',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'no-cache',
    'pragma': 'no-cache',
    'user-agent': 'yuanrenxue.project',
    'x-requested-with': 'XMLHttpRequest',
}


class DESAdapter(HTTPAdapter):
    def init_poolmanager(self, *args, **kwargs):
        context = create_urllib3_context(ciphers=CIPHERS)
        kwargs['ssl_context'] = context
        return super(DESAdapter, self).init_poolmanager(*args, **kwargs)

    def proxy_manager_for(self, *args, **kwargs):
        context = create_urllib3_context(ciphers=CIPHERS)
        kwargs['ssl_context'] = context
        return super(DESAdapter, self).proxy_manager_for(*args, **kwargs)


# Must be an https URL: the cipher list only affects the TLS handshake,
# so mounting the adapter on a plain http URL would change nothing.
url = 'https://httpbin.org/get'
s = requests.Session()
s.mount('https://httpbin.org', DESAdapter())
rep = s.get(url, headers=headers)
print(rep.text)
```
An even cruder global variant: append to urllib3's default cipher string. Note that `DEFAULT_CIPHERS` was removed in urllib3 2.x, so this only works with urllib3 1.x:

```python
# -*- coding: utf-8 -*-
# @Time : 2022/1/14 9:35
# @Author : Cocktail_py
import requests
import urllib3

# The leading ':' matters: without it the appended string fuses with the
# last cipher in the default list. urllib3 1.x only; DEFAULT_CIPHERS no
# longer exists in urllib3 2.x.
urllib3.util.ssl_.DEFAULT_CIPHERS += ':HIGH:!DH:!aNULL'

url = 'https://httpbin.org/get'
headers = {
    'accept': 'application/json, text/javascript, */*; q=0.01',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'no-cache',
    'pragma': 'no-cache',
    'user-agent': 'yuanrenxue.project',
    'x-requested-with': 'XMLHttpRequest',
}
r = requests.get(url=url, headers=headers)
print(r.text)
```
For sites with stricter checks, curl_cffi impersonates a real browser's complete TLS fingerprint rather than merely perturbing the cipher list:

```python
# -*- coding: utf-8 -*-
# @Time : 2023/3/30 9:35
# @Author : Cocktail_py
from curl_cffi import requests

url = "https://match.yuanrenxue.cn/api/match/19?page=1"
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}
# The impersonate parameter selects which browser's TLS fingerprint to mimic
r = requests.get(url, headers=headers, impersonate="chrome101")
print(r.text)
```
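Combining this with proxy rotation ties back to the opening point: rotate the exit IP and present a browser-like fingerprint at the same time. A minimal sketch, assuming you have a list of working proxy URLs (the proxy addresses below are placeholders):

```python
import random
from curl_cffi import requests

# Placeholder proxies; substitute real ones.
PROXIES = [
    "http://user:password@proxy1:8080",
    "http://user:password@proxy2:8080",
]

session = requests.Session()
for page in range(1, 4):
    proxy = random.choice(PROXIES)
    r = session.get(
        "https://httpbin.org/get",
        impersonate="chrome101",                  # stable browser-like JA3
        proxies={"http": proxy, "https": proxy},  # rotating exit IP
    )
    print(r.status_code)
```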
The requests, aiohttp, and scrapy variants below are adapted from: https://www.cnblogs.com/Eeyhan/p/15662849.html
## requests
The same HTTPAdapter idea, but shuffling the cipher order so that every new adapter instance yields a different JA3:

```python
import random

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.ssl_ import create_urllib3_context

ORIGIN_CIPHERS = ('DH+3DES:RSA+3DES:ECDH+AES256:DH+AESGCM:DH+AES256:DH+AES:ECDH+AES128:'
                  'DH+HIGH:RSA+AESGCM:ECDH+3DES:RSA+AES:RSA+HIGH:ECDH+AESGCM:ECDH+HIGH')


class DESAdapter(HTTPAdapter):
    def __init__(self, *args, **kwargs):
        # Shuffle the cipher order once per adapter instance, so a new
        # instance means a new JA3 hash.
        ciphers = ORIGIN_CIPHERS.split(':')
        random.shuffle(ciphers)
        self.CIPHERS = ':'.join(ciphers) + ':!aNULL:!eNULL:!MD5'
        super().__init__(*args, **kwargs)

    def init_poolmanager(self, *args, **kwargs):
        context = create_urllib3_context(ciphers=self.CIPHERS)
        kwargs['ssl_context'] = context
        return super(DESAdapter, self).init_poolmanager(*args, **kwargs)

    def proxy_manager_for(self, *args, **kwargs):
        context = create_urllib3_context(ciphers=self.CIPHERS)
        kwargs['ssl_context'] = context
        return super(DESAdapter, self).proxy_manager_for(*args, **kwargs)


headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67'}
session = requests.Session()
session.headers.update(headers)

for _ in range(5):
    # Mount on 'https://ja3er.com' to bind only to that site,
    # or on 'https://' to bind to every https request.
    # A fresh adapter each iteration re-shuffles the ciphers, so every
    # request shows a different fingerprint.
    session.mount('https://', DESAdapter())
    result = session.get('https://ja3er.com/json').json()
    print(result)
```
## aiohttp

The aiohttp equivalent uses a factory that returns a freshly shuffled SSLContext for every request:

```python
import asyncio
import random
import ssl

import aiohttp

ORIGIN_CIPHERS = ('RSA+3DES:RSA+AES:RSA+AESGCM:ECDH+AESGCM:DH+AESGCM:ECDH+AES256:DH+AES256:ECDH+AES128:ECDH+HIGH:'
                  'DH+HIGH:DH+3DES:RSA+HIGH:DH+AES:ECDH+3DES')


class SSLFactory:
    """Builds an SSLContext with a freshly shuffled cipher order on each call."""

    def __init__(self):
        self.ciphers = ORIGIN_CIPHERS.split(":")

    def __call__(self) -> ssl.SSLContext:
        random.shuffle(self.ciphers)
        ciphers = ":".join(self.ciphers) + ":!aNULL:!eNULL:!MD5"
        context = ssl.create_default_context()  # client-side context
        context.set_ciphers(ciphers)
        return context


sslgen = SSLFactory()
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}


async def main():
    async with aiohttp.ClientSession() as session:
        for _ in range(5):
            # A new context, and thus a new JA3, for every request
            async with session.get("https://ja3er.com/json", headers=headers, ssl=sslgen()) as resp:
                data = await resp.text()
                print(data)


asyncio.run(main())
```
## scrapy

Scrapy reads the cipher list from the DOWNLOADER_CLIENT_TLS_CIPHERS setting. Two ways to change it:

1. Modify the setting directly in the configuration file:

```python
# settings.py: make sure the string ends with :!aNULL:!eNULL:!MD5
DOWNLOADER_CLIENT_TLS_CIPHERS = 'RSA+AES:RSA+3DES:RSA+AESGCM:ECDH+AESGCM:ECDH+AES256:DH+AES256:ECDH+AES128:ECDH+HIGH:DH+HIGH:DH+3DES:RSA+HIGH:DH+AES:ECDH+3DES:!aNULL:!eNULL:!MD5'
```

2. Randomize the cipher order automatically at startup:

```python
import random

import scrapy


# Helper that returns a freshly shuffled cipher string
def shuffle_ciphers():
    ciphers = 'RSA+3DES:RSA+AES:RSA+AESGCM:ECDH+AESGCM:DH+AESGCM:ECDH+AES256' \
              ':DH+AES256:ECDH+AES128:ECDH+HIGH:DH+HIGH:DH+3DES:RSA+HIGH:DH+AES:ECDH+3DES'.split(":")
    random.shuffle(ciphers)
    return ":".join(ciphers) + ":!aNULL:!eNULL:!MD5"


# Override the setting inside the spider
class PaSpider(scrapy.Spider):
    name = 'pa'
    start_urls = ['https://ja3er.com/json']
    custom_settings = {
        # Evaluated once when the class is defined, i.e. once per process
        'DOWNLOADER_CLIENT_TLS_CIPHERS': shuffle_ciphers(),
    }

    def start_requests(self):
        ***
```
## Reference: https://assistest.cn/2022/01/03/ja3/
tls_client takes the impersonation approach as well, and can additionally randomize the TLS extension order:

```python
# -*- coding: utf-8 -*-
# @Time : 2023/5/18 9:52
# @Author : Cocktail_py
import random

import tls_client

client_identifier_list = [
    'chrome_103',
    # 'chrome_104',
    # 'chrome_105',
    # 'chrome_106',
    # 'chrome_107',
    # 'chrome_108',
    # 'chrome_109',
    # 'chrome_110',
    # 'chrome_111',
    # 'chrome_112',
    # 'firefox_102',
    # 'firefox_104',
    # 'firefox_108',
    # 'firefox_110',
    # 'opera_89',
    # 'opera_90',
    # 'safari_15_3',
    # 'safari_15_6_1',
    # 'safari_16_0',
    # 'safari_ios_15_5',
    # 'safari_ios_15_6',
    # 'safari_ios_16_0',
    # 'okhttp4_android_7',
    # 'okhttp4_android_8',
    # 'okhttp4_android_9',
    # 'okhttp4_android_10',
    # 'okhttp4_android_11',
    # 'okhttp4_android_12',
    # 'okhttp4_android_13',
]

session = tls_client.Session(
    client_identifier=random.choice(client_identifier_list),
    random_tls_extension_order=True
)
res = session.get(
    "https://api.ipify.org/?format=json",
    headers={
        "key1": "value1",
    },
    # proxy="http://user:password@host:port"
)
print(res.text)
```

Reference: https://pypi.org/project/tls-client/
Bypassing the JA3 fingerprint in Go:
https://github.com/wangluozhe/requests
Strongly recommended: a Golang request library that supports modifying the JA3 fingerprint.
On how to tell whether a site checks the TLS fingerprint at all, and how to fight it:
https://github.com/Danny-Dasilva/CycleTLS
https://github.com/zero3301/pyhttpx
Fully bypassing TLS/JA3 in Python:
This is essentially an easy-to-install build of a patched pycurl, compiled by a community member; detailed steps here:
https://github.com/synodriver/pycurl/blob/master/special.markdown
My Docker image:
docker pull geekbyte1/pyantitls:v1.0
docker run -it -d geekbyte1/pyantitls:v1.0
Q's Dockerfile:
https://mp.weixin.qq.com/s/UZlLuzlQZrI7w82HI7zGuw
At its core the patched pycurl simply shells out to curl-impersonate from the terminal. curl-impersonate now ships a Windows build as well, so a TLS-check-passing curl is available there too; well worth trying.
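If curl-impersonate is installed, you can also call one of its wrapper scripts directly, without any patched pycurl. A minimal sketch, assuming the `curl_chrome110` wrapper that ships with curl-impersonate is on your PATH:

```python
import json
import subprocess

# curl-impersonate installs wrapper scripts such as curl_chrome110 that
# replay a real Chrome TLS handshake; we just capture their stdout.
result = subprocess.run(
    ["curl_chrome110", "-s", "https://httpbin.org/get"],
    capture_output=True,
    text=True,
    check=True,
)
data = json.loads(result.stdout)
print(data["headers"])
```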
Python bindings for CycleTLS:
https://github.com/Danny-Dasilva/cycletls_python

TLS fingerprint test endpoints, for checking what a server actually sees:
https://kawayiyi.com/tls
https://ja3.zone/check
https://browserleaks.com/ssl
https://tls.peet.ws/api/all
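To see the difference between the stacks for yourself, hit one of these endpoints with plain requests and with curl_cffi and compare what comes back. A sketch, assuming tls.peet.ws returns a JSON body that includes the JA3 details (the exact field names may vary, so the raw bodies are printed for eyeballing):

```python
import requests as plain_requests
from curl_cffi import requests as curl_requests

URL = "https://tls.peet.ws/api/all"

# Plain requests: the default Python/OpenSSL fingerprint
r1 = plain_requests.get(URL)
# curl_cffi impersonating Chrome: a browser-like fingerprint
r2 = curl_requests.get(URL, impersonate="chrome101")

# Printing the first part of each body is enough to see that the
# reported fingerprints differ between the two clients.
print(r1.text[:500])
print(r2.text[:500])
```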
References:
TLS fingerprinting: how to tell whether a site checks TLS, and how to fight it
Why random IPs and random User-Agents still cannot escape anti-crawler detection
Akamai TLS fingerprinting (i.e., JA3)