突破tls校验常用方法

写爬虫的时候,觉得只要自己每次请求都使用不同的代理 IP,每次请求的 Headers 都写得跟浏览器的一模一样,就不会被网站发现。
但实际上,还有一个东西,叫做浏览器指纹,它是不会随着你更换 IP 或者 User-Agent 而改变的。而且即使你不使用模拟浏览器,直接使用 Golang、Python 发请求,它们也各自有自己的指纹,并且它们的指纹在每次请求中也是固定的。只要网站发现某个拥有特定指纹的客户端持续高频率请求网站,它就可以把你封掉。
这种针对 TLS 握手特征的检测算法,叫做 JA3 算法。

以下方法按 TLS 校验的突破难度从低到高排序:

一.1.原生python层面修改tls套件
# -*- coding: utf-8 -*-
# @Time    : 2022/1/14 9:35
# @Author  : Cocktail_py
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.ssl_ import create_urllib3_context

# OpenSSL cipher string. Restricting/reordering the suites offered in the
# ClientHello changes the JA3 fingerprint of every TLS handshake made
# through the adapter below.
CIPHERS = (
    'ECDH+AESGCM:DH+AESGCM:ECDH+AES256:DH+AES256:RSA+3DES:!aNULL:'
    '!eNULL:!MD5'
)

# Browser-like request headers.
# NOTE(review): 'yuanrenxue.project' looks like a UA required by the
# yuanrenxue challenge site — confirm against the actual target.
headers = {
    'accept': 'application/json, text/javascript, */*; q=0.01',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'no-cache',
    'pragma': 'no-cache',
    'user-agent': 'yuanrenxue.project',
    'x-requested-with': 'XMLHttpRequest',
}


class DESAdapter(HTTPAdapter):
    """Transport adapter that negotiates TLS with the custom CIPHERS list.

    Swapping the default cipher suite alters the ClientHello, and with it
    the JA3 fingerprint that requests presents to the server.
    """

    def _build_context(self):
        # One fresh urllib3 SSL context restricted to CIPHERS.
        return create_urllib3_context(ciphers=CIPHERS)

    def init_poolmanager(self, *args, **kwargs):
        kwargs['ssl_context'] = self._build_context()
        return super().init_poolmanager(*args, **kwargs)

    def proxy_manager_for(self, *args, **kwargs):
        kwargs['ssl_context'] = self._build_context()
        return super().proxy_manager_for(*args, **kwargs)

# Use an HTTPS URL: the custom cipher suite only affects the TLS handshake,
# so the original plain-http request never exercised the adapter's SSL
# context and demonstrated nothing.
url = 'https://httpbin.org/get'

s = requests.Session()
# Bind the adapter to the https prefix so requests to this host negotiate
# TLS with the customised cipher list (and therefore a modified JA3).
s.mount('https://httpbin.org', DESAdapter())
rep = s.get(url, headers=headers)
print(rep.text)
# -*- coding: utf-8 -*-
# @Time    : 2022/1/14 9:35
# @Author  : Cocktail_py
import requests
import urllib3

# HTTPS endpoint — appending ciphers to DEFAULT_CIPHERS only matters when a
# TLS handshake actually happens; the original plain-http URL never did one.
url = 'https://httpbin.org/get'

# Prepend ':' so the new tokens do not fuse onto the last cipher already in
# DEFAULT_CIPHERS (the original `+= 'HIGH:...'` corrupted that entry).
# NOTE(review): urllib3 2.x removed `DEFAULT_CIPHERS`; this trick requires
# urllib3 < 2.0 — verify the installed version.
urllib3.util.ssl_.DEFAULT_CIPHERS += ':HIGH:!DH:!aNULL'

headers = {
    'accept': 'application/json, text/javascript, */*; q=0.01',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'no-cache',
    'pragma': 'no-cache',
    'user-agent': 'yuanrenxue.project',
    'x-requested-with': 'XMLHttpRequest',
}

r = requests.get(url=url, headers=headers)
print(r.text)
# -*- coding: utf-8 -*-
# @Time    : 2023/3/30 9:35
# @Author  : Cocktail_py
from curl_cffi import requests

url = "https://match.yuanrenxue.cn/api/match/19?page=1"

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}
# `impersonate` tells curl_cffi which browser's TLS (JA3) fingerprint to
# clone for this request.  (The unused `payload` local was removed.)
r = requests.get(url, headers=headers, impersonate="chrome101")

print(r.text)

# 参考https://www.cnblogs.com/Eeyhan/p/15662849.html
一.2.python random ja3
## requests
import random

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.ssl_ import create_urllib3_context

# Base pool of OpenSSL cipher suites; DESAdapter shuffles this list so each
# adapter instance presents a different JA3 fingerprint.
ORIGIN_CIPHERS = ('DH+3DES:RSA+3DES:ECDH+AES256:DH+AESGCM:DH+AES256:DH+AES:ECDH+AES128:'
                  'DH+HIGH:RSA+AESGCM:ECDH+3DES:RSA+AES:RSA+HIGH:ECDH+AESGCM:ECDH+HIGH')


class DESAdapter(HTTPAdapter):
    """HTTPAdapter that fixes a randomly shuffled cipher order per instance.

    The shuffle happens exactly once, in __init__, so every *new* adapter
    (and only a new adapter) presents a different JA3 fingerprint.
    """

    def __init__(self, *args, **kwargs):
        suites = ORIGIN_CIPHERS.split(':')
        random.shuffle(suites)
        # Mandatory exclusions go after the shuffled suites.
        self.CIPHERS = ':'.join(suites) + ':!aNULL:!eNULL:!MD5'
        super().__init__(*args, **kwargs)

    def _fresh_context(self):
        # New urllib3 SSL context restricted to this adapter's cipher order.
        return create_urllib3_context(ciphers=self.CIPHERS)

    def init_poolmanager(self, *args, **kwargs):
        kwargs['ssl_context'] = self._fresh_context()
        return super().init_poolmanager(*args, **kwargs)

    def proxy_manager_for(self, *args, **kwargs):
        kwargs['ssl_context'] = self._fresh_context()
        return super().proxy_manager_for(*args, **kwargs)


headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67'}
session = requests.Session()
session.headers.update(headers)

for _ in range(5):
    # DESAdapter shuffles its cipher order once, in __init__.  The original
    # created ONE adapter outside the loop and re-mounted it, so all five
    # requests shared the same JA3 — defeating the point of the demo.  Build
    # a fresh adapter per request instead.  (This also drops the local name
    # `ssl`, which shadowed the stdlib module.)
    # To bind only one site: session.mount('https://ja3er.com', adapter=DESAdapter())
    session.mount('https://', adapter=DESAdapter())

    result = session.get('https://ja3er.com/json').json()
    print(result)

##aiohttp
import asyncio
import random
import ssl

import aiohttp

# Base pool of cipher suites; SSLFactory re-shuffles the order on every call.
ORIGIN_CIPHERS = ('RSA+3DES:RSA+AES:RSA+AESGCM:ECDH+AESGCM:DH+AESGCM:ECDH+AES256:DH+AES256:ECDH+AES128:ECDH+HIGH:'
                  'DH+HIGH:DH+3DES:RSA+HIGH:DH+AES:ECDH+3DES')


class SSLFactory:
    """Callable yielding an SSLContext with a freshly shuffled cipher order.

    Shuffling the cipher list on every call gives each connection a
    different JA3 fingerprint.
    """

    def __init__(self):
        # Mutable working copy that gets shuffled in place per call.
        self.ciphers = ORIGIN_CIPHERS.split(":")

    def __call__(self) -> ssl.SSLContext:
        random.shuffle(self.ciphers)
        ciphers = ":".join(self.ciphers) + ":!aNULL:!eNULL:!MD5"

        # Purpose.SERVER_AUTH is the correct purpose for a *client-side*
        # context: it verifies the server certificate and checks the
        # hostname.  The original used Purpose.CLIENT_AUTH (a server-side
        # purpose), which silently disabled certificate verification.
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        context.set_ciphers(ciphers)
        return context


sslgen = SSLFactory()
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}


async def main():
    """Fire five requests, each with a freshly shuffled TLS context (new JA3)."""
    async with aiohttp.ClientSession() as session:
        for _ in range(5):
            async with session.get("https://ja3er.com/json", headers=headers, ssl=sslgen()) as resp:
                data = await resp.text()
                print(data)


# asyncio.run() replaces get_event_loop().run_until_complete(), which is
# deprecated since Python 3.10 and fails when no loop exists in the thread.
asyncio.run(main())

##scrapy
1.配置文件中直接修改
# 注意:密码套件字符串必须以 :!aNULL:!eNULL:!MD5 结尾
DOWNLOADER_CLIENT_TLS_CIPHERS = 'RSA+AES:RSA+3DES:RSA+AESGCM:ECDH+AESGCM:ECDH+AES256:DH+AES256:ECDH+AES128:ECDH+HIGH:DH+HIGH:DH+3DES:RSA+HIGH:DH+AES:ECDH+3DES:!aNULL:!eNULL:!MD5'
2.启动自动随机修改
# 在scrapy中定义一个函数
def ssl():
    """Return a randomly ordered cipher string for DOWNLOADER_CLIENT_TLS_CIPHERS.

    The base suites are shuffled, then the mandatory exclusion tokens
    (:!aNULL:!eNULL:!MD5) are appended so the string stays valid.
    """
    base = 'RSA+3DES:RSA+AES:RSA+AESGCM:ECDH+AESGCM:DH+AESGCM:ECDH+AES256' \
           ':DH+AES256:ECDH+AES128:ECDH+HIGH:DH+HIGH:DH+3DES:RSA+HIGH:DH+AES:ECDH+3DES'
    suites = base.split(":")
    random.shuffle(suites)
    return ":".join(suites) + ":!aNULL:!eNULL:!MD5"

# Override the setting from inside the spider: custom_settings is applied
# per spider, so this crawl uses the shuffled cipher string from ssl().
class PaSpider(scrapy.Spider):
    name = 'pa'

    # Echo service that reports the JA3 fingerprint the crawler presented.
    start_urls = ['https://ja3er.com/json']

    # NOTE(review): ssl() is evaluated once, at class-definition time — one
    # random cipher order per process start, not per request.
    custom_settings = {
        'DOWNLOADER_CLIENT_TLS_CIPHERS': ssl(),
    }

    def start_requests(self):
		***


##参考https://assistest.cn/2022/01/03/ja3/
一.3.python tls-client ja3
# -*- coding: utf-8 -*-
# @Time    : 2023/5/18 9:52
# @Author  : Cocktail_py
import random
import tls_client
# Candidate values for tls_client's `client_identifier` argument; only one
# is active — uncomment more to rotate across browser fingerprints.
# NOTE(review): tls_client expects lowercase, underscore-separated names
# (e.g. 'chrome_110'); the commented entries below are normalised to that
# form, and a duplicated 'safari_ios_15_6' entry was dropped.
client_identifier_list=[
'chrome_103',
# 'chrome_104',
# 'chrome_105',
# 'chrome_106',
# 'chrome_107',
# 'chrome_108',
# 'chrome_109',
# 'chrome_110',
# 'chrome_111',
# 'chrome_112',
# 'firefox_102',
# 'firefox_104',
# 'firefox_108',
# 'firefox_110',
# 'opera_89',
# 'opera_90',
# 'safari_15_3',
# 'safari_15_6_1',
# 'safari_16_0',
# 'safari_ios_15_5',
# 'safari_ios_15_6',
# 'safari_ios_16_0',
# 'okhttp4_android_7',
# 'okhttp4_android_8',
# 'okhttp4_android_9',
# 'okhttp4_android_10',
# 'okhttp4_android_11',
# 'okhttp4_android_12',
# 'okhttp4_android_13',
]
# Build a session that clones a randomly chosen browser's TLS fingerprint;
# shuffling the extension order further randomises the resulting JA3 hash.
session = tls_client.Session(
    client_identifier=random.choice(client_identifier_list),
    random_tls_extension_order=True,
)

res = session.get(
    "https://api.ipify.org/?format=json",
    headers={"key1": "value1"},
    # proxy="http://user:password@host:port"
)
print(res.text)
# 参考https://pypi.org/project/tls-client/
二.用go的库ja3transport

Go ja3指纹突破

三.用go的库requests

https://github.com/wangluozhe/requests
好库推荐|强烈推荐,支持Ja3指纹修改的golang请求库

tls指纹之到底怎么判断是否有tls、到底怎么对抗tls

四.用go的库cycletls

https://github.com/Danny-Dasilva/CycleTLS

五.魔改openssl
六.用python的库 pyhttpx

https://github.com/zero3301/pyhttpx

七.用杆总魔改的pycurl

python完美突破tls/ja3

八.用python的库

其实就是魔改的pycurl的简易安装版,群里一个大佬编译好的,
里面有详细的步骤:
https://github.com/synodriver/pycurl/blob/master/special.markdown

九.pycurl 的docker版

我的docker镜像:

docker pull geekbyte1/pyantitls:v1.0
docker run -it -d geekbyte1/pyantitls:v1.0

Q佬的docker file:
https://mp.weixin.qq.com/s/UZlLuzlQZrI7w82HI7zGuw

十.直接调用curl_impersonate

就是魔改版的pycurl,用的核心的东西,直接操作终端然后调用curl_impersonate
现在curl_impersonate已经出windows版
能过tls检测的curl-impersonate win版,搞起来啊

十一.cycletls的python版

https://github.com/Danny-Dasilva/cycletls_python

常用ja3检验网站
https://kawayiyi.com/tls
https://ja3.zone/check
https://browserleaks.com/ssl
https://tls.peet.ws/api/all

参考:
tls指纹之到底怎么判断是否有tls、到底怎么对抗tls
为什么随机 IP、随机 UA 也逃不掉被反爬虫的命运
akamai TLS指纹(即JA3)

你可能感兴趣的:(爬虫,python,开发语言)