爬虫反反爬之5秒盾 - cloudflare
原创文章
cloudscraper
pip install cloudscraper
# 更新最新版本
pip install cloudscraper -U
# Create a cloudscraper session — a drop-in replacement for a requests
# Session that solves Cloudflare's 5-second JS challenge automatically.
scraper = cloudscraper.create_scraper()
# Fetch the target page; `url` is assumed to be defined earlier — TODO confirm
res = scraper.get(url)
# Print the decoded response body
print(res.text)
middlewares.py
class CloudScraperMiddleware:
    """Scrapy downloader middleware that retries Cloudflare-blocked pages.

    When a download comes back with HTTP 403 (the Cloudflare challenge
    page), the same URL is re-fetched through the spider's cloudscraper
    session, which solves the JS challenge, and the solved page is handed
    back to Scrapy wrapped in an HtmlResponse.

    NOTE(review): in a real middlewares.py this needs
    ``from scrapy.http import HtmlResponse`` and the spider must expose a
    ``scraper`` attribute (a cloudscraper session) — confirm both.
    """

    def process_response(self, request, response, spider):
        # Anything other than a 403 passes through untouched.
        if response.status != 403:
            return response
        blocked_url = request.url
        # Replay the request through cloudscraper to solve the challenge.
        solved = spider.scraper.get(blocked_url, headers={'referer': blocked_url})
        return HtmlResponse(
            url=blocked_url,
            body=solved.text,
            encoding="utf-8",
            request=request,
        )
spider.py
import cloudscraper
# Enable the middleware for this spider only; 520 is its priority/order
# in Scrapy's downloader-middleware pipeline.
# NOTE(review): this snippet belongs inside a Spider subclass — the
# enclosing `class ...Spider` line is not shown here.
custom_settings = {
"DOWNLOADER_MIDDLEWARES": {
'testspider.middlewares.CloudScraperMiddleware': 520,
}
}
def __init__(self, **kwargs):
    """Create the per-spider cloudscraper session.

    The session is read by CloudScraperMiddleware as ``spider.scraper``
    when a response comes back 403.
    """
    # Forward spider arguments (name, start_urls, -a options) to the
    # Scrapy base class — the original snippet silently dropped them.
    super().__init__(**kwargs)
    # One cloudscraper session per spider instance.
    self.scraper = cloudscraper.create_scraper()
还有一个库(cfscrape),用法和 cloudscraper 一模一样,但经过测试,该库已失效。
在目标网址前加上 https://webcache.googleusercontent.com/search?q=cache: 即可,如:
import requests
# Request Google's cached copy of the target page instead of the page
# itself, so the origin site's Cloudflare shield is never triggered.
url = 'https://webcache.googleusercontent.com/search?q=cache:https://www.xxx.com/'
response = requests.get(url)
这时我们就可以从响应中提取到我们想要的数据
pip3 install undetected-chromedriver
import undetected_chromedriver as uc
# Target to open; Baidu here is just a placeholder example.
url = 'https://www.baidu.com/'
# undetected-chromedriver launches a patched ChromeDriver intended to
# avoid bot-detection fingerprints such as Cloudflare's.
driver = uc.Chrome()
driver.get(url)
详细用法可以去看看:https://github.com/ultrafunkamsterdam/undetected-chromedriver
url = "https://xxxx.com/"
# 这个密钥是用我自己的邮箱生成的
api_key = '一长串密钥'
proxy = f"http://{api_key}:@proxy.zenrows.com:8001"
proxies = {"http": proxy, "https": proxy}
response = requests.get(url, proxies=proxies, verify=False)
试用完全没问题(毕竟是收费的)