最近实验室要求爬取一些论文数据,过程中遇到了不少问题,在此记录一下。
https://chemistry-europe.onlinelibrary.wiley.com/doi/full/10.1002/cctc.202101625
对于这个网页,当我用requests去获取它的论文数据时,无论怎么设置headers和cookie,都返回503错误,不知道用的是什么反爬措施。在此把代码贴出来,期待能收获大佬的解答。
import requests
from hyper.contrib import HTTP20Adapter
# Article page that keeps answering 503 to plain scripted requests.
url = 'https://chemistry-europe.onlinelibrary.wiley.com/doi/full/10.1002/cctc.202101625'

# Build a session and route every request under this URL prefix through
# hyper's HTTP/2 transport adapter.
session = requests.Session()
session.mount(prefix=url, adapter=HTTP20Adapter())
# Request headers copied from a real Chrome 100 (macOS) navigation to the
# article page, including the HTTP/2 pseudo-headers (':authority', ':method',
# ':path', ':scheme') and the browser session cookie.
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
    'sec-ch-ua-platform': '"macOS"',
    ':authority': 'chemistry-europe.onlinelibrary.wiley.com',
    ':method': 'GET',
    ':path': '/doi/full/10.1002/cctc.202101625',
    ':scheme': 'https',
    'cache-control': 'max-age=0',
    'sec-ch-ua-mobile': '?0',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    # 'Referrer Policy': 'strict-origin-when-cross-origin',
    # NOTE: an earlier cookie capture (one that also carried a `__cf_bm`
    # token) used to sit here commented out. Its second line had lost the
    # leading '#', which made this dict literal a syntax error, so the stale
    # value was removed.
    'cookie': 'MAID=iImQcxImwEcsAuPViqHF7g==; I2KBRCK=1; rskxRunCookie=0; rCookie=n6kpmd70nxn778u3lvvsl24fvjva; osano_consentmanager_uuid=46ef2e31-9949-4791-83ef-f2bfcd350a39; osano_consentmanager=Z5Jmq7YcBW6VF0NmVnrm8ddM-uK_4FpCnLkTSBUQRdz_SWfUEgn8RGw5hLVMg_RZGyD9M-qu7TA9hT-Sn4rCfx0GbfUJFe8MZi18Fhhnzx4tQ-4i7Iq0HjsYaUZrMiDhgn_R5ozBmkU-1JnRuXhg9tkq6gsKBmgQ2l2w4A0OqrMQITndoSeoM_V9PaWKv8VS1IGG4QXTIpB9K3jY4Nv0bU1y0Gepp6dZdjWnyESH7T6tGAnc9AnAe8uoAdKQ0XAvAhrPL30akgccIARhsppLhX3Oel4zohx2MnNBJA==; test=cookie; test=cookie; randomizeUser=0.477873797932322; _gcl_au=1.1.1103617219.1651737556; _gcl_au=1.1.1103617219.1651737556; _gcl_au=1.1.1103617219.1651737556; _gcl_au=1.1.1103617219.1651737556; s_fid=619609E880384F7D-02C271051EF8EA58; s_cc=true; AMCVS_1B6E34B85282A0AC0A490D44%40AdobeOrg=1; _ga=GA1.2.1498128703.1651737601; _gid=GA1.2.894999273.1651737601; oc-js-session=r1ll68jt54n4md9ckid5tphcn6; __gads=ID=cfe90f393000399c:T=1651737600:S=ALNI_MZ9x1J9zz0XffeRlpGfXljo-7MO0A; __gpi=UID=0000052b503806c1:T=1651737600:RT=1651737600:S=ALNI_MZgyYjUfCtOFpNtHoXu8Upl7-uEeQ; __tempcookie=1079ad1151dfc2cfb6c615a62e2bba20b6e8a6d1b25b538ef5430bc6ac860300; realReferer=; __e_inc=1; __cdZG9pLm9yZw===1; s_sq=%5B%5BB%5D%5D; _gcna=0.1079ad1151dfc2cfb6c615a62e2bba20b6e8a6d1b25b538ef5430bc6ac860300.1651756937.4; AMCV_1B6E34B85282A0AC0A490D44%40AdobeOrg=-1124106680%7CMCIDTS%7C19118%7CMCMID%7C84453321464341752683811349891845498632%7CMCAAMLH-1652361740%7C11%7CMCAAMB-1652361740%7C6G1ynYcLPuiQxYZrsz_pkqfLG9yMXBpb2zX5dvJdYQJzPXImdj0y%7CMCOPTOUT-1651764140s%7CNONE%7CMCAID%7CNONE%7CMCSYNCSOP%7C411-19125%7CvVersion%7C5.2.0; JSESSIONID=aaaqJ2Zf3wttW5lc-pucy; SERVER=WZ6myaEXBLHQFyoOjAxmIQ==; MACHINE_LAST_SEEN=2022-05-05T06%3A26%3A32.744-07%3A00; __atuvc=12%7C18; lastRskxRun=1651758051490',
}
# Send the GET through the HTTP/2-mounted session, then show the status code
# followed by the response headers.
# Earlier experiments (pointing `url` at the site root, and feeding the
# returned `set-cookie` value back into a second request) were left disabled.
response = session.get(url, headers=headers)
for report in (response.status_code, response.headers):
    print(report)
from selenium import webdriver
# Start a regular (non-headless) Chrome through Selenium with the
# automation switches removed.
options = webdriver.ChromeOptions()
# options.add_argument('headless')
options.add_argument("start-maximized")
# Drop the "enable-automation" switch and the automation extension.
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option("useAutomationExtension", False)
# Do not wait for the full page load before handing control back.
options.page_load_strategy = 'eager'
driver = webdriver.Chrome(options=options, executable_path='./chromedriver')
# driver.set_page_load_timeout(10)
# driver.set_script_timeout(10)

# Read the stealth script (explicit encoding so the platform default codec
# cannot break the read) and register it through CDP so the browser evaluates
# it on every new document.
with open('./stealth.min.js', encoding='utf-8') as f:
    js = f.read()
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": js
})
其中的stealth.min.js文件可以帮助模拟浏览器绕过webdriver的检测
下载方法:npx extract-stealth-evasions
有的网站加载非常缓慢,而webdriver默认要等页面全部加载完成才会返回,这样很容易卡住。
解决办法:options.page_load_strategy = 'eager'
相比chromedriver,undetected_chromedriver更不容易被识别为自动化程序,尤其是对于国外的网站,可以绕过Cloudflare检测。
from selenium import webdriver
import undetected_chromedriver as uc
import ssl
# ssl._create_default_https_context = ssl._create_unverified_context
# Presumably pins the Chrome major version the driver is prepared for;
# confirm against the installed undetected_chromedriver release.
uc.TARGET_VERSION = 101

# Same visible-browser setup as with plain Selenium, but the driver comes from
# undetected_chromedriver; the manual automation-switch options stay disabled.
options = webdriver.ChromeOptions()
# options.add_argument('headless')
options.add_argument("start-maximized")
# options.add_experimental_option("excludeSwitches", ["enable-automation"])
# options.add_experimental_option("useAutomationExtension", False)
# Do not wait for the full page load before handing control back.
options.page_load_strategy = 'eager'
driver = uc.Chrome(options=options)
# driver = uc.Chrome(options=options, executable_path='./chromedriver')
# driver.set_page_load_timeout(10)
# driver.set_script_timeout(10)

# Read the stealth script (explicit encoding so the platform default codec
# cannot break the read) and register it through CDP so the browser evaluates
# it on every new document.
with open('./stealth.min.js', encoding='utf-8') as f:
    js = f.read()
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": js
})
使用xvfb模拟一个图形化界面
nohup sudo xvfb-run -s "-screen 0 1920x1080x16" python3 ce.py > ce.log &
同时需要在代码中添加一下:
# Extra Chrome switches added for the xvfb setup, applied in the original order.
for flag in (
    "start-maximized",
    '--no-sandbox',
    '--disable-gpu',
    '--disable-dev-shm-usage',
):
    options.add_argument(flag)
from retry import retry
@retry()
def main():
    """Minimal demonstration of the ``retry`` decorator.

    Prints a marker and then always fails, so every call is retried by the
    decorator. With the decorator's default arguments there is no retry limit
    and no delay, so calling this function never returns.

    Raises:
        RuntimeError: on every invocation (this is what triggers the retry).
    """
    print("retry")
    # A bare `raise` with no active exception also ends up as a RuntimeError;
    # raising it explicitly keeps the type and states the intent.
    raise RuntimeError("intentional failure to trigger @retry")
在请求头 headers 中加入 'Connection': 'close'(请求完成后即关闭连接,不保持长连接):
@retry()
def download_img(url, figure_name, dirpath):
    """Download one image and save it as ``<dirpath>/<figure_name>.jpg``.

    The request uses the module-level ``headers`` and a 30 second timeout.
    Any failure (network error, non-success HTTP status, file write error) is
    reported on stdout and the function returns ``None``; nothing is raised
    to the caller.

    Args:
        url: Address of the image to fetch.
        figure_name: File name, without extension, for the saved image.
        dirpath: Directory the image is written into; it is not created here.
    """
    # NOTE(review): every exception is handled below, so the @retry()
    # decorator never sees a failure and never retries. Re-raise from the
    # handler if retries are actually wanted.
    try:
        # The context manager closes the response, so no explicit close is
        # needed (the old `r.close()` in the handler raised NameError whenever
        # `requests.get` itself had failed and `r` was never bound).
        with requests.get(url, stream=False, headers=headers, timeout=30) as r:
            print(r.status_code)  # report the HTTP status code
            # Do not save an error page under a .jpg name.
            r.raise_for_status()
            content = r.content
        # Write through a context manager so the file handle is closed.
        with open(dirpath + '/{}.jpg'.format(figure_name), 'wb') as f:
            f.write(content)
        print("done")
    except Exception as e:
        print("下载图片时发生错误:" + str(e))
        # NOTE(review): the original indentation was lost; the pause is
        # assumed to belong to the error path. Confirm.
        os.system('pause')