模拟浏览器进行爬取时遇到的一些问题记录

最近实验室要求爬取一些论文数据,过程中遇到了不少问题,在此记录一下。

未解决的问题

https://chemistry-europe.onlinelibrary.wiley.com/doi/full/10.1002/cctc.202101625
这个网页,当我用requests去获得它的论文数据时,无论怎么设置headers和cookie,还是显示503错误,不知道是什么反爬的措施。在此把代码贴出来,期待能收获大佬的解答。

# Reproduction of the still-unsolved 503: no combination of headers and
# cookies gets past this Wiley page. The Cloudflare __cf_bm cookie in the
# captured session strongly suggests the 503 is Cloudflare bot protection,
# which fingerprints the TLS/HTTP2 handshake — headers alone cannot fix it.
import requests
from hyper.contrib import HTTP20Adapter

url = 'https://chemistry-europe.onlinelibrary.wiley.com/doi/full/10.1002/cctc.202101625'

session = requests.session()
# Mount an HTTP/2 adapter for this URL prefix. hyper generates the HTTP/2
# pseudo-headers (:authority, :method, :path, :scheme) from the request
# itself, so they must NOT be put in the regular header dict below:
# RFC 7540 §8.1.2.1 forbids sending them as ordinary headers, and newer
# urllib3/requests versions reject header names starting with ':'.
session.mount(url, HTTP20Adapter())

headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
    'sec-ch-ua-platform': '"macOS"',
    'cache-control': 'max-age=0',
    'sec-ch-ua-mobile': '?0',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    # Cookie copied verbatim from a real browser session; it still gets 503.
    'cookie': 'MAID=iImQcxImwEcsAuPViqHF7g==; I2KBRCK=1; rskxRunCookie=0; rCookie=n6kpmd70nxn778u3lvvsl24fvjva; osano_consentmanager_uuid=46ef2e31-9949-4791-83ef-f2bfcd350a39; osano_consentmanager=Z5Jmq7YcBW6VF0NmVnrm8ddM-uK_4FpCnLkTSBUQRdz_SWfUEgn8RGw5hLVMg_RZGyD9M-qu7TA9hT-Sn4rCfx0GbfUJFe8MZi18Fhhnzx4tQ-4i7Iq0HjsYaUZrMiDhgn_R5ozBmkU-1JnRuXhg9tkq6gsKBmgQ2l2w4A0OqrMQITndoSeoM_V9PaWKv8VS1IGG4QXTIpB9K3jY4Nv0bU1y0Gepp6dZdjWnyESH7T6tGAnc9AnAe8uoAdKQ0XAvAhrPL30akgccIARhsppLhX3Oel4zohx2MnNBJA==; test=cookie; test=cookie; randomizeUser=0.477873797932322; _gcl_au=1.1.1103617219.1651737556; _gcl_au=1.1.1103617219.1651737556; _gcl_au=1.1.1103617219.1651737556; _gcl_au=1.1.1103617219.1651737556; s_fid=619609E880384F7D-02C271051EF8EA58; s_cc=true; AMCVS_1B6E34B85282A0AC0A490D44%40AdobeOrg=1; _ga=GA1.2.1498128703.1651737601; _gid=GA1.2.894999273.1651737601; oc-js-session=r1ll68jt54n4md9ckid5tphcn6; __gads=ID=cfe90f393000399c:T=1651737600:S=ALNI_MZ9x1J9zz0XffeRlpGfXljo-7MO0A; __gpi=UID=0000052b503806c1:T=1651737600:RT=1651737600:S=ALNI_MZgyYjUfCtOFpNtHoXu8Upl7-uEeQ; __tempcookie=1079ad1151dfc2cfb6c615a62e2bba20b6e8a6d1b25b538ef5430bc6ac860300; realReferer=; __e_inc=1; __cdZG9pLm9yZw===1; s_sq=%5B%5BB%5D%5D; _gcna=0.1079ad1151dfc2cfb6c615a62e2bba20b6e8a6d1b25b538ef5430bc6ac860300.1651756937.4; AMCV_1B6E34B85282A0AC0A490D44%40AdobeOrg=-1124106680%7CMCIDTS%7C19118%7CMCMID%7C84453321464341752683811349891845498632%7CMCAAMLH-1652361740%7C11%7CMCAAMB-1652361740%7C6G1ynYcLPuiQxYZrsz_pkqfLG9yMXBpb2zX5dvJdYQJzPXImdj0y%7CMCOPTOUT-1651764140s%7CNONE%7CMCAID%7CNONE%7CMCSYNCSOP%7C411-19125%7CvVersion%7C5.2.0; JSESSIONID=aaaqJ2Zf3wttW5lc-pucy; SERVER=WZ6myaEXBLHQFyoOjAxmIQ==; MACHINE_LAST_SEEN=2022-05-05T06%3A26%3A32.744-07%3A00; __atuvc=12%7C18; lastRskxRun=1651758051490'
}

response = session.get(url=url, headers=headers)
print(response.status_code)  # observed: 503 regardless of headers/cookies
print(response.headers)

已解决的问题

使用selenium并进行浏览器伪装

# Launch Chrome through Selenium with the usual automation fingerprints
# hidden, then inject stealth.min.js into every new document so that
# navigator.webdriver-style checks do not expose the driver.
from selenium import webdriver

options = webdriver.ChromeOptions()
# options.add_argument('headless')
options.add_argument("start-maximized")
# Drop the "controlled by automated test software" switch and the
# automation extension — both are detectable by anti-bot scripts.
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option("useAutomationExtension", False)
# 'eager' returns as soon as the DOM is ready instead of waiting for every
# subresource, which prevents hangs on slow-loading sites.
options.page_load_strategy = 'eager'
# NOTE(review): executable_path is deprecated in Selenium 4 (use a Service
# object there); kept for compatibility with the Selenium 3 style API.
driver = webdriver.Chrome(options=options, executable_path='./chromedriver')

# Read the stealth script with an explicit encoding so behaviour does not
# depend on the platform's default locale (e.g. GBK on Chinese Windows).
with open('./stealth.min.js', encoding='utf-8') as f:
    stealth_js = f.read()
# Evaluate the script before any page script runs, on every navigation.
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": stealth_js
})

其中的stealth.min.js文件可以帮助模拟浏览器绕过webdriver的检测
下载方法:npx extract-stealth-evasions

使用eager模式防止模拟浏览器卡死在加载过程中

有的网站加载过程会非常缓慢,webdriver的默认模式是全部加载出来再停止,这样很容易卡死
解决办法:options.page_load_strategy = 'eager'

使用undetected_chromedriver进行浏览器模拟

相比chromedriver, undetected_chromedriver更加安全,尤其是对于国外的网站,可以绕过cloudflare检测。

# Same stealth setup, but driven by undetected_chromedriver, which patches
# the chromedriver binary itself and can get past Cloudflare checks that
# plain Selenium cannot.
from selenium import webdriver
import undetected_chromedriver as uc
import ssl
# Uncomment if certificate verification blocks the driver download:
# ssl._create_default_https_context = ssl._create_unverified_context

# Pin the chromedriver major version uc downloads to match local Chrome.
uc.TARGET_VERSION = 101

options = webdriver.ChromeOptions()
# options.add_argument('headless')
options.add_argument("start-maximized")
# uc already strips the automation switches, so the excludeSwitches /
# useAutomationExtension tweaks used with plain Selenium are not needed.
# 'eager' returns once the DOM is ready — avoids hanging on slow pages.
options.page_load_strategy = 'eager'
driver = uc.Chrome(options=options)

# Read the stealth script with an explicit encoding so behaviour does not
# depend on the platform's default locale.
# NOTE(review): this extra injection is presumably belt-and-braces on top
# of uc's own patches — confirm it is still needed for your target site.
with open('./stealth.min.js', encoding='utf-8') as f:
    stealth_js = f.read()
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": stealth_js
})

如何在没有图形化界面的linux服务器上运行有头模式的selenium

使用xvfb模拟一个图形化界面

nohup sudo xvfb-run -s "-screen 0 1920x1080x16" python3 ce.py > ce.log &

(注意:xvfb-run 的 -s/--server-args 选项必须放在要执行的命令之前,否则 -s -screen 0 1920x1080x16 会被当作参数传给 ce.py 而不是 Xvfb。)

同时需要在代码中添加一下:

options.add_argument("start-maximized")
options.add_argument('--no-sandbox')
options.add_argument('--disable-gpu')
options.add_argument('--disable-dev-shm-usage')

代码发生错误想要重新运行一遍

from retry import retry

@retry()
def main():
    """Demonstrate the retry decorator: re-run the function when it raises.

    With no arguments ``@retry()`` retries forever, so this example prints
    "retry" in an endless loop until interrupted.
    """
    print("retry")
    # A bare `raise` outside an except block does not re-raise anything —
    # it fails with "RuntimeError: No active exception to re-raise".
    # Raise an explicit exception to simulate the failure instead.
    raise RuntimeError("simulated failure")

requests下载图片次数过多后阻塞住

  • 设置阿里云公共dns
  • 将requests中的参数stream设置为False
  • 在headers中添加参数'Connection': 'close'
  • 及时读取r.content才能结束请求将请求放回请求池,或者使用with方法,在结束后及时关闭
@retry()
def download_img(url, figure_name, dirpath):
    """Download one image from *url* to ``dirpath/<figure_name>.jpg``.

    Relies on a module-level ``headers`` dict and ``import os``.
    stream=False plus promptly reading ``r.content`` inside ``with``
    completes the request and returns the connection to the pool, which
    avoids the stall observed after many downloads.
    """
    try:
        with requests.get(url, stream=False, headers=headers, timeout=30) as r:
            print(r.status_code)  # 返回状态码
            content = r.content  # reading the body finishes the request
        # Write via `with` so the file handle is closed deterministically
        # (the original open(...).write(...) left it to garbage collection).
        with open(dirpath + '/{}.jpg'.format(figure_name), 'wb') as f:
            f.write(content)
        print("done")
    except Exception as e:
        # NOTE(review): swallowing the exception here means @retry never
        # sees a failure and cannot re-run the download; the code instead
        # pauses for manual inspection. The original also called r.close()
        # here, which raises NameError when requests.get() itself failed
        # (r unbound) — the `with` blocks above make any close unnecessary.
        print("下载图片时发生错误:" + str(e))
        os.system('pause')  # Windows-only: wait for a keypress

你可能感兴趣的:(python,python,爬虫)