# playwright 下载pdf — download a PDF with Playwright

import time

import requests
from playwright.sync_api import Playwright, sync_playwright, expect
from urllib.parse import unquote
import shutil


def get_md5(content):
    """Return the hexadecimal MD5 digest of the string ``content``.

    The text is turned into bytes with ``str.encode()`` (default encoding)
    before hashing.
    """
    from hashlib import md5

    encoded = content.encode()
    return md5(encoded).hexdigest()


def save_file():
    """Confirm the Windows "另存为" (Save As) dialog so the file is written to disk.

    Looks up the top-level window of class ``#32770`` titled "另存为", takes
    its first child of class ``Button`` and posts an Enter key press
    (key-down followed by key-up) to that button.
    """
    import win32gui,win32con

    dialog_handle = win32gui.FindWindow("#32770", "另存为")
    button_handle = win32gui.FindWindowEx(dialog_handle, None, "Button", None)
    # A full key press is a WM_KEYDOWN followed by a WM_KEYUP for the same key.
    for message in (win32con.WM_KEYDOWN, win32con.WM_KEYUP):
        win32gui.PostMessage(button_handle, message, win32con.VK_RETURN, 0)


def judge_element_exist(page, js_path, type):
    """Return whether ``document.querySelector(js_path)`` finds an element.

    The selector is handed to the browser as an argument of the evaluated
    function instead of being pasted into the JavaScript source, so it may
    contain single quotes, double quotes, or both without breaking the script.

    Args:
        page: Playwright page to run the check in.
        js_path: CSS selector to look up.
        type: Kept only so existing calls keep working. It used to select
            how the selector was quoted inside the generated script
            (1 = double quotes, anything else = single quotes); that choice
            is no longer needed. The name shadows the builtin but is part of
            the signature callers use.

    Returns:
        ``True`` if a matching element exists, otherwise ``False``.
    """
    result = page.evaluate(
        'selector => document.querySelector(selector) !== null', js_path)
    print('result:', result)
    return result


def get_pdf_down_url(url_tiao, timeout=30):
    """Resolve the site's redirect link and build the PDF download URL.

    Sends a single GET to ``url_tiao`` without following redirects, reads the
    ``Location`` response header, replaces ``doi`` with ``doi/pdf`` in it and
    appends ``?download=true``.

    Args:
        url_tiao: Redirect URL of the form
            ``.../guideline/full_text_link_redirect.php?l=...&t=...``.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` can block indefinitely on an unresponsive host.

    Returns:
        The rewritten PDF URL as a string.

    Raises:
        KeyError: If the response has no ``Location`` header (the server did
            not answer with a redirect).
        requests.RequestException: On connection errors or timeout.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Connection': 'keep-alive',
        'Host': 'guide.medlive.cn',
        'Referer': 'https://guide.medlive.cn/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.0.0 Safari/537.36',
    }
    # Example input:
    # url_tiao = 'https://guide.medlive.cn/guideline/full_text_link_redirect.php?l=CMzWD6BHlMVquezOt93sI1okuBNUTADoQDTCLnHwY914Ugeaoip%2Br8%2FTilxXZ%2Fl4LmWPHxmY8GnNotB59oS8PA%3D%3D&t=pRLU9F34HdrIJYk6oKmrsA%3D%3D'
    # Redirects are not followed because the target itself is the value we want.
    res = requests.get(url_tiao, headers=headers, allow_redirects=False, timeout=timeout)
    print(res.status_code)
    location_url = res.headers.get('Location')
    if not location_url:
        # Same exception type as the plain header lookup, but with context.
        raise KeyError(
            f'no Location header in response (HTTP {res.status_code}) for {url_tiao}')
    print('location_url:',location_url)
    # NOTE(review): this replaces every occurrence of "doi" in the URL and
    # assumes the URL has no query string yet - confirm against real targets.
    pdf_url = str(location_url).replace('doi', 'doi/pdf') + '?download=true'
    print('pdf_url-------', pdf_url)
    return pdf_url


def get_pdf_down_status(page, pdf_url, pdf_save_path):
    """Get the download status of a file and move it to the target location.

    The caller opens ``chrome://downloads`` in ``page`` first and treats a
    truthy return value as "done, stop polling".

    NOTE(review): the body of this function was lost when the file was
    extracted (the text between a ``<`` in the embedded JavaScript and the
    ``>`` of a later ``->`` annotation was stripped, which also left an
    unterminated string literal). Only the fragment quoted below survives,
    so the function fails loudly instead of guessing at the missing logic.
    Restore the body from the original script.

    Args:
        page: Playwright page showing Chrome's download list.
        pdf_url: URL of the download to look for.
        pdf_save_path: Where the finished file should be moved to.

    Raises:
        NotImplementedError: Always, until the lost body is restored.
    """
    # Surviving start of the original page script:
    #     (function(){
    #         var temps = document.querySelector("body > downloads-manager").shadowRoot.querySelectorAll("downloads-item[id*='frb']");
    #         var dics = [];
    #         for(var temp=0;temp ...   <- everything after this point is missing
    raise NotImplementedError(
        'get_pdf_down_status: the original implementation was lost during '
        'extraction of this file and must be restored before use')


def _download_via_button(page, url):
    """Method 1: use the page's own download link (requires a logged-in session).

    Clicks the "下载" link, confirms the dialog that appears, clicks "下载"
    again and saves the resulting download as ``./<md5 of url>.pdf``.
    Prints a message and returns without downloading when the link does not
    look like a download link.
    """
    onclick = page.evaluate('(function(){return document.querySelector("#_article_viewer_1 > div > div.pdf_btn > a").getAttribute("onclick")})()')
    down_text = page.evaluate('(function(){return document.querySelector("#_article_viewer_1 > div > div.pdf_btn > a").textContent})()')
    print('onclick:', onclick)
    print('down_text:', down_text)
    # getAttribute() gives null (None here) when the link has no onclick
    # attribute; treat that as "no match" instead of raising TypeError.
    if 'download(' not in (onclick or '') and '下载' not in (down_text or ''):
        print(f'{url} 不存在PDF下载链接')
        return
    page.get_by_role("link", name="下载").click()
    time.sleep(0.5)

    # Confirm the dialog by clicking its check box and its "next" button via
    # the DOM. Locator-based alternative left by the author:
    #   page.locator(".tipMask-checkBtn").click()
    #   page.get_by_text("同意本协议,继续下载").click()
    page.evaluate('document.querySelector("div.tipMask-checkBtn").click()')
    page.evaluate('document.querySelector("div.tipMask-btm.clearfix > div.tipMask-btnNext").click()')

    # The second click opens a popup and starts the download; both context
    # managers wait for their event when their block ends.
    with page.expect_download() as download_info:
        with page.expect_popup():
            page.get_by_role("link", name="下载").click()
    download = download_info.value
    page.wait_for_timeout(3000)

    # The saved file is named after the MD5 of the page URL.
    pdf_oss_name = get_md5(url) + '.pdf'
    print('url地址-----------', url)
    print('pdf_oss_name-----------', pdf_oss_name)
    download.save_as(f'./{pdf_oss_name}')


def _download_via_redirect(page):
    """Method 2: follow the "full text" redirect link and save through Chrome's own UI.

    Not called anywhere: in the original script these steps sat behind an
    unconditional ``return`` and never ran. They are kept here so they can be
    re-enabled deliberately. Depends on ``get_pdf_down_status``.
    """
    data_l = page.evaluate('(function(){return document.querySelector("div.one_info_L>span.icon-card1.full_text_link").getAttribute("data-l")})()')
    data_t = page.evaluate('(function(){return document.querySelector("div.one_info_L>span.icon-card1.full_text_link").getAttribute("data-t")})()')
    print('data_l:', data_l)
    print('data_t:', data_t)
    # The redirect endpoint needs both attributes as its l and t parameters.
    url_tiao = f'https://guide.medlive.cn/guideline/full_text_link_redirect.php?l={data_l}&t={data_t}'
    print('url_tiao---------', url_tiao)
    pdf_url = get_pdf_down_url(url_tiao)
    print('pdf_url:', pdf_url)
    # NOTE(review): manual pause left in by the author - remove for unattended runs.
    input('stop')
    try:
        page.goto(pdf_url, timeout=100000, wait_until="domcontentloaded")
    except Exception as exc:
        # Best effort, as in the original: a failed navigation is not fatal here.
        print('goto pdf_url failed:', exc)

    # Confirm the native "Save As" dialog.
    save_file()
    time.sleep(5)
    # Open the browser's download list.
    try:
        page.goto('chrome://downloads', timeout=100000, wait_until="domcontentloaded")
    except Exception as exc:
        print('goto chrome://downloads failed:', exc)
    time.sleep(10)

    # Poll up to five times for the finished download, which is then moved
    # to pdf_save_path by get_pdf_down_status.
    file_name = 'test.pdf'
    pdf_save_path = f'./{file_name}'
    for _ in range(5):
        time.sleep(5)
        flug = get_pdf_down_status(page, pdf_url, pdf_save_path)
        if flug:
            break
        page.reload()


def run(playwright: Playwright) -> None:
    """Open one guideline page in the installed Chrome and download its PDF.

    Launches Chrome (not headless) from a fixed Windows path, opens the
    guideline page, injects a ``sess`` cookie when a PDF link is present,
    logs in through ``username_login`` when the page still looks logged out,
    then downloads through the page's own download link. Pages without that
    link are skipped. The browser is closed when the function ends, whether
    it succeeds or fails.

    Args:
        playwright: The object yielded by ``sync_playwright()``.
    """
    # Raw string: the backslashes are path separators, not escape sequences.
    executable_path = r'C:\Program Files\Google\Chrome\Application\chrome.exe'
    browser = playwright.chromium.launch(executable_path=executable_path, headless=False)
    # Alternative: browser = playwright.chromium.launch(headless=False)
    try:
        context = browser.new_context()
        page = context.new_page()

        # Other pages used by the author while testing:
        #   https://guide.medlive.cn/guideline/26790
        #   https://guide.medlive.cn/guideline/26559
        #   https://guide.medlive.cn/guideline/27086
        #   https://guide.medlive.cn/guidelinesub/7885
        url = 'https://guide.medlive.cn/guideline/3896'
        page.goto(url)

        pdf_link_selector = 'div.pdf_list div[class*="pdf_btn"]>a'

        # Log in by cookie injection; the direct download link needs a session.
        # NOTE(review): the cookie value is empty here - presumably a real
        # session value has to be filled in.
        if judge_element_exist(page, pdf_link_selector, 2):
            context.add_cookies([{'name': 'sess', 'value': '', 'domain': 'guide.medlive.cn', 'path': '/', 'expires': -1, 'httpOnly': False, 'secure': False, 'sameSite': 'Lax'}])
            time.sleep(1)
            page.goto(url)
            time.sleep(3)

        # Check whether the cookie is still valid: after a reload, a visible
        # login link or a missing user icon means we are not logged in.
        page.reload()
        icon_login = judge_element_exist(page, "li.icon.login>a", 1)
        icon_user = judge_element_exist(page, "li#get_icon_user_width>a", 1)
        if icon_login or not icon_user:
            # Log in with an account.
            # NOTE(review): username_login is not defined in the visible part
            # of this file - it must be provided for this branch to work.
            username_login(page)

        cookies = context.cookies()
        print("cookies", cookies)

        if judge_element_exist(page, pdf_link_selector, 2):
            _download_via_button(page, url)
        else:
            # Method 2 (redirect link, "5秒盾" pages) is disabled, as in the
            # original; see _download_via_redirect().
            return
    finally:
        browser.close()


# Script entry point: start the synchronous Playwright driver, run the download
# flow once, and let the context manager shut the driver down afterwards.
with sync_playwright() as playwright:
    run(playwright)

# 你可能感兴趣的:(pdf,python)