Python:使用PhantomJS将网页保存为长图

 

一、项目描述
        最近接到公司市场部的需求:提供一个Excel,里面有一些新闻链接地址,需要将每个链接对应的网页保存为一张长图片。而selenium驱动常规浏览器时只能截取当前屏幕可见的区域,无法截取整个网页,经过调研最终选择了通过selenium调用PhantomJS来实现。

二、PhantomJS介绍
        PhantomJS是一个基于WebKit、可通过JavaScript API进行脚本化控制的无界面浏览器。它使用QtWebKit作为核心浏览器引擎,并通过WebKit来解析和执行JavaScript代码。任何你可以在基于WebKit的浏览器中做的事情,它都能做到。它不仅是个隐形的浏览器,支持CSS选择器、Web标准、DOM操作、JSON、HTML5、Canvas、SVG等,同时也提供了文件I/O操作,从而使你可以读写操作系统中的文件。PhantomJS的用处非常广泛,诸如前端无界面自动化测试(需要结合Jasmine)、网络监测、网页截屏等。

        PhantomJS官方地址:http://phantomjs.org/。

        PhantomJS官方API:http://phantomjs.org/api/。

        PhantomJS官方示例:http://phantomjs.org/examples/。

        PhantomJS GitHub:https://github.com/ariya/phantomjs/。

三、详细代码如下:

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os,re

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
from urllib3.connectionpool import xrange
from openpyxl import load_workbook
from multiprocessing import Pool
from selenium.common.exceptions import TimeoutException



def _cell_text(value, date_format):
    """Convert one worksheet cell value to text.

    Empty cells become ``''``, strings pass through unchanged, date/time-like
    values (anything exposing ``strftime``) are rendered with *date_format*,
    and every other value (int, float, bool, ...) goes through ``str()``.
    """
    if value is None:
        return ''
    if isinstance(value, str):
        return value
    if hasattr(value, 'strftime'):
        return value.strftime(date_format)
    return str(value)


def get_excel():
    """Read the news workbook and return one entry per row that holds a link.

    Columns A-D of each row are joined with ``-`` and column E is appended
    after the ``@@@@@`` separator. Only rows whose combined text contains
    ``"http"`` are kept; leading/trailing ``-`` and all spaces are removed
    from each kept entry, and the entry is printed as it is collected.

    Unlike the earlier implementation, this version
      * also reads the last worksheet row (``max_row`` is inclusive),
      * tolerates empty and non-str/int cells instead of aborting the whole
        scan on the first such cell, and
      * no longer opens ``news.txt`` (it was opened but never written).

    Returns:
        list[str]: entries shaped like ``"<A>-<B>-<C>-<D>@@@@@<E>"``. If the
        workbook cannot be read, the error is printed and whatever was
        collected so far (possibly an empty list) is returned.
    """
    news = []
    try:
        # Open the workbook and take the active worksheet.
        wb = load_workbook(filename='天津20190110-4.xlsx')
        ws = wb.active
        # max_row is the index of the last used row, so the range must
        # include it.
        for row_number in range(1, ws.max_row + 1):
            leading = [
                _cell_text(ws['{}{}'.format(column, row_number)].value,
                           '%Y%m%d%H%M%S')
                for column in ('A', 'B', 'C', 'D')
            ]
            # Column E uses its own date format and is set apart by the
            # '@@@@@' separator.
            last = _cell_text(ws['E{}'.format(row_number)].value,
                              '%Y%m%d_%H%M%S')
            row = ''.join('-' + text for text in leading) + '@@@@@' + last
            if "http" in row:
                entry = row.strip('-').replace(" ", '')
                print(entry)
                news.append(entry)
    except Exception as e:
        # Best effort: report the problem and return what was gathered.
        print(e)
    return news


def screenshot(url):
    """Render one page in PhantomJS and save it as a full-page PNG.

    The page is loaded, scrolled step by step to the bottom and back to the
    top, and then captured to ``C:\\case/pic/<title>.png``.

    Args:
        url: A string shaped like ``"<title>@@@@@<link>"``. The title part is
            used as the file name after spaces, newlines and the characters
            ``/ \\ : * ? " < > |`` are removed.

    Returns:
        None. Failures are reported by printing the entry: a page-load
        timeout prints it with a trailing ``!`` (the screenshot is still
        attempted), a failed screenshot prints it without.

    Raises:
        ValueError: if *url* does not contain exactly one ``@@@@@`` separator.
    """
    title, base_url = url.split("@@@@@")
    rstr = r"[\/\\\:\*\?\"\<\>\|]"
    title = re.sub(rstr, "", title.replace(" ", '')).replace("\n", '').replace(" ", '')
    # Raw string: same value as before, without the invalid "\c" escape.
    output_dir = r"C:\case" + "/pic/"

    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36")
    # Start PhantomJS with the customised capabilities.
    browser = webdriver.PhantomJS(executable_path=r"C:\installed\phantomjs-2.1.1-windows\bin\phantomjs.exe",
                                  desired_capabilities=dcap)
    try:
        browser.set_window_size(1200, 900)
        browser.maximize_window()
        browser.set_page_load_timeout(20)
        try:
            browser.get(base_url)
        except TimeoutException:
            # Keep going: whatever has loaded so far is still captured.
            print(title + "@@@@@" + base_url + "!")

        time.sleep(10)

        # Scroll to the bottom in 100px steps, then back to the top. The
        # script appends "scroll-done" to the document title when finished.
        browser.execute_script("""
                (function () {
                    var y = 0;
                    var step = 100;
                    window.scroll(0, 0);
                    function f() {
                        if (y < document.body.scrollHeight) {
                            y += step;
                            window.scroll(0, y);
                            setTimeout(f, 100);
                        } else {
                            window.scroll(0, 0);
                            document.title += "scroll-done";
                        }
                    }
                    setTimeout(f, 1000);
                })();
            """)
        # Poll for the completion marker, pausing between checks. The pause
        # must be inside the loop; otherwise all 30 checks finish at once
        # and the page may be captured while it is still scrolling.
        for _ in range(30):
            if "scroll-done" in browser.title:
                break
            time.sleep(5)

        # exist_ok avoids a FileExistsError when several pool workers try to
        # create the directory at the same moment.
        os.makedirs(output_dir, exist_ok=True)
        try:
            browser.save_screenshot(output_dir + title + '.png')
        except Exception:
            print(title + "@@@@@" + base_url)
    finally:
        # Always shut PhantomJS down so a failure does not leak the process.
        browser.quit()


if __name__ == '__main__':
    # Gather the "<title>@@@@@<link>" entries from the workbook and report
    # how many were found.
    targets = get_excel()
    print(len(targets))

    # Capture every entry using a pool of 16 worker processes; map() blocks
    # until all entries have been processed.
    workers = Pool(processes=16)
    workers.map(screenshot, targets)
    workers.close()
    workers.join()

    print("finish")

 

你可能感兴趣的:(Python:使用PhantomJS将网页保存为长图)