java+selenium无头浏览器爬虫技术

 standalone-chrome-debug-zh:docker 部署 

 

docker run -d -p 4444:4444 -p 5900:5900 -v /etc/localtime:/etc/localtime:ro -v /dev/shm:/dev/shm --name selenium-chrome -e SCREEN_WIDTH=1366 -e SCREEN_HEIGHT=768 -e SCREEN_DEPTH=24 -e SCREEN_DPI=74 selenium/standalone-chrome-debug:3.141.59-xenon

无头浏览器容器的 VNC 连接密码默认为:secret

 

 

java  springboot引入jar

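<dependency>
    <groupId>org.seleniumhq.selenium</groupId>
    <artifactId>selenium-chrome-driver</artifactId>
    <version>3.9.1</version>
</dependency>

<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.6</version>
</dependency>

<dependency>
    <groupId>com.google.guava</groupId>
    <artifactId>guava</artifactId>
    <version>28.1-jre</version>
</dependency>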

 

 自己写了个连接池:

连接池配置类 SeleniumConfig

package com.qboa.oa.util.selenuim;

import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;

/**
 * Settings for the Selenium Chrome connection pool, injected by Spring from
 * the {@code selenium.chrome.*} properties.
 *
 * <p>Every property except {@code selenium.chrome.url} has a fallback inside
 * its placeholder, so the application still starts when the optional keys are
 * absent. The field initializers repeat the same values so that an instance
 * created with {@code new} (outside Spring) sees identical defaults.
 *
 * Created by luojie on 2019/12/19.
 */
@Component
public class SeleniumConfig {
    /** URL of the remote Selenium server; required, there is no fallback. */
    @Value("${selenium.chrome.url}")
    private String seleniumChromeUrl;

    /** URL used for checking; NOTE(review): no reader is visible in this file. */
    @Value("${selenium.chrome.checkUrl:http://cpquery.sipo.gov.cn}")
    private String checkUrl = "http://cpquery.sipo.gov.cn";

    /** Number of connections created when the pool is first used. */
    @Value("${selenium.chrome.pool.initSize:2}")
    private int initSize = 2;

    /** Upper bound on the number of pooled connections. */
    @Value("${selenium.chrome.pool.maxSize:4}")
    private int maxSize = 4;

    /** Number of connections added when no idle connection is available. */
    @Value("${selenium.chrome.pool.stepSize:1}")
    private int stepSize = 1;

    /** Pool timeout; presumably milliseconds - TODO confirm against the consumer. */
    @Value("${selenium.chrome.pool.timeout:20000}")
    private int timeout = 20000;

    public String getSeleniumChromeUrl() {
        return seleniumChromeUrl;
    }

    public void setSeleniumChromeUrl(String seleniumChromeUrl) {
        this.seleniumChromeUrl = seleniumChromeUrl;
    }

    public String getCheckUrl() {
        return checkUrl;
    }

    public void setCheckUrl(String checkUrl) {
        this.checkUrl = checkUrl;
    }

    public int getInitSize() {
        return initSize;
    }

    public void setInitSize(int initSize) {
        this.initSize = initSize;
    }

    public int getMaxSize() {
        return maxSize;
    }

    public void setMaxSize(int maxSize) {
        this.maxSize = maxSize;
    }

    public int getStepSize() {
        return stepSize;
    }

    public void setStepSize(int stepSize) {
        this.stepSize = stepSize;
    }

    public int getTimeout() {
        return timeout;
    }

    public void setTimeout(int timeout) {
        this.timeout = timeout;
    }
}

连接对象类 SeleniumConnect: 

package com.qboa.oa.util.selenuim;

import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver;

import java.util.Set;

/**
 * A pooled browser session: wraps one {@link WebDriver} together with the
 * bookkeeping the pool needs (idle/busy flag, borrow time, cookie-failure
 * time and the user bound to the session).
 *
 * <p>A borrowed connection must be handed back with {@link #releaseConnect()}
 * or {@link #releaseConnectCookieFile()} once the caller is done with it.
 *
 * <p>The bookkeeping fields are {@code volatile} because they are written by
 * the borrowing thread and read by the pool from other threads.
 *
 * Created by luojie on 2019/12/19.
 */
public class SeleniumConnect {
    private WebDriver webDriver;
    // false = busy, true = idle
    private volatile boolean status;
    // Time (epoch millis) at which the cookie was last reported as failed (HTTP 412 per the author)
    private volatile long cookiFailTime;
    // User bound to this connection
    private volatile String phone;
    // Time (epoch millis) at which the connection was borrowed; 0 when idle
    private volatile long bgnTime;


    public SeleniumConnect() {

    }

    public SeleniumConnect(WebDriver webDriver, boolean status) {
        this.webDriver = webDriver;
        this.status = status;
    }

    public WebDriver getWebDriver() {
        return webDriver;
    }

    public void setWebDriver(WebDriver webDriver) {
        this.webDriver = webDriver;
    }

    public boolean isStatus() {
        return status;
    }

    public void setStatus(boolean status) {
        this.status = status;
    }

    public String getPhone() {
        return phone;
    }

    public void setPhone(String phone) {
        this.phone = phone;
    }

    public long getBgnTime() {
        return bgnTime;
    }

    public void setBgnTime(long bgnTime) {
        this.bgnTime = bgnTime;
    }

    public long getCookiFailTime() {
        return cookiFailTime;
    }

    public void setCookiFailTime(long cookiFailTime) {
        this.cookiFailTime = cookiFailTime;
    }

    /**
     * Hands the connection back to the pool by marking it idle.
     */
    public void releaseConnect() {
        System.out.println("-----------释放连接-----------");
        bgnTime = 0;
        // Flip the idle flag last so a reader that sees "idle" also sees the reset borrow time.
        status = true;
    }

    /**
     * Hands the connection back to the pool and records the current time as
     * the moment its cookie failed.
     */
    public void releaseConnectCookieFile() {
        System.out.println("-----------释放连接-----------");
        // Stamp the failure time BEFORE flagging the connection idle; otherwise the
        // pool could observe an idle connection whose failure time is still stale.
        cookiFailTime = System.currentTimeMillis();
        bgnTime = 0;
        status = true;
    }


    /**
     * Navigates to {@code url} (when it is not {@code null}) and returns the
     * browser's cookies afterwards.
     *
     * @param url page to load first; {@code null} skips the navigation
     * @return cookies formatted as {@code name=value;} pairs, concatenated
     * @throws Exception declared for caller compatibility
     */
    public String getChromeCookeString(String url) throws Exception {
        if (url != null) {
            this.webDriver.get(url);
        }
        return getChromeCookeString(webDriver);
    }

    /**
     * Returns the cookies currently held by this connection's browser.
     *
     * @return cookies formatted as {@code name=value;} pairs, concatenated
     * @throws Exception declared for caller compatibility
     */
    public String getChromeCookeString() throws Exception {
        return joinCookies(webDriver.manage().getCookies());
    }

    /**
     * Deletes the first cookie whose name ends with {@code "80S"}.
     * Per the original author: used after an HTTP 400 so the cookie can be
     * fetched again, because it may have turned into an "80T" cookie.
     *
     * <p>The returned string contains only the cookies visited up to and
     * including the deleted one (iteration stops at the first match); when no
     * cookie matches, it contains all cookies.
     * NOTE(review): confirm callers do not expect the full cookie string here.
     *
     * @return the visited cookies formatted as {@code name=value;} pairs
     * @throws Exception declared for caller compatibility
     */
    public String delete80SCookie() throws Exception {
        Set<Cookie> cookies = webDriver.manage().getCookies();
        StringBuilder visited = new StringBuilder();
        for (Cookie cookie : cookies) {
            appendCookie(visited, cookie);
            if (cookie.getName().endsWith("80S")) {
                webDriver.manage().deleteCookieNamed(cookie.getName());
                break;
            }
        }
        return visited.toString();
    }

    /**
     * Returns the cookies currently held by the given browser.
     *
     * @param webDriver driver whose cookies are read
     * @return cookies formatted as {@code name=value;} pairs, concatenated
     * @throws Exception declared for caller compatibility
     */
    public String getChromeCookeString(WebDriver webDriver) throws Exception {
        return joinCookies(webDriver.manage().getCookies());
    }

    // Formats every cookie as "name=value;" and concatenates the results.
    private static String joinCookies(Set<Cookie> cookies) {
        StringBuilder joined = new StringBuilder();
        for (Cookie cookie : cookies) {
            appendCookie(joined, cookie);
        }
        return joined.toString();
    }

    // Appends a single "name=value;" pair to the buffer.
    private static void appendCookie(StringBuilder target, Cookie cookie) {
        target.append(cookie.getName()).append('=').append(cookie.getValue()).append(';');
    }

}

 

 

 连接池SeleniumPool

package com.qboa.oa.util.selenuim;

import com.qboa.common.utils.StringUtils;
import org.openqa.selenium.WebDriver;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.ReentrantLock;

/**
 * Pool of {@link SeleniumConnect} browser sessions.
 *
 * <p>Connections are created lazily through {@link WebDriverFactroy}, lent out
 * by {@link #getSeleniumConnect(String)} and handed back by the borrower via
 * {@code SeleniumConnect.releaseConnect()}. A scheduled task reclaims
 * connections that were not returned within {@link #userOutTime} milliseconds.
 *
 * <p>Borrowing is serialized with a {@link ReentrantLock}. Every traversal
 * works on a snapshot of the backing list, so concurrent {@link #quit} calls
 * cannot break an iteration.
 *
 * Created by luojie on 2019/12/19.
 */
@Component
public class SeleniumPool {
    // Serializes borrowers so two threads cannot pick the same idle connection
    private final ReentrantLock lock = new ReentrantLock();
    // Backing store for the pooled connections (shared by all instances)
    private static final List<SeleniumConnect> list =
            Collections.synchronizedList(new ArrayList<SeleniumConnect>());

    /** Milliseconds after which a borrowed connection is forcibly marked idle again. */
    public static long userOutTime = 60000;

    @Autowired
    SeleniumConfig config;

    /**
     * Runs with a fixed delay of 5 seconds and marks as idle every busy
     * connection that has been borrowed for longer than {@link #userOutTime}.
     *
     * @throws InterruptedException declared for caller compatibility
     */
    @Scheduled(fixedDelay = 5000)
    public void reportCurrentTimeAfterSleep() throws InterruptedException {
        for (SeleniumConnect connect : snapshot()) {
            boolean busy = !connect.isStatus();
            boolean overdue = System.currentTimeMillis() - connect.getBgnTime() > userOutTime;
            if (busy && overdue) {
                connect.setStatus(true);
                connect.setBgnTime(0);
            }
        }
    }

    /**
     * Borrows a connection from the pool, creating connections on demand.
     * Retries for up to 8 seconds (measured from method entry, including the
     * time spent waiting for the lock).
     *
     * @param type not used by the current implementation
     * @return a connection marked busy, or {@code null} when none could be
     *         obtained in time, the thread was interrupted, or creating a
     *         browser failed (the failure is printed to stderr)
     */
    public SeleniumConnect getSeleniumConnect(String type) {
        SeleniumConnect seleniumConnect = null;
        long bgn = System.currentTimeMillis();
        // Acquire outside the try block: if lock() itself fails, finally must not unlock.
        lock.lock();
        try {
            // First use: fill the pool with the initial number of connections
            if (list.isEmpty()) {
                createWebDriver(config.getInitSize());
            }

            seleniumConnect = getAvailableConnection();
            int attempts = 0;
            // Nothing idle: grow the pool if allowed, otherwise wait for a release
            while (seleniumConnect == null) {
                System.out.println("第" + (attempts++) + "次获取连接");
                if (System.currentTimeMillis() - bgn > 8000) {
                    throw new IllegalStateException("获取连接超时");
                }
                createWebDriver(config.getStepSize());
                seleniumConnect = getAvailableConnection();
                if (seleniumConnect == null) {
                    TimeUnit.MILLISECONDS.sleep(30);
                }
            }
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the caller instead of swallowing it
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            lock.unlock();
        }

        return seleniumConnect;
    }

    /**
     * Creates up to {@code count} new idle connections, never growing the
     * pool beyond the configured maximum size.
     */
    private void createWebDriver(int count) throws Exception {
        int remainingCapacity = config.getMaxSize() - list.size();
        int toCreate = Math.min(count, remainingCapacity);
        for (int i = 0; i < toCreate; i++) {
            WebDriver webDriver = WebDriverFactroy.initChrome(config);
            list.add(new SeleniumConnect(webDriver, true));
        }
    }

    /**
     * Picks a random idle connection whose cookie failure is older than 20
     * seconds, preferring connections that have a phone (user) bound, and
     * marks it busy.
     *
     * @return the chosen connection, or {@code null} when none qualifies
     */
    private SeleniumConnect getAvailableConnection() throws Exception {
        List<SeleniumConnect> loggedIn = new ArrayList<>();
        List<SeleniumConnect> notLoggedIn = new ArrayList<>();
        for (SeleniumConnect candidate : snapshot()) {
            boolean idle = candidate.isStatus();
            boolean cookieCooledDown =
                    System.currentTimeMillis() - candidate.getCookiFailTime() > 20000;
            if (!idle || !cookieCooledDown) {
                continue;
            }
            // Probe the session; replace the driver when the probe fails
            try {
                candidate.getWebDriver().getWindowHandle();
            } catch (Exception e) {
                // NOTE(review): the phone binding is kept although the fresh
                // browser starts without cookies - confirm this is intended.
                candidate.setWebDriver(WebDriverFactroy.initChrome(config));
            }
            if (StringUtils.isNotBlank(candidate.getPhone())) {
                loggedIn.add(candidate);
            } else {
                notLoggedIn.add(candidate);
            }
        }

        List<SeleniumConnect> candidates = loggedIn.isEmpty() ? notLoggedIn : loggedIn;
        if (candidates.isEmpty()) {
            return null;
        }
        // Uniform pick over [0, size): Math.random() is always < 1.0
        SeleniumConnect chosen = candidates.get((int) (Math.random() * candidates.size()));
        chosen.setStatus(false);
        chosen.setBgnTime(System.currentTimeMillis());
        return chosen;
    }

    /**
     * Lists the phones bound to pooled connections. Every connection with a
     * non-blank phone is included, whether it is currently idle or busy.
     *
     * @return the bound phones; empty when there are none
     */
    public List<String> curPhones() {
        List<String> curPhones = new ArrayList<>();
        for (SeleniumConnect connect : snapshot()) {
            if (StringUtils.isNotBlank(connect.getPhone())) {
                curPhones.add(connect.getPhone());
            }
        }
        return curPhones;
    }

    /**
     * Shuts down the connection's browser and removes the connection from the
     * pool. The connection is removed even when shutting the browser down
     * fails, so a broken session cannot stay in the pool.
     *
     * @param seleniumConnect connection to discard; {@code null} is ignored
     */
    public void quit(SeleniumConnect seleniumConnect) {
        System.out.println("-----------结束连接-----------");
        if (seleniumConnect == null) {
            return;
        }
        try {
            WebDriver driver = seleniumConnect.getWebDriver();
            if (driver != null) {
                // quit() closes every window of the session, so no separate close() is needed
                driver.quit();
            }
        } finally {
            list.remove(seleniumConnect);
        }
    }

    // Copies the pool under the synchronized list's own lock so callers can iterate safely.
    private static List<SeleniumConnect> snapshot() {
        return new ArrayList<>(list);
    }

}

 无头浏览器工厂类 WebDriverFactroy,主要负责创建 Java 应用与 Selenium 的连接:

 

package com.qboa.oa.util.selenuim;

import org.openqa.selenium.Point;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.remote.DesiredCapabilities;
import org.openqa.selenium.remote.RemoteWebDriver;
import org.springframework.stereotype.Component;

import java.net.URL;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.TimeUnit;

/**
 * Factory that opens Chrome sessions on a remote Selenium server.
 *
 * Created by luojie on 2019/12/17.
 */
@Component
public class WebDriverFactroy {


    /**
     * Opens a new Chrome session on the Selenium server configured in
     * {@code config} and applies the window, cookie and timeout setup.
     *
     * <p>If the setup fails after the session was created, the session is
     * quit before the failure is rethrown, so no browser is left running on
     * the server.
     *
     * @param config supplies the Selenium server URL
     * @return a ready-to-use driver for the new session
     * @throws Exception when the URL is malformed or the session cannot be
     *                   created or configured
     */
    public static WebDriver initChrome(SeleniumConfig config) throws Exception {
        ChromeOptions chromeOptions = new ChromeOptions();
        // Note: --headless is not passed.
        chromeOptions.addArguments("--user-agent=Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1");
        // Per the original author: Google's documentation recommends this flag to work around a bug
        chromeOptions.addArguments("--disable-gpu");
        chromeOptions.addArguments("--disable-dev-shm-usage");
        // Per the original author: run with the highest privileges
        chromeOptions.addArguments("--no-sandbox");
        chromeOptions.addArguments("lang=zh-CN.UTF-8");
        // Ignore certificate (SSL) errors
        chromeOptions.addArguments("--ignore-certificate-errors");
        // Do not load images, to speed up page loads
        chromeOptions.addArguments("blink-settings=imagesEnabled=false");
        Map<String, Object> prefs = new HashMap<>();
        prefs.put("profile.managed_default_content_settings.images", 2);
        prefs.put("intl.accept_languages", "zh-CN,zh");
        chromeOptions.setExperimentalOption("prefs", prefs);
        // Per the original author: the experimental excludeSwitches option hides
        // window.navigator.webdriver returning true; its value must be a List
        chromeOptions.setExperimentalOption("excludeSwitches", Collections.singletonList("enable-automation"));
        DesiredCapabilities capabilities = DesiredCapabilities.chrome();
        capabilities.setCapability(ChromeOptions.CAPABILITY, chromeOptions);

        WebDriver driver = new RemoteWebDriver(new URL(config.getSeleniumChromeUrl()),
                capabilities);
        try {
            driver.manage().window().maximize();
            driver.manage().deleteAllCookies();
            driver.manage().window().setPosition(new Point(100, 50));
            // Implicit wait: element lookups wait up to 5 seconds
            driver.manage().timeouts().implicitlyWait(5, TimeUnit.SECONDS);
        } catch (RuntimeException e) {
            // The session already exists on the server; shut it down so it is not leaked
            try {
                driver.quit();
            } catch (RuntimeException quitFailure) {
                e.addSuppressed(quitFailure);
            }
            throw e;
        }

        return driver;
    }

}

配置 spring yml

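(原文此处为配置截图,图片已丢失。以下是根据 SeleniumConfig 中的配置项整理的 application.yml 示例,url 等取值请按实际环境调整)

selenium:
  chrome:
    url: http://localhost:4444/wd/hub
    checkUrl: http://cpquery.sipo.gov.cn
    pool:
      initSize: 2
      maxSize: 4
      stepSize: 1
      timeout: 20000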

test:

SeleniumConnect seleniumConn = pool.getSeleniumConnect("1");
使用完毕后必须归还连接:
seleniumConn.releaseConnect();
后续考虑用切面(AOP)注解统一归还,不侵入业务代码。

你可能感兴趣的:(java+selenium无头浏览器爬虫技术)