Java使用selenium爬取网页源代码并通过邮件发送

1、项目搭建:idea + gradle + springboot
 build.gradle:引入selenium相关依赖

// Gradle build for the boom-selenium demo: a Spring Boot web application that
// renders a page with headless Chrome (Selenium), converts the page source to
// PDF (Flying Saucer) and sends it by e-mail (JavaMail).
apply plugin: 'java'
apply plugin: 'maven'
apply plugin: 'idea'
apply plugin: 'org.springframework.boot'

sourceCompatibility = 1.8
group = 'com.boom.basement'

// Assign the project property directly. Declaring it with `def` would only
// create a script-local variable and leave project.version as "unspecified".
version = '1.0.0.RELEASE'
// Script-local value; it is not referenced anywhere else in this script.
def artifactId = 'boom-selenium'

buildscript {
    ext {
        springBootVersion = '1.5.12.RELEASE'
    }
    repositories {
        // Use HTTPS so build plugins cannot be tampered with in transit.
        maven { url 'https://maven.aliyun.com/nexus/content/groups/public/' }
    }
    dependencies {
        classpath("org.springframework.boot:spring-boot-gradle-plugin:${springBootVersion}")
        classpath("org.springframework.boot:spring-boot-maven-plugin:${springBootVersion}")
    }
}

repositories {
    // Use HTTPS so dependencies cannot be tampered with in transit.
    maven { url 'https://maven.aliyun.com/nexus/content/groups/public/' }
}

dependencies {
    // Spring Boot
    compile "org.springframework.boot:spring-boot-starter:$springBootVersion"
    compile "org.springframework.boot:spring-boot-starter-web:$springBootVersion"
    compile "org.springframework.boot:spring-boot-starter-thymeleaf:$springBootVersion"
    // Mail: plain javax.mail is used on purpose. The Spring starter below is
    // left disabled because it was observed to time out when connecting to the
    // mail server.
//    compile "org.springframework.boot:spring-boot-starter-mail:$springBootVersion"
    compile group: 'javax.mail', name: 'mail', version: '1.4.7'
    // Selenium: selenium-chrome-driver pulls in selenium-api and
    // selenium-remote-driver at a version other than 3.141.59, so the
    // transitive ones are excluded and the matching versions are declared
    // explicitly.
    compile group: 'org.seleniumhq.selenium', name: 'selenium-api', version: '3.141.59'
    compile group: 'org.seleniumhq.selenium', name: 'selenium-remote-driver', version: '3.141.59'
    compile('org.seleniumhq.selenium:selenium-chrome-driver:3.141.59') {
        exclude module: 'selenium-api'
        exclude module: 'selenium-remote-driver'
    }
    // PDF generation
    compile 'com.itextpdf:itextpdf:5.4.2'
    compile 'org.xhtmlrenderer:flying-saucer-pdf:9.0.8'
}

 注意点:
  ①selenium-chrome-driver包自动依赖的selenium-api和selenium-remote-driver的版本并不是与其版本相同的3.141.59,需排除后手动引入正确版本
  ②需使用javax.mail,SpringBoot集成的spring-boot-starter-mail可能会导致邮箱服务器连接超时(具体不知道原因)
  ③flying-saucer-pdf的版本使用9.0.8
 主启动类:

/**
 * Application entry point for the boom-selenium demo.
 */
@SpringBootApplication
public class BoomSeleniumApplication {
    /**
     * Starts the Spring application with this class as its primary source.
     *
     * @param args command-line arguments, passed through to Spring unchanged
     */
    public static void main(String[] args) {
        SpringApplication.run(BoomSeleniumApplication.class, args);
    }
}

2、编写controller:完成源代码爬取和转为pdf并通过邮件发送等

import com.lowagie.text.DocumentException;
import com.sun.mail.util.MailSSLSocketFactory;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.RequestMapping;
import org.xhtmlrenderer.pdf.ITextRenderer;

import javax.activation.DataHandler;
import javax.activation.FileDataSource;
import javax.mail.*;
import javax.mail.internet.*;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.security.GeneralSecurityException;
import java.util.Properties;

/**
 * Demo controller that serves a test page, captures that page's source with
 * headless Chrome, converts the source to a PDF file and e-mails both the HTML
 * (as the message body) and the PDF (as an attachment).
 */
@Controller
@RequestMapping("/selenium")
public class SeleniumController {

    /** Location of the chromedriver binary; it must be downloaded beforehand. */
    private static final String CHROME_DRIVER_PATH = "d:\\chromedriver.exe";
    /** URL of the page whose source is captured. */
    private static final String PAGE_URL = "http://localhost:8080/selenium/index";
    /** File the generated PDF is written to and later attached from. */
    private static final String PDF_PATH = "d:\\index.pdf";
    /** SMTP server used for sending. */
    private static final String SMTP_HOST = "smtp.qq.com";

    /**
     * Serves the test page.
     *
     * @return the view name {@code "index"}
     */
    @RequestMapping("/index")
    public String index() {
        return "index";
    }

    /**
     * Captures the test page, writes it to {@code d:\index.pdf} and sends it by mail.
     *
     * <p>NOTE(review): this handler returns nothing and is not annotated with
     * {@code @ResponseBody}; how the HTTP response is produced is left to the
     * framework defaults - confirm that is intended.
     *
     * @throws GeneralSecurityException if the SSL socket factory cannot be created
     * @throws MessagingException       if building or sending the message fails
     * @throws IOException              if the PDF file cannot be written or the
     *                                  attachment name cannot be encoded
     * @throws DocumentException        if PDF rendering fails
     */
    @RequestMapping("/sendMail")
    public void sendMail() throws GeneralSecurityException, MessagingException, IOException, DocumentException {
        String pageSource = fetchPageSource();

        // Close the stream before attaching the file so the handle is released
        // and the PDF content is fully written to disk.
        try (OutputStream pdfOut = new FileOutputStream(PDF_PATH)) {
            createPDF(pdfOut, pageSource);
        }

        Session session = Session.getInstance(buildMailProperties());
        Message msg = buildMessage(session, pageSource);

        Transport transport = session.getTransport();
        try {
            // Connect with the mailbox address and its authorization code
            // (placeholder values; replace with a real account).
            transport.connect(SMTP_HOST, "[email protected]", "xxxxxx");
            transport.sendMessage(msg, new Address[]{new InternetAddress("[email protected]")});
        } finally {
            // Always release the SMTP connection, even when sending fails.
            transport.close();
        }
    }

    /**
     * Converts an HTML string to PDF and writes it to the given stream.
     * The stream is not closed here; the caller owns it.
     *
     * @param out  destination for the PDF bytes
     * @param html markup to render
     * @throws IOException       if writing to {@code out} fails
     * @throws DocumentException if the PDF document cannot be produced
     */
    public static void createPDF(OutputStream out, String html) throws IOException, DocumentException {
        ITextRenderer renderer = new ITextRenderer();
        renderer.setDocumentFromString(html);
        // For Chinese text, fonts can be registered on renderer.getFontResolver()
        // before layout(); none are registered here.
        renderer.layout();
        renderer.createPDF(out);
    }

    /** Opens the test page in headless Chrome and returns its page source. */
    private static String fetchPageSource() {
        System.setProperty("webdriver.chrome.driver", CHROME_DRIVER_PATH);

        ChromeOptions chromeOptions = new ChromeOptions();
        // Headless mode is required for this use case.
        chromeOptions.addArguments("--headless");
        chromeOptions.addArguments("--window-size=1920,1080");

        WebDriver driver = new ChromeDriver(chromeOptions);
        try {
            driver.get(PAGE_URL);
            return driver.getPageSource();
        } finally {
            // Shut down the browser and the chromedriver process; without this
            // every request leaves them running.
            driver.quit();
        }
    }

    /** Builds the JavaMail session properties for authenticated SMTP over SSL. */
    private static Properties buildMailProperties() throws GeneralSecurityException {
        Properties props = new Properties();
        props.setProperty("mail.debug", "true");
        props.setProperty("mail.smtp.auth", "true");
        props.setProperty("mail.host", SMTP_HOST);
        props.setProperty("mail.transport.protocol", "smtp");

        MailSSLSocketFactory sf = new MailSSLSocketFactory();
        // NOTE(review): trusting all hosts disables server certificate
        // verification; kept as in the original, but consider removing it.
        sf.setTrustAllHosts(true);
        props.put("mail.smtp.ssl.enable", "true");
        props.put("mail.smtp.ssl.socketFactory", sf);
        return props;
    }

    /** Builds the multipart message: HTML body plus the generated PDF as attachment. */
    private static Message buildMessage(Session session, String pageSource)
            throws MessagingException, IOException {
        Message msg = new MimeMessage(session);
        msg.setSubject("JavaMail Test");

        // Body part: the captured page source, sent as HTML.
        MimeBodyPart text = new MimeBodyPart();
        text.setContent(pageSource, "text/html;charset=UTF-8");

        // Attachment part: the PDF written earlier.
        MimeBodyPart attachment = new MimeBodyPart();
        DataHandler pdfHandler = new DataHandler(new FileDataSource(PDF_PATH));
        attachment.setDataHandler(pdfHandler);
        // The file name is passed through MimeUtility.encodeText before use.
        attachment.setFileName(MimeUtility.encodeText(pdfHandler.getName()));

        MimeMultipart multipart = new MimeMultipart();
        multipart.addBodyPart(text);
        multipart.addBodyPart(attachment);
        msg.setContent(multipart);

        // Sender address (placeholder value; replace with a real account).
        msg.setFrom(new InternetAddress("[email protected]"));
        return msg;
    }
}

 注意:需提前下载chromedriver.exe,且其版本须与本机安装的Chrome版本匹配,下载地址:https://chromedriver.chromium.org/downloads
3、index.html:注意须在resources/templates/文件夹下,因为SpringBoot默认将thymeleaf的模板位置设置在了该位置下,否则需要手动配置thymeleaf相关参数


<html lang="en" xmlns:th="http://www.thymeleaf.org">
    <head>
        <!-- Test page served at /selenium/index; every tag is closed so the markup stays well-formed. -->
        <title>Title</title>
    </head>
    <body>
        <a href="https://www.baidu.com">Hello</a>
    </body>
</html>

 注意:
  ①在html中尽量不要有<meta>、<link>等标签,会影响pdf的转换,且将html作为邮件的正文时邮件厂商出于安全考虑是不支持外联(link)的。详情参考:HTML邮件兼容问题
  ②邮件厂商对于发送text/html有限制,有可能在发送几次之后就发送不过去了,建议将html转换为pdf后发送

你可能感兴趣的:(杂七杂八)