HttpClient+Jsoup抓取页面下载表格文件,fastjson和WorkBook处理表格数据,Spring Schedule和cron表达式开启定时任务

这次在SpringBoot项目中使用@Scheduled注解(配合启动类上的@EnableScheduling)实现定时任务

需求:每月初下载http://www.safe.gov.cn/safe/gzhbdmyzslb/index.html中上个月的表格进行解析。

导入依赖

<dependencies>
		<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
		<dependency>
			<groupId>org.apache.httpcomponents</groupId>
			<artifactId>httpclient</artifactId>
			<version>4.5.5</version>
		</dependency>
		<dependency>
			<groupId>commons-io</groupId>
			<artifactId>commons-io</artifactId>
			<version>2.5</version>
		</dependency>
		<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
		<!-- Declared once: a second identical jsoup entry only triggers Maven's duplicate-declaration warning. -->
		<dependency>
			<groupId>org.jsoup</groupId>
			<artifactId>jsoup</artifactId>
			<version>1.11.3</version>
		</dependency>
		<dependency>
			<groupId>org.seleniumhq.selenium</groupId>
			<artifactId>selenium-java</artifactId>
			<version>3.3.1</version>
		</dependency>
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi</artifactId>
			<version>3.17</version>
		</dependency>
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi-ooxml</artifactId>
			<version>3.17</version>
		</dependency>
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi-examples</artifactId>
			<version>3.17</version>
		</dependency>
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi-excelant</artifactId>
			<version>3.17</version>
		</dependency>
		<dependency>
			<groupId>com.alibaba</groupId>
			<artifactId>fastjson</artifactId>
			<version>1.2.46</version>
		</dependency>
		<dependency>
			<groupId>org.springframework.boot</groupId>
			<artifactId>spring-boot-autoconfigure</artifactId>
			<version>2.1.5.RELEASE</version>
		</dependency>
		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-context</artifactId>
			<version>5.1.7.RELEASE</version>
		</dependency>
	</dependencies>

解决思路

HttpClient+Jsoup抓取页面下载表格文件,fastjson和WorkBook处理表格数据,Spring Schedule和cron表达式开启定时任务_第1张图片
列表类型,每月初获取第一条中的超链接,打开下载页面
HttpClient+Jsoup抓取页面下载表格文件,fastjson和WorkBook处理表格数据,Spring Schedule和cron表达式开启定时任务_第2张图片

                // Collect every element whose CSS class is "list_conr".
                Elements listConr = document.getElementsByClass("list_conr");

                // The index page lists one link per month. Take the <a> inside the first <dt> of the first <li>
                // of the first <ul>: per the page layout this is the newest month, and its href opens the page
                // that offers the spreadsheet download.
                Element ahref=listConr.select("ul").get(0).select("li").get(0).selectFirst("dt").selectFirst("a");
                String hrefDetail=ahref.attr("href");

HttpClient+Jsoup抓取页面下载表格文件,fastjson和WorkBook处理表格数据,Spring Schedule和cron表达式开启定时任务_第3张图片
在新的页面要用同样的方法获得xlsx的下载地址

                        // Locate the spreadsheet download link: the first <a> inside the third <p> found under
                        // the first element of class "detail_content", then read its href.
                        Element axlsxhref=document1.getElementsByClass("detail_content").first().select("div").first().select("p").get(2).select("a").first();
                        String axlsxhrefDetail=axlsxhref.attr("href");

下载表格。由于加了定时任务,为了区分每次下载得到的文件,在表格文件名中加入当前时间

                                // Format the current time; it becomes part of the file name so each run writes a distinct file.
                                SimpleDateFormat dateFormat=new SimpleDateFormat("yyyy年MM月dd日HH_mm_ss");
                                String date=dateFormat.format(new Date());

                                // Copy the response body into the local target file.
                                InputStream inputStream = entity3.getContent();
                                FileUtils.copyToFile(inputStream,new File("E:\\rmbrate\\rate"+date+fileType));

处理表格数据

在这里插入图片描述
根据表格内容设计POJO

/**
 * Mutable bean describing one currency entry of the "conversion rate to US dollar" table.
 *
 * <p>The getter/setter names follow the JavaBean convention so that JSON libraries can bind the
 * properties {@code month}, {@code currency}, {@code currencyCN}, {@code unit} and {@code rate}.
 */
public class ConversionRateToDollar {

    /** Month label the entry belongs to. */
    private String month;

    /** Currency column value. */
    private String currency;

    /** Chinese currency name column value. */
    private String currencyCN;

    /** Unit column value. */
    private String unit;

    /** Conversion rate; defaults to {@code 0.0} until set. */
    private double rate;

    /** @return the month label, or {@code null} if unset */
    public String getMonth() {
        return this.month;
    }

    /** @param month the month label to store */
    public void setMonth(String month) {
        this.month = month;
    }

    /** @return the currency value, or {@code null} if unset */
    public String getCurrency() {
        return this.currency;
    }

    /** @param currency the currency value to store */
    public void setCurrency(String currency) {
        this.currency = currency;
    }

    /** @return the Chinese currency name, or {@code null} if unset */
    public String getCurrencyCN() {
        return this.currencyCN;
    }

    /** @param currencyCN the Chinese currency name to store */
    public void setCurrencyCN(String currencyCN) {
        this.currencyCN = currencyCN;
    }

    /** @return the unit value, or {@code null} if unset */
    public String getUnit() {
        return this.unit;
    }

    /** @param unit the unit value to store */
    public void setUnit(String unit) {
        this.unit = unit;
    }

    /** @return the conversion rate */
    public double getRate() {
        return this.rate;
    }

    /** @param rate the conversion rate to store */
    public void setRate(double rate) {
        this.rate = rate;
    }
}

我写了一个处理表格的ExcelUtil类,分别对.xls和.xlsx文件进行处理,返回JavaBean的集合对象

import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.alibaba.fastjson.JSONObject;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import com.alibaba.fastjson.JSON;
/**
 * Converts the downloaded "conversion rate to US dollar" workbook into
 * {@link ConversionRateToDollar} beans, for both the .xlsx (XSSF) and the .xls (HSSF) format.
 *
 * <p>Sheet layout the code relies on: the month label sits at row index 2, column index 1; data
 * starts at row index 6; the last three rows are notes; every data row carries two records side by
 * side (columns 1-4 and columns 5-8).
 */
public class ExcelUtil {

    /** Zero-based index of the first data row (the 7th row of the sheet). */
    private static final int FIRST_DATA_ROW = 6;

    /** Number of trailing rows that hold the table's notes and are skipped. */
    private static final int TRAILING_NOTE_ROWS = 3;

    /** First column of the left-hand record group (currency, currencyCN, unit, rate). */
    private static final int LEFT_GROUP_FIRST_COLUMN = 1;

    /** First column of the right-hand record group (currency, currencyCN, unit, rate). */
    private static final int RIGHT_GROUP_FIRST_COLUMN = 5;

    /**
     * Reads an .xlsx workbook and returns one bean per record found on the first sheet.
     * All left-hand records come first, followed by all right-hand records.
     *
     * @param filePath path of the .xlsx file to read
     * @return the parsed records
     * @throws Exception if the file cannot be opened or read, or the expected cells are missing
     */
    public List<ConversionRateToDollar> importExcelActionForXLSX(String filePath) throws Exception {
        List<Map<String, Object>> list = new ArrayList<Map<String, Object>>();

        // try-with-resources: both the file handle and the workbook are released even when parsing fails.
        try (FileInputStream input = new FileInputStream(filePath);
             XSSFWorkbook workbook = new XSSFWorkbook(input)) {
            XSSFSheet sheet = workbook.getSheetAt(0);

            // Third row, second column holds the month label shared by every record.
            XSSFRow dateRow = sheet.getRow(2);
            XSSFCell dateCell = dateRow.getCell(1);
            String date = dateCell.getStringCellValue();

            // NOTE(review): getPhysicalNumberOfRows() counts defined rows only; this assumes the sheet
            // has no undefined rows above the notes - confirm against real files.
            int endRowExclusive = sheet.getPhysicalNumberOfRows() - TRAILING_NOTE_ROWS;

            collectGroup(sheet, endRowExclusive, LEFT_GROUP_FIRST_COLUMN, date, list);
            // The right-hand group uses the sheet's month label too (it was hard-coded to "2019-05").
            collectGroup(sheet, endRowExclusive, RIGHT_GROUP_FIRST_COLUMN, date, list);
        }
        return toBeans(list);
    }

    /**
     * Reads an .xls workbook and returns one bean per record found on the first sheet.
     * All left-hand records come first, followed by all right-hand records.
     *
     * @param filePath path of the .xls file to read
     * @return the parsed records
     * @throws Exception if the file cannot be opened or read, or the expected cells are missing
     */
    public List<ConversionRateToDollar> importExcelActionForXLS(String filePath) throws Exception {
        List<Map<String, Object>> list = new ArrayList<Map<String, Object>>();

        // try-with-resources: both the file handle and the workbook are released even when parsing fails.
        try (FileInputStream input = new FileInputStream(filePath);
             HSSFWorkbook workbook = new HSSFWorkbook(input)) {
            HSSFSheet sheet = workbook.getSheetAt(0);

            // Third row, second column holds the month label shared by every record.
            HSSFRow dateRow = sheet.getRow(2);
            HSSFCell dateCell = dateRow.getCell(1);
            String date = dateCell.getStringCellValue();

            // NOTE(review): same row-count assumption as in the .xlsx reader - confirm against real files.
            int endRowExclusive = sheet.getPhysicalNumberOfRows() - TRAILING_NOTE_ROWS;

            collectGroup(sheet, endRowExclusive, LEFT_GROUP_FIRST_COLUMN, date, list);
            // The right-hand group uses the sheet's month label too (it was hard-coded to "2019-05").
            collectGroup(sheet, endRowExclusive, RIGHT_GROUP_FIRST_COLUMN, date, list);
        }
        return toBeans(list);
    }

    /** Appends one record per non-null data row of an .xlsx sheet, reading four cells from firstColumn. */
    private void collectGroup(XSSFSheet sheet, int endRowExclusive, int firstColumn, String month,
                              List<Map<String, Object>> target) {
        for (int i = FIRST_DATA_ROW; i < endRowExclusive; i++) {
            XSSFRow row = sheet.getRow(i);
            if (row == null) {
                continue;
            }
            target.add(newRecord(month,
                    getValue(row.getCell(firstColumn)),
                    getValue(row.getCell(firstColumn + 1)),
                    getValue(row.getCell(firstColumn + 2)),
                    getValue(row.getCell(firstColumn + 3))));
        }
    }

    /** Appends one record per non-null data row of an .xls sheet, reading four cells from firstColumn. */
    private void collectGroup(HSSFSheet sheet, int endRowExclusive, int firstColumn, String month,
                              List<Map<String, Object>> target) {
        for (int i = FIRST_DATA_ROW; i < endRowExclusive; i++) {
            HSSFRow row = sheet.getRow(i);
            if (row == null) {
                continue;
            }
            target.add(newRecord(month,
                    getValueForXLS(row.getCell(firstColumn)),
                    getValueForXLS(row.getCell(firstColumn + 1)),
                    getValueForXLS(row.getCell(firstColumn + 2)),
                    getValueForXLS(row.getCell(firstColumn + 3))));
        }
    }

    /** Builds the property map for one record; the keys match the bean's property names. */
    private static Map<String, Object> newRecord(String month, String currency, String currencyCN,
                                                 String unit, String rate) {
        Map<String, Object> record = new HashMap<String, Object>();
        record.put("month", month);
        record.put("currency", currency);
        record.put("currencyCN", currencyCN);
        record.put("unit", unit);
        record.put("rate", rate);
        return record;
    }

    /** Prints the records as JSON and binds that JSON to a list of beans. */
    private List<ConversionRateToDollar> toBeans(List<Map<String, Object>> records) {
        // Serialise once and reuse the text for both the log line and the binding.
        String json = JSON.toJSONString(records);
        System.out.println("list = " + json);
        return JSONObject.parseArray(json, ConversionRateToDollar.class);
    }

    /** Returns the text of an .xlsx cell: "" for a missing cell, otherwise its boolean, numeric or string value. */
    private String getValue(XSSFCell xSSFCell){
        if (null == xSSFCell) {
            return "";
        }
        int cellType = xSSFCell.getCellType();
        if (cellType == XSSFCell.CELL_TYPE_BOOLEAN) {
            return String.valueOf(xSSFCell.getBooleanCellValue());
        } else if (cellType == XSSFCell.CELL_TYPE_NUMERIC) {
            return String.valueOf(xSSFCell.getNumericCellValue());
        } else {
            return String.valueOf(xSSFCell.getStringCellValue());
        }
    }

    /** Returns the text of an .xls cell: "" for a missing cell, otherwise its boolean, numeric or string value. */
    private String getValueForXLS(HSSFCell hSSFCell){
        if (null == hSSFCell) {
            return "";
        }
        int cellType = hSSFCell.getCellType();
        if (cellType == HSSFCell.CELL_TYPE_BOOLEAN) {
            return String.valueOf(hSSFCell.getBooleanCellValue());
        } else if (cellType == HSSFCell.CELL_TYPE_NUMERIC) {
            return String.valueOf(hSSFCell.getNumericCellValue());
        } else {
            return String.valueOf(hSSFCell.getStringCellValue());
        }
    }
}

实现定时任务

在SpringBoot的启动类添加注解

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.EnableScheduling;

/**
 * Spring Boot entry point. {@code @EnableScheduling} switches on the processing of
 * {@code @Scheduled} methods declared on Spring beans.
 */
@SpringBootApplication
@EnableScheduling
public class GetExcelByHref {

    /**
     * Boots the application with this class as the primary configuration source.
     *
     * @param args command-line arguments forwarded to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication application = new SpringApplication(GetExcelByHref.class);
        application.run(args);
    }
}

配置组件,在里面写入需要定时执行的任务方法,使用cron表达式。按需求表达式应该配置为每月初执行(例如每月1日零点执行可写为 "0 0 0 1 * *"),这里为了测试效果,写成了每五分钟执行一次。

// Registered as a Spring bean so the @Scheduled method below is picked up by the scheduler.
@Component
public class GetConversionRateToDollar {
    // Six-field Spring cron expression (second minute hour day month weekday):
    // fires at second 0 of every fifth minute, starting from minute 0.
    @Scheduled(cron = "0 0/5 * * * *")
    public void getConversionRateToDollar() throws IOException {
    // The method body is elided in this excerpt.
    .....
        }
}

源代码

package com.crawler;

/**
 * Mutable bean describing one currency entry of the "conversion rate to US dollar" table.
 *
 * <p>The getter/setter names follow the JavaBean convention so that JSON libraries can bind the
 * properties {@code month}, {@code currency}, {@code currencyCN}, {@code unit} and {@code rate}.
 */
public class ConversionRateToDollar {

    /** Month label the entry belongs to. */
    private String month;

    /** Currency column value. */
    private String currency;

    /** Chinese currency name column value. */
    private String currencyCN;

    /** Unit column value. */
    private String unit;

    /** Conversion rate; defaults to {@code 0.0} until set. */
    private double rate;

    /** @return the month label, or {@code null} if unset */
    public String getMonth() {
        return this.month;
    }

    /** @param month the month label to store */
    public void setMonth(String month) {
        this.month = month;
    }

    /** @return the currency value, or {@code null} if unset */
    public String getCurrency() {
        return this.currency;
    }

    /** @param currency the currency value to store */
    public void setCurrency(String currency) {
        this.currency = currency;
    }

    /** @return the Chinese currency name, or {@code null} if unset */
    public String getCurrencyCN() {
        return this.currencyCN;
    }

    /** @param currencyCN the Chinese currency name to store */
    public void setCurrencyCN(String currencyCN) {
        this.currencyCN = currencyCN;
    }

    /** @return the unit value, or {@code null} if unset */
    public String getUnit() {
        return this.unit;
    }

    /** @param unit the unit value to store */
    public void setUnit(String unit) {
        this.unit = unit;
    }

    /** @return the conversion rate */
    public double getRate() {
        return this.rate;
    }

    /** @param rate the conversion rate to store */
    public void setRate(double rate) {
        this.rate = rate;
    }
}

package com.crawler;
import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.alibaba.fastjson.JSONObject;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import com.alibaba.fastjson.JSON;
/**
 * Converts the downloaded "conversion rate to US dollar" workbook into
 * {@link ConversionRateToDollar} beans, for both the .xlsx (XSSF) and the .xls (HSSF) format.
 *
 * <p>Sheet layout the code relies on: the month label sits at row index 2, column index 1; data
 * starts at row index 6; the last three rows are notes; every data row carries two records side by
 * side (columns 1-4 and columns 5-8).
 */
public class ExcelUtil {

    /** Zero-based index of the first data row (the 7th row of the sheet). */
    private static final int FIRST_DATA_ROW = 6;

    /** Number of trailing rows that hold the table's notes and are skipped. */
    private static final int TRAILING_NOTE_ROWS = 3;

    /** First column of the left-hand record group (currency, currencyCN, unit, rate). */
    private static final int LEFT_GROUP_FIRST_COLUMN = 1;

    /** First column of the right-hand record group (currency, currencyCN, unit, rate). */
    private static final int RIGHT_GROUP_FIRST_COLUMN = 5;

    /**
     * Reads an .xlsx workbook and returns one bean per record found on the first sheet.
     * All left-hand records come first, followed by all right-hand records.
     *
     * @param filePath path of the .xlsx file to read
     * @return the parsed records
     * @throws Exception if the file cannot be opened or read, or the expected cells are missing
     */
    public List<ConversionRateToDollar> importExcelActionForXLSX(String filePath) throws Exception {
        List<Map<String, Object>> list = new ArrayList<Map<String, Object>>();

        // try-with-resources: both the file handle and the workbook are released even when parsing fails.
        try (FileInputStream input = new FileInputStream(filePath);
             XSSFWorkbook workbook = new XSSFWorkbook(input)) {
            XSSFSheet sheet = workbook.getSheetAt(0);

            // Third row, second column holds the month label shared by every record.
            XSSFRow dateRow = sheet.getRow(2);
            XSSFCell dateCell = dateRow.getCell(1);
            String date = dateCell.getStringCellValue();

            // NOTE(review): getPhysicalNumberOfRows() counts defined rows only; this assumes the sheet
            // has no undefined rows above the notes - confirm against real files.
            int endRowExclusive = sheet.getPhysicalNumberOfRows() - TRAILING_NOTE_ROWS;

            collectGroup(sheet, endRowExclusive, LEFT_GROUP_FIRST_COLUMN, date, list);
            // The right-hand group uses the sheet's month label too (it was hard-coded to "2019-05").
            collectGroup(sheet, endRowExclusive, RIGHT_GROUP_FIRST_COLUMN, date, list);
        }
        return toBeans(list);
    }

    /**
     * Reads an .xls workbook and returns one bean per record found on the first sheet.
     * All left-hand records come first, followed by all right-hand records.
     *
     * @param filePath path of the .xls file to read
     * @return the parsed records
     * @throws Exception if the file cannot be opened or read, or the expected cells are missing
     */
    public List<ConversionRateToDollar> importExcelActionForXLS(String filePath) throws Exception {
        List<Map<String, Object>> list = new ArrayList<Map<String, Object>>();

        // try-with-resources: both the file handle and the workbook are released even when parsing fails.
        try (FileInputStream input = new FileInputStream(filePath);
             HSSFWorkbook workbook = new HSSFWorkbook(input)) {
            HSSFSheet sheet = workbook.getSheetAt(0);

            // Third row, second column holds the month label shared by every record.
            HSSFRow dateRow = sheet.getRow(2);
            HSSFCell dateCell = dateRow.getCell(1);
            String date = dateCell.getStringCellValue();

            // NOTE(review): same row-count assumption as in the .xlsx reader - confirm against real files.
            int endRowExclusive = sheet.getPhysicalNumberOfRows() - TRAILING_NOTE_ROWS;

            collectGroup(sheet, endRowExclusive, LEFT_GROUP_FIRST_COLUMN, date, list);
            // The right-hand group uses the sheet's month label too (it was hard-coded to "2019-05").
            collectGroup(sheet, endRowExclusive, RIGHT_GROUP_FIRST_COLUMN, date, list);
        }
        return toBeans(list);
    }

    /** Appends one record per non-null data row of an .xlsx sheet, reading four cells from firstColumn. */
    private void collectGroup(XSSFSheet sheet, int endRowExclusive, int firstColumn, String month,
                              List<Map<String, Object>> target) {
        for (int i = FIRST_DATA_ROW; i < endRowExclusive; i++) {
            XSSFRow row = sheet.getRow(i);
            if (row == null) {
                continue;
            }
            target.add(newRecord(month,
                    getValue(row.getCell(firstColumn)),
                    getValue(row.getCell(firstColumn + 1)),
                    getValue(row.getCell(firstColumn + 2)),
                    getValue(row.getCell(firstColumn + 3))));
        }
    }

    /** Appends one record per non-null data row of an .xls sheet, reading four cells from firstColumn. */
    private void collectGroup(HSSFSheet sheet, int endRowExclusive, int firstColumn, String month,
                              List<Map<String, Object>> target) {
        for (int i = FIRST_DATA_ROW; i < endRowExclusive; i++) {
            HSSFRow row = sheet.getRow(i);
            if (row == null) {
                continue;
            }
            target.add(newRecord(month,
                    getValueForXLS(row.getCell(firstColumn)),
                    getValueForXLS(row.getCell(firstColumn + 1)),
                    getValueForXLS(row.getCell(firstColumn + 2)),
                    getValueForXLS(row.getCell(firstColumn + 3))));
        }
    }

    /** Builds the property map for one record; the keys match the bean's property names. */
    private static Map<String, Object> newRecord(String month, String currency, String currencyCN,
                                                 String unit, String rate) {
        Map<String, Object> record = new HashMap<String, Object>();
        record.put("month", month);
        record.put("currency", currency);
        record.put("currencyCN", currencyCN);
        record.put("unit", unit);
        record.put("rate", rate);
        return record;
    }

    /** Prints the records as JSON and binds that JSON to a list of beans. */
    private List<ConversionRateToDollar> toBeans(List<Map<String, Object>> records) {
        // Serialise once and reuse the text for both the log line and the binding.
        String json = JSON.toJSONString(records);
        System.out.println("list = " + json);
        return JSONObject.parseArray(json, ConversionRateToDollar.class);
    }

    /** Returns the text of an .xlsx cell: "" for a missing cell, otherwise its boolean, numeric or string value. */
    private String getValue(XSSFCell xSSFCell){
        if (null == xSSFCell) {
            return "";
        }
        int cellType = xSSFCell.getCellType();
        if (cellType == XSSFCell.CELL_TYPE_BOOLEAN) {
            return String.valueOf(xSSFCell.getBooleanCellValue());
        } else if (cellType == XSSFCell.CELL_TYPE_NUMERIC) {
            return String.valueOf(xSSFCell.getNumericCellValue());
        } else {
            return String.valueOf(xSSFCell.getStringCellValue());
        }
    }

    /** Returns the text of an .xls cell: "" for a missing cell, otherwise its boolean, numeric or string value. */
    private String getValueForXLS(HSSFCell hSSFCell){
        if (null == hSSFCell) {
            return "";
        }
        int cellType = hSSFCell.getCellType();
        if (cellType == HSSFCell.CELL_TYPE_BOOLEAN) {
            return String.valueOf(hSSFCell.getBooleanCellValue());
        } else if (cellType == HSSFCell.CELL_TYPE_NUMERIC) {
            return String.valueOf(hSSFCell.getNumericCellValue());
        } else {
            return String.valueOf(hSSFCell.getStringCellValue());
        }
    }
}
package com.crawler;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;

/**
 * Scheduled crawler: opens the index page, follows the newest entry to its detail page, downloads
 * the spreadsheet linked there and parses it with {@link ExcelUtil}.
 */
@Component
public class GetConversionRateToDollar {

    /** Scheme and host prepended to the site-relative links found in the pages. */
    private static final String URL_HEADER = "http://www.safe.gov.cn";

    /** Index page that lists one entry per month. */
    private static final String INDEX_URL = URL_HEADER + "/safe/gzhbdmyzslb/index.html";

    /** Directory plus file-name prefix of the downloaded spreadsheet. */
    private static final String TARGET_PATH_PREFIX = "E:\\rmbrate\\rate";

    /**
     * Runs one crawl. The cron expression fires every five minutes (a test setting).
     *
     * <p>I/O failures are reported on standard error and end the current run; they are not
     * propagated. The {@code throws} clause is kept for signature compatibility.
     *
     * @throws IOException never thrown by the current implementation
     */
    @Scheduled(cron = "0 0/5 * * * *")
    public void getConversionRateToDollar() throws IOException {
        // One client serves all three sequential requests and is always closed.
        try (CloseableHttpClient client = HttpClients.createDefault()) {
            String indexHtml = fetchHtml(client, INDEX_URL);
            if (indexHtml == null) {
                return;
            }
            String detailHref = findLatestDetailHref(Jsoup.parse(indexHtml));
            if (detailHref == null) {
                System.err.println("No entry link found on " + INDEX_URL);
                return;
            }

            String detailUrl = toAbsoluteUrl(detailHref);
            String detailHtml = fetchHtml(client, detailUrl);
            if (detailHtml == null) {
                return;
            }
            String fileHref = findSpreadsheetHref(Jsoup.parse(detailHtml));
            if (fileHref == null) {
                System.err.println("No spreadsheet link found on " + detailUrl);
                return;
            }

            String fileUrl = toAbsoluteUrl(fileHref);
            String fileType = extensionOf(fileUrl);

            // The download time goes into the file name so every scheduled run writes a distinct file.
            String date = new SimpleDateFormat("yyyy年MM月dd日HH_mm_ss").format(new Date());
            String filePath = TARGET_PATH_PREFIX + date + fileType;

            if (download(client, fileUrl, new File(filePath))) {
                parseAndReport(filePath, fileType);
            }
        } catch (IOException e) {
            // Best effort: a failed run must not break the following scheduled runs.
            e.printStackTrace();
        }
    }

    /** Fetches a page as UTF-8 text; returns null when the status is not 200 or there is no body. */
    private String fetchHtml(CloseableHttpClient client, String url) throws IOException {
        HttpGet request = new HttpGet(url);
        System.out.println(request.getRequestLine());
        // Closing the response releases the underlying connection.
        try (CloseableHttpResponse response = client.execute(request)) {
            System.out.println("status:" + response.getStatusLine());
            HttpEntity entity = response.getEntity();
            if (response.getStatusLine().getStatusCode() != 200 || entity == null) {
                return null;
            }
            System.out.println("contentEncoding:" + entity.getContentEncoding());
            return EntityUtils.toString(entity, "UTF-8");
        }
    }

    /** Returns the href of the first entry in the "list_conr" list (ul > li > dt > a), or null if absent. */
    private String findLatestDetailHref(Document indexPage) {
        Elements lists = indexPage.getElementsByClass("list_conr").select("ul");
        if (lists.isEmpty()) {
            return null;
        }
        Elements entries = lists.get(0).select("li");
        if (entries.isEmpty()) {
            return null;
        }
        Element title = entries.get(0).selectFirst("dt");
        Element link = title == null ? null : title.selectFirst("a");
        return link == null ? null : link.attr("href");
    }

    /** Returns the href of the first link in the third paragraph of "detail_content", or null if absent. */
    private String findSpreadsheetHref(Document detailPage) {
        Element content = detailPage.getElementsByClass("detail_content").first();
        Element container = content == null ? null : content.select("div").first();
        if (container == null) {
            return null;
        }
        Elements paragraphs = container.select("p");
        if (paragraphs.size() < 3) {
            return null;
        }
        Element link = paragraphs.get(2).select("a").first();
        return link == null ? null : link.attr("href");
    }

    /** Prefixes site-relative links with the host; absolute http(s) links are returned unchanged. */
    private static String toAbsoluteUrl(String href) {
        if (href.startsWith("http://") || href.startsWith("https://")) {
            return href;
        }
        return URL_HEADER + href;
    }

    /** Returns the extension including the dot (e.g. ".xlsx"), or "" when the last path segment has none. */
    private static String extensionOf(String url) {
        int dot = url.lastIndexOf('.');
        // A dot before the last '/' belongs to the host or a directory, not to the file name.
        return dot > url.lastIndexOf('/') ? url.substring(dot) : "";
    }

    /** Saves the response body of url to target; returns false when nothing was saved. */
    private boolean download(CloseableHttpClient client, String url, File target) throws IOException {
        try (CloseableHttpResponse response = client.execute(new HttpGet(url))) {
            HttpEntity entity = response.getEntity();
            if (response.getStatusLine().getStatusCode() != 200 || entity == null) {
                System.err.println("Download failed for " + url + ": " + response.getStatusLine());
                return false;
            }
            // copyToFile leaves the source stream open, so it is closed here.
            try (InputStream body = entity.getContent()) {
                FileUtils.copyToFile(body, target);
            }
            return true;
        }
    }

    /** Parses the saved spreadsheet according to its extension and prints the number of records. */
    private void parseAndReport(String filePath, String fileType) {
        List<ConversionRateToDollar> conversionRateToDollarList = new ArrayList<ConversionRateToDollar>();
        ExcelUtil excelUtil = new ExcelUtil();
        try {
            if (".xlsx".equals(fileType)) {
                conversionRateToDollarList = excelUtil.importExcelActionForXLSX(filePath);
            } else if (".xls".equals(fileType)) {
                conversionRateToDollarList = excelUtil.importExcelActionForXLS(filePath);
            }
            System.out.println(conversionRateToDollarList.size());
        } catch (Exception e) {
            // Parsing problems are reported but do not abort the scheduled task.
            e.printStackTrace();
        }
    }
}

package com.crawler;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.EnableScheduling;

/**
 * Spring Boot entry point. {@code @EnableScheduling} switches on the processing of
 * {@code @Scheduled} methods declared on Spring beans.
 */
@SpringBootApplication
@EnableScheduling
public class GetExcelByHref {

    /**
     * Boots the application with this class as the primary configuration source.
     *
     * @param args command-line arguments forwarded to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication application = new SpringApplication(GetExcelByHref.class);
        application.run(args);
    }
}

运行结果

每五分钟成功执行
每五分钟执行 获得了表
HttpClient+Jsoup抓取页面下载表格文件,fastjson和WorkBook处理表格数据,Spring Schedule和cron表达式开启定时任务_第4张图片
输出了json

你可能感兴趣的:(HttpClient+Jsoup抓取页面下载表格文件,fastjson和WorkBook处理表格数据,Spring Schedule和cron表达式开启定时任务)