Spring Boot + JSoup 抓取京东商品信息

需求分析

  1. 导入京东商品URL列表
  2. 生成京东商品信息并输出到excel表

思路

  1. 读取excel获取URL列表
  2. 访问url并获得HTML源码
  3. 提取对应的商品信息字段
  4. 输出到excel

搭建框架

  1. 创建Spring Boot工程

    Spring Boot + JSoup 抓取京东商品信息_第1张图片

    Spring Boot + JSoup 抓取京东商品信息_第2张图片

  2. 选择依赖

    Spring Boot + JSoup 抓取京东商品信息_第3张图片

  3. 配置pom.xml

    
    
        4.0.0
    
        com.landawang
        operation-web
        0.0.1-SNAPSHOT
        jar
    
        operation-web
        Demo project for Spring Boot
    
        
            org.springframework.boot
            spring-boot-starter-parent
            2.0.3.RELEASE
             
        
    
        
            UTF-8
            UTF-8
            10
        
    
        
            
                org.springframework.boot
                spring-boot-starter-web
            
    
            
                org.springframework.boot
                spring-boot-devtools
                runtime
            
            
                org.springframework.boot
                spring-boot-starter-test
                test
            
    
            
            
                org.springframework.boot
                spring-boot-starter-thymeleaf
            
    
    
            
                org.apache.poi
                poi-ooxml
                3.10-FINAL
            
    
            
                org.apache.httpcomponents
                httpclient
                4.5.1
            
    
            
                
                org.jsoup
                jsoup
                1.11.3
            
        
    
        
            
                
                    org.springframework.boot
                    spring-boot-maven-plugin
                
            
        
    
    
    
  4. 重命名application.properties为application.yml,修改内容

    server:
      port: 8080 # 配置端口
    
    spring:
      thymeleaf:
        # 开发时关闭缓存,不然无法看到实时页面
        cache: false
        prefix: classpath:/templates/
        suffix: .html
    
  5. 创建resources/templates/index.html

    
    
    
        
        
        生成商品信息
    
    
    
    

    请选择excel文件

  6. 创建PageController并访问页面

    /**
     * Created with IntelliJ IDEA.
     * Project: operation-web
     * Package: com.landawang.operationweb.controller
     * User: Wangxin
     * Date: 2018/7/12
     * Time: 16:46
     * Copyright © 2018年 黄旺鑫. All rights reserved.
     */
    
    
    package com.landawang.operationweb.controller;
    
    import org.springframework.stereotype.Controller;
    import org.springframework.web.bind.annotation.RequestMapping;
    
    @Controller
    public class PageController {
    
        @RequestMapping("/")
        public String index() {
            return "index";
        }
    }
    

提取商品信息

  1. ExcelController.java,编写实现逻辑

    /**
     * Created with IntelliJ IDEA.
     * Project: operation-web
     * Package: com.landawang.operationweb.controller
     * User: Wangxin
     * Date: 2018/7/12
     * Time: 17:26
     * Copyright © 2018年 黄旺鑫. All rights reserved.
     */
    
    
    package com.landawang.operationweb.controller;
    
    import org.apache.http.Consts;
    import org.apache.http.HttpEntity;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.util.EntityUtils;
    import org.apache.poi.xssf.usermodel.*;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    import org.openxmlformats.schemas.spreadsheetml.x2006.main.STCellType;
    import org.springframework.web.bind.annotation.PostMapping;
    import org.springframework.web.bind.annotation.RequestMapping;
    import org.springframework.web.bind.annotation.RequestParam;
    import org.springframework.web.bind.annotation.RestController;
    import org.springframework.web.multipart.MultipartFile;
    
    import javax.servlet.http.HttpServletResponse;
    import java.util.Date;
    import java.util.HashMap;
    import java.util.Map;
    
    
    @RestController
    @RequestMapping("/excel")
    public class ExcelController {
    
        @PostMapping("/upload")
        public void uploadExcel(HttpServletResponse response, @RequestParam("file") MultipartFile file) {
    
            /**
             * 1. 获取上传的url列表
             * 2. 遍历获取url对应页面的HTML源码
             * 3. 提取对应的商品信息字段
             * 4. 输出的excel
             * */
    
            try {
                // 读取Excel文件
                XSSFWorkbook workbook = new XSSFWorkbook(file.getInputStream());
                // 读取Excel工作表
                XSSFSheet sheet = workbook.getSheetAt(0);
    
    
                // 创建输出Excel文件
                XSSFWorkbook newWorkbook = new XSSFWorkbook();
                // 创建Sheet
                XSSFSheet newSheet = newWorkbook.createSheet();
                // 创建标题行
                XSSFRow titleRow = newSheet.createRow(0);
                // 设置标题行
                XSSFCell cell1 = titleRow.createCell(0, STCellType.INT_STR);
                cell1.setCellValue("商品编码");
                XSSFCell cell2 = titleRow.createCell(1, STCellType.INT_STR);
                cell2.setCellValue("商品名称");
                XSSFCell cell3 = titleRow.createCell(2, STCellType.INT_STR);
                cell3.setCellValue("商品分类");
                // 设置宽度
                newSheet.setColumnWidth(0, 2560);
                newSheet.setColumnWidth(1, 25600);
                newSheet.setColumnWidth(2, 5120);
    
                // 遍历获取HTML源码,提取信息
                for (int i = 0; i < sheet.getLastRowNum(); i++) {
                    // 获取行
                    XSSFRow row = sheet.getRow(i);
                    // 获取列
                    XSSFCell cell = row.getCell(0);
                    // 获取url
                    String url = cell.getStringCellValue();
    
                    // 输出的Excel创建行
                    XSSFRow newRow = newSheet.createRow(i + 1);
    
                    // 判断url不为空并且包含http
                    if (!url.isEmpty() && url.contains("http")) {
                        // 获取商品信息集合
                        Map data = getProductInfo(url);
    
                        // 输出商品信息到Excel表
                        if (data != null) {
                            XSSFCell cellOne = newRow.createCell(0, STCellType.INT_STR);
                            cellOne.setCellValue(data.get("sku"));
                            XSSFCell cellTwo = newRow.createCell(1, STCellType.INT_STR);
                            cellTwo.setCellValue(data.get("name"));
                            XSSFCell cellThree = newRow.createCell(2, STCellType.INT_STR);
                            cellThree.setCellValue(data.get("cat"));
                        }
                    }
    
                    // 打印调试
                    System.out.println("\n内容是:" + url);
                }
    
                // 下载excel
                response.setContentType("application/octet-stream");
                // 以时间戳命名
                String fileName = String.valueOf(new Date().getTime()) + ".xlsx";
                response.setHeader("Content-disposition", "attachment;filename=" + fileName);
                response.flushBuffer();
    
                // 输出excel
                newWorkbook.write(response.getOutputStream());
    
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    
    
        /**
         * 提取商品信息
         * */
        private Map getProductInfo(String url) throws Exception {
    
    
            CloseableHttpClient httpclient = HttpClients.createDefault();
            HttpGet httpGet = new HttpGet(url);
    
            // 模拟浏览器浏览
            httpGet.setHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0");
            CloseableHttpResponse response1 = httpclient.execute(httpGet);
            // The underlying HTTP connection is still held by the response object
            // to allow the response content to be streamed directly from the network socket.
            // In order to ensure correct deallocation of system resources
            // the user MUST call CloseableHttpResponse#close() from a finally clause.
            // Please note that if response content is not fully consumed the underlying
            // connection cannot be safely re-used and will be shut down and discarded
            // by the connection manager.
    
            // 结果集合
            Map reslut = null;
    
            //获取响应状态码
            int StatusCode = response1.getStatusLine().getStatusCode();
    
            try {
    
                HttpEntity entity1 = response1.getEntity();
    
                //如果状态响应码为200,则获取html实体内容或者json文件
                if(StatusCode == 200){
                    String html = EntityUtils.toString(entity1, Consts.UTF_8);
    
                    // 提取HTML得到商品信息结果
                    reslut = getData(html);
                    
                    // 消耗掉实体
                    EntityUtils.consume(response1.getEntity());
                }else {
                    //否则,消耗掉实体
                    EntityUtils.consume(response1.getEntity());
                }
            } finally {
                response1.close();
            }
    
            return reslut;
        }
    
        private static Map getData (String html) throws Exception{
            //获取的数据,存放在集合中
            Map data = new HashMap();
            
            //采用Jsoup解析
            Document doc = Jsoup.parse(html);
            
            //获取html标签中的内容
            // 标题
            String name = doc.select("div[class=sku-name]").text();
            if (name != null) {
                data.put("name", name);
            }
    
            // sku
            String sku = "";
            Elements elements = doc.select("a[data-sku]");
            for (Element ele: elements) {
                if (ele.hasAttr("data-sku")) {
                    sku = ele.attr("data-sku");
                    break;
                }
            }
    
            if (sku != null) {
                data.put("sku", sku);
            }
    
            String cat = doc.select("a[clstag=shangpin|keycount|product|mbNav-1]").text();
    
            if (cat != null) {
                data.put("cat", cat);
            }
    
            System.out.print(sku + "---------" + cat + "---------" + name);
            
            //返回数据
            return data;
        }
    
    }
    
  2. 解析HTML参考:https://jsoup.org

  3. 网络请求参考:http://hc.apache.org/httpcomponents-client-4.5.x/quickstart.html

测试

  1. 准备excel表
    Spring Boot + JSoup 抓取京东商品信息_第4张图片

  2. 访问http://localhost:8080,选取excel表
    Spring Boot + JSoup 抓取京东商品信息_第5张图片

  3. 获取时间戳命名的excel表
    Spring Boot + JSoup 抓取京东商品信息_第6张图片

你可能感兴趣的:(Java)