凌晨不睡觉 用爬虫抓取京东数据

在酒店隔离第10天
凌晨的厦门还是有些凉,我把黑色卫衣的帽子戴上
仿佛我是名在网络上为维护世界和平而重拳出击的黑客......
直到看了看我的发量.. 想想算了还是当个普通人比较好

前阵子学了爬虫,一直没用来实战
突然想试试, 我说可以, 很快啊
上来就是一左重蹬 右鞭腿 左刺拳..

爬虫有啥用

大数据时代, 爬虫用来获取互联网中的有价值的数据
比如爬取微博 积木的小姐姐啦
分析Pornhub有没有你女朋友的视频啦

爬到的数据库截图

image.png

建表语句


/*
 Source Server Type    : MySQL
 Source Server Version : 50730
*/
-- Use 4-byte UTF-8 so product names containing emoji survive insertion.
SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;
DROP TABLE IF EXISTS `jd_item`;
-- Crawled JD.com product items: one row per SKU, grouped by SPU.
CREATE TABLE `jd_item` (
  `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '主键',
  `create_time` timestamp NULL DEFAULT NULL COMMENT '创建时间',
  `type` bigint(1) DEFAULT NULL COMMENT '类型',
  `sku` bigint(20) DEFAULT NULL COMMENT '最小商品单元',
  `spu` bigint(20) DEFAULT NULL COMMENT '聚合单元',
  `item_name` text COLLATE utf8mb4_unicode_ci COMMENT '商品名',
  `img` text COLLATE utf8mb4_unicode_ci COMMENT '图片url',
  PRIMARY KEY (`id`) USING BTREE
-- Fixed: table comment previously said '用户' (user), a copy-paste error for a product table.
) ENGINE=InnoDB AUTO_INCREMENT=348 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='京东商品表';

SET FOREIGN_KEY_CHECKS = 1;


建立项目, 上代码

创建maven工程,引入spring boot
pom文件



<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.4.1</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>
    <groupId>com.example</groupId>
    <artifactId>demo</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>JingDongGetDemo</name>
    <description>JingDongGetDemo</description>

    <properties>
        <java.version>1.8</java.version>
    </properties>

    <dependencies>

        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-jpa</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-quartz</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>

        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <scope>runtime</scope>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>

        <!-- jsoup: HTML parsing -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>

        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.6</version>
        </dependency>

        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>

        <!-- commons-lang3: string utilities -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.9</version>
        </dependency>

        <!-- commons-io: I/O utilities -->
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.6</version>
        </dependency>

    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
                <configuration>
                    <excludes>
                        <exclude>
                            <groupId>org.projectlombok</groupId>
                            <artifactId>lombok</artifactId>
                        </exclude>
                    </excludes>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>





封装http请求类

package com.example.demo.utils;

import lombok.extern.slf4j.Slf4j;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.apache.tomcat.jni.OS;
import org.springframework.stereotype.Component;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.UUID;

/**
 * 封装http请求类
 */
@Component
@Slf4j
public class HttpUtils {
    String cookie = "__jdu=15964473360901187258403; shshshfpa=fadd29f9-813e-e8c5-fad1-5e70d45f5276-1596617912; shshshfpb=lgnFkT9%20fxrnZdfzawNuExA%3D%3D; o2State={%22webp%22:true}; pinId=sAzz8XNBRxQF0-GrxufAZLV9-x-f3wj7; pin=jd_49802f10b0cfc; unick=jd_49802f10b0cfc; _tp=yfPXFZws%2F1K7xphvbgdzaamZ1L9rpl0RAQIHmzRCPQM%3D; _pst=jd_49802f10b0cfc; user-key=a109a42d-1a2b-49d3-bd98-53783efeaa94; areaId=18; ipLoc-djd=18-1482-1485-49034; cn=20; unpl=V2_ZzNtbUtVQUFwDhZQeBpfVmIFGwhLAkIdcAhAU3lLXAViV0UKclRCFnQURldnGV8UZAIZXUFcQhxFCENkexhdBWMGEV5EVnMlMEsWBi8FXAdkAxJURlVAEXEIRVV%2bEFsAZjMRXXJWcxVxAEFRfh9eB2ACGlxCV0oTdgxPUXobbDVnCxZtQlZEFnQOQlN%2fGl41sa2GiMromqXQ3uz6rpTs0O6oxuPEZ0McdQpOVXoZWwZXAiJcchYtFXYARlR4VFwBbwQXWERVQRJ0AEdUexBaBmMKF1xAZ0Ildg%3d%3d; __jdv=122270672|www.zhihu.com|t_1001542270_1003231966_4000320817_3002725071|tuiguang|833e47a5222b478d8d0941763c114eff|1607775130483; PCSYCityID=CN_350000_350200_350203; TrackID=1vODXrK7bZO2EpDh1Nm6sfAqCk-Gs3dPa04wZrr88-S-XPgS5RmPdRgOkuyjgDCzx2l0bqptjiyuNbfelGhkPs7DZ-HlXTRWLdRgbeyueJDoB9TE9EWxFy0YQZ5CZBDGe; thor=762FF186473632AA10BE04F04DD8167499B0ED4286B401207CEE4E033A5989B5862D74A22CEE3390783A6D473BFD74E494C4013BBEA8D5C4D930A373DE163CDDDB5E5902ED9FF16E0B0AEAD53CB7EC3DE05C85A08AE5187A72BB58F825939F3DFB0A273D903500254B0E25CEC29CF7967E7C477315C18A139D9127C91D116828DCF057F4BDB76479B81213D06249FF700DD79CF1ED15C4E4BDCF3E9A6DFB9219; ceshi3.com=000; __jda=76161171.15964473360901187258403.1596447336.1608198065.1608206528.119; __jdb=76161171.6.15964473360901187258403|119.1608206528; __jdc=76161171; shshshfp=96550553530deab8d4b189c0755c3ff2; shshshsID=a55dfa051705e3f0f93c49319b54e32d_3_1608207381885";

    /**
     * 连接池管理器
     */
    private PoolingHttpClientConnectionManager cm;

    /**
     * 构造器
     */
    public HttpUtils() {
        this.cm = new PoolingHttpClientConnectionManager();

        // 设置最大连接数
        this.cm.setMaxTotal(100);
        // 每个主机数连接数
        this.cm.setDefaultMaxPerRoute(10);
    }

    /**
     * 根据url下载数据
     *
     * @param url
     * @return
     */
    public String doGet(String url) {

        // 获取http client对象
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(this.cm).build();

        // 创建http请求对象 设置url地址
        HttpGet httpGet = new HttpGet(url);
//        httpGet.setHeader("Cookie",  cookie);
        httpGet.setHeader("User-Agent","Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36");
        httpGet.setConfig(this.getConfig());

        CloseableHttpResponse response = null;
        try {

            // 使用http client
            response = httpClient.execute(httpGet);

            // 发起请求获取响应
            if (response.getStatusLine().getStatusCode() == 200) {

                // 判断响应体是否不为空
                if (response.getEntity() != null) {
                    String string = EntityUtils.toString(response.getEntity(), "utf-8");
                    return string;
                } else {
                    return "";
                }

            }

        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }

        return "";
    }

    /**
     * 设置请求配置
     *
     * @return
     */
    private RequestConfig getConfig() {

        RequestConfig requestConfig = RequestConfig.custom()
                // 创建连接时间
                .setConnectTimeout(1000)
                // 获取连接的最长时间
                .setConnectionRequestTimeout(1000)

                // 获取数据时间
                .setSocketTimeout(10000)
                .build();
        return requestConfig;
    }

    /**
     * 下载图片
     *
     * @param url
     * @return 图片硬盘地址
     */
    public String doGetImg(String url) {

        // 获取http client对象
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(this.cm).build();

        // 创建http请求对象 设置url地址
        HttpGet httpGet = new HttpGet(url);

        httpGet.setConfig(this.getConfig());

        CloseableHttpResponse response = null;
        try {

            // 使用http client
            response = httpClient.execute(httpGet);

            // 发起请求获取响应
            if (response.getStatusLine().getStatusCode() == 200) {

                // 判断响应体是否不为空
                if (response.getEntity() != null) {

                    // 获取图片的后缀
                    String extImg = url.substring(url.lastIndexOf("."));

                    // 重命名图片
                    String picName = UUID.randomUUID().toString() + extImg;

                    // 下载图片
                    File file = new File("/Users/giaogiao/Documents/hewei/code/myCode/JingDongGetDemo/src/main/resources/static/img/" + picName);
                    OutputStream outputStream = new FileOutputStream(file);
                    response.getEntity().writeTo(outputStream);

                    // 返回图片名称
                    return picName;

                } else {
                    return "";
                }
            }

        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return "";
    }

}

爬虫定时任务, 核心代码


package com.example.demo.task;

import com.example.demo.pojo.JdItem;
import com.example.demo.service.JdItemService;
import com.example.demo.utils.HttpUtils;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

import java.sql.Date;

/**
 * Scheduled crawler task: fetches a JD search-result page, parses the product
 * grid with jsoup, downloads each item's image, and persists each SKU via
 * {@link JdItemService}.
 */
@Component
@Slf4j
public class ItemTask {

    @Autowired
    private HttpUtils httpUtils;

    @Autowired
    private JdItemService jdItemService;

    /**
     * Runs 100 seconds after the previous run completes.
     * Keyword in the URL is URL-encoded "手机" (mobile phone).
     */
    @Scheduled(fixedDelay = 100 * 1000)
    public void task() {

        log.debug("task....");

        // Page 1 of the search results for "手机".
        String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&wq=%E6%89%8B%E6%9C%BA&page=1&s=1&click=0";

        String htmlString = httpUtils.doGet(url);
        log.debug("htmlString:" + htmlString);

        // BUGFIX: parse() was never invoked, so nothing was ever saved.
        if (!htmlString.isEmpty()) {
            parse(htmlString);
        }

        log.debug("完成");
    }

    /**
     * Parses the search-result HTML and saves one {@link JdItem} per SKU.
     *
     * @param htmlString raw HTML of a JD search-result page
     */
    private void parse(String htmlString) {
        // Each <li> under #J_goodsList > ul is one product (SPU) card.
        Elements goodsLists = Jsoup.parse(htmlString).select("#J_goodsList > ul > li");

        for (Element element : goodsLists) {

            // Each ps-item <li> is one SKU variant (e.g. a color) of the product.
            Elements items = element.select("li.ps-item");

            for (Element item : items) {

                // Guard: a missing data-sku previously made Long.valueOf("") throw.
                String skuAttr = item.select("[data-sku]").attr("data-sku");
                if (skuAttr.isEmpty()) {
                    log.warn("Skipping item without data-sku");
                    continue;
                }
                Long sku = Long.valueOf(skuAttr);

                String color = item.select("[title]").attr("title");
                String title = color + element.select("em").text();
                String img = item.select("[data-lazy-img]").attr("data-lazy-img");

                // SPU falls back to the SKU when the card carries no data-spu.
                String spuAttr = element.attr("data-spu");
                Long spu = spuAttr.isEmpty() ? sku : Long.valueOf(spuAttr);

                JdItem jdItem = new JdItem();
                jdItem.setCreateTime(new Date(System.currentTimeMillis()));
                jdItem.setType(0L);
                jdItem.setSku(sku);
                jdItem.setSpu(spu);
                jdItem.setItemName(title);
                // Search pages emit protocol-relative URLs ("//img...."); prefix the scheme.
                jdItem.setImg(httpUtils.doGetImg("https:" + img));

                jdItemService.save(jdItem);
            }
        }
    }
}

本项目爬取京东只是一个引子,一个最基础的案例, 将来还会想到其他比较好玩的项目,我再来分享

本项目优化和改进的点:

  • 比较简易,没有使用爬虫框架 如webmagic
  • 没有多线程优化

你可能感兴趣的:(凌晨不睡觉 用爬虫抓取京东数据)