Web Crawler Study Notes: HttpClient + Jsoup + WebMagic

Web crawler · HttpClient · Jsoup · WebMagic

  • I. Getting-started programs
    • 1. HttpClient for network requests
    • 2. Jsoup page parsing
  • II. Small case study: crawling JD phone data
  • III. Learning and using WebMagic
    • 1. Basic concepts
    • 2. Case studies
      • 1. Crawling attribute values from page tags
      • 2. Crawling computer-software-industry listings from a job site into MySQL
      • 3. Crawling all POI data from a site into ES
    • 3. Data deduplication algorithms
    • 4. Using proxies to get around anti-crawling measures

Appendix: links to crawler-related documentation and notes

I. Getting-started programs

1. HttpClient for network requests

1. Add the dependency

<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.10</version>
</dependency>

2. A first crawler

import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class TestDemo {

    public static void main(String[] args) throws Exception {

        // Case 1: getting started -- send a GET request and fetch the raw page
        /*CloseableHttpClient closeableHttpClient = HttpClients.createDefault();
        URIBuilder uriBuilder = new URIBuilder("http://www.itcast.cn/search");
        uriBuilder.setParameter("keys", "java");
        HttpGet httpGet = new HttpGet(uriBuilder.build());
        CloseableHttpResponse response = closeableHttpClient.execute(httpGet);
        if (response.getStatusLine().getStatusCode() == 200) {
            HttpEntity entity = response.getEntity();
            String s = EntityUtils.toString(entity, "utf-8");
            System.out.println(s);
        }
        // close the response and the client
        response.close();
        closeableHttpClient.close();*/

        // Case 2: simulate a form POST request
        /*CloseableHttpClient httpClient = HttpClients.createDefault();
        HttpPost httpPost = new HttpPost("http://www.itcast.cn/search");
        List<NameValuePair> params = new ArrayList<>();
        params.add(new BasicNameValuePair("keys", "java"));
        UrlEncodedFormEntity urlEncodedFormEntity = new UrlEncodedFormEntity(params, "utf-8");
        httpPost.setEntity(urlEncodedFormEntity);
        CloseableHttpResponse response = httpClient.execute(httpPost);
        if (response.getStatusLine().getStatusCode() == 200) {
            HttpEntity entity = response.getEntity();
            String s = EntityUtils.toString(entity, "utf-8");
            System.out.println(s);
        }
        response.close();
        httpClient.close();*/

        // Case 3: connection pool manager
        PoolingHttpClientConnectionManager pool = new PoolingHttpClientConnectionManager();
        pool.setMaxTotal(10); // at most 10 connections in the pool
        doGet(pool);
    }

    private static void doGet(PoolingHttpClientConnectionManager pool) {
        // the client borrows connections from the shared pool; closing the
        // client would shut the pool down, so it is left open here
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(pool).build();
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity entity = response.getEntity();
                String s = EntityUtils.toString(entity, "utf-8");
                System.out.println(s);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
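One caveat with the pooled setup above: by default, closing a CloseableHttpClient also shuts down its connection manager, which would kill the shared pool for every other client built on it. A sketch of the alternative, assuming HttpClient 4.4+ where the builder can mark the manager as shared:

// build a client on the shared pool without letting client.close()
// tear the pool down
CloseableHttpClient httpClient = HttpClients.custom()
        .setConnectionManager(pool)
        .setConnectionManagerShared(true) // pool outlives this client
        .build();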

2. Jsoup page parsing

1. Add the dependency

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.2</version>
</dependency>

2. A small demo

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.net.URL;

public class JsoupDemo {

    public static void main(String[] args) throws Exception {
        // Jsoup.parse() is overloaded: it accepts a file, an HTML string,
        // or a URL, and turns any of them into a Document
        Document document = Jsoup.parse(new URL("http://47.97.200.76/login"), 10000);
        String title = document.getElementsByTag("title").first().text();
        System.out.println("title =========> " + title);
    }
}
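To make the comment about Jsoup.parse() concrete, here is a minimal sketch of the three overloads side by side; the file path and the HTML string are made-up placeholders:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.File;
import java.net.URL;

public class JsoupParseOverloads {

    public static void main(String[] args) throws Exception {
        // 1. from a local file (hypothetical path), decoded as UTF-8
        Document fromFile = Jsoup.parse(new File("test.html"), "utf-8");

        // 2. from an in-memory HTML string
        Document fromString = Jsoup.parse("<html><head><title>demo</title></head></html>");

        // 3. from a URL with a 10-second timeout (same as the demo above)
        Document fromUrl = Jsoup.parse(new URL("http://47.97.200.76/login"), 10000);

        System.out.println(fromString.title());
    }
}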

// el#id: element + id, e.g. h3#city_bj
String str = document.select("h3#city_bj").text();

// el.class: element + class, e.g. li.class_a
str = document.select("li.class_a").text();

// el[attr]: element + attribute name, e.g. span[abc]
str = document.select("span[abc]").text();

// any combination, e.g. span[abc].s_name
str = document.select("span[abc].s_name").text();

// ancestor child: descendants of an element, e.g. .city_con li
// finds every li anywhere under "city_con"
str = document.select(".city_con li").text();

// parent > child: direct children of a parent,
// e.g. .city_con > ul > li first finds the ul elements that are direct
// children of city_con, then the li that are direct children of those ul
str = document.select(".city_con > ul > li").text();

// parent > *: all direct children of a parent, e.g. .city_con > *
str = document.select(".city_con > *").text();
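All of the snippets above read element text with text(); attribute values are read the same way with attr(). A minimal sketch, reusing the made-up span[abc].s_name selector from above (requires import org.jsoup.nodes.Element):

// first() may return null when nothing matches, so guard before reading
Element el = document.select("span[abc].s_name").first();
if (el != null) {
    String abcValue = el.attr("abc"); // value of the abc attribute
    System.out.println(abcValue);
}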

II. Small case study: crawling JD phone data

The case breaks down into three steps (a combined sketch follows the list):

1. Fetch the data with HttpClient
2. Parse the data with Jsoup
3. Store the data
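A minimal sketch of the three-step pipeline. The search URL and the li.gl-item / .p-name / .p-price selectors are assumptions for illustration; the real JD page structure will differ, and step 3 just prints instead of writing to a database:

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class JdPhoneSketch {

    public static void main(String[] args) throws Exception {
        // 1. fetch the raw page with HttpClient (assumed URL)
        CloseableHttpClient httpClient = HttpClients.createDefault();
        HttpGet httpGet = new HttpGet("https://search.jd.com/Search?keyword=phone");
        String html;
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            html = EntityUtils.toString(response.getEntity(), "utf-8");
        }

        // 2. parse the HTML with Jsoup (assumed selectors)
        Document document = Jsoup.parse(html);
        for (Element item : document.select("li.gl-item")) {
            String title = item.select(".p-name em").text();
            String price = item.select(".p-price i").text();

            // 3. store the data -- printed here; a real run would write to a DB
            System.out.println(title + " | " + price);
        }
    }
}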

III. Learning and using WebMagic

1. Basic concepts

(Three diagrams illustrating WebMagic's basic concepts appeared here in the original notes.)
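In brief, WebMagic's core mirrors Scrapy's design and splits a crawler into four components: the Downloader fetches pages, the PageProcessor (the part you write) parses them, the Scheduler manages the URL queue and deduplication, and Pipelines persist the results. A Spider wires the four together, while Request, Page, and ResultItems objects carry data between them.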

2. Case studies

1. Crawling attribute values from page tags

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.scheduler.component.BloomFilterDuplicateRemover;

public class WebMagicTest implements PageProcessor {

    private Site site = Site.me()
            .setTimeOut(1000 * 10)       // request timeout, in ms
            .setCharset("utf8")          // page charset
            .setRetrySleepTime(1000 * 3) // wait before retrying a failed request, in ms
            .setSleepTime(3);            // pause between requests, in ms

    public static void main(String[] args) {
        Spider.create(new WebMagicTest())
                //.addPipeline(new FilePipeline("E:\\STUDY\\study\\jsoup-crawlers\\src\\main\\resources\\static"))// saving results to a file needs an explicit pipeline; by default results are printed to the console
                .addUrl("http://ace.piesat.cn/login.xhtml")
                .thread(3)// process with 3 threads
                .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(10000000)))// use a Bloom filter for URL deduplication
                .run();
    }

    @Override
    public void process(Page page) {
        // 1. use a CSS selector to parse the page and grab what sits inside
        //    the element with class=mod_conatiner (the original note breaks
        //    off mid-sentence here, so this line is a minimal reconstruction)
        page.putField("content", page.getHtml().css(".mod_conatiner").all());
    }

    // PageProcessor also requires getSite(); the original snippet was cut
    // off before it
    @Override
    public Site getSite() {
        return site;
    }
}
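Fields collected with page.putField(...) travel to pipelines as ResultItems; the commented-out FilePipeline above would write them to disk. A minimal custom Pipeline sketch, where the "content" key matches the field name used in process(...) above:

import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

public class ContentPipeline implements Pipeline {

    @Override
    public void process(ResultItems resultItems, Task task) {
        // every field stored via page.putField(...) in the PageProcessor
        // is readable here by its key
        Object content = resultItems.get("content");
        System.out.println("content => " + content);
    }
}

Register it on the Spider with .addPipeline(new ContentPipeline()) before run().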
