java爬取页面

一、所需环境

1、idea
2、maven 3.9
3、jdk 1.8
4、jsoup 1.7.3

二、环境搭建

1、首先分析页面的布局。我抓取的是CSDN中的文章内容,先打开一篇文章。

2、按F12查看页面布局。我这里只抓取文章下方“上一篇”的链接及其名称,先查看这一部分的页面代码。

java爬取页面_第1张图片

3、可以看到,我们要先找到class为prev_article的元素,然后抓取其中的超链接,以及超链接内的文字。

4、分析后就可以编码了。

5、在idea中新建一个maven项目,名为csdn,结构如下图

java爬取页面_第2张图片

6、pom内容如下


<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.hjduan</groupId>
    <artifactId>csdn</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.7.3</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.3</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>

            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <version>2.18.1</version>
                <configuration>
                    <skipTests>true</skipTests>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>

7、新建一个Data类,用来存放抓取到的内容

package com.hjduan.csdn.model;

/**
 * Plain data holder for one captured hyperlink: its target, its visible text
 * and a caller-supplied time string that {@link #toString()} reports as the
 * capture time.
 */
public class Data {
    private String linkName;
    private String href;
    private String time;

    /** Creates an empty record; every field starts out as {@code null}. */
    public Data() {
    }

    /**
     * Creates a fully populated record.
     *
     * @param time     time string stored with the link
     * @param href     target of the link
     * @param linkName visible text of the link
     */
    public Data(String time, String href, String linkName) {
        this.linkName = linkName;
        this.href = href;
        this.time = time;
    }

    /** @return the stored time string, or {@code null} if none was set */
    public String getTime() {
        return this.time;
    }

    /** @return the link target, or {@code null} if none was set */
    public String getHref() {
        return this.href;
    }

    /** @return the link text, or {@code null} if none was set */
    public String getLinkName() {
        return this.linkName;
    }

    /** @param time time string to store with the link */
    public void setTime(String time) {
        this.time = time;
    }

    /** @param href link target to store */
    public void setHref(String href) {
        this.href = href;
    }

    /** @param linkName link text to store */
    public void setLinkName(String linkName) {
        this.linkName = linkName;
    }

    /**
     * Renders the record as {@code linkName-->href-->} followed by the
     * capture-time phrase; unset fields are rendered as {@code "null"}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append(linkName).append("-->");
        text.append(href).append("-->");
        text.append("在").append(time).append(" 抓取");
        return text.toString();
    }
}

8、新建一个抓取规则类

package com.hjduan.csdn.rule;

/**
 * Describes one crawl request: which URL to fetch, with which HTTP method,
 * and how to pick the interesting elements out of the returned HTML.
 */
public class Rule {
    /** Request-method constant: HTTP GET. */
    public final static int GET = 0;
    /** Request-method constant: HTTP POST. */
    public final static int POST = 1;

    /** Selector-type constant: {@code resultTagName} is a CSS class name. */
    public final static int CLASS = 0;
    /** Selector-type constant: {@code resultTagName} is an element id. */
    public final static int ID = 1;
    /** Selector-type constant: {@code resultTagName} is a selector expression. */
    public final static int SELECTION = 2;

    /** Address of the page to fetch. */
    private String url;

    /** Name used to filter the returned HTML; interpreted according to {@link #type}. */
    private String resultTagName;

    /** How {@link #resultTagName} is interpreted: CLASS / ID / SELECTION. Defaults to ID. */
    private int type = ID;

    /** HTTP method of the request: GET / POST. Defaults to GET. */
    private int requestMoethod = GET;

    /** Creates a rule with no URL or filter, selector type ID and request method GET. */
    public Rule() {
    }

    /**
     * Creates a fully specified rule.
     *
     * @param url            address of the page to fetch
     * @param resultTagName  name used to filter the returned HTML
     * @param type           one of {@link #CLASS}, {@link #ID}, {@link #SELECTION}
     * @param requestMoethod one of {@link #GET}, {@link #POST}
     */
    public Rule(String url, String resultTagName, int type, int requestMoethod) {
        this.url = url;
        this.resultTagName = resultTagName;
        this.type = type;
        this.requestMoethod = requestMoethod;
    }

    /** @return address of the page to fetch */
    public String getUrl() {
        return url;
    }

    /** @param url address of the page to fetch */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return name used to filter the returned HTML */
    public String getResultTagName() {
        return resultTagName;
    }

    /** @param resultTagName name used to filter the returned HTML */
    public void setResultTagName(String resultTagName) {
        this.resultTagName = resultTagName;
    }

    /** @return selector type: {@link #CLASS}, {@link #ID} or {@link #SELECTION} */
    public int getType() {
        return type;
    }

    /** @param type selector type: {@link #CLASS}, {@link #ID} or {@link #SELECTION} */
    public void setType(int type) {
        this.type = type;
    }

    /** @return request method: {@link #GET} or {@link #POST} */
    public int getRequestMethod() {
        return requestMoethod;
    }

    /** @param requestMethod request method: {@link #GET} or {@link #POST} */
    public void setRequestMethod(int requestMethod) {
        this.requestMoethod = requestMethod;
    }

    /**
     * Misspelled legacy name of {@link #getRequestMethod()}, kept so existing
     * callers continue to compile.
     *
     * @return request method: {@link #GET} or {@link #POST}
     */
    public int getRequestMoethod() {
        return getRequestMethod();
    }

    /**
     * Misspelled legacy name of {@link #setRequestMethod(int)}, kept so
     * existing callers continue to compile.
     *
     * @param requestMoethod request method: {@link #GET} or {@link #POST}
     */
    public void setRequestMoethod(int requestMoethod) {
        setRequestMethod(requestMoethod);
    }
}

9、自定义规则异常类

package com.hjduan.csdn.rule;

/**
 * Unchecked exception type of the crawler's rule package. It adds no state of
 * its own and simply mirrors the four standard {@link RuntimeException}
 * constructors.
 */
public class RuleException extends RuntimeException {

    /**
     * Creates an exception with a detail message and an underlying cause.
     *
     * @param message description of the problem
     * @param cause   the exception that triggered this one
     */
    public RuleException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * Creates an exception that wraps an underlying cause.
     *
     * @param cause the exception that triggered this one
     */
    public RuleException(Throwable cause) {
        super(cause);
    }

    /**
     * Creates an exception with a detail message only.
     *
     * @param message description of the problem
     */
    public RuleException(String message) {
        super(message);
    }

    /** Creates an exception with neither message nor cause. */
    public RuleException() {
    }
}

10、核心抓取类

package com.hjduan.csdn.service;


import com.hjduan.csdn.model.Data;
import com.hjduan.csdn.rule.Rule;
import com.hjduan.csdn.rule.RuleException;
import com.hjduan.csdn.utils.DateUtil;
import com.hjduan.csdn.utils.StringUtil;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;


/**
 * Core crawler: fetches the page described by a {@link Rule}, narrows the
 * document down to the elements the rule selects and collects every
 * hyperlink found inside them.
 */
public class GrabService {

    /** Timeout passed to the jsoup connection, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 100000;

    /**
     * Fetches the rule's page and returns one {@link Data} per {@code <a>}
     * element found inside the selected elements.
     *
     * @param rule what to fetch and how to filter it
     * @return the captured links in document order; empty when nothing matched
     *         or when the page could not be fetched
     * @throws RuleException if the rule is {@code null}, its URL is empty or
     *                       not a CSDN blog URL, or its request method is unknown
     */
    public static List<Data> extract(Rule rule) {

        // Reject unusable rules before touching the network.
        validateRule(rule);

        List<Data> datas = new ArrayList<>();
        try {
            Document doc = fetchDocument(rule);
            Elements results = selectResults(doc, rule);

            for (Element result : results) {
                for (Element link : result.getElementsByTag("a")) {
                    Data data = new Data();
                    data.setHref(link.attr("href"));
                    data.setLinkName(link.text());
                    data.setTime(DateUtil.getFormatDate());
                    datas.add(data);
                }
            }
        } catch (IOException e) {
            // Best effort: a failed fetch is reported and yields an empty
            // result instead of aborting the caller's crawl.
            e.printStackTrace();
        }

        return datas;
    }

    /**
     * Downloads the rule's page using the HTTP method the rule asks for.
     *
     * @throws IOException   if the request fails
     * @throws RuleException if the request method is neither GET nor POST
     */
    private static Document fetchDocument(Rule rule) throws IOException {
        Connection conn = Jsoup.connect(rule.getUrl()).timeout(TIMEOUT_MILLIS);
        int requestType = rule.getRequestMoethod();
        switch (requestType) {
            case Rule.GET:
                return conn.get();
            case Rule.POST:
                return conn.post();
            default:
                // Fail with a clear message instead of a NullPointerException
                // on a document that was never fetched.
                throw new RuleException("不支持的请求类型: " + requestType);
        }
    }

    /**
     * Picks the elements of {@code doc} that the rule selects.
     * An empty {@code resultTagName} selects the {@code body} element; an
     * unknown selector type selects nothing.
     */
    private static Elements selectResults(Document doc, Rule rule) {
        String resultTagName = rule.getResultTagName();

        // Checked first so the body fallback applies to every selector type;
        // an empty name is not a usable class name, id or selector.
        if (StringUtil.isEmpty(resultTagName)) {
            return doc.getElementsByTag("body");
        }

        Elements results = new Elements();
        switch (rule.getType()) {
            case Rule.CLASS:
                results = doc.getElementsByClass(resultTagName);
                break;
            case Rule.ID:
                Element result = doc.getElementById(resultTagName);
                // getElementById yields null when no element has that id.
                if (result != null) {
                    results.add(result);
                }
                break;
            case Rule.SELECTION:
                results = doc.select(resultTagName);
                break;
            default:
                break;
        }
        return results;
    }

    /**
     * Checks that the rule is present and points at a CSDN blog page
     * (http or https).
     *
     * @throws RuleException if the rule is {@code null} or its URL is empty
     *                       or does not start with the CSDN blog prefix
     */
    private static void validateRule(Rule rule) {
        if (rule == null) {
            throw new RuleException("rule不能为空!");
        }
        String url = rule.getUrl();
        if (StringUtil.isEmpty(url)) {
            throw new RuleException("url不能为空!");
        }
        if (!url.startsWith("http://blog.csdn.net/")
                && !url.startsWith("https://blog.csdn.net/")) {
            throw new RuleException("url的格式不正确!");
        }
    }
}

11、测试代码

package com.hjduan.csdn.test;

import com.hjduan.csdn.service.GrabService;
import com.hjduan.csdn.model.Data;
import com.hjduan.csdn.rule.Rule;
import com.hjduan.csdn.utils.StringUtil;

import java.util.List;

/**
 * Command-line demo: starting from one article, prints every link selected
 * by the rule and keeps following those links with the same rule.
 */
public class TestGrab {
    /**
     * Grabs the links selected by {@code rule}, prints each one and then
     * crawls each linked page with the same rule. The rule's URL is
     * overwritten as the crawl advances.
     *
     * @param rule the rule describing the first page and the selector to use
     */
    public void getGrab(Rule rule) {
        getGrab(rule, new java.util.HashSet<String>());
    }

    /**
     * Recursive worker for {@link #getGrab(Rule)}.
     *
     * @param rule    rule whose URL is the page to crawl next
     * @param visited URLs already crawled; a link back to one of them is
     *                printed but not followed, so a cycle cannot recurse forever
     */
    private void getGrab(Rule rule, java.util.Set<String> visited) {
        visited.add(rule.getUrl());
        List<Data> datas = GrabService.extract(rule);
        for (Data data : datas) {
            print(data);
            String href = data.getHref();
            if (!StringUtil.isEmpty(href) && !visited.contains(href)) {
                rule.setUrl(href);
                getGrab(rule, visited);
            }
        }
    }

    /**
     * Prints a separator line followed by the record.
     *
     * @param data the captured link to print
     */
    public void print(Data data) {
        System.out.println("<===============================================================" +
                "===============================================================" +
                "===============================================================>");
        System.out.println(data);
    }

    /** Crawls the "prev_article" link chain starting from a fixed CSDN article. */
    public static void main(String[] args) {
        Rule rule = new Rule("http://blog.csdn.net/two_people/article/details/76783943",
                "prev_article", Rule.CLASS, Rule.GET);
        TestGrab testGrab = new TestGrab();
        testGrab.getGrab(rule);
    }
}

12、辅助类

package com.hjduan.csdn.utils;

/**
 * String helper methods.
 */
public class StringUtil {

    /**
     * Tells whether a string carries no visible content.
     *
     * @param str the string to check, may be {@code null}
     * @return {@code true} if {@code str} is {@code null} or is empty after
     *         trimming; {@code false} otherwise
     */
    public static boolean isEmpty(String str) {
        return str == null || str.trim().isEmpty();
    }
}
package com.hjduan.csdn.utils;

import java.text.SimpleDateFormat;
import java.util.Date;

/**
 * Date helper methods.
 */
public class DateUtil {

    /**
     * Formats the current date and time with the pattern
     * {@code yyyy-MM-dd HH:mm:ss}.
     *
     * @return the formatted current time
     */
    public static String getFormatDate() {
        // A fresh formatter per call: no formatter instance is shared between callers.
        SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        return formatter.format(new Date());
    }
}

13、源码地址

https://gitee.com/lgr123/grab

你可能感兴趣的:(java)