1、idea
2、maven 3.9
3、jdk 1.8
4、jsoup 1.7.3
1、首先分析页面的布局,我抓取的是csdn中文章内容,打开一篇文章
2、按 F12 查看页面布局。我这里只抓取文章下方“上一篇”的链接以及名字,先查看页面的代码
3、可以看到,我们要先找到 class 为 prev_article 的元素,然后抓取其中的超链接,以及超链接内的文字。
4、分析后就可以编码了。
5、在idea中新建一个maven项目,名为csdn,结构如下图
6、pom内容如下
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.hjduan</groupId>
    <artifactId>csdn</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.7.3</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.3</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <version>2.18.1</version>
                <configuration>
                    <skipTests>true</skipTests>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
7、新建一个data类,用来存放抓取的内容
package com.hjduan.csdn.model;
/**
 * Data holder for one scraped hyperlink: the link target, the link text and
 * the time at which it was captured.
 *
 * <p>All three values are plain strings and may be {@code null} until set.
 */
public class Data {

    /** Capture time, kept as already-formatted text. */
    private String time;

    /** Target of the link (the value of its {@code href} attribute). */
    private String href;

    /** Text shown for the link. */
    private String linkName;

    /** Creates an instance with every field unset ({@code null}). */
    public Data() {
    }

    /**
     * Creates a fully populated instance.
     *
     * @param time     capture time as text
     * @param href     link target
     * @param linkName link text
     */
    public Data(String time, String href, String linkName) {
        this.time = time;
        this.href = href;
        this.linkName = linkName;
    }

    /** @return the capture time as text, or {@code null} if unset */
    public String getTime() {
        return time;
    }

    /** @param time capture time as text */
    public void setTime(String time) {
        this.time = time;
    }

    /** @return the link target, or {@code null} if unset */
    public String getHref() {
        return href;
    }

    /** @param href link target */
    public void setHref(String href) {
        this.href = href;
    }

    /** @return the link text, or {@code null} if unset */
    public String getLinkName() {
        return linkName;
    }

    /** @param linkName link text */
    public void setLinkName(String linkName) {
        this.linkName = linkName;
    }

    /**
     * Renders the record as {@code linkName-->href-->在<time> 抓取}.
     * Unset fields appear as the text {@code null}.
     */
    @Override
    public String toString() {
        return linkName + "-->" + href + "-->" + "在" + time + " 抓取";
    }
}
8、新建一个抓取规则类
package com.hjduan.csdn.rule;
/**
 * Describes one scraping job: which URL to request, how to request it, and
 * how to pick the elements of interest out of the returned HTML.
 */
public class Rule {

    /** Request-method value: HTTP GET. */
    public static final int GET = 0;

    /** Request-method value: HTTP POST. */
    public static final int POST = 1;

    /** Filter-type value: {@link #getResultTagName()} is a CSS class name. */
    public static final int CLASS = 0;

    /** Filter-type value: {@link #getResultTagName()} is an element id. */
    public static final int ID = 1;

    /** Filter-type value: {@link #getResultTagName()} is a selector expression. */
    public static final int SELECTION = 2;

    /** Address of the page to request. */
    private String url;

    /** Value used to filter the returned HTML; interpreted according to {@link #type}. */
    private String resultTagName;

    /** One of {@link #CLASS}, {@link #ID}, {@link #SELECTION}; defaults to {@link #ID}. */
    private int type = ID;

    /**
     * One of {@link #GET}, {@link #POST}; defaults to {@link #GET}.
     * The spelling of the name is kept as-is because the accessors are public.
     */
    private int requestMoethod = GET;

    /** Creates a rule with no URL, no filter value, and the default type and request method. */
    public Rule() {
    }

    /**
     * Creates a fully specified rule.
     *
     * @param url            address of the page to request
     * @param resultTagName  value used to filter the returned HTML
     * @param type           {@link #CLASS}, {@link #ID} or {@link #SELECTION}
     * @param requestMoethod {@link #GET} or {@link #POST}
     */
    public Rule(String url, String resultTagName, int type, int requestMoethod) {
        this.url = url;
        this.resultTagName = resultTagName;
        this.type = type;
        this.requestMoethod = requestMoethod;
    }

    /** @return address of the page to request */
    public String getUrl() {
        return this.url;
    }

    /** @param url address of the page to request */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return value used to filter the returned HTML */
    public String getResultTagName() {
        return this.resultTagName;
    }

    /** @param resultTagName value used to filter the returned HTML */
    public void setResultTagName(String resultTagName) {
        this.resultTagName = resultTagName;
    }

    /** @return the filter type */
    public int getType() {
        return this.type;
    }

    /** @param type {@link #CLASS}, {@link #ID} or {@link #SELECTION} */
    public void setType(int type) {
        this.type = type;
    }

    /** @return the request method */
    public int getRequestMoethod() {
        return this.requestMoethod;
    }

    /** @param requestMoethod {@link #GET} or {@link #POST} */
    public void setRequestMoethod(int requestMoethod) {
        this.requestMoethod = requestMoethod;
    }
}
9、自定义规则异常类
package com.hjduan.csdn.rule;
/**
 * Unchecked exception for problems with a scraping rule.
 *
 * <p>Each constructor simply forwards to the matching
 * {@link RuntimeException} constructor.
 */
public class RuleException extends RuntimeException {

    /**
     * Creates an exception carrying only a cause.
     *
     * @param cause underlying failure
     */
    public RuleException(Throwable cause) {
        super(cause);
    }

    /**
     * Creates an exception with a detail message and a cause.
     *
     * @param message description of the problem
     * @param cause   underlying failure
     */
    public RuleException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * Creates an exception with a detail message.
     *
     * @param message description of the problem
     */
    public RuleException(String message) {
        super(message);
    }

    /** Creates an exception with neither message nor cause. */
    public RuleException() {
        // The no-argument superclass constructor is invoked implicitly.
    }
}
10、核心抓取类
package com.hjduan.csdn.service;
import com.hjduan.csdn.model.Data;
import com.hjduan.csdn.rule.Rule;
import com.hjduan.csdn.rule.RuleException;
import com.hjduan.csdn.utils.DateUtil;
import com.hjduan.csdn.utils.StringUtil;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Core scraping service: requests the page described by a {@link Rule},
 * narrows the document down to the elements the rule selects, and returns
 * every hyperlink found inside them.
 */
public class GrabService {

    /** Timeout passed to jsoup for each request, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 100000;

    /** Accepted URL prefix (plain HTTP). */
    private static final String HTTP_PREFIX = "http://blog.csdn.net/";

    /** Accepted URL prefix (HTTPS); same host as {@link #HTTP_PREFIX}. */
    private static final String HTTPS_PREFIX = "https://blog.csdn.net/";

    /**
     * Fetches the rule's URL and collects one {@link Data} per {@code <a>}
     * element found inside the elements selected by the rule.
     *
     * @param rule what to fetch and how to filter it
     * @return the collected links in document order; empty when nothing
     *         matched or when the request failed with an {@link IOException}
     * @throws RuleException if the rule is {@code null}, its URL is blank or
     *                       does not start with an accepted prefix, or its
     *                       request method is neither GET nor POST
     */
    public static List<Data> extract(Rule rule) {
        validateRule(rule);
        List<Data> datas = new ArrayList<Data>();
        try {
            Document doc = fetch(rule);
            for (Element result : select(doc, rule)) {
                for (Element link : result.getElementsByTag("a")) {
                    Data data = new Data();
                    data.setHref(link.attr("href"));
                    data.setLinkName(link.text());
                    data.setTime(DateUtil.getFormatDate());
                    datas.add(data);
                }
            }
        } catch (IOException e) {
            // Best-effort by design: a failed request is reported on stderr
            // and the caller receives an empty list rather than an exception.
            e.printStackTrace();
        }
        return datas;
    }

    /**
     * Performs the HTTP request described by the rule.
     *
     * @throws IOException   if jsoup cannot fetch or parse the page
     * @throws RuleException if the request method is not GET or POST
     */
    private static Document fetch(Rule rule) throws IOException {
        Connection conn = Jsoup.connect(rule.getUrl()).timeout(TIMEOUT_MILLIS);
        switch (rule.getRequestMoethod()) {
            case Rule.GET:
                return conn.get();
            case Rule.POST:
                return conn.post();
            default:
                // Already rejected by validateRule; kept so that an unknown
                // value can never fall through to a null document.
                throw new RuleException("不支持的请求类型:" + rule.getRequestMoethod());
        }
    }

    /**
     * Picks the elements to search for links according to the rule's filter
     * type and filter value.
     */
    private static Elements select(Document doc, Rule rule) {
        String resultTagName = rule.getResultTagName();
        // With no filter value, search the whole body. The original code only
        // reached this fallback for unknown types; for CLASS / ID / SELECTION
        // an empty value was handed to jsoup, which rejects it.
        if (StringUtil.isEmpty(resultTagName)) {
            return doc.getElementsByTag("body");
        }
        Elements results = new Elements();
        switch (rule.getType()) {
            case Rule.CLASS:
                return doc.getElementsByClass(resultTagName);
            case Rule.ID:
                Element byId = doc.getElementById(resultTagName);
                // getElementById yields null when no element has that id;
                // adding null would make the caller's loop fail.
                if (byId != null) {
                    results.add(byId);
                }
                return results;
            case Rule.SELECTION:
                return doc.select(resultTagName);
            default:
                // Unknown type with a non-empty filter value: select nothing.
                return results;
        }
    }

    /**
     * Checks the rule before any network access is attempted.
     *
     * @throws RuleException if the rule is {@code null}, the URL is blank or
     *                       has an unexpected prefix, or the request method
     *                       is neither GET nor POST
     */
    private static void validateRule(Rule rule) {
        if (rule == null) {
            throw new RuleException("rule不能为空!");
        }
        String url = rule.getUrl();
        if (StringUtil.isEmpty(url)) {
            throw new RuleException("url不能为空!");
        }
        if (!url.startsWith(HTTP_PREFIX) && !url.startsWith(HTTPS_PREFIX)) {
            throw new RuleException("url的格式不正确!");
        }
        int method = rule.getRequestMoethod();
        if (method != Rule.GET && method != Rule.POST) {
            throw new RuleException("不支持的请求类型:" + method);
        }
    }
}
11、测试代码
package com.hjduan.csdn.test;
import com.hjduan.csdn.model.Data;
import com.hjduan.csdn.rule.Rule;
import com.hjduan.csdn.service.GrabService;
import com.hjduan.csdn.utils.StringUtil;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
 * Manual driver that crawls outward from one article by repeatedly following
 * the links that {@link GrabService#extract(Rule)} returns.
 */
public class TestGrab {

    /**
     * Extracts the links selected by the rule, prints each one, and then
     * follows every non-empty link target that has not been requested yet.
     *
     * <p>The same {@code rule} object is reused for every request: its URL is
     * overwritten with each followed link, so the caller's rule is modified.
     *
     * @param rule starting URL plus the filter applied on every page
     */
    public void getGrab(Rule rule) {
        Set<String> visited = new HashSet<String>();
        visited.add(rule.getUrl());
        crawl(rule, visited);
    }

    /**
     * Recursive step of {@link #getGrab(Rule)}.
     *
     * @param rule    rule whose URL is the page to process now
     * @param visited URLs already requested; guards against pages that link
     *                back to one another, which would otherwise recurse forever
     */
    private void crawl(Rule rule, Set<String> visited) {
        List<Data> datas = GrabService.extract(rule);
        for (Data data : datas) {
            print(data);
            String href = data.getHref();
            // Set.add returns false for a URL seen before, so each page is
            // requested at most once while every link is still printed.
            if (!StringUtil.isEmpty(href) && visited.add(href)) {
                rule.setUrl(href);
                crawl(rule, visited);
            }
        }
    }

    /**
     * Writes a separator line followed by the record to standard output.
     *
     * @param data record to print
     */
    public void print(Data data) {
        System.out.println("<===============================================================" +
                "===============================================================" +
                "===============================================================>");
        System.out.println(data);
    }

    /** Starts a crawl from a fixed article, selecting elements by CSS class. */
    public static void main(String[] args) {
        Rule rule = new Rule("http://blog.csdn.net/two_people/article/details/76783943",
                "prev_article", Rule.CLASS, Rule.GET);
        TestGrab testGrab = new TestGrab();
        testGrab.getGrab(rule);
    }
}
12、辅助类
package com.hjduan.csdn.utils;
/**
 * String helper methods.
 */
public class StringUtil {

    /**
     * Tells whether a string carries no usable content.
     *
     * @param str value to inspect; may be {@code null}
     * @return {@code true} if {@code str} is {@code null} or has length zero
     *         after {@link String#trim()}, otherwise {@code false}
     */
    public static boolean isEmpty(String str) {
        return str == null || str.trim().isEmpty();
    }
}
package com.hjduan.csdn.utils;
import java.text.SimpleDateFormat;
import java.util.Date;
/**
 * Date helper methods.
 */
public class DateUtil {

    /** Pattern applied by {@link #getFormatDate()}. */
    private static final String PATTERN = "yyyy-MM-dd HH:mm:ss";

    /**
     * Formats the current moment using the pattern {@code yyyy-MM-dd HH:mm:ss}.
     *
     * <p>A new formatter is created on every call, so no formatter instance
     * is shared between callers.
     *
     * @return the current date and time as text
     */
    public static String getFormatDate() {
        final Date now = new Date();
        return new SimpleDateFormat(PATTERN).format(now);
    }
}
13、源码地址
https://gitee.com/lgr123/grab