Jsoup: Basic Usage

Introduction to Jsoup

jsoup is a Java HTML parser that can parse content directly from a URL or from an HTML string. It provides a very convenient API for extracting and manipulating data through DOM traversal, CSS selectors, and jQuery-like methods.
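For the "parse a URL directly" case, jsoup can also fetch a page on its own through Jsoup.connect, without going through HttpClient. A minimal sketch (the class name and the five-second timeout are illustrative choices; the target URL matches the examples below):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;

public class JsoupConnectDemo {
    public static void main(String[] args) throws IOException {
        // Fetch and parse the page in one call; no HttpClient needed
        Document document = Jsoup.connect("https://www.cnblogs.com/")
                .userAgent("Mozilla/5.0")  // mimic a browser
                .timeout(5000)             // 5-second timeout
                .get();
        System.out.println(document.title());
    }
}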

Basic Usage

Create a new Maven project and add the jsoup dependency:

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
</dependency>

We first use HttpClient to fetch the page content, so add the httpclient dependency as well. The example below then fetches the cnblogs homepage and prints its title:

<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.7</version>
</dependency>

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.io.IOException;

public class JsoupDemo {
    public static void main(String[] args) throws IOException {
        // Create an HttpClient instance
        CloseableHttpClient httpclient = HttpClients.createDefault();
        // Create an HttpGet instance
        HttpGet httpget = new HttpGet("https://www.cnblogs.com/");

        // Set a request header to mimic a browser
        httpget.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36");
        CloseableHttpResponse response = httpclient.execute(httpget);
        // Get the response entity
        HttpEntity entity = response.getEntity();

        String html = EntityUtils.toString(entity, "utf-8");

        // Parse the page
        Document document = Jsoup.parse(html);
        // Get the <title> element of the page
        Element title = document.getElementsByTag("title").first();
        String text = title.text();
        System.out.println(text);

        // Close the response and release system resources
        response.close();
        httpclient.close();
    }
}

Output:

博客园 - 代码改变世界
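The example above closes the response and the client by hand. Since CloseableHttpClient and CloseableHttpResponse both implement Closeable, a try-with-resources variant releases them even if parsing throws; a sketch using the same imports as above:

// Same request as before, with cleanup handled by try-with-resources
try (CloseableHttpClient client = HttpClients.createDefault()) {
    HttpGet get = new HttpGet("https://www.cnblogs.com/");
    get.setHeader("User-Agent", "Mozilla/5.0");
    // The response is closed automatically when the inner block exits
    try (CloseableHttpResponse resp = client.execute(get)) {
        String html = EntityUtils.toString(resp.getEntity(), "utf-8");
        Document doc = Jsoup.parse(html);
        System.out.println(doc.title());
    }
}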

Finding DOM Elements

Commonly used methods:

getElementById(String id) looks up a DOM element by its id

getElementsByTag(String tagName) looks up DOM elements by tag name

getElementsByClass(String className) looks up DOM elements by class name

getElementsByAttribute(String key) looks up DOM elements by attribute name

getElementsByAttributeValue(String key, String value) looks up DOM elements by attribute name and value

The example below exercises each of these methods:

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;

public class JsoupDemo {
    public static void main(String[] args) throws IOException {
        // Create an HttpClient instance
        CloseableHttpClient httpclient = HttpClients.createDefault();
        // Create an HttpGet instance
        HttpGet httpget = new HttpGet("https://www.cnblogs.com/");

        // Set a request header to mimic a browser
        httpget.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36");
        CloseableHttpResponse response = httpclient.execute(httpget);
        // Get the response entity
        HttpEntity entity = response.getEntity();

        String html = EntityUtils.toString(entity, "utf-8");

        // Parse the page
        Document document = Jsoup.parse(html);

        // Get the page title by tag name
        Element title = document.getElementsByTag("title").first();
        String text = title.text();
        System.out.println(text);

        // Look up an element by id
        Element site_nav_top = document.getElementById("site_nav_top");
        String text1 = site_nav_top.text();
        System.out.println(text1);

        // Look up elements by class name
        Elements post_item = document.getElementsByClass("post_item");
        for (Element e : post_item) {
            System.out.println(e.html());
        }

        // Look up elements by attribute name
        Elements widthElements = document.getElementsByAttribute("width");
        for (Element e : widthElements) {
            System.out.println(e.toString());
        }

        // Look up elements by attribute name and value
        Elements targetElements = document.getElementsByAttributeValue("target", "_blank");
        for (Element e : targetElements) {
            System.out.println(e.toString());
        }

        // Close the response and release system resources
        response.close();
        httpclient.close();
    }
}

Finding DOM Elements with Selectors

Jsoup also supports CSS and jQuery-like selector syntax, as the following example demonstrates:

package com.felix.project.jsoup;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;

public class JsoupSelectDemo {
    public static void main(String[] args) throws IOException {
        // Create an HttpClient instance
        CloseableHttpClient httpclient = HttpClients.createDefault();
        // Create an HttpGet instance
        HttpGet httpget = new HttpGet("https://www.cnblogs.com/");
        // Set a request header to mimic a browser
        httpget.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36");
        CloseableHttpResponse response = httpclient.execute(httpget);
        // Get the response entity
        HttpEntity entity = response.getEntity();

        String html = EntityUtils.toString(entity, "utf-8");

        // Parse the page
        Document document = Jsoup.parse(html);

        // Find all post entries
        Elements select = document.select(".post_item .post_item_body h3 a");
        for (Element e : select) {
            System.out.println("Post title: " + e.text());
            System.out.println("-------------");
        }
        // <a> elements that have an href attribute
        Elements hrefElements = document.select("a[href]");
        for (Element e : hrefElements) {
            System.out.println(e.toString());
            System.out.println("-------------");
        }
        // <img> nodes whose src ends with .png
        Elements imgElements = document.select("img[src$=.png]");
        for (Element e : imgElements) {
            System.out.println(e.toString());
            System.out.println("-------------");
        }
        // Get the first element with the tag "title"
        Element element = document.getElementsByTag("title").first();
        String title = element.text();
        System.out.println("Page title: " + title);

        // Close the response and release system resources
        response.close();
        httpclient.close();
    }
}

Getting Attribute Values

// <a> elements that have an href attribute
Elements hrefElements = document.select("a[href]");
for (Element e : hrefElements) {
    System.out.println(e.toString());
    System.out.println("-------------");

    // Get the attribute value
    String href = e.attr("href");
    System.out.println("href value: " + href);
}

Sample output:

href value: http://www.beian.gov.cn/portal/registerSystemInfo?recordcode=31011502001144
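The href printed above happens to be absolute. For relative links, attr("href") returns the value exactly as written in the HTML; jsoup can resolve it to an absolute URL with attr("abs:href"), provided the document knows its base URI. A small sketch, assuming html is the page source fetched as in the earlier examples:

// Pass the page URL as the base URI so relative links can be resolved
Document document = Jsoup.parse(html, "https://www.cnblogs.com/");
for (Element e : document.select("a[href]")) {
    String relative = e.attr("href");      // value exactly as written in the HTML
    String absolute = e.attr("abs:href");  // resolved against the base URI
    System.out.println(relative + " -> " + absolute);
}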

References

https://jsoup.org/

https://www.open-open.com/jsoup/
