jsoup 是一个用于处理真实世界 HTML 的 Java 库。它提供了一套非常方便的 API,可以抓取 URL 内容,并借助 HTML5 DOM 方法和 CSS 选择器来提取和操作数据。
主要功能:
导入maven依赖
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.8.3</version>
</dependency>
demo
package com.sihi.crawler.jsoup;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
public class Demo {
public static void main(String[] args) {
String html = " Try jsoup This is jsoup.
";
Document doc = Jsoup.parse(html);
Elements allElements = doc.getAllElements(); //获得子元素
System.out.println(doc);
}
}
package com.sihi.crawler.jsoup;
import com.sihi.crawler.test.HttpClientUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.File;
import java.io.IOException;
import java.net.URL;
/**
 * Demonstrates the ways a jsoup {@link Document} can be created.
 * Only the file-based variant is active; the other two are kept as
 * commented-out alternatives.
 */
public class TestDocument {
    public static void main(String[] args) throws IOException {
        // Variant 1: build the document by fetching and parsing a URL.
        // Document document = Jsoup.parse(new URL("http://www.sikiedu.com"), 2000);

        // Variant 2: build the document from an HTML string.
        // String s = HttpClientUtil.doGet("http://www.sikiedu.com");
        // Document document = Jsoup.parse(s);

        // Variant 3: build the document from a local file.
        File htmlFile = new File("E://SiKi.html");
        String charsetName = "UTF-8";
        Document parsed = Jsoup.parse(htmlFile, charsetName);

        System.out.println(parsed);
    }
}
package com.sihi.crawler.jsoup;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.IOException;
/**
 * Demonstrates the DOM-style lookup methods of a jsoup {@link Document}.
 * Every lookup is executed, but only the result of the last one
 * (attribute name plus value) is printed.
 */
public class ParseDocumentByDom {
    public static void main(String[] args) throws IOException {
        // Build the document from the saved page.
        File source = new File("E://SiKi.html");
        Document page = Jsoup.parse(source, "UTF-8");

        // 1. Look up a single element by its id.
        Element alertBox = page.getElementById("announcements-alert");
        // System.out.println(alertBox);

        // 2. Look up elements by tag name.
        Elements divElements = page.getElementsByTag("div");
        // System.out.println(divElements);

        // 3. Look up elements by class name.
        Elements alertLinks = page.getElementsByClass("alert-link");
        // System.out.println(alertLinks);

        // 4. Look up elements that carry a given attribute.
        Elements withHref = page.getElementsByAttribute("href");
        // System.out.println(withHref);

        // 5. Look up elements by attribute name and value together.
        Elements iconLinks = page.getElementsByAttributeValue("rel", "icon");
        for (int i = 0; i < iconLinks.size(); i++) {
            System.out.println(iconLinks.get(i));
        }
    }
}
package com.sihi.crawler.jsoup;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.IOException;
/**
 * Demonstrates jsoup's CSS-selector lookups via {@code select(...)}.
 * Every query is executed, but only the result of the last one
 * (the combined "a,img" selector) is printed.
 */
public class ParseDocumentBySelector {
    public static void main(String[] args) throws IOException {
        File source = new File("E://SiKi.html");
        Document dom = Jsoup.parse(source, "UTF-8");

        // 1. Id selector (#); get(n) picks the n-th match, counting from 0.
        Elements alertMatches = dom.select("#announcements-alert");
        Element alertBox = alertMatches.get(0);
        // System.out.println(alertBox);

        // 2. Class selector (.).
        Elements iconsByClass = dom.select(".es-icon");
        // System.out.println(iconsByClass);

        // 3. Tag selector.
        Elements italicTags = dom.select("i");
        // System.out.println(italicTags);

        // 4. Attribute selector ([attr]).
        Elements withHref = dom.select("[href]");
        // System.out.println(withHref);

        // 5. Attribute-name prefix selector ([^prefix]).
        Elements withDataAttrs = dom.select("[^data-]");
        // System.out.println(withDataAttrs);

        // 6. Attribute name and value selector ([attr="value"]).
        Elements buttons = dom.select("[type=\"button\"]");
        // System.out.println(buttons);

        // 7. Freely combined selectors.
        // Elements combined = dom.select("div#announcements-alert");
        Elements combined = dom.select("a[^data-].alert-link");
        // System.out.println(combined);

        // 8. A space selects descendants; '>' selects only direct children.
        // Elements navItems = dom.select(".nav.navbar-nav.clearfix.hidden-xs#nav li");
        Elements navItems = dom.select(".nav.navbar-nav.clearfix.hidden-xs#nav>li");
        // System.out.println(navItems);

        // 9. Several alternatives at once, separated by ','.
        Elements linksAndImages = dom.select("a,img");
        System.out.println(linksAndImages);
    }
}
package com.sihi.crawler.jsoup;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.parser.Tag;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.IOException;
/**
 * Demonstrates how to read data from a jsoup {@link Element}: its tag,
 * and the attributes of every anchor in the page. Only the body's tag
 * and tag name are printed; the remaining accessors are listed as
 * commented-out examples.
 */
public class ParseElementData {
    public static void main(String[] args) throws IOException {
        File source = new File("E://SiKi.html");
        Document dom = Jsoup.parse(source, "UTF-8");

        Elements bodyMatches = dom.select("body");
        Element body = bodyMatches.get(0);

        // Other accessors available on an element:
        // System.out.println(body.text());       // all text inside the tag
        // System.out.println(body.html());       // inner HTML; can also be used to change it
        // System.out.println(body.outerHtml());  // HTML including the tag itself
        // System.out.println(body.className());  // value of the class attribute
        // System.out.println(body.classNames()); // class names as a Set
        // System.out.println(body.id());         // value of the id attribute

        // The tag as an object and the tag name as a string.
        Tag bodyTag = body.tag();
        String bodyTagName = body.tagName();
        System.out.println(bodyTag);
        System.out.println(bodyTagName);

        Elements anchors = dom.select("a");
        for (int i = 0; i < anchors.size(); i++) {
            Element anchor = anchors.get(i);
            // System.out.println(anchor.attr("href")); // value of a single attribute

            // All attributes of the element.
            Attributes anchorAttributes = anchor.attributes();
            for (Attribute attr : anchorAttributes) {
                // System.out.println(attr.getKey()+"---"+attr.getValue()); // attribute name and value
            }
        }
    }
}
页面分析,代码编写
package com.sihi.crawler.jsoup;
import com.sihi.crawler.test.HttpClientUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Crawls the first pages of the JavaEE course listing and prints the name
 * and price of every course found.
 *
 * <p>Pages or course cards that do not contain the expected elements are
 * skipped instead of aborting the whole run with an
 * {@link IndexOutOfBoundsException}.
 */
public class PaserJavaEE {
    /** Listing URL; the page number is appended to it. */
    private static final String LIST_URL = "http://www.sikiedu.com/course/explore/javaee?page=";
    /** First listing page to fetch (inclusive). */
    private static final int FIRST_PAGE = 1;
    /** Last listing page to fetch (inclusive). */
    private static final int LAST_PAGE = 3;

    public static void main(String[] args) {
        for (int page = FIRST_PAGE; page <= LAST_PAGE; page++) {
            String content = HttpClientUtil.doGet(LIST_URL + page);
            if (content == null) {
                // NOTE(review): presumably doGet returns null when the request
                // fails - confirm against HttpClientUtil.
                System.err.println("No content received for page " + page + ", skipping");
                continue;
            }
            // Parse the response into a DOM and print its courses.
            printCourses(Jsoup.parse(content), page);
        }
    }

    /** Prints name and price of every course card found in one listing page. */
    private static void printCourses(Document dom, int page) {
        // Container holding the course cards; first() yields null when nothing matches.
        Element courseList = dom.select(".course-list>div").first();
        if (courseList == null) {
            System.err.println("No course list found on page " + page + ", skipping");
            return;
        }

        // Each direct child div is one course card.
        for (Element course : courseList.select(">div")) {
            Element name = course.select(".link-dark").first();
            Element price = course.select(".course-price-widget>span").first();
            if (name == null || price == null) {
                // Not a complete course card (name or price element missing).
                continue;
            }
            System.out.println("课程名:"+name.text()+",价格:"+price.text());
        }
    }
}
Java爬虫①HttpClient