网页爬取Jsoup的Document转换(含html字符串转)带maven

<!-- jsoup: HTML parser used to parse web pages -->
<!-- NOTE(review): 1.10.2 is an old jsoup release; check for a newer version before use. -->
<dependency>
  <groupId>org.jsoup</groupId>
  <artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>

html字符串转Document

import java.net.URL;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

// NOTE(review): the HTML tags inside this literal were stripped when the article
// was extracted; only the visible text ("IT学习者", " IT学习者论坛 ") survived.
// The markup below is a reconstruction and the href/src values are placeholders;
// confirm against the original article.
String html = "<html><head><title>IT学习者</title></head>"
                + "<body><div id=\"content\">"
                + "<a href=\"https://example.com/\"> IT学习者论坛 </a>"
                + "<img src=\"https://example.com/a.png\"/> "
                + "<img src=\"https://example.com/b.png\"/> "
                + "</div></body></html>";

// Parse the in-memory HTML string into a DOM tree.
Document doc = Jsoup.parse(html);
// getElementById returns null when no element carries the id; the literal
// above always contains id="content", so the lookup succeeds here.
Element content = doc.getElementById("content");

// Variant 1: print the target and the visible text of every <a> inside #content.
Elements links = content.getElementsByTag("a");
for (Element link : links) {
    String linkHref = link.attr("href");
    String linkText = link.text();
    System.out.println("linkHref:" + linkHref);
    System.out.println("linkText:" + linkText);
}

// Variant 2: print the src of every <img> inside #content.
// Uses its own variable name; the original redeclared `links`, which does not compile.
Elements images = content.getElementsByTag("img");
for (Element image : images) {
    // `image` is already the <img> element, so its attribute can be read directly.
    String imageSrc = image.attr("src");
    System.out.println("linkHref:" + imageSrc);
}

URL直接转Document(弊端:通过ajax动态加载的内容无法获取,需要先自行模拟浏览器打开网页,待页面渲染完成后再获取)

// Fetch a page over HTTP and parse it into a DOM tree.
// NOTE(review): the original URL was stripped to "https:///" (no host) during
// extraction; the value below is a placeholder - replace it with the real page.
String url = "https://example.com/";
// Second argument is the fetch timeout in milliseconds (30 s).
// Jsoup.parse(URL, int) throws IOException, which the enclosing method must handle or declare.
Document document = Jsoup.parse(new URL(url), 30000);
// The original read from an undefined variable `doc`; the parsed page is `document`.
Element content = document.getElementById("content");
// A remote page is not guaranteed to contain id="content"; getElementById
// returns null in that case, so guard before dereferencing.
if (content != null) {
    Elements links = content.getElementsByTag("a");
    for (Element link : links) {
        String linkHref = link.attr("href");
        String linkText = link.text();
        System.out.println("linkHref:" + linkHref);
        System.out.println("linkText:" + linkText);
    }
}

你可能感兴趣的:(工具类(方法),html,maven,java)