package jsoup;
import java.io.File;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.lang3.StringEscapeUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;
//http://www.open-open.com/jsoup/
public class TestDocument {
public static void main(String[] args) throws Exception {
// parseBodyFragment();
// parserHTML();
// parseGmail();
// download();
// parserFromFile();
// parseLink();
// visitDom();
// select();
// parserURL();
// Cleaner();
// setContent();
String html = " - 為妳變┳乖";
html=unhtml(html);
Document doc = Jsoup.parse(html);
Element span = doc.select("span").first();
String input = span.text();
System.out.println(input);
System.out.println(span.attr("email"));
// - 您好,邮件我已经收到,我会尽快给您回复。祝你学习进步,
// 工作顺利!
// 10月16日
}
public static String html(String content) {
if (content == null)
return "";
String html = content;
// html = html.replace( "'", "'");
html = html.replaceAll("&", "&");
html = html.replace("\"", """); // "
html = html.replace("\t", " ");// 替换跳格
html = html.replace(" ", " ");// 替换空格
html = html.replace("<", "<");
html = html.replaceAll(">", ">");
return html;
}
public static String unhtml(String content) {
if (content == null)
return "";
String html = content;
html = html.replaceAll("&","&");
html = html.replace(""","\"");
html = html.replace(" ","\t");// 替换跳格
html = html.replace("- "," ");// 替换空格
html = html.replace(" "," ");// 替换空格
html = html.replace("<","<");
html = html.replaceAll(">",">");
return html;
}
private static void setContent() {
String html = "An exampletest字体
link.";
Document doc = Jsoup.parse(html);
Element div = doc.select("div").first(); //
div.html("lorem ipsum
"); // lorem ipsum
div.prepend("First
");// 在div前添加html内容
div.append("Last
");// 在div之后添加html内容
// 添完后的结果: First
lorem ipsum
Last
Element span = doc.select("span").first(); // One
span.wrap(" ");
// 添完后的结果: One
Element div2 = doc.select("li").first(); //
div2.text("five > four"); // five > four
div2.prepend("First ");
div2.append(" Last");
doc.select("div.masthead").attr("title", "jsoup").addClass("round-box");
System.out.println(doc);
}
private static void Cleaner() {
String unsafe = "";
String safe = Jsoup.clean(unsafe, Whitelist.basic());
System.out.println(safe);
// now: f
}
private static void parserURL() throws Exception {
Document doc = Jsoup.connect("http://www.open-open.com/").get();
Element link = doc.select("a").first();
String relHref = link.attr("href"); // == "/"
String absHref = link.attr("abs:href"); // "http://www.open-open.com/"
System.out.println(relHref);
System.out.println(absHref);
}
private static void select() {
String html = "An example link.
";
Document doc = Jsoup.parse(html);// 解析HTML字符串返回一个Document实现
Element link = doc.select("a").first();// 查找第一个a元素
String text = doc.body().text(); // "An example link"//取得字符串中的文本
String linkHref = link.attr("href"); // "http://example.com/"//取得链接地址
String linkText = link.text(); // "example""//取得链接地址中的文本
String linkOuterH = link.outerHtml();
// "example"
String linkInnerH = link.html(); // "example"//取得链接内的html内容
System.out.println(text);
System.out.println(linkHref);
System.out.println(linkText);
System.out.println(linkInnerH);
System.out.println(linkOuterH);
}
private static void visitDom() throws Exception {
File input = new File("d:/login.html");
Document doc = Jsoup.parse(input, "UTF-8", "http://www.baidu.com/");
Element content = doc.getElementById("body");
Elements links = content.getElementsByTag("a");
for (Element link : links) {
String linkHref = link.attr("href");
String linkText = link.text();
System.out.println(linkHref);
System.out.println(linkText);
}
}
private static void parseLink() {
String html = "An example link.
";
Document doc = Jsoup.parse(html);// 解析HTML字符串返回一个Document实现
Element link = doc.select("a").first();// 查找第一个a元素
String text = doc.body().text(); // "An example link"//取得字符串中的文本
String linkHref = link.attr("href"); // "http://example.com/"//取得链接地址
String linkText = link.text(); // "example""//取得链接地址中的文本
String linkOuterH = link.outerHtml();
// "example"
String linkInnerH = link.html(); // "example"//取得链接内的html内容
System.out.println(text);
System.out.println(linkHref);
System.out.println(linkText);
System.out.println(linkOuterH);
System.out.println(linkInnerH);
}
private static void parserFromFile() throws Exception {
File input = new File("d:/login.html");
Document doc = Jsoup.parse(input, "UTF-8", "http://www.baidu.com/");
System.err.println(doc);
}
private static void download() throws Exception {
Document doc = Jsoup.connect("http://www.baidu.com/").data("query",
"Java").userAgent("Mozilla").cookie("auth", "token").timeout(
3000).get();
System.out.println(doc);
}
private static void parserHTML() {
String html = "First parse "
+ "Parsed HTML into a doc.
";
Document doc = Jsoup.parse(html);
System.out.println(doc);
}
private static void parseGmail() throws Exception {
Document doc = Jsoup
.connect("https://accounts.google.com/ServiceLogin").get();
Element content = doc.getElementById("gaia_loginform");
// System.out.println(content);
Elements inputs = content.select("input[name]");
// StringBuffer sb=new StringBuffer();
Map maps = new HashMap();
for (Element element : inputs) {
// System.out.println(element);
String name = element.attr("name");
String value = element.attr("value");
// System.out.println(name+"="+value);
if (value != null && !"".equals(value)) {
maps.put(name, value);
}
}
// Email= Passwd=
System.out.println(maps);
}
// 解析body片段
private static void parseBodyFragment() {
String html = "Lorem ipsum.
";
Document doc = Jsoup.parseBodyFragment(html);
Element body = doc.body();
System.out.println(body);
}
}
package jsoup;
import org.jsoup.Jsoup;
import org.jsoup.helper.Validate;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
//http://www.open-open.com/jsoup/
/**
 * Command-line demo: fetches the URL given as the single argument and lists the media
 * ({@code [src]}), imports ({@code link[href]}) and links ({@code a[href]}) found on the page.
 */
public class ListLinks {

    /**
     * Fetches the page and prints the three listings.
     *
     * @param args exactly one element: the URL to fetch
     * @throws IOException if the page cannot be fetched
     */
    public static void main(String[] args) throws IOException {
        Validate.isTrue(args.length == 1, "usage: supply url to fetch");
        final String target = args[0];
        print("Fetching %s...", target);

        final Document page = Jsoup.connect(target).get();
        final Elements anchors = page.select("a[href]");
        final Elements sources = page.select("[src]");
        final Elements linkTags = page.select("link[href]");

        print("\nMedia: (%d)", sources.size());
        for (Element item : sources) {
            final String tag = item.tagName();
            final String location = item.attr("abs:src");
            if (!tag.equals("img")) {
                print(" * %s: <%s>", tag, location);
                continue;
            }
            // Images additionally report their dimensions and a shortened alt text.
            final String altText = trim(item.attr("alt"), 20);
            print(" * %s: <%s> %sx%s (%s)",
                    tag, location, item.attr("width"), item.attr("height"), altText);
        }

        print("\nImports: (%d)", linkTags.size());
        for (Element imported : linkTags) {
            final String tag = imported.tagName();
            final String location = imported.attr("abs:href");
            final String relation = imported.attr("rel");
            print(" * %s <%s> (%s)", tag, location, relation);
        }

        print("\nLinks: (%d)", anchors.size());
        for (Element anchor : anchors) {
            final String location = anchor.attr("abs:href");
            final String label = trim(anchor.text(), 35);
            print(" * a: <%s> (%s)", location, label);
        }
    }

    /** Formats {@code msg} with {@code args} and writes it as one line to standard output. */
    private static void print(String msg, Object... args) {
        final String line = String.format(msg, args);
        System.out.println(line);
    }

    /**
     * Shortens {@code s} when it is longer than {@code width}: keeps the first
     * {@code width - 1} characters and appends a dot; otherwise returns {@code s} unchanged.
     */
    private static String trim(String s, int width) {
        return s.length() > width ? s.substring(0, width - 1) + "." : s;
    }
}
package jsoup;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
public class Link {
public static void main(String[] args) {
String html = "An example link.
";
Document doc = Jsoup.parse(html);//解析HTML字符串返回一个Document实现
Element link = doc.select("a").first();//查找第一个a元素
String text = doc.body().text(); // "An example link"//取得字符串中的文本
String linkHref = link.attr("href"); // "http://example.com/"//取得链接地址
String linkText = link.text(); // "example""//取得链接地址中的文本
String linkOuterH = link.outerHtml();
// "example"
String linkInnerH = link.html(); // "example"//取得链接内的html内容
System.out.println(text);
System.out.println(linkHref);
System.out.println(linkText);
System.out.println(linkOuterH);
System.out.println(linkInnerH);
}
}