jsoup使用实例

package jsoup;

import java.io.File;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.lang3.StringEscapeUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;

//http://www.open-open.com/jsoup/
public class TestDocument {

	/**
	 * Demo entry point: decodes an entity-escaped HTML snippet with
	 * {@link #unhtml(String)}, parses it with jsoup and prints the text and the
	 * {@code email} attribute of the first {@code span} element.
	 *
	 * <p>The commented-out calls are the other demos of this class; uncomment one
	 * to run it.
	 *
	 * @param args ignored
	 * @throws Exception declared for the demos that perform I/O
	 */
	public static void main(String[] args) throws Exception {

		// parseBodyFragment();
		// parserHTML();
		// parseGmail();
		// download();
		// parserFromFile();
		// parseLink();
		// visitDom();
		// select();
		// parserURL();
		// Cleaner();
		// setContent();

		String html = " - 為妳變┳乖";
		html = unhtml(html);
		Document doc = Jsoup.parse(html);
		Element span = doc.select("span").first();
		if (span == null) {
			// first() yields null when the selector matched nothing; the sample
			// text above contains no <span>, so dereferencing it would throw.
			System.out.println("no <span> element found in: " + html);
			return;
		}
		System.out.println(span.text());
		System.out.println(span.attr("email"));
		// NOTE(review): presumably the output the author recorded for the
		// original (span-wrapped) input - confirm:
		// - Hello, I have received your email and will reply as soon as
		// possible. Wishing you progress in your studies and success at work!
		// October 16
	}

	public static String html(String content) {
		if (content == null)
			return "";
		String html = content;
		// html = html.replace( "'", "'");
		html = html.replaceAll("&", "&");
		html = html.replace("\"", """); // "
		html = html.replace("\t", "  ");// 替换跳格
		html = html.replace(" ", " ");// 替换空格
		html = html.replace("<", "<");
		html = html.replaceAll(">", ">");
		return html;
	}

	public static String unhtml(String content) {
		if (content == null)
			return "";
		String html = content;
		html = html.replaceAll("&","&");
		html = html.replace(""","\"");
		html = html.replace("  ","\t");// 替换跳格
		html = html.replace("- "," ");// 替换空格
		html = html.replace(" "," ");// 替换空格
		html = html.replace("<","<");
		html = html.replaceAll(">",">");
		return html;
	}
	private static void setContent() {

		String html = "

An example

字体
  • link.

    "; Document doc = Jsoup.parse(html); Element div = doc.select("div").first(); //
    div.html("

    lorem ipsum

    "); //

    lorem ipsum

    div.prepend("

    First

    ");// 在div前添加html内容 div.append("

    Last

    ");// 在div之后添加html内容 // 添完后的结果:

    First

    lorem ipsum

    Last

    Element span = doc.select("span").first(); // One span.wrap("
  • "); // 添完后的结果:
  • One
  • Element div2 = doc.select("li").first(); //
    div2.text("five > four"); //
    five > four
    div2.prepend("First "); div2.append(" Last"); doc.select("div.masthead").attr("title", "jsoup").addClass("round-box"); System.out.println(doc); } private static void Cleaner() { String unsafe = "

    Link

    "; String safe = Jsoup.clean(unsafe, Whitelist.basic()); System.out.println(safe); // now:

    Link

    f } private static void parserURL() throws Exception { Document doc = Jsoup.connect("http://www.open-open.com/").get(); Element link = doc.select("a").first(); String relHref = link.attr("href"); // == "/" String absHref = link.attr("abs:href"); // "http://www.open-open.com/" System.out.println(relHref); System.out.println(absHref); } private static void select() { String html = "

    An example link.

    "; Document doc = Jsoup.parse(html);// 解析HTML字符串返回一个Document实现 Element link = doc.select("a").first();// 查找第一个a元素 String text = doc.body().text(); // "An example link"//取得字符串中的文本 String linkHref = link.attr("href"); // "http://example.com/"//取得链接地址 String linkText = link.text(); // "example""//取得链接地址中的文本 String linkOuterH = link.outerHtml(); // "example" String linkInnerH = link.html(); // "example"//取得链接内的html内容 System.out.println(text); System.out.println(linkHref); System.out.println(linkText); System.out.println(linkInnerH); System.out.println(linkOuterH); } private static void visitDom() throws Exception { File input = new File("d:/login.html"); Document doc = Jsoup.parse(input, "UTF-8", "http://www.baidu.com/"); Element content = doc.getElementById("body"); Elements links = content.getElementsByTag("a"); for (Element link : links) { String linkHref = link.attr("href"); String linkText = link.text(); System.out.println(linkHref); System.out.println(linkText); } } private static void parseLink() { String html = "

    An example link.

    "; Document doc = Jsoup.parse(html);// 解析HTML字符串返回一个Document实现 Element link = doc.select("a").first();// 查找第一个a元素 String text = doc.body().text(); // "An example link"//取得字符串中的文本 String linkHref = link.attr("href"); // "http://example.com/"//取得链接地址 String linkText = link.text(); // "example""//取得链接地址中的文本 String linkOuterH = link.outerHtml(); // "example" String linkInnerH = link.html(); // "example"//取得链接内的html内容 System.out.println(text); System.out.println(linkHref); System.out.println(linkText); System.out.println(linkOuterH); System.out.println(linkInnerH); } private static void parserFromFile() throws Exception { File input = new File("d:/login.html"); Document doc = Jsoup.parse(input, "UTF-8", "http://www.baidu.com/"); System.err.println(doc); } private static void download() throws Exception { Document doc = Jsoup.connect("http://www.baidu.com/").data("query", "Java").userAgent("Mozilla").cookie("auth", "token").timeout( 3000).get(); System.out.println(doc); } private static void parserHTML() { String html = "First parse" + "

    Parsed HTML into a doc.

    "; Document doc = Jsoup.parse(html); System.out.println(doc); } private static void parseGmail() throws Exception { Document doc = Jsoup .connect("https://accounts.google.com/ServiceLogin").get(); Element content = doc.getElementById("gaia_loginform"); // System.out.println(content); Elements inputs = content.select("input[name]"); // StringBuffer sb=new StringBuffer(); Map maps = new HashMap(); for (Element element : inputs) { // System.out.println(element); String name = element.attr("name"); String value = element.attr("value"); // System.out.println(name+"="+value); if (value != null && !"".equals(value)) { maps.put(name, value); } } // Email= Passwd= System.out.println(maps); } // 解析body片段 private static void parseBodyFragment() { String html = "

    Lorem ipsum.

    "; Document doc = Jsoup.parseBodyFragment(html); Element body = doc.body(); System.out.println(body); } }
    package jsoup;
    import org.jsoup.Jsoup;
    import org.jsoup.helper.Validate;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    import java.io.IOException;
    
    //http://www.open-open.com/jsoup/
    /**
     * Command-line example: fetches one URL and lists its media elements,
     * imports and links.
     */
    public class ListLinks {

        /**
         * Fetches the page named by the single command-line argument and prints
         * three sections: elements with a {@code src} attribute, {@code link}
         * elements with an {@code href}, and anchors with an {@code href}.
         *
         * @param args exactly one element, the URL to fetch
         * @throws IOException if the page cannot be fetched
         */
        public static void main(String[] args) throws IOException {
            Validate.isTrue(args.length == 1, "usage: supply url to fetch");
            final String target = args[0];
            print("Fetching %s...", target);

            final Document page = Jsoup.connect(target).get();
            final Elements anchors = page.select("a[href]");
            final Elements sourced = page.select("[src]");
            final Elements linkTags = page.select("link[href]");

            print("\nMedia: (%d)", sourced.size());
            for (Element item : sourced) {
                final String tag = item.tagName();
                final String location = item.attr("abs:src");
                if (!"img".equals(tag)) {
                    print(" * %s: <%s>", tag, location);
                    continue;
                }
                // Images additionally report their dimensions and a shortened alt text.
                final String alt = trim(item.attr("alt"), 20);
                print(" * %s: <%s> %sx%s (%s)",
                        tag, location, item.attr("width"), item.attr("height"), alt);
            }

            print("\nImports: (%d)", linkTags.size());
            for (Element imported : linkTags) {
                print(" * %s <%s> (%s)",
                        imported.tagName(), imported.attr("abs:href"), imported.attr("rel"));
            }

            print("\nLinks: (%d)", anchors.size());
            for (Element anchor : anchors) {
                final String label = trim(anchor.text(), 35);
                print(" * a: <%s>  (%s)", anchor.attr("abs:href"), label);
            }
        }

        /** Formats {@code msg} with {@code args} and writes it as one line to standard output. */
        private static void print(String msg, Object... args) {
            final String rendered = String.format(msg, args);
            System.out.println(rendered);
        }

        /**
         * Shortens {@code s} to {@code width} characters: when it is longer, the
         * first {@code width - 1} characters are kept and a "." is appended.
         */
        private static String trim(String s, int width) {
            return s.length() > width ? s.substring(0, width - 1) + "." : s;
        }
    }
    package jsoup;
    
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    
    public class Link {
    
    	public static void main(String[] args) {
    		String html = "

    An example link.

    "; Document doc = Jsoup.parse(html);//解析HTML字符串返回一个Document实现 Element link = doc.select("a").first();//查找第一个a元素 String text = doc.body().text(); // "An example link"//取得字符串中的文本 String linkHref = link.attr("href"); // "http://example.com/"//取得链接地址 String linkText = link.text(); // "example""//取得链接地址中的文本 String linkOuterH = link.outerHtml(); // "example" String linkInnerH = link.html(); // "example"//取得链接内的html内容 System.out.println(text); System.out.println(linkHref); System.out.println(linkText); System.out.println(linkOuterH); System.out.println(linkInnerH); } }



    你可能感兴趣的:(java)