// jsouptest

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;


public class T {

	/**
	 * Captures the HTML between the {@code NEWSZW_HZH_BEGIN} and
	 * {@code NEWSZW_HZH_END} marker comments. Case-insensitive, and {@code .}
	 * also matches line terminators. Compiled once because the same pattern is
	 * applied to every article.
	 */
	private static final Pattern BODY_PATTERN =
			Pattern.compile("(?si)<!--NEWSZW_HZH_BEGIN-->(.+?)<!--NEWSZW_HZH_END-->");

	/**
	 * Downloads the listing page, follows every link found under
	 * {@code div.main_l_l div.list_body}, and prints each linked article's
	 * title and content to standard output.
	 *
	 * <p>A failure while fetching a single article is reported on standard
	 * error and the remaining links are still processed.
	 *
	 * @param args command-line arguments (not used)
	 * @throws IOException if the listing page itself cannot be fetched
	 */
	public static void main(String[] args) throws IOException {
		Document doc = Jsoup.connect("http://www.xxxx.net/new/new_1.htm").get();

		for (Element listBlock : doc.select("div.main_l_l")) {
			for (Element link : listBlock.select("div.list_body a")) {
				// absUrl resolves relative hrefs against the page's base URI;
				// it yields "" when no absolute URL can be built.
				String href = link.absUrl("href");
				if (href.isEmpty()) {
					continue;
				}
				System.out.println("Start:" + href);
				try {
					printArticle(href);
				} catch (IOException e) {
					// Keep crawling: one unreachable article should not end the run.
					System.err.println("Failed to fetch " + href + ": " + e);
				}
			}
		}
	}

	/**
	 * Fetches one article page and prints its title and content.
	 * Articles lacking an {@code h1} or an {@code #art_content} element are
	 * skipped with a note on standard error.
	 *
	 * @param href absolute URL of the article
	 * @throws IOException if the article page cannot be fetched
	 */
	private static void printArticle(String href) throws IOException {
		Document art = Jsoup.connect(href).get();
		String title = firstHtml(art, "h1");
		String content = firstHtml(art, "#art_content");
		if (title == null || content == null) {
			System.err.println("Skipped (missing title or content): " + href);
			return;
		}

		System.out.println("*************title********************");
		System.out.println(title);
		System.out.println("*************content********************");
		System.out.println(extractBody(content));
	}

	/**
	 * Returns the inner HTML of the first element matching {@code cssQuery},
	 * or {@code null} when nothing matches.
	 */
	private static String firstHtml(Document page, String cssQuery) {
		Element match = page.select(cssQuery).first();
		return match == null ? null : match.html();
	}

	/**
	 * Returns the text captured by {@link #BODY_PATTERN} in {@code html}.
	 * When several marker pairs are present the last one wins; when none is
	 * present the input is returned unchanged.
	 */
	private static String extractBody(String html) {
		String body = html;
		Matcher m = BODY_PATTERN.matcher(html);
		while (m.find()) {
			body = m.group(1);
		}
		return body;
	}
}

// You may also be interested in: JSoup