jsoup page-scraping test
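The listing below imports com.xy.entity.INewsData, which the original post does not include. Judging from the getters and setters the code calls, a minimal bean along the following lines would be enough to compile it; the field names and the fact that it is a plain class rather than an interface are assumptions here.

package com.xy.entity;

/**
 * Minimal sketch of the INewsData bean assumed by the scraper below:
 * one scraped news item with its link, title and date string.
 * The real class may carry more fields; only these three are used here.
 */
public class INewsData {

	private String ahref;    // href taken from the <a> tag
	private String title;    // link text / inner HTML used as the title
	private String datetime; // date string taken from the matching <td> cell

	public String getAhref() { return ahref; }
	public void setAhref(String ahref) { this.ahref = ahref; }

	public String getTitle() { return title; }
	public void setTitle(String title) { this.title = title; }

	public String getDatetime() { return datetime; }
	public void setDatetime(String datetime) { this.datetime = datetime; }
}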

package com.xy.xmweb.Controller;
/**
 * Utility class for page scraping.
 */

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.xy.entity.INewsData;


public class JsoupFirstExtract {

	/**
	 * @param args
	 */
	public static void main(String[] args) {
		//parseHtml();		
		//parseBody();
		//parseUrl();
		System.out.println("=========================================");
		System.out.println("=========================================");
		System.out.println("=========================================");
		System.out.println("=========================================");
		//navigation();		
		//extractElement();
//		navigation();
		
		try {
			String httpContent = JsoupFirstExtract.clawer2("http://www1.xy.com/myoffice/news.do?optType=1&pageNo=1&pageSize=10");

			// parse the fetched content with jsoup and work with the page elements
			// just like an HTML DOM document
			Document doc = Jsoup.parse(httpContent, "http://www1.xy.com/");
			Element body = doc.body();
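			// take the first <td> cell on the page and collect the <a> tags inside it;
			// absUrl("href") resolves relative links against the base URI passed to parse() above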
			Element firstTd = body.select("td").first();
			Elements links = firstTd.getElementsByTag("a");
			for (Element element : links) {
				String linkAbsHref = element.absUrl("href");
				String linkText = element.text();
				System.out.println("linkAbsHref=:"+linkAbsHref);
				System.out.println(""+linkText+"");
			}
			
		} catch (Exception e) {
			e.printStackTrace();
		}
		
		int pageSize = 10;

		try {			
			//http://www1.xy.com/myoffice/news.do?optType=2&pageNo=78&pageSize=10
			Document doc = Jsoup.connect("http://www1.xy.com/myoffice/news.do?optType=2&pageNo=78&pageSize="+pageSize).timeout(10000).get(); 
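			// "a[href]" matches every anchor that actually has an href attribute;
			// its count caps pageSize so the loop below stays in bounds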
			Elements as = doc.select("a[href]");
			System.out.println(as.size());
			if(pageSize > as.size()){
				pageSize = as.size();
			}
//			for (Element a : as) {
//				System.out.println(a.attr("href") + "###" + a.html()); 
//			}
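			// "td:not([title])" keeps only the table cells without a title attribute
			// (judging by getIntfaceData below, these are the cells holding the date strings)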
			Elements tds = doc.select("td:not([title])");
//			for (Element td : tds) {
//				System.out.println(td.html()); 
//			}
			for (int i = 0; i < pageSize; i++) {
				System.out.println(as.get(i).attr("href") + "###" + as.get(i).html() + "###" + tds.get(i).html());
			}
		} catch (IOException e) {
			e.printStackTrace();
		}

		List<INewsData> list = getIntfaceData("http://www1.xy.com/myoffice/news.do?optType=2&pageNo=78&pageSize=", 10);
		if (list != null && list.size() > 0) {
			for (int i = 0; i < list.size(); i++) {
				INewsData newsData = list.get(i);
				System.out.println("=============newDate----getAhref-----:"+newsData.getAhref());
				System.out.println("=============newDate----getDatetime-----:"+newsData.getDatetime());
				System.out.println("=============newDate----getTitle-----:"+newsData.getTitle());
			}
		}
		
	}
	
	
	public static List<INewsData> getIntfaceData(String url, int pageSize) {
		
		List<INewsData> list = new ArrayList<INewsData>();
		try {
			//Document docconect = Jsoup.connect("http://www1.xy.com/myoffice/news.do?optType=2&pageNo=78&pageSize="+pageSize).timeout(10000).get();
			Document doc = Jsoup.connect(url+pageSize).timeout(10000).get();
//			Document doc = Jsoup.parse(docconect.toString(),"http://www1.xy.com/");
			Elements as = doc.select("a[href]");
			//System.out.println("====== number of links =====" + as.size());
			if(pageSize > as.size()){
				pageSize = as.size();
			}
			Elements tds = doc.select("td:not([title])");
			for (int i = 0; i < pageSize; i++) {
				// one INewsData per link: href, link text as the title, matching <td> as the date
				INewsData newsData = new INewsData();
				newsData.setAhref(as.get(i).attr("href"));
				newsData.setTitle(as.get(i).html());
				newsData.setDatetime(tds.get(i).html());
				list.add(newsData);
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
		return list;
	}

	public static void parseHtml() {
		String html = "<html><head><title>First parse</title></head>"
				+ "<body><p>Parsed HTML into a doc.</p></body></html>";
		Document doc = Jsoup.parse(html);
		System.out.println(doc);
		System.out.println("Print the html head --------------------");
		System.out.println(doc.head());
		System.out.println("Print the html body --------------------");
		System.out.println(doc.body());
		System.out.println("Print the html title --------------------");
		System.out.println(doc.title());
	}

	public static void parseBody() {
		String html = "<div><p>Lorem ipsum.</p>";
		Document doc = Jsoup.parseBodyFragment(html);
		Element body = doc.body();
		System.out.println("Print the body --------------------");
		System.out.println(body);
	}

	public static void parseUrl() {
		try {
			Document doc = Jsoup.connect("http://www1.xy.com/myoffice/news.do?optType=1&pageNo=1&pageSize=10").get();
			System.out.println("Print the Url --------------------");
			System.out.println(doc);
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	public static void navigation() {
		// the original href values were lost when the post was scraped; "#" is only a placeholder
		String html = "<html><head><title>First parse</title></head>"
				+ "<body><p>Parsed HTML into a doc.</p>"
				+ "<div id=\"content\"><a href=\"#\">hahaha</a>"
				+ " <a href=\"#\">bababa</a></div>"
				+ "</body></html>";
		Document doc = Jsoup.parse(html, "http://192.168.3.84/gamestore/index.html");
		Element content = doc.getElementById("content");
		Elements links = content.getElementsByTag("a");
		for (Element link : links) {
			String linkHref = link.attr("href");
			String linkAbsHref = link.absUrl("href");
			String linkText = link.text();
			System.out.println(linkHref);
			System.out.println(linkAbsHref);
			System.out.println(linkText);
		}
	}

	public static void extractElement() {
		String html = "<p>An <a href=\"http://example.com/\"><b>example</b></a> link.</p>";
		Document doc = Jsoup.parse(html);
		Element link = doc.select("a").first();
		String text = doc.body().text();      // "An example link."
		String linkHref = link.attr("href");  // "http://example.com/"
		String linkText = link.text();        // "example"
		String linkOuterH = link.outerHtml(); // "<a href=\"http://example.com/\"><b>example</b></a>"
		String linkInnerH = link.html();      // "<b>example</b>"
		System.out.println(text);
		System.out.println(linkHref);
		System.out.println(linkText);
		System.out.println(linkOuterH);
		System.out.println(linkInnerH);
	}

	/**
	 * When a page contains nested redirect links, fetching it can fail with a
	 * "Server redirected too many times" error: code inside the page keeps forwarding
	 * to other pages and the loop eventually aborts the program. If you only want the
	 * content of the given URL itself, without following any further redirects,
	 * the code below can be used.
	 * @param myurl
	 * @throws Exception
	 */
	public static String clawer2(String myurl) throws Exception {
		URL urlmy = new URL(myurl);
		HttpURLConnection con = (HttpURLConnection) urlmy.openConnection();
		// only this connection is told not to follow redirects
		con.setInstanceFollowRedirects(false);
		con.connect();
		BufferedReader br = new BufferedReader(new InputStreamReader(con.getInputStream(), "UTF-8"));
		String s = "";
		StringBuffer sb = new StringBuffer("");
		while ((s = br.readLine()) != null) {
			sb.append(s + "\r\n");
		}
		br.close();
		return sb.toString();
	}
}

Reposted from: https://juejin.im/post/5aad0a7bf265da23793bedcc
