Java jsoup 网络爬虫学习例子(五):宽度优先

阅读更多

Java jsoup 网络爬虫学习例子(五):宽度优先

 

package com.iteye.injavawetrust.gethtml;

import java.util.Map;
import java.util.Set;

/**
 * Crawler driver: seeds the pending-URL queue with a start URL, then
 * repeatedly takes the next pending URL, downloads the page, stores it on
 * disk and enqueues the links found on it.
 * <p>
 * URLs are processed in whatever order {@link HtmlQueue} hands them out.
 *
 * @author InJavaWeTrust
 */
public class GetHtml {
	
	/** The crawl stops once this many pages have been recorded as visited. */
	private static final int MAX_VISITED = 10000;
	
	private static JsoupUtil ju = JsoupUtil.getInstance();
	
	/**
	 * Crawls starting from {@code url} until the pending queue is empty or
	 * {@link #MAX_VISITED} pages have been recorded as visited.
	 * <p>
	 * Only links that start with {@link Constants#URL} and pass
	 * {@link JsoupUtil#checkURL(String)} are enqueued.
	 *
	 * @param url the start URL
	 */
	public void getHtml(String url){
		ju.initUnvisitedUrl(url);
		// Keep going while there is pending work and the visited cap is not reached.
		while (!HtmlQueue.unVisitedUrlsEmpty() && HtmlQueue.getVisitedUrlNum() < MAX_VISITED) {
			String visitUrl = (String) HtmlQueue.unVisitedUrlDeQueue();
			if (null == visitUrl) {
				continue;
			}
			Map<String, String> map = ju.getHtml(visitUrl);
			// An empty map is how JsoupUtil.getHtml reports a failed download.
			if (map.isEmpty()) {
				continue;
			}
			ju.getHtmlToLocal(map); // write the HTML to a local file
			HtmlQueue.addVisitedUrl(visitUrl); // record the URL as visited
			Set<String> links = ju.getAllUrl(visitUrl); // extract the URLs found on the downloaded page
			for (String link : links) {
				if (!link.startsWith(Constants.URL)) {
					continue;
				}
				if (!ju.checkURL(link)) {
					continue;
				}
				// Enqueue the link; HtmlQueue drops it if it is already known.
				HtmlQueue.addUnvisitedUrl(link);
			}
		}
	}
	
	/**
	 * Runs a crawl from {@link Constants#URL} and prints the number of pages
	 * recorded as visited and the elapsed time.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		GetHtml gh = new GetHtml();
		long startTime = System.currentTimeMillis();
		gh.getHtml(Constants.URL);
		long endTime = System.currentTimeMillis();
		System.out.println("共下载 [" + HtmlQueue.getVisitedUrlNum() + "]");
		System.out.println("用时 [" + ju.msToss(endTime - startTime) + "]");
	}

}


package com.iteye.injavawetrust.gethtml;

import java.util.ArrayDeque;
import java.util.HashSet;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.Set;

/**
 * Static crawl state: the set of URLs already visited and the queue of URLs
 * still waiting to be visited.
 * <p>
 * The pending queue is first-in, first-out, so pages are handed out in the
 * order they were discovered. No synchronization is performed here.
 *
 * @author InJavaWeTrust
 */
public class HtmlQueue {

	/**
	 * URLs that have already been visited.
	 */
	private static Set<String> visitedUrl = new HashSet<String>();
	/**
	 * URLs waiting to be visited, in discovery (FIFO) order. A priority queue
	 * would hand strings out in lexicographic order instead.
	 */
	private static Queue<String> unVisitedUrl = new ArrayDeque<String>();
	/**
	 * Returns the queue of URLs waiting to be visited.
	 * @return the live pending queue (not a copy)
	 */
	public static Queue<String> getUnVisitedUrl() {
		return unVisitedUrl;
	}
	/**
	 * Records a URL as visited.
	 * @param url the URL to record
	 */
	public static void addVisitedUrl(String url) {
		visitedUrl.add(url);
	}
	/**
	 * Removes a URL from the visited set.
	 * @param url the URL to remove
	 */
	public static void removeVisitedUrl(String url) {
		visitedUrl.remove(url);
	}
	/**
	 * Takes the next pending URL off the queue.
	 * @return the oldest pending URL, or {@code null} if the queue is empty
	 */
	public static Object unVisitedUrlDeQueue() {
		return unVisitedUrl.poll();
	}
	/**
	 * Adds a URL to the pending queue so that each URL is visited only once:
	 * {@code null}, blank, already-visited and already-pending URLs are ignored.
	 * @param url the candidate URL
	 */
	public static void addUnvisitedUrl(String url) {
		if (url == null || url.trim().isEmpty()) {
			return;
		}
		if (visitedUrl.contains(url) || unVisitedUrl.contains(url)) {
			return;
		}
		unVisitedUrl.add(url);
	}
	/**
	 * Returns the number of URLs recorded as visited.
	 * @return size of the visited set
	 */
	public static int getVisitedUrlNum() {
		return visitedUrl.size();
	}
	/**
	 * Tells whether the pending queue is empty.
	 * @return {@code true} if empty, {@code false} otherwise
	 */
	public static boolean unVisitedUrlsEmpty() {
		return unVisitedUrl.isEmpty();
	}

}


package com.iteye.injavawetrust.gethtml;

/**
 * Fixed settings used by the crawler.
 *
 * @author InJavaWeTrust
 */
public class Constants {
	
	/** Start URL of the crawl; also used as the prefix a link must have to be followed. */
	public static final String URL = "http://www.jqu.net.cn";
	
	/** Directory (with trailing separator) that downloaded pages are written into. */
	public static final String HTMLPATH = "E:\\InJavaWeTrust\\jsoup\\html\\";

}


package com.iteye.injavawetrust.gethtml;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.text.SimpleDateFormat;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TimeZone;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Helper around jsoup for the crawler: downloads pages, extracts the URLs
 * they reference, writes pages to disk and formats elapsed time.
 * <p>
 * A single instance is created eagerly and obtained via {@link #getInstance()}.
 *
 * @author InJavaWeTrust
 */
public class JsoupUtil {
	
	/** Connection timeout passed to jsoup, in milliseconds. */
	private static final int TIMEOUT_MS = 5000;
	
	private JsoupUtil() {
		
	}
	
	private static final JsoupUtil instance = new JsoupUtil();
	
	/**
	 * Returns the shared instance.
	 * @return the single {@code JsoupUtil} instance
	 */
	public static JsoupUtil getInstance() {
		return instance;
	}
	
	/**
	 * Seeds the pending-URL queue.
	 * @param url the URL to enqueue
	 */
	public void initUnvisitedUrl(String url) {
		HtmlQueue.addUnvisitedUrl(url);
	}
	
	/**
	 * Downloads the page at {@code url} and collects the URLs referenced by
	 * its {@code a[href]}, {@code img[src]}, {@code option[value]} and
	 * {@code link[href]} elements.
	 * <p>
	 * Relative references are resolved against the page's base URI; a value
	 * that cannot be resolved is returned as written in the page.
	 *
	 * @param url the page to scan
	 * @return the referenced URLs; empty if the download failed
	 */
	public Set<String> getAllUrl(String url){
		Set<String> urls = new HashSet<String>();
		try {
			Document document = Jsoup.connect(url).timeout(TIMEOUT_MS).get();
			collectAttr(document, "a[href]", "href", urls);
			collectAttr(document, "img[src]", "src", urls);
			collectAttr(document, "option[value]", "value", urls);
			collectAttr(document, "link[href]", "href", urls);
		} catch (IOException e) {
			e.printStackTrace();
		}
		return urls;
	}
	
	/**
	 * Adds the given attribute of every element matching {@code selector} to {@code urls}.
	 */
	private static void collectAttr(Document document, String selector,
			String attribute, Set<String> urls) {
		for (Element element : document.select(selector)) {
			// absUrl returns "" when the value cannot be made absolute;
			// fall back to the raw attribute so nothing is lost.
			String resolved = element.absUrl(attribute);
			urls.add(resolved.isEmpty() ? element.attr(attribute) : resolved);
		}
	}
	
	/**
	 * Downloads the page at {@code url}.
	 *
	 * @param url the page to download
	 * @return a map with key {@code "html"} (page markup) and key
	 *         {@code "title"} (the URL with every {@code /} and {@code :}
	 *         removed, used as the local file name); empty if the download failed
	 */
	public Map<String, String> getHtml(String url){
		Map<String, String> map = new HashMap<String, String>();
		try {
			Document document = Jsoup.connect(url).timeout(TIMEOUT_MS).get();
			map.put("html", document.html());
			map.put("title", url.replace("/", "").replace(":", ""));
		} catch (IOException e) {
			// Failure is reported on stdout and signalled to the caller by the empty map.
			System.out.println("This is html has exception [" + url + "]");
			System.out.println(e.getMessage());
		}
		return map;
		
	}
	
	/**
	 * Tells whether the text after the last {@code '.'} in {@code url} is
	 * exactly {@code html}.
	 *
	 * @param url the URL to test; {@code null} yields {@code false}
	 * @return {@code true} if so, {@code false} otherwise
	 */
	public boolean checkURL(String url) {
		if (url == null) {
			return false;
		}
		return "html".equals(url.substring(url.lastIndexOf('.') + 1));
	}
	
	/**
	 * Writes a downloaded page to a local file under {@link Constants#HTMLPATH},
	 * encoded as UTF-8. I/O failures are printed, not thrown.
	 *
	 * @param map a map as produced by {@link #getHtml(String)}: key
	 *            {@code "title"} is the file name, key {@code "html"} the
	 *            content; nothing is written if either is missing
	 */
	public void getHtmlToLocal(Map<String, String> map){
		String title = map.get("title");
		String html = map.get("html");
		if (title == null || html == null) {
			return;
		}
		File target = new File(Constants.HTMLPATH + title);
		File parent = target.getParentFile();
		if (parent != null && !parent.exists()) {
			// Result deliberately ignored: if creation fails, opening the
			// stream below reports it as FileNotFoundException.
			parent.mkdirs();
		}
		Writer writer = null;
		try {
			writer = new OutputStreamWriter(new FileOutputStream(target), "UTF-8");
			writer.write(html);
			writer.flush();
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (writer != null) {
				try {
					writer.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}
	
	/**
	 * Formats a duration in milliseconds as {@code hh:mm:ss}.
	 * <p>
	 * The hour field is not wrapped at 24, so long runs are reported
	 * correctly; negative input is treated as zero.
	 *
	 * @param ms duration in milliseconds
	 * @return the duration as {@code hh:mm:ss}
	 */
	public String msToss(long ms) {
		long totalSeconds = Math.max(0L, ms) / 1000L;
		long hours = totalSeconds / 3600L;
		long minutes = (totalSeconds % 3600L) / 60L;
		long seconds = totalSeconds % 60L;
		return String.format("%02d:%02d:%02d", hours, minutes, seconds);
	}
	
}

 

 

 

运行结果:

 

This is html has exception [http://www.jqu.net.cn/node/1166/10483.html]

404 error loading URL http://www.jqu.net.cn/node/1166/10483.html

This is html has exception [http://www.jqu.net.cn/node/459/16310.html]

404 error loading URL http://www.jqu.net.cn/node/459/16310.html

This is html has exception [http://www.jqu.net.cn/node/459/16310.html]

404 error loading URL http://www.jqu.net.cn/node/459/16310.html

This is html has exception [http://www.jqu.net.cn/node/858/16309.html]

404 error loading URL http://www.jqu.net.cn/node/858/16309.html

共下载 [3537]

用时 [00:04:20]

 

你可能感兴趣的:(java,jsoup,网络爬虫,InJavaWeTrust)