MinerUtil.java 爬虫工具类

阅读更多

MinerUtil.java 爬虫工具类

package com.iteye.injavawetrust.miner;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TimeZone;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * 爬虫工具类
 * @author InJavaWeTrust
 *
 */
public class MinerUtil {
	
	private static final Log LOG = LogFactory.getLog(MinerUtil.class);
	
	public static long starTime = 0;
	
	/**
	 * Tells whether a string is {@code null}, empty, or consists only of
	 * characters that {@link String#trim()} strips.
	 *
	 * @param param the string to inspect; may be {@code null}
	 * @return {@code true} if the string is blank, {@code false} otherwise
	 */
	public static boolean isBlank(String param) {
		if (param == null) {
			return true;
		}
		return param.trim().length() == 0;
	}
	
	/**
	 * Tells whether a URL ends with the {@code .html} extension.
	 * <p>
	 * The comparison is case-sensitive and looks only at the very end of the
	 * string, so a URL carrying a query string or fragment after
	 * {@code .html} is not accepted.
	 *
	 * @param url the URL to inspect; may be {@code null}
	 * @return {@code true} if {@code url} ends with {@code .html};
	 *         {@code false} otherwise, including for {@code null}
	 */
	public static boolean checkURL(String url) {
		// A null URL used to raise NullPointerException; treat it as "not an html URL".
		if (url == null) {
			return false;
		}
		// Requiring the dot keeps the bare string "html" (no extension at all)
		// from being accepted, which the substring-after-last-dot form allowed.
		return url.endsWith(".html");
	}
	/**
	 * Tells whether a string contains at least one of the given substrings.
	 * <p>
	 * NOTE(review): the earlier comment labelled {@code key} as the keyword and
	 * {@code keys} as the URL list, but the code searches inside {@code key}
	 * for each element of {@code keys} - presumably {@code key} is the URL and
	 * {@code keys} are the configured keywords; confirm against callers.
	 *
	 * @param key  the string to search in; may be {@code null}
	 * @param keys the substrings to look for; may be {@code null} or empty,
	 *             {@code null} elements are skipped
	 * @return {@code true} if {@code key} contains any element of
	 *         {@code keys}; {@code false} otherwise
	 */
	public static boolean checkKeys(String key, List<String> keys) {
		if (key == null || keys == null) {
			return false;
		}
		for (String candidate : keys) {
			// String.contains(null) throws, so null entries are ignored.
			if (candidate != null && key.contains(candidate)) {
				return true;
			}
		}
		return false;
	}
	
	/**
	 * Checks a candidate file name against a fixed pattern.
	 * <p>
	 * A name is rejected when it is {@code null}, longer than 255 characters,
	 * or does not match the pattern. The pattern requires that the first and
	 * last characters are neither whitespace nor one of backslash, slash,
	 * colon, asterisk, question mark, double quote, less-than, greater-than
	 * or pipe; the last character additionally may not be a dot. Characters
	 * in between may be a plain space or anything allowed in first position.
	 * Because the pattern consumes a separate first and last character, names
	 * shorter than two characters never match.
	 *
	 * @param fileName the name to check; may be {@code null}
	 * @return {@code true} if the name matches, {@code false} otherwise
	 */
	public static boolean isValidFileName(String fileName) {
		final String allowedNamePattern =
				"[^\\s\\\\/:\\*\\?\\\"<>\\|](\\x20|[^\\s\\\\/:\\*\\?\\\"<>\\|])*[^\\s\\\\/:\\*\\?\\\"<>\\|\\.]$";
		boolean withinLength = fileName != null && fileName.length() <= 255;
		return withinLength && fileName.matches(allowedNamePattern);
	}
	
	/**
	 * Fetches a page and collects the {@code href} attribute of every anchor
	 * element that has one.
	 * <p>
	 * The values are returned exactly as written in the page (they are not
	 * resolved against the page URL). The request uses a browser-like
	 * {@code User-Agent} header and a 5000 ms timeout. Any failure while
	 * fetching or parsing is logged and swallowed, so the result then holds
	 * whatever was collected before the failure (usually nothing).
	 *
	 * @param url the address of the page to fetch
	 * @return the distinct {@code href} values found; never {@code null}
	 */
	public static Set<String> getAllUrl(String url){
		Set<String> urls = new HashSet<String>();
		try {
			Connection conn = Jsoup.connect(url);
			// Present a browser User-Agent to the server.
			conn.header("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13");
			Document document = conn.timeout(5000).get();
			Elements anchors = document.select("a[href]");
			for (Element anchor : anchors) {
				urls.add(anchor.attr("href"));
			}
		} catch (Exception e) {
			// Best effort: one unreachable or malformed page must not abort the caller.
			LOG.info("获取URL出现异常,异常URL[" + url + "]");
			// Pass the exception itself so the stack trace is not lost.
			LOG.info("异常信息[" + e.getMessage() + "]", e);
		}
		return urls;
	}
	
	/**
	 * Formats a duration given in milliseconds as {@code hh:mm:ss}.
	 * <p>
	 * Fractions of a second are dropped. The hour field is not limited to
	 * two digits or to 23: a duration of 25 hours yields {@code 25:00:00}
	 * (a date formatter with an hour-of-day field would wrap it to
	 * {@code 01:00:00}). A negative duration is rendered with a leading
	 * minus sign.
	 *
	 * @param ms the duration in milliseconds
	 * @return the duration as {@code hh:mm:ss}
	 */
	public static String msToss(long ms) {
		long totalSeconds = ms / 1000;
		String sign = totalSeconds < 0 ? "-" : "";
		// Dividing first keeps Math.abs safe even for Long.MIN_VALUE input.
		long remaining = Math.abs(totalSeconds);
		long hours = remaining / 3600;
		long minutes = (remaining % 3600) / 60;
		long seconds = remaining % 60;
		return sign + String.format("%02d:%02d:%02d", hours, minutes, seconds);
	}
	
	/**
	 * Writes a page's HTML to a local file, encoded as UTF-8.
	 * <p>
	 * The target directory is {@code MinerConstanits.HTMLPATH + getToday()}
	 * and is created if missing. The file name is the map's {@code "title"}
	 * value and the content is its {@code "html"} value. The title is used
	 * as-is; it is not sanitized here. I/O failures are logged and not
	 * propagated.
	 *
	 * @param map page data holding the keys {@code "title"} and
	 *            {@code "html"}; if the map or either value is {@code null}
	 *            nothing is written
	 */
	public static void getHtmlToLocal(Map<String, String> map){
		String title = (map == null) ? null : map.get("title");
		String html = (map == null) ? null : map.get("html");
		if (title == null || html == null) {
			// Previously a missing "html" left an empty file behind and threw
			// NullPointerException; a missing "title" produced a file named "null".
			LOG.info("Skipping page: \"title\" or \"html\" is missing");
			return;
		}

		// NOTE(review): no File.separator is inserted between HTMLPATH and the
		// date here, whereas main() does insert one - confirm whether HTMLPATH
		// is expected to end with a separator.
		String path = MinerConstanits.HTMLPATH + getToday();
		makeDir(path);
		File target = new File(path + File.separator + title);

		Writer writer = null;
		try {
			writer = new OutputStreamWriter(new FileOutputStream(target), "UTF-8");
			writer.write(html);
			writer.flush();
		} catch (IOException e) {
			// Covers FileNotFoundException as well (it is an IOException).
			LOG.error("Failed to write html file [" + target + "]", e);
		} finally {
			if (writer != null) {
				try {
					writer.close();
				} catch (IOException e) {
					LOG.error("Failed to close html file [" + target + "]", e);
				}
			}
		}
	}
	/**
	 * 文件名不能包含下列任何字符:
* \/:*?"<>| * @param title 标题 * @return 去掉文件名不能包含的字符 */ public static String fileName(String title){ return title .replaceAll("\\\\", "") .replaceAll("/", "") .replaceAll(":", "") .replaceAll("\\*", "") .replaceAll("\\?", "") .replaceAll("\"", "") .replaceAll("<", "") .replaceAll(">", "") .replaceAll("\\|", ""); } /** * 获取当天日期 * @return 当天日期 */ public static String getToday(){ String result = ""; Date date = new Date(); result = format(date); return result; } /** * 格式化日期 * @param date 日期 * @return yyyymmdd 日期 */ public static String format(Date date){ String format = "yyyyMMdd"; SimpleDateFormat fmt = new SimpleDateFormat(format); return fmt.format(date); } /** * 创建存储目录 * @param path 存储目录 */ public static void makeDir(String path) { File file = new File(path); if(!file.exists()){ file.mkdirs(); LOG.info("创建存储目录[" + path + "]"); } } public static boolean checkBeforeStart(MinerConfig config) { if(null == config){ LOG.info("config未配置!!!"); return false; } if(null == config.getKeys() || 0 == config.getKeys().size()){ LOG.info("包含关键字未配置!!!"); return false; } if(null == config.getStoreType()){ LOG.info("存储方式未配置!!!"); return false; } if(config.getMaxDepth() < 1){ LOG.info("爬取页面最大深度配置错误!!!"); return false; } if(config.getMinerHtmlThreadNum() < 1){ LOG.info("下载页面线程数配置错误!!!"); return false; } if(config.getMiseringThreadNum() < 1){ LOG.info("分析页面线程数配置错误!!!"); return false; } if(config.getMinserStoreThreadNum() < 1){ LOG.info("存储线程数配置错误!!!"); return false; } return true; } public static void main(String[] args) { String path = MinerConstanits.HTMLPATH + File.separator + getToday(); makeDir(path); // System.out.println(getToday()); // String test = "http://my.163.com/2015/11/27/17763_578935.html"; // System.out.println(fileName(test)); // System.out.println(MinerUtil.isBlank(null)); // System.out.println(MinerUtil.isBlank("")); // System.out.println(MinerUtil.isBlank(" ")); // System.out.println(MinerUtil.isBlank("bbb")); // System.out.println(MinerUtil.isBlank(" bbb ")); // 
String key = "http://www.jqu.net.cn"; // List keys = new ArrayList(); // keys.add("http://www.jqu.net.cn"); // System.out.println(MinerUtil.checkKeys(key, keys)); } }

 

返回列表

 

你可能感兴趣的:(java,jsoup,网络爬虫)