基于Spindle的增强HTTP Spider

   构建于lucene之上的可用的Java开源Spider少之又少,spindle长期没有更新且功能不够完善,故而自己参考其源代码重新编写了一个可扩展的WebCrawler,本着开源共享,共同进步的想法发布于此,期冀得到大家的批评指正,有任何意见及建议均可Email联系我 ([email protected])
   以下代码基于lucene-2.3.1,htmlparser-1.6,je-analysis-1.5.3,以及自己修改过的cpdetector-1.0.5;
下载地址分别为
htmlparser: http://sourceforge.net/project/showfiles.php?group_id=24399
je-analysis: http://www.jesoft.cn/je-analysis-1.5.3.jar
lucene就不用说了,cpdetector-1.0.5见附件.
(应部分网友要求,把所用到的工具打包成一个spider.rar,方便下载测试)
spindle的官方站点: http://www.bitmechanic.com/projects/spindle/
主类SiteCapturer代码如下:
package com.huizhi.kanine.util;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.HashSet;

import jeasy.analysis.MMAnalyzer;

import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.BaseHrefTag;
import org.htmlparser.tags.FrameTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.util.EncodingChangeException;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Multi-threaded site crawler built on Lucene + HTMLParser (derived from spindle).
 * Workers pull URLs from a shared FIFO queue, parse each page for new links,
 * and either index the text content (Lucene) and/or mirror the page to disk.
 *
 * @author 张波
 * E-mail:[email protected]
 * Created On : 2008-03-30
 * Updated On : 2008-04-06
 */

public class SiteCapturer implements Runnable {

	/* Base (start) URL of the crawl. */
	protected URL baseURL = null;

	/* Directory where index files or mirrored pages are stored. */
	protected String indexDir = null;

	/**
	 * Queue of URLs awaiting processing; every newly discovered link is
	 * appended here and consumed in First-In First-Out order.
	 */
	protected ArrayList URLs = new ArrayList();

	/* URLs already handled, used to avoid crawling the same link twice. */
	protected HashSet indexedURLs = new HashSet();

	/* FIX: removed stray second semicolon after the initializer. */
	protected Parser parser = new Parser();

	/* Number of worker threads, default 2. */
	protected int threads = 2;

	/* IndexWriter backed by the on-disk directory. */
	protected IndexWriter FSDWriter;

	/* IndexWriter backed by RAM; flushed into FSDWriter in batches. */
	protected IndexWriter RAMWriter;

	protected IndexSearcher indexSearcher;

	protected RAMDirectory ramDirectory = new RAMDirectory();

	/* Analyzer used to tokenize page content (Chinese word segmentation). */
	protected Analyzer luceneAnalyzer = new MMAnalyzer();

	/* Character encoding used when parsing pages. */
	protected String charset;

	/* Port of the base URL; links on other ports are ignored. */
	protected int basePort;

	/* Host of the base URL; links on other hosts are ignored. */
	protected String baseHost;
	
	/* Whether to build a Lucene index, default true. */
	protected boolean justIndex = true;

	/* Whether to mirror pages to disk, default false. */
	protected boolean justCopy = false;

	/* Whether to check the existing index for the current URL before
	 * re-indexing it (expensive, so off by default). */
	protected boolean isRepeatedCheck = false;

	/* Lock guarding all index write operations across worker threads. */
	public static final Object indexLock = new Object();

	public static Logger logger = Logger
			.getLogger(SiteCapturer.class.getName());

	public SiteCapturer() {
		/* Register tag subclasses so the parser queues/rewrites links for us. */
		PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
		factory.registerTag(new LocalLinkTag());
		factory.registerTag(new LocalFrameTag());
		factory.registerTag(new LocalBaseHrefTag());
		parser.setNodeFactory(factory);
	}

	/**
	 * Entry point: seeds the URL queue, opens the IndexWriters, starts the
	 * worker threads and joins them, then merges the in-memory index segment
	 * into the on-disk index and optimizes it for retrieval.
	 */
	public void capture() {

		URLs.clear();
		URLs.add(getBaseURL());

		int responseCode = 0;
		String contentType = "";

		PropertyConfigurator.configure("/log4j.properties");
		
		try {
			HttpURLConnection uc = (HttpURLConnection) baseURL.openConnection();
			responseCode = uc.getResponseCode();
			contentType = uc.getContentType();
		} catch (MalformedURLException mue) {
			logger.error("Invalid URL : " + getBaseURL());
		} catch (UnknownHostException uhe) {
			logger.error("UnknowHost : " + getBaseURL());
		} catch (SocketException se) {
			logger.error("Socket Error : " + se.getMessage() + " "
					+ getBaseURL());
		} catch (IOException ie) {
			logger.error("IOException : " + ie);
		}
		
		/* FIX: getContentType() may return null; guard before startsWith(). */
		if (responseCode == HttpURLConnection.HTTP_OK && contentType != null
				&& contentType.startsWith("text/html")) {
			
			charset = ParserUtils.autoDetectCharset(baseURL);

			basePort = baseURL.getPort();
			baseHost = baseURL.getHost();
			/* Workaround: the detector frequently misreports GBK pages as
			 * windows-1252, so treat that answer as GBK. */
			if (charset.equals("windows-1252"))
				charset = "GBK";
			
			/* Location of the index files. */
			File indexDirectory = new File(indexDir);
			/* true = build a fresh index, false = append to an existing one. */
			boolean flag = true;
			if (!indexDirectory.exists()) {
				/* Create the directory if it does not exist yet. */
				indexDirectory.mkdir();
			} else if (IndexReader.indexExists(indexDirectory)) {
				/* An index already exists: append to it, after clearing any
				 * stale write lock left by a previous crash. */
				flag = false;
				File lockfile = new File(indexDirectory + File.separator
						+ "write.lock");
				if (lockfile.exists())
					lockfile.delete();
			}
			try {
				if (justIndex) {
					FSDWriter = new IndexWriter(indexDirectory, luceneAnalyzer,
							flag);
					RAMWriter = new IndexWriter(ramDirectory, luceneAnalyzer,
							true);
					if (isRepeatedCheck) {
						IndexReader indexReader = IndexReader.open(indexDir);
						indexSearcher = new IndexSearcher(indexReader);
					}
				}
				long start = System.currentTimeMillis();
				ArrayList threadList = new ArrayList();
				for (int i = 0; i < threads; i++) {
					Thread t = new Thread(this, "K-9 Spider Thread #" + (i + 1));
					t.start();
					threadList.add(t);
				}
				/* Wait for every worker to finish. */
				while (threadList.size() > 0) {
					Thread child = (Thread) threadList.remove(0);
					try {
						child.join();
					} catch (InterruptedException ie) {
						logger.error("InterruptedException : " + ie);
					}
				}
				long elapsed = System.currentTimeMillis() - start;
				if (justIndex) {
					/* The RAM-side writer must be closed before its directory
					 * is merged into the disk index. */
					RAMWriter.close();
					FSDWriter.addIndexes(new Directory[] { ramDirectory });
					FSDWriter.optimize();
					FSDWriter.close();
				}
				logger.info("Finished in " + (elapsed / 1000) + " seconds");
				logger.info("The Count of the Links Captured is "
						+ indexedURLs.size());
			} catch (CorruptIndexException cie) {
				logger.error("CorruptIndexException : " + cie);
			} catch (LockObtainFailedException lofe) {
				logger.error("LockObtainFailedException : " + lofe);
			} catch (IOException ie) {
				logger.error("IOException : " + ie);
			}
		}
	}

	/** Worker loop: keep dequeuing URLs until the queue is drained. */
	public void run() {
		String url;
		while ((url = dequeueURL()) != null) {
			if (justIndex)
				process(url);
		}
		/* FIX: no extra "threads--" here. dequeueURL() already decrements the
		 * counter once per exiting worker; decrementing again drove the field
		 * negative and broke any subsequent capture() run. */
	}

	/**
	 * Decides whether a discovered link should be crawled: it must live on the
	 * same host and port as the base URL, answer 200 OK, and be of type
	 * text/html or text/plain.
	 */
	public boolean isToBeCaptured(String url) {
		boolean flag = false;

		HttpURLConnection uc = null;
		int responseCode = 0;
		String contentType = "";
		String host = "";
		int port = 0;
		
		try {
			URL source = new URL(url);
			String protocol = source.getProtocol();
			if (protocol != null && protocol.equals("http")) {
				host = source.getHost();
				port = source.getPort();
				uc = (HttpURLConnection) source.openConnection();
				uc.setConnectTimeout(8000);
				responseCode = uc.getResponseCode();
				contentType = uc.getContentType();
			}
		} catch (MalformedURLException mue) {
			logger.error("Invalid URL : " + url);
		} catch (UnknownHostException uhe) {
			logger.error("UnknowHost : " + url);
		} catch (SocketException se) {
			logger.error("Socket Error : " + se.getMessage() + " " + url);
		} catch (SocketTimeoutException ste) {
			logger.error("Socket Connection Time Out : " + url);
		} catch (FileNotFoundException fnfe) {
			logger.error("broken link " + url + " ignored");
		} catch (IOException ie) {
			logger.error("IOException : " + ie);
		}
		/* FIX: getContentType() may return null; guard before startsWith(). */
		if (port == basePort
				&& responseCode == HttpURLConnection.HTTP_OK
				&& host.equals(baseHost)
				&& contentType != null
				&& (contentType.startsWith("text/html") || contentType
						.startsWith("text/plain")))
			flag = true;
		return flag;
	}

	/**
	 * Removes and returns the next URL from the queue. Parsing the page here
	 * (inside the lock) appends any newly discovered links to the queue.
	 * Returns null when the crawl is complete and the worker should exit.
	 */
	public synchronized String dequeueURL() {
		while (true)
			if (URLs.size() > 0) {
				String url = (String) URLs.remove(0);
				indexedURLs.add(url);
				if (isToBeCaptured(url)) {
					NodeList list;
					try {
						/* Links queued past this index were discovered by the
						 * current page; used for robots nofollow rollback. */
						int bookmark = URLs.size();
						/* Collect every node of the page. */
						parser.setURL(url);
						try {
							list = new NodeList();
							for (NodeIterator e = parser.elements(); e
									.hasMoreNodes();)
								list.add(e.nextNode());
						} catch (EncodingChangeException ece) {
							/* The parser detected a different encoding
							 * mid-stream; restart with the new one. */
							parser.reset();
							list = new NodeList();
							for (NodeIterator e = parser.elements(); e
									.hasMoreNodes();)
								list.add(e.nextNode());
						}
						/* Mirror static pages (no query string) if requested. */
						if (-1 == url.indexOf("?") && justCopy)
							copy(url, list);
						/**
						 * Honor the Robots <META> tag, per
						 * http://www.robotstxt.org/wc/meta-user.html
						 */
						NodeList robots = list
								.extractAllNodesThatMatch(
										new AndFilter(new NodeClassFilter(
												MetaTag.class),
												new HasAttributeFilter("name",
														"robots")), true);
						if (0 != robots.size()) {
							MetaTag robot = (MetaTag) robots.elementAt(0);
							/* FIX: the content attribute may be absent. */
							String content = robot.getAttribute("content");
							if (content != null) {
								content = content.toLowerCase();
								if ((-1 != content.indexOf("none"))
										|| (-1 != content.indexOf("nofollow")))
									/* FIX: remove from the tail backwards; the
									 * original forward loop skipped every
									 * other element as indices shifted. */
									for (int i = URLs.size() - 1; i >= bookmark; i--)
										URLs.remove(i);
							}
						}
					} catch (ParserException pe) {
						logger.error("ParserException : " + pe);
					}
					/* FIX: wake workers blocked in wait() so they can pick up
					 * the links this page just added; without this, idle
					 * workers slept until shutdown and never helped crawl. */
					notifyAll();
					return url;
				}
			} else {
				/* Queue empty: park this worker unless it is the last one. */
				threads--;
				if (threads > 0) {
					try {
						wait();
						threads++;
					} catch (InterruptedException ie) {
						logger.error("InterruptedException : " + ie);
					}
				} else {
					/* Last active worker and nothing queued: crawl is done. */
					notifyAll();
					return null;
				}
			}
	}

	/**
	 * Processes a single URL: parses the page (with auto-detected encoding)
	 * and adds its content to the Lucene index.
	 */
	protected void process(String url) {

		String result[] = null;
		String content = null;
		String title = null;

		/* Duplicate lookup is expensive, hence disabled by default. */
		if (isRepeatedCheck) {
			try {
				TermQuery query = new TermQuery(new Term("url", url));
				Hits hits = indexSearcher.search(query);
				if (hits.length() > 0) {
					logger.info("The URL : " + url
							+ " has already been captured");
				} else {
					result = ParserUtils.parseHtml(url, charset);
				}
			} catch (IOException ie) {
				logger.error("IOException : " + ie);
			}
		} else {
			result = ParserUtils.parseHtml(url, charset);
		}

		/* FIX: parseHtml returns null on fetch/parse failure; the original
		 * dereferenced it unconditionally and threw NullPointerException. */
		if (result != null) {
			content = result[0];
			title = result[1];
		}

		if (content != null && content.trim().length() > 0) {

			Document document = new Document();
			document.add(new Field("content", content, Field.Store.YES,
					Field.Index.TOKENIZED,
					Field.TermVector.WITH_POSITIONS_OFFSETS));
			document.add(new Field("url", url, Field.Store.YES,
					Field.Index.UN_TOKENIZED));
			document.add(new Field("title", title, Field.Store.YES,
					Field.Index.TOKENIZED,
					Field.TermVector.WITH_POSITIONS_OFFSETS));
			document.add(new Field("date", DateTools.timeToString(System
					.currentTimeMillis(), DateTools.Resolution.DAY),
					Field.Store.YES, Field.Index.UN_TOKENIZED));

			synchronized (indexLock) {
				try {
					RAMWriter.addDocument(document);
					/**
					 * Flush the RAM segment to disk once it exceeds the
					 * threshold; buffering in memory avoids frequent IO and
					 * speeds up index creation. The RAM writer must be closed
					 * before its directory is merged.
					 */
					if (RAMWriter.ramSizeInBytes() > 512 * 1024) {
						RAMWriter.close();
						FSDWriter.addIndexes(new Directory[] { ramDirectory });
						RAMWriter = new IndexWriter(ramDirectory,
								luceneAnalyzer, true);
					}
					logger.info("Indexed link : " + url);
				} catch (CorruptIndexException cie) {
					logger.error("CorruptIndexException : " + cie);
				} catch (IOException ie) {
					logger.error("IOException : " + ie);
				}
			}
		}
	}
	
	/** Maps a crawled URL to a relative path under the local mirror root. */
	protected String makeLocalLink(String link, String current) {
		
		String localLink;

		if (link.equals(getBaseURL()))
			localLink = "index.html";
		else if (link.startsWith(getBaseURL())
				&& (link.length() > getBaseURL().length())) {
			localLink = link.substring(getBaseURL().length() + 1);
			/* Extension-less paths are treated as directories. */
			if (-1 == localLink.indexOf("."))
				localLink += "/" + "index.html";
		} else
			localLink = link;

		/* Prefix "../" once per directory level of the referring page. */
		if ((null != current) && link.startsWith(getBaseURL())
				&& (current.length() > getBaseURL().length())) {
			current = current.substring(getBaseURL().length() + 1);
			int i = 0, j;
			while (-1 != (j = current.indexOf('/', i))) {
				localLink = "../" + localLink;
				i = j + 1;
			}
		}
		return localLink;
	}
	
	/** Saves a page to the local disk, preserving the site's structure. */
	protected void copy(String url, NodeList list) {

		File file = new File(indexDir, makeLocalLink(url, ""));
		File dir = file.getParentFile();

		if (!dir.exists())
			dir.mkdirs();
		else if (!dir.isDirectory()) {
			/* A file already occupies the directory name; shunt the page into
			 * a sibling "<name>.content" directory instead. */
			dir = new File(dir.getParentFile(), dir.getName() + ".content");
			if (!dir.exists())
				dir.mkdirs();
			file = new File(dir, file.getName());
		}
		/* FIX: close the writer in finally so a failure while rendering the
		 * node list no longer leaks the file handle. */
		PrintWriter out = null;
		try {
			out = new PrintWriter(new OutputStreamWriter(
					new FileOutputStream(file), charset));
			for (int i = 0; i < list.size(); i++)
				out.print(list.elementAt(i).toHtml());
			logger.info("Captured link : " + url);
		} catch (FileNotFoundException fnfe) {
			logger.error("FileNotFoundException : " + fnfe);
		} catch (UnsupportedEncodingException uee) {
			logger.error("UnsupportedEncodingException : " + uee);
		} finally {
			if (out != null)
				out.close();
		}
	}
	
	/**
	 * Link tag that rewrites the HREF.
	 * The HREF is changed to a local target if it matches the source.
	 */
	class LocalLinkTag extends LinkTag {
		public void doSemanticAction() {
			String link = getLink();
			/* Normalize: drop trailing slash and any fragment. */
			if (link.endsWith("/"))
				link = link.substring(0, link.length() - 1);
			int pos = link.indexOf("#");
			if (pos != -1)
				link = link.substring(0, pos);
			/* Queue the link unless it was already seen or queued. */
			if (!(indexedURLs.contains(link) || URLs.contains(link)))
				URLs.add(link);
			setLink(link);
		}
	}

	/**
	 * Frame tag that rewrites the SRC URLs. The SRC URLs are mapped to local
	 * targets if they match the source.
	 */
	class LocalFrameTag extends FrameTag {
		public void doSemanticAction() {
			String link = getFrameLocation();
			/* Normalize: drop trailing slash and any fragment. */
			if (link.endsWith("/"))
				link = link.substring(0, link.length() - 1);
			int pos = link.indexOf("#");
			if (pos != -1)
				link = link.substring(0, pos);
			/* Queue the link unless it was already seen or queued. */
			if (!(indexedURLs.contains(link) || URLs.contains(link)))
				URLs.add(link);
			setFrameLocation(link);
		}
	}

	/**
	 * Base tag that doesn't show. The toHtml() method is overridden to return
	 * an empty string, effectively shutting off the base reference.
	 */
	class LocalBaseHrefTag extends BaseHrefTag {
		public String toHtml() {
			return ("");
		}
	}

	public static void main(String[] args) {

		SiteCapturer worker = new SiteCapturer();

		if (args.length < 6) {
			System.out
					.println("Usage: -u <start url> -d <index dir> -t <threads> [-r] [-c] [-i]");
			return;
		}

		for (int i = 0; i < args.length; i++) {
			if (args[i].equals("-u"))
				worker.setBaseURL(args[++i]);
			else if (args[i].equals("-d"))
				worker.setIndexDir(args[++i]);
			else if (args[i].equals("-t"))
				worker.setThreads(Integer.parseInt(args[++i]));
			else if (args[i].equals("-r"))
				worker.setIsRepeatedCheck(true);
			else if (args[i].equals("-c"))
				worker.setJustCopy(true);
			else if (args[i].equals("-i"))
				worker.setJustIndex(false);
		}
		if (worker.getThreads() < 1)
			throw new IllegalArgumentException("Invalid number of threads: "
					+ worker.getThreads());

		worker.capture();
		System.exit(0);
	}

	public String getBaseURL() {
		return baseURL.toString();
	}

	public void setBaseURL(String source) {
		/* Strip a trailing slash so URL comparisons are consistent. */
		if (source.endsWith("/"))
			source = source.substring(0, source.length() - 1);
		try {
			baseURL = new URL(source);
		} catch (MalformedURLException e) {
			/* FIX: log the offending argument; the original called
			 * getBaseURL() here, which NPEs when baseURL was never set. */
			logger.error("Invalid URL : " + source);
		}
	}

	public void setIndexDir(String indexDirectory) {
		indexDir = indexDirectory;
	}

	public int getThreads() {
		return threads;
	}

	public void setThreads(int threadCount) {
		threads = threadCount;
	}

	public void setIsRepeatedCheck(boolean check) {
		isRepeatedCheck = check;
	}
	
	public void setJustIndex(boolean justIndex) {
		this.justIndex = justIndex;
	}

	public void setJustCopy(boolean justCopy) {
		this.justCopy = justCopy;
	}
}

工具类ParserUtils代码如下:
package com.huizhi.kanine.util;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.UnknownHostException;
import java.nio.charset.Charset;

import org.htmlparser.Parser;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;

import cpdetector.io.ASCIIDetector;
import cpdetector.io.CodepageDetectorProxy;
import cpdetector.io.JChardetFacade;
import cpdetector.io.ParsingDetector;
import cpdetector.io.UnicodeDetector;

/**
 * Static helpers for the spider: charset auto-detection (cpdetector) and
 * HTML fetching/parsing (HTMLParser) in preparation for indexing.
 */
public class ParserUtils {

	/* Initial capacity of the page-content StringBuffer. */
	public static int TRANSFER_SIZE = 4096;

	/* Line separator of the current platform. */
	public static String lineSep = System.getProperty("line.separator");

	/**
	 * Auto-detects the character encoding of the page at the given URL so
	 * that Chinese text is not garbled. Falls back to the platform default
	 * charset when detection fails.
	 */
	public static String autoDetectCharset(URL url) {

		CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
		/**
		 * ParsingDetector inspects HTML/XML files or character streams for an
		 * encoding declaration; the constructor flag controls whether the
		 * detection process is logged verbosely (false = silent).
		 */
		detector.add(new ParsingDetector(false));
		detector.add(JChardetFacade.getInstance());
		detector.add(ASCIIDetector.getInstance());
		detector.add(UnicodeDetector.getInstance());

		Charset charset = null;
		try {
			charset = detector.detectCodepage(url);
		} catch (MalformedURLException mue) {
			mue.printStackTrace();
		} catch (IOException ie) {
			ie.printStackTrace();
		}
		if (charset == null)
			charset = Charset.defaultCharset();
		return charset.name();
	}

	/**
	 * Fetches and parses a standard HTML page with the given encoding.
	 * Returns { body text, title } ready for indexing, or null when the page
	 * could not be fetched or parsed; the body element may itself be null
	 * when the page has no body.
	 */
	public static String[] parseHtml(String url, String charset) {

		String result[] = null;
		String content = null;

		/* FIX: declare the streams outside try and close them in finally so
		 * an exception mid-read no longer leaks the connection. */
		InputStream in = null;
		BufferedReader reader = null;
		try {
			URL source = new URL(url);
			in = source.openStream();
			reader = new BufferedReader(new InputStreamReader(in, charset));
			String line;
			StringBuffer temp = new StringBuffer(TRANSFER_SIZE);
			while ((line = reader.readLine()) != null) {
				temp.append(line);
				temp.append(lineSep);
			}
			content = temp.toString();
		} catch (UnsupportedEncodingException uee) {
			uee.printStackTrace();
		} catch (MalformedURLException mue) {
			System.err.println("Invalid URL : " + url);
		} catch (UnknownHostException uhe) {
			System.err.println("UnknowHost : " + url);
		} catch (SocketException se) {
			System.err.println("Socket Error : " + se.getMessage() + " " + url);
		} catch (SocketTimeoutException ste) {
			System.err.println("Socket Connection Time Out : " + url);
		} catch (FileNotFoundException fnfe) {
			/* FIX: the original read fnfe.getCause().getMessage(); the cause
			 * is normally null here, so the handler itself threw NPE. */
			System.err.println("broken link " + url + " ignored");
		} catch (IOException ie) {
			ie.printStackTrace();
		} finally {
			try {
				if (reader != null)
					reader.close();
				else if (in != null)
					in.close();
			} catch (IOException ie) {
				ie.printStackTrace();
			}
		}

		if (content != null) {
			Parser myParser = Parser.createParser(content, charset);
			HtmlPage visitor = new HtmlPage(myParser);
			try {
				myParser.visitAllNodesWith(visitor);
				String body = null;
				String title = "Untitled";
				if (visitor.getBody() != null) {
					NodeList nodelist = visitor.getBody();
					body = nodelist.asString().trim();
				}
				if (visitor.getTitle() != null)
					title = visitor.getTitle();
				result = new String[] { body, title };
			} catch (ParserException pe) {
				pe.printStackTrace();
			}
		}
		return result;
	}
}

程序运行可选择控制台或新建一JSP页面,加入以下代码即可
(另,示例代码中log4j的配置文件须放在项目所在磁盘的根目录下;可在capture()
方法的PropertyConfigurator.configure("/log4j.properties")处自由修改)
<%@ page contentType="text/html; charset=UTF-8"%>
<%@ page import="com.huizhi.kanine.util.*"%>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">

<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>Lucene</title>
</head>
<body>
<%-- Demo page: configure a SiteCapturer and run a crawl synchronously.
     Note: capture() blocks until the crawl finishes, so the HTTP response
     is held open for the whole run. --%>
<%
	SiteCapturer worker= new SiteCapturer();
	worker.setBaseURL("http://www.blabla.cn");
	worker.setIndexDir("c:\\luceneIndex");
	//worker.setIsRepeatedCheck(true);// optional: skip links already present in the index
	//worker.setJustCopy(true);// optional: also mirror the pages to local disk
	worker.setThreads(20);
	worker.capture();
%>
</body>
</html>

你可能感兴趣的:(apache,log4j,.net,IE,Lucene)