构建于lucene之上的可用的Java开源Spider少之又少,spindle长期没有更新且功能不够完善,故而自己参考其源代码重新编写了一个可扩展的WebCrawler。本着开源共享、共同进步的想法发布于此,期冀得到大家的批评指正,有任何意见及建议均可Email联系我([email protected])。
以下代码基于lucene-2.3.1,htmlparser-1.6,je-analysis-1.5.3,以及自己修改过的cpdetector-1.0.5;
下载地址分别为
htmlparser:
http://sourceforge.net/project/showfiles.php?group_id=24399
je-analysis:
http://www.jesoft.cn/je-analysis-1.5.3.jar
lucene就不用说了,cpdetector-1.0.5见附件.
(应部分网友要求,把所用到的工具打包成一个spider.rar,方便下载测试)
spindle的官方站点:
http://www.bitmechanic.com/projects/spindle/
主类SiteCapturer代码如下:
package com.huizhi.kanine.util;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.HashSet;
import jeasy.analysis.MMAnalyzer;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.BaseHrefTag;
import org.htmlparser.tags.FrameTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.util.EncodingChangeException;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
/**
* @author 张波
* E-mail:[email protected]
* Created On : 2008-03-30
* Updated On : 2008-04-06
*/
public class SiteCapturer implements Runnable {
/* Base (start) URL of the crawl */
protected URL baseURL = null;
/* Directory where the index files or the captured pages are stored */
protected String indexDir = null;
/**
* Queue of URLs waiting to be parsed; every newly detected link is stored here.
* Entries are taken out linearly in first-in first-out order.
*/
protected ArrayList URLs = new ArrayList();
/* Set of URLs already taken from the queue, kept so that a link is not fetched twice */
protected HashSet indexedURLs = new HashSet();
/* Single htmlparser instance used by dequeueURL() for every page */
protected Parser parser = new Parser();;
/* Number of worker threads, 2 by default; also decremented/incremented by the workers while a crawl runs */
protected int threads = 2;
/* IndexWriter that writes to disk */
protected IndexWriter FSDWriter;
/* IndexWriter that writes to memory */
protected IndexWriter RAMWriter;
/* Searcher used by process() when isRepeatedCheck is enabled */
protected IndexSearcher indexSearcher;
/* In-memory directory backing RAMWriter */
protected RAMDirectory ramDirectory = new RAMDirectory();
/* Analyzer (tokenizer) applied to the page content */
protected Analyzer luceneAnalyzer = new MMAnalyzer();
/* Character encoding used when parsing pages */
protected String charset;
/* Port of the base URL */
protected int basePort;
/* Host of the base URL */
protected String baseHost;
/* Whether pages are indexed, true by default */
protected boolean justIndex = true;
/* Whether pages are saved to disk, false by default */
protected boolean justCopy = false;
/* Whether to look the current URL up in the index first, to avoid capturing it again */
protected boolean isRepeatedCheck = false;
/* Lock that serializes write operations on the index */
public static final Object indexLock = new Object();
public static Logger logger = Logger
.getLogger(SiteCapturer.class.getName());
public SiteCapturer() {
PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
factory.registerTag(new LocalLinkTag());
factory.registerTag(new LocalFrameTag());
factory.registerTag(new LocalBaseHrefTag());
parser.setNodeFactory(factory);
}
/**
 * Entry point of a crawl.
 * <p>
 * Resets the URL queue to the base URL, checks that the base URL answers
 * with HTTP 200 and a text/html content type, detects the page charset,
 * opens the index writers (when indexing is enabled), starts the worker
 * threads and waits for them. Once the workers are done the in-memory index
 * is merged into the on-disk index, which is then optimized and closed.
 * <p>
 * An error is logged and nothing is crawled when the base URL or the index
 * directory has not been set, when the base URL is unreachable or not HTML,
 * or when the index directory cannot be created.
 */
public void capture() {
    PropertyConfigurator.configure("/log4j.properties");
    if (baseURL == null || indexDir == null) {
        logger.error("Base URL and index directory must be set before capture()");
        return;
    }
    URLs.clear();
    URLs.add(getBaseURL());

    int responseCode = 0;
    String contentType = null;
    HttpURLConnection uc = null;
    try {
        uc = (HttpURLConnection) baseURL.openConnection();
        responseCode = uc.getResponseCode();
        contentType = uc.getContentType();
    } catch (MalformedURLException mue) {
        logger.error("Invalid URL : " + getBaseURL());
    } catch (UnknownHostException uhe) {
        logger.error("UnknowHost : " + getBaseURL());
    } catch (SocketException se) {
        logger.error("Socket Error : " + se.getMessage() + " "
                + getBaseURL());
    } catch (IOException ie) {
        logger.error("IOException : " + ie);
    } finally {
        /* The probe connection is only needed for the status line and headers */
        if (uc != null) {
            uc.disconnect();
        }
    }
    /* getContentType() returns null when the server sends no Content-Type header */
    if (responseCode != HttpURLConnection.HTTP_OK || contentType == null
            || !contentType.startsWith("text/html")) {
        logger.warn("Base URL is not a reachable HTML page : " + getBaseURL());
        return;
    }

    charset = ParserUtils.autoDetectCharset(baseURL);
    basePort = baseURL.getPort();
    baseHost = baseURL.getHost();
    /* NOTE(review): windows-1252 is replaced by GBK, presumably because Chinese pages get misdetected - confirm */
    if ("windows-1252".equals(charset)) {
        charset = "GBK";
    }

    /* Location of the index files */
    File indexDirectory = new File(indexDir);
    /* true: build a new index; false: append to the existing one */
    boolean create = true;
    if (!indexDirectory.exists()) {
        /* mkdirs() also creates missing parent directories */
        if (!indexDirectory.mkdirs() && !indexDirectory.isDirectory()) {
            logger.error("Cannot create index directory : " + indexDirectory);
            return;
        }
    } else if (IndexReader.indexExists(indexDirectory)) {
        create = false;
        /* NOTE(review): a left-over write.lock is removed unconditionally; unsafe if another process is writing */
        File lockfile = new File(indexDirectory + File.separator
                + "write.lock");
        if (lockfile.exists()) {
            lockfile.delete();
        }
    }

    /*
     * The workers decrement and increment the threads field while they run,
     * so the configured value is captured here, used as the loop bound and
     * restored afterwards.
     */
    final int configuredThreads = threads;
    IndexWriter openedDiskWriter = null;
    IndexReader indexReader = null;
    boolean indexClosed = false;
    boolean interrupted = false;
    try {
        if (justIndex) {
            FSDWriter = new IndexWriter(indexDirectory, luceneAnalyzer,
                    create);
            openedDiskWriter = FSDWriter;
            RAMWriter = new IndexWriter(ramDirectory, luceneAnalyzer,
                    true);
            if (isRepeatedCheck) {
                indexReader = IndexReader.open(indexDir);
                indexSearcher = new IndexSearcher(indexReader);
            }
        }
        long start = System.currentTimeMillis();
        ArrayList threadList = new ArrayList();
        for (int i = 0; i < configuredThreads; i++) {
            Thread t = new Thread(this, "K-9 Spider Thread #" + (i + 1));
            t.start();
            threadList.add(t);
        }
        while (threadList.size() > 0) {
            Thread child = (Thread) threadList.remove(0);
            try {
                child.join();
            } catch (InterruptedException ie) {
                logger.error("InterruptedException : " + ie);
                interrupted = true;
            }
        }
        long elapsed = System.currentTimeMillis() - start;
        if (justIndex) {
            /* The writer being merged must be closed before addIndexes() */
            RAMWriter.close();
            FSDWriter.addIndexes(new Directory[] { ramDirectory });
            FSDWriter.optimize();
            FSDWriter.close();
            indexClosed = true;
        }
        logger.info("Finished in " + (elapsed / 1000) + " seconds");
        logger.info("The Count of the Links Captured is "
                + indexedURLs.size());
    } catch (CorruptIndexException cie) {
        logger.error("CorruptIndexException : " + cie);
    } catch (LockObtainFailedException lofe) {
        logger.error("LockObtainFailedException : " + lofe);
    } catch (IOException ie) {
        logger.error("IOException : " + ie);
    } finally {
        threads = configuredThreads;
        /* Release the on-disk writer (and its lock) when the normal close was not reached */
        if (openedDiskWriter != null && !indexClosed) {
            try {
                openedDiskWriter.close();
            } catch (IOException ie) {
                logger.error("IOException : " + ie);
            }
        }
        if (indexReader != null) {
            try {
                if (indexSearcher != null) {
                    indexSearcher.close();
                }
                indexReader.close();
            } catch (IOException ie) {
                logger.error("IOException : " + ie);
            }
            indexSearcher = null;
        }
        /* Keep the interruption visible to the caller */
        if (interrupted) {
            Thread.currentThread().interrupt();
        }
    }
}
/**
 * Worker loop: takes URLs from dequeueURL() until it returns null and, when
 * indexing is enabled, passes each one to process(String).
 */
public void run() {
    String url;
    while ((url = dequeueURL()) != null) {
        if (!justIndex) {
            continue;
        }
        try {
            process(url);
        } catch (RuntimeException re) {
            /*
             * A worker that dies here never returns to dequeueURL(), so the
             * shared counter would stay above zero and the remaining
             * workers would block in wait() forever. Log and carry on.
             */
            logger.error("Failed to process " + url + " : " + re);
        }
    }
    /* The counter is otherwise only touched inside the synchronized dequeueURL() */
    synchronized (this) {
        threads--;
    }
}
/**
 * Decides whether a link extracted from a page should be parsed.
 * <p>
 * The link qualifies when it uses the http protocol, answers with HTTP 200,
 * has the same host and port as the base URL and a content type starting
 * with text/html or text/plain. An unspecified port (-1) is treated as 80
 * on both sides, so "http://host/" and "http://host:80/" are considered equal.
 *
 * @param url the absolute URL to test
 * @return true if the URL should be captured; false otherwise, including
 *         when the URL is malformed or the request fails
 */
public boolean isToBeCaptured(String url) {
    HttpURLConnection uc = null;
    int responseCode = 0;
    String contentType = null;
    String host = "";
    int port = 0;
    try {
        URL source = new URL(url);
        if ("http".equals(source.getProtocol())) {
            host = source.getHost();
            port = source.getPort();
            uc = (HttpURLConnection) source.openConnection();
            uc.setConnectTimeout(8000);
            /* This method is called while dequeueURL() holds the instance lock; never block on a stalled server */
            uc.setReadTimeout(8000);
            responseCode = uc.getResponseCode();
            contentType = uc.getContentType();
        }
    } catch (MalformedURLException mue) {
        logger.error("Invalid URL : " + url);
    } catch (UnknownHostException uhe) {
        logger.error("UnknowHost : " + url);
    } catch (SocketException se) {
        logger.error("Socket Error : " + se.getMessage() + " " + url);
    } catch (SocketTimeoutException ste) {
        logger.error("Socket Connection Time Out : " + url);
    } catch (FileNotFoundException fnfe) {
        logger.error("broken link " + url + " ignored");
    } catch (IOException ie) {
        logger.error("IOException : " + ie);
    } finally {
        if (uc != null) {
            uc.disconnect();
        }
    }
    /* getContentType() returns null when the header is missing */
    if (responseCode != HttpURLConnection.HTTP_OK || contentType == null) {
        return false;
    }
    final int defaultHttpPort = 80;
    int effectivePort = (port == -1) ? defaultHttpPort : port;
    int effectiveBasePort = (basePort == -1) ? defaultHttpPort : basePort;
    boolean textual = contentType.startsWith("text/html")
            || contentType.startsWith("text/plain");
    /* Host names are case-insensitive */
    return effectivePort == effectiveBasePort
            && host.equalsIgnoreCase(baseHost) && textual;
}
/**
 * Takes the next URL to work on from the queue.
 * <p>
 * Every dequeued URL is recorded in indexedURLs. A URL that passes
 * isToBeCaptured(String) is parsed with the shared parser (which queues the
 * links found on the page), optionally saved to disk, and returned. When the
 * page carries a robots META tag containing "none" or "nofollow", the links
 * queued while parsing this page are removed again.
 * <p>
 * When the queue is empty the calling worker deregisters itself from the
 * threads counter and waits; it is woken when another worker queues new
 * links or when the last active worker finds the queue empty.
 * <p>
 * The whole method, including the network access, runs under the instance
 * lock because the parser and the queue are shared by all workers.
 *
 * @return the next URL to process, or null when the crawl is finished or
 *         the calling thread was interrupted while waiting
 */
public synchronized String dequeueURL() {
    while (true) {
        if (URLs.size() > 0) {
            String url = (String) URLs.remove(0);
            indexedURLs.add(url);
            if (!isToBeCaptured(url)) {
                continue;
            }
            /* Links queued from here on belong to this page */
            int bookmark = URLs.size();
            try {
                NodeList list = parsePage(url);
                /* Only static pages (no query string) are saved */
                if (-1 == url.indexOf("?") && justCopy) {
                    copy(url, list);
                }
                if (forbidsFollowing(list)) {
                    /* Drop the whole tail in one step; removing by rising index skips every other entry */
                    URLs.subList(bookmark, URLs.size()).clear();
                }
            } catch (ParserException pe) {
                logger.error("ParserException : " + pe);
            }
            /* Wake idle workers so they can pick up the newly queued links */
            if (URLs.size() > 0) {
                notifyAll();
            }
            return url;
        }
        /* Queue is empty: this worker is no longer active */
        threads--;
        if (threads <= 0) {
            /* Last active worker: nothing can be queued any more, release everybody */
            notifyAll();
            return null;
        }
        try {
            wait();
            threads++;
        } catch (InterruptedException ie) {
            logger.error("InterruptedException : " + ie);
            /* Stay deregistered, keep the interrupt flag and let the worker end */
            Thread.currentThread().interrupt();
            return null;
        }
    }
}

/* Parses the page at url with the shared parser; retries once after a reset when the encoding changes mid-parse. */
private NodeList parsePage(String url) throws ParserException {
    parser.setURL(url);
    try {
        return collectNodes();
    } catch (EncodingChangeException ece) {
        parser.reset();
        return collectNodes();
    }
}

/* Drains the parser's node iterator into a NodeList. */
private NodeList collectNodes() throws ParserException {
    NodeList nodes = new NodeList();
    for (NodeIterator e = parser.elements(); e.hasMoreNodes();) {
        nodes.add(e.nextNode());
    }
    return nodes;
}

/*
 * Tells whether the first robots META tag of the page contains "none" or
 * "nofollow" (see http://www.robotstxt.org/wc/meta-user.html).
 */
private boolean forbidsFollowing(NodeList list) throws ParserException {
    NodeList robots = list.extractAllNodesThatMatch(new AndFilter(
            new NodeClassFilter(MetaTag.class), new HasAttributeFilter(
                    "name", "robots")), true);
    if (0 == robots.size()) {
        return false;
    }
    MetaTag robot = (MetaTag) robots.elementAt(0);
    String content = robot.getAttribute("content");
    /* A robots tag without a content attribute restricts nothing */
    if (content == null) {
        return false;
    }
    content = content.toLowerCase();
    return -1 != content.indexOf("none")
            || -1 != content.indexOf("nofollow");
}
/**
 * Handles a single URL: downloads and parses the page with
 * ParserUtils.parseHtml(String, String) and adds it to the Lucene index with
 * the fields "content", "url", "title" and "date".
 * <p>
 * When isRepeatedCheck is set, a URL that is already present in the index
 * is skipped. Pages that cannot be fetched or parsed, or that have no text
 * content, are skipped as well.
 *
 * @param url the absolute URL of the page to index
 */
protected void process(String url) {
    /* This lookup is comparatively expensive, hence disabled by default */
    if (isRepeatedCheck) {
        try {
            TermQuery query = new TermQuery(new Term("url", url));
            Hits hits = indexSearcher.search(query);
            if (hits.length() > 0) {
                logger.info("The URL : " + url
                        + " has already been captured");
                return;
            }
        } catch (IOException ie) {
            logger.error("IOException : " + ie);
            return;
        }
    }

    String[] result = ParserUtils.parseHtml(url, charset);
    /* parseHtml() returns null when the page could not be fetched or parsed */
    if (result == null) {
        logger.warn("Nothing to index for : " + url);
        return;
    }
    String content = result[0];
    String title = result[1];
    if (content == null || content.trim().length() == 0) {
        return;
    }
    if (title == null) {
        title = "Untitled";
    }

    Document document = new Document();
    document.add(new Field("content", content, Field.Store.YES,
            Field.Index.TOKENIZED,
            Field.TermVector.WITH_POSITIONS_OFFSETS));
    document.add(new Field("url", url, Field.Store.YES,
            Field.Index.UN_TOKENIZED));
    document.add(new Field("title", title, Field.Store.YES,
            Field.Index.TOKENIZED,
            Field.TermVector.WITH_POSITIONS_OFFSETS));
    document.add(new Field("date", DateTools.timeToString(System
            .currentTimeMillis(), DateTools.Resolution.DAY),
            Field.Store.YES, Field.Index.UN_TOKENIZED));
    synchronized (indexLock) {
        try {
            RAMWriter.addDocument(document);
            /*
             * Once the in-memory index grows beyond 512 KB it is merged
             * into the on-disk index; buffering in memory avoids frequent
             * disk I/O. The writer being merged has to be closed before
             * addIndexes() is called, then a fresh one is opened.
             */
            if (RAMWriter.ramSizeInBytes() > 512 * 1024) {
                RAMWriter.close();
                FSDWriter.addIndexes(new Directory[] { ramDirectory });
                RAMWriter = new IndexWriter(ramDirectory,
                        luceneAnalyzer, true);
            }
            logger.info("Indexed link : " + url);
        } catch (CorruptIndexException cie) {
            logger.error("CorruptIndexException : " + cie);
        } catch (IOException ie) {
            logger.error("IOException : " + ie);
        }
    }
}
/**
 * Converts a URL into a path relative to the local copy of the site.
 * <p>
 * The base URL itself maps to "index.html". A link below the base URL maps
 * to the part after the base URL and its following character, with
 * "/index.html" appended when that part contains no dot. Any other link is
 * returned unchanged. When current is also longer than the base URL, one
 * "../" is prepended for every '/' in its part after the base URL.
 *
 * @param link    the link to convert
 * @param current the URL of the page containing the link; may be null
 * @return the local form of the link
 */
protected String makeLocalLink(String link, String current) {
    final String base = getBaseURL();
    final int baseLength = base.length();

    String target;
    if (link.equals(base)) {
        target = "index.html";
    } else if (link.startsWith(base) && link.length() > baseLength) {
        target = link.substring(baseLength + 1);
        if (target.indexOf(".") == -1) {
            target = target + "/index.html";
        }
    } else {
        target = link;
    }

    boolean climb = current != null && link.startsWith(base)
            && current.length() > baseLength;
    if (!climb) {
        return target;
    }

    /* One "../" per directory level of the containing page */
    String relative = current.substring(baseLength + 1);
    StringBuffer path = new StringBuffer();
    for (int slash = relative.indexOf('/', 0); slash != -1; slash = relative
            .indexOf('/', slash + 1)) {
        path.append("../");
    }
    return path.append(target).toString();
}
/**
 * Saves a page below indexDir, mirroring the directory structure of the site.
 * <p>
 * The target path is derived from the URL with makeLocalLink(String, String).
 * When the parent of the target exists but is a regular file, a sibling
 * directory named "&lt;parent&gt;.content" is used instead. A target that
 * would resolve outside indexDir is refused, because the URL comes from a
 * crawled page.
 *
 * @param url  the URL of the page
 * @param list the parsed nodes of the page, written out via toHtml()
 */
protected void copy(String url, NodeList list) {
    File file = new File(indexDir, makeLocalLink(url, ""));
    try {
        String root = new File(indexDir).getCanonicalPath();
        String prefix = root.endsWith(File.separator) ? root : root
                + File.separator;
        if (!file.getCanonicalPath().startsWith(prefix)) {
            logger.error("Refusing to save outside of " + root + " : "
                    + url);
            return;
        }
    } catch (IOException ie) {
        logger.error("IOException : " + ie);
        return;
    }

    File dir = file.getParentFile();
    if (!dir.exists()) {
        dir.mkdirs();
    } else if (!dir.isDirectory()) {
        dir = new File(dir.getParentFile(), dir.getName() + ".content");
        if (!dir.exists()) {
            dir.mkdirs();
        }
        file = new File(dir, file.getName());
    }

    FileOutputStream stream = null;
    PrintWriter out = null;
    try {
        stream = new FileOutputStream(file);
        out = new PrintWriter(new OutputStreamWriter(stream, charset));
        for (int i = 0; i < list.size(); i++) {
            out.print(list.elementAt(i).toHtml());
        }
        /* PrintWriter swallows IOExceptions; checkError() flushes and reports them */
        if (out.checkError()) {
            logger.error("Write error while saving : " + url);
        } else {
            logger.info("Captured link : " + url);
        }
    } catch (FileNotFoundException fnfe) {
        logger.error("FileNotFoundException : " + fnfe);
    } catch (UnsupportedEncodingException uee) {
        logger.error("UnsupportedEncodingException : " + uee);
    } finally {
        if (out != null) {
            out.close();
        } else if (stream != null) {
            /* The writer was never created (e.g. unsupported charset); close the raw stream */
            try {
                stream.close();
            } catch (IOException ignored) {
                /* nothing was written, nothing to report */
            }
        }
    }
}
/**
 * Link tag that normalizes its HREF and queues it for crawling.
 * <p>
 * The fragment ("#...") is removed first and a trailing "/" afterwards, so
 * that "dir/#top", "dir/" and "dir" all map to the same queue entry. The
 * normalized link is added to the URL queue unless it is empty, already
 * queued or already dequeued, and is written back to the tag.
 */
class LocalLinkTag extends LinkTag {
    public void doSemanticAction() {
        String link = getLink();
        if (link == null) {
            return;
        }
        int pos = link.indexOf("#");
        if (pos != -1) {
            link = link.substring(0, pos);
        }
        if (link.endsWith("/")) {
            link = link.substring(0, link.length() - 1);
        }
        /* Add the link to the processing queue */
        if (link.length() > 0
                && !(indexedURLs.contains(link) || URLs.contains(link))) {
            URLs.add(link);
        }
        setLink(link);
    }
}
/**
 * Frame tag that normalizes its SRC location and queues it for crawling.
 * <p>
 * The fragment ("#...") is removed first and a trailing "/" afterwards, so
 * that "dir/#top", "dir/" and "dir" all map to the same queue entry. The
 * normalized location is added to the URL queue unless it is empty, already
 * queued or already dequeued, and is written back to the tag.
 */
class LocalFrameTag extends FrameTag {
    public void doSemanticAction() {
        String link = getFrameLocation();
        if (link == null) {
            return;
        }
        int pos = link.indexOf("#");
        if (pos != -1) {
            link = link.substring(0, pos);
        }
        if (link.endsWith("/")) {
            link = link.substring(0, link.length() - 1);
        }
        /* Add the location to the processing queue */
        if (link.length() > 0
                && !(indexedURLs.contains(link) || URLs.contains(link))) {
            URLs.add(link);
        }
        setFrameLocation(link);
    }
}
/**
 * Base tag that is left out of the generated HTML: toHtml() always yields
 * the empty string, which switches the base reference off in saved pages.
 */
class LocalBaseHrefTag extends BaseHrefTag {
    public String toHtml() {
        final String nothing = "";
        return nothing;
    }
}
/**
 * Command line entry point.
 * <p>
 * Options: -u &lt;start url&gt;, -d &lt;index dir&gt;, -t &lt;threads&gt;,
 * -r (enable the repeated-URL check), -c (save pages to disk),
 * -i (turn indexing off). The usage text is printed when fewer than six
 * arguments are given, when an option is missing its value, or when the
 * start URL or index directory ends up unset.
 *
 * @param args the command line arguments
 * @throws IllegalArgumentException if the thread count is not a number or
 *         is smaller than 1
 */
public static void main(String[] args) {
    final String usage = "Usage: -u <start url> -d <index dir> -t <threads> [-r] [-c] [-i]";
    SiteCapturer worker = new SiteCapturer();
    if (args.length < 6) {
        System.out.println(usage);
        return;
    }
    for (int i = 0; i < args.length; i++) {
        String option = args[i];
        boolean takesValue = option.equals("-u") || option.equals("-d")
                || option.equals("-t");
        if (takesValue) {
            /* The value must follow the option */
            if (i + 1 >= args.length) {
                System.out.println(usage);
                return;
            }
            String value = args[++i];
            if (option.equals("-u")) {
                worker.setBaseURL(value);
            } else if (option.equals("-d")) {
                worker.setIndexDir(value);
            } else {
                try {
                    worker.setThreads(Integer.parseInt(value));
                } catch (NumberFormatException nfe) {
                    throw new IllegalArgumentException(
                            "Invalid number of threads: " + value, nfe);
                }
            }
        } else if (option.equals("-r")) {
            worker.setIsRepeatedCheck(true);
        } else if (option.equals("-c")) {
            worker.setJustCopy(true);
        } else if (option.equals("-i")) {
            worker.setJustIndex(false);
        }
    }
    if (worker.getThreads() < 1) {
        throw new IllegalArgumentException("Invalid number of threads: "
                + worker.getThreads());
    }
    /* Six arguments do not guarantee that -u and -d were among them (or that the URL was valid) */
    if (worker.baseURL == null || worker.indexDir == null) {
        System.out.println(usage);
        return;
    }
    worker.capture();
    System.exit(0);
}
/**
 * Returns the base URL in its external string form.
 *
 * @return the string form of the base URL
 */
public String getBaseURL() {
    return this.baseURL.toExternalForm();
}
/**
 * Sets the base (start) URL of the crawl. A single trailing "/" is removed
 * before the URL is parsed. When the text is null or not a valid URL an
 * error naming the rejected text is logged and the previous base URL, if
 * any, is kept.
 *
 * @param source the start URL as text
 */
public void setBaseURL(String source) {
    if (source == null) {
        logger.error("Invalid URL : " + source);
        return;
    }
    if (source.endsWith("/")) {
        source = source.substring(0, source.length() - 1);
    }
    try {
        baseURL = new URL(source);
    } catch (MalformedURLException e) {
        /* Report the rejected input; baseURL may still be null at this point */
        logger.error("Invalid URL : " + source);
    }
}
/**
 * Sets the directory that receives the index files and the saved pages.
 *
 * @param indexDirectory path of the directory
 */
public void setIndexDir(String indexDirectory) {
    this.indexDir = indexDirectory;
}
/**
 * Returns the current value of the worker counter. While a crawl is
 * running the workers modify this value.
 *
 * @return the value of the threads field
 */
public int getThreads() {
    return this.threads;
}
/**
 * Sets the number of worker threads to start.
 *
 * @param threadCount number of threads
 */
public void setThreads(int threadCount) {
    this.threads = threadCount;
}
/**
 * Enables or disables the lookup of each URL in the index before it is
 * parsed and indexed.
 *
 * @param check true to skip URLs already present in the index
 */
public void setIsRepeatedCheck(boolean check) {
    this.isRepeatedCheck = check;
}
/**
 * Enables or disables indexing of the crawled pages.
 *
 * @param justIndex true to index pages
 */
public void setJustIndex(boolean justIndex) {
    this.justIndex = justIndex;
}
/**
 * Enables or disables saving of the crawled pages to disk.
 *
 * @param justCopy true to save pages
 */
public void setJustCopy(boolean justCopy) {
    this.justCopy = justCopy;
}
}
工具类ParserUtils代码如下:
package com.huizhi.kanine.util;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.UnknownHostException;
import java.nio.charset.Charset;
import org.htmlparser.Parser;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;
import cpdetector.io.ASCIIDetector;
import cpdetector.io.CodepageDetectorProxy;
import cpdetector.io.JChardetFacade;
import cpdetector.io.ParsingDetector;
import cpdetector.io.UnicodeDetector;
/**
 * Helpers for detecting the charset of a page and for extracting its text
 * and title.
 */
public class ParserUtils {
    /* Initial capacity of the buffer that collects the downloaded page */
    public static int TRANSFER_SIZE = 4096;
    /* Line separator of the current platform */
    public static String lineSep = System.getProperty("line.separator");

    /**
     * Detects the character encoding of the page at the given URL, to avoid
     * garbled Chinese text.
     *
     * @param url the page to inspect
     * @return the name of the detected charset, or the name of the platform
     *         default charset when detection fails or finds nothing
     */
    public static String autoDetectCharset(URL url) {
        CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
        /*
         * ParsingDetector can check the encoding of HTML, XML and similar
         * files or character streams. Its constructor argument tells
         * whether details of the detection process are displayed; false
         * means they are not.
         */
        detector.add(new ParsingDetector(false));
        detector.add(JChardetFacade.getInstance());
        detector.add(ASCIIDetector.getInstance());
        detector.add(UnicodeDetector.getInstance());
        Charset charset = null;
        try {
            charset = detector.detectCodepage(url);
        } catch (MalformedURLException mue) {
            mue.printStackTrace();
        } catch (IOException ie) {
            ie.printStackTrace();
        }
        if (charset == null) {
            charset = Charset.defaultCharset();
        }
        return charset.name();
    }

    /**
     * Downloads a standard HTML page with the given encoding and parses it,
     * in preparation for indexing.
     *
     * @param url     the absolute URL of the page
     * @param charset the charset name used to decode the page; the platform
     *                default is used when null
     * @return a two-element array { body text, title }, where the body text
     *         may be null and the title defaults to "Untitled"; or null
     *         when the page could not be downloaded or parsed
     */
    public static String[] parseHtml(String url, String charset) {
        if (charset == null) {
            charset = Charset.defaultCharset().name();
        }
        String content = readPage(url, charset);
        if (content == null) {
            return null;
        }
        String result[] = null;
        Parser myParser = Parser.createParser(content, charset);
        HtmlPage visitor = new HtmlPage(myParser);
        try {
            myParser.visitAllNodesWith(visitor);
            String body = null;
            String title = "Untitled";
            if (visitor.getBody() != null) {
                NodeList nodelist = visitor.getBody();
                body = nodelist.asString().trim();
            }
            if (visitor.getTitle() != null) {
                title = visitor.getTitle();
            }
            result = new String[] { body, title };
        } catch (ParserException pe) {
            pe.printStackTrace();
        }
        return result;
    }

    /*
     * Reads the whole page into a string, line by line, using the platform
     * line separator. Returns null after reporting the problem on stderr
     * when the page cannot be read. The streams are closed in all cases.
     */
    private static String readPage(String url, String charset) {
        InputStream in = null;
        BufferedReader reader = null;
        try {
            in = new URL(url).openStream();
            reader = new BufferedReader(new InputStreamReader(in, charset));
            StringBuffer temp = new StringBuffer(TRANSFER_SIZE);
            String line;
            while ((line = reader.readLine()) != null) {
                temp.append(line);
                temp.append(lineSep);
            }
            return temp.toString();
        } catch (UnsupportedEncodingException uee) {
            uee.printStackTrace();
        } catch (MalformedURLException mue) {
            System.err.println("Invalid URL : " + url);
        } catch (UnknownHostException uhe) {
            System.err.println("UnknowHost : " + url);
        } catch (SocketException se) {
            System.err.println("Socket Error : " + se.getMessage() + " " + url);
        } catch (SocketTimeoutException ste) {
            System.err.println("Socket Connection Time Out : " + url);
        } catch (FileNotFoundException fnfe) {
            /* The exception normally has no cause, so its own message is reported */
            System.err.println("broken link " + fnfe.getMessage()
                    + " ignored");
        } catch (IOException ie) {
            ie.printStackTrace();
        } finally {
            try {
                if (reader != null) {
                    /* Closing the reader closes the underlying stream too */
                    reader.close();
                } else if (in != null) {
                    in.close();
                }
            } catch (IOException ignored) {
                /* the page content (if any) has already been read */
            }
        }
        return null;
    }
}
程序运行可选择控制台或新建一JSP页面,加入以下代码即可
(另,示例代码中log4j的配置文件须放在项目所在磁盘的根目录下;可在capture()
方法的PropertyConfigurator.configure("/log4j.properties")处自由修改)
<%@ page contentType="text/html; charset=UTF-8"%>
<%@ page import="com.huizhi.kanine.util.*"%>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>Lucene</title>
</head>
<body>
<%
// Example: crawl a site from a JSP page with 20 worker threads
SiteCapturer worker= new SiteCapturer();
worker.setBaseURL("http://www.blabla.cn");
worker.setIndexDir("c:\\luceneIndex");
//worker.setIsRepeatedCheck(true);// optional: check whether a link is already present in the index
//worker.setJustCopy(true);// optional: save the pages to the local disk
worker.setThreads(20);
worker.capture();
%>
</body>
</html>