I. Environment
1. Platform: MyEclipse 8.5
2. Frameworks: Lucene 2.9.4 / htmlparser
II. Development and Debugging
1. The full source is below. It carries plenty of comments and should be self-explanatory.
package org.cyxl.lucene.test;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.beans.StringBean;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;

public class ParseURL {
    // Index directory
    private static final String INDEX_DIR = "myindex";
    // URLs that have already been indexed
    private static List<String> urls = new ArrayList<String>();

    /**
     * Indexer: builds an index entry for the target URL.
     * @param url the target URL
     */
    @SuppressWarnings("deprecation")
    private static void indexer(String url) throws IOException, ParserException {
        // Whether to create a fresh index or append to an existing one
        boolean create = false;
        // Directory that stores the index
        File indexDir = new File(INDEX_DIR);
        // Create the directory if it does not exist
        if (!indexDir.exists()) {
            indexDir.mkdir();
        }
        // An empty directory means a fresh index must be created
        if (indexDir.list().length <= 0) {
            create = true;
        }

        // Extract the page's plain text and title
        String content = getText(url);
        String title = getTitle(url);
        System.out.println("title:" + title);
        if (title == null || content == null || content.trim().equals("")) {
            return;
        }

        Document doc = new Document();
        // URL field: stored, not tokenized
        doc.add(new Field("url", url, Field.Store.YES, Field.Index.UN_TOKENIZED));
        // Title field: stored, analyzed (tokenized before indexing)
        doc.add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED));
        // Content field: stored, analyzed
        doc.add(new Field("content", content, Field.Store.YES, Field.Index.ANALYZED));

        // Open an index writer on the index directory, honoring the create flag
        IndexWriter writer = new IndexWriter(FSDirectory.open(indexDir),
                new StandardAnalyzer(Version.LUCENE_CURRENT), create,
                IndexWriter.MaxFieldLength.LIMITED);
        writer.addDocument(doc);
        writer.optimize();
        writer.close();

        // Remember this URL so it is not indexed twice
        urls.add(url);
    }

    /**
     * Searcher: runs the given query text against the index.
     * @param words the query text
     */
    @SuppressWarnings("deprecation")
    private static void searcher(String words) throws CorruptIndexException, IOException, ParseException {
        File indexDir = new File(INDEX_DIR);
        // Open a read-only reader on the index directory
        IndexReader reader = IndexReader.open(FSDirectory.open(indexDir), true);
        Searcher searcher = new IndexSearcher(reader);
        // Analyzer used to tokenize the query
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        // Field to search
        String field = "content";
        // Build a query parser for that field
        QueryParser parser = new QueryParser(field, analyzer);
        Query query = parser.parse(words);
        System.out.println("Searching for: " + query.toString(field));

        // Collect the top 50 hits, ranked by score
        TopScoreDocCollector collector = TopScoreDocCollector.create(5 * 10, false);
        searcher.search(query, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;
        int numTotalHits = collector.getTotalHits();
        System.out.println(numTotalHits + " total matching documents");

        // Print the results
        for (int i = 0; i < hits.length; i++) {
            Document doc = searcher.doc(hits[i].doc);
            String url = doc.get("url");
            String title = doc.get("title");
            String content = doc.get("content");
            System.out.println((i + 1) + "." + title);
            System.out.println("-----------------------------------");
            // Guard against pages whose text is shorter than 100 characters
            System.out.println(content.substring(0, Math.min(100, content.length())) + "......");
            System.out.println("-----------------------------------");
            System.out.println(url);
            System.out.println();
        }
    }

    /**
     * Crawl a site: index every page linked from the given URL.
     * @param url the site's home page (a sitemap URL also works)
     */
    private static void addSite(String url) throws ParserException, IOException, ParseException {
        long start = System.currentTimeMillis();
        System.out.println("start add...");
        // Collect every link on the target page
        List<String> links = getLinks(url);
        System.out.println("url count:" + links.size());
        for (int i = 0; i < links.size(); i++) {
            String link = links.get(i);
            System.out.println((i + 1) + "." + link);
            if (!urls.contains(link)) {
                // Only index pages that have not been indexed yet
                indexer(link);
            } else {
                System.out.println("[" + link + "] exist");
            }
        }
        System.out.println("end...");
        long end = System.currentTimeMillis();
        System.out.println("cost " + (end - start) / 1000 + " seconds");
    }

    /**
     * Extract the plain text of a page.
     * @param url the target URL
     */
    private static String getText(String url) throws ParserException {
        StringBean sb = new StringBean();
        // Do not include link URLs in the extracted text
        sb.setLinks(false);
        // Replace non-breaking spaces with regular spaces
        sb.setReplaceNonBreakingSpaces(true);
        // Collapse runs of whitespace into a single space
        sb.setCollapse(true);
        // Set the URL to parse
        sb.setURL(url);
        // Return the page's plain text
        return sb.getStrings();
    }

    /**
     * Extract the title of a page.
     * @param path the target URL
     */
    private static String getTitle(String path) throws IOException, ParserException {
        String title = "";
        try {
            Parser parser = new Parser(path);
            HtmlPage page = new HtmlPage(parser);
            parser.visitAllNodesWith(page);
            title = page.getTitle();
        } catch (Exception e) {
            title = "no title";
        }
        // Guard against pages that have no <title> element at all
        return title == null ? "no title" : title.trim();
    }

    /**
     * Collect all links on a page.
     * @param url the target URL
     */
    private static List<String> getLinks(String url) throws ParserException {
        List<String> links = new ArrayList<String>();
        // Filter that matches link nodes only
        NodeFilter filter = new NodeClassFilter(LinkTag.class);
        Parser parser = new Parser();
        parser.setURL(url);
        // The target page's encoding is often unknown, so instead of
        // hard-coding one (e.g. parser.setEncoding("utf-8")) we probe a
        // set of candidate encodings until one fits
        parser.setEncoding(CharsetAutoSwitch.dectedEncode(url));
        NodeList list = parser.extractAllNodesThatMatch(filter);
        for (int i = 0; i < list.size(); i++) {
            LinkTag node = (LinkTag) list.elementAt(i);
            // Target address of the link
            String link = node.extractLink();
            if (link != null && !link.trim().equals("")) {
                links.add(link);
            }
        }
        return links;
    }

    public static void main(String[] args) throws IOException, ParseException, InterruptedException, ParserException {
        String url = "http://struts.apache.org/";
        // Crawl and index the site
        addSite(url);
        // Search for pages containing the word "lucene"
        searcher("lucene");
    }
}
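One thing worth noting in indexer(): it opens a new IndexWriter, calls optimize(), and closes it again for every single page. That works, but it gets expensive once a crawl grows beyond a handful of URLs. Below is a minimal sketch of a batch variant that shares one writer across the whole crawl; batchIndex is a hypothetical helper, not part of the original code, and it reuses only the APIs already shown above.

// Sketch (hypothetical helper): index many pages with one shared
// IndexWriter, optimizing once at the end instead of once per page.
private static void batchIndex(List<String> links) throws IOException, ParserException {
    File indexDir = new File(INDEX_DIR);
    if (!indexDir.exists()) {
        indexDir.mkdir();
    }
    boolean create = indexDir.list().length <= 0;
    IndexWriter writer = new IndexWriter(FSDirectory.open(indexDir),
            new StandardAnalyzer(Version.LUCENE_CURRENT), create,
            IndexWriter.MaxFieldLength.LIMITED);
    try {
        for (String link : links) {
            if (urls.contains(link)) {
                continue;  // skip pages that are already indexed
            }
            String content = getText(link);
            String title = getTitle(link);
            if (title == null || content == null || content.trim().equals("")) {
                continue;
            }
            Document doc = new Document();
            doc.add(new Field("url", link, Field.Store.YES, Field.Index.UN_TOKENIZED));
            doc.add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED));
            doc.add(new Field("content", content, Field.Store.YES, Field.Index.ANALYZED));
            writer.addDocument(doc);
            urls.add(link);
        }
        writer.optimize();  // one merge for the whole batch
    } finally {
        writer.close();
    }
}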
2. The crawler also needs to know each page's character encoding. The following helper, found online, detects it by trial and error:
package org.cyxl.lucene.test;

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.tags.BodyTag;
import org.htmlparser.tags.Html;
import org.htmlparser.util.NodeIterator;

/**
 * Automatically matches a page's character encoding. Found online;
 * its accuracy still needs long-term testing.
 */
public class CharsetAutoSwitch {
    // Candidate encodings; extend this list as needed. The prober tries
    // each one in turn until it finds an encoding that works.
    private static final String oriEncode = "utf-8,gb2312,gbk,iso-8859-1";

    /**
     * Detect the character set of the page at the given URL.
     * @return the page's actual encoding, or null if none matched
     */
    public static String dectedEncode(String url) {
        String[] encodes = oriEncode.split(",");
        for (int i = 0; i < encodes.length; i++) {
            if (dectedCode(url, encodes[i])) {
                String code = encodes[i];
                System.out.println("code:" + code);
                return code;
            }
        }
        return null;
    }

    /**
     * Encoding prober: tries utf-8, gb2312, gbk, iso-8859-1, ... until
     * the page parses successfully with one of them.
     */
    public static boolean dectedCode(String url, String encode) {
        try {
            Parser parser = new Parser(url);
            parser.setEncoding(encode);
            for (NodeIterator e = parser.elements(); e.hasMoreNodes();) {
                Node node = (Node) e.nextNode();
                // The candidate is accepted once an <html> or <body> tag is reached
                if (node instanceof Html || node instanceof BodyTag) {
                    return true;
                }
            }
        } catch (Exception e) {
            // A failed parse means this candidate encoding did not fit
        }
        return false;
    }
}
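The probe above is a brute-force fallback. Many servers already announce the page encoding in the Content-Type response header, so a cheaper first attempt is to read it from there and only fall back to dectedEncode when the header is silent. A minimal sketch using only standard java.net calls; headerEncode is a hypothetical helper that could be added to CharsetAutoSwitch:

import java.net.URL;
import java.net.URLConnection;

// Sketch (hypothetical helper): read the charset from the HTTP
// Content-Type header, e.g. "text/html; charset=utf-8".
// Returns null when the header does not specify one.
public static String headerEncode(String url) {
    try {
        URLConnection conn = new URL(url).openConnection();
        conn.setConnectTimeout(5000);
        String contentType = conn.getContentType();
        if (contentType != null) {
            for (String part : contentType.split(";")) {
                part = part.trim();
                if (part.toLowerCase().startsWith("charset=")) {
                    return part.substring("charset=".length()).trim();
                }
            }
        }
    } catch (Exception e) {
        // Fall through: the caller should fall back to dectedEncode(url)
    }
    return null;
}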
3. Test results
1) A directory named myindex is created under the project root, containing the generated index files.
2) The search output:
Searching for: lucene
7 total matching documents
1.Home - Confluence
-----------------------------------
Home - Confluence Dashboard > Bookstore > Home Page Operations View Info Browse Space Pages L......
-----------------------------------
http://www.ApacheBookstore.com/

2.Welcome to The Apache Software Foundation!
-----------------------------------
Welcome to The Apache Software Foundation! The Apache Software Foundation Community-led developmen......
-----------------------------------
http://www.apache.org/

3.Apache Struts Stats
-----------------------------------
Apache Struts Stats apache > cocoon Home Stats About Index Changes FAQ Apache Stats Index ......
-----------------------------------
http://people.apache.org/~vgritsenko/stats/projects/struts

4.Thanks
-----------------------------------
Thanks The Apache Software Foundation Thanks Foundation Projects People Get Involved Download......
-----------------------------------
http://apache.org/foundation/thanks.html

5.Apache Tapestry Home Page
-----------------------------------
Apache Tapestry Home Page Home Getting Started Documentation Download About Community Apache ......
-----------------------------------
http://tapestry.apache.org/

6.Licenses
-----------------------------------
Licenses The Apache Software Foundation Licenses Foundation Projects People Get Involved Down......
-----------------------------------
http://www.apache.org/licenses/

7.Sponsorship
-----------------------------------
Sponsorship The Apache Software Foundation Sponsorship Foundation Projects People Get Involved......
-----------------------------------
http://apache.org/foundation/sponsorship.html
III. Summary
1. The program implements three basic components: a web crawler, an indexer, and a searcher.
2. The sites indexed should preferably be in English, because the analyzer used here does not support Chinese word segmentation. Chinese segmentation is therefore the next topic to study; one possible direction is sketched below.
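As a pointer for that follow-up work: Lucene's contrib analyzers include a CJKAnalyzer that indexes Chinese text as overlapping character bigrams. A minimal sketch of swapping it into ParseURL, assuming the lucene-analyzers contrib jar for 2.9.x is on the classpath (the exact constructor may differ between releases):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.queryParser.QueryParser;

// Sketch: the only change needed in ParseURL is the analyzer. Index time
// and query time must use the same analyzer, so swap it in both places.
Analyzer analyzer = new CJKAnalyzer();  // bigram-based CJK tokenization

// in indexer():
IndexWriter writer = new IndexWriter(FSDirectory.open(indexDir),
        analyzer, create, IndexWriter.MaxFieldLength.LIMITED);

// in searcher():
QueryParser parser = new QueryParser("content", analyzer);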