The previous posts covered Lucene indexing and search, as well as Chinese word segmentation; below, a small example simulates a search engine.

mmseg4j: Chinese word segmentation
htmlparser: HTML parsing

Maven pom:
<properties>
    <lucene.version>5.2.1</lucene.version>
</properties>
<dependencies>
    <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-core</artifactId>
        <version>${lucene.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-analyzers-common</artifactId>
        <version>${lucene.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-queryparser</artifactId>
        <version>${lucene.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-highlighter</artifactId>
        <version>${lucene.version}</version>
    </dependency>
    <dependency>
        <groupId>com.chenlb.mmseg4j</groupId>
        <artifactId>mmseg4j-core</artifactId>
        <version>1.10.0</version>
    </dependency>
    <dependency>
        <groupId>org.htmlparser</groupId>
        <artifactId>htmlparser</artifactId>
        <version>2.1</version>
        <exclusions>
            <exclusion>
                <groupId>com.sun</groupId>
                <artifactId>tools</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <!-- You can build the latest mmseg4j-analysis yourself and swap it in; otherwise loading fails -->
    <dependency>
        <groupId>com.chenlb.mmseg4j</groupId>
        <artifactId>mmseg4j-solr</artifactId>
        <version>2.3.0</version>
        <exclusions>
            <exclusion>
                <groupId>org.apache.solr</groupId>
                <artifactId>solr-core</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
</dependencies>

Crawl the sites and build the index:
package cn.slimsmart.lucene.mmseg4j.simple;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.store.FSDirectory;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.beans.StringBean;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;

import com.chenlb.mmseg4j.analysis.ComplexAnalyzer;

public class Indexer {

    // URLs that have already been indexed
    private static List<String> urls = new ArrayList<String>();

    /**
     * Indexer: creates an index entry for the target URL.
     *
     * @param url       target URL
     * @param indexPath index directory
     */
    private static void indexer(String url, String indexPath) throws IOException, ParserException {
        // Directory that stores the index; create it if it does not exist
        File indexDir = new File(indexPath);
        if (!indexDir.exists()) {
            indexDir.mkdirs();
        }
        // Extract the page's plain text and title
        String content = getText(url);
        String title = getTitle(url);
        System.out.println("title:" + title);
        if (title == null || content == null || content.trim().equals("")) {
            return;
        }
        Document doc = new Document();
        // The url field is stored but not indexed
        FieldType fieldType = new FieldType();
        fieldType.setStored(true);
        fieldType.setIndexOptions(IndexOptions.NONE);
        doc.add(new Field("url", url, fieldType));
        // The title field is indexed as a single token (not analyzed)
        doc.add(new StringField("title", title, Field.Store.YES));
        // The content field is tokenized before indexing
        doc.add(new TextField("content", content, Field.Store.YES));

        Analyzer analyzer = new ComplexAnalyzer();
        IndexWriterConfig iwConfig = new IndexWriterConfig(analyzer);
        iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
        IndexWriter writer = new IndexWriter(FSDirectory.open(indexDir.toPath()), iwConfig);
        // Write the document and close the writer
        writer.addDocument(doc);
        writer.close();
        // Remember the URL so it is not indexed twice
        urls.add(url);
    }

    /**
     * Extracts the plain text of a web page.
     *
     * @param url target URL
     * @return the page's plain text
     */
    private static String getText(String url) throws ParserException {
        StringBean sb = new StringBean();
        // Do not collect the links contained in the page
        sb.setLinks(false);
        // Replace non-breaking spaces with regular spaces
        sb.setReplaceNonBreakingSpaces(true);
        // Collapse runs of whitespace into a single space
        sb.setCollapse(true);
        // Parse the URL and return the extracted plain text
        sb.setURL(url);
        return sb.getStrings();
    }

    /**
     * Extracts the title of a web page.
     *
     * @param path target URL
     * @return the page title, or "no title" if it cannot be parsed
     */
    private static String getTitle(String path) throws IOException, ParserException {
        String title = "";
        try {
            Parser parser = new Parser(path);
            HtmlPage page = new HtmlPage(parser);
            parser.visitAllNodesWith(page);
            title = page.getTitle();
        } catch (Exception e) {
            title = "no title";
        }
        return title.trim();
    }

    /**
     * Collects all links contained in a web page.
     *
     * @param url target URL
     * @return all link targets found on the page
     */
    private static List<String> getLinks(String url) throws ParserException {
        List<String> links = new ArrayList<String>();
        // Filter that matches link nodes
        NodeFilter filter = new NodeClassFilter(LinkTag.class);
        Parser parser = new Parser();
        parser.setURL(url);
        // The page encoding should really be detected (e.g. with cpdetector);
        // here we simply default to UTF-8
        parser.setEncoding("UTF-8");
        NodeList list = parser.extractAllNodesThatMatch(filter);
        for (int i = 0; i < list.size(); i++) {
            LinkTag node = (LinkTag) list.elementAt(i);
            // Target address of the link
            String link = node.extractLink();
            if (link != null && !link.trim().equals("") && !link.equals("#")) {
                links.add(link);
            }
        }
        return links;
    }

    /**
     * Crawls a site: indexes every link found on the given page.
     *
     * @param url       home page URL (a sitemap URL also works)
     * @param indexPath index directory
     */
    public static void addSite(String url, String indexPath) throws ParserException, IOException, ParseException {
        long start = System.currentTimeMillis();
        System.out.println("start add...");
        // Collect all links of the target page
        List<String> links = getLinks(url);
        System.out.println("url count:" + links.size());
        for (int i = 0; i < links.size(); i++) {
            String link = links.get(i);
            System.out.println((i + 1) + "." + link);
            if (!urls.contains(link)) {
                // Only index pages that have not been indexed yet
                indexer(link, indexPath);
            } else {
                System.out.println("[" + link + "] exist");
            }
        }
        System.out.println("end...");
        long end = System.currentTimeMillis();
        System.out.println("cost " + (end - start) / 1000 + " seconds");
    }
}
Search query:
package cn.slimsmart.lucene.mmseg4j.simple;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import com.chenlb.mmseg4j.analysis.ComplexAnalyzer;

public class Searcher {

    /**
     * Searcher: runs a query for the given text.
     *
     * @param words     query text
     * @param indexPath index directory
     */
    public static void searcher(String words, String indexPath) throws CorruptIndexException, IOException, ParseException {
        // Open the index directory for reading
        File indexDir = new File(indexPath);
        Directory dir = FSDirectory.open(indexDir.toPath());
        IndexReader reader = DirectoryReader.open(dir);
        IndexSearcher searcher = new IndexSearcher(reader);
        // Chinese segmentation
        Analyzer analyzer = new ComplexAnalyzer();
        // Query parser that searches both the title and the content field
        QueryParser parser = new MultiFieldQueryParser(new String[] { "title", "content" }, analyzer);
        parser.setDefaultOperator(QueryParser.AND_OPERATOR);
        // Build the query from the input text
        Query query = parser.parse(words);
        System.out.println("Searching for: " + query.toString());
        // Collect the top results ranked by score
        TopScoreDocCollector collector = TopScoreDocCollector.create(5 * 10);
        searcher.search(query, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;
        int numTotalHits = collector.getTotalHits();
        System.out.println(numTotalHits + " matching pages in total");
        // Print the results
        for (int i = 0; i < hits.length; i++) {
            Document doc = searcher.doc(hits[i].doc);
            String url = doc.get("url");
            String title = doc.get("title");
            String content = doc.get("content");
            System.out.println((i + 1) + "." + title);
            System.out.println("-----------------------------------");
            // Print at most the first 100 characters of the content
            System.out.println(content.substring(0, Math.min(100, content.length())) + "......");
            System.out.println("-----------------------------------");
            System.out.println(url);
            System.out.println();
        }
    }
}

Test:
package cn.slimsmart.lucene.mmseg4j.simple;

/**
 * Test
 *
 * @author slimina
 * 2015-06-23
 */
public class Test {

    public static String indexPath = "src/main/resources/index";

    public static void main(String[] args) throws Exception {
        // Crawl the sites and build the index
        Indexer.addSite("http://www.baidu.com/", indexPath);
        Indexer.addSite("http://www.csdn.net/", indexPath);
        Indexer.addSite("http://www.oschina.net/", indexPath);
        // Search
        Searcher.searcher("java", indexPath);
    }
}
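The pom declares lucene-highlighter, but the example above never uses it. As a sketch of how the raw substring snippet in Searcher could be upgraded to a keyword-highlighted fragment (the helper class HighlightUtil and the <b> tags are illustrative choices, not part of the original code):

package cn.slimsmart.lucene.mmseg4j.simple;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

public class HighlightUtil {

    /**
     * Returns the best ~100-character fragment of the content with the
     * query terms wrapped in <b> tags, or null if nothing matched.
     */
    public static String highlight(Query query, Analyzer analyzer, String content)
            throws IOException, InvalidTokenOffsetsException {
        // Wrap matched terms in <b>...</b>
        SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<b>", "</b>");
        Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
        // Cut the text into fragments of roughly 100 characters
        highlighter.setTextFragmenter(new SimpleFragmenter(100));
        return highlighter.getBestFragment(analyzer, "content", content);
    }
}

In Searcher's result loop, the content.substring(...) call could then be replaced by HighlightUtil.highlight(query, analyzer, content).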
When parsing a site you need to specify its encoding; the example simply defaults to UTF-8. You can determine the site's encoding yourself, or detect it with cpdetector (see http://liulijun-cn-2011.iteye.com/blog/1629477); otherwise the extracted text will be garbled.
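For reference, a sketch of what that detection could look like with cpdetector (the detector combination below follows its commonly documented usage; treat the exact classes and fallback behavior as assumptions):

package cn.slimsmart.lucene.mmseg4j.simple;

import java.io.IOException;
import java.net.URL;
import java.nio.charset.Charset;

import info.monitorenter.cpdetector.io.ASCIIDetector;
import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
import info.monitorenter.cpdetector.io.JChardetFacade;
import info.monitorenter.cpdetector.io.ParsingDetector;
import info.monitorenter.cpdetector.io.UnicodeDetector;

public class EncodingDetector {

    /**
     * Detects the charset of a page; falls back to UTF-8 if detection fails.
     */
    public static String detect(String url) {
        CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
        // Try the HTML meta / XML declaration first, then statistical detection
        detector.add(new ParsingDetector(false));
        detector.add(JChardetFacade.getInstance());
        detector.add(ASCIIDetector.getInstance());
        detector.add(UnicodeDetector.getInstance());
        try {
            Charset charset = detector.detectCodepage(new URL(url));
            if (charset != null) {
                return charset.name();
            }
        } catch (IOException e) {
            // fall through to the default
        }
        return "UTF-8";
    }
}

The result could then be passed to parser.setEncoding(...) in Indexer.getLinks instead of the hard-coded UTF-8.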