Simulating a Simple Search Engine with Lucene

The previous posts covered Lucene indexing and search, as well as Chinese word segmentation. Below, a small example ties them together to simulate a search engine.

mmseg4j: Chinese word segmentation

htmlparser: HTML parsing

Maven pom:

<properties>
	<lucene.version>5.2.1</lucene.version>
</properties>

<dependencies>
	<dependency>
		<groupId>org.apache.lucene</groupId>
		<artifactId>lucene-core</artifactId>
		<version>${lucene.version}</version>
	</dependency>
	<dependency>
		<groupId>org.apache.lucene</groupId>
		<artifactId>lucene-analyzers-common</artifactId>
		<version>${lucene.version}</version>
	</dependency>
	<dependency>
		<groupId>org.apache.lucene</groupId>
		<artifactId>lucene-queryparser</artifactId>
		<version>${lucene.version}</version>
	</dependency>
	<dependency>
		<groupId>org.apache.lucene</groupId>
		<artifactId>lucene-highlighter</artifactId>
		<version>${lucene.version}</version>
	</dependency>
	<dependency>
		<groupId>com.chenlb.mmseg4j</groupId>
		<artifactId>mmseg4j-core</artifactId>
		<version>1.10.0</version>
	</dependency>
	<dependency>
		<groupId>org.htmlparser</groupId>
		<artifactId>htmlparser</artifactId>
		<version>2.1</version>
		<exclusions>
			<exclusion>
				<groupId>com.sun</groupId>
				<artifactId>tools</artifactId>
			</exclusion>
		</exclusions>
	</dependency>

	<!-- You can build the latest mmseg4j-analysis yourself and swap it in
	     for this one; otherwise loading fails with an error -->
	<dependency>
		<groupId>com.chenlb.mmseg4j</groupId>
		<artifactId>mmseg4j-solr</artifactId>
		<version>2.3.0</version>
		<exclusions>
			<exclusion>
				<groupId>org.apache.solr</groupId>
				<artifactId>solr-core</artifactId>
			</exclusion>
		</exclusions>
	</dependency>
</dependencies>
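With the dependencies in place, you can first sanity-check the mmseg4j analyzer on its own. The following is a minimal sketch (the class name AnalyzerDemo is just for illustration) that prints the tokens ComplexAnalyzer produces for a short mixed sentence, the same way the indexer below will tokenize page content:

package cn.slimsmart.lucene.mmseg4j.simple;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import com.chenlb.mmseg4j.analysis.ComplexAnalyzer;

public class AnalyzerDemo {

	public static void main(String[] args) throws IOException {
		Analyzer analyzer = new ComplexAnalyzer();
		// Tokenize a sample sentence and iterate over the resulting terms
		TokenStream ts = analyzer.tokenStream("content", "Lucene是一个全文检索工具包");
		CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
		ts.reset();
		while (ts.incrementToken()) {
			System.out.print("[" + term.toString() + "] ");
		}
		ts.end();
		ts.close();
		analyzer.close();
	}
}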
Crawl the sites and build the index:

package cn.slimsmart.lucene.mmseg4j.simple;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.store.FSDirectory;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.beans.StringBean;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;

import com.chenlb.mmseg4j.analysis.ComplexAnalyzer;

public class Indexer {

	// URLs that have already been indexed
	private static List<String> urls = new ArrayList<String>();

	/**
	 * Indexer: creates an index entry for the target URL.
	 * 
	 * @param url
	 *            target URL
	 * @param indexPath
	 *            index directory
	 */
	private static void indexer(String url, String indexPath) throws IOException, ParserException {
		// Directory where the index is stored
		File indexDir = new File(indexPath);
		// Create the directory if it does not exist
		if (!indexDir.exists()) {
			indexDir.mkdirs();
		}
		// Extract the page's plain text
		String content = getText(url);
		// Extract the page title
		String title = getTitle(url);

		System.out.println("title:" + title);

		if (title == null || content == null || content.trim().equals("")) {
			return;
		}
		Document doc = new Document();
		// Stored but not indexed: the url is only retrieved, never searched
		FieldType fieldType = new FieldType();
		fieldType.setStored(true);
		fieldType.setIndexOptions(IndexOptions.NONE);
		// Add the url field
		doc.add(new Field("url", url, fieldType));
		// Add the title field
		doc.add(new StringField("title", title, Field.Store.YES));
		// Add the content field; TextField is analyzed before indexing
		doc.add(new TextField("content", content, Field.Store.YES));
		Analyzer analyzer = new ComplexAnalyzer();
		IndexWriterConfig iwConfig = new IndexWriterConfig(analyzer);
		iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
		IndexWriter writer = new IndexWriter(FSDirectory.open(indexDir.toPath()), iwConfig);
		// Write the document
		writer.addDocument(doc);
		// Close the writer (this also commits)
		writer.close();
		// Record the URL as indexed
		urls.add(url);
	}

	/**
	 * Extracts the plain text of a web page.
	 * @param url
	 *            target URL
	 * @return the page's plain text
	 * @throws ParserException
	 */
	private static String getText(String url) throws ParserException {
		StringBean sb = new StringBean();
		// Do not include link URLs in the extracted text
		sb.setLinks(false);
		// Replace non-breaking spaces with ordinary spaces
		sb.setReplaceNonBreakingSpaces(true);
		// Collapse runs of whitespace into a single space
		sb.setCollapse(true);
		// Set the URL to parse
		sb.setURL(url);
		// Return the extracted plain text
		String content = sb.getStrings();
		return content;
	}

	/**
	 * Extracts the title of a web page.
	 * @param path the page URL
	 * @return the page title, or "no title" if it cannot be parsed
	 * @throws IOException
	 * @throws ParserException
	 */
	private static String getTitle(String path) throws IOException, ParserException {
		String title = "";
		try {
			Parser parser = new Parser(path);
			HtmlPage page = new HtmlPage(parser);
			parser.visitAllNodesWith(page);
			title = page.getTitle();
		} catch (Exception e) {
			title = "no title";
		}
		return title == null ? "no title" : title.trim();
	}

	/**
	 * Extracts all links from a web page.
	 * @param url the page URL
	 * @return the list of link URLs found on the page
	 * @throws ParserException
	 */
	private static List<String> getLinks(String url) throws ParserException {
		List<String> links = new ArrayList<String>();
		// Filter that matches link nodes
		NodeFilter filter = new NodeClassFilter(LinkTag.class);
		Parser parser = new Parser();
		parser.setURL(url);
		// Set the page encoding; it can be detected with a tool such as
		// cpdetector, but here we simply default to UTF-8
		parser.setEncoding("UTF-8");
		NodeList list = parser.extractAllNodesThatMatch(filter);
		for (int i = 0; i < list.size(); i++) {
			LinkTag node = (LinkTag) list.elementAt(i);
			// Get the link's target URL
			String link = node.extractLink();
			if (link != null && !link.trim().equals("") && !link.equals("#")) {
				// Add the target URL to the page's link list
				links.add(link);
			}
		}
		return links;
	}

	/**
	 * Crawls a site and indexes its pages.
	 * 
	 * @param url
	 *            the site's home page URL (a sitemap URL also works)
	 * @param indexPath
	 *            index directory
	 * @throws ParserException
	 * @throws IOException
	 * @throws ParseException
	 */
	public static void addSite(String url, String indexPath) throws ParserException, IOException, ParseException {
		long start = System.currentTimeMillis();
		System.out.println("start add...");
		// Extract all links from the target page
		List<String> links = getLinks(url);
		System.out.println("url count:" + links.size());
		for (int i = 0; i < links.size(); i++) {
			String link = links.get(i);
			System.out.println((i + 1) + "." + link);
			if (!urls.contains(link)) {
				// Index only pages that have not been indexed yet
				indexer(link, indexPath);
			} else {
				System.out.println("[" + link + "] exist");
			}
		}
		System.out.println("end...");
		long end = System.currentTimeMillis();
		System.out.println("cost " + (end - start) / 1000 + " seconds");
	}
}
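A note on the design: indexer() opens and closes a fresh IndexWriter for every page, which means one commit per document. For a real crawl you would normally share one writer across the whole batch. Below is a minimal sketch of such a helper (hypothetical, not part of the class above; it uses StoredField for the url instead of the manual FieldType, which is equivalent to stored-but-not-indexed):

import org.apache.lucene.document.StoredField;

// Hypothetical alternative to calling indexer() in a loop: one shared writer,
// closed (and thus committed) once when the try-with-resources block exits.
private static void indexAll(List<String> links, String indexPath) throws IOException, ParserException {
	IndexWriterConfig iwConfig = new IndexWriterConfig(new ComplexAnalyzer());
	iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
	try (IndexWriter writer = new IndexWriter(FSDirectory.open(new File(indexPath).toPath()), iwConfig)) {
		for (String link : links) {
			String content = getText(link);
			String title = getTitle(link);
			if (title == null || content == null || content.trim().isEmpty()) {
				continue;
			}
			Document doc = new Document();
			doc.add(new StoredField("url", link)); // stored only, not indexed
			doc.add(new StringField("title", title, Field.Store.YES));
			doc.add(new TextField("content", content, Field.Store.YES));
			writer.addDocument(doc);
			urls.add(link);
		}
	}
}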

Search query:

package cn.slimsmart.lucene.mmseg4j.simple;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import com.chenlb.mmseg4j.analysis.ComplexAnalyzer;

public class Searcher {
	
	/**
	 * Searcher: runs the given query text against the index.
	 * @param words      the query text
	 * @param indexPath  index directory
	 * @throws CorruptIndexException
	 * @throws IOException
	 * @throws ParseException
	 */
	public static void searcher(String words, String indexPath) throws CorruptIndexException,
			IOException, ParseException {
		File indexDir = new File(indexPath);
		// Open the index directory
		Directory dir = FSDirectory.open(indexDir.toPath());
		// Create a reader over the index
		IndexReader reader = DirectoryReader.open(dir);
		// Create the searcher
		IndexSearcher searcher = new IndexSearcher(reader);
		// Chinese word segmentation, matching the analyzer used at index time
		Analyzer analyzer = new ComplexAnalyzer();
		// Parse the query against both the title and content fields
		QueryParser parser = new MultiFieldQueryParser(new String[] { "title", "content" }, analyzer);
		parser.setDefaultOperator(QueryParser.AND_OPERATOR);
		// Build the query from the fields and the query text
		Query query = parser.parse(words);
		System.out.println("Searching for: " + query.toString());
		// Collect the top results, ranked by relevance score
		TopScoreDocCollector collector = TopScoreDocCollector.create(5 * 10);
		searcher.search(query, collector);
		// Fetch the results
		ScoreDoc[] hits = collector.topDocs().scoreDocs;
		int numTotalHits = collector.getTotalHits();
		System.out.println(numTotalHits + " matching pages in total");
		// Print the search results
		for (int i = 0; i < hits.length; i++) {
			Document doc = searcher.doc(hits[i].doc);
			String url = doc.get("url");
			String title = doc.get("title");
			String content = doc.get("content");
			System.out.println((i + 1) + "." + title);
			System.out.println("-----------------------------------");
			// Guard against pages whose text is shorter than 100 characters
			System.out.println(content.substring(0, Math.min(content.length(), 100)) + "......");
			System.out.println("-----------------------------------");
			System.out.println(url);
			System.out.println();
		}
		reader.close();
	}
}
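The pom above also pulls in lucene-highlighter, which this Searcher never uses. If you want query terms marked up in the snippet instead of a blind 100-character prefix, a sketch along these lines should work inside the result loop (query, analyzer, and content come from the surrounding method; getBestFragment additionally throws InvalidTokenOffsetsException, so the throws clause would need extending):

import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

// Wrap matched terms in <b> tags and pick the best-scoring ~100-char fragment
Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter("<b>", "</b>"), new QueryScorer(query));
highlighter.setTextFragmenter(new SimpleFragmenter(100));
String snippet = highlighter.getBestFragment(analyzer, "content", content);
// getBestFragment returns null when no query term occurs in the text
System.out.println(snippet != null ? snippet + "......"
		: content.substring(0, Math.min(content.length(), 100)) + "......");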
Test:

package cn.slimsmart.lucene.mmseg4j.simple;


/**
 * Test
 * @author slimina
 * 2015-06-23
 */
public class Test {

	public static String indexPath = "src/main/resources/index";
	
	public static void main(String[] args) throws Exception {
		// Crawl the sites and build the index
		Indexer.addSite("http://www.baidu.com/", indexPath);
		Indexer.addSite("http://www.csdn.net/", indexPath);
		Indexer.addSite("http://www.oschina.net/", indexPath);
		
		// Search
		Searcher.searcher("java", indexPath);
	}
}

When parsing a site you must tell the parser the site's encoding; this example simply defaults to UTF-8. You can detect the encoding yourself, or use cpdetector (see http://liulijun-cn-2011.iteye.com/blog/1629477 for reference); otherwise the extracted text will be garbled.
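If you prefer not to add another library, the charset declared in the HTTP Content-Type header already covers many sites. Here is a minimal JDK-only sketch (the EncodingSniffer class is hypothetical, and the header can of course be absent or wrong):

package cn.slimsmart.lucene.mmseg4j.simple;

import java.net.URL;
import java.net.URLConnection;

public class EncodingSniffer {

	// Guess a page's encoding from the HTTP Content-Type header,
	// e.g. "text/html; charset=GBK"
	public static String detectEncoding(String url) throws Exception {
		URLConnection conn = new URL(url).openConnection();
		String contentType = conn.getContentType();
		if (contentType != null) {
			for (String part : contentType.split(";")) {
				part = part.trim();
				if (part.toLowerCase().startsWith("charset=")) {
					return part.substring("charset=".length());
				}
			}
		}
		// Fall back to the article's default when no charset is declared
		return "UTF-8";
	}
}

getLinks() could then call parser.setEncoding(EncodingSniffer.detectEncoding(url)) instead of hard-coding UTF-8.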

References:

1. Parsing web page content with htmlparser

2. Learning Lucene: a first look at search engines
