lucene初探(二):创建索引,查询索引

上一篇《lucene初探(一):IKAnalyzer2012中文分词扩展初探》(http://beyondqinghua.iteye.com/admin/blogs/1835986)已经尝试使用IK来分词。这次我们将学习如何将IK整合到lucene中来创建索引并检索索引。例子分别使用lucene自带的英文分词工具和IK中文分词工具,代码依赖的包与《IKAnalyzer2012中文分词扩展初探》一样。

1、创建一个模型对象

写道
package com.iris.scm.lucene.model;

/**
 * Plain data holder for one publication record, with a Chinese and an English title/abstract.
 *
 * <p>All properties are nullable and default to {@code null} until set.
 */
public class Publication {

	private Long id;
	private String zhTitle;
	private String enTitle;
	private String zhAbstract;
	private String enAbstract;
	private Integer publishYear;

	/** Creates an empty publication; every property starts as {@code null}. */
	public Publication() {
		super();
	}

	/** @return the publication id, or {@code null} if not set */
	public Long getId() {
		return id;
	}

	/** @param id the publication id */
	public void setId(Long id) {
		this.id = id;
	}

	/** @return the Chinese title, or {@code null} if not set */
	public String getZhTitle() {
		return zhTitle;
	}

	/** @param zhTitle the Chinese title */
	public void setZhTitle(String zhTitle) {
		this.zhTitle = zhTitle;
	}

	/** @return the English title, or {@code null} if not set */
	public String getEnTitle() {
		return enTitle;
	}

	/** @param enTitle the English title */
	public void setEnTitle(String enTitle) {
		this.enTitle = enTitle;
	}

	/** @return the Chinese abstract, or {@code null} if not set */
	public String getZhAbstract() {
		return zhAbstract;
	}

	/** @param zhAbstract the Chinese abstract */
	public void setZhAbstract(String zhAbstract) {
		this.zhAbstract = zhAbstract;
	}

	/** @return the English abstract, or {@code null} if not set */
	public String getEnAbstract() {
		return enAbstract;
	}

	/** @param enAbstract the English abstract */
	public void setEnAbstract(String enAbstract) {
		this.enAbstract = enAbstract;
	}

	/** @return the year of publication, or {@code null} if not set */
	public Integer getPublishYear() {
		return publishYear;
	}

	/** @param publishYear the year of publication */
	public void setPublishYear(Integer publishYear) {
		this.publishYear = publishYear;
	}
}

 

2、创建索引、查询索引

 

package com.iris.scm.lucene.test;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

import com.iris.scm.lucene.model.Publication;

public class LuceneTest {

	// Analyzers: analyzerEn is used for the English fields, analyzerZh for the Chinese fields
	private Analyzer analyzerEn;
	private Analyzer analyzerZh;
	// Directory that stores the Chinese index
	private Directory directoryZh;
	// Directory that stores the English index
	private Directory directoryEn;

	/**
	 * Entry point: initialises the analyzers and index directories, then runs the Chinese and the
	 * English sample search.
	 *
	 * @param args command-line arguments (unused)
	 * @throws Exception if initialisation or either search fails
	 */
	public static void main(String[] args) throws Exception {
		LuceneTest demo = new LuceneTest();
		demo.initDir();
		// Index creation is disabled here; enable the next line to write the sample documents
		// into the indexes before searching.
		// demo.createIndex();
		demo.searchZhPub();
		demo.searchEnPub();
	}

	/**
	 * Sets up both analyzers and opens both file-system index directories.
	 *
	 * @throws IOException if an index directory cannot be opened
	 */
	public void initDir() throws IOException {
		// English text is analysed with Lucene's EnglishAnalyzer, matched to Lucene 3.6.
		analyzerEn = new EnglishAnalyzer(Version.LUCENE_36);

		// Chinese text is analysed with IK, with smart segmentation switched on.
		IKAnalyzer ikAnalyzer = new IKAnalyzer();
		ikAnalyzer.setUseSmart(true);
		analyzerZh = ikAnalyzer;

		// Each language has its own index directory at a fixed absolute path.
		directoryZh = FSDirectory.open(new File("d:/lucene/LuceneTestZh"));
		directoryEn = FSDirectory.open(new File("d:/lucene/LuceneTestEn"));
	}

	/**
	 * Writes the sample data into the indexes: two Chinese publications go into the Chinese index
	 * and two English publications into the English index.
	 *
	 * <p>Both writers are closed in {@code finally} blocks, so the index write lock is released
	 * even when opening the second writer or adding a document fails.
	 *
	 * <p>NOTE(review): the writer configuration does not set an open mode, so with Lucene's default
	 * an existing index is appended to and calling this method again adds the same documents a
	 * second time — confirm against the IndexWriterConfig documentation.
	 *
	 * @throws IOException if an index cannot be opened, written or closed
	 */
	public void createIndex() throws IOException {

		// Sample publications, Chinese variant
		Publication pubZh1 = new Publication();
		pubZh1.setId(123456L);
		pubZh1.setPublishYear(2010);
		pubZh1.setZhTitle("金刚石薄膜抗激光破坏研究");
		pubZh1.setZhAbstract("介绍了金刚石优异的光学和力学特性,对金刚石薄膜在从紫外到红外波段以及不同脉宽激光参数下的激光损伤行为和损伤阈值进行了评述。");
		// Sample publications, English variant
		Publication pubEn1 = new Publication();
		pubEn1.setId(123456L);
		pubEn1.setPublishYear(2010);
		pubEn1.setEnTitle("Laser induced damage for diamond films");
		pubEn1.setEnAbstract("The outstanding optical and mechanical properties of diamond films are summarized.  ");

		// Chinese
		Publication pubZh2 = new Publication();
		pubZh2.setId(68545L);
		pubZh2.setPublishYear(2009);
		pubZh2.setZhTitle("激光破坏金刚石薄膜研究");
		pubZh2.setZhAbstract("分析了不同激光工作参数对金刚石薄膜的激光损伤机理,认为石墨化导致晶格失稳是金刚石薄膜激光损伤的主要原因。金刚石薄膜石墨化有两种方式:垂直表面向体层方向石墨化和平行表面按分层的方式逐层石墨化。");
		// English
		Publication pubEn2 = new Publication();
		pubEn2.setId(68545L);
		pubEn2.setPublishYear(2009);
		pubEn2.setEnTitle("Laser destruction of the diamond thin films");
		pubEn2.setEnAbstract(" Laser damage for films irradiated with different wave lengths and pulse width are reviewed and the laser damage mechanism analyzed for different parameters. It is found that graphitization induced instability of the crystal lattice is the main reason for laser induced damage. There are two ways that lead to graphitized damage on the surface of diamond films under long and short laser pulses. For nanosecond or longer laser pul...");

		// One Lucene Document per publication and language
		Document docZh1 = buildDocument(pubZh1.getId(), pubZh1.getPublishYear(),
				"zh_title", pubZh1.getZhTitle(), "zh_abstract", pubZh1.getZhAbstract());
		Document docZh2 = buildDocument(pubZh2.getId(), pubZh2.getPublishYear(),
				"zh_title", pubZh2.getZhTitle(), "zh_abstract", pubZh2.getZhAbstract());
		Document docEn1 = buildDocument(pubEn1.getId(), pubEn1.getPublishYear(),
				"en_title", pubEn1.getEnTitle(), "en_abstract", pubEn1.getEnAbstract());
		Document docEn2 = buildDocument(pubEn2.getId(), pubEn2.getPublishYear(),
				"en_title", pubEn2.getEnTitle(), "en_abstract", pubEn2.getEnAbstract());

		// Writer configuration: Lucene version to match plus the analyzer for that language
		IndexWriterConfig indexWriterConfigZh = new IndexWriterConfig(Version.LUCENE_36, analyzerZh);
		IndexWriterConfig indexWriterConfigEn = new IndexWriterConfig(Version.LUCENE_36, analyzerEn);

		// The IndexWriter creates and maintains the index; close() commits the changes.
		IndexWriter indexWriterZh = new IndexWriter(directoryZh, indexWriterConfigZh);
		try {
			IndexWriter indexWriterEn = new IndexWriter(directoryEn, indexWriterConfigEn);
			try {
				indexWriterZh.addDocument(docZh1);
				indexWriterZh.addDocument(docZh2);
				indexWriterEn.addDocument(docEn1);
				indexWriterEn.addDocument(docEn2);
			} finally {
				indexWriterEn.close();
			}
		} finally {
			indexWriterZh.close();
		}
	}

	/**
	 * Builds the Document for one publication in one language.
	 *
	 * <p>{@code id} and {@code publish_year} are stored and indexed as single un-analyzed terms;
	 * title and abstract are stored and indexed through the writer's analyzer.
	 *
	 * @param id            publication id, must not be {@code null}
	 * @param publishYear   year of publication, must not be {@code null}
	 * @param titleField    index field name for the title
	 * @param title         title text
	 * @param abstractField index field name for the abstract
	 * @param abstractText  abstract text
	 * @return the populated document
	 */
	private static Document buildDocument(Long id, Integer publishYear, String titleField, String title,
			String abstractField, String abstractText) {
		Document doc = new Document();
		doc.add(new Field("id", id.toString(), Store.YES, Index.NOT_ANALYZED));
		doc.add(new Field("publish_year", publishYear.toString(), Store.YES, Index.NOT_ANALYZED));
		doc.add(new Field(titleField, title, Store.YES, Index.ANALYZED));
		doc.add(new Field(abstractField, abstractText, Store.YES, Index.ANALYZED));
		return doc;
	}

	/**
	 * Searches the Chinese index for a fixed keyword in the title and abstract fields and prints
	 * the hit count, each hit's score and the matching publications.
	 *
	 * <p>The reader and searcher are closed in {@code finally} blocks so they are released even
	 * when the search or the highlighting fails.
	 *
	 * @throws ParseException               if the keyword cannot be parsed into a query
	 * @throws CorruptIndexException        if the index is corrupt
	 * @throws IOException                  if the index cannot be read
	 * @throws InvalidTokenOffsetsException if the highlighter meets inconsistent token offsets
	 */
	public void searchZhPub() throws ParseException, CorruptIndexException, IOException, InvalidTokenOffsetsException {
		// Keyword to search for
		String queryKeyWord = "金刚石薄膜";

		// The parser turns the keyword into a Query over several fields. To search a single
		// field instead, use: new QueryParser(Version.LUCENE_36, "zh_title", analyzerZh)
		String[] fields = { "zh_title", "zh_abstract" };
		QueryParser queryParser = new MultiFieldQueryParser(Version.LUCENE_36, fields, analyzerZh);
		Query query = queryParser.parse(queryKeyWord);

		// Typed list: a raw List cannot be iterated as Publication in the print loop below.
		List<Publication> pubs = new ArrayList<Publication>();

		IndexReader indexReader = IndexReader.open(directoryZh);
		try {
			IndexSearcher indexSearcher = new IndexSearcher(indexReader);
			try {
				// Only the top 100 hits are returned
				TopDocs topDocs = indexSearcher.search(query, 100);

				int totalCount = topDocs.totalHits; // total number of hits
				System.out.println("搜索到的结果总数量为:" + totalCount);

				ScoreDoc[] scoreDocs = topDocs.scoreDocs; // the returned hits

				// Highlighter that picks the best-matching fragment of up to 100 characters.
				// NOTE(review): the pre/post tags are empty strings, so matched terms are not
				// visibly marked — supply real tags here if visible highlighting is wanted.
				Formatter formatter = new SimpleHTMLFormatter("", "");
				Scorer fragmentScore = new QueryScorer(query);
				Highlighter highlighter = new Highlighter(formatter, fragmentScore);
				Fragmenter fragmenter = new SimpleFragmenter(100);
				highlighter.setTextFragmenter(fragmenter);

				// Copy every hit into a Publication
				for (ScoreDoc scoreDoc : scoreDocs) {
					int docID = scoreDoc.doc; // document number of this hit
					float score = scoreDoc.score; // relevance score of this hit
					System.out.println("score is : " + score);

					Document document = indexSearcher.doc(docID);
					Publication pubZh = new Publication();
					pubZh.setId(Long.parseLong(document.get("id")));

					// Title: best fragment, or the full title when it has no match
					String zhTitle = document.get("zh_title");
					String highlighterTitle = highlighter.getBestFragment(analyzerZh, "zh_title", zhTitle);
					if (highlighterTitle == null) {
						highlighterTitle = zhTitle;
					}
					pubZh.setZhTitle(highlighterTitle);

					// Abstract: best fragment, or the full abstract when it has no match
					String zhAbstract = document.get("zh_abstract");
					String highlighterAbstract = highlighter.getBestFragment(analyzerZh, "zh_abstract", zhAbstract);
					if (highlighterAbstract == null) {
						highlighterAbstract = zhAbstract;
					}
					pubZh.setZhAbstract(highlighterAbstract);

					pubZh.setPublishYear(Integer.parseInt(document.get("publish_year")));

					pubs.add(pubZh);
				}
			} finally {
				indexSearcher.close();
			}
		} finally {
			indexReader.close();
		}

		for (Publication pub : pubs) {
			System.out.println("pub'id is : " + pub.getId());
			System.out.println("pub'publish year is : " + pub.getPublishYear());
			System.out.println("pub'title is : " + pub.getZhTitle());
			System.out.println("pub'abstract is : " + pub.getZhAbstract());
		}
	}

	/**
	 * Searches the English index for a fixed keyword in the title and abstract fields and prints
	 * the hit count, each hit's score and the matching publications.
	 *
	 * <p>The reader and searcher are closed in {@code finally} blocks so they are released even
	 * when the search or the highlighting fails.
	 *
	 * @throws ParseException               if the keyword cannot be parsed into a query
	 * @throws CorruptIndexException        if the index is corrupt
	 * @throws IOException                  if the index cannot be read
	 * @throws InvalidTokenOffsetsException if the highlighter meets inconsistent token offsets
	 */
	public void searchEnPub() throws ParseException, CorruptIndexException, IOException, InvalidTokenOffsetsException {
		// Keyword to search for
		String queryKeyWord = "diamond films";

		// The parser turns the keyword into a Query over several fields. To search a single
		// field instead, use: new QueryParser(Version.LUCENE_36, "en_title", analyzerEn)
		String[] fields = { "en_title", "en_abstract" };
		QueryParser queryParser = new MultiFieldQueryParser(Version.LUCENE_36, fields, analyzerEn);
		Query query = queryParser.parse(queryKeyWord);

		// Typed list: a raw List cannot be iterated as Publication in the print loop below.
		List<Publication> pubs = new ArrayList<Publication>();

		IndexReader indexReader = IndexReader.open(directoryEn);
		try {
			IndexSearcher indexSearcher = new IndexSearcher(indexReader);
			try {
				// Only the top 100 hits are returned
				TopDocs topDocs = indexSearcher.search(query, 100);

				int totalCount = topDocs.totalHits; // total number of hits
				System.out.println("搜索到的结果总数量为:" + totalCount);

				ScoreDoc[] scoreDocs = topDocs.scoreDocs; // the returned hits

				// Highlighter that picks the best-matching fragment of up to 100 characters.
				// NOTE(review): the pre/post tags are empty strings, so matched terms are not
				// visibly marked — supply real tags here if visible highlighting is wanted.
				Formatter formatter = new SimpleHTMLFormatter("", "");
				Scorer fragmentScore = new QueryScorer(query);
				Highlighter highlighter = new Highlighter(formatter, fragmentScore);
				Fragmenter fragmenter = new SimpleFragmenter(100);
				highlighter.setTextFragmenter(fragmenter);

				// Copy every hit into a Publication
				for (ScoreDoc scoreDoc : scoreDocs) {
					int docID = scoreDoc.doc; // document number of this hit
					float score = scoreDoc.score; // relevance score of this hit
					System.out.println("score is : " + score);

					Document document = indexSearcher.doc(docID);
					Publication pubEn = new Publication();
					pubEn.setId(Long.parseLong(document.get("id")));

					// Title: best fragment, or the full title when it has no match
					String enTitle = document.get("en_title");
					String highlighterTitle = highlighter.getBestFragment(analyzerEn, "en_title", enTitle);
					if (highlighterTitle == null) {
						highlighterTitle = enTitle;
					}
					pubEn.setEnTitle(highlighterTitle);

					// Abstract: best fragment, or the full abstract when it has no match
					String enAbstract = document.get("en_abstract");
					String highlighterAbstract = highlighter.getBestFragment(analyzerEn, "en_abstract", enAbstract);
					if (highlighterAbstract == null) {
						highlighterAbstract = enAbstract;
					}
					pubEn.setEnAbstract(highlighterAbstract);

					pubEn.setPublishYear(Integer.parseInt(document.get("publish_year")));

					pubs.add(pubEn);
				}
			} finally {
				indexSearcher.close();
			}
		} finally {
			indexReader.close();
		}

		for (Publication pub : pubs) {
			System.out.println("pub'id is : " + pub.getId());
			System.out.println("pub'publish year is : " + pub.getPublishYear());
			System.out.println("pub'title is : " + pub.getEnTitle());
			System.out.println("pub'abstract is : " + pub.getEnAbstract());
		}
	}
}

 3、结果

写道
加载扩展词典:ext.dic
加载扩展停止词典:stopword.dic
加载扩展停止词典:stopword_chinese.dic
搜索到的结果总数量为:2
score is : 0.30121902
score is : 0.24961227
pub'id is : 68545
pub'publish year is : 2009
pub'title is : 激光破坏金刚石薄膜研究
pub'abstract is : 分析了不同激光工作参数对金刚石薄膜的激光损伤机理,认为石墨化导致晶格失稳是金刚石薄膜激光损伤的主要原因。金刚石薄膜石墨化有两种方式:垂直表面向体层方向石墨化和平行表面按分层的方式逐层石墨化。
pub'id is : 123456
pub'publish year is : 2010
pub'title is : 金刚石薄膜抗激光破坏研究
pub'abstract is : 介绍了金刚石优异的光学和力学特性,对金刚石薄膜在从紫外到红外波段以及不同脉宽激光参数下的激光损伤行为和损伤阈值进行了评述。
搜索到的结果总数量为:2
score is : 0.48305953
score is : 0.34981734
pub'id is : 123456
pub'publish year is : 2010
pub'title is : Laser induced damage for diamond films
pub'abstract is : The outstanding optical and mechanical properties of diamond films are summarized.
pub'id is : 68545
pub'publish year is : 2009
pub'title is : Laser destruction of the diamond thin films
pub'abstract is : that lead to graphitized damage on the surface of diamond films under long and short laser pulses

 

你可能感兴趣的:(lucene)