转自:http://callan.javaeye.com/blog/155602
TermVector是Lucene 1.4新增的,它提供一种向量机制来进行模糊查询。TermVector保存Token.getPositionIncrement()、Token.startOffset()以及Token.endOffset()信息。
Field.TermVector.NO:不保存term vectors
Field.TermVector.YES:保存term vectors
Field.TermVector.WITH_POSITIONS:保存term vectors.(保存值和token位置信息)
Field.TermVector.WITH_OFFSETS:保存term vectors.(保存值和Token的offset)
Field.TermVector.WITH_POSITIONS_OFFSETS:保存term vectors.(保存值和token位置信息和Token的offset)
代码:
package com.lucene.search; import java.io.IOException; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Index; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.Field.TermVector; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermFreqVector; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.Hits; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.RAMDirectory; public class TermVectorTest { Analyzer analyzer = new StandardAnalyzer(); RAMDirectory directory = new RAMDirectory(); public void index() throws IOException { IndexWriter indexWriter = new IndexWriter(directory, analyzer, true); Document doc1 = new Document(); doc1.add(new Field("title", "java", Store.YES, Index.TOKENIZED)); doc1.add(new Field("author", "John", Store.YES, Index.TOKENIZED)); doc1.add(new Field("subject", "java一门编程语言", Store.YES, Index.TOKENIZED, TermVector.WITH_POSITIONS_OFFSETS)); indexWriter.addDocument(doc1); Document doc2 = new Document(); doc2.add(new Field("title", "english", Store.YES, Index.TOKENIZED)); doc2.add(new Field("author", "Lucy", Store.YES, Index.TOKENIZED)); doc2.add(new Field("subject", "英语用的人很多", Store.YES, Index.TOKENIZED, TermVector.WITH_POSITIONS_OFFSETS)); indexWriter.addDocument(doc2); Document doc3 = new Document(); doc3.add(new Field("title", "asp", Store.YES, Index.TOKENIZED)); doc3.add(new Field("author", "Lily", Store.YES, Index.TOKENIZED)); doc3.add(new Field("subject", "asp很多人用", Store.YES, Index.TOKENIZED, TermVector.WITH_POSITIONS_OFFSETS)); indexWriter.addDocument(doc3); indexWriter.optimize(); 
indexWriter.close(); } public void searcher() throws IOException { IndexSearcher searcher = new IndexSearcher(directory); // 搜索书名为java的索引 TermQuery query = new TermQuery(new Term("title", "java")); Hits hits = searcher.search(query); // 能找到一条记录 for (int i = 0; i < hits.length(); i++) { Document doc = hits.doc(i); System.out.println("书名:" + doc.get("title") + " 作者: " + doc.get("author") + " 简介: " + doc.get("subject")); System.out.println("相关的书:"); docsLike(hits.id(i)); } } // 在subject中模糊搜索与doc相进的索引 public void docsLike(int id) throws IOException { IndexReader reader = IndexReader.open(directory); TermFreqVector vector = reader.getTermFreqVector(id, "subject"); BooleanQuery query = new BooleanQuery(); for (int j = 0; j < vector.size(); j++) { TermQuery tq = new TermQuery(new Term("subject", vector.getTerms()[j])); query.add(tq, BooleanClause.Occur.SHOULD); } IndexSearcher searcher = new IndexSearcher(directory); Hits hits = searcher.search(query); printResult(hits); } // 显示结果 public void printResult(Hits hits) throws IOException { for (int i = 0; i < hits.length(); i++) { Document doc = hits.doc(i); System.out.println("书名:" + doc.get("title") + " 作者: " + doc.get("author") + " 简介: " + doc.get("subject")); } } public static void main(String[] args) throws IOException { TermVectorTest test = new TermVectorTest(); test.index(); test.searcher(); } }
搜索结果:
书名:java 作者: John 简介: java一门编程语言
相关的书:
书名:java 作者: John 简介: java一门编程语言
书名:english 作者: Lucy 简介: 英语用的人很多
搜索书名为java 的索引,并且搜索与java的简介相关的索引.
将书<<java>>的subject分词为java/一/门/编/程/语/言/
在subject中搜索包含java/一/门/编/程/语/言/的索引
<<english>>的subject"英语用的人很多"中包含"语",因此也被搜索出来;<<java>>自身包含全部这些词,所以也出现在结果中.
更多信息:http://callan.javaeye.com/blog/155611