This example uses Lucene 3.6.
Download: http://www.apache.org/dyn/closer.cgi/lucene/java/3.6.1
For word segmentation it uses the mmseg4j analyzer.
Download: http://code.google.com/p/mmseg4j/downloads/list
The jars used in the project:
lucene-3.6.1/lucene-core-3.6.1.jar (core)
lucene-3.6.1/contrib/lucene-analyzers-3.6.1.jar (analyzers)
lucene-3.6.1/contrib/lucene-highlighter-3.6.1.jar (highlighting)
mmseg4j-1.8.3/mmseg4j-all-1.8.3.jar (third-party Chinese analyzer; Lucene's bundled analyzers ship no Chinese dictionary, so they split Chinese text into single characters)
When using mmseg4j, remember to point it at its dictionary directory:
mmseg4j-1.8.3/data
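A minimal sketch of wiring the dictionary into the analyzer, as the listing below also does. The D:/... path is just where this machine unpacked the download; substitute your own:

import java.io.File;
import org.apache.lucene.analysis.Analyzer;
import com.chenlb.mmseg4j.analysis.MaxWordAnalyzer;

public class AnalyzerSetup {
    public static Analyzer create() {
        // point mmseg4j at the unpacked dictionary directory
        File dicDir = new File("D:/My Documents/Downloads/mmseg4j-1.8.3/data");
        return new MaxWordAnalyzer(dicDir);
    }
}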
package lucene;

import java.io.File;
import java.io.StringReader;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

import com.chenlb.mmseg4j.analysis.MaxWordAnalyzer;

public class LuceneSearch {

    // mmseg4j dictionary path
    private static final String DISC_URL = "D:/My Documents/Downloads/mmseg4j-1.8.3/data";

    // Analyzer to use: StandardAnalyzer, MaxWordAnalyzer, SimpleAnalyzer, ComplexAnalyzer
    private static Analyzer analyzer = new MaxWordAnalyzer(new File(DISC_URL));

    // Lucene version
    private static Version version = Version.LUCENE_36;

    // Where the index lives: RAMDirectory (in memory) or FSDirectory (on disk)
    private static Directory directory = new RAMDirectory();

    // IndexWriter configuration
    private static IndexWriterConfig conf = new IndexWriterConfig(version, analyzer);

    private static IndexWriter writer;

    static {
        try {
            writer = new IndexWriter(directory, conf);
        } catch (Exception e) {
            // don't swallow setup failures silently
            e.printStackTrace();
        }
    }

    /**
     * Full indexing: wipes the index and re-adds every document.
     * @Author TangJiaZhi
     */
    public void fullIndex(Document[] documents) throws Exception {
        writer.deleteAll();
        for (Document document : documents) {
            writer.addDocument(document);
        }
        writer.commit();
        // writer.close();
    }

    /**
     * Delete a document from the index by its id field.
     * @Author TangJiaZhi
     */
    public void deleteIndex(Document document) throws Exception {
        Term term = new Term("id", document.get("id"));
        writer.deleteDocuments(term);
        writer.commit();
    }

    /**
     * Incremental indexing: update documents in place by id.
     * @Author TangJiaZhi
     */
    public void updateIndex(Document[] documents) throws Exception {
        for (Document document : documents) {
            Term term = new Term("id", document.get("id"));
            writer.updateDocument(term, document);
        }
        writer.commit();
        // writer.close();
    }

    /**
     * Plain term search with paging.
     * @Author TangJiaZhi
     */
    public void simpleSearch(String fieldStr, String queryStr, int page, int pageSize) throws Exception {
        IndexReader reader = IndexReader.open(directory);
        IndexSearcher searcher = new IndexSearcher(reader);
        TopScoreDocCollector topCollector = TopScoreDocCollector.create(searcher.maxDoc(), false);
        Term term = new Term(fieldStr, queryStr);
        Query query = new TermQuery(term);
        searcher.search(query, topCollector);
        ScoreDoc[] docs = topCollector.topDocs((page - 1) * pageSize, pageSize).scoreDocs;
        printScoreDoc(docs, searcher);
    }

    /**
     * Search with hit highlighting.
     * @Author TangJiaZhi
     */
    public void highLightSearch(String fieldStr, String queryStr, int page, int pageSize) throws Exception {
        IndexReader reader = IndexReader.open(directory);
        IndexSearcher searcher = new IndexSearcher(reader);
        TopScoreDocCollector topCollector = TopScoreDocCollector.create(searcher.maxDoc(), false);
        Term term = new Term(fieldStr, queryStr);
        Query query = new TermQuery(term);
        searcher.search(query, topCollector);
        ScoreDoc[] docs = topCollector.topDocs((page - 1) * pageSize, pageSize).scoreDocs;
        // wrap matched terms in <span> tags
        Formatter formatter = new SimpleHTMLFormatter("<span>", "</span>");
        Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
        for (int i = 0; i < docs.length; i++) {
            List<Fieldable> list = searcher.doc(docs[i].doc).getFields();
            for (Fieldable fieldable : list) {
                String fieldName = fieldable.name();
                String fieldValue = fieldable.stringValue();
                TokenStream ts = analyzer.tokenStream(fieldName, new StringReader(fieldValue));
                String fragment = highlighter.getBestFragment(ts, fieldValue);
                System.out.println(fieldName + " : " + fragment);
            }
        }
    }

    /**
     * Prefix search.
     * @Author TangJiaZhi
     */
    public void prefixSearch(String fieldStr, String queryStr) throws Exception {
        IndexReader reader = IndexReader.open(directory);
        IndexSearcher searcher = new IndexSearcher(reader);
        Term term = new Term(fieldStr, queryStr);
        Query query = new PrefixQuery(term);
        ScoreDoc[] docs = searcher.search(query, 3).scoreDocs;
        printScoreDoc(docs, searcher);
    }

    /**
     * Wildcard search.
     * @Author TangJiaZhi
     */
    public void wildcardSearch(String fieldStr, String queryStr) throws Exception {
        IndexReader reader = IndexReader.open(directory);
        IndexSearcher searcher = new IndexSearcher(reader);
        Term term = new Term(fieldStr, queryStr);
        Query query = new WildcardQuery(term);
        ScoreDoc[] docs = searcher.search(query, 3).scoreDocs;
        printScoreDoc(docs, searcher);
    }

    /**
     * Analyzed search: the query string is tokenized before searching.
     * @Author TangJiaZhi
     */
    public void analyzerSearch(String fieldStr, String queryStr) throws Exception {
        IndexReader reader = IndexReader.open(directory);
        IndexSearcher searcher = new IndexSearcher(reader);
        QueryParser queryParser = new QueryParser(version, fieldStr, analyzer);
        Query query = queryParser.parse(queryStr);
        ScoreDoc[] docs = searcher.search(query, 3).scoreDocs;
        printScoreDoc(docs, searcher);
    }

    /**
     * Analyzed search across multiple fields.
     * @Author TangJiaZhi
     */
    public void multiAnalyzerSearch(String[] fieldStr, String queryStr) throws Exception {
        IndexReader reader = IndexReader.open(directory);
        IndexSearcher searcher = new IndexSearcher(reader);
        QueryParser queryParser = new MultiFieldQueryParser(version, fieldStr, analyzer);
        Query query = queryParser.parse(queryStr);
        ScoreDoc[] docs = searcher.search(query, 3).scoreDocs;
        printScoreDoc(docs, searcher);
    }

    public void printScoreDoc(ScoreDoc[] docs, IndexSearcher searcher) throws Exception {
        for (int i = 0; i < docs.length; i++) {
            List<Fieldable> list = searcher.doc(docs[i].doc).getFields();
            for (Fieldable fieldable : list) {
                String fieldName = fieldable.name();
                String fieldValue = fieldable.stringValue();
                System.out.println(fieldName + " : " + fieldValue);
            }
        }
    }

    public static void main(String[] args) throws Exception {
        LuceneSearch t = new LuceneSearch();

        Document d1 = new Document();
        d1.add(new Field("id", "1", Store.YES, Index.ANALYZED));
        d1.add(new Field("name", "苦逼的程序员", Store.YES, Index.ANALYZED));

        Document d2 = new Document();
        d2.add(new Field("id", "2", Store.YES, Index.ANALYZED));
        d2.add(new Field("name", "2b的程序员", Store.YES, Index.ANALYZED));

        Document[] documents = {d1, d2};

        System.out.println("-------------------------- full index --------------------------");
        t.fullIndex(documents);
        t.simpleSearch("name", "程序", 1, 10);
        t.highLightSearch("name", "程序", 1, 10);

        System.out.println("-------------------------- incremental index --------------------------");
        d1.removeField("name");
        d1.add(new Field("name", "程序", Store.YES, Index.ANALYZED));
        t.updateIndex(documents);
        t.simpleSearch("name", "程序", 1, 10);

        System.out.println("-------------------------- delete from index --------------------------");
        t.deleteIndex(d1);
        t.simpleSearch("name", "2b", 1, 10);

        System.out.println("-------------------------- analyzed search --------------------------");
        t.multiAnalyzerSearch(new String[]{"id", "name"}, "苦逼的程序员");
    }
}
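The listing keeps the index in a RAMDirectory, so it disappears when the JVM exits. Switching to a persistent on-disk index is a one-line change; a sketch, assuming a writable ./lucene-index directory (the path is made up for illustration):

import java.io.File;
import java.io.IOException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class DirectoryChoice {
    // swap RAMDirectory for an index stored on disk
    public static Directory open() throws IOException {
        return FSDirectory.open(new File("./lucene-index"));
    }
}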
When you search through the QueryParser API, the query string is first tokenized with the given analyzer and only then searched. That is why
t.multiAnalyzerSearch(new String[]{"id","name"}, "苦逼的程序员");
returns results even though no single indexed term equals the whole phrase. A token dump makes this visible; here is a sketch, assuming the same mmseg4j dictionary path as the listing above. If the dictionary is set up correctly, 程序 should appear among the tokens, which is what lets the earlier term search on 程序 match:
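import java.io.File;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.chenlb.mmseg4j.analysis.MaxWordAnalyzer;

public class TokenDump {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new MaxWordAnalyzer(new File("D:/My Documents/Downloads/mmseg4j-1.8.3/data"));
        TokenStream ts = analyzer.tokenStream("name", new StringReader("苦逼的程序员"));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            // each token printed here is a term the QueryParser would search for
            System.out.println(term.toString());
        }
        ts.end();
        ts.close();
    }
}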
That said, for the kind of site search real projects actually need, Lucene's sub-project Solr gets you there with far less effort.