No more preamble; let's go straight to the example code. If you have read the previous articles in this series, this code will be easy for you. What matters most is understanding it.
The following two source files belong to the same project:
IndexDocument.java
package baseSample;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;

public class IndexDocument {

    public static Directory getIndexDirectory(Directory directory, Analyzer analyzer)
            throws CorruptIndexException, LockObtainFailedException, IOException {
        IndexWriter iwriter = new IndexWriter(directory, analyzer, true,
                new IndexWriter.MaxFieldLength(25000));

        // Indexing-time tuning knobs:
        // iwriter.setMergeFactor(10);    // merge factor
        // iwriter.setMaxMergeDocs(2000); // maximum number of documents per segment
        // iwriter.setMaxBufferedDocs(1); // number of documents buffered in memory

        // news fields
        Field newsId = null;
        Field newsName = null;
        Field publishDate = null;
        Field newsSource = null;
        Field newssummay = null;

        // News item 1
        Document doc1 = new Document();
        newsId = new Field("newsId", "aaaa", Field.Store.YES, Field.Index.NOT_ANALYZED);
        newsName = new Field("newsName", "江苏常州曝疫苗造假大案7人被捕超百万人受害",
                Field.Store.YES, Field.Index.ANALYZED);
        publishDate = new Field("publishDate", "2010/3/30", Field.Store.YES, Field.Index.NOT_ANALYZED);
        newsSource = new Field("newsSource", "网易新闻频道", Field.Store.YES, Field.Index.ANALYZED);
        newssummay = new Field("newssummay",
                "据香港明报报道,江苏常州爆出疫苗造假大案。当地著名疫苗生产商江苏延申生物科技股份有限公司(简称“江苏延申”)被国家药监局查实在疫苗生产过程中长期故意造假,导致大量问题疫苗流向市场,受害者最少超过100万人。",
                Field.Store.YES, Field.Index.ANALYZED);
        doc1.add(newsId);
        doc1.add(newsName);
        doc1.add(publishDate);
        doc1.add(newsSource);
        doc1.add(newssummay);
        iwriter.addDocument(doc1);

        // News item 2
        Document doc2 = new Document();
        newsId = new Field("newsId", "bbbb", Field.Store.YES, Field.Index.NOT_ANALYZED);
        newsName = new Field("newsName", "富士康一月内发生三起坠楼案", Field.Store.YES, Field.Index.ANALYZED);
        publishDate = new Field("publishDate", "2010/3/30", Field.Store.YES, Field.Index.NOT_ANALYZED);
        newsSource = new Field("newsSource", "广州日报", Field.Store.YES, Field.Index.ANALYZED);
        newssummay = new Field("newssummay",
                "昨日凌晨3时左右,富士康科技集团龙华厂区的一名23岁湖南籍男性员工从宿舍楼上坠下,当场死亡",
                Field.Store.YES, Field.Index.ANALYZED);
        doc2.add(newsId);
        doc2.add(newsName);
        doc2.add(publishDate);
        doc2.add(newsSource);
        doc2.add(newssummay);
        iwriter.addDocument(doc2);

        // News item 3
        Document doc3 = new Document();
        newsId = new Field("newsId", "cccc", Field.Store.YES, Field.Index.NOT_ANALYZED);
        newsName = new Field("newsName", "普京称要消灭掉制造地铁爆炸案恐怖分子", Field.Store.YES, Field.Index.ANALYZED);
        publishDate = new Field("publishDate", "2010/3/30", Field.Store.YES, Field.Index.NOT_ANALYZED);
        newsSource = new Field("newsSource", "网易新闻频道", Field.Store.YES, Field.Index.ANALYZED);
        newssummay = new Field("newssummay",
                "据外电报道,俄罗斯总理普京29日表示,当天制造莫斯科地铁连环爆炸案的恐怖分子一定会被抓到,并被消灭掉。",
                Field.Store.YES, Field.Index.ANALYZED);
        doc3.add(newsId);
        doc3.add(newsName);
        doc3.add(publishDate);
        doc3.add(newsSource);
        doc3.add(newssummay);
        // doc3.setBoost(2);
        iwriter.addDocument(doc3);

        // News item 4
        Document doc4 = new Document();
        newsId = new Field("newsId", "cccc", Field.Store.YES, Field.Index.NOT_ANALYZED);
        newsName = new Field("newsName", "最天使", Field.Store.YES, Field.Index.ANALYZED);
        publishDate = new Field("publishDate", "2009/3/30", Field.Store.YES, Field.Index.NOT_ANALYZED);
        newsSource = new Field("newsSource", "易", Field.Store.YES, Field.Index.ANALYZED);
        newssummay = new Field("newssummay", "长肥了", Field.Store.YES, Field.Index.ANALYZED);
        doc4.add(newsId);
        doc4.add(newsName);
        doc4.add(publishDate);
        doc4.add(newsSource);
        doc4.add(newssummay);
        iwriter.addDocument(doc4);

        iwriter.close();
        return directory;
    }
}
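As a quick aside: if you only want to experiment with this indexer, you do not need a disk index at all. The sketch below (InMemoryIndexDemo is a hypothetical helper class, not part of the original project) runs the same getIndexDirectory against an in-memory RAMDirectory, assuming Lucene 3.0 on the classpath:

package baseSample;

import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

// Hypothetical demo class: builds the four-document news index entirely in memory.
public class InMemoryIndexDemo {
    public static void main(String[] args) throws Exception {
        Directory directory = new RAMDirectory(); // nothing is written to disk
        IndexDocument.getIndexDirectory(directory, new CJKAnalyzer(Version.LUCENE_30));
        System.out.println("Indexed 4 news documents in memory.");
    }
}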
SampleSearch.java
package baseSample;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;

public class SampleSearch {

    public static void main(String arg[]) throws CorruptIndexException,
            LockObtainFailedException, IOException, ParseException {
        // Store the index in memory:
        // Directory directory = new RAMDirectory();
        // To store an index on disk, use this instead
        // (despite the .txt name, this path serves as the index location):
        File file = new File("D:/mapreduce-out/lucenetmp/cache.txt");
        if (file.exists()) {
            System.out.println("File already exists; deleting it");
            file.delete();
        }
        Directory directory = FSDirectory.open(file);

        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
        analyzer = new CJKAnalyzer(Version.LUCENE_30); // replace with a CJK-aware analyzer

        // Now search the index. This call also writes Lucene's index files.
        IndexSearcher isearcher = new IndexSearcher(
                IndexDocument.getIndexDirectory(directory, analyzer), true);

        /**
         * The main search methods of IndexSearcher:
         * isearcher.search(Query query, Collector results);
         * isearcher.search(Query query, int n);
         * isearcher.search(Query query, Filter filter, Collector results);
         */

        // A Term is the basic unit of a query.
        // 1. TermQuery
        Query termQuery = new TermQuery(new Term("newsSource", "网易"));
        System.out.println("--- termQuery : " + termQuery.toString());

        // 2. BooleanQuery; Lucene similarly provides range search (TermRangeQuery
        //    in 3.0), PrefixQuery (prefix search), FuzzyQuery (fuzzy search), etc.
        Query a = new TermQuery(new Term("newsSource", "网"));
        Query b = new TermQuery(new Term("newsSource", "易"));
        BooleanQuery booleanQuery = new BooleanQuery();
        booleanQuery.add(a, BooleanClause.Occur.MUST);
        booleanQuery.add(b, BooleanClause.Occur.MUST);
        System.out.println("--- booleanQuery :" + booleanQuery.toString());

        // 3. Use QueryParser to tokenize a query string into a Query
        System.out.println("Current Lucene version : " + Version.LUCENE_CURRENT);
        QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, "newsSource", analyzer);
        parser.setDefaultOperator(QueryParser.AND_OPERATOR); // terms are OR'ed by default; switch to AND
        Query parserQuery = parser.parse("java lucene");
        System.out.println("--- parserQuery : " + parserQuery.toString());

        // 4. Use MultiFieldQueryParser to query several fields at once
        String[] fields = { "newsName", "newsSource" };
        MultiFieldQueryParser mparser = new MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, analyzer);
        Query mQuery = mparser.parse("江苏");
        System.out.println("---- mQuery :" + mQuery);

        ScoreDoc[] docs = isearcher.search(termQuery, 10).scoreDocs;
        for (int i = 0; i < docs.length; i++) {
            System.out.println(docs[i].doc);
            System.out.println("searcher score :" + docs[i].score);
            Document hitDoc = isearcher.doc(docs[i].doc);
            System.out.println("--- explain : " + isearcher.explain(termQuery, docs[i].doc));
            System.out.println("boost:" + hitDoc.getBoost());
            System.out.println("newsId:" + hitDoc.get("newsId"));
            System.out.println("newsName:" + hitDoc.get("newsName"));
            System.out.println("publishDate:" + hitDoc.get("publishDate"));
            System.out.println("newsSource:" + hitDoc.get("newsSource"));
            System.out.println("newssummay:" + hitDoc.get("newssummay"));
            System.out.println("------------------------------------------");
        }
        isearcher.close();
    }
}
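The comment at step 2 above mentions range, prefix, and fuzzy queries without showing them. Here is a hedged sketch of what those look like against the same news index; note that in Lucene 3.0 the range variant is called TermRangeQuery. These lines are meant to be pasted into SampleSearch.main:

// Sketch of the other query types mentioned at step 2 (Lucene 3.0 API).
// Requires: import org.apache.lucene.search.PrefixQuery;
//           import org.apache.lucene.search.FuzzyQuery;
//           import org.apache.lucene.search.TermRangeQuery;
Query prefixQuery = new PrefixQuery(new Term("newsSource", "网"));       // terms starting with 网
Query fuzzyQuery = new FuzzyQuery(new Term("newsSource", "网易"), 0.5f); // similarity threshold 0.5
Query rangeQuery = new TermRangeQuery("publishDate", "2009/1/1", "2010/12/31",
        true, true);                                                     // both bounds inclusive
System.out.println(prefixQuery + " | " + fuzzyQuery + " | " + rangeQuery);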
The next two source files also go together:
TextFileIndexer.java
package lighter.javaeye.com;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Date;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class TextFileIndexer {

    public static void main(String[] args) throws IOException {
        // Location of the folder whose files will be indexed
        File fileDir = new File("D:/mapreduce-out/lucenetmp/demo1");
        // Location of the index files; indexDir is the folder that will hold the generated index
        File indexDir = new File("D:/mapreduce-out/lucenetmp/demo2");
        Directory docx = FSDirectory.open(indexDir);

        Analyzer luceneAnalyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        IndexWriter.MaxFieldLength mf = new MaxFieldLength(100);
        IndexWriter indexWriter = new IndexWriter(docx, luceneAnalyzer, mf);

        File[] textFiles = fileDir.listFiles();
        long startTime = new Date().getTime();
        for (int i = 0; i < textFiles.length; i++) {
            if (textFiles[i].isFile() && textFiles[i].getName().endsWith(".txt")) {
                System.out.println("File " + textFiles[i].getCanonicalPath() + " is being indexed");
                String temp = fileReaderAll(textFiles[i].getCanonicalPath(), "GBK");
                System.out.println("temp = " + temp);
                Document document = new Document();
                Field fieldPath = new Field("path", textFiles[i].getPath(),
                        Field.Store.YES, Field.Index.NO);
                Field fieldBody = new Field("body", temp, Field.Store.YES,
                        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
                document.add(fieldPath);
                document.add(fieldBody);
                indexWriter.addDocument(document);
            }
        }
        // optimize() merges segments to optimize the index
        indexWriter.optimize();
        indexWriter.close();
        long endTime = new Date().getTime();
        System.out.println("It took " + (endTime - startTime)
                + " milliseconds to add the documents to the index! " + fileDir.getPath());
    }

    public static String fileReaderAll(String fileName, String charset) throws IOException {
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(fileName), charset));
        StringBuilder temp = new StringBuilder(); // avoids quadratic string concatenation
        String line;
        while ((line = reader.readLine()) != null) {
            temp.append(line);
        }
        reader.close();
        return temp.toString();
    }
}
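Before querying, it can help to confirm that TextFileIndexer actually wrote something. The small sketch below (a hypothetical helper, not in the original project) opens the demo2 index read-only with IndexReader and prints the document count:

package lighter.javaeye.com;

import java.io.File;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

// Hypothetical helper class: sanity-checks the index that TextFileIndexer produced.
public class IndexInfo {
    public static void main(String[] args) throws Exception {
        Directory dir = FSDirectory.open(new File("D:/mapreduce-out/lucenetmp/demo2"));
        IndexReader reader = IndexReader.open(dir, true); // open read-only
        System.out.println("documents in index: " + reader.numDocs());
        System.out.println("deleted documents : " + reader.numDeletedDocs());
        reader.close();
    }
}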
TestQuery.java
package lighter.javaeye.com;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class TestQuery {

    public static void main(String[] args) throws IOException {
        TopDocs topDoc = null;
        String queryString = "中华";
        Query query = null;

        Directory directory = FSDirectory.open(new File("D:/mapreduce-out/lucenetmp/demo2"));
        IndexSearcher search = new IndexSearcher(directory);
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        try {
            QueryParser qp = new QueryParser(Version.LUCENE_CURRENT, "body", analyzer);
            query = qp.parse(queryString);
        } catch (ParseException e) {
            e.printStackTrace();
        }
        // Guard against a failed parse leaving query null
        if (search != null && query != null) {
            topDoc = search.search(query, 100);
            if (topDoc.getMaxScore() > 0) {
                System.out.println("topDoc.totalHits " + topDoc.totalHits);
                System.out.println("topDoc.getMaxScore() " + topDoc.getMaxScore());
                System.out.println("topDoc.toString() " + topDoc.toString());
            } else {
                System.out.println("No results found");
            }
        }
        search.close();
    }
}
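TestQuery only prints hit statistics. To see which files actually matched, you can iterate topDoc.scoreDocs and load each hit's stored fields. A sketch of the extra loop, meant to go right after the search.search(query, 100) call above:

// Requires: import org.apache.lucene.document.Document;
//           import org.apache.lucene.search.ScoreDoc;
ScoreDoc[] hits = topDoc.scoreDocs;
for (int i = 0; i < hits.length; i++) {
    Document hitDoc = search.doc(hits[i].doc); // load the stored fields of this hit
    System.out.println(hits[i].score + "\t" + hitDoc.get("path"));
}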