先准备下工具类
package com.cs.lucene.utils; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.InputStreamReader; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.NumberTools; import org.apache.lucene.document.Field.Index; import org.apache.lucene.document.Field.Store; public class File2DocumentUtiles { /** *文件到document的转换 * @param filepath * @return */ public static Document file2Document(String filepath) { File file = new File(filepath) ; Document doc = new Document(); doc.add(new Field("name",file.getName(),Store.YES,Index.ANALYZED)) ; //索引并分词 doc.add(new Field("content",readFileContent(file),Store.YES,Index.ANALYZED)) ; //索引并分词 doc.add(new Field("size",NumberTools.longToString(file.length()),Store.YES,Index.NOT_ANALYZED)) ; //索引不分词 doc.add(new Field("path",file.getPath(),Store.YES,Index.NO)) ; //不索引 return doc; } /** * 根据文件读取文件内容 * @param file * @return */ private static String readFileContent(File file) { try { BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file))); StringBuffer content = new StringBuffer(); for(String line=null; (line = reader.readLine())!=null ;){ content.append(line).append("\n") ; } return content.toString() ; } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } return null; } public static void printDocInfo(Document doc){ System.out.println("--------------------------"); System.out.println("name =" + doc.get("name")); System.out.println("content =" + doc.get("content")); System.out.println("size =" + NumberTools.stringToLong(doc.get("size"))); System.out.println("path =" + doc.get("path")); } }
先了解下分词器
package com.cs.lucene.analyzer;

import java.io.StringReader;

import jeasy.analysis.MMAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.junit.Test;

/**
 * Demonstrates how an {@link Analyzer} tokenizes mixed Chinese/ASCII text.
 */
public class AnalyzerTest {

    String text = "资源来自互联网吴朝辉wwwa的a-b放到";

    Analyzer analyzer = new MMAnalyzer(); // JE analyzer: good Chinese segmentation

    @Test
    public void testAnalyze() throws Exception {
        analyze(analyzer, text);
    }

    /**
     * Tokenizes {@code text} with {@code analyzer} and prints each token.
     * Fix: the original method ignored both of its parameters and always
     * used the instance fields, which made it misleading to reuse with
     * other analyzers or inputs.
     *
     * @param analyzer the analyzer to run
     * @param text     the text to tokenize
     */
    private void analyze(Analyzer analyzer, String text) throws Exception {
        System.out.println("----------分词器-------------------");
        TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(text));
        // Reuse one Token instance; next(Token) returns null at end of stream.
        for (Token token = new Token(); (token = tokenStream.next(token)) != null; ) {
            System.out.println(token);
        }
    }
}
现在看看FSDirectory和RAMDirectory
package com.cs.lucene.directory; import jeasy.analysis.MMAnalyzer; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriter.MaxFieldLength; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.RAMDirectory; import org.junit.Test; import com.cs.lucene.utils.File2DocumentUtiles; public class DirectoryTest { //创建索引用的文件路径 String filePath = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceDataSource\\资源来自互联网,版权归原创作者或原单位公司所有.txt"; //存放索引的目录 String indexPath = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceIndex" ; //分词器 Analyzer analyzer = new MMAnalyzer(); //je分词器 /** * 利用FSDirectory 创建索引 * FSDirectory:在文件系统上存放 * @throws Exception */ @Test public void testFSDirectory() throws Exception{ //测试文件系统目录 Directory dir = FSDirectory.getDirectory(indexPath) ; Document doc = File2DocumentUtiles.file2Document(filePath); //参数true表示是否删除原来的索引后再重新创建,MaxFieldLength.LIMITED:表示只对前10000个字做索引 IndexWriter indexWriter = new IndexWriter(dir,analyzer,MaxFieldLength.LIMITED) ; //没有参数true,添加索引 indexWriter.addDocument(doc) ; indexWriter.close() ; } /** * 利用RAMDirectory 创建索引 * RAMDirectory:在内存中存放 * 优点:读取快 * 缺点:重新开机,索引没了 * @throws Exception */ @Test public void testRAMDirectory() throws Exception{ //测试文件系统目录 Directory dir = new RAMDirectory() ; Document doc = File2DocumentUtiles.file2Document(filePath); //参数true表示是否删除原来的索引后再重新创建,MaxFieldLength.LIMITED:表示只对前10000个字做索引 IndexWriter indexWriter = new IndexWriter(dir,analyzer,MaxFieldLength.LIMITED) ; //没有参数true,添加索引 indexWriter.addDocument(doc) ; indexWriter.close() ; } /** * 实际应用中,FSDirectory和RAMDirectory联合起来用 * 操控内存的索引要快,所以在运行时操作RAMDirectory, * 但退出时必须保存到到文件系统上,所以退出时操控FSDirectory * @throws Exception */ @Test public void testRAMDirectoryAndFSDirectory() throws Exception{ //整个过程:从文件系统上读取所以到内存,运行时添加索引,此时的全部索引都在内存中, //退出时再把全部保存到文件系统上 Directory fsDir = 
FSDirectory.getDirectory(indexPath) ; //1.启动时读取 Directory ramDir = new RAMDirectory(fsDir) ; //运行时操作ramDir IndexWriter ramIndexWriter = new IndexWriter(ramDir,analyzer,MaxFieldLength.LIMITED); //添加document Document doc = File2DocumentUtiles.file2Document(filePath) ; ramIndexWriter.addDocument(doc) ; ramIndexWriter.close() ;//一定要关闭再合并,因为有缓存 //2.退出时保存 //参数true表示把以前的索引删掉,全部重写 (默认为false) IndexWriter fsIndexWriter = new IndexWriter(fsDir,analyzer,true,MaxFieldLength.LIMITED); //new Directory[]{ramDir}:要合并的目录 //addIndexesNoOptimize:表示不做优化,做优化检索时相对要慢,但占用的存储空间小 fsIndexWriter.addIndexesNoOptimize(new Directory[]{ramDir}) ; fsIndexWriter.flush() ; //优化之前一定要先刷新缓存 fsIndexWriter.optimize() ; //优化一定要在关闭之前做,优化可以提高检索的速度 fsIndexWriter.close() ; } @Test public void testOptimize() throws Exception{ Directory fsDir = FSDirectory.getDirectory(indexPath) ; IndexWriter fsIndexWriter = new IndexWriter(fsDir,analyzer,MaxFieldLength.LIMITED); fsIndexWriter.optimize() ; fsIndexWriter.close() ; } }
现在来测测索引如何建立以及搜索
package com.cs.lucene.lucene;

import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import jeasy.analysis.MMAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

import com.cs.lucene.utils.File2DocumentUtiles;

/**
 * DAO around a Lucene index: builds/updates the index from files on disk
 * and runs paged searches with result highlighting.
 */
public class IndexDao {

    // Directory holding the index.
    private String indexPath;

    // Analyzer used for both indexing and query parsing.
    private Analyzer analyzer = null;

    public IndexDao() {
        this.indexPath = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceIndex";
        this.analyzer = new MMAnalyzer(); // JE analyzer, good Chinese support
    }

    public IndexDao(Analyzer analyzer, String indexPath) {
        this.analyzer = analyzer;
        this.indexPath = indexPath;
    }

    /**
     * Parses a user query string against the "name" and "content" fields
     * and delegates to {@link #search(Query, int, int)}.
     *
     * @param queryString the raw query text
     * @param firstResult index of the first result to return (paging)
     * @param maxResults  maximum number of results per page
     */
    public QueryResult search(String queryString, int firstResult, int maxResults)
            throws Exception {
        // 1. Parse the query string into a Query over both fields.
        String[] fields = { "name", "content" };
        // Boosts: a hit in the title should score higher than one in the body.
        // Fix: the extracted source had the generic parameters stripped;
        // restored Map<String, Float>.
        Map<String, Float> boosts = new HashMap<String, Float>();
        boosts.put("name", 3.0f);
        boosts.put("content", 1.0f); // default boost
        QueryParser queryParser = new MultiFieldQueryParser(fields, analyzer, boosts);
        Query query = queryParser.parse(queryString);
        return search(query, firstResult, maxResults);
    }

    /**
     * Runs the query, sorted by file size descending, and returns the
     * requested page of documents with the "content" field replaced by a
     * highlighted fragment.
     */
    public QueryResult search(Query query, int firstResult, int maxResults)
            throws Exception {
        IndexSearcher indexSearcher = null;
        try {
            // 2. Execute the query.
            indexSearcher = new IndexSearcher(indexPath);
            Filter filter = null; // optional post-filter; slow, left disabled
            // filter = new RangeFilter("size", NumberTools.longToString(200),
            //         NumberTools.longToString(500), true, true);

            Sort sort = new Sort();
            // Default is ascending; 'true' switches to descending order.
            sort.setSort(new SortField[] { new SortField("size", true) });
            TopDocs topDocs = indexSearcher.search(query, filter, 10000, sort);
            int recordCount = topDocs.totalHits;

            // Highlighter setup. NOTE(review): the published source had its
            // HTML markers stripped (SimpleHTMLFormatter("", "")); restored
            // the conventional red <font> tags.
            Formatter formatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");
            Scorer scorer = new QueryScorer(query);
            Highlighter highlighter = new Highlighter(formatter, scorer);
            // Show at most 500 chars around the best-matching part.
            Fragmenter fragmenter = new SimpleFragmenter(500);
            highlighter.setTextFragmenter(fragmenter);

            // 3. Collect the requested page of results.
            List<Document> recordList = new ArrayList<Document>();
            int end = Math.min(firstResult + maxResults, recordCount);
            for (int i = firstResult; i < end; i++) {
                ScoreDoc scoreDoc = topDocs.scoreDocs[i];
                int docSn = scoreDoc.doc; // internal document number
                Document doc = indexSearcher.doc(docSn);
                // Highlight the content; getBestFragment returns null when
                // none of the query terms occur in the field.
                String hc = highlighter.getBestFragment(analyzer, "content",
                        doc.get("content"));
                if (hc == null) {
                    // Fall back to the first 200 characters of the content.
                    String content = doc.get("content");
                    int endIndex = Math.min(200, content.length());
                    hc = content.substring(0, endIndex);
                }
                doc.getField("content").setValue(hc);
                recordList.add(doc);
            }

            // 4. Return total count plus the current page.
            return new QueryResult(recordCount, recordList);
        } finally {
            // Fix: the original never closed the searcher, leaking file handles.
            if (indexSearcher != null) {
                indexSearcher.close();
            }
        }
    }

    /**
     * Indexes a single file and commits/optimizes the index.
     *
     * @param filePath path of the file to index
     */
    public void save(String filePath) throws Exception {
        Document doc = File2DocumentUtiles.file2Document(filePath);
        // A per-document boost could be set here, but it is not recommended:
        // doc.setBoost(1.0f); // default value
        // 'false' = append to the existing index rather than recreating it;
        // MaxFieldLength.LIMITED indexes only the first 10000 terms per field.
        IndexWriter indexWriter = new IndexWriter(indexPath, analyzer, false,
                MaxFieldLength.LIMITED);
        indexWriter.addDocument(doc);
        indexWriter.commit();
        indexWriter.optimize();
        indexWriter.close();
    }

    /** Convenience overload delegating to {@link #save(String)}. */
    public void save(File file) throws Exception {
        save(file.getAbsolutePath());
    }

    /**
     * Indexes a file, or recursively indexes every file under a directory.
     */
    public void saveDirectory(File file) throws Exception {
        if (file.isFile()) {
            // Plain file: index it and stop.
            save(file.getAbsolutePath());
            return;
        }
        File[] childs = file.listFiles();
        for (int i = 0; i < childs.length; i++) {
            File f = childs[i];
            if (f.isDirectory()) {
                // Recurse into subdirectories.
                saveDirectory(f);
            } else {
                save(f.getAbsolutePath());
            }
        }
    }

    /**
     * Recursion demo: prints the directory tree with "--" indentation,
     * one pair of dashes per nesting level. Does not touch the index.
     */
    public void save(File file, int pointer) throws Exception {
        StringBuilder str = new StringBuilder();
        for (int i = 0; i < pointer; i++) {
            str.append("--");
        }
        if (file.isFile()) {
            System.out.println(str + file.getName());
            return;
        }
        File[] childs = file.listFiles();
        for (int i = 0; i < childs.length; i++) {
            File f = childs[i];
            if (f.isDirectory()) {
                // Directory: print it, then recurse one level deeper.
                System.out.println(str + f.getName());
                save(f, pointer + 1);
            } else {
                System.out.println(str + f.getName());
            }
        }
    }
}
package com.cs.lucene.lucene;

import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.document.Document;

/**
 * Value object holding the outcome of a search: the total number of hits
 * and the documents belonging to the requested page.
 */
public class QueryResult {

    // Total number of matching documents in the index.
    private int recordCount = 0;

    // Documents of the current page. Fix: the extracted source had the
    // generic parameters stripped ("ListrecordResults = new ArrayList ()");
    // restored List<Document>.
    private List<Document> recordResults = new ArrayList<Document>();

    public QueryResult(int recordCount, List<Document> recordResults) {
        this.recordCount = recordCount;
        this.recordResults = recordResults;
    }

    public int getRecordCount() {
        return recordCount;
    }

    public void setRecordCount(int recordCount) {
        this.recordCount = recordCount;
    }

    public List<Document> getRecordResults() {
        return recordResults;
    }

    public void setRecordResults(List<Document> recordResults) {
        this.recordResults = recordResults;
    }
}
测试索引
package com.cs.lucene.lucene; import java.io.File; import jeasy.analysis.MMAnalyzer; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; import org.junit.Test; import com.cs.lucene.utils.File2DocumentUtiles; public class IndexDaoTest { private IndexDao indexDao = new IndexDao() ; /* *搜索索引库,并返回结果 */ @Test public void testSearch() throws Exception{ String queryString = "www*" ; QueryResult queryResults = indexDao.search(queryString ,0, 10) ; //测试结果 System.out.println("总共有【"+queryResults.getRecordCount()+"】条匹配结果"); for(int i =0 ; i
最后我们来看看lucene的查询功能
package com.cs.lucene.query; import java.util.Date; import org.apache.lucene.document.DateTools; import org.apache.lucene.document.Document; import org.apache.lucene.document.NumberTools; import org.apache.lucene.document.DateTools.Resolution; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.RangeQuery; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.search.BooleanClause.Occur; import org.junit.Test; import com.cs.lucene.lucene.IndexDao; import com.cs.lucene.lucene.QueryResult; import com.cs.lucene.utils.File2DocumentUtiles; public class QueryTest { IndexDao indexDao = new IndexDao() ; /* * 关键词查询 */ @Test public void testTermQuery() throws Exception{ Term term = new Term("name","资源"); Query query = new TermQuery(term); //查询打印结果 QueryAndPrintResult(query) ; } /* * 范围索引 * 数字在query中都是字符串,所以要借助NumberTools工具类做转换 */ @Test public void testRangeQuery() throws Exception{ Term lowerTerm = new Term("size",NumberTools.longToString(200)); Term upperTerm = new Term("size",NumberTools.longToString(500)); //true表示是否包含边界 Query query = new RangeQuery(lowerTerm,upperTerm,true) ; /* Term lowerTerm2 = new Term("size","200"); Term upperTerm2 = new Term("size","500"); Query query = new RangeQuery(lowerTerm2,upperTerm2,true) ; //true表示是否包含边界 */ //查询打印结果 QueryAndPrintResult(query) ; } /* * 测试NumberTools和DateTools */ @Test public void testNumberToolsAndDateTools() throws Exception{ System.out.println("数字测试:"); System.out.println(NumberTools.longToString(200)); System.out.println(NumberTools.longToString(500)); System.out.println(NumberTools.stringToLong("000000000000dw")); System.out.println("日期测试:"); System.out.println(DateTools.dateToString(new Date(), Resolution.SECOND)); System.out.println(DateTools.dateToString(new Date(), Resolution.DAY)); 
System.out.println(DateTools.stringToDate("20101005080855")); } /* * 通配符查询 * ?:代表一个字符,*:代表0个或多个字符 */ @Test public void testWildcardQuery() throws Exception{ Term term = new Term("name","*me"); Query query = new WildcardQuery(term) ; //查询打印结果 QueryAndPrintResult(query) ; } /* * 短语查询:查询包含多个短语的query */ @Test public void testPhraseQuery() throws Exception{ PhraseQuery phraseQuery = new PhraseQuery() ; phraseQuery.add(new Term("name","资源")) ; phraseQuery.add(new Term("name","作者")) ; //setSlop:用来设置两个短语之间的最多可以隔多少个字符 phraseQuery.setSlop(20); //查询打印结果 QueryAndPrintResult(phraseQuery) ; } /** * 布尔查询:非常重要 * 三种关系: * 1.MUST和MUST:取得两个查询子句的交集。 * 2.MUST和MUST_NOT:包含MUST但并且查询结果中不包含MUST_NOT的检索结果。 * 3.SHOULT和SHOULT:表示"或"关系,最终检索结果为所有检索子句的并集。 * 注意:有些组合是没有意义的 * @throws Exception */ @Test public void testBooleanQuery() throws Exception{ //条件1 PhraseQuery phraseQuery = new PhraseQuery() ; phraseQuery.add(new Term("name","资源")) ; phraseQuery.add(new Term("name","作者")) ; phraseQuery.setSlop(20); //条件2 Term lowerTerm2 = new Term("size","200"); Term upperTerm2 = new Term("size","500"); Query rangeQuery = new RangeQuery(lowerTerm2,upperTerm2,true) ; //true表示是否包含边界 //合并两个查询 BooleanQuery booleanQuery = new BooleanQuery() ; booleanQuery.add(phraseQuery, Occur.MUST) ; booleanQuery.add(rangeQuery,Occur.MUST) ; //查询打印结果 QueryAndPrintResult(booleanQuery) ; } private void QueryAndPrintResult(Query query) throws Exception{ System.out.println("相对应的查询字符串:"+query); QueryResult qr = indexDao.search(query, 0, 100) ; System.out.println("总共有【"+qr.getRecordCount()+"】条匹配结果"); //打印结果 for(int i =0 ; i
通过以上学习,您应该对 Lucene 的基本开发没什么问题了。恭喜您,您又向前迈进了一步!