在本篇博客中,我们来构建一个简单而通用的搜索查询接口。这个接口需要实现基本的增、删、改、查功能,并且做到通用、使用简单、可扩展性强。在实际应用Lucene的过程中,最常见的用途主要有两个:一个是文档库的搜索查询(可以扩展为各种搜索引擎),另一个是知识问答库的搜索查询(可以扩展为类似小黄鸡的智能对话机器人)。接口的类图如下:
为了便于大家使用,在此处将全部源码公开。
DAO的基类LuceneDao,提供常用的增删改查方法,并且将“根据资料生成Document”以及“显示查询结果”这两个扩展点抽象出来,在子类中可以根据不同的资料类型进行扩展实现:
package com.hsdl.lucene; import java.io.File; import java.io.IOException; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.index.Term; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.wltea.analyzer.lucene.IKAnalyzer; public abstract class LuceneDao { private Analyzer analyzer = new IKAnalyzer(true); private String indexPath = "D:/work/lucene/tika/index"; public void add(Stuff stuff) throws Exception { createIndex(stuff); } public void batchAdd(List<Stuff> stuffs) throws Exception { createIndexs(stuffs); } /*** * * 删除方法 * * */ public void delete(String fieldName, String fieldVaule) { try { IndexWriter writer = getIndexWrite(); Query q = new TermQuery(new Term(fieldName, fieldVaule)); writer.deleteDocuments(q);// 删除指定条件的Document writer.commit();// 提交 writer.close();// 关闭 System.out.println("删除" + fieldName + "为" + fieldVaule + "的记录成功"); } catch (Exception e) { e.printStackTrace(); } } /** * 批量删除 * * @param fieldMap * @throws Exception */ public void batchDelete(Map<String, String> fieldMap) throws Exception { IndexWriter writer = getIndexWrite(); for (String fieldName : fieldMap.keySet()) 
{ Query q = new TermQuery( new Term(fieldName, fieldMap.get(fieldName))); writer.deleteDocuments(q);// 删除指定条件的Document System.out.println("删除" + fieldName + "为" + fieldMap.get(fieldName) + "的记录成功"); } writer.commit();// 提交 writer.close();// 关闭 } protected abstract Document getDocument(Stuff stuff) throws Exception; /** * * @param fieldName * @param fieldVaule * @param stuff * @throws Exception */ public void update(String fieldName, String fieldVaule, Stuff stuff) throws Exception { try { IndexWriter writer = getIndexWrite(); Document doc = getDocument(stuff); writer.updateDocument(new Term(fieldName, fieldVaule), doc); writer.commit(); writer.close();// 关闭 System.out.println("更新" + fieldName + "为" + fieldVaule + "的记录成功"); } catch (Exception e) { throw e; } } public void setAnalyzer(Analyzer analyzer) { this.analyzer = analyzer; } /** * 设置索引文件的目录 * * @param indexPath */ public void setIndexPath(String indexPath) { this.indexPath = indexPath; } /** * 创建索引 * * @param analyzer * @param indexPath * @param docPath * @throws Exception */ protected void createIndex(Stuff stuff) throws Exception { IndexWriter iwriter = getIndexWrite(); indexDoc(iwriter, stuff); iwriter.commit(); iwriter.close(); } protected void indexDoc(IndexWriter iwriter, Stuff stuff) throws Exception { Document doc = getDocument(stuff); iwriter.addDocument(doc); } /** * 批量创建索引 * * @param analyzer * @param indexPath * @param docPath * @throws Exception */ protected void createIndexs(List<Stuff> stuffs) throws Exception { IndexWriter iwriter = getIndexWrite(); for (Stuff stuff : stuffs) { indexDoc(iwriter, stuff); } iwriter.close(); } /** * 获取IndexWrite实例 * * @param analyzer * @param indexPath * @return * @throws IOException */ protected IndexWriter getIndexWrite() throws IOException { IndexWriter iwriter; Directory directory = FSDirectory.open(new File(indexPath)); // 配置IndexWriterConfig IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_45, analyzer); 
iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND); iwriter = new IndexWriter(directory, iwConfig); return iwriter; } /** * 搜索 * * @param searchField * 搜索域 * @param indexPath * 索引目录 * @param topCount * 返回搜索相似度最高的条数 * @throws CorruptIndexException * @throws IOException * @throws ParseException */ public Document[] search(String searchField, String searchKeyStr, int topCount) throws CorruptIndexException, IOException, ParseException { Directory directory = FSDirectory.open(new File(indexPath)); // 搜索过程********************************** // 实例化搜索器 IndexReader ireader = DirectoryReader.open(directory); IndexSearcher isearcher = new IndexSearcher(ireader); // 使用QueryParser查询分析器构造Query对象 QueryParser qp = new QueryParser(Version.LUCENE_45, searchField, analyzer); qp.setDefaultOperator(QueryParser.AND_OPERATOR); Query query = qp.parse(searchKeyStr); // 搜索相似度最高的topCount条记录 TopDocs topDocs = isearcher.search(query, topCount); // 输出结果 Document[] docs=new Document[topDocs.scoreDocs.length]; for(int i=0;i<docs.length;i++){ docs[i]=isearcher.doc(topDocs.scoreDocs[i].doc); } return docs; } public void displaySearchResult(Document[] docs) { System.out.println("开始显示搜索查询结果....\n返回查询条数:"+docs.length); } /** * 为索引文档添加附加的数据,一般为数据库存储相关记录的主键,便于在搜索后查询该文档其它的信息 * * @param attachData * @param doc */ protected void addAttacheData(Document doc, Map<String, String> attachData) { if (attachData != null) { Set<String> keys = attachData.keySet(); for (String key : keys) { doc.add(new StringField(key, attachData.get(key), Field.Store.YES)); } } } }
文档库资料对象的基类Stuff,我们将资料内容之外的其他数据放入到Map中,作为附加数据。
package com.hsdl.lucene; import java.util.Map; /** * 文档库资料对象的基类 * @author alex * */ public class Stuff { private Map<String,String> attacheData; public Map<String,String> getAttacheData() { return attacheData; } public void setAttacheData(Map<String,String> attacheData) { this.attacheData = attacheData; }; }
文件资料对象FileStuff,在这个类中有文件路径以及代表文件内容的域的名字,在构建索引和搜索时使用:
package com.hsdl.lucene;

/**
 * A library item backed by a file (or directory) on disk.
 *
 * <p>Holds the path of the file and the name of the index field that
 * represents the file's content.
 *
 * @author alex
 */
public class FileStuff extends Stuff {

    /** Path of the file or directory this item refers to. */
    private String filePath;

    /** Name of the index field representing the file content. */
    private String contentFieldName;

    /**
     * @return the path of the file or directory
     */
    public String getFilePath() {
        return this.filePath;
    }

    /**
     * @param filePath the path of the file or directory
     */
    public void setFilePath(String filePath) {
        this.filePath = filePath;
    }

    /**
     * @return the name of the content field
     */
    public String getContentFieldName() {
        return this.contentFieldName;
    }

    /**
     * @param contentFieldName the name of the content field
     */
    public void setContentFieldName(String contentFieldName) {
        this.contentFieldName = contentFieldName;
    }
}
知识问答资料 AskAnswerStuff:
package com.hsdl.lucene;

/**
 * A question-and-answer knowledge item.
 *
 * @author alex
 */
public class AskAnswerStuff extends Stuff {

    /** The question text. */
    private String ask;

    /** The answer text. */
    private String answer;

    /** Name of the index field representing the content. */
    private String contentFieldName;

    /**
     * @return the question text
     */
    public String getAsk() {
        return this.ask;
    }

    /**
     * @param ask the question text
     */
    public void setAsk(String ask) {
        this.ask = ask;
    }

    /**
     * @return the answer text
     */
    public String getAnswer() {
        return this.answer;
    }

    /**
     * @param answer the answer text
     */
    public void setAnswer(String answer) {
        this.answer = answer;
    }

    /**
     * @return the name of the content field
     */
    public String getContentFieldName() {
        return this.contentFieldName;
    }

    /**
     * @param contentFieldName the name of the content field
     */
    public void setContentFieldName(String contentFieldName) {
        this.contentFieldName = contentFieldName;
    }
}
文档库访问之文件对象实现LuceneDaoFileImpl:
package com.hsdl.lucene; import java.io.File; import java.io.IOException; import java.util.Map; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriter; import org.apache.tika.Tika; /** * 文档库访问之文件对象实现 * @author alex * */ public class LuceneDaoFileImpl extends LuceneDao{ private static String contentFieldName = "content"; private static Tika tika = new Tika(); protected void indexDoc(IndexWriter iwriter, Stuff stuff) throws Exception { FileStuff fileStuff=(FileStuff)stuff; File file=new File(fileStuff.getFilePath()); if(file.isDirectory()){ indexDocByFileDir(iwriter,new File(fileStuff.getFilePath()),stuff.getAttacheData()); }else{ super.indexDoc(iwriter,stuff); } } /** * 根据指定存放内容的目录创建索引 * * @param iwriter * @param file * @throws IOException */ private void indexDocByFileDir(IndexWriter iwriter, File file,Map<String,String> attachData) throws IOException { if (file.canRead()){ if (file.isDirectory()) { String[] files = file.list(); if (files != null) for (int i = 0; i < files.length; i++) indexDocByFileDir(iwriter, new File(file, files[i]),attachData); } else { Document doc = getDocument(file,attachData); iwriter.addDocument(doc); } } } protected Document getDocument(File file,Map<String,String> attachData) throws IOException { Document doc = new Document(); addAttacheData(doc,attachData ); // 此处添加文件内容时,需要根据tika获取Reader对象 doc.add(new TextField(contentFieldName, tika.parse(file))); doc.add(new StringField("fileName", file.getName(), Field.Store.YES)); doc.add(new StringField("path", file.getAbsolutePath(), Field.Store.YES)); return doc; } public void displaySearchResult(Document[] docs) { super.displaySearchResult(docs); for (int i = 0; i < docs.length; i++) { System.out.println("内容:" + docs[i].toString()); System.out.println(docs[i].get("fileName") + "[" + docs[i].get("path") + "]"); } } @Override 
protected Document getDocument(Stuff stuff) throws IOException { FileStuff fileStuff=(FileStuff)stuff; File file=new File(fileStuff.getFilePath()); Document doc = new Document(); addAttacheData(doc,stuff.getAttacheData() ); // 此处添加文件内容时,需要根据tika获取Reader对象 doc.add(new TextField(contentFieldName, tika.parse(file))); doc.add(new StringField("fileName", file.getName(), Field.Store.YES)); doc.add(new StringField("path", file.getAbsolutePath(), Field.Store.YES)); return doc; } }
文档库访问之知识问答实现LuceneDaoAskAnswerImpl:
package com.hsdl.lucene; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; /** * 文档库访问之知识问答实现 * @author alex * */ public class LuceneDaoAskAnswerImpl extends LuceneDao{ @Override protected Document getDocument(Stuff stuff) throws Exception { AskAnswerStuff fileStuff=(AskAnswerStuff)stuff; Document doc = new Document(); addAttacheData(doc,stuff.getAttacheData() ); // 此处添加文件内容时,需要根据tika获取Reader对象 doc.add(new TextField("ask",fileStuff.getAsk(),Field.Store.YES)); doc.add(new StringField("answer", fileStuff.getAnswer(), Field.Store.YES)); return doc; } public void displaySearchResult(Document[] docs) { super.displaySearchResult(docs); for (int i = 0; i < docs.length; i++) { System.out.println("内容:" + docs[i].toString()); System.out.println(docs[i].get("ask") + ":[" + docs[i].get("answer") + "]"); } } }
下面我们来编写两个测试类,分别测试文件库的访问以及知识问答库:
LuceneDaoFileTest
package com.hsdl.lucene; import java.util.HashMap; import java.util.Map; import org.apache.lucene.document.Document; /** * 测试文件索引与搜索 * @author alex * */ public class LuceneDaoFileTest { public static void main(String[] args) { LuceneDao luceneDao=new LuceneDaoFileImpl(); luceneDao.setIndexPath("D:/work/lucene/filetest/index"); FileStuff fileStuff=new FileStuff(); fileStuff.setFilePath("D:/work/lucene/filetest/doc/test.txt"); Map<String,String> attacheData=new HashMap<String,String>(); attacheData.put("ID", "001"); fileStuff.setAttacheData(attacheData); fileStuff.setContentFieldName("content"); try { Document[] docs; //添加测试 System.err.println("------------开始添加测试------------"); luceneDao.add(fileStuff); docs=luceneDao.search(fileStuff.getContentFieldName(),"微信收费",10); luceneDao.displaySearchResult(docs); docs=luceneDao.search(fileStuff.getContentFieldName(),"网站收费",10); luceneDao.displaySearchResult(docs); //删除测试 System.err.println("------------开始删除测试------------"); luceneDao.delete("ID", "001"); docs=luceneDao.search(fileStuff.getContentFieldName(),"微信收费",10); luceneDao.displaySearchResult(docs); //更新测试 fileStuff.setFilePath("D:/work/lucene/filetest/doc/test.xls"); luceneDao.update("ID", "001",fileStuff); System.err.println("------------开始更新测试------------"); docs=luceneDao.search(fileStuff.getContentFieldName(),"微信收费",10); luceneDao.displaySearchResult(docs); docs=luceneDao.search(fileStuff.getContentFieldName(),"网站费用",10); luceneDao.displaySearchResult(docs); } catch (Exception e) { e.printStackTrace(); } } }
LuceneDaoAskAnswerTest
package com.hsdl.lucene; import java.util.HashMap; import java.util.Map; import org.apache.lucene.document.Document; /** * 测试问答索引与搜索 * @author alex * */ public class LuceneDaoAskAnswerTest { public static void main(String[] args){ //测试问答知识的索引与搜索 LuceneDao luceneDao=new LuceneDaoAskAnswerImpl(); luceneDao.setIndexPath("D:/work/lucene/askanswer/index"); AskAnswerStuff askAnswerStuff=new AskAnswerStuff(); askAnswerStuff.setAsk("微信营销怎么收费?"); askAnswerStuff.setAnswer("3000元每年,10年25000"); Map<String,String> attacheData=new HashMap<String,String>(); attacheData.put("ID", "001"); askAnswerStuff.setAttacheData(attacheData); try { Document[] docs; //添加测试 System.err.println("------------开始添加测试------------"); luceneDao.add(askAnswerStuff); docs=luceneDao.search("ask","微信收费",10); luceneDao.displaySearchResult(docs); docs=luceneDao.search("ask","网站收费",10); luceneDao.displaySearchResult(docs); //删除测试 System.err.println("------------开始删除测试------------"); luceneDao.delete("ID", "001"); docs=luceneDao.search("ask","微信收费",10); luceneDao.displaySearchResult(docs); //更新测试 askAnswerStuff.setAsk("网站建设怎么收费?"); askAnswerStuff.setAnswer("普通企业网站6000,商城网站10000,其他网站价格面议!"); luceneDao.update("ID", "001",askAnswerStuff); System.err.println("------------开始更新测试------------"); docs=luceneDao.search("ask","微信收费",10); luceneDao.displaySearchResult(docs); docs=luceneDao.search("ask","网站收费",10); luceneDao.displaySearchResult(docs); } catch (Exception e) { e.printStackTrace(); } } }