Building and querying a file index with Lucene + Tika works fine when tested on Windows: it can parse every file type Tika supports, and, as the source code shows, zip archives are handled as well (each entry is parsed and indexed).
On Linux, however, it fails to parse Office documents newer than 2008 (the newer OOXML formats). I have not found the root cause yet; I would appreciate it if someone more experienced could take a look at this problem.
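One way to narrow this down is to bypass Lucene entirely and run Tika by itself on one of the failing files on the Linux machine. The sketch below is only a diagnostic idea (the file path is a placeholder): it prints the media type Tika detects and the length of the text it extracts. If the extracted text is empty on Linux while the same file works on Windows, the runtime classpath on the server (for example a missing tika-parsers jar or its POI dependencies, which handle the OOXML formats) is a plausible suspect.

import java.io.File;
import org.apache.tika.Tika;

public class TikaCheck {
    public static void main(String[] args) throws Exception {
        // Placeholder path: point this at one of the failing files on Linux
        File file = new File("/tmp/test.docx");
        Tika tika = new Tika();
        // A .docx should be detected as
        // application/vnd.openxmlformats-officedocument.wordprocessingml.document
        System.out.println("detected type: " + tika.detect(file));
        // Zero length means no parser actually handled the file
        System.out.println("extracted chars: " + tika.parseToString(file).length());
    }
}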
The relevant jar packages are attached; the source code follows:
package com.leagsoft.tika.repos;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * @author Heweipo
 */
public class ReposUtil {

    private static IndexWriter indexWriter;
    private static IndexSearcher indexSearcher;
    private static Analyzer analyzer = new IKAnalyzer();
    private static Object lock = new Object();

    public final static String ARTICLE_CONTENT = "fileContent";
    public final static String ARTICLE_NAME = "fileName";
    public final static String ARTICLE_PATH = "filePath";
    public final static String ARTICLE_FRAGMENT = "fragment";

    // For testing; set ARTICLE_FRAGMENT_USE = false if highlighting is not needed
    public static boolean ARTICLE_FRAGMENT_USE = true;

    private static Logger log = Logger.getLogger(ReposUtil.class);

    /**
     * Build the index (recursive worker).
     */
    @SuppressWarnings("unchecked")
    private static boolean writeRepos(IndexWriter indexWriter, File file) {
        if (file == null || !file.exists() || !file.canRead()) return false;
        if (file.isDirectory()) {
            // directory: recurse into its children
            File[] files = file.listFiles();
            for (File f : files) {
                writeRepos(indexWriter, f);
            }
        } else {
            // regular file
            Document doc = null;
            ZipFile zf = null;
            try {
                doc = new Document();
                Tika tika = new Tika();
                Metadata metadata = new Metadata();
                metadata.add(Metadata.CONTENT_ENCODING, "utf-8");
                if (file.getName().endsWith(".zip")) {
                    // zip archive: parse every entry into the content field
                    zf = new ZipFile(file);
                    List<ZipEntry> list = (List<ZipEntry>) Collections.list(zf.entries());
                    for (ZipEntry entry : list) {
                        if (entry.isDirectory()) continue; // skip directory entries
                        doc.add(new Field(ARTICLE_CONTENT,
                                tika.parse(zf.getInputStream(entry), metadata)));
                    }
                } else {
                    doc.add(new Field(ARTICLE_CONTENT,
                            tika.parse(new FileInputStream(file), metadata)));
                }
                doc.add(new Field(ARTICLE_NAME, file.getName(), Store.YES, Index.ANALYZED));
                doc.add(new Field(ARTICLE_PATH, file.getAbsolutePath(), Store.YES, Index.NO));
                if (ARTICLE_FRAGMENT_USE) {
                    // store the extracted text so the highlighter can build fragments later
                    doc.add(new Field(ARTICLE_FRAGMENT,
                            new Tika().parseToString(new FileInputStream(file), metadata),
                            Store.YES, Index.NO));
                }
                indexWriter.addDocument(doc);
                log.info("Indexing file: " + file.getName() + " " + file.getAbsolutePath());
            } catch (Exception e) {
                log.info("Index creation failed: " + e.getMessage());
                e.printStackTrace();
                return false;
            } finally {
                if (zf != null) {
                    try {
                        zf.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                        log.error(e);
                    }
                }
            }
        }
        return true;
    }

    /**
     * Build the index.
     */
    public static boolean writeRepos(File file) {
        IndexWriter indexWriter = getIndexWriter();
        if (!writeRepos(indexWriter, file)) return false;
        try {
            // commit here so the whole batch becomes visible atomically
            getIndexWriter().commit();
            log.info("File index created successfully: " + file.getName() + " " + file.getAbsolutePath());
        } catch (Exception e) {
            log.info("Commit after index creation failed: " + e.getMessage());
            e.printStackTrace();
            return false;
        }
        return true;
    }

    /**
     * Query the index.
     */
    public static List<Map<String, String>> readRepos(String[] fields, String value, int numb) {
        if (fields == null || fields.length == 0) {
            fields = new String[]{ReposUtil.ARTICLE_CONTENT, ReposUtil.ARTICLE_NAME};
        }
        List<Map<String, String>> list = new ArrayList<Map<String, String>>();
        try {
            Query query = new MultiFieldQueryParser(Version.LUCENE_30, fields, analyzer).parse(value);
            TopDocs matchs = getIndexSearcher().search(query, numb);
            numb = Math.min(numb, matchs.totalHits);
            ScoreDoc[] scoreDocs = matchs.scoreDocs;
            // configure the highlighter style
            Highlighter lighter = null;
            if (ARTICLE_FRAGMENT_USE) {
                Formatter format = new SimpleHTMLFormatter("<font color='red'>", "</font>");
                Scorer scorer = new QueryScorer(query);
                Fragmenter fragmenter = new SimpleFragmenter(100);
                lighter = new Highlighter(format, scorer);
                lighter.setTextFragmenter(fragmenter);
            }
            for (int i = 0; i < numb; i++) {
                Document document = getIndexSearcher().doc(scoreDocs[i].doc);
                list.add(document2map(document, lighter));
            }
        } catch (Exception e) {
            log.info("Exception while searching the index: " + e.getMessage());
            e.printStackTrace();
            return null;
        }
        return list;
    }

    /**
     * Convert a Document into a Map.
     */
    private static Map<String, String> document2map(Document doc, Highlighter lighter) {
        Map<String, String> map = new HashMap<String, String>();
        if (doc == null) return map;
        map.put(ARTICLE_NAME, doc.get(ARTICLE_NAME));
        map.put(ARTICLE_PATH, doc.get(ARTICLE_PATH));
        map.put(ARTICLE_CONTENT, doc.get(ARTICLE_CONTENT));
        if (ARTICLE_FRAGMENT_USE) {
            try {
                String fragment = lighter.getBestFragment(analyzer, ARTICLE_FRAGMENT, doc.get(ARTICLE_FRAGMENT));
                if (fragment != null) {
                    fragment = fragment.trim().replaceAll("\t", "")
                            .replaceAll("\n", "").replaceAll("\r", "");
                    map.put(ARTICLE_FRAGMENT, fragment);
                }
            } catch (IOException e) {
                log.info("Exception converting Document to Map: " + e.getMessage());
                e.printStackTrace();
            } catch (InvalidTokenOffsetsException e) {
                log.info("Exception converting Document to Map: " + e.getMessage());
                e.printStackTrace();
            }
        }
        return map;
    }

    /**
     * Get the shared IndexSearcher.
     */
    public static IndexSearcher getIndexSearcher() {
        // lazy singleton
        if (indexSearcher == null) {
            synchronized (lock) {
                if (indexSearcher == null) { // re-check under the lock
                    try {
                        Directory directory = FSDirectory.open(new File("./Repos"));
                        indexSearcher = new IndexSearcher(directory);
                        return indexSearcher;
                    } catch (IOException e) {
                        log.info("Exception getting IndexSearcher: " + e.getMessage());
                        e.printStackTrace();
                    }
                }
            }
        }
        if (indexSearcher == null) return null; // initialization failed above
        // reopen the searcher if the index has changed since it was opened
        try {
            IndexReader indexReader = indexSearcher.getIndexReader();
            if (!indexReader.isCurrent()) {
                indexSearcher = new IndexSearcher(indexReader.reopen(true));
                return indexSearcher;
            }
        } catch (CorruptIndexException e) {
            e.printStackTrace();
            log.info("Exception getting IndexSearcher: " + e.getMessage());
        } catch (IOException e) {
            e.printStackTrace();
            log.info("Exception getting IndexSearcher: " + e.getMessage());
        }
        return indexSearcher;
    }

    /**
     * Get the shared IndexWriter.
     */
    public static IndexWriter getIndexWriter() {
        if (indexWriter == null) {
            synchronized (lock) {
                if (indexWriter == null) { // re-check under the lock
                    File file = new File("./Repos");
                    if (!file.exists()) file.mkdir();
                    try {
                        FSDirectory directory = FSDirectory.open(file);
                        indexWriter = new IndexWriter(directory, analyzer, MaxFieldLength.UNLIMITED);
                    } catch (IOException e) {
                        e.printStackTrace();
                        log.info("Exception getting IndexWriter: " + e.getMessage());
                    }
                }
            }
        }
        return indexWriter;
    }

    /**
     * Shared analyzer.
     */
    public static Analyzer getAnalyzer() {
        return analyzer;
    }
}
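For reference, here is a minimal, hypothetical driver showing how the class above is meant to be called. The directory path is a placeholder; the index itself is created under ./Repos relative to the working directory:

import java.io.File;
import java.util.List;
import java.util.Map;

public class ReposDemo {
    public static void main(String[] args) {
        // Placeholder path: the directory (or single file) to index
        ReposUtil.writeRepos(new File("/data/docs"));

        // Passing null for fields searches the defaults (fileContent and fileName);
        // at most 10 hits are returned
        List<Map<String, String>> hits = ReposUtil.readRepos(null, "lucene", 10);
        if (hits == null) return; // readRepos returns null on error
        for (Map<String, String> hit : hits) {
            System.out.println(hit.get(ReposUtil.ARTICLE_NAME)
                    + " -> " + hit.get(ReposUtil.ARTICLE_PATH));
            // Highlighted snippet, only present when ARTICLE_FRAGMENT_USE is true
            System.out.println(hit.get(ReposUtil.ARTICLE_FRAGMENT));
        }
    }
}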