lucene

 

由于时间关系暂时只做到这里,还有很多没有完善的地方,比如对压缩文件的处理、对索引与搜索的优化等,应早日完善。

 package com.aheadsoft.lucene.test;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.Date;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.usermodel.SlideShow;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.wltea.analyzer.lucene.IKAnalyzer;
import org.wltea.analyzer.lucene.IKQueryParser;
import org.wltea.analyzer.lucene.IKSimilarity;

public class LuceneUtil
{
 /**
  * 创建目录
  * @param indexPath 索引文件路径
  * @param dataPath 要创建索引的文件目录
  * @throws IOException
  */
 public static void createIndex(String indexPath,String dataPath) throws IOException
 {
   File dataDir = new File(dataPath);
    if (!dataDir.exists() || !dataDir.isDirectory()) {
         throw new IOException(dataDir
           + " does not exist or is not a directory");
       }
   File indexDir = new File(indexPath);
//   Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); // 创建一个语法分析器  
   Analyzer analyzer = new IKAnalyzer();
         IndexWriter writer = null;  
         // 文件目录  
         Directory directory = null;  
         try {  
             // 索引文件可放的位置:索引可以存放在两个地方1.硬盘,2.内存;  
             // 放在硬盘上可以用FSDirectory(),放在内存的用RAMDirectory()不过一关机就没了  
             directory = FSDirectory.open(indexDir); // 把索引文件存储到磁盘目录  
             // 创建一个IndexWriter(存放索引文件的目录,分析器,Field的最大长度)  
             // 可见构造它需要一个索引文件目录,一个分析器(一般用标准的这个),一个参数是标识是否清空索引目录  
             //create: 此参数为Boolean型,true表示重新创建整个索引, false 表示增量式创建
             //create - true to create the index or overwrite the existing one; false to append to the existing index
             writer = new IndexWriter(directory, analyzer, true,  
                     IndexWriter.MaxFieldLength.UNLIMITED);  
             // 索引合并因子  
             // 一、SetMergeFactor(合并因子)  
             // SetMergeFactor是控制segment合并频率的,其决定了一个索引块中包括多少个文档,当硬盘上的索引块达到多少时,  
             // 将它们合并成一个较大的索引块。当MergeFactor值较大时,生成索引的速度较快。MergeFactor的默认值是10,建议在建立索引前将其设置的大一些。  
             writer.setMergeFactor(1000);  
             // 二、SetMaxBufferedDocs(最大缓存文档数)  
             // SetMaxBufferedDocs是控制写入一个新的segment前内存中保存的document的数目,  
             // 设置较大的数目可以加快建索引速度,默认为10。  
//             writer.setMaxBufferedDocs(100); 
             // 三、SetMaxMergeDocs(最大合并文档数)  
             // SetMaxMergeDocs是控制一个segment中可以保存的最大document数目,值较小有利于追加索引的速度,默认Integer.MAX_VALUE,无需修改。  
             writer.setMaxMergeDocs(1000);  
             //四、 在创建大量数据的索引时,我们会发现索引过程的瓶颈在于大量的磁盘操作,如果内存足够大的话,  
             // 我们应当尽量使用内存,而非硬盘。可以通过SetMaxBufferedDocs来调整,增大Lucene使用内存的次数。  
//             writer.setMaxBufferedDocs(100);
             // 五、SetUseCompoundFile这个方法可以使Lucene在创建索引库时,会合并多个 Segments 文件到一个 .cfs 中。  
             // 此方式有助于减少索引文件数量,对于将来搜索的效率有较大影响。  
             // 压缩存储(True则为复合索引格式)  
             writer.setUseCompoundFile(true);  
             indexDirectory(writer, dataDir);  
             // 对索引进行优化  
             writer.optimize();  
             // 若需要从索引中删除某一个或者某一类文档,IndexReader提供了两种方法:  
             // reader.DeleteDocument(int docNum)  
             // reader.DeleteDocuments(Term term)  
             //  
             // 前者是根据文档的编号来删除该文档,docNum是该文档进入索引时Lucene的编号,是按照顺序编的;后者是删除满足某一个条件的多个文档。  
             //  
             // 在执行了DeleteDocument或者DeleteDocuments方法后,系统会生成一个*.del的文件,该文件中记录了删除的文档,但并未从物理上删除这些文档。
             //此时,这些文档是受保护的,当使用Document  
             // doc = reader.Document(i)来访问这些受保护的文档时,Lucene会报“Attempt to access a  
             // deleted document”异常。如果一次需要删除多个文档时,可以用两种方法来解决:  
             //  
             // 1. 删除一个文档后,用IndexWriter的Optimize方法来优化索引,这样我们就可以继续删除另一个文档。  
             //  
             // 2.  
             // 先扫描整个索引文件,记录下需要删除的文档在索引中的编号。然后,一次性调用DeleteDocument删除这些文档,再调用IndexWriter的Optimize方法来优化索引。  
  
         } catch (IOException e) {  
             e.printStackTrace();  
         } finally {  
             if (writer != null) {  
                 try {  
                     writer.close(); // 关闭IndexWriter时,才把内存中的数据写到文件  
                 } catch (IOException e) {  
                     e.printStackTrace();  
                 }  
             }  
             if (directory != null) {  
                 try {  
                     directory.close(); // 关闭索引存放目录  
                 } catch (IOException e) {  
                     e.printStackTrace();  
                 }  
             }  
         }  
 }
  /** 
     * 递归文件 
     *  
     * @param writer 
     * @param dir 
     * @throws IOException 
     */ 
    private static void indexDirectory(IndexWriter writer, File dataDir)  
            throws IOException {  
 
        File[] files = dataDir.listFiles();  
        for (File src : files) {  
            if (src.isDirectory()) {  
                // 如果是文件继续递归  
                indexDirectory(writer, src); // recurse  
              
            } else  {    
                indexFile(writer, src);  
            }  
        }  
    }  
    /** 
     * 建立索引表 
     *  
     * @param writer 
     * @param src
     * @throws IOException 
     */ 
    private static void indexFile(IndexWriter writer, File src)  
            throws IOException {  
        // 如果文件时隐藏或者文件不存在或则文件不能读,则返回  
        if (src.isHidden() || !src.exists() || !src.canRead()) {  
            return;  
        }  
        // 显示读取的文件内容  
        String text = loadFileToString(src);
       
        Document doc = new Document();  
       
        //NOT_ANALYZED_NO_NORMS  NOT_ANALYZED不分词,NO_NORMS不使用已定义的Analyzer来分词
        //Field.Store.YES 可直接取其内容  Field.Store.NO 查询到相关记录时无法取其内容。
        doc.add(new Field("contents", src.getName()+text, Field.Store.YES,  
                Field.Index.ANALYZED));
        //大段文本内容,会用来检索,但是检索后不需要从index中取内容,可以根据url去load真实的内容   
        doc.add(new Field("filename", src.getName(), Field.Store.YES,  
                Field.Index.ANALYZED));  
        writer.addDocument(doc);  
       
    }  
 
    /** 
     * 将文件读出来转化为字符串 
     *  
     * @param file 
     *            源文件,不能是文件夹 
     * @return 
     */ 
    private static String loadFileToString(File file)
    {
     String text ="";
     if(file.getName().endsWith(".txt"))
     {
      try
   {
       long start = new Date().getTime();
       text= readTxt(file);
     long end = new Date().getTime();  
     
     System.out.println("Indexing  "+file.getName()+" took " 
                   + (end - start) + " milliseconds size is " +file.length());
   
   } catch (Exception e)
   {
    System.out.println("读取TXT文件时出错!");
    e.printStackTrace();
   }
     }else if(file.getName().endsWith(".doc")||file.getName().endsWith(".docx"))
     {
      try
   {
       long start = new Date().getTime();
    text= readDoc(file);
     long end = new Date().getTime();  
     
     System.out.println("Indexing  "+file.getName()+" took " 
                   + (end - start) + " milliseconds size is " +file.length());
   } catch (Exception e)
   {
    System.out.println("读取doc文件时出错!");
    e.printStackTrace();
   }
     }else  if(file.getName().endsWith(".xls")||file.getName().endsWith(".xlsx"))
     {
      try
   {
       long start = new Date().getTime();
       text= readExcel(file);
     long end = new Date().getTime();  
       System.out.println("Indexing  "+file.getName()+" took " 
                   + (end - start) + " milliseconds size is " +file.length());
   
   } catch (Exception e)
   {
    System.out.println("读取excel文件时出错!");
    e.printStackTrace();
   }
     }else  if(file.getName().endsWith(".ppt")||file.getName().endsWith(".pptx"))
     {
      try
   {
       long start = new Date().getTime();
       text= readPpt(file);
     long end = new Date().getTime();  
    
       System.out.println("Indexing  "+file.getName()+" took " 
                   + (end - start) + " milliseconds size is " +file.length());
      
   } catch (Exception e)
   {
    System.out.println("读取ppt文件时出错!");
    e.printStackTrace();
   }
     }else  if(file.getName().endsWith(".pdf"))
     {
      try
   {
       long start = new Date().getTime();
       text= readPdf(file);
     long end = new Date().getTime();  
     
       System.out.println("Indexing  "+file.getName()+" took " 
                   + (end - start) + " milliseconds size is " +file.length());
   
   } catch (Exception e)
   {
    System.out.println("读取pdf文件时出错!");
    e.printStackTrace();
   }
     }else 
     {
      try
   {
       long start = new Date().getTime();
       text= readTxt(file);
     long end = new Date().getTime();  
     
       System.out.println("Indexing  "+file.getName()+" took " 
                   + (end - start) + " milliseconds size is " +file.length());
   
   
   } catch (Exception e)
   {
    System.out.println("读取其它文件格式时出错!");
    e.printStackTrace();
   }
     }
     
     
        
     return text;
    }
    /**
     * 读取Txt文件格式
     * @param xls
     * @return
     * @throws Exception
     */
    private static String readTxt(File txt)throws Exception {
    
      BufferedReader br = null;  
         try {  
             // 字符缓冲流,是个装饰流,提高文件读取速度  
             br = new BufferedReader(new FileReader(txt));  
             StringBuffer sb = new StringBuffer();  
             String line = br.readLine();  
             while (null != line) {  
                 sb.append(line);  
                 line = br.readLine();  
             }  
             return sb.toString();  
         } catch (FileNotFoundException e) {  
             System.out.println("文件不存在!");  
             return null;  
         } catch (IOException e) {  
             e.printStackTrace();  
             return null;  
         } finally {  
             try {  
                 br.close();  
             } catch (IOException e) {  
                 System.out.println("关闭流出现异常");  
                 e.printStackTrace();  
             }  
         } 
    }
    /**
     * 读取XLS文件格式
     * @param xls
     * @return
     * @throws Exception
     */
    private static String readExcel(File xls)throws Exception {
        // 创建输入流读取xls文件
        InputStream in = new FileInputStream(xls);//xls文件存储地址
        HSSFWorkbook workbook = new HSSFWorkbook(in);   //读取一个文件
        ExcelExtractor extractor = new ExcelExtractor(workbook);

        extractor.setFormulasNotResults(true);
        extractor.setIncludeSheetNames(false);

        String text = extractor.getText(); //Retrieves the text contents of the file
        return text;


    }
/**
 * 读取word文档
 * @param doc
 * @return
 * @throws Exception
 */
    private static String readDoc(File doc) throws Exception {
        // 创建输入流读取DOC文件
        FileInputStream in = new FileInputStream(doc);
        String text="";
        if(doc.getName().endsWith(".doc"))
        {
         WordExtractor extractor = null; // 创建WordExtractor
         extractor = new WordExtractor(in);// 对DOC文件进行提取
         text =  extractor.getText();
        }else
        {
         //docx
          OPCPackage opcPackage = POIXMLDocument.openPackage(doc.getAbsolutePath());
             POIXMLTextExtractor ex = new XWPFWordExtractor(opcPackage);  

             text= ex.getText();


        }
       
        return text;

    }

    private static String readPpt(File ppt)throws Exception {
         // 创建输入流读取ppt文件
        FileInputStream is = new FileInputStream(ppt);
        SlideShow ss = new SlideShow(new HSLFSlideShow(is));//is 为文件的InputStream,建立SlideShow
        Slide[] slides = ss.getSlides();//获得每一张幻灯片

        String text = new String();
        for(int i=0;i<slides.length;i++){
            TextRun[] t = slides[i].getTextRuns();//为了取得幻灯片的文字内容,建立TextRun
            for(int j=0;j<t.length;j++){
            //System.out.println(t[j].getText());//这里会将文字内容加到content中去
            text += t[j].getText();
            }
        }
        return text;
     }


    private static String readPdf(File pdf){
         // 创建输入流读取pdf文件
         String result="";
         FileInputStream is = null;
         PDDocument document = null;
          try {
              is = new FileInputStream(pdf);
              PDFParser parser = new PDFParser(is);
              parser.parse();
              document = parser.getPDDocument();
              PDFTextStripper stripper = new PDFTextStripper();
              result = stripper.getText(document);

              }catch (Exception e) {

                e.printStackTrace();
              } finally {
                  if (is != null) {
                    try {
                          is.close();
                     } catch (Exception e) {
                        e.printStackTrace();
                     }
                   }
                if (document != null) {
                   try {
                    document.close();
                  } catch (Exception e) {
                       e.printStackTrace();
                  }
                 }
         }
        return result;
      }


    public static void main(String[] args) throws ParseException
 {
     try
  {
//   createIndex("E://index", "E://index//data");
   search("E://index", "公    司","contents");
  } catch (IOException e)
  {
   e.printStackTrace();
  }
 
 }
   
    public static void  search(String indexDir, String keyword,String field) throws CorruptIndexException, IOException, ParseException
 {
    
     IndexSearcher is = new IndexSearcher(FSDirectory.open(new File(indexDir)),true);//read-only
      //在索引器中使用IKSimilarity相似度评估器,用哪个分词算法好,可以根据实际要求来定,个人认为StandardAnalyzer比IKAnalyzer高效
        is.setSimilarity(new IKSimilarity());
        Query query = IKQueryParser.parse(field, keyword);
//        QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, field, new StandardAnalyzer(Version.LUCENE_CURRENT));
//       
//        Query query = parser.parse(keyword);
//        BooleanQuery q = new BooleanQuery();
//        q.add(query,Occur.MUST);
       
        TopScoreDocCollector collector = TopScoreDocCollector.create(10 , false);
       
        long start = new Date().getTime();// start time
//        TopDocs topDocs = is.search(query , 5);
        is.search(query, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;
       
        System.out.println(hits.length);
        for (int i = 0; i < hits.length; i++)
        {
         Document doc = is.doc(hits[i].doc);//new method is.doc()
//         System.out.println(doc.getField("filename")+"   "+doc.getField("contents")+"  "+hits[i].toString()+"  ");
         System.out.println(doc.getField("filename")+"   " +hits[i].toString()+"  ");
     }
        long end = new Date().getTime();//end time

        System.out.println("Found " + collector.getTotalHits() +
               " document(s) (in " + (end - start) +
               " milliseconds) that matched query '" +
                 keyword + "':");
 }
}

 

本文来自CSDN博客,转载请标明出处:http://blog.csdn.net/cbdhxka/archive/2010/01/07/5151660.aspx

你可能感兴趣的:(Date,exception,String,File,Lucene,文档)