tika

测试代码:
package com.tika.test;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
import org.apache.tika.sax.BodyContentHandler;
import org.wltea.analyzer.lucene.IKAnalyzer;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * Demo utilities that extract text from documents with Apache Tika and
 * index / search that text with Lucene using the IK analyzer.
 *
 * <p>All methods are static. Errors in {@link #fileToTxt(File)} and
 * {@link #index()} are reported with {@code printStackTrace()} rather than
 * propagated, matching the best-effort style of the original demo.
 */
public class IndexUtil {

    /** File-system location of the Lucene index used by {@link #index()} and {@link #searche(String)}. */
    private static final String INDEX_DIR = "d:/lucene";

    /** Name of the analyzed full-text field. */
    private static final String CONTENT_FIELD = "content";

    /** Name of the stored identifier field returned by {@link #searche(String)}. */
    private static final String ID_FIELD = "ids";

    /**
     * Extracts the text of a document and prints it to standard output.
     *
     * @param args optional; {@code args[0]} is the path of the file to extract.
     *             When absent, the demo's original hard-coded path is used.
     */
    public static void main(String[] args) throws ParseException, IOException, TikaException
    {
        // Other entry points of this demo, kept as usage hints:
        //   index();                                          // build the index
        //   System.out.println("ids=" + searche("谷歌"));      // query the index
        //   tikaTool(f);                                      // extraction through the Tika facade
        String path = (args != null && args.length > 0) ? args[0] : "C:/高军威.xls";
        File f = new File(path);
        System.out.println(fileToTxt(f));
    }

    /**
     * Extracts text from {@code f} through the {@link Tika} facade and prints
     * every metadata name/value pair to standard output.
     *
     * <p>NOTE(review): the Tika facade documents that the returned string is
     * capped at its configured maximum string length - confirm the limit for
     * the Tika version in use if large documents matter.
     *
     * @param f the file to parse
     * @return the extracted text
     * @throws IOException   if the file cannot be opened or read
     * @throws TikaException if Tika fails to parse the document
     */
    public static String tikaTool(File f) throws IOException, TikaException {
        Tika tika = new Tika();
        Metadata metadata = new Metadata();
        InputStream in = new FileInputStream(f);
        String text;
        try {
            text = tika.parseToString(in, metadata);
        } finally {
            // Closing again is harmless if Tika already closed the stream;
            // doing it here guarantees the descriptor is released either way.
            closeAndLog(in);
        }
        for (String name : metadata.names()) {
            System.out.println(name + ":" + metadata.get(name));
        }
        return text;
    }

    /**
     * Extracts the body text of {@code f} with an auto-detecting Tika parser
     * and prints every metadata name/value pair to standard output.
     *
     * <p>A specific parser (for example {@code OOXMLParser} for Office 2007+
     * files or {@code OfficeParser} for older Office formats) could be used
     * instead of {@link AutoDetectParser}.
     *
     * @param f the file to parse
     * @return the extracted text, or {@code null} if reading or parsing failed
     *         (the failure is printed via {@code printStackTrace()})
     */
    public static String fileToTxt(File f) {
        Parser parser = new AutoDetectParser(); // detects the concrete parser from the content
        InputStream is = null;
        try {
            Metadata metadata = new Metadata();
            // Encoding hint, intended to prevent garbled text for HTML input.
            metadata.add(Metadata.CONTENT_ENCODING, "utf-8");
            metadata.set(Metadata.RESOURCE_NAME_KEY, f.getName());
            is = new FileInputStream(f);
            // -1 disables the handler's write limit; with the default limit a
            // large document aborts parsing with a SAXException and this
            // method would return null instead of the text.
            ContentHandler handler = new BodyContentHandler(-1);
            ParseContext context = new ParseContext();
            // Registering the parser in the context makes it available for
            // documents embedded inside the parsed one.
            context.set(Parser.class, parser);
            parser.parse(is, handler, metadata, context);
            for (String name : metadata.names()) {
                System.out.println(name + ":" + metadata.get(name));
            }
            return handler.toString();
        } catch (IOException e) {
            // Also covers FileNotFoundException.
            e.printStackTrace();
        } catch (SAXException e) {
            e.printStackTrace();
        } catch (TikaException e) {
            e.printStackTrace();
        } finally {
            closeAndLog(is);
        }
        return null;
    }

    /**
     * Searches the {@code content} field of the index and returns the stored
     * {@code ids} value of the best-scoring document.
     *
     * @param searchString query text in Lucene classic query-parser syntax
     * @return the {@code ids} value of the top hit, or an empty string when
     *         nothing matches
     * @throws ParseException if {@code searchString} is not a valid query
     * @throws IOException    if the index cannot be opened or read
     */
    public static String searche(String searchString) throws ParseException, IOException
    {
        IKAnalyzer analyzer = new IKAnalyzer();
        String[] fields = {CONTENT_FIELD};
        QueryParser parser = new MultiFieldQueryParser(Version.LUCENE_40, fields, analyzer);
        Query query = parser.parse(searchString);

        Directory dir = FSDirectory.open(new File(INDEX_DIR));
        try {
            IndexReader indexReader = DirectoryReader.open(dir);
            try {
                IndexSearcher indexSearcher = new IndexSearcher(indexReader);
                // Only the best hit is consumed, so ask for a single result.
                ScoreDoc[] hits = indexSearcher.search(query, 1).scoreDocs;
                if (hits.length == 0) {
                    return "";
                }
                Document document = indexSearcher.doc(hits[0].doc);
                return document.get(ID_FIELD);
            } finally {
                indexReader.close();
            }
        } finally {
            dir.close();
        }
    }

    /**
     * Rebuilds the index from scratch with a single document: the text of
     * {@code C:/ITeye.pdf} in the {@code content} field and the fixed value
     * {@code "110"} in the {@code ids} field.
     *
     * <p>I/O failures are printed via {@code printStackTrace()} and the
     * pending changes (including the initial {@code deleteAll()}) are rolled
     * back. The success message is printed only when the index was written
     * and closed without error.
     */
    public static void index() {
        Directory dir = null;
        IndexWriter writer = null;
        try {
            File f = new File("C:/ITeye.pdf");

            IKAnalyzer analyzer = new IKAnalyzer();
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_40, analyzer);

            // Full-text field: analyzed and searchable, but not stored.
            // A Reader-valued Field must be tokenized and must not be stored.
            FieldType contentType = new FieldType();
            contentType.setStored(false);
            contentType.setIndexed(true);
            contentType.setTokenized(true);

            // Identifier field: stored and indexed as a single untokenized term.
            FieldType idType = new FieldType();
            idType.setStored(true);
            idType.setIndexed(true);
            idType.setTokenized(false);

            dir = FSDirectory.open(new File(INDEX_DIR));
            writer = new IndexWriter(dir, indexWriterConfig);
            writer.deleteAll();

            Document doc = new Document();
            doc.add(new Field(CONTENT_FIELD, new Tika().parse(f), contentType));
            doc.add(new Field(ID_FIELD, "110", idType));
            writer.addDocument(doc);

            writer.close();
            writer = null; // closed cleanly; nothing left to roll back
            System.out.println("索引创建成功!!");
        } catch (IOException e) {
            // Also covers CorruptIndexException, LockObtainFailedException
            // and FileNotFoundException, which are all IOException subclasses.
            e.printStackTrace();
        } finally {
            if (writer != null) {
                // Reached only on failure: discard uncommitted changes and
                // release the index write lock.
                try {
                    writer.rollback();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            closeAndLog(dir);
        }
    }

    /** Closes {@code resource} if it is non-null, printing (not propagating) any close failure. */
    private static void closeAndLog(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}

http://yunpan.cn/Qb93GuJDtIUL5

你可能感兴趣的:(IK)