Tika文本抽取实例

package metadata;





import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;

/**
 * Minimal example of extracting body text and metadata from a document with
 * Apache Tika's {@link AutoDetectParser}.
 */
public class Extract {

    /** Location of the sample document; a PDF, Word, HTML, etc. file path may be used. */
    private static final String SAMPLE_DOCUMENT = "E:\\上海项目测试\\文档\\37.pdf";

    /**
     * Entry point: runs the extraction on the sample document.
     *
     * @param args ignored
     * @throws IOException   if the sample document cannot be opened or read
     * @throws SAXException  if the content handler fails while receiving parse events
     * @throws TikaException if Tika cannot parse the document
     */
    public static void main(String[] args) throws IOException, SAXException, TikaException {
        getTextFronPDF();
    }

    /**
     * Parses the sample document with {@link AutoDetectParser} and prints its
     * title, type and body text to standard output.
     *
     * <p>The input stream is opened in a try-with-resources statement so that it
     * is closed even when parsing fails.
     *
     * @throws IOException   if the sample document cannot be opened or read
     * @throws SAXException  if the content handler fails while receiving parse events
     * @throws TikaException if Tika cannot parse the document
     */
    public static void getTextFronPDF() throws IOException, SAXException, TikaException {
        // Collects the character content of the document body.
        // NOTE(review): per Tika's Javadoc the no-arg BodyContentHandler caps its buffer
        // (100k characters) and throws SAXException beyond that; use
        // new BodyContentHandler(-1) if large documents must be supported.
        BodyContentHandler textHandler = new BodyContentHandler();
        // Receives document metadata such as author and title.
        Metadata metadata = new Metadata();
        // AutoDetectParser works out the document's MIME type and delegates to the
        // matching concrete parser (the PDF parser for the sample file).
        Parser parser = new AutoDetectParser();
        ParseContext context = new ParseContext();

        // Open the stream as late as possible and guarantee it is released.
        try (FileInputStream input = new FileInputStream(new File(SAMPLE_DOCUMENT))) {
            parser.parse(input, textHandler, metadata, context);
        }

        System.out.println("Title: " + metadata.get(Metadata.TITLE));
        System.out.println("Type: " + metadata.get(Metadata.TYPE));
        // The handler's string form is the extracted body text.
        System.out.println("Body: " + textHandler.toString());
    }
}

你可能感兴趣的:(metadata)