提取Office以及PDF里的文字

  1. 利用 PDFBox 提取 PDF 文档里的文字
  2. 利用 Apache POI 提取 Office 文档里的文字

例子比较简单,作为记录

提取 PDF 文字:支持中文,但部分文档的提取结果可能出现乱码。

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;

import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;

import com.doc.manager.util.IDocParser;

/**
 * Extracts plain text from PDF documents using PDFBox.
 *
 * <p>Both entry points return {@code null} instead of throwing when the
 * input is missing or the document cannot be read or parsed.
 */
public class PdfParser implements IDocParser {
    /**
     * Extracts the text of the PDF at the given path.
     *
     * @param filePath path of the PDF file; may be {@code null}
     * @return the trimmed text, or {@code null} if the path is {@code null},
     *         the file does not exist, or extraction fails
     */
    @Override
    public String parser(String filePath){
        if(filePath==null){
            return null;
        }
        File file=new File(filePath);
        if(!file.exists()){
            return null;
        }
        return parser(file);
    }

    /**
     * Extracts the text of the given PDF file.
     *
     * @param file the PDF file; may be {@code null}
     * @return the trimmed text, or {@code null} if the file is {@code null},
     *         cannot be opened, or cannot be parsed
     */
    @Override
    public String parser(File file){
        if(file==null){
            return null;
        }
        FileInputStream fileInputStream=null;
        PDDocument pdDocument=null;
        String reString=null;
        try {
            fileInputStream=new FileInputStream(file);
            PDFParser parser=new PDFParser(fileInputStream);
            parser.parse();
            pdDocument=parser.getPDDocument();
            PDFTextStripper pdfTextStripper=new PDFTextStripper();
            reString=pdfTextStripper.getText(pdDocument);
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release the document and the stream on both the success and the failure path.
            if(pdDocument!=null){
                try {
                    pdDocument.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if(fileInputStream!=null){
                try {
                    fileInputStream.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        // reString stays null when extraction failed; trimming it would throw a NullPointerException.
        return reString==null ? null : reString.trim();
    }

}

提取 Office 文档文字(支持 doc、docx、xls、xlsx、ppt、pptx),不支持 WPS

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;

import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hslf.usermodel.HSLFSlideShowImpl;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xslf.XSLFSlideShow;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.xmlbeans.XmlException;

import com.doc.manager.util.IDocParser;

/**
 * Extracts plain text from Microsoft Office documents using Apache POI.
 *
 * <p>The format is chosen from the file extension (doc, docx, xls, xlsx,
 * ppt, pptx; case-insensitive). WPS Office files are not supported.
 * Every method returns {@code null} instead of throwing when the document
 * cannot be read.
 */
public class OfficeParser implements IDocParser {
    private static final String WORD_2003_AND_BEFORE="doc";
    private static final String WORD_2007_AND_LATER="docx";
    private static final String EXCEL_2003_AND_BEFORE="xls";
    private static final String EXCEL_2007_AND_LATER="xlsx";
    private static final String PPT_2003_AND_BEFORE="ppt";
    private static final String PPT_2007_AND_LATER="pptx";

    /**
     * Extracts the text of the Office document at the given path.
     *
     * @param filePath path of the document; may be {@code null}
     * @return the trimmed text, or {@code null} if the path is {@code null},
     *         the file does not exist, the type is unsupported, or
     *         extraction fails
     */
    @Override
    public String parser(String filePath) {
        if(filePath==null){
            return null;
        }
        File file=new File(filePath);
        if(!file.exists()){
            return null;
        }
        return parser(file);
    }

    /**
     * Extracts the text of the given Office document, dispatching on its
     * file extension.
     *
     * @param file the document; may be {@code null}
     * @return the trimmed text, or {@code null} if the file is {@code null},
     *         the type is unsupported, or extraction fails
     */
    @Override
    public String parser(File file) {
        if(file==null){
            return null;
        }
        // Use the file name only, so a dot inside a directory name is never taken for the extension.
        String fileName=file.getName();
        int dotIndex=fileName.lastIndexOf('.');
        String fileExt=dotIndex<0 ? "" : fileName.substring(dotIndex+1);
        if(fileExt.equalsIgnoreCase(WORD_2003_AND_BEFORE)){
            return parseWord2003AndBefore(file);
        }
        if(fileExt.equalsIgnoreCase(WORD_2007_AND_LATER)){
            return parseWord2007AndLater(file);
        }
        if(fileExt.equalsIgnoreCase(EXCEL_2003_AND_BEFORE)){
            return parseExcel2003AndBefore(file);
        }
        if(fileExt.equalsIgnoreCase(EXCEL_2007_AND_LATER)){
            return parseExcel2007AndLater(file);
        }
        if(fileExt.equalsIgnoreCase(PPT_2003_AND_BEFORE)){
            return parsePPT2003AndBefore(file);
        }
        if(fileExt.equalsIgnoreCase(PPT_2007_AND_LATER)){
            return parsePPT2007AndLater(file);
        }
        System.out.println("不支持的文件类型");
        return null;
    }

    /**
     * Extracts text from a Word 2003-or-earlier (.doc) file.
     *
     * @param file the .doc file
     * @return the trimmed text, or {@code null} if reading fails
     */
    public String parseWord2003AndBefore(File file){
        String text=null;
        FileInputStream inp=null;
        WordExtractor extractor=null;
        try {
            inp=new FileInputStream(file);
            HWPFDocument document=new HWPFDocument(inp);
            extractor=new WordExtractor(document);
            text=extractor.getText();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(extractor);
            closeQuietly(inp);
        }
        return trimOrNull(text);
    }

    /**
     * Extracts text from a Word 2007-or-later (.docx) file.
     *
     * @param file the .docx file
     * @return the trimmed text, or {@code null} if reading fails
     */
    public String parseWord2007AndLater(File file){
        String text=null;
        FileInputStream inp=null;
        XWPFWordExtractor extractor=null;
        try {
            inp=new FileInputStream(file);
            XWPFDocument document=new XWPFDocument(inp);
            extractor=new XWPFWordExtractor(document);
            text=extractor.getText();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(extractor);
            closeQuietly(inp);
        }
        return trimOrNull(text);
    }

    /**
     * Extracts text from an Excel 2003-or-earlier (.xls) file. The extractor
     * is configured to emit formulas rather than results and to omit sheet names.
     *
     * @param file the .xls file
     * @return the trimmed text, or {@code null} if reading fails
     */
    public String parseExcel2003AndBefore(File file){
        String text=null;
        FileInputStream inp=null;
        ExcelExtractor extractor=null;
        try {
            inp=new FileInputStream(file);
            HSSFWorkbook wb=new HSSFWorkbook(new POIFSFileSystem(inp));
            extractor=new ExcelExtractor(wb);
            extractor.setFormulasNotResults(true);
            extractor.setIncludeSheetNames(false);
            text=extractor.getText();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(extractor);
            closeQuietly(inp);
        }
        return trimOrNull(text);
    }

    /**
     * Extracts text from an Excel 2007-or-later (.xlsx) file. The extractor
     * is configured to emit formulas rather than results and to omit sheet names.
     *
     * @param file the .xlsx file
     * @return the trimmed text, or {@code null} if reading fails
     */
    public String parseExcel2007AndLater(File file){
        String text=null;
        FileInputStream inp=null;
        XSSFExcelExtractor extractor=null;
        try {
            inp=new FileInputStream(file);
            XSSFWorkbook wb=new XSSFWorkbook(inp);
            extractor=new XSSFExcelExtractor(wb);
            extractor.setFormulasNotResults(true);
            extractor.setIncludeSheetNames(false);
            text=extractor.getText();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(extractor);
            closeQuietly(inp);
        }
        return trimOrNull(text);
    }

    /**
     * Extracts text from a PowerPoint 2003-or-earlier (.ppt) file.
     *
     * @param file the .ppt file
     * @return the trimmed text, or {@code null} if reading fails
     */
    public String parsePPT2003AndBefore(File file){
        String text=null;
        FileInputStream inp=null;
        PowerPointExtractor extractor=null;
        try {
            inp=new FileInputStream(file);
            HSLFSlideShowImpl ppt=new HSLFSlideShowImpl(inp);
            extractor=new PowerPointExtractor(ppt);
            text=extractor.getText();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(extractor);
            closeQuietly(inp);
        }
        return trimOrNull(text);
    }

    /**
     * Extracts text from a PowerPoint 2007-or-later (.pptx) file.
     *
     * @param file the .pptx file
     * @return the trimmed text, or {@code null} if reading fails
     */
    public String parsePPT2007AndLater(File file){
        String text=null;
        XSLFPowerPointExtractor extractor=null;
        try {
            // The slideshow is opened directly from the path, so no separate input stream is needed.
            XSLFSlideShow slideshow=new XSLFSlideShow(file.getAbsolutePath());
            extractor=new XSLFPowerPointExtractor(slideshow);
            text=extractor.getText();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (OpenXML4JException e) {
            e.printStackTrace();
        } catch (XmlException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(extractor);
        }
        return trimOrNull(text);
    }

    /** Trims the text, passing {@code null} through so a failed extraction does not throw. */
    private static String trimOrNull(String text){
        return text==null ? null : text.trim();
    }

    /** Closes the resource if it is non-null, printing (not propagating) any IOException. */
    private static void closeQuietly(java.io.Closeable resource){
        if(resource==null){
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

工程源码:
http://download.csdn.net/detail/zlp1992/9275815

你可能感兴趣的:(poi,pdfbox,pdf文字提取,office文字提取)