【Java】使用poi+pdfbox实现office文件提取内容

引入maven依赖



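<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.16</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.16</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.16</version>
</dependency>

<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.4</version>
</dependency>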

提取内容

/**
 * Extracts the textual content of a file, picking the parser from the file-name extension.
 *
 * <p>Recognised extensions (matched case-insensitively): {@code .txt}, {@code .doc},
 * {@code .docx}, {@code .xls}, {@code .xlsx}, {@code .ppt}, {@code .pptx} and {@code .pdf}.
 * For the Office formats the legacy binary parser is tried first and the OOXML parser is
 * used as a fallback, so a file whose extension does not match its real container format
 * is still read.
 *
 * <p>This method is best-effort: parsing failures are printed to standard error and yield
 * an empty string instead of an exception.
 *
 * @param file the file to read; must not be {@code null}
 * @return the extracted text, or an empty string when the extension is not recognised or
 *         the file could not be parsed
 */
private static String read(File file) {
    // Lower-case once so that names such as "REPORT.PDF" are recognised too.
    String name = file.getName().toLowerCase(java.util.Locale.ROOT);

    if (name.endsWith(".txt")) {
        return readPlainText(file);
    }
    if (name.endsWith(".doc") || name.endsWith(".docx")) {
        return readWord(file);
    }
    if (name.endsWith(".xls") || name.endsWith(".xlsx")) {
        return readExcel(file);
    }
    if (name.endsWith(".ppt") || name.endsWith(".pptx")) {
        return readSlides(file);
    }
    if (name.endsWith(".pdf")) {
        return readPdf(file);
    }
    return "";
}

/** Reads a plain-text file using the platform default charset. */
private static String readPlainText(File file) {
    try {
        // Decode the whole content in one step: decoding fixed-size chunks separately
        // corrupts any multi-byte character that straddles a chunk boundary.
        byte[] content = java.nio.file.Files.readAllBytes(file.toPath());
        return new String(content, java.nio.charset.Charset.defaultCharset());
    } catch (IOException e) {
        e.printStackTrace();
        return "";
    }
}

/** Extracts text from a Word document, trying the binary format first, then OOXML. */
private static String readWord(File file) {
    try (FileInputStream in = new FileInputStream(file)) {
        return textOf(new WordExtractor(new HWPFDocument(in)));
    } catch (Exception legacyFailure) {
        try (FileInputStream in = new FileInputStream(file)) {
            return textOf(new XWPFWordExtractor(new XWPFDocument(in)));
        } catch (Exception ooxmlFailure) {
            return reportFailure(ooxmlFailure, legacyFailure);
        }
    }
}

/** Extracts text from an Excel workbook, trying the binary format first, then OOXML. */
private static String readExcel(File file) {
    // The file system is closed here even when the workbook constructor rejects the file.
    try (POIFSFileSystem fs = new POIFSFileSystem(file)) {
        return textOf(new ExcelExtractor(new HSSFWorkbook(fs)));
    } catch (Exception legacyFailure) {
        try {
            OPCPackage pkg = OPCPackage.open(file);
            try {
                return textOf(new XSSFExcelExtractor(new XSSFWorkbook(pkg)));
            } finally {
                // revert() releases the file handle without writing anything back.
                pkg.revert();
            }
        } catch (Exception ooxmlFailure) {
            return reportFailure(ooxmlFailure, legacyFailure);
        }
    }
}

/** Extracts text from a PowerPoint presentation, trying the binary format first, then OOXML. */
private static String readSlides(File file) {
    try (FileInputStream in = new FileInputStream(file)) {
        return textOf(new PowerPointExtractor(in));
    } catch (Exception legacyFailure) {
        try {
            OPCPackage pkg = OPCPackage.open(file);
            try {
                return textOf(new XSLFPowerPointExtractor(new XSLFSlideShow(pkg)));
            } finally {
                // revert() releases the file handle without writing anything back.
                pkg.revert();
            }
        } catch (Exception ooxmlFailure) {
            return reportFailure(ooxmlFailure, legacyFailure);
        }
    }
}

/** Extracts text from a PDF document. */
private static String readPdf(File file) {
    // try-with-resources closes the document even when text stripping fails.
    try (PDDocument document = PDDocument.load(file)) {
        return new PDFTextStripper().getText(document);
    } catch (IOException e) {
        e.printStackTrace();
        return "";
    }
}

/** Returns the extractor's text and always closes the extractor afterwards. */
private static String textOf(POITextExtractor extractor) {
    try {
        return extractor.getText();
    } finally {
        try {
            extractor.close();
        } catch (IOException ignored) {
            // Best effort: a failed close must not discard text that was already extracted.
        }
    }
}

/** Prints the final parser failure (with the earlier one attached) and yields an empty result. */
private static String reportFailure(Exception lastFailure, Exception earlierFailure) {
    lastFailure.addSuppressed(earlierFailure);
    lastFailure.printStackTrace();
    return "";
}

 

你可能感兴趣的:(java)