Java 读取 Word 和 PDF 文件

/**
 * Extracts the plain text of a Word ({@code .doc} / {@code .docx}) or PDF file.
 *
 * <p>The format is chosen from the file name's extension, compared
 * case-insensitively. Every handle opened here is closed before the method
 * returns, including when extraction fails.
 *
 * @param file the document to read; must not be {@code null}
 * @return the extracted text, or an empty string when the extension is not
 *         one of {@code .doc}, {@code .docx}, {@code .pdf}, or when the PDF
 *         is encrypted
 * @throws Exception if the file cannot be opened or its content cannot be parsed
 */
public static String getWordAndPdfContent(File file) throws Exception {
        // Locale.ROOT keeps the lower-casing independent of the JVM's default locale.
        String name = file.getName().toLowerCase(java.util.Locale.ROOT);

        if (name.endsWith(".doc")) {
            // Legacy binary Word format; the stream is closed even if parsing throws.
            try (FileInputStream in = new FileInputStream(file)) {
                WordExtractor extractor = new WordExtractor(in);
                return extractor.getText();
            }
        }

        if (name.endsWith(".docx")) {
            // OOXML Word format.
            // NOTE(review): openPackage(String) may open the package read-write, in
            // which case close() writes it back to disk - confirm against the POI
            // version in use and consider a read-only open.
            try (OPCPackage opcPackage = POIXMLDocument.openPackage(file.getPath())) {
                POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
                return extractor.getText();
            }
        }

        if (name.endsWith(".pdf")) {
            try (PDDocument document = PDDocument.load(file)) {
                // Encrypted documents are skipped rather than decrypted.
                if (document.isEncrypted()) {
                    return "";
                }
                PDFTextStripper stripper = new PDFTextStripper();
                return stripper.getText(document);
            }
        }

        // Unsupported extension: nothing to extract.
        return "";
    }

你可能感兴趣的:(java读取word和pdf文件)