使用 POI 将 Word 文档转换成 HTML

/** 
 * word2007和word2003的构建方式不同, 
 * 前者的构建方式是xml,后者的构建方式是dom树
 * 文件的后缀也不同,前者后缀为.docx,后者后缀为.doc 
 */
public String convertToHtml(InputStream inputStream, String fileName) throws Exception {
    String substring = fileName.substring(fileName.lastIndexOf(".") + 1);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    
    //word07文档
    if ("docx".equals(substring)) {
        XWPFDocument document = new XWPFDocument(PackageHelper.open(inputStream));
        //获取文档中的图片
        List allPictures = document.getAllPictures();
        for (XWPFPictureData xwpfPictureData : allPictures) {
            String name = xwpfPictureData.getFileName();
            byte[] data = xwpfPictureData.getData();
            InputStream input = new ByteArrayInputStream(data);
            // TODO 图片处理
        }

        final String imageUrl = "";
        XHTMLOptions options = XHTMLOptions.create();
        //不把图片生成出来
        options.setExtractor(null);
        options.setIgnoreStylesIfUnused(false);
        options.setFragment(true);
        options.URIResolver(new IURIResolver() {
            //@Override
            public String resolve(String uri) {
                return imageUrl + uri;
            }
        });
        //转换
        XHTMLConverter.getInstance().convert(document, out, options);
    } else {
        //word03文档
        HWPFDocument wordDocument = new HWPFDocument(inputStream);
        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                DocumentBuilderFactory.newInstance().newDocumentBuilder()
                        .newDocument());
        wordToHtmlConverter.setPicturesManager(new PicturesManager() {
            public String savePicture(byte[] content,
                                      PictureType pictureType, String suggestedName,
                                      float widthInches, float heightInches) {
                //给生成的页面写图片的路径
                return "word/media/" + suggestedName;
            }
        });
        wordToHtmlConverter.processDocument(wordDocument);
        //获取文档中的图片
        List pics = wordDocument.getPicturesTable().getAllPictures();
        if (pics != null) {
            for (int i = 0; i < pics.size(); i++) {
                Picture pic = (Picture) pics.get(i);
                byte[] byteArr = pic.getContent();
                InputStream input = new ByteArrayInputStream(byteArr);
                // TODO 图片处理
            }
        }
        Document htmlDocument = wordToHtmlConverter.getDocument();
        DOMSource domSource = new DOMSource(htmlDocument);
        StreamResult streamResult = new StreamResult(out);

        //转换html文件
        TransformerFactory tf = TransformerFactory.newInstance();
        Transformer serializer = tf.newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");
        serializer.transform(domSource, streamResult);
    }
    out.close();
    //转化数据流,替换特殊字符
    return StringEscapeUtils.escapeHtml(out.toString());
}

你可能感兴趣的:(java)