doc转html

1. pom.xml 主要依赖

<!-- Apache POI "scratchpad" artifact. The utility class below imports its
     Word (.doc) classes from the org.apache.poi.hwpf packages. -->
<dependency>
	<groupId>org.apache.poi</groupId>
	<artifactId>poi-scratchpad</artifactId>
	<version>3.10-FINAL</version>
</dependency>

2.工具类

package com.exam.main;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.w3c.dom.Document;
import javax.xml.XMLConstants;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.List;
/**
 * Created by xin on 14/11/6.
 */
public class Main {
    private static void trySetSAXFeature(DocumentBuilderFactory documentBuilderFactory, String feature, boolean enabled) {
        try {
            documentBuilderFactory.setFeature(feature, enabled);
        } catch (Exception e) {
            e.printStackTrace();
        } catch (AbstractMethodError ame) {
            ame.printStackTrace();
        }
    }
    public static DocumentBuilderFactory getDocumentBuilderFactory() {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setExpandEntityReferences(false);
        trySetSAXFeature(factory, XMLConstants.FEATURE_SECURE_PROCESSING, true);
        trySetSAXFeature(factory, "http://xml.org/sax/features/external-general-entities", false);
        trySetSAXFeature(factory, "http://xml.org/sax/features/external-parameter-entities", false);
        trySetSAXFeature(factory, "http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        trySetSAXFeature(factory, "http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
        return factory;
    }
    /**
     * 获取文件名(不含文件扩展名)
     * @param sourceFile 输入文件
     * @return 返回的文件名
     */
    public static String getFileNameWithoutExtension(File sourceFile){
        String filename=sourceFile.getName();
        return filename.substring(0,filename.lastIndexOf('.'));
    }
    /**
     * doc转html(只支持doc格式,如果是docx,就要看XWPF).html文件与图片(如果有)都保存在doc文件所在目录.
     * @param docFile 输入的doc文件
     * @param savePic 是否保存图片
     * @throws Exception
     */
    public static void docToHtml(final File docFile, boolean savePic) throws Exception {
        HWPFDocument hwpfDocument = new HWPFDocument(new FileInputStream(docFile));
        Document newDocument = getDocumentBuilderFactory().newDocumentBuilder().newDocument();
        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(newDocument);
        if (savePic) {
            List<Picture> pics=hwpfDocument.getPicturesTable().getAllPictures();
            if(pics!=null&&pics.size()>0){
                for(int i=0;i<pics.size();i++){
                    Picture pic = pics.get(i);
                    pic.writeImageContent(new FileOutputStream(docFile.getParent()+"/"+pic.suggestFullFileName()));
                }
            }
            wordToHtmlConverter.setPicturesManager(new PicturesManager() {
                public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {
                    return suggestedName;
                }
            });
        }
        wordToHtmlConverter.processDocument(hwpfDocument);
        StringWriter stringWriter = new StringWriter();
        Transformer transformer = TransformerFactory.newInstance().newTransformer();
        transformer.setOutputProperty(OutputKeys.INDENT, "yes");
        transformer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
        transformer.setOutputProperty(OutputKeys.METHOD, "html");
        transformer.transform(new DOMSource(wordToHtmlConverter.getDocument()), new StreamResult(stringWriter));
        FileChannel fileChannel = new FileOutputStream(docFile.getParent()+"/"+getFileNameWithoutExtension(docFile)+".html").getChannel();
        fileChannel.write(ByteBuffer.wrap(stringWriter.toString().getBytes()));
        fileChannel.close();
    }
    public static void main(String[] args) throws Exception {
        File file=new File("C:\\Users\\xin\\Desktop\\werwr\\1.doc");
        docToHtml(file, true);
    }
}


你可能感兴趣的:(html,word,doc,转)