JAVA利用POI scratchpad 5.2.1 将Word文档doc格式转换成HTML 格式 含文档里面图片

一、POM文件

 
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>5.2.1</version>
</dependency>
 

二、转换具体代码

package org.zhao.component;

import org.apache.commons.collections4.CollectionUtils;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.Picture;
import org.w3c.dom.Document;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.List;

/**
 * Converts a legacy Word ({@code .doc}) document to HTML, including the
 * pictures embedded in the document.
 *
 * <p>For an input {@code <dir>/<name>.doc} the output layout is:
 * <pre>
 *   <dir>/<name>/<name>.html
 *   <dir>/<name>/image/<picture files>
 * </pre>
 *
 * @author Administrator
 * @date 2022-03-18
 */
public class WordComponent {

    /** Name of the sub-directory (next to the HTML file) that receives the pictures. */
    private static final String IMAGE_DIR_NAME = "image";

    /**
     * Returns {@code filename} without its last extension.
     *
     * <p>The name is returned unchanged when it has no dot, or when the only
     * dot is the leading character (e.g. {@code ".doc"}), so the result is
     * never empty for a non-empty input.
     *
     * @param filename plain file name (no directory part)
     * @return the name with the trailing {@code .ext} removed
     */
    private static String stripFileExtname(String filename) {
        int dot = filename.lastIndexOf('.');
        return dot > 0 ? filename.substring(0, dot) : filename;
    }

    /**
     * Ensures that {@code dirPath} exists as a directory, creating it (and any
     * missing parents) when necessary.
     *
     * @param dirPath directory to create
     * @throws IOException if the directory does not exist and cannot be created,
     *                     e.g. because a regular file already occupies the path
     */
    private static void createFileDir(String dirPath) throws IOException {
        File dir = new File(dirPath);
        if (dir.isDirectory()) {
            return;
        }
        boolean created = dir.mkdirs();
        System.out.println(dirPath + "不存在,创建文件夹->" + created);
        // Re-check isDirectory() so a concurrent creation is not reported as a failure.
        if (!created && !dir.isDirectory()) {
            throw new IOException("Unable to create directory: " + dirPath);
        }
    }

    /**
     * Converts the {@code .doc} file at {@code docFilePath} to HTML.
     *
     * <p>If the file does not exist, a message is printed to {@code System.err}
     * and the method returns without doing anything. A picture that cannot be
     * written is reported on {@code System.err} and skipped; the conversion
     * itself still completes.
     *
     * @param docFilePath path of the Word document to convert
     * @throws Exception if the document cannot be read or parsed, an output
     *                   directory cannot be created, or the HTML cannot be written
     */
    public static void docToHtml(String docFilePath) throws Exception {
        File file = new File(docFilePath);
        if (!file.exists()) {
            System.err.println(docFilePath + "->文件不存在");
            return;
        }

        String dirName = stripFileExtname(file.getName());
        if (dirName.length() > 100) {
            // Very long names are replaced by a timestamp to keep the output path short.
            dirName = String.valueOf(System.currentTimeMillis());
        }

        // Resolve against the absolute parent so that relative input paths work and
        // the separator between parent and child is always inserted.
        File htmlDir = new File(file.getAbsoluteFile().getParentFile(), dirName);
        createFileDir(htmlDir.getPath());
        File imageDir = new File(htmlDir, IMAGE_DIR_NAME);
        File htmlFile = new File(htmlDir, dirName + ".html");

        try (FileInputStream docIn = new FileInputStream(file);
             HWPFDocument wordDocument = new HWPFDocument(docIn)) {

            WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                    DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
            // The returned value becomes the picture URL in the HTML. It is kept
            // relative to the HTML file so the output folder can be moved or served.
            wordToHtmlConverter.setPicturesManager(
                    (content, pictureType, suggestedName, widthInches, heightInches) ->
                            IMAGE_DIR_NAME + "/" + suggestedName);
            wordToHtmlConverter.processDocument(wordDocument);

            List<Picture> allPictures = wordDocument.getPicturesTable().getAllPictures();
            if (CollectionUtils.isNotEmpty(allPictures)) {
                createFileDir(imageDir.getPath());
                for (Picture picture : allPictures) {
                    File target = new File(imageDir, picture.suggestFullFileName());
                    try (FileOutputStream imageOut = new FileOutputStream(target)) {
                        picture.writeImageContent(imageOut);
                    } catch (IOException e) {
                        // Best effort: one unwritable picture must not abort the conversion.
                        System.err.println("Failed to write picture: " + target);
                        e.printStackTrace();
                    }
                }
            }

            Document htmlDocument = wordToHtmlConverter.getDocument();
            Transformer serializer = TransformerFactory.newInstance().newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");
            // Write through an explicit stream so it is closed deterministically.
            try (FileOutputStream htmlOut = new FileOutputStream(htmlFile)) {
                serializer.transform(new DOMSource(htmlDocument), new StreamResult(htmlOut));
            }
        }
        System.out.println("转换成功");
    }

    /**
     * Demo entry point: converts a sample document at a fixed location.
     *
     * @param args ignored
     * @throws Exception if the conversion fails
     */
    public static void main(String[] args) throws Exception {
        docToHtml("D:/我是一篇简历.doc");
    }
}

 三、Word内容

JAVA利用POI scratchpad 5.2.1 将Word文档doc格式转换成HTML 格式 含文档里面图片_第1张图片

 

四、转换后内容

JAVA利用POI scratchpad 5.2.1 将Word文档doc格式转换成HTML 格式 含文档里面图片_第2张图片 

        代码块


    
        
        
        个人简历表格
        
    
    
        

个人简历表格

姓 名

出生年月

民族

籍 贯

毕业时间

学历

性 别

专 业

社会

实践

经验

在校期间担任过何种职务

 

 

 

你可能感兴趣的:(Java,java,html,xml)