Java 将Word文件转换为HTML格式文件

  • 前言:很多时候我们需要把 Word 文档导入到项目中,但后期在前端再次展示该文档时,前端往往需要的是 HTML 格式。于是就会有人提出这样的需求:你们存文档的时候能不能存成 HTML 格式?本文的内容正好可以满足这个需求。

我是通过 Spring 的 MultipartFile 接口来实现的,上代码:

一、首先导入需要的依赖包:

            
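            <dependency>
                <groupId>org.apache.poi</groupId>
                <artifactId>poi-scratchpad</artifactId>
                <version>3.17</version>
            </dependency>

            <dependency>
                <groupId>org.apache.poi</groupId>
                <artifactId>poi-ooxml</artifactId>
                <version>3.17</version>
            </dependency>

            <dependency>
                <groupId>fr.opensagres.xdocreport</groupId>
                <artifactId>fr.opensagres.xdocreport.converter.docx.xwpf</artifactId>
                <version>2.0.1</version>
            </dependency>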
            

二、编写代码:

package com.lmt.service.file;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.util.Locale;
import java.util.UUID;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.util.IOUtils;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import org.springframework.web.multipart.MultipartFile;
import org.w3c.dom.Document;


import fr.opensagres.poi.xwpf.converter.core.ImageManager;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;

@Component
public class WordToHtml {
    private static final Logger logger = LoggerFactory.getLogger(WordToHtml.class);
    
    /**
     * Stores an uploaded Word document on disk and converts it to an HTML file.
     *
     * <p>The format is decided from the extension of the original file name
     * (case-insensitively) before anything is written, so unsupported uploads
     * never reach the disk. The stored copy is named with a random UUID plus a
     * fixed {@code .docx}/{@code .doc} suffix; no part of the user-supplied
     * name is used in the path.
     *
     * @param file the uploaded Word document
     * @return the generated HTML file, or {@code null} if the upload has no
     *         file name, the format is unsupported, or saving/converting fails
     */
    public File convert(MultipartFile file) {
        // The original file name may be absent, and it may have no extension at all.
        String filename = file.getOriginalFilename();
        if (filename == null) {
            logger.error("不支持的文件格式!");
            return null;
        }
        String lowerCaseName = filename.toLowerCase(Locale.ROOT);
        boolean docx = lowerCaseName.endsWith(".docx");
        if (!docx && !lowerCaseName.endsWith(".doc")) {
            logger.error("不支持的文件格式!");
            return null;
        }

        String suffix = docx ? ".docx" : ".doc";
        String newName = UUID.randomUUID().toString();
        // TODO the upload should be saved to a configurable location
        File convFile = new File("D:/test/" + newName + suffix);

        // Make sure the target directory exists before opening the output stream.
        File targetDirectory = convFile.getParentFile();
        if (!targetDirectory.isDirectory() && !targetDirectory.mkdirs()) {
            logger.error("上传文件出错!无法创建目录: {}", targetDirectory);
            return null;
        }

        // FileOutputStream creates the file itself; try-with-resources closes it.
        try (FileOutputStream fos = new FileOutputStream(convFile)) {
            fos.write(file.getBytes());
        } catch (IOException ex) {
            logger.error("上传文件出错!", ex);
            return null;
        }

        // Directory of the stored file, terminated with the platform separator
        // so the converters can append file names directly.
        String parentDirectory = convFile.getParent();
        if (!parentDirectory.endsWith(File.separator)) {
            parentDirectory = parentDirectory + File.separator;
        }

        String sourcePath = convFile.getAbsolutePath();
        return docx
                ? docxConvert(parentDirectory, sourcePath, newName)
                : docConvert(parentDirectory, sourcePath, newName);
    }
    
    
    /**
     * Converts a stored {@code .docx} file to {@code <newName>.html} in the given directory.
     *
     * <p>Images are handed to an {@link ImageManager} rooted at
     * {@code parentDirectory} with a random UUID as its second argument.
     * All streams and the document are closed before the method returns.
     *
     * @param parentDirectory output directory, ending with a path separator
     * @param filename        path of the stored {@code .docx} file
     * @param newName         base name (without extension) of the HTML file to create
     * @return the generated HTML file, or {@code null} if an I/O error occurs
     */
    private File docxConvert(String parentDirectory, String filename,String newName) {
        File htmlFile = new File(parentDirectory + newName + ".html");
        // try-with-resources: the original leaked the input stream, the document and the output stream.
        try (InputStream in = new FileInputStream(filename);
                XWPFDocument document = new XWPFDocument(in);
                FileOutputStream out = new FileOutputStream(htmlFile)) {
            XHTMLOptions options = XHTMLOptions.create()
                    .setImageManager(new ImageManager(new File(parentDirectory), UUID.randomUUID().toString()))
                    .indent(4);
            XHTMLConverter.getInstance().convert(document, out, options);
            return htmlFile;
        } catch (IOException ex) {
            logger.error("word转化出错!", ex);
            return null;
        }
    }
    
    
    /**
     * Converts a stored {@code .doc} file to {@code <newName>.html} in the given directory.
     *
     * <p>Embedded pictures are written next to the HTML file under a random
     * UUID-prefixed name. The HTML is serialized as UTF-8 straight to the
     * output file, so no platform-charset round trip can corrupt the text.
     *
     * @param parentDirectory output directory, ending with a path separator
     * @param filename        path of the stored {@code .doc} file
     * @param newName         base name (without extension) of the HTML file to create
     * @return the generated HTML file, or {@code null} if conversion fails
     */
    private File docConvert(String parentDirectory, String filename,String newName) {
        File htmlFile = new File(parentDirectory + newName + ".html");
        try (InputStream in = new FileInputStream(filename);
                HWPFDocument document = new HWPFDocument(in)) {
            WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                    DocumentBuilderFactory.newInstance().newDocumentBuilder()
                            .newDocument());

            // The converter does not store pictures by default; save each one to disk
            // and return its file name so the generated HTML can reference it.
            wordToHtmlConverter.setPicturesManager(new PicturesManager() {
                @Override
                public String savePicture(byte[] bytes, PictureType pictureType, String s, float v, float v1) {
                    File imageFile = new File(parentDirectory, UUID.randomUUID().toString() + s);
                    imageFile.getParentFile().mkdirs();
                    try (FileOutputStream imageOut = new FileOutputStream(imageFile)) {
                        imageOut.write(bytes);
                    } catch (IOException ex) {
                        // Best effort: a picture that cannot be saved must not abort the conversion.
                        logger.error("word转化出错!", ex);
                    }
                    return imageFile.getName();
                }
            });

            wordToHtmlConverter.processDocument(document);
            Document htmlDocument = wordToHtmlConverter.getDocument();

            Transformer serializer = TransformerFactory.newInstance().newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");

            // Write the serializer's UTF-8 bytes directly to the file instead of decoding
            // and re-encoding them with the platform default charset.
            try (FileOutputStream out = new FileOutputStream(htmlFile)) {
                serializer.transform(new DOMSource(htmlDocument), new StreamResult(out));
            }
            return htmlFile;
        } catch (IOException | TransformerException | ParserConfigurationException ex) {
            logger.error("word转化出错!", ex);
            // Report failure the same way docxConvert does, so callers do not treat it as success.
            return null;
        }
    }
    
    /**
     * 将上传的Word文档转化成HTML字符串
     * @param attachfile
     * @return
     */
    public String convertToHtml(MultipartFile attachfile) {
        String wordContent = "";
        // 将Word文件转换为html
        File file = convert(attachfile);
        // 读取html文件
        if (file != null) {
            return "文件转换成功"
        }
        return "文件转换失败";
    }

代码的含义已经在代码行的注释上有了,哪里有问题,欢迎大家随时在评论下方留言!

 

你可能感兴趣的:(项目实战)