title: java读取word转换HTML格式,保留内容的样式和格式
date: 2023-08-11
categories:
pom依赖
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.15</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.15</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml-schemas</artifactId>
    <version>3.15</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.17</version>
</dependency>
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
    <version>1.0.6</version>
</dependency>
java
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.fasterxml.jackson.core.JsonProcessingException;
import org.apache.commons.fileupload.FileItem;
import org.apache.commons.fileupload.FileItemFactory;
import org.apache.commons.fileupload.disk.DiskFileItemFactory;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.xwpf.converter.core.BasicURIResolver;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.springframework.web.multipart.MultipartFile;
import org.springframework.web.multipart.commons.CommonsMultipartFile;
import org.w3c.dom.Document;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * Converts Word documents ({@code .doc} and {@code .docx}) to HTML text, keeping the
 * styling and formatting of the content.
 *
 * @author: konglignxin
 * @date: 2023-04-04 16:54
 */
public class Test {

    private static final Logger LOGGER = Logger.getLogger(Test.class.getName());

    /**
     * Converts an uploaded binary Word document ({@code .doc}) to HTML.
     *
     * <p>Conversion failures are logged and reported to the caller as an empty string;
     * they are not rethrown.
     *
     * @param file the uploaded {@code .doc} file
     * @return the generated HTML, or {@code ""} if the document could not be converted
     * @throws Exception declared for backward compatibility with existing callers
     */
    public static String docToHtmlText(MultipartFile file) throws Exception {
        try (InputStream in = file.getInputStream();
             ByteArrayOutputStream htmlBytes = new ByteArrayOutputStream()) {
            // Feed the uploaded file into the Word-to-HTML converter.
            HWPFDocument wordDocument = new HWPFDocument(in);
            Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
            WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(document);

            // TODO: to keep embedded pictures, register a pictures manager that uploads each
            // picture to your own file server and returns the resulting address, e.g.:
            // wordToHtmlConverter.setPicturesManager((imageStream, pictureType, name, width, height) -> {
            //     if (pictureType.equals(PictureType.UNKNOWN)) {
            //         return "[unrecognized picture]";
            //     }
            //     // upload the picture content here and return the uploaded address
            //     return uploadedUrl;
            // });

            // Convert the Word document into an HTML DOM.
            wordToHtmlConverter.processDocument(wordDocument);
            Document htmlDocument = wordToHtmlConverter.getDocument();

            Transformer serializer = TransformerFactory.newInstance().newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");
            // Serialize straight into the byte buffer so nothing is left in an unflushed wrapper.
            serializer.transform(new DOMSource(htmlDocument), new StreamResult(htmlBytes));

            // The serializer was told to emit UTF-8, so decode with the same charset
            // instead of the platform default.
            return new String(htmlBytes.toByteArray(), StandardCharsets.UTF_8);
        } catch (Exception e) {
            LOGGER.log(Level.WARNING, "Failed to convert .doc document to HTML", e);
            return "";
        }
    }

    /**
     * Converts an uploaded {@code .docx} document to HTML.
     *
     * <p>Images are extracted into a directory below {@code java.io.tmpdir} during the
     * conversion; that directory is removed again before this method returns, whether or
     * not the conversion succeeded. Conversion failures are logged and reported to the
     * caller as an empty string; they are not rethrown.
     *
     * @param file the uploaded {@code .docx} file
     * @return the generated HTML, or {@code ""} if the document could not be converted
     * @throws Exception declared for backward compatibility with existing callers
     */
    public static String docxToHtmlText(MultipartFile file) throws Exception {
        // Directory that receives the extracted images for this call.
        String tempRoot = System.getProperty("java.io.tmpdir");
        String imageBasePath = tempRoot + "/" + System.currentTimeMillis();
        try (InputStream in = file.getInputStream();
             ByteArrayOutputStream htmlStream = new ByteArrayOutputStream()) {
            // Feed the uploaded file into the docx converter.
            XWPFDocument docxDocument = new XWPFDocument(in);
            XHTMLOptions options = XHTMLOptions.create();
            options.setExtractor(new FileImageExtractor(new File(imageBasePath)));
            options.URIResolver(new BasicURIResolver(imageBasePath));

            // Convert to HTML.
            docxDocument.createNumbering();
            XHTMLConverter.getInstance().convert(docxDocument, htmlStream, options);

            // NOTE(review): decoded with the platform default charset, exactly as before;
            // confirm which encoding XHTMLConverter writes to an OutputStream before pinning one.
            String html = htmlStream.toString();
            return relinkExtractedImages(html, imageBasePath + "/word/media");
        } catch (Exception e) {
            LOGGER.log(Level.WARNING, "Failed to convert .docx document to HTML", e);
            return "";
        } finally {
            // Remove the extracted images even when the conversion failed.
            deleteTempDirectory(new File(imageBasePath));
        }
    }

    /**
     * Hook for replacing the temporary image paths in the HTML with permanent addresses.
     * Until an upload step is implemented the HTML is returned unchanged.
     *
     * @param html         the converted HTML
     * @param imageDirPath directory the converter extracted the images into
     * @return the HTML with image links rewritten (currently identical to {@code html})
     */
    private static String relinkExtractedImages(String html, String imageDirPath) {
        String[] imageNames = new File(imageDirPath).list();
        if (imageNames == null) {
            // The directory does not exist, i.e. no images were extracted.
            return html;
        }
        String result = html;
        for (String imageName : imageNames) {
            // Built with "/" so that it can be matched against the path used when extracting.
            String imagePath = imageDirPath + "/" + imageName;
            if (!new File(imagePath).exists()) {
                continue;
            }
            // TODO: upload the image at imagePath to your own file server, then point the
            // HTML at the uploaded copy:
            // result = result.replace(imagePath, uploadedUrl);
        }
        return result;
    }

    /**
     * Deletes the temporary image directory; a failure is logged but never propagated,
     * so it cannot discard an otherwise successful conversion result.
     */
    private static void deleteTempDirectory(File directory) {
        if (!directory.exists()) {
            return;
        }
        try {
            FileUtils.deleteDirectory(directory);
        } catch (IOException | IllegalArgumentException e) {
            LOGGER.log(Level.WARNING, "Could not delete temporary image directory " + directory, e);
        }
    }

    /**
     * Converts a Word file to HTML and prints the result to standard output.
     *
     * @param args optional; {@code args[0]} is the path of the document to convert
     *             (defaults to {@code E:\temp.docx}). Paths ending in {@code .doc} go through
     *             {@link #docToHtmlText}, everything else through {@link #docxToHtmlText}.
     */
    public static void main(String[] args) {
        String path = (args != null && args.length > 0) ? args[0] : "E:\\temp.docx";
        try {
            MultipartFile upload = getMulFileByPath(path);
            // Case-insensitive suffix check; a too-short path simply yields false.
            boolean legacyDoc = path.regionMatches(true, path.length() - 4, ".doc", 0, 4);
            String content = legacyDoc ? docToHtmlText(upload) : docxToHtmlText(upload);
            // Optional post-processing (e.g. unwrapping the outer div or removing the head
            // element with an HTML utility) can be applied to content here.
            System.out.println(content);
        } catch (Exception e) {
            LOGGER.log(Level.SEVERE, "Conversion of " + path + " failed", e);
        }
    }

    /**
     * Wraps a file on disk in a {@link MultipartFile}.
     *
     * @param picPath path of the file to wrap
     * @return a multipart file backed by a copy of the file's content
     */
    public static MultipartFile getMulFileByPath(String picPath) {
        return new CommonsMultipartFile(createFileItem(picPath));
    }

    /**
     * Creates a {@link FileItem} and copies the content of the given file into it.
     *
     * <p>If reading the file fails, the error is logged and the item is still returned;
     * it may then hold no or incomplete content.
     *
     * @param filePath path of the file to copy
     * @return the populated file item, named {@code MyFileName} plus the file's extension
     */
    private static FileItem createFileItem(String filePath) {
        FileItemFactory factory = new DiskFileItemFactory(16, null);
        File source = new File(filePath);

        // Take the extension from the file name only, and tolerate names without one.
        String fileName = source.getName();
        int dot = fileName.lastIndexOf('.');
        String extension = dot >= 0 ? fileName.substring(dot) : "";

        FileItem item = factory.createItem("textField", "text/plain", true, "MyFileName" + extension);
        byte[] buffer = new byte[8192];
        try (InputStream in = new FileInputStream(source);
             OutputStream out = item.getOutputStream()) {
            int bytesRead;
            while ((bytesRead = in.read(buffer, 0, buffer.length)) != -1) {
                out.write(buffer, 0, bytesRead);
            }
        } catch (IOException e) {
            LOGGER.log(Level.WARNING, "Failed to read " + filePath, e);
        }
        return item;
    }
}