需求:将上传的word文档转为html并返回页面填充到富文本编辑器中
可选方案:
1. 使用OpenOffice转换:存在图片错位的问题;
2. 使用POI将word转为html:word中的图片格式多样,其中wmf、emf等格式无法在页面上显示;
3. 限定上传文件为docx:将后缀改为zip后解压,可得到全部图片且格式均为png,但html仍需通过其他方法生成,再修改img标签中的图片路径。
本文采用方案2(使用POI将word转为html),图片格式问题将在后续文章中解决。
使用Maven导入jar包
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.14</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.14</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.14</version>
</dependency>
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>xdocreport</artifactId>
    <version>1.0.6</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml-schemas</artifactId>
    <version>3.14</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>ooxml-schemas</artifactId>
    <version>1.3</version>
</dependency>
PoiUtil.java
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.BasicURIResolver;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;
/**
 * Created by will on 2017/6/9.
 * Converts Word documents (.doc / .docx) to HTML with Apache POI and returns the
 * generated markup as a string. Each conversion writes a temporary HTML file next to
 * the extracted images, reads it back, and deletes it again.
 */
public class PoiUtil {

    /** URL prefix under which extracted images are referenced from the generated HTML. */
    private static final String IMAGE_URL_ROOT = "/upload/";

    /** Decimal HTML character reference such as {@code &#20013;}; group 1 is the number. */
    private static final Pattern NUMERIC_ENTITY = Pattern.compile("&#(\\d+);");

    /**
     * Converts a Word file to HTML and strips tags/attributes via {@code RegexAnswerUtil.clear}.
     *
     * <p>{@code sourceFileName} is concatenated into file-system paths and image URLs, so
     * callers must pass a sanitized name (no path separators or {@code ..} segments).
     *
     * @param source         the Word file
     * @param sourceFileName the Word file name; its extension (matched case-insensitively)
     *                       selects the converter
     * @param savePath       directory under which a per-document image folder is created
     * @return the filtered HTML, or the literal {@code "文件类型错误"} when the file name
     *         ends with neither {@code .doc} nor {@code .docx}
     * @throws Exception if the conversion fails
     */
    public static String getHtml(File source, String sourceFileName, String savePath) throws Exception {
        // Validate the extension before any substring work so a name without a dot
        // is reported as an unsupported type instead of throwing.
        String lowerName = sourceFileName.toLowerCase(Locale.ROOT);
        boolean isDoc = lowerName.endsWith(".doc");
        boolean isDocx = lowerName.endsWith(".docx");
        if (!isDoc && !isDocx) {
            return "文件类型错误";
        }
        String imagePathStr = savePath + File.separator + baseName(sourceFileName) + File.separator;
        String content = isDoc
                ? docToStr(source, sourceFileName, imagePathStr)
                : docxToStr(source, sourceFileName, imagePathStr);
        // Filter out unneeded tags and attributes.
        return RegexAnswerUtil.clear(content);
    }

    /**
     * Converts a binary {@code .doc} file to HTML.
     *
     * @param source         the Word file
     * @param sourceFileName the Word file name; its base name names the temporary HTML file
     *                       and the image URL prefix ({@code /upload/<baseName>})
     * @param imagePathStr   path prefix for extracted pictures and the temporary HTML file;
     *                       {@link #getHtml} passes a directory path ending with a separator
     * @return the generated HTML with blank lines removed, or {@code ""} if the temporary
     *         file could not be read back
     * @throws Exception if reading the document or serializing the HTML fails
     */
    public static String docToStr(File source, String sourceFileName, String imagePathStr) throws Exception {
        String baseName = baseName(sourceFileName);
        String imageUrlPrefix = IMAGE_URL_ROOT + baseName;
        String targetFileName = imagePathStr + baseName + ".html";
        File target = new File(targetFileName);
        createParentDirs(target);
        try {
            Document htmlDocument;
            try (InputStream in = new FileInputStream(source)) {
                HWPFDocument wordDocument = new HWPFDocument(in);
                WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                        DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
                // Save each picture to disk and return the relative URL used in the <img> tag.
                wordToHtmlConverter.setPicturesManager((content, pictureType, name, width, height) -> {
                    try (FileOutputStream out = new FileOutputStream(new File(imagePathStr + name))) {
                        out.write(content);
                    } catch (IOException e) {
                        // Best effort: a picture that cannot be written must not abort the conversion.
                        e.printStackTrace();
                    }
                    return imageUrlPrefix + "/" + name;
                });
                wordToHtmlConverter.processDocument(wordDocument);
                htmlDocument = wordToHtmlConverter.getDocument();
            }

            Transformer serializer = TransformerFactory.newInstance().newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");
            // Write through an explicit stream so the file handle is closed deterministically.
            try (OutputStream out = new FileOutputStream(target)) {
                serializer.transform(new DOMSource(htmlDocument), new StreamResult(out));
            }
            return splitContext(targetFileName);
        } finally {
            // Remove the temporary HTML file even when the conversion failed (best effort).
            target.delete();
        }
    }

    /**
     * Converts a {@code .docx} file to HTML.
     *
     * @param source         the Word file
     * @param sourceFileName the Word file name; its base name names the temporary HTML file
     *                       and the image URL prefix ({@code /upload/<baseName>})
     * @param imagePathStr   folder into which images are extracted; also the path prefix of
     *                       the temporary HTML file
     * @return the generated HTML with blank lines removed, or {@code ""} if the temporary
     *         file could not be read back
     * @throws Exception if reading or converting the document fails
     */
    public static String docxToStr(File source, String sourceFileName, String imagePathStr) throws Exception {
        String baseName = baseName(sourceFileName);
        String imageUrlPrefix = IMAGE_URL_ROOT + baseName;
        String targetFileName = imagePathStr + baseName + ".html";
        File target = new File(targetFileName);
        createParentDirs(target);
        try {
            try (InputStream in = new FileInputStream(source);
                 Writer out = new OutputStreamWriter(new FileOutputStream(target), StandardCharsets.UTF_8)) {
                XWPFDocument document = new XWPFDocument(in);
                XHTMLOptions options = XHTMLOptions.create();
                // Folder the images are extracted into.
                options.setExtractor(new FileImageExtractor(new File(imagePathStr)));
                // Image path used inside the generated HTML.
                options.URIResolver(new BasicURIResolver(imageUrlPrefix));
                XHTMLConverter xhtmlConverter = (XHTMLConverter) XHTMLConverter.getInstance();
                xhtmlConverter.convert(document, out, options);
            }
            return splitContext(targetFileName);
        } finally {
            // Remove the temporary HTML file even when the conversion failed (best effort).
            target.delete();
        }
    }

    /**
     * Replaces decimal HTML character references ({@code &#NNN;}) with the characters they
     * denote. Converting docx files produces such references; inside a rich-text editor
     * they can usually be left untouched. References whose number is not a valid Unicode
     * code point are left as they are.
     *
     * @param html the markup to decode
     * @return {@code html} with every valid decimal character reference decoded
     */
    public static String htmlEncoding(String html) {
        Matcher m = NUMERIC_ENTITY.matcher(html);
        StringBuffer sb = new StringBuffer();
        while (m.find()) {
            // quoteReplacement: the decoded character may itself be '$' or '\\'.
            m.appendReplacement(sb, Matcher.quoteReplacement(decodeEntity(m.group(1), m.group())));
        }
        // Keep the text after the last reference (or the whole input when none was found).
        m.appendTail(sb);
        return sb.toString();
    }

    /**
     * Reads the generated HTML file as UTF-8 and drops empty lines. Every kept line is
     * terminated with {@code '\n'}.
     *
     * @param filePath path of the HTML file
     * @return the file content without empty lines, or {@code ""} if the file cannot be read
     */
    public static String splitContext(String filePath) {
        StringBuilder sb = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(new File(filePath)), StandardCharsets.UTF_8))) {
            String line;
            // Read line by line until readLine() returns null at end of file.
            while ((line = reader.readLine()) != null) {
                if (!line.isEmpty()) {
                    sb.append(line).append('\n');
                }
            }
            return sb.toString();
        } catch (IOException e) {
            e.printStackTrace();
            return "";
        }
    }

    /** Returns {@code fileName} without its last extension, or unchanged if it has none. */
    private static String baseName(String fileName) {
        int dot = fileName.lastIndexOf('.');
        return dot < 0 ? fileName : fileName.substring(0, dot);
    }

    /** Creates the parent directory of {@code target} if it does not exist yet. */
    private static void createParentDirs(File target) throws IOException {
        File parent = target.getParentFile();
        if (parent != null && !parent.mkdirs() && !parent.isDirectory()) {
            throw new IOException("Cannot create directory " + parent);
        }
    }

    /** Decodes one decimal reference; returns {@code original} when the number is not a code point. */
    private static String decodeEntity(String digits, String original) {
        try {
            int codePoint = Integer.parseInt(digits);
            if (Character.isValidCodePoint(codePoint)) {
                return new String(Character.toChars(codePoint));
            }
        } catch (NumberFormatException ignored) {
            // More digits than fit in an int: not a code point, keep the text unchanged.
        }
        return original;
    }
}
RegexAnswerUtil.java
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Created by will on 2017/6/9.
 * Removes unneeded tags and attributes from generated HTML.
 */
public class RegexAnswerUtil {

    // NOTE(review): the tag name in this pattern was lost in the published source
    // (only "]+?>" survived); "img" is reconstructed from the original variable name
    // "imageRegex" - confirm that dropping every <img> tag is the intended behavior.
    private static final Pattern IMAGE_TAG = Pattern.compile("<img[^>]+?>");

    /** An opening tag other than img that carries attributes; group 1 is the tag name. */
    private static final Pattern ATTRIBUTED_TAG = Pattern.compile("<(?!img)(\\w+)\\s[^>]+>");

    /**
     * Reduces an HTML document to the content of its body, removes the tags matched by
     * the image pattern, and strips all attributes from the remaining opening tags.
     *
     * @param returnString the HTML string; {@code null} is treated as empty
     * @return the filtered HTML string
     */
    public static String clear(String returnString) {
        if (returnString == null) {
            return "";
        }
        String body = extractBody(returnString);
        body = IMAGE_TAG.matcher(body).replaceAll("");
        // One pass over the text replaces the per-match replaceAll/replace loop.
        return ATTRIBUTED_TAG.matcher(body).replaceAll("<$1>");
    }

    /**
     * Returns the text between the end of the opening body tag and the closing body tag.
     * A missing opening tag means "start at 0"; a missing closing tag means "to the end".
     */
    private static String extractBody(String html) {
        int bodyTag = html.indexOf("<body");
        // indexOf yields -1 when the tag is never closed, which makes start 0.
        int start = bodyTag == -1 ? 0 : html.indexOf(">", bodyTag) + 1;
        // Search from start so a stray closing tag before the body cannot invert the range.
        int close = html.indexOf("</body>", start);
        int end = close == -1 ? html.length() : close;
        return html.substring(start, end);
    }
}