工具类:
import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.List;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.log4j.Logger;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.BasicURIResolver;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFPictureData;
import org.jsoup.Jsoup;
import org.w3c.dom.Document;
/**
* word转html工具类
* 兼容2007以上版本
*
* @author zql
*
*/
public class WordToHtml {
private static final Logger logger = Logger.getLogger(WordToHtml.class);
/**
* 输出html文件
*
* @param html html字符串
* @param path 输出的文件路径
*/
public static void writeFile(String html, String path) {
FileOutputStream fos = null;
BufferedWriter bw = null;
org.jsoup.nodes.Document document = Jsoup.parse(html);
html = document.html();
try {
File file = new File(path);
fos = new FileOutputStream(file);
bw = new BufferedWriter(new OutputStreamWriter(fos,"UTF-8"));
bw.write(html);
} catch (FileNotFoundException e) {
logger.info("WordToHtml.writeFile occoured FileNotFoundException! Message:" + e.getMessage());
e.printStackTrace();
} catch (IOException e) {
logger.info("WordToHtml.writeFile occoured IOException! Message:" + e.getMessage());
e.printStackTrace();
} finally {
try {
// 必须先关闭BufferedWriter流再关FileOutputStream流,原因涉及关流顺序
if (bw != null) {
bw.close();
}
if (fos != null) {
fos.close();
}
} catch (IOException e) {
logger.info("WordToHtml.writeFile in finally occoured IOException! Message:" + e.getMessage());
e.printStackTrace();
}
}
}
/**
* 转换html
*
* @param inFilePath
* @param outFilePath html文件输出路径
* @throws IOException
* @throws ParserConfigurationException
* @throws TransformerException
*/
public static void convertToHtml(String inFilePath, String outFilePath) throws IOException, ParserConfigurationException, TransformerException {
if (inFilePath.endsWith("doc")) {
docToHtml(inFilePath, outFilePath);
} else {
docxToHtml(inFilePath, outFilePath);
}
}
/**
*
* doc转html
*
* @param inFilePath doc文档路径
* @param outFilePath html文件输出路径
* @throws IOException
* @throws ParserConfigurationException
* @throws TransformerException
*/
public static void docToHtml(String inFilePath, String outFilePath) throws IOException, ParserConfigurationException, TransformerException {
String baseURL = "images\\";
String path = outFilePath.substring(0, outFilePath.lastIndexOf("\\") + 1);
String imgPath = path + baseURL;
HWPFDocument wordDocument = new HWPFDocument(new FileInputStream(inFilePath));
WordToHtmlConverter wthc = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
String time = String.valueOf(System.currentTimeMillis());
// 设置图片路径
wthc.setPicturesManager(new PicturesManager() {
public String savePicture(byte[] paramArrayOfByte, PictureType paramPictureType, String paramString,
float paramFloat1, float paramFloat2) {
// 返回html中图片的路径
return baseURL + time + paramString;
}
});
wthc.processDocument(wordDocument);
List<Picture> pList = wordDocument.getPicturesTable().getAllPictures();
// 保存图片
if (pList != null) {
File file = new File(imgPath);
if (!file.exists()) {
file.mkdirs();
}
for (Picture p : pList) {
p.writeImageContent(new FileOutputStream(imgPath + time + p.suggestFullFileName()));
}
}
Document htmlDocument = wthc.getDocument();
ByteArrayOutputStream out = new ByteArrayOutputStream();
DOMSource ds = new DOMSource(htmlDocument);
StreamResult sr = new StreamResult(out);
TransformerFactory tf = TransformerFactory.newInstance();
Transformer serializer = tf.newTransformer();
serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
serializer.setOutputProperty(OutputKeys.INDENT, "yes");
serializer.setOutputProperty(OutputKeys.METHOD, "HTML");
serializer.transform(ds, sr);
writeFile(new String(out.toByteArray()), outFilePath);
out.close();
}
/**
* docx转html
*
* @param inFilePath docx文件路径
* @param outFilePath html输出文件路径
* @throws TransformerException
* @throws IOException
* @throws ParserConfigurationException
*/
public static void docxToHtml(String inFilePath, String outFilePath) throws TransformerException, IOException, ParserConfigurationException {
String baseURL = "images";
String path = outFilePath.substring(0, outFilePath.lastIndexOf("\\") + 1);
XWPFDocument document = new XWPFDocument(new FileInputStream(inFilePath));
// 保存图片
List<XWPFPictureData> picList = document.getAllPictures();
String imgPath = path + baseURL + "\\word\\media\\";
File file = new File(imgPath);
if (!file.exists()) {
file.mkdirs();
}
for (XWPFPictureData pic : picList) {
byte[] bytev = pic.getData();
FileOutputStream fos = new FileOutputStream(imgPath + pic.getFileName());
fos.write(bytev);
fos.close();
}
XHTMLOptions options = XHTMLOptions.create().indent(4);
// 保存并设置 word的html中图片的目录路径
options.URIResolver(new BasicURIResolver(baseURL));
File outFile = new File(outFilePath);
outFile.getParentFile().mkdirs();
OutputStream out = new FileOutputStream(outFile);
XHTMLConverter.getInstance().convert(document, out, options);
}
}
测试类:
import org.apache.log4j.PropertyConfigurator;
public class WordToHtmlTest {
    /**
     * Manual driver: configures log4j from the properties file under the
     * working directory, then converts one .doc and one .docx sample file.
     * (The properties could alternatively be loaded from the classpath with
     * java.util.Properties.)
     *
     * @param args unused
     * @throws Exception if either conversion fails
     */
    public static void main(String[] args) throws Exception {
        // Configure log4j from <working dir>\src\log4j.properties.
        String log4jConfig = System.getProperty("user.dir") + "\\src\\log4j.properties";
        PropertyConfigurator.configure(log4jConfig);

        // Each row is { input Word file, output HTML file }; rows run in order.
        String[][] conversions = {
            { "E:\\test\\test.doc", "E:\\test\\test1.html" },
            { "E:\\test\\test.docx", "E:\\test\\test2.html" },
        };
        for (String[] conversion : conversions) {
            WordToHtml.convertToHtml(conversion[0], conversion[1]);
        }
    }
}
maven依赖:
<dependency>
    <groupId>org.slf4j</groupId>
    <artifactId>slf4j-log4j12</artifactId>
    <version>1.7.2</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>4.0.1</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-excelant</artifactId>
    <version>4.0.1</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>4.0.1</version>
</dependency>
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
    <version>1.0.2</version>
</dependency>
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.7.2</version>
</dependency>