java工具类之word转html工具类

工具类:

import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.List;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.log4j.Logger;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.BasicURIResolver;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFPictureData;
import org.jsoup.Jsoup;
import org.w3c.dom.Document;

/**
 * word转html工具类
 * 兼容2007以上版本
 * 
 * @author zql
 *
 */
public class WordToHtml {
	
	private static final Logger logger = Logger.getLogger(WordToHtml.class);

	/**
	 * 输出html文件
	 * 
	 * @param html html字符串
	 * @param path 输出的文件路径
	 */
	public static void writeFile(String html, String path) {
		FileOutputStream fos = null;
		BufferedWriter bw = null;
		
		org.jsoup.nodes.Document document = Jsoup.parse(html);
		html = document.html();
		
		try {
			File file = new File(path);
			fos = new FileOutputStream(file);
			bw = new BufferedWriter(new OutputStreamWriter(fos,"UTF-8"));
			bw.write(html);
		} catch (FileNotFoundException e) {
			logger.info("WordToHtml.writeFile occoured FileNotFoundException! Message:" + e.getMessage());
			e.printStackTrace();
		} catch (IOException e) {
			logger.info("WordToHtml.writeFile occoured IOException! Message:" + e.getMessage());
			e.printStackTrace();
		} finally {
			try {
				// 必须先关闭BufferedWriter流再关FileOutputStream流,原因涉及关流顺序
				if (bw != null) {
					bw.close();
				}
				if (fos != null) {
					fos.close();
				}
			} catch (IOException e) {
				logger.info("WordToHtml.writeFile in finally occoured IOException! Message:" + e.getMessage());
				e.printStackTrace();
			}
		}
	}
	
	/**
	 * 转换html
	 * 
	 * @param inFilePath 
	 * @param outFilePath html文件输出路径
	 * @throws IOException
	 * @throws ParserConfigurationException
	 * @throws TransformerException
	 */
	public static void convertToHtml(String inFilePath, String outFilePath) throws IOException, ParserConfigurationException, TransformerException {
		if (inFilePath.endsWith("doc")) {
			docToHtml(inFilePath, outFilePath);
		} else {
			docxToHtml(inFilePath, outFilePath);
		}
	}
	
	/**
	 * 
	 * doc转html
	 * 
	 * @param inFilePath doc文档路径
	 * @param outFilePath html文件输出路径
	 * @throws IOException
	 * @throws ParserConfigurationException
	 * @throws TransformerException
	 */
	public static void docToHtml(String inFilePath, String outFilePath)  throws IOException, ParserConfigurationException, TransformerException {
		String baseURL = "images\\";
		String path = outFilePath.substring(0, outFilePath.lastIndexOf("\\") + 1);
		String imgPath = path + baseURL;
		
		HWPFDocument wordDocument = new HWPFDocument(new FileInputStream(inFilePath));
		
		WordToHtmlConverter wthc = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
		
		String time = String.valueOf(System.currentTimeMillis());
		// 设置图片路径
		wthc.setPicturesManager(new PicturesManager() {

			public String savePicture(byte[] paramArrayOfByte, PictureType paramPictureType, String paramString,
					float paramFloat1, float paramFloat2) {
				// 返回html中图片的路径
				return  baseURL + time + paramString;
			}
			
		});
		
		wthc.processDocument(wordDocument);
		
		List<Picture> pList = wordDocument.getPicturesTable().getAllPictures();
		// 保存图片
		if (pList != null) {
			File file = new File(imgPath);
			if (!file.exists()) {
				file.mkdirs();
			}
			for (Picture p : pList) {
				p.writeImageContent(new FileOutputStream(imgPath + time + p.suggestFullFileName()));
			}
		}
		
		Document htmlDocument = wthc.getDocument();
		
		ByteArrayOutputStream out = new ByteArrayOutputStream();
		DOMSource ds = new DOMSource(htmlDocument);
		StreamResult sr = new StreamResult(out);
		
		TransformerFactory tf = TransformerFactory.newInstance();
		Transformer serializer = tf.newTransformer();
		serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
		serializer.setOutputProperty(OutputKeys.INDENT, "yes");
		serializer.setOutputProperty(OutputKeys.METHOD, "HTML");
		serializer.transform(ds, sr);
		
		writeFile(new String(out.toByteArray()), outFilePath);
		
		out.close();
	}
	
	/**
	 * docx转html
	 * 
	 * @param inFilePath docx文件路径
	 * @param outFilePath html输出文件路径
	 * @throws TransformerException
	 * @throws IOException
	 * @throws ParserConfigurationException
	 */
	public static void docxToHtml(String inFilePath, String outFilePath) throws TransformerException, IOException, ParserConfigurationException {
		String baseURL = "images";
		String path = outFilePath.substring(0, outFilePath.lastIndexOf("\\") + 1);
		
		XWPFDocument document = new XWPFDocument(new FileInputStream(inFilePath));
		// 保存图片
		List<XWPFPictureData> picList = document.getAllPictures();
		String imgPath = path + baseURL + "\\word\\media\\";
		File file = new File(imgPath);
		if (!file.exists()) {
			file.mkdirs();
		}
		for (XWPFPictureData pic : picList) {
			byte[] bytev = pic.getData();
			FileOutputStream fos = new FileOutputStream(imgPath + pic.getFileName());
			fos.write(bytev);
			fos.close();
		}
		
        XHTMLOptions options = XHTMLOptions.create().indent(4);
        
        // 保存并设置 word的html中图片的目录路径
        options.URIResolver(new BasicURIResolver(baseURL));
        File outFile = new File(outFilePath);
        outFile.getParentFile().mkdirs();
        OutputStream out = new FileOutputStream(outFile);
        
        XHTMLConverter.getInstance().convert(document, out, options);
	}
}

测试类:

import org.apache.log4j.PropertyConfigurator;

public class WordToHtmlTest {

	/**
	 * Manual smoke test: configures log4j, then converts one .doc and one
	 * .docx sample to HTML.
	 *
	 * @param args unused
	 * @throws Exception if either conversion fails
	 */
	public static void main(String[] args) throws Exception {
		// Load the log4j configuration through log4j's own configurator.
		// (The alternative is to load log4j.properties from the classpath
		// into a java.util.Properties object.)
		String log4jConfig = System.getProperty("user.dir") + "\\src\\log4j.properties";
		PropertyConfigurator.configure(log4jConfig);

		// Each row is { input Word document, output HTML file }.
		String[][] jobs = {
			{ "E:\\test\\test.doc", "E:\\test\\test1.html" },
			{ "E:\\test\\test.docx", "E:\\test\\test2.html" },
		};
		for (String[] job : jobs) {
			WordToHtml.convertToHtml(job[0], job[1]);
		}
	}
}

maven依赖:


<dependency>
    <groupId>org.slf4j</groupId>
    <artifactId>slf4j-log4j12</artifactId>
    <version>1.7.2</version>
</dependency>

<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>4.0.1</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-excelant</artifactId>
    <version>4.0.1</version>
</dependency>

<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>4.0.1</version>
</dependency>

<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
    <version>1.0.2</version>
</dependency>
   
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.7.2</version>
</dependency>

你可能感兴趣的:(#,java工具类)