word转html包含公式和图片

## 把docx里面的文字、公式和图片转成html,大体需要如下步骤

  1. 把docx文件读取为POI的XWPFDocument
    InputStream is=new FileInputStream("d:\\1.docx");
    XWPFDocument docx = new XWPFDocument(is);
  1. 得到内容的列表,包括XWPFParagraph和XWPFTable,通过BodyElementType区分
// Fetch the body elements and dispatch each one according to its element type.
List<IBodyElement> eles = docx.getBodyElements();
for (IBodyElement e : eles) {
	if (e.getElementType() == BodyElementType.PARAGRAPH) {
		XWPFParagraph p = (XWPFParagraph) e;
		handleParagraph(p);
	} else if (e.getElementType() == BodyElementType.TABLE) {
		XWPFTable t = (XWPFTable) e;
		handleTable(t);
	}
}
  1. 得到XWPFParagraph 后,通过如下两个方法得到XWPFParagraph里面的具体内容
List runs = p.getRuns();//文本和图片
List pics = run.getEmbeddedPictures();//得到所有图片
再把图片保存起来就可以了。
List<CTOMath> oMathList = p.getCTP().getOMathList(); // the formulas (OMML) of the paragraph
//公式这个就复杂了CTOMath属于XmlObject形式的xml文件,属于OMML,要先转成MathML,再把MathML转成png,保存到硬盘上。
//把XmlObject转成MathML
/**
 * Converts an OMML formula into MathML markup usable inside HTML.
 *
 * <p>The DOM node of {@code xmlObject} is run through the {@code OMML2MML.XSL}
 * stylesheet loaded from the classpath; the XML declaration is omitted and the
 * XML-specific namespace prefixes are stripped from the result.
 *
 * @param xmlObject the OMML formula whose DOM node is transformed
 * @return the MathML markup without the {@code mml:} prefix
 * @throws Exception if the stylesheet is missing or the transformation fails
 */
private static String getMathML(XmlObject xmlObject) throws Exception {
	final String xslFile = "/cn/com/eduedu/jee/util/OMML2MML.XSL";

	String mathML;
	// Close the stylesheet stream deterministically instead of leaking it on every call.
	try (InputStream xslStream = MSDocxUtils.class.getResourceAsStream(xslFile)) {
		if (xslStream == null) {
			// getResourceAsStream signals a missing resource with null; fail with a clear message.
			throw new IllegalStateException("Stylesheet not found on classpath: " + xslFile);
		}
		Transformer transformer = TransformerFactory.newInstance().newTransformer(new StreamSource(xslStream));
		transformer.setOutputProperty("omit-xml-declaration", "yes");

		StringWriter stringwriter = new StringWriter();
		transformer.transform(new DOMSource(xmlObject.getDomNode()), new StreamResult(stringwriter));
		mathML = stringwriter.toString();
	}

	// The native OMML2MML.XSL transforms OMML into MathML as XML having special
	// name spaces. The MathML is wanted for HTML, not XML, so ideally the
	// stylesheet itself would be changed; to keep things simple the XML
	// specialities are removed with plain (non-regex) replacements instead.
	mathML = mathML.replace("xmlns:m=\"http://schemas.openxmlformats.org/officeDocument/2006/math\"", "");
	mathML = mathML.replace("xmlns:mml", "xmlns");
	mathML = mathML.replace("mml:", "");
	return mathML;
}
//MathML转成Document 
/**
 * Parses an XML string into a DOM {@link Document}.
 *
 * <p>External entities and external DTDs are not loaded, so markup carrying a
 * hostile DOCTYPE cannot pull in local files or remote resources.
 *
 * @param xmlStr the XML markup to parse
 * @return the parsed document, or {@code null} when the parser cannot be
 *         configured or the markup cannot be parsed (the stack trace is printed)
 */
private static Document convertStringToDocument(String xmlStr) {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        try {
            // XXE hardening: never resolve external entities or fetch external DTDs.
            factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
            factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
            factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);

            DocumentBuilder builder = factory.newDocumentBuilder();
            return builder.parse(new InputSource(new StringReader(xmlStr)));
        } catch (Exception e) {
            // Existing contract: report the failure and signal it with null.
            e.printStackTrace();
            return null;
        }
    }
//最后的代码是这样的
/**
 * Renders one OMML formula as a PNG image and passes the image bytes to the
 * given image parser under the name {@code png_<counter>.png}.
 *
 * <p>The formula is converted to MathML, parsed into a DOM document and
 * rendered with the JEuclid converter at math size 18.
 *
 * @param xmlObject   the OMML formula to render
 * @param imageParser receives the PNG bytes together with the generated file name
 * @return always the empty string
 * @throws Exception if the MathML cannot be produced or parsed, or rendering fails
 */
private static String convertOmathToPng(XmlObject xmlObject,MSDocxToHtmlImageParser imageParser) throws Exception {
	pngNumber++;
	Document document = convertStringToDocument(getMathML(xmlObject));
	if (document == null) {
		// convertStringToDocument reports a parse failure with null; stop here with a
		// clear message instead of handing null to the renderer.
		throw new IllegalStateException("MathML of formula " + pngNumber + " could not be parsed");
	}

	LayoutContextImpl layoutContext = new LayoutContextImpl(LayoutContextImpl.getDefaultLayoutContext());
	layoutContext.setParameter(Parameter.MATHSIZE, 18);

	try (ByteArrayOutputStream os = new ByteArrayOutputStream()) {
		Converter.getInstance().convert(document, os, "image/png", layoutContext);
		// NOTE(review): the value returned by parse() is discarded and the method
		// returns "" — confirm whether callers expect the stored image name instead.
		imageParser.parse(os.toByteArray(), "png_" + pngNumber + ".png");
	}
	return "";
}

### 用到的所有包

 
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.io.StringWriter;
import java.math.BigInteger;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;

import org.apache.poi.POIXMLProperties;
import org.apache.poi.xwpf.usermodel.BodyElementType;
import org.apache.poi.xwpf.usermodel.IBodyElement;
import org.apache.poi.xwpf.usermodel.UnderlinePatterns;
import org.apache.poi.xwpf.usermodel.VerticalAlign;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFPicture;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.apache.poi.xwpf.usermodel.XWPFStyles;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.poi.xwpf.usermodel.XWPFTableCell;
import org.apache.poi.xwpf.usermodel.XWPFTableRow;
import org.apache.xmlbeans.XmlObject;
import org.openxmlformats.schemas.officeDocument.x2006.math.CTOMath;
import org.openxmlformats.schemas.officeDocument.x2006.math.CTOMathPara;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

import cn.com.eduedu.jee.util.wordnumber.IWordNumber;
import cn.com.eduedu.jee.util.wordnumber.WordNumberFactory;
import net.sourceforge.jeuclid.context.LayoutContextImpl;
import net.sourceforge.jeuclid.context.Parameter;
import net.sourceforge.jeuclid.converter.Converter;

这里有更详细的文章
这里有更详细的文章,能解决更多的问题

你可能感兴趣的:(java)