DOCX4J 解析DOCX文档字体格式

一、DOCX文档格式

docx4j只能解析docx格式的word文档,这种文档其实是一个压缩文件,解压后会有如下图所示的多个文件,文档中的字体格式主要包含在document.xml和styles.xml文件中,theme文件夹中可能也有,本文没有分析theme里面的格式。

DOCX4J 解析DOCX文档字体格式_第1张图片

document.xml中的格式定义:

DOCX4J 解析DOCX文档字体格式_第2张图片

DOCX4J 解析DOCX文档字体格式_第3张图片

二、maven引入


       
        <dependency>
            <groupId>org.docx4j</groupId>
            <artifactId>docx4j</artifactId>
            <version>3.3.5</version>
        </dependency>
       

三、测试类

getMainDocumentPart可以得到文档内容,getStyleDefinitionsPart可以得到styles的内容,经过测试发现有些格式存在styles.xml中,比如大纲、页眉、页脚等,而一般段落的格式在文档内容中document.xml,因此可以结合这两个方法去得到对应的格式。

package thesisadmin;

import org.apache.log4j.Logger;

import org.junit.Test;

import java.util.ArrayList;

import java.util.List;

import javax.xml.bind.JAXBElement;

import org.docx4j.openpackaging.packages.WordprocessingMLPackage;

import org.docx4j.openpackaging.parts.WordprocessingML.MainDocumentPart;

import org.docx4j.openpackaging.parts.WordprocessingML.StyleDefinitionsPart;

import org.docx4j.wml.Body;

import org.docx4j.wml.HpsMeasure;

import org.docx4j.wml.PPr;

import org.docx4j.wml.PPrBase.Ind;

import org.docx4j.wml.PPrBase.Spacing;

import org.docx4j.wml.ParaRPr;

import org.docx4j.wml.R;

import org.docx4j.wml.RFonts;

import org.docx4j.wml.RPr;

import org.docx4j.wml.Style;

import org.docx4j.wml.Styles;

public class MyTest {

private static final Logger logger = Logger.getLogger(MyTest.class);

/**
 * Smoke test: runs {@link #parserDocx(String)} against a hard-coded sample
 * document path. Any exception is printed to standard error rather than
 * propagated, so this test never fails on its own.
 */
@Test
public void t2() {
    final String samplePath = "c:\\b.docx";
    try {
        parserDocx(samplePath);
    } catch (Exception failure) {
        // Best-effort demo run: report the problem and carry on.
        failure.printStackTrace();
    }
}

/**
 * Loads the .docx file at the given path, prints the XML of its main
 * document part to standard output, and walks the top-level content of the
 * document body.
 *
 * @param inputfilepath path of the .docx file to load
 * @return whatever {@link #walkJAXBElements(String, List)} returns for the
 *         body's content list
 * @throws Exception if the package cannot be loaded or its XML cannot be read
 */
public ArrayList parserDocx(String inputfilepath) throws Exception {
    java.io.File docxFile = new java.io.File(inputfilepath);
    WordprocessingMLPackage pkg = WordprocessingMLPackage.load(docxFile);

    MainDocumentPart mainPart = pkg.getMainDocumentPart();
    // Dump the raw document.xml so the printed styles can be compared against it.
    System.out.println(mainPart.getXML());

    Object root = mainPart.getJaxbElement();
    Body docBody = ((org.docx4j.wml.Document) root).getBody();

    return walkJAXBElements(inputfilepath, docBody.getContent());
}

/**
 * Walks the top-level elements of a document body. For every paragraph
 * ({@code org.docx4j.wml.P}) it extracts the text via {@link #walkList(List)},
 * prints the text and the paragraph-level style properties to standard
 * output, and collects the text. Elements wrapped in a {@code JAXBElement}
 * are only reported by class name; everything else is skipped silently.
 *
 * <p>Style properties that are absent from the paragraph are printed as
 * {@code null} instead of aborting the dump for that paragraph.
 *
 * @param inputpath    path of the source document (currently unused)
 * @param bodyChildren top-level body content; {@code null} is treated as empty
 * @return the text of each successfully processed paragraph, in document order
 */
public ArrayList walkJAXBElements(String inputpath, List bodyChildren) {
    ArrayList<String> paragraphs = new ArrayList<String>();
    if (bodyChildren == null) {
        return paragraphs;
    }

    for (Object o : bodyChildren) {
        if (o instanceof javax.xml.bind.JAXBElement) {
            System.out.println("JAXBElement:" + o.getClass().getName());
        } else if (o instanceof org.docx4j.wml.P) {
            org.docx4j.wml.P p = (org.docx4j.wml.P) o;
            try {
                System.out.println("=====================");
                String paragraph = walkList(p.getContent());
                System.out.println("------------段落内容------------");
                System.out.println(paragraph);
                System.out.println("------------段落内容结束-----------");
                paragraphs.add(paragraph);

                System.out.println("------------段落样式------------");
                printParagraphStyle(p.getPPr());
                System.out.println("---------样式结束--------------");
                System.out.println("=====================");
            } catch (Exception e) {
                // Best-effort: one bad paragraph must not stop the walk. Print the
                // exception itself; getMessage() is null for e.g. NullPointerException.
                System.out.println(e);
            }
        }
    }
    return paragraphs;
}

/**
 * Prints the paragraph-level style properties of {@code ppr}. Each property
 * is looked up null-safely; a missing property is printed as {@code null}.
 * Nothing is printed when {@code ppr} itself is {@code null}.
 */
private void printParagraphStyle(PPr ppr) {
    if (ppr == null) {
        return;
    }

    ParaRPr prpr = ppr.getRPr();
    RFonts rfs = (prpr == null) ? null : prpr.getRFonts();
    HpsMeasure hps = (prpr == null) ? null : prpr.getSz();
    org.docx4j.wml.Color color = (prpr == null) ? null : prpr.getColor();

    System.out.println("字体Ascii:" + (rfs == null ? null : rfs.getAscii()));
    System.out.println("字体HAnsi:" + (rfs == null ? null : rfs.getHAnsi()));
    System.out.println("字体大小:" + (hps == null ? null : hps.getVal()));
    System.out.println("字体颜色:" + (color == null ? null : color.getVal()));

    Ind ind = ppr.getInd();
    System.out.println("左缩进:" + (ind == null ? null : ind.getLeftChars()));

    Spacing sp = ppr.getSpacing();
    System.out.println("行距:" + (sp == null ? null : sp.getLine()));
}

/**
 * Concatenates the text found in a list of paragraph or run children.
 *
 * <ul>
 *   <li>A {@code JAXBElement} declared as {@code org.docx4j.wml.Text}
 *       contributes its text value; one declared as
 *       {@code org.docx4j.wml.Drawing} is only reported on standard output.</li>
 *   <li>A run ({@code org.docx4j.wml.R}) is walked recursively. If the run
 *       has an underline property, every character of its text is prefixed
 *       with {@code '_'}.</li>
 *   <li>DOM nodes and any other objects are reported as ignored.</li>
 * </ul>
 *
 * @param children content list to walk; {@code null} is treated as empty
 * @return the concatenated text, or an empty string if there is none
 */
public String walkList(List children) {
    if (children == null) {
        return "";
    }

    StringBuilder line = new StringBuilder();
    for (Object o : children) {
        if (o instanceof javax.xml.bind.JAXBElement) {
            JAXBElement<?> element = (JAXBElement<?>) o;
            String typeName = element.getDeclaredType().getName();
            if ("org.docx4j.wml.Text".equals(typeName)) {
                org.docx4j.wml.Text t = (org.docx4j.wml.Text) element.getValue();
                line.append(t.getValue());
            } else if ("org.docx4j.wml.Drawing".equals(typeName)) {
                System.out.println("find img");
            }
        } else if (o instanceof org.w3c.dom.Node) {
            System.out.println(" IGNORED " + ((org.w3c.dom.Node) o).getNodeName());
        } else if (o instanceof org.docx4j.wml.R) {
            R run = (R) o;
            // A run may carry no run properties at all; treat that as "not underlined"
            // instead of dereferencing null.
            RPr rPr = run.getRPr();
            org.docx4j.wml.U u = (rPr == null) ? null : rPr.getU();

            String runText = walkList(run.getRunContent());
            if (u != null) {
                // Mark underlined text by prefixing each character with '_'.
                for (int i = 0; i < runText.length(); i++) {
                    line.append('_').append(runText.charAt(i));
                }
            } else {
                line.append(runText);
            }
        } else {
            System.out.println(" IGNORED " + o.getClass().getName());
        }
    }
    return line.toString();
}

@Test

public void t3() {

try {

WordprocessingMLPackage wordMLPackage = WordprocessingMLPackage

.load(new java.io.File("c:\\b.docx"));

StyleDefinitionsPart styleDefinitionsPart =

wordMLPackage.getMainDocumentPart().getStyleDefinitionsPart(true);

Styles styles = styleDefinitionsPart.getContents();

System.out.println(styleDefinitionsPart.getXML());

try {

String defaultParagraphStyleId = styleDefinitionsPart.getDefaultParagraphStyle().getStyleId();

System.out.println("defaultParagraphStyleId:"+defaultParagraphStyleId);

} catch (NullPointerException npe) {

System.out.println("No default paragraph style!!");

}

try {

String defaultCharacterStyleId = styleDefinitionsPart.getDefaultCharacterStyle().getStyleId();

System.out.println("defaultCharacterStyleId:"+defaultCharacterStyleId);

} catch (NullPointerException npe) {

System.out.println("No default character style!!");

}

List