import com.sinitek.sirm.web.plm.funddate.MatchingObject;
import org.apache.log4j.Logger;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.model.StyleDescription;
import org.apache.poi.hwpf.model.StyleSheet;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.xwpf.usermodel.*;
import org.apache.xmlbeans.XmlException;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.*;
import java.io.*;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.List;
public class ParseWordUtil {
// log4j logger shared by every static method of this class.
private static final Logger LOGGER = Logger.getLogger(ParseWordUtil.class);
// Document-wide Word styles; stays null until getWordStyle(String) stores a template's styles here.
private static CTStyles wordStyles = null;
/**
 * Loads the document at {@code filepath} as an {@code XWPFDocument} and stores its
 * styles in the static {@code wordStyles} field.
 *
 * <p>Failures are logged and not rethrown; in that case {@code wordStyles} keeps its
 * previous value. The input stream is always closed, whether or not loading succeeds.
 *
 * @param filepath path of the template document to read the styles from
 */
public static void getWordStyle(String filepath) {
    FileInputStream in = null;
    try {
        // Read the template document.
        in = new FileInputStream(filepath);
        XWPFDocument template = new XWPFDocument(in);
        // Keep the template's document-wide styles.
        wordStyles = template.getStyle();
    } catch (FileNotFoundException e) {
        LOGGER.error("未找到文件",e);
    } catch (IOException e) {
        LOGGER.error("",e);
    } catch (XmlException e) {
        LOGGER.error("XML转换异常",e);
    } finally {
        // Release the file handle on every path; the original never closed it.
        if (in != null) {
            try {
                in.close();
            } catch (IOException e) {
                LOGGER.error("流关闭异常",e);
            }
        }
    }
}
/**
 * Reads the titles of a Word document by delegating to the reader that matches the
 * file's extension.
 *
 * <p>An extension of ".docx" selects {@code getWordTitles2007}. The comparison ignores
 * case, so a file named "REPORT.DOCX" is no longer sent to the binary .doc reader.
 * Every other extension is handed to {@code getWordTitlesAndContext2003} with mode 1.
 *
 * @param filepath path of the Word file
 * @return the list produced by the selected reader
 * @throws IOException if the selected reader fails to open or read the file
 */
public static List getWordTitles(String filepath) throws IOException {
    String extension = getWordVersion(filepath);
    if (".docx".equalsIgnoreCase(extension)) {
        return getWordTitles2007(filepath);
    }
    // Mode 1 = titles only (2 = content only, 3 = titles and content).
    return getWordTitlesAndContext2003(filepath, 1);
}
/**
 * Reads the text of a Word document by delegating to the reader that matches the
 * file's extension.
 *
 * <p>An extension of ".docx" selects {@code getParagraphText2007}. The comparison
 * ignores case, so a file named "REPORT.DOCX" is no longer sent to the binary .doc
 * reader. Every other extension is handed to {@code getWordTitlesAndContext2003}
 * with mode 3.
 *
 * @param filepath path of the Word file
 * @return the list produced by the selected reader
 * @throws Exception if the selected reader fails to open or read the file
 */
public static List getWordText(String filepath) throws Exception {
    String extension = getWordVersion(filepath);
    if (".docx".equalsIgnoreCase(extension)) {
        return getParagraphText2007(filepath);
    }
    // Mode 3 = titles and content (1 = titles only, 2 = content only).
    return getWordTitlesAndContext2003(filepath, 3);
}
/**
 * Returns the extension of the file named by {@code filepath}, including the leading
 * dot (for example ".docx"). Only the last path element is inspected, so dots in
 * directory names are ignored.
 *
 * <p>The Word 97 format is treated as obsolete; callers only distinguish the 2003 and
 * 2007 formats.
 *
 * @param filepath path of the file
 * @return the text from the last '.' of the file name to its end, or an empty string
 *         when the file name contains no '.'
 */
public static String getWordVersion(String filepath) {
    String name = new File(filepath).getName();
    int dot = name.lastIndexOf('.');
    // lastIndexOf yields -1 when there is no dot; substring(-1) would throw
    // StringIndexOutOfBoundsException, so report "no extension" instead.
    return dot < 0 ? "" : name.substring(dot);
}
/**
 * Reads a Word 2003 document through HWPFDocument and walks every paragraph of its
 * range, collecting the paragraph text into three lists: all paragraphs, paragraphs
 * whose style name contains "标题", and paragraphs whose style name contains "正文".
 *
 * NOTE(review): the text of this method appears to be damaged further down (see the
 * note above the picture-numbering section). In the visible text {@code type} is
 * never read and only the all-paragraphs list is returned - confirm against version
 * control before relying on this description.
 *
 * @param path file path
 * @param type 1: titles only; 2: content only; 3: both titles and content
 * @return list
 * @throws IOException
 */
public static List getWordTitlesAndContext2003(String path, Integer type) throws IOException {
InputStream is = new FileInputStream(path);
HWPFDocument doc = new HWPFDocument(is);
Range r = doc.getRange();
List list = new ArrayList();
List titles = new ArrayList();
List context = new ArrayList();
for (int i = 0; i < r.numParagraphs(); i++) {
Paragraph p = r.getParagraph(i);
// check if style index is greater than total number of styles
int numStyles = doc.getStyleSheet().numStyles();
int styleIndex = p.getStyleIndex();
String contexts = p.text();
list.add(contexts); // titles + content
if (numStyles > styleIndex) {
StyleSheet style_sheet = doc.getStyleSheet();
StyleDescription style = style_sheet.getStyleDescription(styleIndex);
String styleName = style.getName();
// Classify the paragraph by a substring match on its style name.
if (styleName != null && styleName.contains("标题")) {
String text = p.text();
titles.add(text);
} else if (styleName != null && styleName.contains("正文")) {
String text = p.text();
context.add(text);
}
}
}
// Get the Word data stream.
byte [] dataStream = doc.getDataStream();
// Number of character runs in the range (not referenced again in the visible code).
int numCharacterRuns = r.numCharacterRuns();
// System.out.println("CharacterRuns count:"+numCharacterRuns);
// Handles picture extraction and determining whether a part of the file contains embedded pictures.
PicturesTable table = new PicturesTable(doc, dataStream, null, null, null);
// Picture index within the document.
// NOTE(review): from the next line on, the text looks truncated: a block comment opens
// and the header of getWordTitles2007 is fused into it. Left byte-identical on purpose;
// restore the lost lines from version control.
/*int i = 1;
for(int j=0 ; j getWordTitles2007(String path) throws IOException {
InputStream is = new FileInputStream(path);
XWPFDocument doc = new XWPFDocument(is);
//HWPFDocument doc = new HWPFDocument(is);
//Range r = doc.getRange();
List listRun;
List listParagraphs = doc.getParagraphs();//得到段落信息
List list = new ArrayList();
/*for (int i = 0; i paras = doc.getParagraphs();
for (XWPFParagraph para : paras) {
// 当前段落的属性
// CTPPr pr = para.getCTP().getPPr();
if (para.getText() != null && !"".equals(para.getText()) && !"r".equals(para.getText())) {
System.out.println(para.getText().trim());
String str = para.getText();
String str1 = " " + para.getText().replaceAll("\\n", "").replaceAll("\\t", "") + "\n";
list.add(str);
}
}
/*XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
String text = extractor.getText();
// System.out.println(text);
POIXMLProperties.CoreProperties coreProps = extractor.getCoreProperties();
String title = coreProps.getTitle();
System.out.println(title);*/
// Get all tables in the document (disabled).
/*List tables = doc.getTables();
List rows;
List cells;
for (XWPFTable table : tables) {
// table properties
// CTTblPr pr = table.getCTTbl().getTblPr();
// get the rows of the table
rows = table.getRows();
for (XWPFTableRow row : rows) {
// get the cells of the row
cells = row.getTableCells();
for (XWPFTableCell cell : cells) {
System.out.println(cell.getText());;
}
}
}*/
close(is);
return list;
}
/**
 * Reads the paragraph text of a Word 2007 (.docx) document.
 *
 * <p>Each paragraph returned by {@code XWPFDocument.getParagraphs()} becomes one list
 * element: a leading space, the paragraph text with newline and tab characters
 * removed, and a trailing "\n". Tables are not traversed.
 *
 * @param filePath path of the .docx file
 * @return one formatted string per paragraph, in document order
 * @throws Exception if the file cannot be opened or parsed
 */
public static List getParagraphText2007(String filePath) throws Exception {
    InputStream is = new FileInputStream(filePath);
    try {
        XWPFDocument doc = new XWPFDocument(is);
        List<String> context = new ArrayList<String>();
        // Typed list: iterating a raw List as XWPFParagraph does not compile.
        List<XWPFParagraph> paras = doc.getParagraphs();
        for (XWPFParagraph para : paras) {
            String text = para.getText().replaceAll("\\n", "").replaceAll("\\t", "");
            context.add(" " + text + "\n");
        }
        return context;
    } finally {
        // Close the stream even when parsing throws; the original leaked it on failure.
        close(is);
    }
}
/**
 * Writes comparison results into a table of a newly created XWPFDocument.
 *
 * NOTE(review): the text of this method appears to be truncated at the two {@code for}
 * headers below - the loop conditions and bodies are missing, and the statements after
 * the second header look like the tail of a separate routine that registers a heading
 * style. {@code object} and {@code returnPath} are not read in the visible text. The
 * code is left byte-identical; restore the lost lines from version control.
 *
 * @param size size of the comparison list; used as the row count of the created table
 * @param object short-sentence comparison results
 * @throws Exception
 */
public static void writeTable(int size, List object, String returnPath) throws Exception {
XWPFDocument doc = new XWPFDocument();
// Get the styles object of the newly created document.
XWPFStyles newStyles = doc.createStyles();
// Key line // replace the document styles with the styles read in the static block
// newStyles.setStyles(wordStyles);
// Create a table with `size` rows and 2 columns.
XWPFTable table = doc.createTable(size, 2);
// A column added here cannot be retrieved through getTableCells() for the rows created at initialization, but it can for rows added through the row API.
// table.addNewCol(); // add a column to the table
// table.createRow(); // add a row to the table
List rows = table.getRows();
// Table properties.
CTTblPr tablePr = table.getCTTbl().addNewTblPr();
// Table width.
CTTblWidth width = tablePr.addNewTblW();
width.setW(BigInteger.valueOf(9000));
XWPFTableRow row;
List cells;
XWPFTableCell cell;
int rowSize = rows.size();
int cellSize;
for (int i=0; i list = row.getCtRow().getTcList();
cells = row.getTableCells();
cellSize = cells.size();
for (int j=0; j style is more prominent in the formats bar
ctStyle.setUiPriority(indentNumber);
CTOnOff onoffnull = CTOnOff.Factory.newInstance();
ctStyle.setUnhideWhenUsed(onoffnull);
// style shows up in the formats bar
ctStyle.setQFormat(onoffnull);
// style defines a heading of the given level
CTPPr ppr = CTPPr.Factory.newInstance();
ppr.setOutlineLvl(indentNumber);
ctStyle.setPPr(ppr);
XWPFStyle style = new XWPFStyle(ctStyle);
// is a null op if already defined
XWPFStyles styles = docxDocument.createStyles();
style.setType(STStyleType.PARAGRAPH);
styles.addStyle(style);
}
/**
 * Closes an input stream, logging instead of propagating any {@link IOException}.
 * Does nothing when the stream is {@code null}.
 *
 * @param is input stream to close; may be {@code null}
 */
private static void close(InputStream is) {
    if (is == null) {
        return;
    }
    try {
        is.close();
    } catch (IOException closeFailure) {
        LOGGER.error("流关闭异常",closeFailure);
    }
}
/**
 * Closes an output stream, logging instead of propagating any {@link IOException}.
 * Does nothing when the stream is {@code null}.
 *
 * @param os output stream to close; may be {@code null}
 * @throws Exception declared by the signature; an IOException raised by the close
 *         itself is caught and logged here
 */
private static void close(OutputStream os) throws Exception{
    if (os == null) {
        return;
    }
    try {
        os.close();
    } catch (IOException closeFailure) {
        LOGGER.error("流关闭异常",closeFailure);
    }
}
}