POI解析word文档

import com.sinitek.sirm.web.plm.funddate.MatchingObject;
import org.apache.log4j.Logger;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.model.StyleDescription;
import org.apache.poi.hwpf.model.StyleSheet;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.xwpf.usermodel.*;
import org.apache.xmlbeans.XmlException;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.*;

import java.io.*;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.List;

public class ParseWordUtil {
    // log4j logger shared by all static helpers in this class.
    private static final Logger LOGGER = Logger.getLogger(ParseWordUtil.class);
    // Document-wide style definitions of a Word file; null until getWordStyle() assigns it.
    private static CTStyles wordStyles = null;

    /**
     * Reads the .docx document at {@code filepath} and stores its document-wide style
     * definitions in the static {@code wordStyles} field.
     *
     * <p>Failures are logged rather than thrown; in that case {@code wordStyles} keeps
     * whatever value it had before the call.
     *
     * @param filepath path of the template document to read
     */
    public static void getWordStyle(String filepath) {
        InputStream in = null;
        try {
            // Read the template document.
            in = new FileInputStream(filepath);
            XWPFDocument template = new XWPFDocument(in);
            // Keep the template's overall style definitions.
            wordStyles = template.getStyle();
        } catch (FileNotFoundException e) {
            LOGGER.error("未找到文件",e);
        } catch (IOException e) {
            LOGGER.error("",e);
        } catch (XmlException e) {
            LOGGER.error("XML转换异常",e);
        } finally {
            // The stream was previously never closed, leaking a file handle per call.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    LOGGER.error("流关闭异常",e);
                }
            }
        }
    }

    /**
     * Returns the headings of the Word document at {@code filepath}.
     *
     * <p>Files whose extension is {@code .docx} (in any letter case) are handled by
     * {@code getWordTitles2007}; every other file is handled by
     * {@code getWordTitlesAndContext2003} in mode 1.
     *
     * @param filepath path of the Word document
     * @return the list produced by the format-specific reader
     * @throws IOException if the document cannot be read
     */
    public static List getWordTitles(String filepath) throws IOException {
        String extension = getWordVersion(filepath);
        // Case-insensitive so that e.g. "REPORT.DOCX" is not handed to the binary .doc parser.
        if (".docx".equalsIgnoreCase(extension)) {
            return getWordTitles2007(filepath);
        }
        // Mode 1: titles only (2: content only, 3: titles and content).
        return getWordTitlesAndContext2003(filepath, 1);
    }

    /**
     * Returns the text of the Word document at {@code filepath}.
     *
     * <p>Files whose extension is {@code .docx} (in any letter case) are handled by
     * {@code getParagraphText2007}; every other file is handled by
     * {@code getWordTitlesAndContext2003} in mode 3.
     *
     * @param filepath path of the Word document
     * @return the list produced by the format-specific reader
     * @throws Exception if the document cannot be read or parsed
     */
    public static List getWordText(String filepath) throws Exception {
        String extension = getWordVersion(filepath);
        // Case-insensitive so that e.g. "REPORT.DOCX" is not handed to the binary .doc parser.
        if (".docx".equalsIgnoreCase(extension)) {
            return getParagraphText2007(filepath);
        }
        // Mode 3: titles and content (1: titles only, 2: content only).
        return getWordTitlesAndContext2003(filepath, 3);
    }

    /**
     * Returns the extension of the file named by {@code filepath}, including the leading
     * dot (for example {@code ".docx"}). Callers use it to tell Word 2003 files from
     * Word 2007 files; Word 97 is considered obsolete and is not treated specially.
     *
     * <p>Only the last path segment is inspected, so dots in directory names are ignored.
     *
     * @param filepath path or bare name of the file
     * @return the text from the last {@code '.'} of the file name to its end, or an empty
     *         string when the file name contains no dot
     */
    public static String getWordVersion(String filepath) {
        String filename = new File(filepath).getName();
        int dot = filename.lastIndexOf('.');
        // Without this guard, substring(-1, ...) throws StringIndexOutOfBoundsException
        // for names that have no extension.
        if (dot < 0) {
            return "";
        }
        return filename.substring(dot);
    }

    /**
     * Reads the paragraphs of a Word 2003 (.doc) document, collecting every paragraph's text
     * and, separately, the paragraphs whose style name contains "标题" (heading) or
     * "正文" (body text).
     *
     * <p>NOTE(review): this text appears to be damaged by extraction. The end of this method
     * and the signature and opening lines of getWordTitles2007 seem to have been lost, and
     * what remains of them sits inside a block comment that starts after the PicturesTable
     * line. As the text stands, the method closes the stream and returns the list of all
     * paragraph texts, and {@code type} is never read. Restore from the original source
     * before relying on this.
     *
     * @param path file path
     * @param type 1: titles only; 2: content only; 3: both titles and content
     *             (not referenced in the visible code)
     * @return list
     * @throws IOException
     */
    public static List getWordTitlesAndContext2003(String path, Integer type) throws IOException {
        InputStream is = new FileInputStream(path);
        HWPFDocument doc = new HWPFDocument(is);
        Range r = doc.getRange();
        List list = new ArrayList();
        List titles = new ArrayList();
        List context = new ArrayList();
        for (int i = 0; i < r.numParagraphs(); i++) {
            Paragraph p = r.getParagraph(i);
            // check if style index is greater than total number of styles
            int numStyles = doc.getStyleSheet().numStyles();
            int styleIndex = p.getStyleIndex();
            String contexts = p.text();
            list.add(contexts); // titles + content

            if (numStyles > styleIndex) {
                StyleSheet style_sheet = doc.getStyleSheet();
                StyleDescription style = style_sheet.getStyleDescription(styleIndex);
                String styleName = style.getName();
                // Classify by the (Chinese) style name: "标题" = heading, "正文" = body text.
                if (styleName != null && styleName.contains("标题")) {
                    String text = p.text();
                    titles.add(text);
                } else if (styleName != null && styleName.contains("正文")) {
                    String text = p.text();
                    context.add(text);
                }
            }
        }

        // get the Word data stream
        byte [] dataStream = doc.getDataStream();
        // number of character runs in the range
        int numCharacterRuns = r.numCharacterRuns();
        // System.out.println("CharacterRuns 数:"+numCharacterRuns);
        // handles picture extraction and determines whether a part of the file contains embedded pictures
        PicturesTable table = new PicturesTable(doc, dataStream, null, null, null);

        // NOTE(review): the block comment opened below was cut short by extraction; it now
        // swallows the remainder of this method and the start of getWordTitles2007.
        // picture index within the document
        /*int i = 1;
        for(int j=0 ; j getWordTitles2007(String path) throws IOException {
        InputStream is = new FileInputStream(path);
        XWPFDocument doc = new XWPFDocument(is);
        //HWPFDocument doc = new HWPFDocument(is);
        //Range r = doc.getRange();
        List listRun;
        List listParagraphs = doc.getParagraphs();// get the paragraph information
        List list = new ArrayList();

        /*for (int i = 0; i paras = doc.getParagraphs();
        for (XWPFParagraph para : paras) {
            // properties of the current paragraph
            // CTPPr pr = para.getCTP().getPPr();
            if (para.getText() != null && !"".equals(para.getText()) && !"r".equals(para.getText())) {
                System.out.println(para.getText().trim());
                String str = para.getText();
                String str1 = "  " + para.getText().replaceAll("\\n", "").replaceAll("\\t", "") + "\n";
                list.add(str);
            }
        }

        /*XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
        String text = extractor.getText();
        // System.out.println(text);
        POIXMLProperties.CoreProperties coreProps = extractor.getCoreProperties();
        String title = coreProps.getTitle();
        System.out.println(title);*/

        // get all tables in the document
        /*List tables = doc.getTables();
        List rows;
        List cells;
        for (XWPFTable table : tables) {
            // table properties
            // CTTblPr pr = table.getCTTbl().getTblPr();
            // get the rows of the table
            rows = table.getRows();
            for (XWPFTableRow row : rows) {
                // get the cells of the row
                cells = row.getTableCells();
                for (XWPFTableCell cell : cells) {
                    System.out.println(cell.getText());;
                }
            }
        }*/
        close(is);

        return list;
    }

    /**
     * Reads the body paragraphs of a Word 2007 (.docx) document.
     *
     * <p>For each paragraph, newline and tab characters are removed from its text, which is
     * then prefixed with two spaces and suffixed with {@code "\n"}. Table content is not
     * collected.
     *
     * @param filePath path of the .docx file
     * @return one formatted string per paragraph, in document order
     * @throws Exception if the file cannot be opened or parsed
     */
    public static List getParagraphText2007(String filePath) throws Exception {
        InputStream is = new FileInputStream(filePath);
        try {
            XWPFDocument doc = new XWPFDocument(is);
            // Typed list: iterating a raw List as XWPFParagraph does not compile.
            List<XWPFParagraph> paras = doc.getParagraphs();
            List<String> context = new ArrayList<String>(paras.size());
            for (XWPFParagraph para : paras) {
                String text = para.getText().replace("\n", "").replace("\t", "");
                context.add("  " + text + "\n");
            }
            return context;
        } finally {
            // Close the stream even when parsing fails; it used to leak on exceptions.
            close(is);
        }
    }

    /**
     * Writes comparison results into a table of a newly created document.
     *
     * <p>The visible code creates an empty document, adds a table with {@code size} rows and
     * 2 columns, and sets the table width to 9000.
     *
     * <p>NOTE(review): this text appears to be damaged by extraction. Both loop headers
     * below are truncated, and the statements after them use ctStyle, indentNumber and
     * docxDocument, none of which is declared in the visible text, so they presumably belong
     * to a different (heading-style) method whose beginning was lost. Restore from the
     * original source before relying on this.
     *
     * @param size size of the comparison list; used as the table's row count
     * @param object phrase comparison results (not referenced in the visible code)
     * @param returnPath not referenced in the visible code; presumably the output path, confirm
     * @throws Exception
     */
    public static void writeTable(int size, List object, String returnPath) throws Exception {
        XWPFDocument doc = new XWPFDocument();
        // get the styles object of the newly created document
        XWPFStyles newStyles = doc.createStyles();
        // key line (disabled): set the document styles to the ones read earlier into wordStyles
        // newStyles.setStyles(wordStyles);
        // create a table
        XWPFTable table = doc.createTable(size, 2);
        // For a column added here, the rows created at initialization do not return it from
        // getTableCells(), whereas rows added afterwards do.
        // table.addNewCol(); // add a column to the table
        // table.createRow(); // add a row to the table
        List rows = table.getRows();
        // table properties
        CTTblPr tablePr = table.getCTTbl().addNewTblPr();
        // table width
        CTTblWidth width = tablePr.addNewTblW();
        width.setW(BigInteger.valueOf(9000));
        XWPFTableRow row;
        List cells;
        XWPFTableCell cell;
        int rowSize = rows.size();
        int cellSize;
        // NOTE(review): the two loop headers below are truncated (text after the loop
        // variable was lost), so the row/cell filling logic cannot be read from here.
        for (int i=0; i list = row.getCtRow().getTcList();
            cells = row.getTableCells();
            cellSize = cells.size();
            for (int j=0; j style is more prominent in the formats bar
        ctStyle.setUiPriority(indentNumber);

        CTOnOff onoffnull = CTOnOff.Factory.newInstance();
        ctStyle.setUnhideWhenUsed(onoffnull);

        // style shows up in the formats bar
        ctStyle.setQFormat(onoffnull);

        // style defines a heading of the given level
        CTPPr ppr = CTPPr.Factory.newInstance();
        ppr.setOutlineLvl(indentNumber);
        ctStyle.setPPr(ppr);

        XWPFStyle style = new XWPFStyle(ctStyle);

        // is a null op if already defined
        XWPFStyles styles = docxDocument.createStyles();

        style.setType(STStyleType.PARAGRAPH);
        styles.addStyle(style);

    }


    /**
     * Quietly closes an input stream.
     *
     * <p>A {@code null} argument is ignored; an {@link IOException} raised while closing is
     * logged and not propagated.
     *
     * @param is the input stream to close, may be {@code null}
     */
    private static void close(InputStream is) {
        if (is == null) {
            return;
        }
        try {
            is.close();
        } catch (IOException closeFailure) {
            LOGGER.error("流关闭异常",closeFailure);
        }
    }

    /**
     * Quietly closes an output stream.
     *
     * <p>A {@code null} argument is ignored; an {@link IOException} raised while closing is
     * logged and not propagated. The method declares no checked exception because none can
     * escape it, which matches the input-stream overload.
     *
     * @param os the output stream to close, may be {@code null}
     */
    private static void close(OutputStream os) {
        if (os != null) {
            try {
                os.close();
            } catch (IOException e) {
                LOGGER.error("流关闭异常",e);
            }
        }
    }
}

 

你可能感兴趣的:(POI解析word文档)