Java读取word文档里的复杂型表格(任免表)

使用apache-poi读取word文档里的复杂型表格

这里使用的任免表编辑器产生的word文档。

word模板:https://download.csdn.net/download/weixin_41420919/85708792
Java读取word文档里的复杂型表格(任免表)_第1张图片

问题: 需要读取任免编辑器生成的word里面的内容,其中生成的word是一个表格,需要根据表格的行列数获取相应的内容。

难点:在于表格里有图片,以及单元格的合并。

解决方案:

  1. 使用spire读取word表格(需要收费)
  2. 使用jaboc读取word表格(支持不太好)
  3. 使用apache-poi读取word表格(推荐)

maven坐标

        
		<dependency>
            <groupId>org.apache.poigroupId>
            <artifactId>poiartifactId>
            <version>3.15version>
        dependency>
        <dependency>
            <groupId>org.apache.poigroupId>
            <artifactId>poi-ooxmlartifactId>
            <version>3.15version>
        dependency>
        <dependency>
            <groupId>org.apache.poigroupId>
            <artifactId>poi-scratchpadartifactId>
            <version>3.15version>
        dependency>    

测试程序 (打印word所有的表格以及表格字段)

        //注意测试程序没有关闭流,实际使用要关闭流
        String filePath = "D:\\xujinwei\\yz_dagl\\testWord\\任免表99.doc";
        WordImportUtil test = new WordImportUtil(filePath);
        InputStream is;
        FileInputStream in = new FileInputStream(filePath);// 载入文档
        POIFSFileSystem pfs = new POIFSFileSystem(in);
        HWPFDocument hwpf = new HWPFDocument(pfs);
        Range range = hwpf.getRange();// 得到文档的读取范围
        TableIterator it = new TableIterator(range);
        Map<String,String> map = new HashMap<>();
        int tt = 1;
        while (it.hasNext()) {
            Table tb = it.next();
            System.out.println("=========================================第"+tt+"个表格==");
            // 迭代行,默认从0开始
                int i = 0;
                for (  ; i < tb.numRows(); i++) {
                    TableRow tr = tb.getRow(i);
                    // 迭代列,默认从0开始
                    int j = 0;
                    for ( ; j < tr.numCells(); j++) {
                        TableCell td = tr.getCell(j);// 取得单元格
                        // 取得单元格的内容
                        String s = "";
                        for (int k = 0; k < td.numParagraphs(); k++) {
                            Paragraph para = td.getParagraph(k);// 获取第k个段落
                            s += para.text().trim();
                        }
                        map.put(s, "i:" + i + "   " + "j:" + j);
                    }
                    map.forEach((x,y)->{
                        System.out.println("===========================key========"+x);
                        System.out.println("===========================value========"+y);

                    });
                    map.clear();
                }
            tt++;
        }
          
        //获取图片
	    PicturesTable picturesTable = hwpf.getPicturesTable();
        List<Picture> allPictures = picturesTable.getAllPictures();
        for (Picture picture : allPictures) {
             String picBase64Str = Base64Utils.encodeToString(picture.getContent());
        }
        return "";

      

这里对所有表格的字段进行打印,实测时,我发现我的word文档一个表格被解析成三个表格。从第二个表格起才是我所需要的数据以及因为有些单元格合并,获取的字段不对, 因此我还做了特殊处理。下面提供一个对合并的单元格的解决方案

  for ( ; j < tr.numCells(); j++) {
  		TableCell td = tr.getCell(j);// 取得单元格
        // 取得单元格的内容
        String s = "";    
        //这个s是合并单元格的结果,如果想要获取单独的可以在k那里进行操作,可以通过判断k的值,直接return  
        //para.text().trim();
        for (int k = 0; k < td.numParagraphs(); k++) {
            Paragraph para = td.getParagraph(k);// 获取第k个段落
            s += para.text().trim();
            }
    }

完整的测试类

package com.hrtac.application.util;


import com.hrtac.application.bean.xt001.DA051Bean;
import com.hrtac.application.bean.xt001.DA052Bean;
import org.apache.commons.lang3.StringUtils;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.usermodel.*;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.springframework.util.Base64Utils;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Description:
 * User: xu jin wei
 * Date: 2022-06-15
 * Time: 8:44
 */
public class WordImportUtil {


    private String filePath ;
    private HWPFDocument hwpf;
    private InputStream in;

    public WordImportUtil(String filePath){
        this.in = null;// 载入文档
        try {
            in = new FileInputStream(filePath);
            POIFSFileSystem pfs = new POIFSFileSystem(in);
            this.hwpf = new HWPFDocument(pfs);

        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }


    }

    public String getTableText(int tableIndex, int row ,int col, boolean isSpecial){
        int noSpecial = 3;
        return getTableText(tableIndex, row ,col, isSpecial, noSpecial);
    }

    public String getTableText(int tableIndex, int row ,int col, boolean isSpecial, int specialParagraph){

        Range range = hwpf.getRange();// 得到文档的读取范围
        TableIterator it = new TableIterator(range);
        List<String> list = new ArrayList<>();
        Map<String,String> map = new HashMap<>();
        StringBuffer errMsg = new StringBuffer();
        int tt = 1;
        int i =0;
        while (it.hasNext()) {
            Table tb = it.next();
            if(i==tableIndex) {

                if(row>=tb.numRows()){
                    return "";
                }

                TableRow tr = tb.getRow(row);
                TableCell td = tr.getCell(col);// 取得单元格
                // 取得单元格的内容
                String s = "";
                for (int k = 0; k < td.numParagraphs(); k++) {
                    Paragraph para = td.getParagraph(k);// 获取第k个段落
                    s += para.text().trim();

                    if(isSpecial && row==6 && col==2 && specialParagraph==k ){
                        //在职教育-学历
                        return para.text().trim();
                    }
                    if(isSpecial && row==6 && col==2 && specialParagraph==k ){
                        //在职教育-学位
                        return para.text().trim();
                    }
                    if(isSpecial && row==6 && col==4 && specialParagraph==k ){
                        //在职教育-毕业院校
                        return para.text().trim();
                    }
                    if(isSpecial && row==6 && col==4 && specialParagraph==k ){
                        //在职教育-毕业专业
                        return para.text().trim();
                    }

                }
                return s;
            }
            i++;
        }
        return "";
    }

    public void closeStream() {
        if(this.hwpf !=null){
            try {
                this.hwpf.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (this.in != null) {
            try {
                this.in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }


    public DA051Bean getDA051BaseInfoFromWord(WordImportUtil wordImportUtil) {

        DA051Bean da051Bean = new DA051Bean();
        int tableIndex = 1;
        da051Bean.setXM00001(wordImportUtil.getTableText(tableIndex,0,1,false));
        da051Bean.setXB00001(wordImportUtil.getTableText(tableIndex,0,3,false));
        String CSNY001 = wordImportUtil.getTableText(tableIndex, 0, 5,false);
        if(StringUtils.isNotEmpty(CSNY001)) CSNY001 = CSNY001.substring(0,CSNY001.indexOf("(")).replace(".","");
        da051Bean.setCSNY001(CSNY001);
        da051Bean.setMZ00001(wordImportUtil.getTableText(tableIndex,1,1,false));
        da051Bean.setJG00001(wordImportUtil.getTableText(tableIndex,1,3,false));
        da051Bean.setCSD0001(wordImportUtil.getTableText(tableIndex,1,5,false));
        da051Bean.setRDSJ001(wordImportUtil.getTableText(tableIndex,2,1,false).replace(".",""));
        da051Bean.setCJGZ002(wordImportUtil.getTableText(tableIndex,2,3,false).replace(".",""));
        da051Bean.setJKZK001(wordImportUtil.getTableText(tableIndex,2,5,false));
        da051Bean.setZYJS007(wordImportUtil.getTableText(tableIndex,3,1,false));
        da051Bean.setZYTC001(wordImportUtil.getTableText(tableIndex,3,3,false));

        da051Bean.setXLMC001(wordImportUtil.getTableText(tableIndex,4,2,false));
        da051Bean.setXWMC001(wordImportUtil.getTableText(tableIndex,5,2,false));
        da051Bean.setBYYX001(wordImportUtil.getTableText(tableIndex,4,4,false));
        da051Bean.setBYZY001(wordImportUtil.getTableText(tableIndex,5,4,false));
        // 在职教育 学历和学位合并了 院校和专业也合并了
        da051Bean.setXLMC002(wordImportUtil.getTableText(tableIndex,6,2,true,0));
        da051Bean.setXWMC002(wordImportUtil.getTableText(tableIndex,6,2,true,1));
        da051Bean.setBYYX002(wordImportUtil.getTableText(tableIndex,6,4,true,0));
        da051Bean.setBYZY002(wordImportUtil.getTableText(tableIndex,6,4,true,1));


        da051Bean.setXRZW001(wordImportUtil.getTableText(tableIndex,8,1,false));
        da051Bean.setNRZW001(wordImportUtil.getTableText(tableIndex,9,1,false));
        da051Bean.setNMZW001(wordImportUtil.getTableText(tableIndex,10,1,false));
        da051Bean.setJL00001(wordImportUtil.getTableText(tableIndex,11,1,false));

        tableIndex++;
        da051Bean.setJCQK001(wordImportUtil.getTableText(tableIndex,0,1,false));
        da051Bean.setNDKH001(wordImportUtil.getTableText(tableIndex,1,1,false));
        da051Bean.setRMLY001(wordImportUtil.getTableText(tableIndex,2,1,false));


        //和家庭成员有关  7 --11  8--12  9--13  10--14
        da051Bean.setCBDW001(wordImportUtil.getTableText(tableIndex,getCBDW(wordImportUtil),1,false));


        return da051Bean;
    }

    public int getCBDW(WordImportUtil wordImportUtil){
        int index = -1;
        if("呈报单位".equals(wordImportUtil.getTableText(2,11,0,false))){
            index = 11;
        };
        if("呈报单位".equals(wordImportUtil.getTableText(2,12,0,false))){
            index = 12;
        };
        if("呈报单位".equals(wordImportUtil.getTableText(2,13,0,false))){
            index = 13;
        };
        if("呈报单位".equals(wordImportUtil.getTableText(2,14,0,false))){
            index = 14;
        };
        return index;
    }

    public List<DA052Bean> getDa052BeanListFormWord(WordImportUtil wordImportUtil){

        int index = getCBDW(wordImportUtil);


        List<DA052Bean> list = new ArrayList<>();
        for(int row = 4; row < index; row ++){
            DA052Bean da052Bean = new DA052Bean();
            int col = 1;
            da052Bean.setCYCW001(wordImportUtil.getTableText(2,row,col,false));
            da052Bean.setCYXM001(wordImportUtil.getTableText(2,row,++col,false));
            da052Bean.setCYCS001(wordImportUtil.getTableText(2,row,++col,false));
            da052Bean.setCYZZ001(wordImportUtil.getTableText(2,row,++col,false));
            da052Bean.setCYDW001(wordImportUtil.getTableText(2,row,++col,false));
            da052Bean.setXH00001(String.valueOf((row-3)));

            list.add(da052Bean);
        }
        for(int k = list.size(); k <10 ;k++){
            DA052Bean da052Bean = new DA052Bean();
            da052Bean.setXH00001(String.valueOf( (k + 1)));
            list.add(da052Bean);
        }

        return  list;
    }

    public String getPicBase64String(WordImportUtil wordImportUtil) {

        PicturesTable picturesTable = wordImportUtil.hwpf.getPicturesTable();
        List<Picture> allPictures = picturesTable.getAllPictures();
        for (Picture picture : allPictures) {
             return Base64Utils.encodeToString(picture.getContent());
        }
        return "";
    }



    public static void main(String[] args)throws Exception {

        String filePath = "D:\\xujinwei\\yz_dagl\\testWord\\任免表99.doc";
        WordImportUtil test = new WordImportUtil(filePath);
        InputStream is;
        FileInputStream in = new FileInputStream(filePath);// 载入文档
        POIFSFileSystem pfs = new POIFSFileSystem(in);
        HWPFDocument hwpf = new HWPFDocument(pfs);
        Range range = hwpf.getRange();// 得到文档的读取范围
        TableIterator it = new TableIterator(range);
        Map<String,String> map = new HashMap<>();
        int tt = 1;
        while (it.hasNext()) {
            Table tb = it.next();
            System.out.println("=========================================第"+tt+"个表格==");
            // 迭代行,默认从0开始
      //      if(tt==2) {
                int i = 0;
                for (  ; i < tb.numRows(); i++) {
                    TableRow tr = tb.getRow(i);
                    // 迭代列,默认从0开始
                    int j = 0;
                    for ( ; j < tr.numCells(); j++) {
                        TableCell td = tr.getCell(j);// 取得单元格
                        // 取得单元格的内容
                        String s = "";
                        for (int k = 0; k < td.numParagraphs(); k++) {
                            Paragraph para = td.getParagraph(k);// 获取第k个段落
                            s += para.text().trim();
                        }
                        map.put(s, "i:" + i + "   " + "j:" + j);
                    }
                    map.forEach((x,y)->{
                        System.out.println("===========================key========"+x);
                        System.out.println("===========================value========"+y);

                    });
                    map.clear();
                }
         //   }



            tt++;
        }


        /*WordUtils wordUtils = new WordUtils(false);
        wordUtils.openDocument(filePath);


        Dispatch doc = wordUtils.getDoc();


        //  获取所有表格
        Dispatch tables = Dispatch.get(doc, "Tables").toDispatch();
        // 获取表格数目
        int tablesCount = Dispatch.get(tables, "Count").getInt();
        System.out.println("文档中共有" + tablesCount + "个表格");
        for (int i = 1; i <= tablesCount; i++) {
            // 获取第i个表格
            Dispatch table = Dispatch.call(tables, "Item", new Variant(i)).toDispatch();
            // 获取该表格所有行
            Dispatch rows = Dispatch.call(table, "Rows").toDispatch();
            // 获取该表格所有列
            Dispatch columns = Dispatch.call(table, "Columns").toDispatch();
            // 获取该表格行数
            int rowsCount = Dispatch.get(rows, "Count").getInt();
            // 获取该表格列数
            int columnsCount = Dispatch.get(columns, "Count").getInt();
            for (int j = 1; j <= rowsCount; j++) {
                for (int k = 1; k <= columnsCount; k++) {
                    Dispatch cell = Dispatch.call(table, "Cell", new Variant(j), new Variant(k)).toDispatch();
                    Dispatch Range = Dispatch.get(cell, "Range").toDispatch();
                    String text = Dispatch.get(Range, "Text").getString();
                    //去掉最后的回车符;
                    text = text.substring(0, text.length() - 2);
                    System.out.println("单元格中的内容:" + text);
                }
            }
        }
*/
        //  test.testWord(filePath);
        //   test.spireForTableOfDoc(filePath);


    }

}

你可能感兴趣的:(java,java,开发语言)