java poi sax方式处理大数据量excel文件

系统需要用到一个导入excel文件的功能,使用poi组件常规方式读取excel时,内存耗尽,OutOfMemoryError,或者读取非常慢
所以写了一个工具类,使用poi sax方式读取excel,速度快很多,内存消耗可以接受。

测试结果如下:
.xlsx文件,35M大小,总4个sheel,
只读取第一个,37434行,54列

总行数:37434
读取耗时:39秒
打印耗时:17秒

主要代码如下:
ExcelUtils.class 主入口

package com.xxx.bi.utils.excel;

import java.util.List;
import java.util.Objects;

import org.apache.commons.lang3.StringUtils;

import com.google.common.collect.Lists;

public class ExcelUtils {
    /** logger日志. */
    // public static final Logger LOGGER = Logger.getLogger(ExcelUtils.class);

    public ExcelUtils() {
    }

    /**
     * 获取excel的表头
     * 
     * @param filePath
     *            文件路径
     * @param headerNum
     *            表头所在行数
     * @return
     */
    public static List getHeader(String filePath, int headerNum) {
        if (StringUtils.isBlank(filePath)) {
            throw new IllegalArgumentException("传入文件路径不能为空");
        }
        if (Objects.isNull(headerNum) || headerNum < 1) {
            headerNum = 1;
        }
        try {
            return LargeExcelFileReadUtil.getRowFromSheetOne(filePath, headerNum);
        } catch (Exception e) {
            // LOGGER.info("获取excel[" + filePath + "]表头失败,原因:", e);
            e.printStackTrace();
        }
        return Lists.newArrayList();
    }

    /**
     * 获取excel的所有数据
* 所有数据类型都是String
* 会以第一行数据的列数为总列数,所以第一行的数据必须都不为空,否则可能出java.lang.IndexOutOfBoundsException * * @param filePath * 文件路径 * @param headerNum * 表头所在行数 * @return */ public static List> getAllData(String filePath) { if (StringUtils.isBlank(filePath)) { throw new IllegalArgumentException("传入文件路径不能为空"); } try { return LargeExcelFileReadUtil.getRowsFromSheetOne(filePath); } catch (Exception e) { // LOGGER.info("获取excel[" + filePath + "]表头失败,原因:", e); e.printStackTrace(); } return Lists.newArrayList(); } public static void main(String[] args) { long start = System.currentTimeMillis(); String filepath = "C:/Users/Administrator/Desktop/05-作业调配表 -快递.xlsx"; // List result = ExcelUtils.getHeader(filepath, 1); // for (String col : result) { // System.out.println(col); // } List> result = ExcelUtils.getAllData(filepath); long end = System.currentTimeMillis(); for (List list : result) { System.out.println(list.toString()); } long end1 = System.currentTimeMillis(); try { Thread.sleep(1000l); } catch (InterruptedException e) { e.printStackTrace(); } System.err.println("总行数:" + result.size()); System.err.println(("读取耗时:" + (end - start) / 1000) + "秒"); System.err.println(("打印耗时:" + (end1 - end) / 1000) + "秒"); } }

LargeExcelFileReadUtil.class 真正的工具类

package com.xxx.bi.utils.excel;

import java.io.InputStream;
import java.util.List;
import java.util.Objects;

import org.apache.log4j.Logger;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.xml.sax.InputSource;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;

public class LargeExcelFileReadUtil {
    /** logger日志. */
    public static final Logger LOGGER = Logger.getLogger(LargeExcelFileReadUtil.class);

    // 处理一个sheet
    public static List getRowFromSheetOne(String filename, Integer rowNum) throws Exception {
        InputStream inputStream = null;
        OPCPackage pkg = null;
        SingleRowHandler singleRowHandler = null;
        try {
            pkg = OPCPackage.open(filename);
            XSSFReader r = new XSSFReader(pkg);
            SharedStringsTable sst = r.getSharedStringsTable();
            singleRowHandler = new SingleRowHandler(sst, rowNum);
            XMLReader parser = XMLReaderFactory.createXMLReader("com.sun.org.apache.xerces.internal.parsers.SAXParser");
            parser.setContentHandler(singleRowHandler);
            inputStream = r.getSheet("rId1");
            InputSource sheetSource = new InputSource(inputStream);
            parser.parse(sheetSource);
            return singleRowHandler.getRow();
        } catch (Exception e) {
            String message = e.getMessage();
            if (Objects.nonNull(rowNum) && Objects.nonNull(singleRowHandler)
                    && SingleRowHandler.FINISH_ROW_MESSAGE.equalsIgnoreCase(message)) {
                // 获取某一行数据完成 ,暂时不知道怎么能终止excel解析,直接抛出了异常,实际是成功的
                return singleRowHandler.getRow();
            }
            throw e;
        } finally {
            if (Objects.nonNull(pkg)) {
                pkg.close();
            }
            if (Objects.nonNull(inputStream)) {
                inputStream.close();
            }
        }
    }

    // 处理一个sheet
    public static List> getRowsFromSheetOne(String filename) throws Exception {
        InputStream inputStream = null;
        OPCPackage pkg = null;
        MultiRowHandler multiRowHandler = null;
        try {
            pkg = OPCPackage.open(filename);
            XSSFReader r = new XSSFReader(pkg);
            SharedStringsTable sst = r.getSharedStringsTable();
            multiRowHandler = new MultiRowHandler(sst);
            XMLReader parser = XMLReaderFactory.createXMLReader("com.sun.org.apache.xerces.internal.parsers.SAXParser");
            parser.setContentHandler(multiRowHandler);
            inputStream = r.getSheet("rId1");
            InputSource sheetSource = new InputSource(inputStream);
            parser.parse(sheetSource);
            return multiRowHandler.getRows();
        } catch (Exception e) {
            throw e;
        } finally {
            if (Objects.nonNull(pkg)) {
                pkg.close();
            }
            if (Objects.nonNull(inputStream)) {
                inputStream.close();
            }
        }
    }

}

SingleRowHandler.class 当行处理类,可以只获取表头或表格中的某一行数据

package com.xxx.bi.utils.excel;

import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.regex.Pattern;

import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

public class SingleRowHandler extends DefaultHandler {
    public final static String FINISH_ROW_MESSAGE = "row data process finish";

    private Integer rowNum = null;// rowNum不为空时则标示只需要获取这一行的数据
    private int curRowNum = 1;
    private String cellType = "";
    private SharedStringsTable sst;
    private String lastContents;
    private boolean nextIsString;
    private String cellPosition;
    private List row = new ArrayList<>();

    public List getRow() {
        return row;
    }

    public SingleRowHandler(SharedStringsTable sst, Integer rowNum) {
        this.sst = sst;
        this.rowNum = rowNum;
    }

    public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException {
        if (name.equals("c")) {
            cellPosition = attributes.getValue("r");
            // 这是一个新行
            if (Pattern.compile("^A[0-9]+$").matcher(cellPosition).find()) {
                curRowNum = Integer.valueOf(cellPosition.substring(1));
            }
            cellType = "";
            cellType = attributes.getValue("t");
            if ("s".equals(cellType)) {
                nextIsString = true;
            } else {
                nextIsString = false;
            }
        }
        // 清楚缓存内容
        lastContents = "";
        if (Objects.nonNull(rowNum) && curRowNum > rowNum) {
            // 获取某一行数据完成 ,暂时不知道怎么能终止excel解析,直接抛出了异常,实际是成功的
            throw new SAXException(FINISH_ROW_MESSAGE);
        }
    }

    public void endElement(String uri, String localName, String name) throws SAXException {
        if (nextIsString) {
            int idx = Integer.parseInt(lastContents);
            lastContents = new XSSFRichTextString(sst.getEntryAt(idx)).toString();
            nextIsString = false;
        }

        if (name.equals("v")) {
            if (Objects.isNull(rowNum) || rowNum == curRowNum) {
                row.add(lastContents);
            }
        }
    }

    public void characters(char[] ch, int start, int length) throws SAXException {
        lastContents += new String(ch, start, length);
    }
}

MultiRowHandler.class 获取excel所有行的数据

package com.xxx.bi.utils.excel;

import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * 获取完整excel数据的handler
* * @author Administrator * */ public class MultiRowHandler extends DefaultHandler { private int curRowNum = 0;// 行号,从1开始 private int curColIndex = -1;// 列索引,从0开始 private int colCnt = 0;// 列数,取第一行列数做为列总数 private String cellType = ""; private SharedStringsTable sst; private String lastContents; private boolean nextIsString; private String cellPosition; private List head = null; private List curRowData = null; private boolean curRowIsBlank = true;// 当前是个空行 private List> rows = new ArrayList<>(); public List> getRows() { return rows; } public MultiRowHandler(SharedStringsTable sst) { this.sst = sst; } @Override public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException { if (name.equals("c")) { cellPosition = attributes.getValue("r"); curColIndex = getColIndex(cellPosition); // 这是一个新行 if (isNewRow(cellPosition)) { curRowNum = getRowNum(cellPosition); if (2 == curRowNum && Objects.nonNull(curRowData)) { head = curRowData; colCnt = head.size(); } curRowData = getBlankRow(colCnt); } cellType = ""; cellType = attributes.getValue("t"); if ("s".equals(cellType)) { nextIsString = true; } else { nextIsString = false; } } // 清楚缓存内容 lastContents = ""; } private boolean isNewRow(String cellPosition) { // 坐标以A开头,后面跟数字 或者坐标行和当前行不一致的 boolean newRow = Pattern.compile("^A[0-9]+$").matcher(cellPosition).find(); if (!newRow) { int cellRowNum = getRowNum(cellPosition); newRow = (cellRowNum != curRowNum); } return newRow; } /** * 根据列坐标获取行号,从1开始,返回0时标示出错 * * @param cellPosition * 列坐标,为A1,B23等 * @return 行号,从1开始,返回0是为失败 */ private static int getRowNum(String cellPosition) { String strVal = Pattern.compile("[^0-9]").matcher(cellPosition).replaceAll("").trim();// 获取坐标中的数字 if (StringUtils.isNotBlank(strVal)) { return Integer.valueOf(strVal); } return 0; } /** * 根据列坐标返回当前列索引,从0开始,返回-1时标示出错
* A1->0; B1->1...AA1->26 * * @param cellPosition * 列坐标,为A1,B23等 * @return 列索引,从0开始,返回-1是为失败,A1->0; B1->1...AA1->26 */ private static int getColIndex(String cellPosition) { int index = -1; int num = 65;// A的Unicode码 int length = cellPosition.length(); for (int i = 0; i < length; i++) { char c = cellPosition.charAt(i); if (Character.isDigit(c)) { break;// 确定指定的char值是否为数字 } index = (index + 1) * 26 + (int) c - num; } return index; } /** * 返回一个全部为空字符串的空行 * * @param cnt * @return */ private List getBlankRow(int cnt) { List result = new ArrayList<>(cnt); for (int i = 0; i < cnt; i++) { result.add(i, ""); } curRowIsBlank = true; return result; } @Override public void endElement(String uri, String localName, String name) throws SAXException { if (nextIsString) { int idx = Integer.parseInt(lastContents); lastContents = new XSSFRichTextString(sst.getEntryAt(idx)).toString(); nextIsString = false; } if (name.equals("v")) { // System.out.println(MessageFormat.format("当前列定位:{0},当前行:{1},当前列:{2},当前值:{3}", // cellPosition, curRowNum, // curColIndex, lastContents)); if (Objects.isNull(head)) { curRowData.add(lastContents); } else { curRowData.set(curColIndex, lastContents); } curRowIsBlank = false; // 这是一个新行 if (isNewRow(cellPosition)) { if (Objects.nonNull(curRowData)) { if (curRowIsBlank) { curRowData.clear();// 如果当前行是空行,则清空当前行数据 } rows.add(curRowData); } } } } @Override public void endDocument() throws SAXException { if (Objects.nonNull(curRowData) && !curRowIsBlank) { rows.add(curRowData);// 最后一行在上面不好加入,最后一行全是空行的不加入 } super.endDocument(); } @Override public void characters(char[] ch, int start, int length) throws SAXException { lastContents += new String(ch, start, length); } @Override public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { lastContents += ""; } public static void main(String[] args) { System.out.println(getColIndex("BC2")); } }

你可能感兴趣的:(java,poi,excel,大数据文件)