大数据量Excel解析工具分享(适用于对所有数据进行统计分析等场景)

不啰嗦,还是直接上菜

  1. pom依赖
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.16</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.14</version>
</dependency>
  1. 工具类
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFComment;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

/**
 * 解析大数据量Excel工具类
 * 

1. 此工具类不支持并发 *

2. 此工具类当前仅支持单次解析特定sheet * * @author zhangrihui * @date 2020/6/30 0:20:00 */ public class ExcelUtils { private static final Logger logger = LoggerFactory.getLogger(ExcelUtils.class); /** * sheet操作行记录 */ private int rowCount = 0; private RowHandler rowHandler; private boolean skipHeaderRow; private SheetContentsHandler sheetContentsHandler; private OPCPackage pkg; public ExcelUtils(RowHandler rowHandler) { this(rowHandler,true); } public ExcelUtils(RowHandler rowHandler, boolean skipHeaderRow) { this(rowHandler,skipHeaderRow,null); } /** * 用户必须自定义rowHandler,同时可以指定跳过首行。也可以自定义sheet处理器。 */ public ExcelUtils(RowHandler rowHandler, boolean skipHeaderRow, SheetContentsHandler sheetContentsHandler) { if(rowHandler == null){ throw new RuntimeException("rowHandler can not be null"); } if(sheetContentsHandler == null){ sheetContentsHandler = new DefaultSheetHandler(); } this.rowHandler = rowHandler; this.skipHeaderRow = skipHeaderRow; this.sheetContentsHandler = sheetContentsHandler; } /** * 使用本工具类的用户必须自己实现行处理器处理逻辑 * * @author zhangrihui * @date 2020/6/30 07:45:58 */ @FunctionalInterface public interface RowHandler{ void handle(List<String> cells); } /** * excel载入方法。excel操作之前,需要先调用该方法进行载入。 * * @param is excel inputstream * @return * @throws IOException * @throws InvalidFormatException */ public ExcelUtils load(InputStream is) throws IOException, InvalidFormatException { setPkg(OPCPackage.open(is)); return this; } /** * 读取sheet,默认为转换第1个sheet */ public ExcelUtils parse() { return parse(1); } /** * 读取sheet * * @param sheetId 要处理的sheet索引,从1开始 * @return */ public ExcelUtils parse(int sheetId) { if(pkg == null){ throw new RuntimeException("Before parse ,load is expected"); } //当需要一次性处理多个sheet时,重置sheet行统计器 rowCount = 0; try { XSSFReader reader = new XSSFReader(pkg); InputStream shellStream = reader.getSheet("rId" + sheetId); InputSource sheetSource = new InputSource(shellStream); StylesTable styles = reader.getStylesTable(); ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(pkg); XMLReader parser = getSheetParser(styles, strings); parser.parse(sheetSource); } catch (Exception e){ logger.error("excel parse error",e); } finally { try { pkg.close(); } catch (IOException e) { logger.error("pkg close error",e); } } return this; } /** * 获取excel解析器 * * @param styles Table of styles * @param strings Table of shared strings * @return * @throws SAXException SAX异常,用户需自行处理 */ private XMLReader getSheetParser(StylesTable styles, ReadOnlySharedStringsTable strings) throws SAXException { XMLReader parser = XMLReaderFactory.createXMLReader(); parser.setContentHandler(new XSSFSheetXMLHandler(styles, strings, sheetContentsHandler, false)); return parser; } public void setPkg(OPCPackage pkg) { this.pkg = pkg; } /** * 默认sheet内容处理器 * * @author zhangrihui * @date 2020/6/30 07:42:26 */ private class DefaultSheetHandler implements SheetContentsHandler { private List<String> rowData = new ArrayList<>(); @Override public void startRow(int rowNum) { rowCount++; rowData.clear(); } @Override public void endRow(int rowNum) { if(skipHeaderRow && rowCount <=1){ return; } rowHandler.handle(rowData); } @Override public void cell(String cellReference, String formattedValue, XSSFComment comment) { rowData.add(formattedValue); } @Override public void headerFooter(String text, boolean isHeader, String tagName) { } } public static void main(String[] args) { try { ExcelUtils excelUtils = new ExcelUtils(cells -> { System.out.println("cells:"+cells); //直接输出每一行的内容 }); excelUtils.load(new FileInputStream("D:\\***.xlsx")).parse(); } catch (Exception e) { e.printStackTrace(); } } }

  1. 使用说明
    参照main方法,通过自定义RowHandler完成数据行处理逻辑,实现比如统计表格中某一行的最大值、平均值等。该工具类提供的功能类似于mysql的全表扫描,而不是把所有内容都加载进内存中进行操作,因此能适用于大数据量的Excel处理。(还没有拿非常大、超内存的Excel测试过。实际使用中如有问题,还请评论指正~)

你可能感兴趣的:(Java工具类)