Excel大数据量处理(poi大数据量+java 线程池机制)

  • 最近项目需求,有最低十万的数据导入最高500万的数据导入需求,poi,是开源对Excel支持非常强大的框架,因此研究了一番,此过程借阅网上多为人士的代码,和见解,我发现网上的代码都是那一套,是一个前辈在github上的一个开源项目,我也借阅了,非常感谢前辈提供.
  • 我在借阅的时候发现假如一行中有单元格为空的话,会发生数据前移的情况,从而我们就无法准确的和数据库中数据对应,
  • 第三就是其代码风格和我的不像,因此我研究了一番,下面说说我的心得
  • 对于word 2007,其实际上就是一堆的xml的压缩包,你可以把xlsx的后缀改为.zip解压就会看到,找到workbook文件夹打开,找到sheet1.xml打开,发现表格数据以xml写入在里面,其中有各种标签,每个标签的含义大家可以自己去摸索,这里不再多少,因为这个你自己设置单元格各种类型就会出现各种标签,所以不再多说
  • 好了废话少说,开始说代码
  • 第一:准备工作
        <dependency>
            <groupId>org.apache.poigroupId>
            <artifactId>poiartifactId>
            <version>${poi.version}version>
        dependency>
        <dependency>
            <groupId>org.apache.poigroupId>
            <artifactId>poi-ooxmlartifactId>
            <version>${poi.version}version>
        dependency>
        
        <dependency>
            <groupId>xercesgroupId>
            <artifactId>xercesImplartifactId>
            <version>2.9.1version>
        dependency>

注意:${poi.version}就是版本号,这个版本号大家可以自己去maven官网查看,找一个比较新的,使用人数比较多的

  • 第二,对于大数据Excel,2007是基于handle来处理,下面给出我写handle类
package com.rjhcsoft.credit.utils.poi.handle;

import com.rjhcsoft.credit.utils.StringUtil;
import com.rjhcsoft.credit.utils.poi.callback.Callback;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

public class ExcelXlsxHandle extends DefaultHandler {

    private CellDataType nextDataType=CellDataType.SSTINDEX;;
    private int formatIndex;
    private String formatString;
    private SharedStringsTable sst;

    /**
     * 单元格中的数据可能的数据类型
     */
    enum CellDataType {
        BOOL, ERROR, FORMULA, INLINESTR, SSTINDEX, NUMBER, DATE, NULL
    }


    private boolean isAvailabledOfRow = false;// 是否是有效行
    private int availabledRows = 0;
    private int totalRows = 0;
    private int currentRowNum = 0;
    private Map cellMap = null;
    private String key;
    private String lastIndex;
    private Callback callback;


    public ExcelXlsxHandle(Callback callback, XSSFReader reader) {
        this.callback = callback;
        try {
            sst=reader.getSharedStringsTable();
        } catch (IOException e) {
            e.printStackTrace();
            throw new RuntimeException(e);
        } catch (InvalidFormatException e) {
            e.printStackTrace();
            throw new RuntimeException(e);
        }
    }

    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
        if ("row".equalsIgnoreCase(qName)) {// 如果是行元素
            totalRows++;// 总行数+1
            String r = attributes.getValue("r");// 获取行号
            currentRowNum = Integer.parseInt(r);
            cellMap= new HashMap<>();
            isAvailabledOfRow=false;
        } else if ("c".equalsIgnoreCase(qName)) {// 如果是单元格
            key = getKey(attributes);// 获取键值
            cellMap.put(key, null);// 先放入map,单此时值为null
            this.setNextDataType(attributes);
        }
    }

    private String getKey(Attributes attributes){
        return attributes.getValue("r").replaceAll("\\d*","");
    }

    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        //      super.characters(ch, start, length);
        lastIndex = new String(ch, start, length);
    }

    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        //super.endElement(uri, localName, qName);
        if ("v".equalsIgnoreCase(qName)) {// 如果是值标签
            String value = this.getDataValue(lastIndex.trim());
            if (!StringUtil.isEmpty(value)) isAvailabledOfRow=true;
            cellMap.put(key, value);// 重设值
        } else if ("c".equalsIgnoreCase(qName)) {
            key = null;// key置位null
            lastIndex = null;// lastIndex置位null
        } else if ("row".equalsIgnoreCase(qName)) {// 如果row是结束标签,说明一行结束
            if (isAvailabledOfRow) {// 如果是有效行
                availabledRows++;// 是有效行则有效行数+1
                callback.callback(cellMap,currentRowNum,availabledRows);// 回调,将结果输送给客户端,让客户端处理
            }
        }
    }

    /**
     * 处理数据类型
     *
     * @param attributes
     */
    public void setNextDataType(Attributes attributes) {
        nextDataType = CellDataType.NUMBER; //cellType为空,则表示该单元格类型为数字
        formatIndex = -1;
        formatString = null;
        String cellType = attributes.getValue("t"); //单元格类型
        String cellStyleStr = attributes.getValue("s"); //
        String columnData = attributes.getValue("r"); //获取单元格的位置,如A1,B1

        if ("b".equals(cellType)) { //处理布尔值
            nextDataType = CellDataType.BOOL;
        } else if ("e".equals(cellType)) {  //处理错误
            nextDataType = CellDataType.ERROR;
        } else if ("inlineStr".equals(cellType)) {
            nextDataType = CellDataType.INLINESTR;
        } else if ("s".equals(cellType)) { //处理字符串
            nextDataType = CellDataType.SSTINDEX;
        } else if ("str".equals(cellType)) {
            nextDataType = CellDataType.FORMULA;
        }
    }

    /**
     * 对解析出来的数据进行类型处理
     * @param value   单元格的值,
     *                value代表解析:BOOL的为0或1, ERROR的为内容值,FORMULA的为内容值,INLINESTR的为索引值需转换为内容值,
     *                SSTINDEX的为索引值需转换为内容值, NUMBER为内容值,DATE为内容值
     * @return
     */
    @SuppressWarnings("deprecation")
    public String getDataValue(String value) {
        String thisStr = null;
        switch (nextDataType) {
            // 这几个的顺序不能随便交换,交换了很可能会导致数据错误
            case BOOL: //布尔值
                thisStr=value;
                break;
            case ERROR: //错误
                thisStr = "\"ERROR:" + value.toString() + '"';
                break;
            case FORMULA: //公式
                thisStr = '"' + value.toString() + '"';
                break;
            case INLINESTR:
                XSSFRichTextString rtsi = new XSSFRichTextString(value.toString());
                thisStr = rtsi.toString();
                rtsi = null;
                break;
            case SSTINDEX: //字符串
                String sstIndex = value.toString();
                try {
                    int idx = Integer.parseInt(sstIndex);
                    XSSFRichTextString rtss = new XSSFRichTextString(sst.getEntryAt(idx));//根据idx索引值获取内容值
                    thisStr = rtss.toString();
                    rtss = null;
                } catch (NumberFormatException ex) {
                    thisStr = value.toString();
                }
                break;
            case NUMBER: //数字
                thisStr=value;
                thisStr = thisStr.replace("_", "").trim();
                break;
            case DATE: //日期
                thisStr=value;
                break;
            default:
                thisStr = value;
                break;
        }
        return thisStr;
    }

    public static void main(String[] args){
        System.out.println("AA123".replaceAll("\\d*",""));
    }
}

注意:这里处理Excel的值得时候,全部当成字符串来处理的,不管你是时间,还是数字还是其他,直接拿的原值,所以如果单元格是日期类型,你拿到的值其实是从1900年1月1日到今天的天数,是一个double类型,转换后续会说,现在就是拿原始值,所有的转换不在这里处理,如果有兴趣,也可以在这里处理
- ExcelUtil

package com.rjhcsoft.credit.utils.poi;

import com.rjhcsoft.credit.utils.StringUtil;
import com.rjhcsoft.credit.utils.poi.annos.Workbook;
import com.rjhcsoft.credit.utils.poi.callback.Callback;
import com.rjhcsoft.credit.utils.poi.handle.ExcelXlsxHandle;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.ZipPackage;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.Field;
import java.math.BigDecimal;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;

public class ExcelUtil {

    /**
     * 只拿取第一个sheet
     * @param callback
     * @param file ,
     */
    public static void readFirst(File file,Callback callback)  {
        XSSFReader reader = getXSSFReader(file);
        XMLReader parser = getXMLReader(callback,reader);
        Iterator sheetsData = getSheetsData(reader);
        parseFirst(sheetsData,parser);
    }


    public static void readFirst(String path,Callback callback){
        readFirst(new File(path),callback);
    }


    public static void readAll(File file,Callback callback){
        XSSFReader reader = getXSSFReader(file);
        XMLReader parser = getXMLReader(callback, reader);
        Iterator sheetsData = getSheetsData(reader);
        parseAll(sheetsData,parser);
    }

    public static void readAll(String path,Callback callback){
        readAll(new File(path),callback);
    }

    public static  T resultToObj(Map result,Class clazz) {
        try {
            T t = clazz.newInstance();
            Field[] fields = clazz.getDeclaredFields();
            for (Field field:fields){
                field.setAccessible(true);
                Workbook workbook = field.getDeclaredAnnotation(Workbook.class);
                if (workbook!=null){
                    String cell = workbook.cell();
                    String value = result.get(cell);
                    if (!StringUtil.isEmpty(value)){
                        value=value.trim();
                        if (field.getType()==String.class){
                            field.set(t,value);
                        }else if (field.getType()==Byte.class){
                            field.set(t,Byte.parseByte(value));
                        }else if (field.getType()==Short.class){
                            field.set(t,Short.parseShort(value));
                        }else if (field.getType()==Integer.class){
                            field.set(t,Integer.parseInt(value));
                        }else if (field.getType()==Long.class){
                            field.set(t,Long.parseLong(value));
                        }else if (field.getType()==Float.class){
                            field.set(t,Float.parseFloat(value));
                        }else if (field.getType()==Double.class){
                            field.set(t,Double.parseDouble(value));
                        }else if (field.getType()==Boolean.class){
                            field.set(t,value.equalsIgnoreCase("0")?false:true);
                        }else if (field.getType()==Character.class){
                            field.set(t,value.charAt(0));
                        }else if (field.getType()== BigDecimal.class){
                            field.set(t,new BigDecimal(value));
                        }else if (field.getType()== Date.class){
                            try {
                                double v = Double.parseDouble(value);
                                long m = (long) (v*24*60*60*1000);
                                SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
                                long n = Math.abs(sdf.parse("1900-00-30").getTime());
                                field.set(t,new Date(m-n));
                            } catch (Exception e) {
                                String format = workbook.format();
                                try {
                                    field.set(t,new SimpleDateFormat(format).parse(value));
                                } catch (ParseException e1) {
                                    e1.printStackTrace();
                                    throw new RuntimeException(e1);
                                }
                            }
                        }
                    }
                }
                field.setAccessible(false);
            }
            return t;
        } catch (InstantiationException e) {
            e.printStackTrace();
            throw new RuntimeException(e);
        } catch (IllegalAccessException e) {
            e.printStackTrace();
            throw new RuntimeException(e);
        }
    }

    private static void parseAll(Iterator sheetsData, XMLReader parser) {
        while (sheetsData.hasNext()){
            parse(sheetsData,parser);
        }
    }

    private static void parse(Iterator sheetsData, XMLReader parser){
        try(InputStream inputStream = sheetsData.next()) {
            parser.parse(new InputSource(inputStream));
        }catch (Exception e){
            e.printStackTrace();
            throw new RuntimeException(e);
        }
    }

    private static void parseFirst(Iterator sheetsData,XMLReader parser){
        if (sheetsData.hasNext()){
            parse(sheetsData,parser);
        }
    }

    private static XSSFReader getXSSFReader(File file){
        if (!file.getName().endsWith(".xlsx")) throw new RuntimeException("请使用word 2007的Excel格式,即xlsx格式");
        XSSFReader reader = null;
        try {
            reader = new XSSFReader(ZipPackage.open(file));
        } catch (IOException e) {
            e.printStackTrace();
            throw new RuntimeException(e);
        } catch (OpenXML4JException e) {
            e.printStackTrace();
            throw new RuntimeException(e);
        }
        return reader;
    }

    private static XMLReader getXMLReader(Callback callback, XSSFReader reader){
        XMLReader parser = null;
        try {
            parser = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
        } catch (SAXException e) {
            e.printStackTrace();
            throw new RuntimeException(e);
        }
        parser.setContentHandler(new ExcelXlsxHandle(callback,reader));
        return parser;
    }

    private static Iterator getSheetsData(XSSFReader reader){
        try {
            return reader.getSheetsData();
        } catch (IOException e) {
            e.printStackTrace();
            throw new RuntimeException(e);
        } catch (InvalidFormatException e) {
            e.printStackTrace();
            throw new RuntimeException(e);
        }
    }

    public static void main(String[] args) throws ParseException {
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ROOT);
        Date parse = sdf.parse("1970-1-1 00:00:00");
        System.out.println(parse.getTime());
    }
}
  • Callback
package com.rjhcsoft.credit.utils.poi.callback;

import java.util.Map;

public interface Callback {
    void callback(Map result,int currentRowNumber,int availabledRows);
}
  • workbook
package com.rjhcsoft.credit.utils.poi.annos;

import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;

@Target(ElementType.FIELD)
@Retention(RetentionPolicy.RUNTIME)
public @interface Workbook {
    String cell();

    String format() default "";
}
  • ThreadPool
package com.rjhcsoft.credit.utils.thread.pool;

import java.util.concurrent.Semaphore;

/**
 * 封装线程池
 */
public abstract class BaseThreadPool {
    private Semaphore semaphore;

    {
        init();
    }

    /**
     * 初始化方法,此方法会在构造方法之前,属性之后执行
     */
    protected abstract void init();

    /**
     * 构造方法执行
     */
    public BaseThreadPool(){
        this(5);
    }

    /**
     * 构造方法执行
     * @param permits 并发数
     */
    public BaseThreadPool(int permits){
        if (permits<1) throw new RuntimeException("并发数至少为1");
        semaphore = new Semaphore(permits);
        afterConstructor(permits);
    }

    /**
     * 此为核心执行方法,
     * @param execute 回调接口,此为用户实现其核心执行内容
     */
    public void execute(Execute execute){
        new Thread(new Runnable() {
            @Override
            public void run() {
                try {
                    afterInitThread();
                    semaphore.acquire();
                    beforeExecute();
                    execute.execute();
                    afterExecute();
                }catch (Exception e){
                    e.printStackTrace();
                    exeception(e);
                }finally {
                    semaphore.release();
                    finallz();
                }
            }
        }).start();
    }

    /**
     * 在构造方法执行之后执行此方法
     * @param permits
     */
    protected abstract void afterConstructor(int permits);

    /**
     * 当线程初始化完成,但是还没来得及获取线程锁的时候,执行此方法
     */
    protected abstract void afterInitThread();

    /**
     * 在业务代码执行之前执行此方法
     */
    protected abstract void beforeExecute();

    /**
     * 在实际业务代码执行之后,执行此方法
     */
    protected abstract void afterExecute();

    /**
     * 当出异常时执行此方法
     */
    protected abstract void exeception(Exception e);

    /**
     * 当整个执行业务结束,不论是否出异常,都会执行此方法
     */
    protected abstract void finallz();

    public interface Execute{
        void execute();
    }
}
package com.rjhcsoft.credit.utils.thread.pool;

import com.rjhcsoft.credit.utils.StringUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;

public class SimpleThreadPool extends BaseThreadPool {

    private static Logger LOGGER = LoggerFactory.getLogger(SimpleThreadPool.class);

    private int permins;
    private ArrayBlockingQueue queue;
    private AtomicInteger total;// 线程总数
    private AtomicInteger core;// 核心池中的线程数
    private AtomicInteger wait;// 等待数

    public SimpleThreadPool(int permits){
        super(permits);
    }

    @Override
    protected void afterConstructor(int permits) {
        this.permins=permits;
    }


    @Override
    protected void init() {
        queue = new ArrayBlockingQueue(100);
        total = new AtomicInteger(0);
        core = new AtomicInteger(0);
        wait = new AtomicInteger(0);
    }

    @Override
    protected void afterInitThread() {
        total.addAndGet(1);
        wait.addAndGet(1);
        String threadId = StringUtil.uuid();
        Thread.currentThread().setName(threadId);
        LOGGER.debug("线程["+threadId+"]初始化完成");
    }

    @Override
    protected void beforeExecute() {
        String name = Thread.currentThread().getName();
        LOGGER.debug("线程["+name+"]进入核心池...");
        wait.addAndGet(-1);
        core.addAndGet(1);
        String uuid = StringUtil.uuid();
        try {
            queue.put(uuid);
        } catch (InterruptedException e) {
            e.printStackTrace();
            throw new RuntimeException(e);
        }
    }

    @Override
    protected void afterExecute() {
        core.addAndGet(-1);
    }

    @Override
    protected void exeception(Exception e) {
        throw new RuntimeException(e);
    }

    @Override
    protected void finallz() {
        String poll = queue.poll();
        String threadId = Thread.currentThread().getName();
        LOGGER.debug("线程["+threadId+"]出去了");
    }

    public void callback(Callback callback){
        callback.callback(total.get(),core.get(),wait.get());
    }

    /**
     * 获取总数,调用此方法,程序会进入500毫秒等待,然后判定是否有等待线程,如果有则递归,如果没有则返回
     * @return
     */
    public int getTotal() throws InterruptedException {
        Thread.currentThread().sleep(500);
        if (queue.isEmpty()) return total.get();
        else return getTotal();
    }

    public interface Callback{
        void callback(int total,int core,int wait);
    }
}
  • 最后测试,这里不支持word 2003格式,因为其需要监听器处理,而其监听器是全sheet遍历,不能指定sheet,所以放弃集成
  • 测试代码:
package com.rjhcsoft.credit;

import com.alibaba.fastjson.annotation.JSONField;
import com.rjhcsoft.credit.utils.poi.annos.Workbook;

import java.math.BigDecimal;
import java.util.Date;

public class A {
    @Workbook(cell = "A")
    private Integer a;
    @Workbook(cell = "B")
    private Long b;
    @Workbook(cell = "C")
    private Double c;
    @Workbook(cell = "D")
    private Boolean d;
    @Workbook(cell = "E")
    private String e;
    @Workbook(cell = "F",format = "yyyy-MM-dd")
    @JSONField(format = "yyyy-MM-dd")
    private Date f;
    @Workbook(cell = "G")
    private BigDecimal g;
    @Workbook(cell = "H",format = "yyyy-MM-dd HH:mm:ss")
    @JSONField(format = "yyyy-MM-dd HH:mm:ss")
    private Date h;

    public Date getH() {
        return h;
    }

    public void setH(Date h) {
        this.h = h;
    }

    public Integer getA() {
        return a;
    }

    public void setA(Integer a) {
        this.a = a;
    }

    public Long getB() {
        return b;
    }

    public void setB(Long b) {
        this.b = b;
    }

    public Double getC() {
        return c;
    }

    public void setC(Double c) {
        this.c = c;
    }

    public Boolean getD() {
        return d;
    }

    public void setD(Boolean d) {
        this.d = d;
    }

    public String getE() {
        return e;
    }

    public void setE(String e) {
        this.e = e;
    }

    public Date getF() {
        return f;
    }

    public void setF(Date f) {
        this.f = f;
    }

    public BigDecimal getG() {
        return g;
    }

    public void setG(BigDecimal g) {
        this.g = g;
    }
}
@Autowired
    private JdbcTemplate jdbcTemplate;

    @Test
    public void testThreadPool() throws InterruptedException {

        ArrayBlockingQueue> queue = new ArrayBlockingQueue<>(100);
        new Thread(new Runnable() {
            @Override
            public void run() {
                ExcelUtil.readFirst("C:\\Users\\Administrator\\Desktop\\新建XLSX 工作表.xlsx", new Callback() {
                    @Override
                    public void callback(Map map, int currentRowNumber, int availabledRows) {
                        try {
                            queue.put(map);
                        } catch (InterruptedException e) {
                            e.printStackTrace();
                        }
                    }
                });
            }
        }).start();
        SimpleThreadPool pool = new SimpleThreadPool(5);
        boolean flag = true;
        int m = 0;
        while (flag){
            Map map = queue.poll();
            if (map==null||map.isEmpty()){
                m++;
                if (m>100) flag=false;
                else
                Thread.currentThread().sleep(10);
            }else {
                m = 0;
                A a = ExcelUtil.resultToObj(map, A.class);
                pool.execute(new BaseThreadPool.Execute() {
                    @Override
                    public void execute() {
                        jdbcTemplate.update("insert into T_TEST(a,b,c,d,e,f,g,h) values (?,?,?,?,?,?,?,?)",
                                a.getA(),a.getB(),a.getC(),a.getD(),a.getE(),a.getF(),a.getG(),a.getH());
                    }
                });
            }
        }
    }
  • 测试数据库表:
CREATE TABLE `T_TEST` (
  `a` int(11) DEFAULT NULL,
  `b` bigint(20) DEFAULT NULL,
  `c` double DEFAULT NULL,
  `d` tinyint(1) DEFAULT NULL,
  `e` varchar(255) DEFAULT NULL,
  `f` datetime DEFAULT NULL,
  `g` decimal(10,0) DEFAULT NULL,
  `h` datetime DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

至此:结束!有什么意见的可以提出,大家多交流,

你可能感兴趣的:(poi,execl,大数据量,线程池,XSSFReader)