Processing a Large Volume of Data in Batches with Java

Requirement: there are two databases, A and B. A field in one of A's tables must be encrypted, and the result inserted into a table in B.

Analysis: the business logic itself is simple. Fetch the data from A, clean it, and insert it into B.

Difficulty: the source table in A is fairly large, about 500,000 rows (large only in a relative sense). A plain select * from table takes a long time to run and can easily cause an OutOfMemoryError, and inserting the cleaned data into B is slow as well. The approach taken here is to process the data in fixed-size batches and to hand those batches to a thread pool so they are processed in parallel.

1. Controller layer

    @RequestMapping("/data/temp/driver")
    @ResponseBody
    public ResultBean doGlobalTempDataDriver() {
        // slice the table's id range into 1000-row batches
        int id = globalDataService.getMinId();
        int maxIdInTable = globalDataService.getMaxId();
        int batchEnd = id + 1000;

        while (id < maxIdInTable) {
            Map<String, Object> params = new HashMap<>(16);
            params.put("id", id);
            params.put("maxId", batchEnd);
            // asynchronous: this call returns immediately, the batch runs on the pool
            globalDataService.doGlobalTempDataDriver(params);
            id += 1000;
            batchEnd += 1000;
        }
        return new ResultBean(200, "driver licence data synced to temp table");
    }
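
Because doGlobalTempDataDriver is @Async, each call above returns immediately, so the controller reports success before the copy has actually finished. If the caller needs to wait for completion, one option is a variant of the service method that returns a CompletableFuture. A minimal sketch under that assumption (doGlobalTempDataDriverAsync is hypothetical, not part of the original code):

    // hypothetical variant that blocks until every batch has been inserted;
    // assumes the service exposes doGlobalTempDataDriverAsync(Map) whose @Async
    // body ends with: return CompletableFuture.completedFuture(list.size());
    public ResultBean doGlobalTempDataDriverAndWait() {
        int id = globalDataService.getMinId();
        int maxIdInTable = globalDataService.getMaxId();
        int batchEnd = id + 1000;

        List<CompletableFuture<Integer>> futures = new ArrayList<>();
        while (id < maxIdInTable) {
            Map<String, Object> params = new HashMap<>(16);
            params.put("id", id);
            params.put("maxId", batchEnd);
            futures.add(globalDataService.doGlobalTempDataDriverAsync(params));
            id += 1000;
            batchEnd += 1000;
        }
        // wait for all batches, then report the total row count
        CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join();
        int total = futures.stream().mapToInt(CompletableFuture::join).sum();
        return new ResultBean(200, "synced " + total + " rows");
    }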

2. Service layer

    @Async("taskExecutor")
    @Override
    public ResultBean doGlobalTempDataDriver(Map<String, Object> params) {
        // fetch one 1000-row slice from database A
        List<DriverTBaseinfo> driverTBaseinfoList = driverInfoService.getDriverList(params);
        List<TempDriverTInfo> tempDriverTInfos = new LinkedList<>();
        // map each source row onto the temp-table entity
        for (DriverTBaseinfo driverTBaseinfo : driverTBaseinfoList) {
            TempDriverTInfo tempDriverTInfo = new TempDriverTInfo();
            tempDriverTInfo.setId(driverTBaseinfo.getId());
            tempDriverTInfo.setCclzrq(driverTBaseinfo.getFirstLicenseDate());
            tempDriverTInfo.setCsrq(driverTBaseinfo.getUserBirthday());
            tempDriverTInfo.setDabh(driverTBaseinfo.getArchivesNo());
            tempDriverTInfo.setDjjg(driverTBaseinfo.getIssuingAuthority());
            tempDriverTInfo.setGddh(driverTBaseinfo.getFixedTelephone());
            tempDriverTInfo.setGj(driverTBaseinfo.getUserNationality());
            tempDriverTInfo.setHzcs(driverTBaseinfo.getReplacementCount());
            tempDriverTInfo.setJszh(driverTBaseinfo.getDriverNo());
            tempDriverTInfo.setJszzt(StatusUtils.switchDriverState(driverTBaseinfo.getDriverState()));
            tempDriverTInfo.setLxdh(driverTBaseinfo.getUserPhone());
            tempDriverTInfo.setSfky(driverTBaseinfo.getAvailable());
            tempDriverTInfo.setSfzmmc(StatusUtils.switchIdCardType(driverTBaseinfo.getUserIdcardType()));
            tempDriverTInfo.setSsdq(driverTBaseinfo.getDrafterDistrict());
            tempDriverTInfo.setXb("0".equals(driverTBaseinfo.getUserSex()) ? "男" : "女");
            tempDriverTInfo.setXm(driverTBaseinfo.getUserName());
            tempDriverTInfo.setYljg(driverTBaseinfo.getMedicalInstitution());
            tempDriverTInfo.setYxjzrq(driverTBaseinfo.getValidEndDate());
            tempDriverTInfo.setYxqsrq(driverTBaseinfo.getValidStartDate());
            tempDriverTInfo.setYzbm(driverTBaseinfo.getUserZipcode());
            tempDriverTInfo.setZjjxdh(StatusUtils.switchDriverType(driverTBaseinfo.getDriverType()));
            tempDriverTInfo.setZz(driverTBaseinfo.getUserAddress());
            tempDriverTInfo.setZxblrq(driverTBaseinfo.getRevokeDate());
            tempDriverTInfos.add(tempDriverTInfo);
        }
        // batch-insert the slice into database B's temp table
        templateDriverInfoService.saveBatch(tempDriverTInfos);
        // note: the return value of an @Async method is discarded unless it is a Future
        return new ResultBean(driverTBaseinfoList.size());
    }
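
The requirement at the top calls for encrypting a field before it lands in B, but no encryption step appears in the mapping loop above; presumably it wraps one of the getters. A minimal sketch of such a step, assuming AES with a hypothetical FieldEncryptor helper (ECB mode is used only to keep the sketch short; prefer an authenticated mode such as AES/GCM in production):

import java.nio.charset.StandardCharsets;
import java.security.GeneralSecurityException;
import java.util.Base64;
import javax.crypto.Cipher;
import javax.crypto.spec.SecretKeySpec;

public final class FieldEncryptor {
    private final SecretKeySpec key;

    public FieldEncryptor(byte[] keyBytes) {
        // AES-128 expects a 16-byte key
        this.key = new SecretKeySpec(keyBytes, "AES");
    }

    public String encrypt(String plain) {
        try {
            Cipher cipher = Cipher.getInstance("AES/ECB/PKCS5Padding");
            cipher.init(Cipher.ENCRYPT_MODE, key);
            byte[] enc = cipher.doFinal(plain.getBytes(StandardCharsets.UTF_8));
            return Base64.getEncoder().encodeToString(enc);
        } catch (GeneralSecurityException e) {
            throw new IllegalStateException("field encryption failed", e);
        }
    }
}

In the mapping loop the encrypted column would then be set as, for example, tempDriverTInfo.setJszh(encryptor.encrypt(driverTBaseinfo.getDriverNo())); which field is actually encrypted is not stated in the original.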

3. Mapper file
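
The mapper XML did not survive publication, so what follows is a minimal reconstruction rather than the original file. Table and column names are assumptions; only the getDriverList parameters (id, maxId) and a batch-capable saveBatch are implied by the surrounding code. For brevity the insert lists just three columns; the real statement would carry them all. The ';'-separated multi-statement form used here is exactly what makes allowMultiQueries=true necessary (see the closing note).

<!-- hypothetical reconstruction; table and column names are assumptions -->
<select id="getDriverList" parameterType="map" resultType="DriverTBaseinfo">
    select * from driver_t_baseinfo
    where id &gt;= #{id} and id &lt; #{maxId}
</select>

<insert id="saveBatch" parameterType="java.util.List">
    <foreach collection="list" item="item" separator=";">
        insert into temp_driver_t_info (id, jszh, xm)
        values (#{item.id}, #{item.jszh}, #{item.xm})
    </foreach>
</insert>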

 


4. Thread pool configuration

@EnableAsync
@Configuration
public class ExecutorConfig {
    /** core pool size */
    private int corePoolSize = 10;
    /** maximum pool size */
    private int maxPoolSize = 50;
    /** queue capacity */
    private int queueCapacity = 10;
    /** max idle time for non-core threads, in seconds */
    private int keepAliveSeconds = 150;

    @Bean("taskExecutor")
    public Executor taskExecutor() {
        ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
        executor.setCorePoolSize(corePoolSize);
        executor.setMaxPoolSize(maxPoolSize);
        executor.setQueueCapacity(queueCapacity);
        executor.setThreadNamePrefix("taskExecutor-");
        executor.setKeepAliveSeconds(keepAliveSeconds);
      
        // rejection policy: what to do with new tasks once the pool is at max size
        // CALLER_RUNS: the task is not run on a pool thread but on the caller's own thread
        executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
        executor.initialize();
        return executor;
    }

}
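
A note on these numbers: 500,000 rows in 1,000-row slices means roughly 500 task submissions. The pool runs 10 of them on core threads, queues 10 more, then grows toward the 50-thread maximum; once even that is saturated, CallerRunsPolicy makes the submitting controller thread execute the batch itself, which throttles the submission loop instead of dropping tasks or letting the queue grow without bound.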

5. Connection pool settings (application.yml)

spring:
  datasource:
    type: com.alibaba.druid.pool.DruidDataSource
    driverClassName: com.mysql.jdbc.Driver
    url: jdbc:mysql://ip:port/ku?serverTimezone=UTC&useUnicode=true&characterEncoding=utf8&allowMultiQueries=true
#    username: ***
#    password: ***
    initialSize: 1
    minIdle: 3
    maxActive: 20
    # max time to wait when acquiring a connection, in milliseconds
    maxWait: 60000
    # interval between eviction runs that close idle connections, in milliseconds
    timeBetweenEvictionRunsMillis: 60000
    # minimum time a connection must sit idle in the pool before it may be evicted, in milliseconds
    minEvictableIdleTimeMillis: 30000
    validationQuery: select 'x'
    testWhileIdle: true
    testOnBorrow: false
    testOnReturn: false
    # enable PSCache and set its size per connection
    poolPreparedStatements: true
    maxPoolPreparedStatementPerConnectionSize: 20
    # monitoring filters; without them the console cannot collect SQL stats ('wall' would add the SQL firewall)
    filters: stat,slf4j
    # connectionProperties: enable mergeSql and slow-SQL logging
    connectionProperties: druid.stat.mergeSql=true;druid.stat.slowSqlMillis=5000
    # merge monitoring data from multiple DruidDataSources
    #useGlobalDataSourceStat: true
    removeAbandoned: true
    removeAbandonedTimeout: 300
    logAbandoned: false
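
One caveat worth flagging: com.mysql.jdbc.Driver is the MySQL Connector/J 5.x class name; with Connector/J 8 and later, the driver class is com.mysql.cj.jdbc.Driver.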

6. Connection pool configuration class

@Configuration
public class DruidDBConfig {
    private Logger logger = LoggerFactory.getLogger(DruidDBConfig.class);
    @Value("${spring.datasource.url}")
    private String dbUrl;

    @Value("${spring.datasource.username}")
    private String username;

    @Value("${spring.datasource.password}")
    private String password;

    @Value("${spring.datasource.driverClassName}")
    private String driverClassName;

    @Value("${spring.datasource.initialSize}")
    private int initialSize;

    @Value("${spring.datasource.minIdle}")
    private int minIdle;

    @Value("${spring.datasource.maxActive}")
    private int maxActive;

    @Value("${spring.datasource.maxWait}")
    private int maxWait;

    @Value("${spring.datasource.timeBetweenEvictionRunsMillis}")
    private int timeBetweenEvictionRunsMillis;

    @Value("${spring.datasource.minEvictableIdleTimeMillis}")
    private int minEvictableIdleTimeMillis;

    @Value("${spring.datasource.validationQuery}")
    private String validationQuery;

    @Value("${spring.datasource.testWhileIdle}")
    private boolean testWhileIdle;

    @Value("${spring.datasource.testOnBorrow}")
    private boolean testOnBorrow;

    @Value("${spring.datasource.testOnReturn}")
    private boolean testOnReturn;

    @Value("${spring.datasource.poolPreparedStatements}")
    private boolean poolPreparedStatements;

    @Value("${spring.datasource.maxPoolPreparedStatementPerConnectionSize}")
    private int maxPoolPreparedStatementPerConnectionSize;

    @Value("${spring.datasource.filters}")
    private String filters;

    @Value("{spring.datasource.connectionProperties}")
    private String connectionProperties;

    @Bean(initMethod = "init", destroyMethod = "close")   // declare the bean
    @Primary  // prefer this DataSource when several are present
    public DataSource dataSource() {
        DruidDataSource datasource = new DruidDataSource();

        datasource.setUrl(this.dbUrl);
        datasource.setUsername(username);
        datasource.setPassword(password);
        datasource.setDriverClassName(driverClassName);

        //configuration
        datasource.setInitialSize(initialSize);
        datasource.setMinIdle(minIdle);
        datasource.setMaxActive(maxActive);
        datasource.setMaxWait(maxWait);
        datasource.setTimeBetweenEvictionRunsMillis(timeBetweenEvictionRunsMillis);
        datasource.setMinEvictableIdleTimeMillis(minEvictableIdleTimeMillis);
        datasource.setValidationQuery(validationQuery);
        datasource.setTestWhileIdle(testWhileIdle);
        datasource.setTestOnBorrow(testOnBorrow);
        datasource.setTestOnReturn(testOnReturn);
        datasource.setPoolPreparedStatements(poolPreparedStatements);
        datasource.setMaxPoolPreparedStatementPerConnectionSize(maxPoolPreparedStatementPerConnectionSize);
        try {
            datasource.setFilters(filters);
        } catch (SQLException e) {
            logger.error("druid configuration initialization filter", e);
        }
        datasource.setConnectionProperties(connectionProperties);

        return datasource;
    }

    @Bean
    public ServletRegistrationBean druidServlet() {
        ServletRegistrationBean reg = new ServletRegistrationBean();
        reg.setServlet(new StatViewServlet());
        reg.addUrlMappings("/druid/*");
        reg.addInitParameter("allow", ""); //白名单
        return reg;
    }

    @Bean
    public FilterRegistrationBean filterRegistrationBean() {
        FilterRegistrationBean filterRegistrationBean = new FilterRegistrationBean();
        filterRegistrationBean.setFilter(new WebStatFilter());
        filterRegistrationBean.addUrlPatterns("/*");
        filterRegistrationBean.addInitParameter("exclusions", "*.js,*.gif,*.jpg,*.png,*.css,*.ico,/druid/*");
        filterRegistrationBean.addInitParameter("profileEnable", "true");
        filterRegistrationBean.addInitParameter("principalCookieName","USER_COOKIE");
        filterRegistrationBean.addInitParameter("principalSessionName","USER_SESSION");
        filterRegistrationBean.addInitParameter("DruidWebStatFilter","/*");
        return filterRegistrationBean;
    }
}
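
With these two registrations, the Druid monitoring console is served at /druid (the empty allow parameter means no IP restriction), and WebStatFilter gathers URI and session statistics for every request except the excluded static resources.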

Closing note: for batch inserts or updates like saveBatch above, the JDBC URL must include &allowMultiQueries=true, because the mapper sends several ';'-separated statements in one round trip. For the same reason the wall (SQL firewall) filter cannot be enabled here; keep filters: stat,slf4j.
