A quick write-up of some recent data-migration work.
Project background: scheduled jobs move text records from seven days prior out of MongoDB into a MySQL database and also generate files. There are many different scheduled tasks: batch messages, send-detail messages and plenty of other data all need to be migrated, plus migrations between the old system and the new one. MySQL uses partitioned tables; the stack is SSM (Spring + Spring MVC + MyBatis), and the scheduled tasks run on the xxl-job distributed task scheduler.
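Purely for context, here is a minimal sketch of roughly what one of these scheduled jobs looks like under xxl-job; the handler name, class name and the steps inside are hypothetical, and it assumes the older IJobHandler-style API commonly used with SSM projects:

import com.xxl.job.core.biz.model.ReturnT;
import com.xxl.job.core.handler.IJobHandler;
import com.xxl.job.core.handler.annotation.JobHandler;
import org.springframework.stereotype.Component;

// Hypothetical handler: moves the D-7 send-detail documents from MongoDB into MySQL and writes the file
@JobHandler(value = "transferSmsDtlJobHandler")
@Component
public class TransferSmsDtlJobHandler extends IJobHandler {

    @Override
    public ReturnT<String> execute(String param) throws Exception {
        // 1. query the documents from seven days ago out of MongoDB
        // 2. batch-insert them into the MySQL partitioned table (approaches 1-3 below)
        // 3. generate the CSV file
        return ReturnT.SUCCESS;
    }
}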
The above is all just background; now for the actual point.
1. Native JDBC batch insert of 4 million rows
/**
* @Description: Batch-insert send-detail data, flushing 1000 rows at a time
* @Param:
* @return:
* @Author: fanghuaiming
* @Date:
*/
public void savebatchDtlBatch(List<SmsSendHistoryDetail> smsSendHistoryDetails) {
LOG.info("Function:savebatchDtlBatch start batch insert into mysql 【 sms_msg_dtl 】");
Connection conn = DataSourceUtils.getConnection(dataSource1Config.testDataSource());
long before = System.currentTimeMillis();
int count = 0;
LOG.info("Fuction:savebatchDtlBatch Connection has opend");
PreparedStatement pstmt = null;
try {
pstmt = conn.prepareStatement("INSERT INTO sms_msg_dtl (unique_id,username,mtype,mobile,batch_id,channel_id,location,words,sub_succ,sub_fail,rpt_succ,rpt_fail,uprice,cprice,submit_time,_id,mdstr,extend) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)");
conn.setAutoCommit(false);
} catch (SQLException e) {
// If the statement cannot be prepared there is nothing to insert, so release the connection and bail out
LOG.error("Function:savebatchDtlBatch failed to prepare statement", e);
DataSourceUtils.releaseConnection(conn, dataSource1Config.testDataSource());
return;
}
for (int i = 0; i < smsSendHistoryDetails.size(); i++) {
try {
SmsSendHistoryDetail smsSendHistoryDetail = smsSendHistoryDetails.get(i);
pstmt.setLong(1, null == smsSendHistoryDetail.getUniqueId() ? 0 : smsSendHistoryDetail.getUniqueId());
pstmt.setInt(2, null == smsSendHistoryDetail.getUserName() ? 0 : smsSendHistoryDetail.getUserName());
pstmt.setInt(3, null == smsSendHistoryDetail.getMtype() ? 0 : smsSendHistoryDetail.getMtype());
pstmt.setLong(4, null == smsSendHistoryDetail.getMobile() ? 0 : smsSendHistoryDetail.getMobile());
pstmt.setLong(5, null == smsSendHistoryDetail.getBatchId() ? 0 : smsSendHistoryDetail.getBatchId());
pstmt.setInt(6, null == smsSendHistoryDetail.getChannelId() ? 0 : smsSendHistoryDetail.getChannelId());
pstmt.setString(7, null == smsSendHistoryDetail.getLocation() ? "" : smsSendHistoryDetail.getLocation());
pstmt.setString(8, null == smsSendHistoryDetail.getWords() ? "" : smsSendHistoryDetail.getWords());
pstmt.setInt(9, null == smsSendHistoryDetail.getSubSucc() ? 0 : smsSendHistoryDetail.getSubSucc());
pstmt.setInt(10, null == smsSendHistoryDetail.getSubFail() ? 0 : smsSendHistoryDetail.getSubFail());
pstmt.setInt(11, null == smsSendHistoryDetail.getRptSucc() ? 0 : smsSendHistoryDetail.getRptSucc());
pstmt.setInt(12, null == smsSendHistoryDetail.getRptFail() ? 0 : smsSendHistoryDetail.getRptFail());
pstmt.setLong(13, null == smsSendHistoryDetail.getuPrice() ? 0 : smsSendHistoryDetail.getuPrice());
pstmt.setLong(14, null == smsSendHistoryDetail.getcPrice() ? 0 : smsSendHistoryDetail.getcPrice());
pstmt.setLong(15, null == smsSendHistoryDetail.getSubmitTime() ? 0 : smsSendHistoryDetail.getSubmitTime());
pstmt.setString(16, null == smsSendHistoryDetail.get_id() ? "" : smsSendHistoryDetail.get_id());
pstmt.setString(17, null == smsSendHistoryDetail.getMdstr() ? "" : smsSendHistoryDetail.getMdstr());
pstmt.setString(18, null == smsSendHistoryDetail.getExtend() || "".equals(smsSendHistoryDetail.getExtend()) ? "" : smsSendHistoryDetail.getExtend());
pstmt.addBatch();
int a = i + 1;
if (a % 1000 == 0) {
try {
pstmt.executeBatch();
} catch (BatchUpdateException e) {
int[] updateCounts = e.getUpdateCounts();
for (int j = 0; j < updateCounts.length; j++) {
if(updateCounts[j]<0) {
LOG.info("Fuction:savebatchDtlBatch mysql 【 sms_msg_dtl 】 mybe get some exception : {} And this data is : {}",e.getMessage(),smsSendHistoryDetails.get(count * 1000 +j));
}
}
}
count += 1;
conn.commit();
pstmt.clearBatch();
}
} catch (SQLException e) {
LOG.info("Function:savebatchDtlBatch mysql 【 sms_msg_dtl 】 batch insert got an Exception : {} ", e);
e.printStackTrace();
}
}
try {
try {
pstmt.executeBatch();
} catch (BatchUpdateException e) {
int[] updateCounts = e.getUpdateCounts();
for (int j = 0; j < updateCounts.length; j++) {
if(updateCounts[j]<0) {
LOG.info("Fuction:savebatchDtlBatch mysql 【 sms_msg_dtl 】 mybe get some exception : {} And this data is : {}",e.getMessage(),smsSendHistoryDetails.get(count * 1000 +j));
}
}
}
conn.commit();
pstmt.clearBatch();
count = 0;
} catch (SQLException e) {
e.printStackTrace();
LOG.info("Fuction:savebatchDtlBatch mysql 【 sms_msg_dtl 】 pstmt take executeBatch had SQLException :{}", e);
} finally {
try {
pstmt.close();
} catch (SQLException e) {
e.printStackTrace();
LOG.info("Fuction:savebatchDtlBatch mysql 【 sms_msg_dtl 】 pstmt take close had Exception :{}", e);
}
}
long after = System.currentTimeMillis();
LOG.info("Fuction:savebatchDtlBatch had done insert into mysql batch 【 sms_msg_dtl 】And Time-consuming : {} milliseconds", after - before);
DataSourceUtils.releaseConnection(conn, dataSource1Config.testDataSource());
LOG.info("Fuction:savebatchDtlBatch Connection has released");
}
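One MySQL-specific point worth checking here (an assumption on my part, since the JDBC URL is not shown above): MySQL Connector/J only rewrites an addBatch/executeBatch sequence into real multi-row inserts when rewriteBatchedStatements=true is set on the connection URL; without it the driver still sends the statements one at a time. A hedged example of such a URL, with placeholder host, schema and credentials:

// Placeholder host, schema, user and password; only the rewriteBatchedStatements flag is the point here
String url = "jdbc:mysql://127.0.0.1:3306/sms"
        + "?useUnicode=true&characterEncoding=utf8"
        + "&rewriteBatchedStatements=true";
Connection conn = DriverManager.getConnection(url, "user", "password");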
2. MyBatis batch processing (ExecutorType.BATCH)
@Async
@Override
public void batchInsertBackUp(List<SmsSendMsgRpt> smsSendMsgRpts) {
try {
long before = System.nanoTime();
//The JDBC connection URL needs allowMultiQueries=true
//Open a MyBatis session in BATCH executor mode (autoCommit = false)
SqlSession sqlSession = test1SqlSessionFactory.openSession(ExecutorType.BATCH,false);
//Get the mapper interface from this batch session
SmsSendMsgRptBackUpMapper smsSendMsgRptBackUpMapper = sqlSession.getMapper(SmsSendMsgRptBackUpMapper.class);
//Queue the inserts one by one (nothing is committed yet); a SqlSession is not thread-safe,
//so iterate sequentially instead of using a parallel stream
smsSendMsgRpts.forEach(smsSendMsgRpt ->{
smsSendMsgRptBackUpMapper.insertSmsSendMsgRpt(smsSendMsgRpt);
});
//Flush the queued statements (the BatchResult list can be asserted on), then commit
List<BatchResult> batchResults = sqlSession.flushStatements();
sqlSession.commit();
//Clear the session cache
sqlSession.clearCache();
//Close the session
sqlSession.close();
long after = System.nanoTime();
LOG.info("SmsSendMsgRptBackUpServiceImpl:batchInsertBackUp insert of {} batched statements had done, elapsed nanoseconds : {}",batchResults.size(),after - before);
batchResults.clear();
} catch (Exception e) {
e.printStackTrace();
LOG.info(e.getMessage());
}
}
<!-- Mapper statement used by approach 2: a single-row insert queued on the BATCH session -->
<insert id="insertSmsSendMsgRpt">
insert into sms_rpt_dtl_backup
(rpt_time,username,
mobile,rptcode, unique_id,
batch_id,channel_id,extend,
resend_num,resend_count,
user_sub_time,channel_sub_time,
resend,uprice,cprice,msgid,
custom_id,source,create_time,update_time) VALUES
(#{rptTime},#{username},#{mobile},#{rptcode}, #{uniqueId},#{batchId},
#{channelId},#{extend},#{resendNum},#{resendCount},#{userSubTime},
#{channelSubTime},#{resend},#{uprice},#{cprice},#{msgid},#{customId},
#{source},#{createTime},#{updateTime})
</insert>
3. MyBatis dynamic SQL (multi-row insert via foreach)
@Async
@Override
public void batchInsertBackUps(List<SmsSendMsgRpt> smsSendMsgRpts) {
try {
long before = System.currentTimeMillis();
smsSendMsgRptBackUpMapper.batchInsertBackUp(smsSendMsgRpts);
long after = System.currentTimeMillis();
LOG.info("SmsSendMsgRptBackUpServiceImpl:batchInsertBackUps insert had done and total Millisecond use : {}",after - before);
} catch (Exception e) {
e.printStackTrace();
LOG.info(e.getMessage());
}
}
<!-- Mapper statement used by approach 3; collection="list" assumes the mapper method takes a single List parameter -->
<insert id="batchInsertBackUp">
INSERT into sms_rpt_dtl_backup (rpt_time,username,
mobile,rptcode, unique_id,
batch_id,channel_id,extend,
resend_num,resend_count,
user_sub_time,channel_sub_time,
resend,uprice,cprice,msgid,
custom_id,source,create_time,update_time) VALUES
<foreach collection="list" item="item" separator=",">
(#{item.rptTime,jdbcType=BIGINT},#{item.username,jdbcType=INTEGER},#{item.mobile,jdbcType=BIGINT},
#{item.rptcode,jdbcType=VARCHAR}, #{item.uniqueId,jdbcType=BIGINT},#{item.batchId,jdbcType=BIGINT},
#{item.channelId,jdbcType=INTEGER},#{item.extend,jdbcType=VARCHAR},#{item.resendNum,jdbcType=INTEGER},
#{item.resendCount,jdbcType=INTEGER},#{item.userSubTime,jdbcType=BIGINT},#{item.channelSubTime,jdbcType=BIGINT},
#{item.resend,jdbcType=INTEGER},#{item.uprice,jdbcType=BIGINT},#{item.cprice,jdbcType=BIGINT},
#{item.msgid,jdbcType=VARCHAR},#{item.customId,jdbcType=VARCHAR},#{item.source,jdbcType=INTEGER},
#{item.createTime,jdbcType=TIMESTAMP},#{item.updateTime,jdbcType=TIMESTAMP})
</foreach>
</insert>
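For completeness, the mapper interface behind these two statements presumably looks roughly like the sketch below; the return types and the SmsSendMsgRpt parameter type are my guesses from the service code above:

import java.util.List;

public interface SmsSendMsgRptBackUpMapper {

    // Single-row insert queued by the BATCH-mode SqlSession in approach 2
    int insertSmsSendMsgRpt(SmsSendMsgRpt smsSendMsgRpt);

    // Multi-row insert rendered by the foreach statement in approach 3
    int batchInsertBackUp(List<SmsSendMsgRpt> smsSendMsgRpts);
}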
Some of the methods above are annotated with @Async; here is the async configuration that backs them:
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.PropertySource;
import org.springframework.scheduling.annotation.AsyncConfigurer;
import org.springframework.scheduling.annotation.EnableAsync;
import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor;
import java.util.concurrent.Executor;
/**
* @Description: Async executor configuration
* @Param:
* @return:
* @Author: fanghuaiming
* @Date:
*/
@Configuration
@EnableAsync
@PropertySource({"classpath:application-async.properties"})
public class SpringAsyncConfig implements AsyncConfigurer {
@Value("${springasync.corePoolSize}")
private int corePoolSize = Runtime.getRuntime().availableProcessors();
@Value("${springasync.maxPoolSize}")
private int maxPoolSize = Runtime.getRuntime().availableProcessors();
/**
* @Description: CPU-bound sizing: core and max pool size default to the number of available processors
* @Param:
* @return:
* @Author: fanghuaiming
* @Date:
*/
@Override
public Executor getAsyncExecutor() {
ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
executor.setCorePoolSize(corePoolSize);
executor.setMaxPoolSize(maxPoolSize);
executor.setThreadNamePrefix("HysmsScheduled-");
executor.initialize();
return executor;
}
}
1. Judged against roughly 40 million rows per day, the fastest approach is actually the raw JDBC one. The drawback: if anything goes wrong mid-insert, the connection can end up stuck and never returned, eventually starving the connection pool. As the native JDBC code above shows, I only catch the exception and handle it at the business level, and even when the JDBC connection is dealt with it is ultimately just closed rather than handed back to a pooled data source. Still, there is no denying that it is fast.
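As a hedged sketch of how that gap could be closed while keeping the raw-JDBC approach (INSERT_DTL_SQL is a hypothetical constant holding the insert statement, and the batching body is elided), the release simply moves into a finally block so the connection always goes back:

Connection conn = DataSourceUtils.getConnection(dataSource1Config.testDataSource());
try (PreparedStatement pstmt = conn.prepareStatement(INSERT_DTL_SQL)) {
    conn.setAutoCommit(false);
    // ... addBatch / executeBatch / commit exactly as in the first example ...
} catch (SQLException e) {
    LOG.error("Function:savebatchDtlBatch batch insert failed", e);
} finally {
    // Always hand the connection back, even when the batch blows up halfway
    DataSourceUtils.releaseConnection(conn, dataSource1Config.testDataSource());
}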
2. Honestly, the one I wanted to use at scale is MyBatis's ExecutorType.BATCH. It is the mechanism MyBatis itself provides, and it is convenient, concise and tidy, but in practice it turned out to be slow. From what I have picked up, the batch is effectively size-limited: the queued SQL accumulates in something like a buffer, and piling in, say, ten thousand statements is too much for it, so you either have to clear it periodically or, as in the JDBC example, flush and commit once a threshold is reached, as sketched below. There are pitfalls here; the code I posted is only guaranteed to run, not tuned, and I never got back to optimising it because other work took over. If you are interested, read up on it or grab the《深入浅出MyBatis》PDF, which I will put in my downloads section (I do not know how to make it cost zero points; if you do not want to spend points, join our discussion group, linked at the end).
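A minimal sketch of that threshold idea on the BATCH executor, reusing the names from approach 2; the 1000-row flush size is an assumption, not a measured optimum:

SqlSession sqlSession = test1SqlSessionFactory.openSession(ExecutorType.BATCH, false);
try {
    SmsSendMsgRptBackUpMapper mapper = sqlSession.getMapper(SmsSendMsgRptBackUpMapper.class);
    int queued = 0;
    for (SmsSendMsgRpt rpt : smsSendMsgRpts) {
        mapper.insertSmsSendMsgRpt(rpt);
        // Flush and commit every 1000 queued statements so the batch buffer never grows unbounded
        if (++queued % 1000 == 0) {
            sqlSession.flushStatements();
            sqlSession.commit();
            sqlSession.clearCache();
        }
    }
    // Flush and commit whatever is left over
    sqlSession.flushStatements();
    sqlSession.commit();
} finally {
    sqlSession.close();
}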
3. MyBatis dynamic SQL is what the official docs actually lean towards. For this third approach I split the work into sub-tasks dynamically myself (Fork/Join would achieve the same thing); a sketch of that kind of chunking follows this paragraph. This method only handles about 3 million of the rows, and I watched its throughput carefully (in the loose, by-eye sense of watching, not with instrumentation): the first million or so rows go in very quickly, but the remaining rows slow right down, to roughly the same level as the BATCH executor. The official MyBatis documentation says little about throughput and mostly covers usage, and the slowdown may also have to do with the dynamic SQL handling and interceptor pipeline. I have not optimised it further because the job currently runs fine; again, 《深入浅出MyBatis》 is worth reading.
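That chunking can be as simple as the sketch below; the 5000-row chunk size is an assumption and needs tuning against max_allowed_packet and latency:

// Split the full list into fixed-size chunks and run the foreach insert once per chunk
int chunkSize = 5000;
for (int from = 0; from < smsSendMsgRpts.size(); from += chunkSize) {
    int to = Math.min(from + chunkSize, smsSendMsgRpts.size());
    smsSendMsgRptBackUpMapper.batchInsertBackUp(smsSendMsgRpts.subList(from, to));
}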
This is all on MySQL; Oracle would be much the same with minor changes. My MySQL tables use the InnoDB engine. Lately a lot of blog posts with titles like "insert 10 million rows in 3 minutes" are nonsense: open them up and they are running on MyISAM, which has no record locks, no next-key locks and no gap locks, so of course it is fast. Those headlines are pure clickbait.
Also, for those 40 million rows per day the job does a lot of business work on top of the inserts, such as validating, aggregating and merging the Mongo data and burning plenty of IO generating CSV files, which is why I measured throughput by eye rather than instrumenting the code. If you want that kind of monitoring, write a custom annotation, put it on the method and time it from an AOP aspect, so only the methods you care about get monitored.
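A hedged sketch of that idea with Spring AOP; the @TimeCost annotation and the aspect are made-up names for illustration:

import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;
import org.aspectj.lang.ProceedingJoinPoint;
import org.aspectj.lang.annotation.Around;
import org.aspectj.lang.annotation.Aspect;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;

/** Put this on any method whose elapsed time should be logged. */
@Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.METHOD)
@interface TimeCost {
}

/** Times every @TimeCost method and logs how long it took. */
@Aspect
@Component
class TimeCostAspect {

    private static final Logger LOG = LoggerFactory.getLogger(TimeCostAspect.class);

    @Around("@annotation(timeCost)")
    public Object around(ProceedingJoinPoint pjp, TimeCost timeCost) throws Throwable {
        long before = System.currentTimeMillis();
        try {
            return pjp.proceed();
        } finally {
            LOG.info("{} took {} ms", pjp.getSignature().toShortString(), System.currentTimeMillis() - before);
        }
    }
}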
Summary:
Overall I personally favour the first approach (manual connection handling aside); the native route is sometimes simply more convenient, whereas with third-party frameworks you end up reading their source code to squeeze out performance. If you have questions about any of the code above, or want to download the PDF, join the discussion group: java华夏一区 (no advertising, please).