在业务开发中, 通常我们会遇到需要将数据库中的某些数据转换成另一种数据的场景, 即 ETL. 一般我们会把这个 ETL 过程改成多线程的, 以提高处理速度. 本文总结一种进行多线程 ETL 的方法.
使用生产者和消费者模型(类似内存消息队列)+多线程模式抽象,
抽取数据的抽象, 即数据产生端.
暂存生产者所产生的数据, 等待消费者来拉取数据.
负责拉取容器的数据. 然后由Master分派给具体的Worker来执行
/**
 * Starts one ETL pass: a producer thread streams rows into a shared
 * {@link Container}, and a consumer thread drains it and hands batches to a
 * {@link Master} configured with 40 workers.
 *
 * <p>The method returns as soon as both tasks have been submitted; it does not
 * wait for the pass to finish.
 */
public void refresh() {
    Container container = new Container(new Flag(false, false));

    // Producer side.
    java.util.concurrent.ExecutorService producerExecutor =
            Executors.newSingleThreadExecutor(new NameThreadFactory("pull-data"));
    producerExecutor.execute(new Producer(container, cityPointMapper));
    // shutdown() does not cancel the submitted task; it only lets the thread
    // exit once the task completes, so repeated calls do not pile up idle threads.
    producerExecutor.shutdown();

    // Consumer side.
    java.util.concurrent.ExecutorService consumerExecutor =
            Executors.newSingleThreadExecutor(new NameThreadFactory("consumer-data"));
    consumerExecutor.execute(new Consumer(container, new Master(40, cityPointMapper, amapService)));
    consumerExecutor.shutdown();
}
/**
 * Producer: streams rows from the database through {@code CityPointMapper}
 * and pushes each one into the {@link Container}.
 *
 * <p>When the query is done (normally or with an exception) the container's
 * {@code tableEnd} flag is raised so the consuming side knows no more rows
 * will arrive.
 */
public static class Producer implements Runnable {
    Container container;
    CityPointMapper cityPointMapper;

    /**
     * @param container       buffer that receives every row read
     * @param cityPointMapper mapper used to stream rows
     */
    public Producer(Container container, CityPointMapper cityPointMapper) {
        this.container = container;
        this.cityPointMapper = cityPointMapper;
    }

    @Override
    public void run() {
        try {
            cityPointMapper.selectCityGeoPoint(new QueryWrapper<>(), resultContext -> {
                CityGeoPoint resultObject = resultContext.getResultObject();
                // push() blocks while the buffer is full, throttling the read.
                container.push(resultObject);
            });
        } finally {
            // Always signal end-of-table, even if the query or a push failed;
            // otherwise the consumer would wait forever for rows that never come.
            Flag flag = container.getFlag();
            flag.setTableEnd(true);
        }
    }
}
/**
 * Bounded buffer between the producer and the consumer.
 *
 * <p>Rows are held in an {@link ArrayBlockingQueue} of capacity 2000, so
 * {@link #push} blocks when the buffer is full (back-pressure on the producer).
 * The shared {@link Flag} carries the two end-of-stream signals: {@code tableEnd}
 * (set by the producer) and {@code pollEnd} (set here once the buffer has been
 * fully drained after {@code tableEnd}).
 */
public static class Container {
    // The queue choice matters: it should support back-pressure.
    ArrayBlockingQueue<CityGeoPoint> queue = new ArrayBlockingQueue<>(2000, false);
    Flag flag;

    /**
     * @param flag shared state used to signal end of production / end of draining
     */
    public Container(Flag flag) {
        this.flag = flag;
    }

    /**
     * Adds one row to the buffer, blocking while the buffer is full.
     *
     * @param resultObject row to enqueue
     * @throws RuntimeException wrapping the {@link InterruptedException} if the
     *                          calling thread is interrupted while waiting; the
     *                          thread's interrupt status is restored first
     */
    public void push(CityGeoPoint resultObject) {
        try {
            queue.put(resultObject);
        } catch (InterruptedException e) {
            // Restore the interrupt status so code further up the stack can see it.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
    }

    /**
     * Collects up to {@code num} rows from the buffer.
     *
     * <p>The call waits until either {@code num} rows have been gathered or the
     * producer has signalled {@code tableEnd} and the buffer is empty. In the
     * latter case {@code pollEnd} is set and the (possibly empty) partial batch
     * is returned.
     *
     * <p>If the calling thread is interrupted while waiting, the interrupt status
     * is restored, {@code pollEnd} is set, and the rows gathered so far are
     * returned so that they are not lost.
     *
     * @param num maximum number of rows to return
     * @return the rows gathered, never {@code null}
     */
    public List<CityGeoPoint> pull(int num) {
        List<CityGeoPoint> data = new ArrayList<>();
        while (data.size() < num) {
            CityGeoPoint next;
            try {
                // Timed wait instead of a non-blocking poll in a tight loop, so an
                // empty buffer does not burn a CPU core.
                next = queue.poll(100, TimeUnit.MILLISECONDS);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                getFlag().setPollEnd(true);
                break;
            }
            if (next != null) {
                data.add(next);
                continue;
            }
            if (!getFlag().isTableEnd()) {
                // Producer is still running; keep waiting for more rows.
                continue;
            }
            // tableEnd was observed after an empty poll: the producer may have
            // enqueued its last row in between, so check once more.
            CityGeoPoint last = queue.poll();
            if (last == null) {
                // Fully drained: tell the consumer loop to stop.
                getFlag().setPollEnd(true);
                break;
            }
            data.add(last);
        }
        return data;
    }

    public Flag getFlag() {
        return flag;
    }

    public void setFlag(Flag flag) {
        this.flag = flag;
    }
}
/**
 * Consumer: repeatedly pulls a batch of rows from the {@link Container} and
 * passes it to the {@link Master}, which dispatches it to a worker.
 */
public static class Consumer implements Runnable {
    Master master;
    Container container;

    /**
     * @param container buffer to drain
     * @param master    dispatcher that receives every batch
     */
    public Consumer(Container container, Master master) {
        this.container = container;
        this.master = master;
    }

    @Override
    public void run() {
        while (true) {
            // Pull one batch of up to 20 rows and hand it to the master.
            List<CityGeoPoint> batch = container.pull(20);
            master.assignment(batch);
            // The flag is checked only after the batch has been dispatched, so
            // the final (possibly partial) batch is still processed.
            if (container.getFlag().isPollEnd()) {
                break;
            }
        }
        log.info("pull data end.");
    }
}
/**
 * Dispatcher: owns the worker thread pool and turns each batch of rows into a
 * {@link Worker} task.
 */
public static class Master {
    int worker;
    CityPointMapper cityPointMapper;
    AmapService amapService;
    ThreadPoolExecutor workerPool;

    /**
     * @param worker          core size of the worker pool (the maximum is twice this)
     * @param cityPointMapper mapper handed to every {@link Worker}
     * @param amapService     service handed to every {@link Worker}
     */
    public Master(int worker, CityPointMapper cityPointMapper, AmapService amapService) {
        this.worker = worker;
        this.cityPointMapper = cityPointMapper;
        this.amapService = amapService;
        // Build the thread pool.
        buildWorkerPool(this.worker);
    }

    /**
     * Creates the worker pool.
     *
     * <p>The work is all I/O (calling the Amap API, updating the database), so
     * the pool may be sized generously. A {@link SynchronousQueue} has no
     * capacity, so every task is handed straight to a thread; once all
     * {@code worker * 2} threads are busy, {@code CallerRunsPolicy} makes the
     * submitting thread run the task itself, which slows down submission.
     */
    private void buildWorkerPool(int worker) {
        this.workerPool = new ThreadPoolExecutor(worker, worker * 2, 60L, TimeUnit.SECONDS,
                new SynchronousQueue<>(), new NameThreadFactory("worker"), new ThreadPoolExecutor.CallerRunsPolicy());
        // Nothing ever shuts this pool down, so let idle core threads expire after
        // the 60s keep-alive too; otherwise every Master instance would keep its
        // core threads alive indefinitely.
        this.workerPool.allowCoreThreadTimeOut(true);
    }

    /**
     * Submits one batch for processing. Null or empty batches are ignored.
     *
     * @param data rows to process
     */
    void assignment(List<CityGeoPoint> data) {
        if (!CollectionUtils.isEmpty(data)) {
            Worker worker = new Worker(cityPointMapper, amapService, data);
            // Run the task.
            workerPool.execute(worker);
        }
    }
}
/**
 * Worker: processes one batch of rows.
 *
 * <p>The batch is sent to {@code AmapService.getAddressDetail}, and each
 * returned row is written with {@code updateDetail}. A row that hits a
 * {@code DuplicateKeyException} has its geo point shifted by 0.001 on both axes
 * and is retried once through {@code updateDetailFaile}; rows that fail again
 * are only reported in the log.
 */
public static class Worker implements Runnable {
    CityPointMapper cityPointMapper;
    AmapService amapService;
    List<CityGeoPoint> data;

    /**
     * @param cityPointMapper mapper used for the updates
     * @param amapService     service used to resolve address details
     * @param data            batch of rows to process
     */
    public Worker(CityPointMapper cityPointMapper, AmapService amapService, List<CityGeoPoint> data) {
        this.cityPointMapper = cityPointMapper;
        this.amapService = amapService;
        this.data = data;
    }

    @Override
    public void run() {
        List<CityGeoPoint> resolved = amapService.getAddressDetail(data);
        List<Long> success = new ArrayList<>();
        List<CityGeoPoint> toRetry = new ArrayList<>();

        // First pass: plain update; on a duplicate key, nudge the point and queue a retry.
        for (CityGeoPoint point : resolved) {
            try {
                cityPointMapper.updateDetail(point);
                success.add(point.getId());
            } catch (DuplicateKeyException e) {
                Point shifted = point.getGeo();
                shifted.setX(shifted.getX() + 0.001);
                shifted.setY(shifted.getY() + 0.001);
                toRetry.add(point);
            }
        }

        // Second pass: retry with the shifted geo point. Iterating an empty list
        // is a no-op, so no emptiness check is needed.
        List<Long> failedLong = new ArrayList<>();
        for (CityGeoPoint point : toRetry) {
            try {
                cityPointMapper.updateDetailFaile(point);
                success.add(point.getId());
            } catch (DuplicateKeyException e) {
                failedLong.add(point.getId());
            }
        }

        String summary = "process success size:" + success.size() + " failed size: " + failedLong.size() + " failed ids: " + failedLong;
        log.info(summary);
    }
}
/**
 * Thread factory that produces daemon threads named {@code <name>.<n>}, where
 * {@code n} starts at 0 and increases by one per thread created by this factory.
 */
public static class NameThreadFactory implements ThreadFactory {
    private final AtomicInteger sequence = new AtomicInteger(0);
    private final String prefix;

    /**
     * @param name base name for the threads; a trailing {@code "."} is appended
     *             when it is not already present
     */
    public NameThreadFactory(String name) {
        this.prefix = name.endsWith(".") ? name : name + ".";
    }

    /**
     * Creates a new daemon thread for {@code r} with the next numbered name.
     */
    @Override
    public Thread newThread(Runnable r) {
        Thread created = new Thread(r, prefix + sequence.getAndIncrement());
        created.setDaemon(true);
        return created;
    }
}
/**
 * Shared pipeline state: two independent boolean signals.
 *
 * <p>Both fields are {@code volatile}, so a write made by one thread is visible
 * to reads made by another.
 */
public static class Flag {
    // Set once the producing side has finished reading the table.
    volatile boolean tableEnd;
    // Set once the buffer has been fully drained after tableEnd.
    volatile boolean pollEnd;

    /**
     * @param tableEnd initial value of the "producer finished" signal
     * @param pollEnd  initial value of the "buffer drained" signal
     */
    public Flag(boolean tableEnd, boolean pollEnd) {
        this.pollEnd = pollEnd;
        this.tableEnd = tableEnd;
    }

    /** @return current value of the "producer finished" signal */
    public boolean isTableEnd() {
        return this.tableEnd;
    }

    /** @return current value of the "buffer drained" signal */
    public boolean isPollEnd() {
        return this.pollEnd;
    }

    /** @param tableEnd new value of the "producer finished" signal */
    public void setTableEnd(boolean tableEnd) {
        this.tableEnd = tableEnd;
    }

    /** @param pollEnd new value of the "buffer drained" signal */
    public void setPollEnd(boolean pollEnd) {
        this.pollEnd = pollEnd;
    }
}