轻量级Master-Worker 模型实现多线程的ETL

背景

在业务开发中, 我们通常会遇到需要将数据库中的某些数据转换成另一种数据的场景, 即 ETL. 一般我们会把这个 ETL 过程改成多线程的, 以提高处理速度. 本文总结一种实现多线程 ETL 的方法.

抽象

使用生产者-消费者模型(类似内存消息队列)+ 多线程模式进行抽象, 各角色如下:

生产者: Producer

抽取数据的抽象, 即数据产生端.

容器: Container

暂存生产者所产生的数据, 等待消费者来拉取数据.

消费者: Consumer

负责拉取容器的数据. 然后由Master分派给具体的Worker来执行

  • 管理者 Master
    生成Task,分配Worker
  • 工人 Worker
    具体执行任务的线程

相关代码

  • main
/**
 * Starts one ETL run: a producer task that fills a shared {@link Container}
 * and a consumer task that drains it and hands batches to a {@link Master}.
 *
 * <p>The method returns immediately; both tasks run on their own
 * single-thread executors. Each executor is shut down right after its task is
 * submitted so that its thread terminates once the task finishes instead of
 * staying parked forever (one leaked thread pair per call otherwise).
 */
public void refresh() {
    Container container = new Container(new Flag(false, false));

    // Producer: pulls rows and pushes them into the container.
    java.util.concurrent.ExecutorService producerExecutor =
            Executors.newSingleThreadExecutor(new NameThreadFactory("pull-data"));
    producerExecutor.execute(new Producer(container, cityPointMapper));
    // shutdown() does not cancel the task submitted above; it only rejects new
    // tasks and lets the thread exit when the task is done.
    producerExecutor.shutdown();

    // Consumer: drains the container and dispatches batches through the master.
    java.util.concurrent.ExecutorService consumerExecutor =
            Executors.newSingleThreadExecutor(new NameThreadFactory("consumer-data"));
    consumerExecutor.execute(new Consumer(container, new Master(40, cityPointMapper, amapService)));
    consumerExecutor.shutdown();
}
  • Producer
/**
 * Pulls data from the database and puts it into the {@link Container}.
 *
 * <p>When the query callback has been fed every result (or the query fails),
 * the container's {@code tableEnd} flag is raised so the consumer side knows
 * that no more data will arrive.
 */
public static class Producer implements Runnable {

    Container container;

    CityPointMapper cityPointMapper;

    /**
     * @param container       buffer that receives every result object
     * @param cityPointMapper mapper used to run the query
     */
    public Producer(Container container, CityPointMapper cityPointMapper) {
        this.container = container;
        this.cityPointMapper = cityPointMapper;
    }

    /**
     * Runs the query and pushes each result object handed to the callback
     * into the container, then marks the table as fully read.
     */
    @Override
    public void run() {
        try {
            cityPointMapper.selectCityGeoPoint(new QueryWrapper<>(), resultContext -> {
                CityGeoPoint resultObject = resultContext.getResultObject();
                container.push(resultObject);
            });
        } finally {
            // Always signal end-of-table, even when the query or a push throws:
            // Container.pull keeps waiting for more data until this flag is set,
            // so skipping it on failure would leave the consumer waiting forever.
            Flag flag = container.getFlag();
            flag.setTableEnd(true);
        }
    }
}
  • Container
/**
 * Bounded buffer between the producer and the consumer.
 *
 * <p>The producer adds items with {@link #push(CityGeoPoint)}; the consumer
 * takes them in batches with {@link #pull(int)}. The shared {@link Flag}
 * carries the end-of-input ({@code tableEnd}) and end-of-drain
 * ({@code pollEnd}) signals.
 */
public static class Container {

    /** How long one poll attempt in {@link #pull(int)} waits on an empty queue before re-checking the flags. */
    private static final long POLL_WAIT_MILLIS = 100L;

    // The choice of queue matters: a bounded blocking queue makes put() block
    // when full, which gives back-pressure on the producer.
    ArrayBlockingQueue<CityGeoPoint> queue = new ArrayBlockingQueue<>(2000, false);

    Flag flag;

    /**
     * @param flag shared state object used to signal end-of-input and end-of-drain
     */
    public Container(Flag flag) {
        this.flag = flag;
    }

    /**
     * Adds one item, blocking while the queue is full.
     *
     * @param resultObject item to enqueue
     * @throws RuntimeException wrapping the {@link InterruptedException} if the
     *                          calling thread is interrupted while waiting; the
     *                          thread's interrupt status is restored first
     */
    public void push(CityGeoPoint resultObject) {
        try {
            queue.put(resultObject);
        } catch (InterruptedException e) {
            // Restore the interrupt status so code further up can still see it.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
    }

    /**
     * Collects up to {@code num} items from the queue.
     *
     * <p>While the producer is still running ({@code tableEnd} not set) this
     * waits for more items instead of returning a short batch. The wait uses a
     * timed poll rather than a spin loop so an empty queue does not burn CPU.
     * Once {@code tableEnd} is set and the queue is empty, {@code pollEnd} is
     * raised and whatever has been collected so far is returned.
     *
     * @param num maximum number of items to return; values {@code <= 0} yield an empty list
     * @return the collected items, possibly fewer than {@code num} at end of input
     * @throws RuntimeException wrapping the {@link InterruptedException} if the
     *                          calling thread is interrupted while waiting; the
     *                          thread's interrupt status is restored first
     */
    public List<CityGeoPoint> pull(int num) {
        List<CityGeoPoint> data = new ArrayList<>();
        try {
            while (data.size() < num) {
                CityGeoPoint item = queue.poll(POLL_WAIT_MILLIS, TimeUnit.MILLISECONDS);
                if (item != null) {
                    data.add(item);
                    continue;
                }
                if (!getFlag().isTableEnd()) {
                    // Nothing available yet but the producer is still running: wait again.
                    continue;
                }
                // The producer may have enqueued its last item between the poll
                // above and the flag read, so look once more before giving up.
                CityGeoPoint last = queue.poll();
                if (last == null) {
                    // Queue fully drained and no more input: notify the puller.
                    getFlag().setPollEnd(true);
                    break;
                }
                data.add(last);
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
        return data;
    }

    /** @return the shared state flags */
    public Flag getFlag() {
        return flag;
    }

    /** @param flag replacement for the shared state flags */
    public void setFlag(Flag flag) {
        this.flag = flag;
    }
}
  • Consumer
/**
 * Drains the {@link Container} in batches and passes every batch to the
 * {@link Master}, which dispatches it to a worker.
 */
public static class Consumer implements Runnable {

    Master master;

    Container container;

    /**
     * @param container buffer to pull batches from
     * @param master    dispatcher that receives each batch
     */
    public Consumer(Container container, Master master) {
        this.container = container;
        this.master = master;
    }

    /**
     * Pulls batches of up to 20 items and forwards each one to the master.
     * The {@code pollEnd} flag is checked after every batch, so the batch
     * that was being collected when the flag was raised is still forwarded.
     */
    @Override
    public void run() {
        for (;;) {
            // Fetch one batch and let the master assign it to a worker.
            List<CityGeoPoint> batch = container.pull(20);
            master.assignment(batch);

            if (container.getFlag().isPollEnd()) {
                break;
            }
        }

        log.info("pull data end.");
    }
}
  • Master
/**
 * Turns batches of data into {@link Worker} tasks and runs them on a thread pool.
 */
public static class Master {

    int worker;

    CityPointMapper cityPointMapper;

    AmapService amapService;

    ThreadPoolExecutor workerPool;

    /**
     * @param worker          core size of the worker pool (the maximum is twice this value)
     * @param cityPointMapper mapper passed on to every worker
     * @param amapService     service passed on to every worker
     */
    public Master(int worker, CityPointMapper cityPointMapper, AmapService amapService) {
        this.worker = worker;
        this.cityPointMapper = cityPointMapper;
        this.amapService = amapService;
        // Build the thread pool.
        buildWorkerPool(this.worker);
    }

    /**
     * Creates the worker pool. The tasks are I/O only (calling the AMap API,
     * updating the database), so the pool may be sized fairly generously.
     */
    private void buildWorkerPool(int worker) {
        int coreSize = worker;
        int maxSize = worker * 2;
        SynchronousQueue<Runnable> handoff = new SynchronousQueue<>();
        NameThreadFactory threads = new NameThreadFactory("worker");
        ThreadPoolExecutor.CallerRunsPolicy whenSaturated = new ThreadPoolExecutor.CallerRunsPolicy();

        this.workerPool = new ThreadPoolExecutor(
                coreSize, maxSize, 60L, TimeUnit.SECONDS, handoff, threads, whenSaturated);
    }

    /**
     * Submits one batch for processing. Null or empty batches are ignored.
     *
     * @param data batch to process
     */
    void assignment(List<CityGeoPoint> data) {
        if (CollectionUtils.isEmpty(data)) {
            return;
        }
        // Run the task.
        workerPool.execute(new Worker(cityPointMapper, amapService, data));
    }
}
  • Worker
/**
 * Processes one batch: fetches address details for the batch and writes each
 * result back through the mapper, retrying once on a duplicate-key failure.
 */
public static class Worker implements Runnable {

    CityPointMapper cityPointMapper;

    AmapService amapService;

    List<CityGeoPoint> data;

    /**
     * @param cityPointMapper mapper used for the updates
     * @param amapService     service that supplies the address details
     * @param data            batch to process
     */
    public Worker(CityPointMapper cityPointMapper, AmapService amapService, List<CityGeoPoint> data) {
        this.cityPointMapper = cityPointMapper;
        this.amapService = amapService;
        this.data = data;
    }

    /**
     * First pass: update every point; on a duplicate key, nudge the point's
     * coordinates by 0.001 and queue it for a retry. Second pass: retry the
     * queued points via {@code updateDetailFaile}; ids that hit a duplicate
     * key again are reported as failed in the log line.
     */
    @Override
    public void run() {
        List<CityGeoPoint> detailed = amapService.getAddressDetail(data);
        List<Long> okIds = new ArrayList<>();
        List<CityGeoPoint> retryQueue = new ArrayList<>();

        for (CityGeoPoint point : detailed) {
            try {
                cityPointMapper.updateDetail(point);
                okIds.add(point.getId());
            } catch (DuplicateKeyException duplicate) {
                Point location = point.getGeo();
                location.setX(location.getX() + 0.001);
                location.setY(location.getY() + 0.001);
                retryQueue.add(point);
            }
        }

        // Retry with the shifted geo coordinates.
        List<Long> failedIds = new ArrayList<>();
        for (CityGeoPoint point : retryQueue) {
            try {
                cityPointMapper.updateDetailFaile(point);
                okIds.add(point.getId());
            } catch (DuplicateKeyException duplicate) {
                failedIds.add(point.getId());
            }
        }

        log.info("process success size:" + okIds.size() + "  failed size: " + failedIds.size() + " failed ids: " + failedIds);
    }
}
  • NameThreadFactory
/**
 * Thread factory that creates daemon threads named {@code <name>.<n>}, where
 * {@code n} starts at 0 and increases by one per created thread.
 */
public static class NameThreadFactory implements ThreadFactory {

    private final AtomicInteger sequence = new AtomicInteger(0);

    private final String prefix;

    /**
     * @param name base name for the threads; a trailing {@code "."} is appended
     *             when the name does not already end with one
     */
    public NameThreadFactory(String name) {
        this.prefix = name.endsWith(".") ? name : name + ".";
    }

    /**
     * Creates a daemon thread for {@code r} named with the prefix followed by
     * the next sequence number.
     */
    @Override
    public Thread newThread(Runnable r) {
        Thread created = new Thread(r, prefix + sequence.getAndIncrement());
        created.setDaemon(true);
        return created;
    }
}
  • 状态
/**
 * Shared state between the producer and consumer sides. Both fields are
 * {@code volatile}, so a write by one thread is visible to reads by another.
 */
public static class Flag {

    /** Set once the producer has finished reading its input. */
    volatile boolean tableEnd;

    /** Set once the container has been fully drained after the input ended. */
    volatile boolean pollEnd;

    /**
     * @param tableEnd initial value of the end-of-input flag
     * @param pollEnd  initial value of the end-of-drain flag
     */
    public Flag(boolean tableEnd, boolean pollEnd) {
        this.pollEnd = pollEnd;
        this.tableEnd = tableEnd;
    }

    // ---- end-of-input flag ----

    /** @return the current end-of-input flag */
    public boolean isTableEnd() {
        return this.tableEnd;
    }

    /** @param tableEnd new value of the end-of-input flag */
    public void setTableEnd(boolean tableEnd) {
        this.tableEnd = tableEnd;
    }

    // ---- end-of-drain flag ----

    /** @return the current end-of-drain flag */
    public boolean isPollEnd() {
        return this.pollEnd;
    }

    /** @param pollEnd new value of the end-of-drain flag */
    public void setPollEnd(boolean pollEnd) {
        this.pollEnd = pollEnd;
    }
}

总结

  • 没有抽象 Message 对象.
  • Producer 没有做多线程.

你可能感兴趣的:(设计模式,java,开发语言)