聊聊storm trident的coordinator

序

本文主要研究一下storm trident的coordinator

实例

代码示例

    /**
     * Assembles a one-spout Trident topology whose only operation prints each
     * tuple, then calls {@code build()} so the resulting topology can be
     * stepped through in a debugger.
     */
    @Test
    public void testDebugTopologyBuild(){
        Fields spoutFields = new Fields("user", "score");
        FixedBatchSpout spout = new FixedBatchSpout(spoutFields, 3,
                new Values("nickt1", 4),
                new Values("nickt2", 7),
                new Values("nickt3", 8),
                new Values("nickt4", 9),
                new Values("nickt5", 7),
                new Values("nickt6", 11),
                new Values("nickt7", 5)
        );
        spout.setCycle(false);

        // Function that only logs the incoming tuple and emits nothing.
        BaseFunction printer = new BaseFunction() {
            @Override
            public void execute(TridentTuple tuple, TridentCollector collector) {
                System.out.println("tuple:"+tuple);
            }
        };

        TridentTopology topology = new TridentTopology();
        Fields inputFields = new Fields("user", "score");
        Fields noOutputFields = new Fields();
        Stream stream1 = topology.newStream("spout1", spout)
                .each(inputFields, printer, noOutputFields);

        topology.build();
    }

这里使用的spout为FixedBatchSpout，它是IBatchSpout类型

拓扑图

MasterBatchCoordinator

storm-1.2.2/storm-core/src/jvm/org/apache/storm/trident/topology/MasterBatchCoordinator.java

public class MasterBatchCoordinator extends BaseRichSpout { 
    public static final Logger LOG = LoggerFactory.getLogger(MasterBatchCoordinator.class);
    
    public static final long INIT_TXID = 1L;
    
    
    public static final String BATCH_STREAM_ID = "$batch";
    public static final String COMMIT_STREAM_ID = "$commit";
    public static final String SUCCESS_STREAM_ID = "$success";

    private static final String CURRENT_TX = "currtx";
    private static final String CURRENT_ATTEMPTS = "currattempts";
    
    private List _states = new ArrayList();
    
    TreeMap _activeTx = new TreeMap();
    TreeMap _attemptIds;
    
    private SpoutOutputCollector _collector;
    Long _currTransaction;
    int _maxTransactionActive;
    
    List _coordinators = new ArrayList();
    
    
    List _managedSpoutIds;
    List _spouts;
    WindowedTimeThrottler _throttler;
    
    boolean _active = true;
    
    public MasterBatchCoordinator(List spoutIds, List spouts) {
        if(spoutIds.isEmpty()) {
            throw new IllegalArgumentException("Must manage at least one spout");
        }
        _managedSpoutIds = spoutIds;
        _spouts = spouts;
        LOG.debug("Created {}", this);
    }

    public List getManagedSpoutIds(){
        return _managedSpoutIds;
    }

    @Override
    public void activate() {
        _active = true;
    }

    @Override
    public void deactivate() {
        _active = false;
    }
        
    @Override
    public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
        _throttler = new WindowedTimeThrottler((Number)conf.get(Config.TOPOLOGY_TRIDENT_BATCH_EMIT_INTERVAL_MILLIS), 1);
        for(String spoutId: _managedSpoutIds) {
            _states.add(TransactionalState.newCoordinatorState(conf, spoutId));
        }
        _currTransaction = getStoredCurrTransaction();

        _collector = collector;
        Number active = (Number) conf.get(Config.TOPOLOGY_MAX_SPOUT_PENDING);
        if(active==null) {
            _maxTransactionActive = 1;
        } else {
            _maxTransactionActive = active.intValue();
        }
        _attemptIds = getStoredCurrAttempts(_currTransaction, _maxTransactionActive);

        
        for(int i=0; i<_spouts.size(); i++) {
            String txId = _managedSpoutIds.get(i);
            _coordinators.add(_spouts.get(i).getCoordinator(txId, conf, context));
        }
        LOG.debug("Opened {}", this);
    }

    @Override
    public void close() {
        for(TransactionalState state: _states) {
            state.close();
        }
        LOG.debug("Closed {}", this);
    }
    
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // in partitioned example, in case an emitter task receives a later transaction than it's emitted so far,
        // when it sees the earlier txid it should know to emit nothing
        declarer.declareStream(BATCH_STREAM_ID, new Fields("tx"));
        declarer.declareStream(COMMIT_STREAM_ID, new Fields("tx"));
        declarer.declareStream(SUCCESS_STREAM_ID, new Fields("tx"));
    }

    @Override
    public Map getComponentConfiguration() {
        Config ret = new Config();
        ret.setMaxTaskParallelism(1);
        ret.registerSerialization(TransactionAttempt.class);
        return ret;
    }

    //......
}

open方法首先从Config.TOPOLOGY_TRIDENT_BATCH_EMIT_INTERVAL_MILLIS(topology.trident.batch.emit.interval.millis，在defaults.yaml默认为500)读取触发batch的频率配置，然后创建WindowedTimeThrottler，其maxAmt值为1
这里使用TransactionalState在zookeeper上维护transactional状态
之后读取Config.TOPOLOGY_MAX_SPOUT_PENDING(topology.max.spout.pending，在defaults.yaml中默认为null)设置_maxTransactionActive，如果为null，则设置为1

MasterBatchCoordinator.nextTuple

storm-1.2.2/storm-core/src/jvm/org/apache/storm/trident/topology/MasterBatchCoordinator.java

    /**
     * Spout polling hook; all work is delegated to {@code sync()}.
     */
    @Override
    public void nextTuple() {
        sync();
    }

    /**
     * Advances the coordinator in two steps. First, if the transaction at
     * {@code _currTransaction} has status PROCESSED, it is moved to COMMITTING
     * and announced on the commit stream. Second, while the spout is active and
     * fewer than {@code _maxTransactionActive} transactions are tracked, a new
     * attempt is emitted on the batch stream for every candidate transaction id
     * that is not yet tracked and for which {@code isReady} returns true.
     */
    private void sync() {
        // The number of active transactions can stay below max_spout_pending. Example with
        // max_spout_pending = 3: tx 1, 2, 3 are active and tx 2 is acked. Tx 2 cannot commit
        // (tx 1 has not committed yet) and tx 4 cannot start (three transactions are still active).
        TransactionStatus head = _activeTx.get(_currTransaction);
        boolean readyToCommit = head != null && head.status == AttemptStatus.PROCESSED;
        if (readyToCommit) {
            head.status = AttemptStatus.COMMITTING;
            _collector.emit(COMMIT_STREAM_ID, new Values(head.attempt), head.attempt);
            LOG.debug("Emitted on [stream = {}], [tx_status = {}], [{}]", COMMIT_STREAM_ID, head, this);
        }

        if (!_active || _activeTx.size() >= _maxTransactionActive) {
            return;
        }

        Long txid = _currTransaction;
        for (int slot = 0; slot < _maxTransactionActive; slot++, txid = nextTransactionId(txid)) {
            if (_activeTx.containsKey(txid) || !isReady(txid)) {
                continue;
            }

            // Attempt ids only ever increase, so downstream tasks can drop state for an
            // older attempt as soon as they see a higher attempt id for the same transaction.
            Integer lastAttempt = _attemptIds.get(txid);
            Integer attemptId = (lastAttempt == null) ? 0 : lastAttempt + 1;
            _attemptIds.put(txid, attemptId);
            for (TransactionalState state : _states) {
                state.setData(CURRENT_ATTEMPTS, _attemptIds);
            }

            TransactionAttempt attempt = new TransactionAttempt(txid, attemptId);
            final TransactionStatus freshStatus = new TransactionStatus(attempt);
            _activeTx.put(txid, freshStatus);
            _collector.emit(BATCH_STREAM_ID, new Values(attempt), attempt);
            LOG.debug("Emitted on [stream = {}], [tx_attempt = {}], [tx_status = {}], [{}]", BATCH_STREAM_ID, attempt, freshStatus, this);
            _throttler.markEvent();
        }
    }

nextTuple就是调用sync方法，该方法在ack及fail中均有调用；sync方法首先根据事务状态，如果需要提交，则会往MasterBatchCoordinator.COMMIT_STREAM_ID($commit)发送tuple；之后根据_maxTransactionActive以及WindowedTimeThrottler限制(该限流判断位于isReady方法中，上面的代码片段未展示)，符合要求才启动新的TransactionAttempt，往MasterBatchCoordinator.BATCH_STREAM_ID($batch)发送tuple，同时调用_throttler.markEvent()对WindowedTimeThrottler标记下windowEvent数量

MasterBatchCoordinator.ack

storm-1.2.2/storm-core/src/jvm/org/apache/storm/trident/topology/MasterBatchCoordinator.java

    /**
     * Handles the ack of a tuple whose message id is a {@link TransactionAttempt}.
     * Acks for attempts that are no longer the tracked attempt of their
     * transaction are ignored. Otherwise a PROCESSING attempt becomes PROCESSED,
     * while a COMMITTING attempt is removed from the tracking maps, announced on
     * the success stream, and the current transaction id is advanced and stored.
     * {@code sync()} runs afterwards in both cases.
     */
    @Override
    public void ack(Object msgId) {
        TransactionAttempt tx = (TransactionAttempt) msgId;
        Long txid = tx.getTransactionId();
        TransactionStatus status = _activeTx.get(txid);
        LOG.debug("Ack. [tx_attempt = {}], [tx_status = {}], [{}]", tx, status, this);

        boolean isTrackedAttempt = status != null && tx.equals(status.attempt);
        if (!isTrackedAttempt) {
            return;
        }

        if (status.status == AttemptStatus.PROCESSING) {
            status.status = AttemptStatus.PROCESSED;
            LOG.debug("Changed status. [tx_attempt = {}] [tx_status = {}]", tx, status);
        } else if (status.status == AttemptStatus.COMMITTING) {
            _activeTx.remove(txid);
            _attemptIds.remove(txid);
            _collector.emit(SUCCESS_STREAM_ID, new Values(tx));
            _currTransaction = nextTransactionId(txid);
            for (TransactionalState state : _states) {
                state.setData(CURRENT_TX, _currTransaction);
            }
            LOG.debug("Emitted on [stream = {}], [tx_attempt = {}], [tx_status = {}], [{}]", SUCCESS_STREAM_ID, tx, status, this);
        }
        sync();
    }

ack主要是根据当前事务状态进行不同操作，如果之前是AttemptStatus.PROCESSING状态，则更新为AttemptStatus.PROCESSED；如果之前是AttemptStatus.COMMITTING，则移除当前事务，然后往MasterBatchCoordinator.SUCCESS_STREAM_ID($success)发送tuple，更新_currTransaction为nextTransactionId；最后再调用sync触发新的TransactionAttempt

MasterBatchCoordinator.fail

storm-1.2.2/storm-core/src/jvm/org/apache/storm/trident/topology/MasterBatchCoordinator.java

    /**
     * Handles the failure of a tuple whose message id is a {@link TransactionAttempt}.
     * The entry for the attempt's transaction id is always removed from
     * {@code _activeTx}. Only when the removed entry belonged to this exact
     * attempt are all entries from that transaction id onwards cleared and
     * {@code sync()} invoked.
     */
    @Override
    public void fail(Object msgId) {
        TransactionAttempt tx = (TransactionAttempt) msgId;
        Long txid = tx.getTransactionId();
        TransactionStatus removed = _activeTx.remove(txid);
        LOG.debug("Fail. [tx_attempt = {}], [tx_status = {}], [{}]", tx, removed, this);

        boolean wasTrackedAttempt = removed != null && tx.equals(removed.attempt);
        if (!wasTrackedAttempt) {
            return;
        }

        // tailMap is a live view, so clearing it drops those entries from _activeTx itself.
        _activeTx.tailMap(txid).clear();
        sync();
    }

fail方法将当前事务从_activeTx中移除，然后清空_activeTx中txId大于这个失败txId的数据，最后再调用sync判断是否该触发新的TransactionAttempt(注意这里没有变更_currTransaction，因而sync方法触发新的TransactionAttempt的_txid还是当前这个失败的_currTransaction)

TridentSpoutCoordinator

storm-1.2.2/storm-core/src/jvm/org/apache/storm/trident/spout/TridentSpoutCoordinator.java

public class TridentSpoutCoordinator implements IBasicBolt {
    public static final Logger LOG = LoggerFactory.getLogger(TridentSpoutCoordinator.class);
    private static final String META_DIR = "meta";

    ITridentSpout

聊聊storm trident的coordinator

序

实例

代码示例

拓扑图

MasterBatchCoordinator

MasterBatchCoordinator.nextTuple

MasterBatchCoordinator.ack

MasterBatchCoordinator.fail

TridentSpoutCoordinator

TridentBoltExecutor

TridentBoltExecutor.execute

TridentBoltExecutor.checkFinish

TridentSpoutExecutor

FixedBatchSpout

TridentTopology.newStream

小结

doc

你可能感兴趣的:(storm)