Storm处理Stream Join的简单实例

转自:http://blog.csdn.net/kaitankedemao/article/details/50381023

本文分析的源码是storm-starter中的SingleJoinExample,它对两个简单的流进行聚合:[id,gender]和[id,age]经过join后得到[id,gender,age]

分析过程直接写在注释里面,所以就不再分离出来。

SimpleJoinBolt.java
[java]  view plain  copy
 print ?
  1. package com.wxt.storm.SingleJoinExample.bolt;  
  2.   
  3. import java.util.ArrayList;  
  4. import java.util.HashMap;  
  5. import java.util.HashSet;  
  6. import java.util.List;  
  7. import java.util.Map;  
  8. import java.util.Set;  
  9.   
  10. import backtype.storm.Config;  
  11. import backtype.storm.generated.GlobalStreamId;  
  12. import backtype.storm.task.OutputCollector;  
  13. import backtype.storm.task.TopologyContext;  
  14. import backtype.storm.topology.OutputFieldsDeclarer;  
  15. import backtype.storm.topology.base.BaseRichBolt;  
  16. import backtype.storm.tuple.Fields;  
  17. import backtype.storm.tuple.Tuple;  
  18. import backtype.storm.utils.TimeCacheMap;  
  19.   
  20. public class SimpleJoinBolt extends BaseRichBolt {  
  21.     private OutputCollector _collector;  
  22.     private Fields _outFields;  
  23.     private Fields _idFields;  
  24.     int _numSources;  
  25.     Map _fieldLocations;  
  26.       
  27.     //在内存中保留近期活跃的对象  
  28.     //由于bolt在接收两个数据源的流数据时,同一id两个数据流很可能不会再一个时间点同时发出,因此需要对数据流先进行缓存,直到所有  
  29.     //id相同的数据源都被后被聚合处理,做完聚合处理后再将对应的tuple信息从缓存中删除。在  
  30.     TimeCacheMap,Map> _pending;  
  31.       
  32.     //传进的Fields是聚合后将被输出的字段  
  33.     public SimpleJoinBolt(Fields outFields){  
  34.         this._outFields=outFields;  
  35.     }  
  36.     public void execute(Tuple tuple) {  
  37.         // TODO Auto-generated method stub  
  38.         //从tuple中获取_idFields字段,如果不存在于等待被处理的队列_pending中,则加入一行  
  39.         List id=tuple.select(_idFields);  
  40.         GlobalStreamId streamId=new GlobalStreamId(tuple.getSourceComponent(),tuple.getSourceStreamId());  
  41.         //打印当前处理元组的来源Spout  
  42.         System.out.println("元组来源:"+tuple.getSourceComponent());  
  43.         //打印当前元组  
  44.         System.out.println("接收的元组:"+tuple.getFields().get(0)+" = "+tuple.getValues().get(0)+" , "+tuple.getFields().get(1)+" = "+tuple.getValues().get(1));  
  45.         //如果当前pending中还不存在join key为此id的元组,则将该条记录加入  
  46.         if(!_pending.containsKey(id)){  
  47.             _pending.put(id, new HashMap());  
  48.         }  
  49.         //从_pending队列中获取当前GlobalStreamId对应的HashMap对象  
  50.         Map parts=_pending.get(id);  
  51.         //如果streamId已经包含其中,则抛出异常,接收到同一个spout中的两条一样id的tuple,否则将该streamId加入parts中  
  52.         if(parts.containsKey(streamId)){  
  53.              throw new RuntimeException("Received same side of single join twice");  
  54.         }  
  55.         parts.put(streamId, tuple);  
  56.           
  57.         //如果parts中已经包含了聚合数据源的个数,则从_pending队列中移除这条记录  
  58.         if(parts.size()==_numSources){  
  59.             _pending.remove(id);  
  60.             List joinResult=new ArrayList();  
  61.             for(String outField:_outFields){  
  62.                 GlobalStreamId loc=_fieldLocations.get(outField);  
  63.                 joinResult.add(parts.get(loc).getValueByField(outField));  
  64.             }  
  65.             //输出聚合结果  
  66.             System.out.print("两条关系流中id值为"+id.get(0)+"的元组均已收到,聚合结果为:");  
  67.             for(Object obj:joinResult){  
  68.                 System.out.print(obj+" ");  
  69.             }  
  70.             System.out.println();  
  71.               
  72.             //多锚定  
  73.             _collector.emit(new ArrayList(parts.values()),joinResult);  
  74.             for (Tuple part : parts.values()) {  
  75.                 _collector.ack(part);  
  76.             }  
  77.         }else{  
  78.             System.out.println("只从一个关系流中收取到id值为"+id+"的元组,不可进行join操作");  
  79.         }  
  80.     }  
  81.   
  82.     public void prepare(Map conf, TopologyContext context, OutputCollector collector) {  
  83.         // TODO Auto-generated method stub  
  84.          _fieldLocations = new HashMap();  
  85.         this._collector=collector;  
  86.         //创建TimeCacheMap对象,设置超时回调接口,用于tuple处理失败时fail消息  
  87.         int timeout=((Number)conf.get(Config.TOPOLOGY_MESSAGE_TIMEOUT_SECS)).intValue();  
  88.         _pending=new TimeCacheMap,Map>(timeout,new ExpireCallback());  
  89.         //记录数据源的数据个数  
  90.         _numSources=context.getThisSources().size();  
  91.           
  92.         Set idFields=null;  
  93.         //遍历TopologyContext中不同的数据源:genderSpout和ageSpout  
  94.         System.out.println(context.getThisSources().keySet());  
  95.         for(GlobalStreamId source:context.getThisSources().keySet()){  
  96.             //得到公共的Fields字段id,保存到_idFields中  
  97.             Fields fields=context.getComponentOutputFields(source.get_componentId(),source.get_streamId());  
  98.             //fields:[id,gender],[id,age]  
  99.             Set setFields=new HashSet(fields.toList());  
  100.             if(idFields==null){  
  101.                 idFields=setFields;  
  102.             }else{  
  103.                 //求交集  
  104.                 idFields.retainAll(setFields);  
  105.                 System.out.println(idFields);  
  106.             }  
  107.             //同时将_outFields中字段所在数据源记录下来,保存到一张HashMap _fieldLocations中,以便聚合后获取对应的字段值  
  108.             for(String outfield:_outFields){  
  109.                 for(String sourcefield:fields){  
  110.                     if(outfield.equals(sourcefield)){  
  111.                         _fieldLocations.put(outfield, source);  
  112.                     }  
  113.                 }  
  114.             }  
  115.             //打印结果:gender=GlobalStreamId(componentId=gender-spout,streamId=default)  
  116.             //age=GlobalStreamId(componentId=age-spout,streamId=default)  
  117.             System.out.println(_fieldLocations);  
  118.               
  119.         }  
  120.         _idFields=new Fields(new ArrayList(idFields));  
  121.         if(_fieldLocations.size()!=_outFields.size()){  
  122.             throw new RuntimeException("Cannot find all outfields among sources");  
  123.         }  
  124.     }  
  125.   
  126.     public void declareOutputFields(OutputFieldsDeclarer declarer) {  
  127.         // TODO Auto-generated method stub  
  128.         declarer.declare(_outFields);  
  129.     }  
  130.       
  131.     private class ExpireCallback implements TimeCacheMap.ExpiredCallback, Map>{  
  132.   
  133.         public void expire(List key, Map tuples) {  
  134.             // TODO Auto-generated method stub  
  135.             for(Tuple tuple:tuples.values()){  
  136.                 _collector.fail(tuple);  
  137.             }  
  138.         }  
  139.           
  140.     }  
  141.   
  142. }  

SingleJoinExample.java(Topology)
    [java]  view plain  copy
     print ?
    1. package com.wxt.storm.SingleJoinExample;  
    2.   
    3. import com.wxt.storm.SingleJoinExample.bolt.SimpleJoinBolt;  
    4.   
    5. import backtype.storm.Config;  
    6. import backtype.storm.LocalCluster;  
    7. import backtype.storm.testing.FeederSpout;  
    8. import backtype.storm.topology.TopologyBuilder;  
    9. import backtype.storm.tuple.Fields;  
    10. import backtype.storm.tuple.Values;  
    11. import backtype.storm.utils.Utils;  
    12.   
    13. public class SingleJoinExample {  
    14.       public static void main(String[] args) {  
    15.         FeederSpout genderSpout = new FeederSpout(new Fields("id""gender"));  
    16.         FeederSpout ageSpout = new FeederSpout(new Fields("id""age"));  
    17.   
    18.         TopologyBuilder builder = new TopologyBuilder();  
    19.         builder.setSpout(CommonKeys.GENDER_SPOUT, genderSpout);  
    20.         builder.setSpout(CommonKeys.AGE_SPOUT, ageSpout);  
    21.         builder.setBolt(CommonKeys.JOIN_BOLT, new SimpleJoinBolt(new Fields("gender""age"))).fieldsGrouping(CommonKeys.GENDER_SPOUT, new Fields("id"))  
    22.             .fieldsGrouping(CommonKeys.AGE_SPOUT, new Fields("id"));  
    23.   
    24.         Config conf = new Config();  
    25.         conf.setDebug(true);  
    26.   
    27.         LocalCluster cluster = new LocalCluster();  
    28.         cluster.submitTopology(CommonKeys.JOIN_TOPOLOGY, conf, builder.createTopology());  
    29.   
    30.         for (int i = 0; i < 10; i++) {  
    31.           String gender;  
    32.           if (i % 2 == 0) {  
    33.             gender = "male";  
    34.           }  
    35.           else {  
    36.             gender = "female";  
    37.           }  
    38.           genderSpout.feed(new Values(i, gender));  
    39.         }  
    40.   
    41.         for (int i = 9; i >= 0; i--) {  
    42.           ageSpout.feed(new Values(i, i + 20));  
    43.         }  
    44.   
    45.         Utils.sleep(2000);  
    46.         cluster.shutdown();  
    47.       }  
    48.     }  

    你可能感兴趣的:(jstorm)