wordcount使用滑动时间窗口,每10秒计算过去30秒的单词个数,并在该时间窗口内选出TOP5,存入HBase中(排名作为Rowkey,word与count作为Column,这是多数人认为的难点)。
自定义Function切割字符串
// Custom Trident function: splits an incoming sentence into individual words,
// emitting one single-field tuple ("word") per word.
public static class Split extends BaseFunction {
    @Override
    public void execute(TridentTuple tuple, TridentCollector collector) {
        // Field 0 is the "sentence" field emitted by the spout.
        String sentence = tuple.getString(0);
        // Split on runs of whitespace: the original split(" ") emitted empty
        // "" tokens for consecutive or leading spaces, which would then be
        // counted as words by the aggregator.
        for (String word : sentence.split("\\s+")) {
            if (!word.isEmpty()) { // leading whitespace still yields one "" token
                collector.emit(new Values(word));
            }
        }
    }
}
聚合操作,使用HashMap统计单词数量,实现Comparator根据value大小选出topN
// Trident aggregator: counts word occurrences within the current window,
// then emits the TOP_N entries as (rank, word, count) tuples. All three
// fields are emitted as Strings because "rank" becomes the HBase row key
// and "word"/"count" become HBase column values downstream.
public static class Aggregate extends BaseAggregator<Map<String, Integer>> {
    // Number of top-ranked words emitted when the window completes.
    private static final int TOP_N = 5;

    @Override
    public Map<String, Integer> init(Object batchId, TridentCollector collector) {
        // Fresh count map per window/batch.
        return new HashMap<String, Integer>();
    }

    @Override
    public void aggregate(Map<String, Integer> counts, TridentTuple tuple, TridentCollector collector) {
        // merge() replaces the containsKey/get/put dance with a single call.
        counts.merge(tuple.getStringByField("word"), 1, Integer::sum);
    }

    @Override
    public void complete(Map<String, Integer> counts, TridentCollector collector) {
        List<Map.Entry<String, Integer>> entries =
                new ArrayList<Map.Entry<String, Integer>>(counts.entrySet());
        // Sort descending by count. Integer.compare avoids the overflow bug of
        // the subtraction idiom (o2.getValue() - o1.getValue()).
        Collections.sort(entries, new Comparator<Map.Entry<String, Integer>>() {
            @Override
            public int compare(Map.Entry<String, Integer> a, Map.Entry<String, Integer> b) {
                return Integer.compare(b.getValue(), a.getValue());
            }
        });
        for (int i = 0; i < TOP_N && i < entries.size(); i++) {
            Map.Entry<String, Integer> entry = entries.get(i);
            // Rank is 1-based; it doubles as the HBase row key.
            collector.emit(new Values(String.valueOf(i + 1),
                    entry.getKey(), String.valueOf(entry.getValue())));
        }
    }
}
构建topology
// Builds the topology: fixed spout -> word split -> sliding-window top-5
// aggregation -> persist (rank, word, count) into the HBase table "window".
public static StormTopology buildTopology() {
    // Cycling fixed-batch spout: replays the same sentences forever,
    // three tuples per batch.
    FixedBatchSpout spout = new FixedBatchSpout(new Fields("sentence"), 3,
            new Values("the cow jumped over the moon"),
            new Values("the man went to the store and bought some candy"),
            new Values("four score and seven years ago"),
            new Values("how many apples can you eat"),
            new Values("to or not to be the person"));
    spout.setCycle(true);

    // Map Trident tuple fields onto HBase: "rank" is the row key,
    // "word" and "count" become columns in the "cf" column family.
    TridentHBaseMapper mapper = new SimpleTridentHBaseMapper()
            .withColumnFamily("cf")
            .withColumnFields(new Fields("word", "count"))
            .withRowKeyField("rank");

    // HBaseState options: target table "window"; the HBase configuration map
    // is looked up under the "hbase.conf" key of the topology Config (set in
    // main()). SYNC_WAL makes each write durable before the batch is acked.
    // NOTE(review): add .withProjectionCriteria(...) here if reading back
    // from HBase is ever needed; writes alone do not require it.
    HBaseState.Options options = new HBaseState.Options()
            .withConfigKey("hbase.conf")
            .withMapper(mapper)
            .withDurability(Durability.SYNC_WAL)
            .withTableName("window");
    StateFactory stateFactory = new HBaseStateFactory(options);

    // Sliding time window: every 10 seconds, process the last 30 seconds.
    WindowConfig slidingWindow = SlidingDurationWindow.of(
            new BaseWindowedBolt.Duration(30, TimeUnit.SECONDS),
            new BaseWindowedBolt.Duration(10, TimeUnit.SECONDS));

    // Wire the stream: split sentences into words, aggregate per window,
    // then persist each (rank, word, count) tuple via HBaseUpdater.
    TridentTopology topology = new TridentTopology();
    topology.newStream("spout", spout)
            .each(new Fields("sentence"), new Split(), new Fields("word"))
            .window(slidingWindow, new Fields("word"), new Aggregate(),
                    new Fields("rank", "word", "count"))
            .partitionPersist(stateFactory, new Fields("rank", "word", "count"),
                    new HBaseUpdater(), new Fields());
    return topology.build();
}
本地运行或提交集群运行
// Entry point: runs in a LocalCluster when no args are given, otherwise
// submits to the real cluster under the topology name passed as args[0].
public static void main(String[] args) throws Exception {
    Config conf = new Config();
    conf.setDebug(true);
    // HBaseState reads its HBase configuration from this key. An empty map
    // presumably defers to the hbase-site.xml on the classpath — confirm.
    // (Parameterized map replaces the original raw-type `new HashMap()`.)
    conf.put("hbase.conf", new HashMap<String, Object>());
    if (args.length == 0) {
        LocalCluster cluster = new LocalCluster();
        cluster.submitTopology("topN", conf, buildTopology());
    } else {
        conf.setNumWorkers(3);
        StormSubmitter.submitTopologyWithProgressBar(args[0], conf, buildTopology());
    }
}
对比时间戳可以确认确实是每10秒刷新一遍数据
使用小数据集测试过,数量统计正确