Storm实战之TopN

TopN这种统计场景很常见,例如,统计出搜索热度最高的词,点击率最高的广告等,现在有了Hadoop、Storm这些工具之后,很方便地就能得到结果。
这里以Storm为例,简易地实现了TopN单词的统计,由于刚刚入门,代码写得比较简单。
首先,在多台机器上运行多个bolt,每个bolt负责计算一部分word的TopN,最后有一个全局的bolt,合并上一步的结果,最后得出全局的TopN。

package test.storm.topology;

import test.storm.bolt.WordCounter;
import test.storm.bolt.WordWriter;
import test.storm.spout.WordReader;
import backtype.storm.Config;
import backtype.storm.StormSubmitter;
import backtype.storm.generated.AlreadyAliveException;
import backtype.storm.generated.InvalidTopologyException;
import backtype.storm.topology.TopologyBuilder;
import backtype.storm.tuple.Fields;

public class WordTopN {
    /**
     * Entry point: wires up the TopN topology and submits it to the cluster.
     *
     * <p>Usage: {@code WordTopN N} where N is how many top words to report.
     * N is stored in the topology conf under key "N"; WordCounter reads it
     * back via {@code stormConf.get("N")}.
     */
    public static void main(String[] args) throws AlreadyAliveException, InvalidTopologyException {
        if (args == null || args.length < 1) {
            System.err.println("Usage: N");
            System.err.println("such as : 10");
            System.exit(-1);
        }

        // Fail fast on a malformed N here, instead of blowing up later
        // inside WordCounter's background thread where the error is hidden.
        int n = 0;
        try {
            n = Integer.parseInt(args[0]);
        } catch (NumberFormatException e) {
            System.err.println("N must be an integer, got: " + args[0]);
            System.exit(-1);
        }
        if (n <= 0) {
            System.err.println("N must be positive, got: " + n);
            System.exit(-1);
        }

        TopologyBuilder builder = new TopologyBuilder();
        builder.setSpout("wordreader", new WordReader(), 2);
        // fieldsGrouping on "word" guarantees that the same word is always
        // routed to the same WordCounter instance, so per-word counts add up.
        builder.setBolt("wordcounter", new WordCounter(), 2).fieldsGrouping("wordreader", new Fields("word"));
        // globalGrouping funnels every partial result into the single writer.
        builder.setBolt("wordwriter", new WordWriter()).globalGrouping("wordcounter");

        Config conf = new Config();
        conf.put("N", args[0]);

        conf.setDebug(false);
        StormSubmitter.submitTopology("topN", conf, builder.createTopology());

    }
}

这里需要注意的几点是,第一个bolt的分组策略是fieldsGrouping,按照字段分组,这一点很重要,它能保证相同的word被分发到同一个bolt上,
像做wordcount、TopN之类的应用就要使用这种分组策略。
最后一个bolt的分组策略是globalGrouping,全局分组,tuple会被分配到一个bolt用来汇总。
为了提高并行度,spout和第一个bolt均设置并行度为2(我这里测试机器性能不是很高)。

package test.storm.spout;

import java.util.Map;
import java.util.Random;
import java.util.concurrent.atomic.AtomicInteger;

import backtype.storm.spout.SpoutOutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichSpout;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Values;

public class WordReader extends BaseRichSpout {
    private static final long serialVersionUID = 2197521792014017918L;
    private SpoutOutputCollector collector;
    private static AtomicInteger i = new AtomicInteger();
    private static String[] words = new String[] { \"a\", \"b\", \"c\", \"d\", \"e\", \"f\", \"g\", \"h\", \"i\", \"j\", \"k\", \"l\", \"m\",
            \"n\", \"o\", \"p\", \"q\", \"r\", \"s\", \"t\", \"u\", \"v\", \"w\", \"x\", \"y\", \"z\" };

    @Override
    public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
        this.collector = collector;
    }

    @Override
    public void nextTuple() {
        if (i.intValue() < 100) {
            Random rand = new Random();
            String word = words[rand.nextInt(words.length)];
            collector.emit(new Values(word));
            i.incrementAndGet();
        }
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("word"));
    }
}

spout的作用是随机发送word,发送100次,由于并行度是2,将产生2个spout实例,所以这里的计数器使用了static的AtomicInteger来保证线程安全。

package test.storm.bolt;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.ConcurrentHashMap;

import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.IRichBolt;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;

public class WordCounter implements IRichBolt {
    private static final long serialVersionUID = 5683648523524179434L;
    private static Map counters = new ConcurrentHashMap();
    private volatile boolean edit = true;

    @Override
    public void prepare(final Map stormConf, TopologyContext context, final OutputCollector collector) {
        new Thread(new Runnable() {
            @Override
            public void run() {
                while (true) {
                    //5秒后counter不再变化,可以认为spout已经发送完毕
                    if (!edit) {
                        if (counters.size() > 0) {
                            List> list = new ArrayList>();
                            list.addAll(counters.entrySet());
                            Collections.sort(list, new ValueComparator());

                            //向下一个bolt发送前N个word
                            for (int i = 0; i < list.size(); i++) {
                                if (i < Integer.parseInt(stormConf.get("N").toString())) {
                                    collector.emit(new Values(list.get(i).getKey() + ":" + list.get(i).getValue()));
                                }
                            }
                        }

                        //发送之后,清空counters,以防spout再次发送word过来
                        counters.clear();
                    }

                    edit = false;
                    try {
                        Thread.sleep(5000);
                    } catch (InterruptedException e) {
                        e.printStackTrace();
                    }
                }
            }
        }).start();
    }

    @Override
    public void execute(Tuple tuple) {
        String str = tuple.getString(0);
        if (counters.containsKey(str)) {
            Integer c = counters.get(str) + 1;
            counters.put(str, c);
        } else {
            counters.put(str, 1);
        }

        edit = true;
    }

    private static class ValueComparator implements Comparator> {
        @Override
        public int compare(Entry entry1, Entry entry2) {
            return entry2.getValue() - entry1.getValue();
        }
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("word_count"));
    }

    @Override
    public void cleanup() {
    }

    @Override
    public Map getComponentConfiguration() {
        return null;
    }
}

在WordCounter里面有个线程安全的容器ConcurrentHashMap,来存储word以及对应的次数。在prepare方法里启动一个线程,长期监听edit的状态,监听间隔是5秒,
当edit为false,即execute方法不再执行、容器不再变化,可以认为spout已经发送完毕了,可以开始排序取TopN了。这里使用了一个volatile edit(回忆一下volatile的使用场景:
对变量的修改不依赖变量当前的值,这里设置true or false,显然不相互依赖)。

package test.storm.bolt;

import java.io.FileWriter;
import java.io.IOException;
import java.util.Map;

import backtype.storm.task.TopologyContext;
import backtype.storm.topology.BasicOutputCollector;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseBasicBolt;
import backtype.storm.tuple.Tuple;

/**
 * Terminal bolt: appends each incoming "word:count" string as one line to a
 * per-instance file under /data/tianzhen/output/.
 */
public class WordWriter extends BaseBasicBolt {
    private static final long serialVersionUID = -6586283337287975719L;
    private FileWriter writer = null;

    public WordWriter() {
    }

    @Override
    public void prepare(Map stormConf, TopologyContext context) {
        try {
            // One file per bolt instance; "this" yields a unique-ish name
            // (class name + identity hash code).
            writer = new FileWriter("/data/tianzhen/output/" + this);
        } catch (IOException e) {
            // FIX: the original printed the stack trace and limped on with a
            // null writer, so every execute() would NPE. Fail the task so
            // Storm restarts it instead.
            throw new RuntimeException("cannot open output file", e);
        }
    }

    @Override
    public void execute(Tuple input, BasicOutputCollector collector) {
        String s = input.getString(0);
        try {
            writer.write(s);
            writer.write("\n");
            // Flush per tuple so results are visible immediately; the
            // writer stays open because execute runs for the bolt's lifetime.
            writer.flush();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    @Override
    public void cleanup() {
        // FIX: the original leaked the FileWriter; close it when Storm
        // shuts the bolt down.
        if (writer != null) {
            try {
                writer.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {

    }
}

最后一个bolt做全局的汇总,这里我偷了懒,直接将结果写到文件了,省略截取TopN的过程,因为我这里就一个supervisor节点,所以结果是正确的。

你可能感兴趣的:(Storm实战之TopN)