storm计算网站UV(去重计算模式)

需求分析:

UV统计

方案分析:

1,传统的方式是把session_id放入Set实现自动去重,Set.size()获得UV,但是这种方式只能在单机上有效

2,可行的方案(类似WordCount的计算去重word总数):

bolt1通过fieldsGrouping按session_id分组,进行多线程局部汇总;下一级bolt2以单线程把session_id和count数保存到Map并进行遍历,可以得到:

PV,UV,访问深度(每个session_id的浏览数)


按日期统计

2014-05-01 UV数


去重需求分析:

既然去重,必须持久化数据:

1,内存

数据结构map(中小企业)

2,no-sql分布式数据库,如hbase(大企业)


storm应用场景广泛

但能做的复杂度有限,通常都是汇总型的。

如果想直接用storm做数据分析,会很难;但可以用它完成数据分析之前的工作,也就是源数据预处理和写库。


spout:

package base;

import backtype.storm.spout.SpoutOutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.IRichSpout;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Values;

import java.util.Map;
import java.util.Queue;
import java.util.Random;
import java.util.concurrent.ConcurrentLinkedQueue;

/**
 * Created by Administrator on 2016/10/7.
 */
public class SourceSpout implements IRichSpout{

    /*
    数据源Spout
     */
    private static final long serialVersionUID = 1L;
    Queue queue = new ConcurrentLinkedQueue();
    SpoutOutputCollector collector = null;
    String str = null;

    @Override
    public void open(Map map, TopologyContext topologyContext, SpoutOutputCollector spoutOutputCollector) {
        try{
            this.collector = spoutOutputCollector;
            Random random = new Random();
            String[] hosts = {"www.taobao.com"};
            String[] session_id = { "ABYH6Y4V4SCVXTG6DPB4VH9U123", "XXYH6YCGFJYERTT834R52FDXV9U34", "BBYH61456FGHHJ7JL89RG5VV9UYU7",
                    "CYYH6Y2345GHI899OFG4V9U567", "VVVYH6Y4V4SFXZ56JIPDPB4V678" };
            String[] time = { "2014-01-07 08:40:50", "2014-01-07 08:40:51", "2014-01-07 08:40:52", "2014-01-07 08:40:53",
                    "2014-01-07 09:40:49", "2014-01-07 10:40:49", "2014-01-07 11:40:49", "2014-01-07 12:40:49" };
            for (int i = 0;i < 100; i++){
                queue.add(hosts[0]+"\t"+session_id[random.nextInt(5)]+"\t"+time[random.nextInt(8)]);

            }
        }catch (Exception e){
            e.printStackTrace();
        }
    }

    @Override
    public void close() {

    }

    @Override
    public void activate() {

    }

    @Override
    public void deactivate() {

    }

    @Override
    public void nextTuple() {
        if(queue.size() >= 0){
            collector.emit(new Values(queue.poll()));
            try {
                Thread.sleep(200);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }

    @Override
    public void ack(Object o) {
        System.out.println("spout ack:"+o.toString());

    }

    @Override
    public void fail(Object o) {
        System.out.println("spout fail:"+o.toString());

    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
        outputFieldsDeclarer.declare(new Fields("log"));
    }

    @Override
    public Map, Object> getComponentConfiguration() {
        return null;
    }
}
格式化:

package user_visit;

import backtype.storm.task.TopologyContext;
import backtype.storm.topology.BasicOutputCollector;
import backtype.storm.topology.IBasicBolt;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;
import tools.DataFmt;

import java.util.Map;

/**
 * Created by Administrator on 2016/10/8.
 */
public class FmtLogBolt implements IBasicBolt{
    /*
    这个相比于irich的好处就是不用显性的去回调它的ask和fail方法

     */
    private static final long serialVersionUID = 1L;

    @Override
    public void prepare(Map map, TopologyContext topologyContext) {

    }

    String eachLog = null;
    @Override
    public void execute(Tuple tuple, BasicOutputCollector basicOutputCollector) {
        eachLog = tuple.getString(0);
        if(eachLog != null && eachLog.length() > 0){
            //日期,session_id
            basicOutputCollector.emit(new Values(DataFmt.getCountDate(eachLog.split("\t")[2],DataFmt.date_short),eachLog.split("\t")[1]));

        }
    }

    @Override
    public void cleanup() {

    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
        //这两个名称的作用就是在下一级通过这两个名称获取
        outputFieldsDeclarer.declare(new Fields("date","session_id"));
    }

    @Override
    public Map, Object> getComponentConfiguration() {
        return null;
    }
}
局部汇总:
package user_visit;

import backtype.storm.task.TopologyContext;
import backtype.storm.topology.BasicOutputCollector;
import backtype.storm.topology.IBasicBolt;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;

import java.util.HashMap;
import java.util.Map;

/**
 * Created by Administrator on 2016/10/8.
 * 统计每个session_id的pv
 */
public class DeepVisitBolt implements IBasicBolt{
    /*

     */
    private static final long serialVersionUID = 1L;

    @Override
    public void prepare(Map map, TopologyContext topologyContext) {

    }
    //map存局部汇总的值
    Map, Integer> counts = new HashMap, Integer>();
    @Override
    public void execute(Tuple tuple, BasicOutputCollector basicOutputCollector) {
        String dateString = tuple.getStringByField("date");
        String session_id = tuple.getStringByField("session_id");
        /*
        我们要去重,就需要把我们要去重的东西放到map的key里面
         */
        Integer count = counts.get(dateString+"_"+session_id);
        if(count == null){
            count = 0;
        }
        count++;
        counts.put(dateString+"_"+session_id, count);
        //这是我们的局部汇总,我们需要把它发到我们的下一级做一个总的汇总
        basicOutputCollector.emit(new Values(dateString+"_"+session_id, count));




    }

    @Override
    public void cleanup() {

    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
        outputFieldsDeclarer.declare(new Fields("date_session_id", "count"));

    }

    @Override
    public Map, Object> getComponentConfiguration() {
        return null;
    }
}
统计:

package user_visit;

import backtype.storm.task.TopologyContext;
import backtype.storm.topology.BasicOutputCollector;
import backtype.storm.topology.FailedException;
import backtype.storm.topology.IBasicBolt;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.tuple.Tuple;
import tools.DataFmt;

import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

/**
 * Created by Administrator on 2016/10/6.
 */
public class UVSumBolt implements IBasicBolt{



    private static final long serialVersionUID = 1L;
    String cur_date = null;
    long beginTime = System.currentTimeMillis();
    long endTime = 0;

    Map, Integer> counts = new HashMap, Integer>();
    @Override
    public void prepare(Map map, TopologyContext topologyContext) {
        cur_date = DataFmt.getCountDate("2014-01-07", DataFmt.date_short);
    }


    @Override
    public void execute(Tuple tuple, BasicOutputCollector basicOutputCollector) {

        try{
            endTime = System.currentTimeMillis();
            long PV = 0;//总数
            long UV = 0;//个数,去重后
            String dateSession_id = tuple.getString(0);
            Integer countInteger = tuple.getInteger(1);

            //判断数据是不是当天的,而且比当前日期还要打
            if(!dateSession_id.startsWith(cur_date) && DataFmt.parseDate(dateSession_id.split("_")[0]).after(DataFmt.parseDate(cur_date))){
                cur_date = dateSession_id.split("_")[0];
                counts.clear();
            }


            counts.put(dateSession_id, countInteger);


            if(endTime - beginTime >= 2000){

                //获取word去重个数,遍历counts的keyset,取count
                Iterator i2 = counts.keySet().iterator();
                while(i2.hasNext()){
                    String key = i2.next();
                    if(key != null){
                        if(key.startsWith(cur_date)){
                            UV ++;
                            PV += counts.get(key);
                        }
                    }
                }
                System.out.println("PV=" + PV + "; UV=" + UV);


            }





        }catch (Exception e){
            throw new FailedException("SumBolt fail!");
        }




    }

    @Override
    public void cleanup() {

    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {

    }

    @Override
    public Map, Object> getComponentConfiguration() {
        return null;
    }
}





你可能感兴趣的:(storm)