需求分析:
UV统计
方案分析:
1,传统的方式是把session_id放入Set实现自动去重,Set.size()获得UV,但是这种方式只能在单机上有效
2,可行的方案(类似WordCount的计算去重word总数):
bolt1通过fieldsGrouping进行多线程局部汇总,下一级bolt2进行单线程保存session_id和count数到Map并进行遍历,可以得到:
PV,UV,访问深度(每个session_id的浏览数)
按日期统计
2014-05-01 UV数
去重需求分析:
既然去重,必须持久化数据:
1,内存
数据结构map(中小企业)
2,no-sql分布式数据库,如hbase(大企业)
storm应用场景广泛
但能做的复杂度有限,通常都是汇总型的。
你如果想直接用storm做复杂的数据分析会很难;但你可以做数据分析之前的准备工作,即对源数据做预处理后写入库中
spout:
package base;

import backtype.storm.spout.SpoutOutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.IRichSpout;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Values;

import java.util.Map;
import java.util.Queue;
import java.util.Random;
import java.util.concurrent.ConcurrentLinkedQueue;

/**
 * Data-source spout: pre-generates 100 fake web-access log lines
 * ("host \t session_id \t time") in {@link #open} and emits them one at a
 * time (field name "log") from {@link #nextTuple}, pausing 200 ms between
 * emits to simulate a slow stream.
 */
public class SourceSpout implements IRichSpout {

    private static final long serialVersionUID = 1L;

    // Thread-safe queue holding the pre-generated log lines.
    // (Original text was garbled here; the element type is String.)
    Queue<String> queue = new ConcurrentLinkedQueue<String>();
    SpoutOutputCollector collector = null;
    String str = null;

    @Override
    public void open(Map map, TopologyContext topologyContext, SpoutOutputCollector spoutOutputCollector) {
        try {
            this.collector = spoutOutputCollector;
            Random random = new Random();
            String[] hosts = {"www.taobao.com"};
            String[] session_id = {
                    "ABYH6Y4V4SCVXTG6DPB4VH9U123",
                    "XXYH6YCGFJYERTT834R52FDXV9U34",
                    "BBYH61456FGHHJ7JL89RG5VV9UYU7",
                    "CYYH6Y2345GHI899OFG4V9U567",
                    "VVVYH6Y4V4SFXZ56JIPDPB4V678"
            };
            String[] time = {
                    "2014-01-07 08:40:50",
                    "2014-01-07 08:40:51",
                    "2014-01-07 08:40:52",
                    "2014-01-07 08:40:53",
                    "2014-01-07 09:40:49",
                    "2014-01-07 10:40:49",
                    "2014-01-07 11:40:49",
                    "2014-01-07 12:40:49"
            };
            // 100 random lines drawn from 5 sessions x 8 timestamps.
            for (int i = 0; i < 100; i++) {
                queue.add(hosts[0] + "\t" + session_id[random.nextInt(5)] + "\t" + time[random.nextInt(8)]);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    @Override
    public void close() {
    }

    @Override
    public void activate() {
    }

    @Override
    public void deactivate() {
    }

    @Override
    public void nextTuple() {
        // BUG FIX: the original condition was queue.size() >= 0, which is
        // always true, so once the queue drained this spout emitted
        // new Values(null) forever. Only emit while elements remain.
        if (queue.size() > 0) {
            collector.emit(new Values(queue.poll()));
            try {
                Thread.sleep(200);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }

    @Override
    public void ack(Object o) {
        System.out.println("spout ack:" + o.toString());
    }

    @Override
    public void fail(Object o) {
        System.out.println("spout fail:" + o.toString());
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
        outputFieldsDeclarer.declare(new Fields("log"));
    }

    @Override
    public Map<String, Object> getComponentConfiguration() {
        return null;
    }
}
package user_visit;

import backtype.storm.task.TopologyContext;
import backtype.storm.topology.BasicOutputCollector;
import backtype.storm.topology.IBasicBolt;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;
import tools.DataFmt;

import java.util.Map;

/**
 * Parses one raw log line ("host \t session_id \t time") into the pair
 * (date, session_id) and emits it downstream.
 * <p>
 * IBasicBolt is used instead of IRichBolt so that ack/fail do not have to be
 * called back explicitly.
 */
public class FmtLogBolt implements IBasicBolt {

    private static final long serialVersionUID = 1L;

    @Override
    public void prepare(Map map, TopologyContext topologyContext) {
    }

    String eachLog = null;

    @Override
    public void execute(Tuple tuple, BasicOutputCollector basicOutputCollector) {
        eachLog = tuple.getString(0);
        if (eachLog != null && eachLog.length() > 0) {
            // Split once instead of twice (the original split the same line
            // for each field). Fields: [0]=host, [1]=session_id, [2]=time.
            String[] fields = eachLog.split("\t");
            // Emit (short date, session_id).
            basicOutputCollector.emit(new Values(DataFmt.getCountDate(fields[2], DataFmt.date_short), fields[1]));
        }
    }

    @Override
    public void cleanup() {
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
        // These field names are how the next bolt looks the values up.
        outputFieldsDeclarer.declare(new Fields("date", "session_id"));
    }

    @Override
    public Map<String, Object> getComponentConfiguration() {
        return null;
    }
}
package user_visit;

import backtype.storm.task.TopologyContext;
import backtype.storm.topology.BasicOutputCollector;
import backtype.storm.topology.IBasicBolt;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;

import java.util.HashMap;
import java.util.Map;

/**
 * Partial (per-task) aggregation bolt: counts the PV of each session_id.
 * Intended to run with multiple tasks behind a fieldsGrouping on
 * (date, session_id), each task keeping a local count, and to forward
 * ("date_session_id", count) to a single downstream summary bolt.
 */
public class DeepVisitBolt implements IBasicBolt {

    private static final long serialVersionUID = 1L;

    @Override
    public void prepare(Map map, TopologyContext topologyContext) {
    }

    // Local partial counts, keyed by "date_session_id".
    // Deduplication works because the value to dedup is the map key.
    // (Original text was garbled here; the key type is String.)
    Map<String, Integer> counts = new HashMap<String, Integer>();

    @Override
    public void execute(Tuple tuple, BasicOutputCollector basicOutputCollector) {
        String dateString = tuple.getStringByField("date");
        String session_id = tuple.getStringByField("session_id");
        Integer count = counts.get(dateString + "_" + session_id);
        if (count == null) {
            count = 0;
        }
        count++;
        counts.put(dateString + "_" + session_id, count);
        // Forward the running partial count for the final roll-up downstream.
        basicOutputCollector.emit(new Values(dateString + "_" + session_id, count));
    }

    @Override
    public void cleanup() {
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
        outputFieldsDeclarer.declare(new Fields("date_session_id", "count"));
    }

    @Override
    public Map<String, Object> getComponentConfiguration() {
        return null;
    }
}
package user_visit;

import backtype.storm.task.TopologyContext;
import backtype.storm.topology.BasicOutputCollector;
import backtype.storm.topology.FailedException;
import backtype.storm.topology.IBasicBolt;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.tuple.Tuple;
import tools.DataFmt;

import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

/**
 * Single-task final summary bolt: keeps the latest count for every
 * "date_session_id" key and periodically prints
 * PV (sum of counts) and UV (number of distinct keys) for the current day.
 * Emits nothing downstream.
 */
public class UVSumBolt implements IBasicBolt {

    private static final long serialVersionUID = 1L;

    String cur_date = null;
    long beginTime = System.currentTimeMillis();
    long endTime = 0;
    // Latest count per "date_session_id" key.
    // (Original text was garbled here; the key type is String.)
    Map<String, Integer> counts = new HashMap<String, Integer>();

    @Override
    public void prepare(Map map, TopologyContext topologyContext) {
        cur_date = DataFmt.getCountDate("2014-01-07", DataFmt.date_short);
    }

    @Override
    public void execute(Tuple tuple, BasicOutputCollector basicOutputCollector) {
        try {
            endTime = System.currentTimeMillis();
            long PV = 0; // total page views
            long UV = 0; // distinct sessions (deduplicated)
            String dateSession_id = tuple.getString(0);
            Integer countInteger = tuple.getInteger(1);
            // Roll over to a new day: when the tuple's date differs from the
            // current date AND is strictly later, reset the per-day state.
            if (!dateSession_id.startsWith(cur_date)
                    && DataFmt.parseDate(dateSession_id.split("_")[0]).after(DataFmt.parseDate(cur_date))) {
                cur_date = dateSession_id.split("_")[0];
                counts.clear();
            }
            counts.put(dateSession_id, countInteger);
            // NOTE(review): beginTime is never reset, so after the first
            // 2 seconds this prints on every tuple — confirm intended.
            if (endTime - beginTime >= 2000) {
                // UV = number of distinct keys for today; PV = sum of counts.
                // Typed the iterator: the original raw Iterator would not
                // compile against `String key = i2.next()`.
                Iterator<String> i2 = counts.keySet().iterator();
                while (i2.hasNext()) {
                    String key = i2.next();
                    if (key != null) {
                        if (key.startsWith(cur_date)) {
                            UV++;
                            PV += counts.get(key);
                        }
                    }
                }
                System.out.println("PV=" + PV + "; UV=" + UV);
            }
        } catch (Exception e) {
            // Preserve the cause instead of discarding it.
            throw new FailedException("SumBolt fail!", e);
        }
    }

    @Override
    public void cleanup() {
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
    }

    @Override
    public Map<String, Object> getComponentConfiguration() {
        return null;
    }
}