storm代码处理单词计数

我们首先画一个单词计数流程图

storm代码处理单词计数_第1张图片

需要的jar包在storm解压目录下的lib目录里;如果要集成redis、hdfs等其他保存数据的组件,还需要用到 /root/training/apache-storm-1.0.3/external 目录里的jar包

 

代码:

spout组件


package com.test.demo;

import java.util.Map;
import java.util.Random;

import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;
import org.apache.storm.utils.Utils;

/**
 * Spout that simulates a data source: every two seconds it picks one of a few
 * fixed sentences at random and emits it as a single-field tuple ("content").
 */
public class WordCountSpout extends BaseRichSpout
{

	private static final long serialVersionUID = 1L;

	// Collector supplied in open(); used to emit tuples to the next component.
	private SpoutOutputCollector collector;

	// One random source for the lifetime of the spout instead of a new one per tuple.
	private final Random random = new Random();

	// Simulated collected data.
	private String[] datas={"I love Beijing","I love China","Beijing of the captial of China"};

	/**
	 * Emits one randomly chosen sample sentence and prints it to stdout.
	 */
	@Override
	public void nextTuple() {
		// Collect data once every two seconds.
		Utils.sleep(2000);

		// Bound the index by the array length so the sample data can grow or shrink safely.
		String line = datas[random.nextInt(datas.length)];

		this.collector.emit(new Values(line));

		System.out.println("采集到的数据是:"+line);
	}

	/**
	 * Stores the output collector used by {@link #nextTuple()}.
	 *
	 * @param arg0 configuration map (unused here)
	 * @param arg1 topology context (unused here)
	 * @param collector collector used to emit this spout's tuples
	 */
	@Override
	public void open(Map arg0, TopologyContext arg1, SpoutOutputCollector collector) {
		this.collector=collector;
	}

	/**
	 * Declares the single output field, "content", carried by each emitted tuple.
	 *
	 * @param declarer declarer used to register the output fields
	 */
	@Override
	public void declareOutputFields(OutputFieldsDeclarer declarer) {
		declarer.declare(new Fields("content"));
	}

}

bolt组件,负责拆分单词的

package com.test.demo;

import java.util.Map;

import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;

/**
 * Bolt that splits each incoming "content" line into words and emits one
 * ("word", "count") tuple per word, with a count of 1.
 */
public class WordCountSplitBolt extends BaseRichBolt
{

	private static final long serialVersionUID = 1L;

	// Collector supplied in prepare(); used to emit tuples to the next component.
	private OutputCollector collector;

	/**
	 * Splits the "content" field on whitespace and emits (word, 1) for every non-empty word.
	 *
	 * @param tuple incoming tuple; its "content" field is read as a string
	 */
	@Override
	public void execute(Tuple tuple) {
		String line = tuple.getStringByField("content");
		if (line == null)
		{
			// Nothing to split.
			return;
		}
		// Split on runs of whitespace so repeated or leading spaces do not produce empty words.
		for (String word : line.trim().split("\\s+"))
		{
			if (word.isEmpty())
			{
				// A blank line still yields one empty token from split(); skip it.
				continue;
			}
			collector.emit(new Values(word,1));
		}
	}

	/**
	 * Stores the output collector used by {@link #execute(Tuple)}.
	 *
	 * @param arg0 configuration map (unused here)
	 * @param arg1 topology context (unused here)
	 * @param collector collector used to emit this bolt's tuples
	 */
	@Override
	public void prepare(Map arg0, TopologyContext arg1, OutputCollector collector) {
		this.collector=collector;
	}

	/**
	 * Declares the output fields "word" and "count".
	 *
	 * @param declarer declarer used to register the output fields
	 */
	@Override
	public void declareOutputFields(OutputFieldsDeclarer declarer) {
		declarer.declare(new Fields("word","count"));
	}

}

bolt组件,负责计数的

package com.test.demo;

import java.util.HashMap;
import java.util.Map;

import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;

/**
 * Bolt that keeps a running total per word in an in-memory map and prints the
 * updated total for each incoming tuple. This version declares output fields
 * but does not emit any tuple itself.
 */
public class WordCountTotalBolt extends BaseRichBolt
{

	private static final long serialVersionUID = 1L;

	// Collector supplied in prepare(); stored but not used by execute() in this version.
	private OutputCollector collector;

	// Running total per word. Typed so lookups yield Integer without casts.
	private Map<String, Integer> result=new HashMap<String, Integer>();

	/**
	 * Adds the tuple's "count" to the running total of its "word" and prints the new total.
	 *
	 * @param tuple incoming tuple with a string "word" field and an integer "count" field
	 */
	@Override
	public void execute(Tuple tuple) {
		String word = tuple.getStringByField("word");
		int increment = tuple.getIntegerByField("count");

		// First occurrence of a word starts from the incoming count.
		Integer previous = result.get(word);
		int total = (previous == null) ? increment : previous + increment;
		result.put(word, total);

		System.out.println("统计结果:"+word+"---"+total);
	}

	/**
	 * Stores the output collector.
	 *
	 * @param arg0 configuration map (unused here)
	 * @param arg1 topology context (unused here)
	 * @param collector collector for this bolt's output
	 */
	@Override
	public void prepare(Map arg0, TopologyContext arg1, OutputCollector collector) {
		this.collector=collector;
	}

	/**
	 * Declares the output fields "word" and "total".
	 *
	 * @param declarer declarer used to register the output fields
	 */
	@Override
	public void declareOutputFields(OutputFieldsDeclarer declarer) {
		declarer.declare(new Fields("word","total"));
	}

}

最后编写任务执行

package com.test.demo;

/*import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.generated.StormTopology;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.tuple.Fields;*/

import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.StormSubmitter;
import org.apache.storm.generated.AlreadyAliveException;
import org.apache.storm.generated.AuthorizationException;
import org.apache.storm.generated.InvalidTopologyException;
import org.apache.storm.generated.StormTopology;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.tuple.Fields;
public class WordCountTopology
{
	public static void main(String[] args) {
		//创建一个topology
		TopologyBuilder builder=new TopologyBuilder();
		//指定任务spout组件
		builder.setSpout("myspout", new WordCountSpout());
		//指定拆分bolt组件,并指定分组方式与上一级组件进行串联
		builder.setBolt("mysplit", new WordCountSplitBolt()).shuffleGrouping("myspout");
		//指定计数的bolt组件
		builder.setBolt("mycount", new WordCountTotalBolt()).fieldsGrouping("mysplit", new Fields("word"));
		
		//创建任务
		StormTopology topology = builder.createTopology();
				
		//配置参数 ,可以为spout或者是bolt传递参数,open和prepare就是和这里对应
		Config conf = new Config();
		//有两种运行模式
		//1、本地模式
//		LocalCluster cluster = new LocalCluster();
//		cluster.submitTopology("MyWC", conf, topology);
		//2、集群模式,直接提交到storm的集群中处理
		try
		{ 
			//args[0]是topology的别名我们通过提交时路径指定,conf是配置文件,topology是任务
			StormSubmitter.submitTopology(args[0], conf, topology);
		} catch (Exception e)
		{
			// TODO Auto-generated catch block
			e.printStackTrace();
		}
	}
}

有两种执行方式

一种是本地模式,就是编辑环境中即可执行,一般用于测试,相当于在编辑环境集成了storm

一种是集群模式需要把程序打包并把任务提交到storm的集群上执行

集群模式导出的时候需要选择主类入口

执行命令

storm jar stormdemo1.jar com.test.demo.WordCountTopology MyWC

storm代码处理单词计数_第2张图片

集成redis的bolt用于保存storm中处理的结果

需要额外导入的jar包有external中的storm-redis-1.0.3.jar,还有redis中的jar包jedis-2.7.0.jar和commons-pool2-2.3.jar

如果不导入jedis和commons-pool2这两个jar包,会报如下错误

Exception in thread "main" java.lang.NoClassDefFoundError: redis/clients/jedis/JedisCommands
	at java.lang.Class.getDeclaredMethods0(Native Method)
	at java.lang.Class.privateGetDeclaredMethods(Class.java:2701)
	at java.lang.Class.getDeclaredMethod(Class.java:2128)
	at java.io.ObjectStreamClass.getPrivateMethod(ObjectStreamClass.java:1475)
	at java.io.ObjectStreamClass.access$1700(ObjectStreamClass.java:72)
	at java.io.ObjectStreamClass$2.run(ObjectStreamClass.java:498)
	at java.io.ObjectStreamClass$2.run(ObjectStreamClass.java:472)
	at java.security.AccessController.doPrivileged(Native Method)
	at java.io.ObjectStreamClass.<init>(ObjectStreamClass.java:472)
	at java.io.ObjectStreamClass.lookup(ObjectStreamClass.java:369)
	at java.io.ObjectStreamClass.<init>(ObjectStreamClass.java:468)
	at java.io.ObjectStreamClass.lookup(ObjectStreamClass.java:369)
	at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1134)
	at java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:348)
	at org.apache.storm.utils.Utils.javaSerialize(Utils.java:232)
	at org.apache.storm.topology.TopologyBuilder.createTopology(TopologyBuilder.java:123)
	at com.test.demo.WordCountTopology.main(WordCountTopology.java:30)
Caused by: java.lang.ClassNotFoundException: redis.clients.jedis.JedisCommands
	at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
	at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:331)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
	... 17 more

集成redis的bolt主要在topology中写就可以了

package com.test.demo;


import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.generated.StormTopology;
import org.apache.storm.redis.bolt.RedisStoreBolt;
import org.apache.storm.redis.common.config.JedisPoolConfig;
import org.apache.storm.redis.common.mapper.RedisDataTypeDescription;
import org.apache.storm.redis.common.mapper.RedisDataTypeDescription.RedisDataType;
import org.apache.storm.redis.common.mapper.RedisStoreMapper;
import org.apache.storm.topology.IRichBolt;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.ITuple;
/**
 * Entry point that wires the spout, the two word-count bolts and a Redis store
 * bolt into a topology and runs it on an in-process local cluster.
 */
public class WordCountTopology
{
	/**
	 * Builds the topology and submits it to a {@link LocalCluster} under the name "MyWC".
	 *
	 * @param args command-line arguments (not read in local mode)
	 */
	public static void main(String[] args) {
		TopologyBuilder topologyBuilder = new TopologyBuilder();

		// Spout feeding sample sentences.
		topologyBuilder.setSpout("myspout", new WordCountSpout());
		// Splitting bolt, connected to the spout with shuffle grouping.
		topologyBuilder.setBolt("mysplit", new WordCountSplitBolt())
				.shuffleGrouping("myspout");
		// Counting bolt; fields grouping requires naming the grouping field.
		topologyBuilder.setBolt("mycount", new WordCountTotalBolt())
				.fieldsGrouping("mysplit", new Fields("word"));
		// Redis bolt that receives the counting bolt's output.
		topologyBuilder.setBolt("redisBolt", createRedisBolt())
				.shuffleGrouping("mycount");

		StormTopology wordCountTopology = topologyBuilder.createTopology();

		// Configuration passed to the spout's open() and the bolts' prepare().
		Config stormConf = new Config();

		// Local mode. For cluster mode, submit through StormSubmitter with the
		// topology name taken from args[0] instead.
		LocalCluster localCluster = new LocalCluster();
		localCluster.submitTopology("MyWC", stormConf, wordCountTopology);
	}

	/**
	 * Creates a bolt that stores each (word, total) pair into the Redis hash "myResult".
	 *
	 * @return a Redis store bolt configured for 192.168.112.111:6379
	 */
	private static IRichBolt createRedisBolt() {
		// Redis server address.
		JedisPoolConfig poolConfig = new JedisPoolConfig.Builder()
				.setHost("192.168.112.111")
				.setPort(6379)
				.build();

		RedisStoreMapper hashMapper = new RedisStoreMapper()
		{
			// Key comes from the upstream bolt's "word" field.
			@Override
			public String getKeyFromTuple(ITuple tuple) {
				return tuple.getStringByField("word");
			}

			// Value comes from the upstream bolt's "total" field, stored as text.
			@Override
			public String getValueFromTuple(ITuple tuple) {
				Integer total = tuple.getIntegerByField("total");
				return String.valueOf(total);
			}

			// Store the data as a Redis hash named "myResult".
			@Override
			public RedisDataTypeDescription getDataTypeDescription() {
				return new RedisDataTypeDescription(RedisDataType.HASH, "myResult");
			}
		};

		return new RedisStoreBolt(poolConfig, hashMapper);
	}
}

我写好了执行也没有问题,但是在redis中使用hgetall myResult的时候获取不到数据,最后发现是redisbolt的前一个bolt没有输出tuple造成的

package com.test.demo;

import java.util.HashMap;
import java.util.Map;

import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;

/**
 * Bolt that keeps a running total per word in an in-memory map, emits the
 * updated ("word", "total") pair downstream and prints it for each incoming tuple.
 */
public class WordCountTotalBolt extends BaseRichBolt
{

	private static final long serialVersionUID = 1L;

	// Collector supplied in prepare(); used to emit the updated totals.
	private OutputCollector collector;

	// Running total per word. Typed so lookups yield Integer without casts.
	private Map<String, Integer> result=new HashMap<String, Integer>();

	/**
	 * Adds the tuple's "count" to the running total of its "word", then emits
	 * and prints the new total.
	 *
	 * @param tuple incoming tuple with a string "word" field and an integer "count" field
	 */
	@Override
	public void execute(Tuple tuple) {
		String word = tuple.getStringByField("word");
		int increment = tuple.getIntegerByField("count");

		// First occurrence of a word starts from the incoming count.
		Integer previous = result.get(word);
		int total = (previous == null) ? increment : previous + increment;
		result.put(word, total);

		// Emit to the next component; without this, downstream bolts receive nothing.
		collector.emit(new Values(word,total));
		System.out.println("统计结果:"+word+"---"+total);
	}

	/**
	 * Stores the output collector used by {@link #execute(Tuple)}.
	 *
	 * @param arg0 configuration map (unused here)
	 * @param arg1 topology context (unused here)
	 * @param collector collector used to emit this bolt's tuples
	 */
	@Override
	public void prepare(Map arg0, TopologyContext arg1, OutputCollector collector) {
		this.collector=collector;
	}

	/**
	 * Declares the output fields "word" and "total".
	 *
	 * @param declarer declarer used to register the output fields
	 */
	@Override
	public void declareOutputFields(OutputFieldsDeclarer declarer) {
		declarer.declare(new Fields("word","total"));
	}

}

 

你可能感兴趣的:(大数据hadoop)