Storm Integration with HDFS: Reading Data from HDFS (Part 2)

Last time I wrote about reading data from HDFS with Storm, but it did not work well: the files could only be read once, and there was no way to control how many times they were read, as shown in the figure below.

[Figure 1]

So in this post I will read HDFS data with a second approach.

Requirements: 1. continuously read all the files under a directory; 2. exit the topology after the last file has been read.

Code part 1: the program entry point

package com.bigdata.storm;

import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.StormSubmitter;
import org.apache.storm.generated.AuthorizationException;
import org.apache.storm.generated.InvalidTopologyException;
import org.apache.storm.generated.StormTopology;
import org.apache.storm.thrift.TException;
import org.apache.storm.topology.TopologyBuilder;

import com.bigdata.getfromhdfs.GetHdfsSpout;
import com.readhdfs.CountBolt;

/**
 * Describes a topology object and submits it to the Storm cluster
 * @author xiaxing
 *
 */
public class TopoSubmitterClient {

	public static void main(String[] args) throws TException, InvalidTopologyException, AuthorizationException {
		
		//First obtain a topology builder
		TopologyBuilder builder = new TopologyBuilder();
		
		//Register the spout component of the topology
		//arg 1: the spout id   arg 2: the spout instance
		builder.setSpout("hdfs-spout", new GetHdfsSpout());
		
		//Register the first bolt of the topology and declare which component its stream comes from
		builder.setBolt("countBolt", new CountBolt()).shuffleGrouping("hdfs-spout");
		
		
		//Use the builder to generate a Topology object
		StormTopology phoneTopo = builder.createTopology();
		
		
		//Submit phoneTopo to the cluster
		Config config = new Config();
		//Ask the Storm cluster to run the topology with 6 workers
		config.setNumWorkers(6);
		
		if(args.length > 0) {
			//run on the real cluster
			StormSubmitter.submitTopology("phonecount-topo", config, phoneTopo);
		} else {
			//Storm also supports an in-process local cluster for testing
			LocalCluster localCluster = new LocalCluster();
			localCluster.submitTopology("phonecount-topo", config, phoneTopo);
		}
		
	}
}
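
In local mode the code above never shuts the in-process cluster down; the topology only stops when the spout calls System.exit(0). Below is a minimal sketch of a gentler alternative, assuming the same topology name: let the topology run for a fixed window, then kill it and shut the LocalCluster down. The class name LocalRunSketch and the 60-second wait are illustrative assumptions, not part of the original code.

import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.generated.StormTopology;
import org.apache.storm.utils.Utils;

//Hypothetical local-mode runner: submit, wait a fixed window, then shut down cleanly
public class LocalRunSketch {

	public static void run(StormTopology phoneTopo, Config config) {
		LocalCluster localCluster = new LocalCluster();
		localCluster.submitTopology("phonecount-topo", config, phoneTopo);

		//Give the spout and bolt time to work through the files (window length is an assumption)
		Utils.sleep(60 * 1000);

		//Stop the topology and the in-process cluster instead of calling System.exit(0)
		localCluster.killTopology("phonecount-topo");
		localCluster.shutdown();
	}
}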

Code part 2: the data source spout that reads files from HDFS: GetHdfsSpout

package com.bigdata.getfromhdfs;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;
import org.apache.storm.utils.Utils;
/**
 * A custom spout that reads files from HDFS
 * @author xiaxing
 *
 */
public class GetHdfsSpout extends BaseRichSpout {

	private static final long serialVersionUID = 1L;

	//Keep the SpoutOutputCollector passed to open() in a field so that nextTuple() can emit through it
	private SpoutOutputCollector collector ;
	private Map conf;
	private TopologyContext context;
	public String filename;
	
	
	
	//This path must match the location the HdfsBolt wrote to in the previous post
	private String filePath = "/phoneStorm";
	
	public void nextTuple() {
		readHdfsData(filePath);
		
	}

	public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
		this.collector = collector;
		this.conf = conf;
		this.context = context;
	
	}

	public void declareOutputFields(OutputFieldsDeclarer declarer) {
		declarer.declare(new Fields("line-word"));
		
	}
	
	private void readHdfsData(String FilePath){
		System.out.println("start fetching data");
		//Create the Configuration object
		Configuration conf=new Configuration();
		conf.set("fs.defaultFS","hdfs://192.168.83.131:9000");
		
		//Iterate over all files in the directory
		BufferedReader in = null;
		FSDataInputStream dis;
		String line;
		try {
			//Create the FileSystem object
			FileSystem hdfs = FileSystem.get(conf);
			FileStatus[] status = hdfs.listStatus(new Path(FilePath));
			for(int i = 0; i < status.length; i++) {
				//Print basic information about the current file
				System.out.println(">>>>>"+status[i].getPath()
						+" , length: "+status[i].getLen()
						+" , dir owner:"+status[i].getOwner());
				System.out.println("*****************************");
				//Pause so the file information above stands out in the log
				Utils.sleep(5000);
				
				dis = hdfs.open(status[i].getPath());//open an input stream for this file
				in = new BufferedReader(new InputStreamReader(dis, "UTF-8"));
				while ((line = in.readLine()) != null){
					System.out.println("fetched line: "+line);
					this.collector.emit(new Values(line));
					Utils.sleep(100);
				}
				if(i == status.length-1) {
					System.out.println("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<");
					System.out.println("**********finished reading all "+status.length+" files under "+filePath+", time to stop the whole topology************");
					System.out.println(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");
					System.exit(0);
				}
			}
			
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			try {
				//guard against the case where the stream was never opened
				if(in != null) {
					in.close();
				}
			}catch(IOException e) {
				e.printStackTrace();
			}
		}

	}

}
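
One thing worth noting: nextTuple() above re-scans and re-reads the whole directory on every call until System.exit(0) fires. Below is a minimal sketch of a variant, assuming the same directory and NameNode address as above: it opens the FileSystem once in open() and guards nextTuple() with a flag so the directory is read only once. The class name GetHdfsSpoutOnce and the finished flag are illustrative, not part of the original code.

package com.bigdata.getfromhdfs;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;

//Hypothetical variant of GetHdfsSpout that reads the directory only once
public class GetHdfsSpoutOnce extends BaseRichSpout {

	private static final long serialVersionUID = 1L;

	private SpoutOutputCollector collector;
	private transient FileSystem hdfs;
	private boolean finished = false;               //set once the directory has been read
	private final String filePath = "/phoneStorm";  //same directory as the spout above

	public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
		this.collector = collector;
		try {
			//Connect to HDFS once, when the spout is opened on the worker
			Configuration hadoopConf = new Configuration();
			hadoopConf.set("fs.defaultFS", "hdfs://192.168.83.131:9000");
			this.hdfs = FileSystem.get(hadoopConf);
		} catch (Exception e) {
			throw new RuntimeException("failed to connect to HDFS", e);
		}
	}

	public void nextTuple() {
		if (finished) {
			return;  //everything was already emitted; idle instead of exiting the JVM
		}
		try {
			FileStatus[] status = hdfs.listStatus(new Path(filePath));
			for (FileStatus fileStatus : status) {
				//Read each file line by line and emit every line as a tuple
				try (BufferedReader in = new BufferedReader(
						new InputStreamReader(hdfs.open(fileStatus.getPath()), "UTF-8"))) {
					String line;
					while ((line = in.readLine()) != null) {
						collector.emit(new Values(line));
					}
				}
			}
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			finished = true;  //never re-read the directory on later nextTuple() calls
		}
	}

	public void declareOutputFields(OutputFieldsDeclarer declarer) {
		declarer.declare(new Fields("line-word"));
	}
}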

Code part 3: counting how many times each phone number appears: CountBolt

package com.readhdfs;
 
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
 
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
 
/**
 * Final bolt in the chain: counts how many times each word appears
 */
public class CountBolt extends BaseRichBolt {
    private Map<String, Integer> map = new HashMap<>();
    private OutputCollector collector;
    @Override
    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
        this.collector = collector;
    }
 
    @Override
    public void execute(Tuple tuple) {
        String word = tuple.getStringByField("line-word");
        Integer num = map.get(word);
        if(num == null){
            num = 1;
        }else {
            num++;
        }
        map.put(word,num);
        System.out.println("**************************");
        Set<Map.Entry<String, Integer>> entries = map.entrySet();
        for(Map.Entry<String, Integer> entry : entries){
            System.out.println(entry.getKey()+" appeared "+entry.getValue()+" times");
        }
        }
        collector.emit(new Values(word, num));
    }
 
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("word", "num"));
    }
}
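
The execute() above emits an unanchored tuple and never acks its input. If acking is ever enabled on the spout side, a drop-in replacement for execute() along these lines would anchor the outgoing tuple and ack the incoming one. This is only a sketch that reuses the map and collector fields of the class above and assumes Java 8 for getOrDefault.

    @Override
    public void execute(Tuple tuple) {
        String word = tuple.getStringByField("line-word");
        Integer num = map.getOrDefault(word, 0) + 1;
        map.put(word, num);

        //Anchor the new tuple to the input and ack it so Storm can track the tuple tree
        collector.emit(tuple, new Values(word, num));
        collector.ack(tuple);
    }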

That wraps up the code. Looking back, this controllable approach actually works a bit better than the built-in one. Think things through and you learn a lot.
