Storm + Kafka + Zookeeper: Statistics over Millions of Stock Records on a Local Cluster

Introduction

The code below has been tested in a local-cluster environment, but it has not yet been submitted to the Docker cluster for a real run; this post will be updated once the final course project is finished.

Prerequisites

A Storm cluster

A Kafka cluster

A Zookeeper cluster

How to set up these three clusters is covered in the author's other articles.

Requirements

The assignment asks for statistics over 10 stock-data files computed on a Storm cluster. The data set covers 10 stocks, and each record carries fields such as the stock code, trade price, trade volume, and trade time.

Two tasks are implemented:

1. Real-time statistics of the total trade amount and trade volume per stock

2. Real-time statistics of the hourly trade amount and trade volume per stock

Approach

A Kafka producer reads the files and publishes every record as a message. A KafkaSpout consumes the messages and hands them to a parent bolt, which pre-processes the data and then distributes it to two child bolts, one per task.

Code

Dependencies (pom.xml)



<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.example</groupId>
        <artifactId>kafkaConsume</artifactId>
        <version>1.0-SNAPSHOT</version>
    </parent>

    <artifactId>storm</artifactId>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <!-- lombok -->
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.12</version>
        </dependency>
        <!-- junit -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.13</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>2.4.1</version>
        </dependency>
        <!-- storm -->
        <dependency>
            <groupId>org.apache.storm</groupId>
            <artifactId>storm-core</artifactId>
            <version>2.2.0</version>
        </dependency>
        <!-- storm-kafka integration -->
        <dependency>
            <groupId>org.apache.storm</groupId>
            <artifactId>storm-kafka-client</artifactId>
            <version>2.2.0</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.47</version>
        </dependency>
        <!-- csv parsing -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-csv</artifactId>
            <version>1.8</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.8.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>2.5.5</version>
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>com.yj.TCPClient.upload.App</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>

Kafka producer code:

package org.example.kafkaPC;

import org.apache.commons.csv.CSVRecord;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.serialization.StringSerializer;
import org.example.utils.ReadFile;
import org.example.utils.TimeFormatter;

import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.stream.IntStream;

public class Producer {
    // Reads the CSV files, computes the trade amount of each record,
    // and publishes every record to Kafka keyed by the stock code.
    public void producer() {

        // Input files
        List<String> filePaths = new ArrayList<>();
        filePaths.add("/home/veteran/data/fileData/股票数据/股票数据1.csv");
        filePaths.add("/home/veteran/data/fileData/股票数据/股票数据2.csv");
        filePaths.add("/home/veteran/data/fileData/股票数据/股票数据3.csv");
        filePaths.add("/home/veteran/data/fileData/股票数据/股票数据4.csv");
        filePaths.add("/home/veteran/data/fileData/股票数据/股票数据5.csv");
        filePaths.add("/home/veteran/data/fileData/股票数据/股票数据6.csv");
        filePaths.add("/home/veteran/data/fileData/股票数据/股票数据7.csv");
        filePaths.add("/home/veteran/data/fileData/股票数据/股票数据8.csv");
        filePaths.add("/home/veteran/data/fileData/股票数据/股票数据9.csv");
        filePaths.add("/home/veteran/data/fileData/股票数据/股票数据10.csv");

        // Target topic
        String topic = "stock_1";
        // Create the producer
        Properties properties = new Properties();
        properties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.43.219:9092,192.168.43.219:9093,192.168.43.219:9094");
        properties.put(ProducerConfig.LINGER_MS_CONFIG, 5);
        properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName());
        properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName());

        KafkaProducer<String, String> kProducer = new KafkaProducer<>(properties);
        // CSV reader and time-conversion utilities
        ReadFile readFile = new ReadFile();
        TimeFormatter timeFormatter = new TimeFormatter();
        // Process the ten files in turn
        IntStream.range(0, filePaths.size()).forEach(index -> {
            Iterable<CSVRecord> records = readFile.readCSV(filePaths.get(index));
            // Publish every record to the topic
            for (CSVRecord csvRecord : records) {
                try {
                    // Use stock_code as the message key
                    String key = csvRecord.get("stock_code");
                    // Trade volume of this record
                    double tradeVolume = Double.parseDouble(csvRecord.get("trade_volume"));
                    // Trade price of this record
                    float price = Float.parseFloat(csvRecord.get("price"));
                    String totalPrice = Double.toString(tradeVolume * price);
                    // Trade time, converted to a numeric timestamp
                    String time = csvRecord.get("time");
                    long convertedTime = timeFormatter.convertTime(time);
                    System.out.println(key + "," + totalPrice + "," + convertedTime);
                    // Message value has the form "tradeAmount,timestamp"
                    ProducerRecord<String, String> kRecord = new ProducerRecord<>(topic, key, totalPrice + "," + convertedTime);
                    // Send with the Kafka producer
                    kProducer.send(kRecord);
                } catch (Exception e) {
                    // Skip malformed records
                    e.printStackTrace();
                }
            }
        });
        // Close the producer
        kProducer.close();
    }

    public static void main(String[] args) {
        Producer producer = new Producer();
        producer.producer();
    }
}
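
The utility classes ReadFile and TimeFormatter are referenced above but not listed in this post. Below is a minimal sketch of what they might look like, assuming commons-csv with a header row and a "yyyy-MM-dd HH:mm:ss" time column; the actual column format of the data set is not shown in the post, so the parse pattern is only an example. Each class would live in its own file under org.example.utils.

// ReadFile.java (hypothetical sketch)
package org.example.utils;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;

import java.io.FileReader;
import java.io.IOException;

public class ReadFile {
    // Reads a CSV file with a header row and returns its records
    public Iterable<CSVRecord> readCSV(String path) {
        try {
            return CSVFormat.DEFAULT.withFirstRecordAsHeader().parse(new FileReader(path));
        } catch (IOException e) {
            throw new RuntimeException("Failed to read " + path, e);
        }
    }
}

// TimeFormatter.java (hypothetical sketch; adjust the pattern to the real "time" column)
package org.example.utils;

import java.text.ParseException;
import java.text.SimpleDateFormat;

public class TimeFormatter {
    private final SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

    // Converts the textual trade time into epoch milliseconds
    public long convertTime(String time) throws ParseException {
        return format.parse(time).getTime();
    }
}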

Topology code:

package org.example.storm;

import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.kafka.spout.KafkaSpout;
import org.apache.storm.kafka.spout.KafkaSpoutConfig;
import org.apache.storm.topology.TopologyBuilder;

public class Topology {
    public static void main(String[] args) throws Exception {
        // Configure the Kafka spout
        KafkaSpoutConfig<String, String> kafkaSpoutConfig = KafkaSpoutConfig
                .builder("192.168.43.219:9092,192.168.43.219:9093,192.168.43.219:9094", "stock_1")
                .setProp(ConsumerConfig.GROUP_ID_CONFIG, "kafkaSpoutTestGroup")
                .build();
        // Build the topology
        TopologyBuilder topologyBuilder = new TopologyBuilder();
        // Spout: reads messages from Kafka
        topologyBuilder.setSpout("kafka-spout", new KafkaSpout<>(kafkaSpoutConfig), 1);
        // Parent bolt: pre-processes and accumulates the raw records
        topologyBuilder.setBolt("process-bolt", new ProcessingBolt()).shuffleGrouping("kafka-spout");
        // Child bolt 1: per-stock totals (disabled for this test run)
//        topologyBuilder.setBolt("next-bolt1", new NextBolt1(), 3).shuffleGrouping("process-bolt");
        // Child bolt 2: per-stock hourly statistics
        topologyBuilder.setBolt("next-bolt2", new NextBolt2(), 1).shuffleGrouping("process-bolt");
        // Topology configuration
        Config config = new Config();

        // Submit to the real cluster
//        StormSubmitter.submitTopology("stockStatistic", config, topologyBuilder.createTopology());
        // Create a local cluster for testing
        LocalCluster cluster = new LocalCluster();
        cluster.submitTopology("LocalReadingFromKafkaApp", config, topologyBuilder.createTopology());
    }
}
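
As written, the local topology keeps running until the JVM is killed. If you want a test run that ends on its own, one option (not in the original code, assuming Storm 2.2's LocalCluster, which implements AutoCloseable) is to let the topology run for a fixed time and then stop the cluster:

        // Let the local topology run for two minutes, then stop it cleanly
        Thread.sleep(120_000);
        cluster.close();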

ProcessingBolt code:

package org.example.storm;

import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;

import java.util.HashMap;
import java.util.Map;

public class ProcessingBolt extends BaseRichBolt {
    private OutputCollector collector;
    private Map<String, Integer> tradeVolumeMap;
    private Map<String, Double> totalPriceMap;
    private Map<String, Long> startTimeMap;
    private Map<String, Long> accumulateTimeMap;

    @Override
    public void prepare(Map<String, Object> topoConf, TopologyContext topologyContext, OutputCollector outputCollector) {
        this.collector = outputCollector;
        // Per-stock trade count
        tradeVolumeMap = new HashMap<>();
        // Per-stock total trade amount
        totalPriceMap = new HashMap<>();
        // Per-stock earliest trade time seen so far
        startTimeMap = new HashMap<>();
        // Per-stock latest trade time seen so far
        accumulateTimeMap = new HashMap<>();
    }

    @Override
    public void execute(Tuple tuple) {
        // The Kafka message key is the stock code, used to group the statistics
        String key = tuple.getStringByField("key");
        // The message value has the form "tradeAmount,timestamp"
        String value = tuple.getStringByField("value");
        String[] valueList = value.split(",");
        double totalPrice = Double.parseDouble(valueList[0]);
        long time = Long.parseLong(valueList[1]);
        if (tradeVolumeMap.containsKey(key)) {
            // Accumulate the trade count
            tradeVolumeMap.put(key, tradeVolumeMap.get(key) + 1);
            // Accumulate the trade amount
            totalPriceMap.put(key, totalPriceMap.get(key) + totalPrice);
            // Track the latest trade time seen for this stock
            if (time > startTimeMap.get(key) && time > accumulateTimeMap.get(key)) {
                accumulateTimeMap.put(key, time);
            } else if (time < startTimeMap.get(key)) {
                // The record is earlier than anything seen so far: move the start time back
                startTimeMap.put(key, time);
            }
        } else {
            tradeVolumeMap.put(key, 1);
            totalPriceMap.put(key, totalPrice);
            startTimeMap.put(key, time);
            accumulateTimeMap.put(key, time);
        }
        // Pass the accumulated maps on to the downstream bolts
        Values values = new Values(tradeVolumeMap, totalPriceMap, startTimeMap, accumulateTimeMap);
        collector.emit(values);
        collector.ack(tuple);
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
        // Declare the fields of the default output stream
        outputFieldsDeclarer.declare(new Fields("tradeVolumeMap", "totalPriceMap", "startTimeMap", "accumulateTimeMap"));
    }
}
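
Note that ProcessingBolt emits its internal HashMaps directly. Tuples that stay inside one worker (which is always the case in local mode) are passed by reference, so downstream bolts can observe the maps while they are still being updated; that is why NextBolt1 and NextBolt2 copy them before use. An alternative (not in the original code) is to emit defensive copies from this bolt instead:

        // Emit snapshots instead of the live maps, so downstream bolts
        // never observe concurrent modifications
        collector.emit(new Values(
                new HashMap<>(tradeVolumeMap),
                new HashMap<>(totalPriceMap),
                new HashMap<>(startTimeMap),
                new HashMap<>(accumulateTimeMap)));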

NextBolt1 code:

package org.example.storm;

import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Tuple;

import java.util.HashMap;
import java.util.Map;

public class NextBolt1 extends BaseRichBolt {
    private OutputCollector collector;
    private long startTime;

    @Override
    public void prepare(Map<String, Object> topoConf, TopologyContext topologyContext, OutputCollector outputCollector) {
        this.collector = outputCollector;
        this.startTime = System.currentTimeMillis();
    }

    @Override
    public void execute(Tuple tuple) {
        // Copy the accumulated maps emitted by ProcessingBolt
        Map<String, Integer> tradeVolumeMap = new HashMap<>((Map<String, Integer>) tuple.getValueByField("tradeVolumeMap"));
        Map<String, Double> totalPriceMap = new HashMap<>((Map<String, Double>) tuple.getValueByField("totalPriceMap"));
        Map<String, Long> startTimeMap = new HashMap<>((Map<String, Long>) tuple.getValueByField("startTimeMap"));
        Map<String, Long> accumulateTimeMap = new HashMap<>((Map<String, Long>) tuple.getValueByField("accumulateTimeMap"));
        // Task 1: per-stock totals so far
        for (Map.Entry<String, Integer> entry : tradeVolumeMap.entrySet()) {
            String stockCode = entry.getKey();
            Integer totalVolume = entry.getValue();
            Double totalPrice = totalPriceMap.get(stockCode);
//            System.out.printf("[%s] total amount %f, %d trades%n", stockCode, totalPrice, totalVolume);
        }
        System.out.printf("Elapsed: %ds%n", (System.currentTimeMillis() - this.startTime) / 1000);

        collector.ack(tuple);
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
        // Terminal bolt: no output stream
    }
}

NextBolt2 code:

package org.example.storm;

import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Tuple;

import java.util.HashMap;
import java.util.Map;

public class NextBolt2 extends BaseRichBolt {
    private Map<String, Integer> tradeVolumeMap;
    private Map<String, Double> totalPriceMap;
    private Map<String, Long> startTimeMap;
    private Map<String, Long> accumulateTimeMap;
    private OutputCollector collector;
    // Per-stock state for the current hourly window
    private Map<String, Long> recordStartTimeMap;
    private Map<String, Integer> recordTradeVolumeMap;
    private Map<String, Double> recordTotalPriceMap;
    private long startTime;

    @Override
    public void prepare(Map<String, Object> topoConf, TopologyContext topologyContext, OutputCollector outputCollector) {
        this.collector = outputCollector;
        this.startTime = System.currentTimeMillis();
        // Accumulated values received from ProcessingBolt
        tradeVolumeMap = new HashMap<>();
        totalPriceMap = new HashMap<>();
        startTimeMap = new HashMap<>();
        accumulateTimeMap = new HashMap<>();

        // Window start time per stock
        recordStartTimeMap = new HashMap<>();
        // Trade count at the start of the window per stock
        recordTradeVolumeMap = new HashMap<>();
        // Trade amount at the start of the window per stock
        recordTotalPriceMap = new HashMap<>();
    }

    @Override
    public void execute(Tuple tuple) {
        // Receive the accumulated maps from ProcessingBolt
        tradeVolumeMap = (Map<String, Integer>) tuple.getValueByField("tradeVolumeMap");
        totalPriceMap = (Map<String, Double>) tuple.getValueByField("totalPriceMap");
        startTimeMap = (Map<String, Long>) tuple.getValueByField("startTimeMap");
        accumulateTimeMap = (Map<String, Long>) tuple.getValueByField("accumulateTimeMap");
        // Work on copies so the upstream maps are not read while being modified
        Map<String, Integer> tradeVolumeMapCopy = new HashMap<>(tradeVolumeMap);
        Map<String, Double> totalPriceMapCopy = new HashMap<>(totalPriceMap);
        Map<String, Long> startTimeMapCopy = new HashMap<>(startTimeMap);
        Map<String, Long> accumulateTimeMapCopy = new HashMap<>(accumulateTimeMap);

        for (Map.Entry<String, Integer> entry : tradeVolumeMapCopy.entrySet()) {
            String key = entry.getKey();
            if (recordTradeVolumeMap.containsKey(key)) {
                long accumulateTime = accumulateTimeMapCopy.get(key);
                long recordStartTime = recordStartTimeMap.get(key);
                // Deltas since the start of the current window
                long volume = tradeVolumeMapCopy.get(key) - recordTradeVolumeMap.get(key);
                double price = totalPriceMapCopy.get(key) - recordTotalPriceMap.get(key);
                // One hour of trade time has passed: report and start a new window
                if (accumulateTime - recordStartTime >= 3600000L) {
//                    System.out.printf("[%s] %d trades in the last hour, amount %f%n", key, volume, price);
                    // Reset the window start time
                    recordStartTimeMap.put(key, accumulateTime);
                    // Reset the trade count baseline
                    recordTradeVolumeMap.put(key, tradeVolumeMapCopy.get(key));
                    // Reset the trade amount baseline
                    recordTotalPriceMap.put(key, totalPriceMapCopy.get(key));
                }
            } else {
                // First record for this stock: initialise the window state
                recordStartTimeMap.put(key, startTimeMapCopy.get(key));
                recordTradeVolumeMap.put(key, tradeVolumeMapCopy.get(key));
                recordTotalPriceMap.put(key, totalPriceMapCopy.get(key));
            }
        }
        System.out.printf("Elapsed: %ds%n", (System.currentTimeMillis() - this.startTime) / 1000);
        collector.ack(tuple);
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
        // Terminal bolt: no output stream
    }
}

Problems and solutions

1. Coordinating the two bolts

The two concrete tasks are handled by two separate bolts. When both run at the same time we occasionally need to print results for inspection, but printing costs far more time than the computation itself. During testing, if the two bolts print different amounts of output they consume very different amounts of resources, and when the gap gets too large this triggers thread errors, so the resource allocation between the two bolts has to be kept under control; one simple way to throttle the printing is sketched below.
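
A minimal sketch (not part of the original code, class name hypothetical): instead of printing on every execute call, count tuples and only print once every N executes, which keeps the output cost of both bolts roughly equal and small.

package org.example.storm;

import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Tuple;

import java.util.Map;

// Hypothetical helper bolt: prints only once every 10,000 tuples
public class ThrottledPrintBolt extends BaseRichBolt {
    private OutputCollector collector;
    private long executeCount;

    @Override
    public void prepare(Map<String, Object> topoConf, TopologyContext context, OutputCollector outputCollector) {
        this.collector = outputCollector;
        this.executeCount = 0;
    }

    @Override
    public void execute(Tuple tuple) {
        executeCount++;
        // Throttle console output so printing never dominates the computation
        if (executeCount % 10_000 == 0) {
            System.out.printf("processed %d tuples so far%n", executeCount);
        }
        collector.ack(tuple);
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // no downstream output
    }
}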

2. Moving from the local cluster to the real cluster

The code above has only been tested on the local cluster. After packaging it and submitting it to the real cluster, the spout failed to receive any messages from Kafka. This problem is still unresolved; the post will be updated once it is fixed as part of the final course project.
