// Spark Streaming file input stream
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
// val inputFile = "file:///usr/local/spark/mycode/wordcount/word.txt"
val inputFile = "hdfs://192.168.126.130:9000/usr/local"
val conf = new SparkConf().setAppName("streamingApp").setMaster("local")
// val sc = new SparkContext(conf)
// val ssc = new StreamingContext(sc,Seconds(20));
val ssc = new StreamingContext(conf, Seconds(20)) // batch interval of 20 seconds
val lines = ssc.textFileStream(inputFile)
val wordCounts = lines.flatMap(line => line.split(" ")).map(word => (word, 1)).reduceByKey((a, b) => a + b)
wordCounts.print()
ssc.start() // start monitoring the directory for new files
ssc.awaitTermination()
Upload a local file to the monitored HDFS directory:
hadoop fs -copyFromLocal /usr/local/spark/mycode/wordcount/word.txt /usr/local
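If the monitored directory does not yet exist on HDFS, create it first (the path below is assumed to match the inputFile directory above; adjust it to your setup):
hadoop fs -mkdir -p /usr/local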
Result screenshot: the word counts computed from the uploaded file are printed to the console every 20 seconds.
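For reference, the console output of print() looks roughly like the sketch below; the timestamp and counts are illustrative and depend on the contents of word.txt:
-------------------------------------------
Time: 1488120000000 ms
-------------------------------------------
(hello,2)
(spark,1)
(streaming,1)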
// Socket input stream
import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
// StreamingExamples.setStreamingLogLevels()
val conf = new SparkConf().setAppName("scoketInputStream").setMaster("local")
val ssc = new StreamingContext(conf,Seconds(10))
val lines = ssc.socketTextStream("192.168.126.130",9999,StorageLevel.MEMORY_AND_DISK_SER)
val wordCount = lines.flatMap(_.split(" ")).map(x=>(x,1)).reduceByKey(_+_)
wordCount.print()
ssc.start()
ssc.awaitTermination()
nc -lk 9999
Type some words into the nc session; the word counts are printed every 10 seconds.
Kafka producer
// if (args.length < 4) {
// System.err.println("Usage: KafkaWordCountProducer " +
// " ")
// System.exit(1)
// }
// val Array(brokers, topic, messagesPerSec, wordsPerMessage) = args
// // Zookeeper connection properties
// val props = new HashMap[String, Object]()
// props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers)
// props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG,
// "org.apache.kafka.common.serialization.StringSerializer")
// props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG,
// "org.apache.kafka.common.serialization.StringSerializer")
// val producer = new KafkaProducer[String, String](props)
// // Send some messages
// while(true) {
// (1 to messagesPerSec.toInt).foreach { messageNum =>
// val str = (1 to wordsPerMessage.toInt).map(x => scala.util.Random.nextInt(10).toString)
// .mkString(" ")
// print(str)
// println()
// val message = new ProducerRecord[String, String](topic, null, str)
// producer.send(message)
// }
// Thread.sleep(1000)
// }
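The producer takes four arguments: the Kafka broker list, the target topic, messages per second, and words per message. A hypothetical invocation, assuming the code is packaged as a KafkaWordCountProducer application and Kafka is listening on its default port (the broker address is an assumption; the topic must match the one the consumer subscribes to below):
KafkaWordCountProducer localhost:9092 wordsender 3 5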
Kafka consumer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Minutes, Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.KafkaUtils

val sc = new SparkConf().setAppName("KafkaWordCount").setMaster("local[2]")
val ssc = new StreamingContext(sc, Seconds(10))
ssc.checkpoint("file:///usr/local/spark/mycode/kafka/checkpoint") // set a checkpoint directory; to keep it on HDFS use something like ssc.checkpoint("/user/hadoop/checkpoint"), but Hadoop must be running
val zkQuorum = "localhost:2181" // Zookeeper server address
val group = "1" // consumer group for the topic; any name works, e.g. val group = "test-consumer-group"
val topics = "wordsender" // topic name(s), comma separated
val numThreads = 1 // number of consumer threads per topic
val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
val lineMap = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap)
val lines = lineMap.map(_._2)
val words = lines.flatMap(_.split(" "))
val pair = words.map(x => (x, 1))
val wordCounts = pair.reduceByKeyAndWindow(_ + _, _ - _, Minutes(2), Seconds(10), 2)
wordCounts.print()
ssc.start()
ssc.awaitTermination()
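KafkaUtils.createStream is the receiver-based API and lives in a separate artifact that must be on the classpath. A rough sbt sketch, assuming Spark 1.6.x (for Spark 2.x the receiver-based API is published as spark-streaming-kafka-0-8; adjust the version to your installation):
// sbt dependency sketch; the version is an assumption, match it to your Spark build
libraryDependencies += "org.apache.spark" %% "spark-streaming-kafka" % "1.6.3"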