[Spark Application] -- Implementing word count with Spark Streaming (data source on HDFS)

The implementation is as follows:

package com.scala.my

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Durations, StreamingContext}

/**
 * Scala version of word count over files on HDFS
 */
object HdfsWordCount {
  def main(args: Array[String]): Unit = {
    // Create the StreamingContext with a 5-second batch interval
    val ssc = new StreamingContext(
      new SparkConf().setAppName("hdfsCount").setMaster("local[2]"),
      Durations.seconds(5))
    // Monitor a directory on HDFS for newly arrived files
    val lines = ssc.textFileStream("hdfs://master:8020/wordcount_dir")
    // Split each comma-separated line into words
    val words = lines.flatMap(_.split(","))
    // Map each word to a (word, 1) pair
    val pairs = words.map((_, 1))
    // Sum the counts per word within each batch
    val wordCounts = pairs.reduceByKey(_ + _)
    // Print the first 10 records of each batch
    wordCounts.print()
    // Start the computation
    ssc.start()
    // Block until the streaming computation is stopped or fails
    ssc.awaitTermination()
    // Release resources once the context has terminated
    ssc.stop()
  }
}
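
Note that textFileStream only picks up files that appear in the monitored directory after the stream has started, and Spark expects files to be moved (renamed) into that directory atomically rather than written there in place. Below is a minimal sketch of a helper that drops a test file into the directory this way; the object name PushTestFile, the scratch path /tmp/words.txt.tmp, and the sample file contents are made up for illustration, while the namenode address and directory match the ones used above.

package com.scala.my

import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

/**
 * Drops a test file into the directory monitored by HdfsWordCount.
 */
object PushTestFile {
  def main(args: Array[String]): Unit = {
    val fs = FileSystem.get(new URI("hdfs://master:8020"), new Configuration())
    // Write to a scratch path first (hypothetical location for this sketch) ...
    val tmp = new Path("/tmp/words.txt.tmp")
    val out = fs.create(tmp)
    out.write("spark,hadoop,spark\nhive,spark,hdfs\n".getBytes("UTF-8"))
    out.close()
    // ... then atomically rename it into the monitored directory, so
    // textFileStream never sees a half-written file
    fs.rename(tmp, new Path("/wordcount_dir/words.txt"))
    fs.close()
  }
}

To try it out, start HdfsWordCount first and then run the helper (or, equivalently, upload a file from the shell with hdfs dfs -put); the next 5-second batch should print the word counts.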

 
