1. Reading Data with Spark and Scala Programming

Below is the Maven pom used in the IDEA project.
Versions:
Spark 2.3.1
Scala 2.11
Hadoop 3.1.1



<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.attest.bigdata</groupId>
    <artifactId>spark-200329</artifactId>
    <version>1.0</version>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.3.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>2.3.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>2.3.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka_2.11</artifactId>
            <version>1.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
            <version>2.3.1</version>
        </dependency>
    </dependencies>

    <build>
        <finalName>WordCount</finalName>
        <plugins>
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
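The Kafka artifacts in this pom are not exercised by the WordCount example below; for reference, here is a minimal sketch of the receiver-based consumer that spark-streaming-kafka-0-8_2.11 provides. The ZooKeeper address, consumer group, and topic name are placeholders, not values from this project.

package com.sparktest.bigdata.spark

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.KafkaUtils

// Minimal sketch only: consumes a Kafka topic with the 0-8 receiver API.
// local[2] is required because the receiver itself occupies one thread.
object KafkaWordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("KafkaWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))

    // createStream(ssc, zkQuorum, groupId, topic -> receiver-thread count)
    // All three string values below are placeholders; adjust to your cluster.
    val messages = KafkaUtils.createStream(
      ssc, "192.168.56.101:2181", "wordcount-group", Map("stream" -> 1))

    messages
      .map(_._2)              // drop the Kafka message key, keep the value
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)
      .print()

    ssc.start()
    ssc.awaitTermination()
  }
}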

With these dependencies in place, the WordCount program below reads a text file from HDFS, splits each line into words, and sums the occurrences of each word.

package com.sparktest.bigdata.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object WordCount {
  def main(args: Array[String]): Unit = {
    // Run locally in a single thread; the app name shows up in the Spark UI
    val config: SparkConf = new SparkConf().setMaster("local").setAppName("WordCount")

    val sc = new SparkContext(config)

    // Read the input file from HDFS
    val lines: RDD[String] = sc.textFile("hdfs://192.168.56.101:9000/stream")
    val words: RDD[String] = lines.flatMap(_.split(" "))
    val wordToOne: RDD[(String, Int)] = words.map((_, 1))
    val wordToSum: RDD[(String, Int)] = wordToOne.reduceByKey(_ + _)
    // collect() brings the results back to the driver as a local array
    val result: Array[(String, Int)] = wordToSum.collect()
    result.foreach(println)
    // Equivalent one-liner:
    // sc.textFile("input").flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).collect

    sc.stop()
  }
}
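Since the pom also declares spark-sql, the same count can be written against the SparkSession API introduced in Spark 2.x. A minimal sketch, reusing the HDFS path from the example above:

package com.sparktest.bigdata.spark

import org.apache.spark.sql.SparkSession

// Sketch: the same word count via the Dataset API (spark-sql).
object WordCountSql {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local")
      .appName("WordCountSql")
      .getOrCreate()
    import spark.implicits._   // encoders for flatMap/groupByKey

    // read.textFile returns a Dataset[String], one row per line
    val lines = spark.read.textFile("hdfs://192.168.56.101:9000/stream")
    val counts = lines
      .flatMap(_.split(" "))   // one row per word
      .groupByKey(identity)    // group identical words
      .count()                 // Dataset[(String, Long)]

    counts.collect().foreach(println)
    spark.stop()
  }
}

Here groupByKey(identity).count() performs the same shuffle-and-sum as reduceByKey does in the RDD version.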
