Loading a DataFrame from RDD Data

```scala
package cn.com.bigData0427

import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

case class Person(id: Int, name: String, age: Int, score: Double)
/**
  * Convert a Dataset (or RDD) to a DataFrame via reflection.
  *   Steps: 1. Read the file as a Dataset[String] (or RDD[String]) and map it to a
  *             Dataset[Person] (or RDD[Person]) of some case class type.
  *          2. Call .toDF directly on the Dataset[Person] or RDD[Person].
  *   Note: the generated DataFrame shows its columns in the order of the case class fields.
  */
object CreateDataFrameFromRDDWithReflection {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").appName("createDataFrameFromRDDWithReflection").getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    import spark.implicits._
    /**
      * Read the file directly as a Dataset[String]
      */
    val person: Dataset[String] = spark.read.textFile("./data/people.txt")
    val personDs: Dataset[Person] = person.map(one => {
      val arr = one.split(",")
      Person(arr(0).toInt, arr(1), arr(2).toInt, arr(3).toDouble)
    })

    /**
      * Alternatively, read the file directly as an RDD
      * (requires import org.apache.spark.rdd.RDD)
      */
//    val rdd: RDD[String] = spark.sparkContext.textFile("./data/people.txt")
//    val personDs: RDD[Person] = rdd.map(one => {
//      val arr = one.split(",")
//      Person(arr(0).toInt, arr(1), arr(2).toInt, arr(3).toDouble)
//    })

    val frame: DataFrame = personDs.toDF()
    frame.show()
    /**
      * DataFrame API operations
      */
    frame.createOrReplaceTempView("people")
    val teenagersDF: DataFrame = spark.sql("SELECT name, age FROM people WHERE age BETWEEN 13 AND 19")
    teenagersDF.show()
    // Access Row values by index
    teenagersDF.map((teenager: Row) => "Name: " + teenager(0)).show()
    // Access Row values by field name
    teenagersDF.map(teenager => "Name: " + teenager.getAs[String]("name")).show()

    /**
      * There is no predefined encoder for Dataset[Map[K, V]], so define one here
      */
//    implicit val mapEncoder = org.apache.spark.sql.Encoders.kryo[Map[String, Any]]
//    // Map needs this implicit Kryo encoder during the conversion
//    val result: Dataset[Map[String, Any]] = teenagersDF.map(teenager => teenager.getValuesMap[Any](List("name", "age")))
//    result.collect().foreach(println)
  }
}
```
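
Once the column names and types line up with a case class, the reverse conversion also works: `DataFrame.as[T]` re-attaches the typed encoder. Here is a minimal sketch of that round trip; the object and app names are made up for this example, and it reuses the `Person` case class and `people.txt` from the listing above (same package):

```scala
package cn.com.bigData0427

import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

object DataFrameBackToDataset {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").appName("dataFrameBackToDataset").getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    import spark.implicits._

    // same reflection-based conversion as above
    val personDs: Dataset[Person] = spark.read.textFile("./data/people.txt").map(one => {
      val arr = one.split(",")
      Person(arr(0).toInt, arr(1), arr(2).toInt, arr(3).toDouble)
    })
    val frame: DataFrame = personDs.toDF()

    // as[Person] succeeds because the DataFrame's column names and types
    // match the fields of the Person case class
    val typedAgain: Dataset[Person] = frame.as[Person]
    typedAgain.filter(_.age >= 20).show()
  }
}
```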

```scala
package cn.com.bigDataTest0427

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

case class Person(id: Int, name: String, age: Int, score: Double)

/**
  * Load a DataFrame by creating the schema dynamically.
  *
  * Note: the order of the values in each Row must match the order of the fields in the schema.
  */
object CreateDataFrameFromRDDWithSchema {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").appName("createdataframefromrddwithschema").getOrCreate()
    val peopleRDD: RDD[String] = spark.sparkContext.textFile("./data/people.txt")

    /**
      * Convert peopleRDD into an RDD[Row]
      */
    val rowRDD: RDD[Row] = peopleRDD.map(one => {
      val arr: Array[String] = one.split(",")
      // the value types here must match the schema defined below
      Row(arr(0).toInt, arr(1), arr(2).toInt, arr(3).toDouble)
    })

    val structType: StructType = StructType(List[StructField](
      StructField("id", IntegerType, nullable = true),
      StructField("name", StringType, nullable = true),
      StructField("age", IntegerType, nullable = true),
      StructField("score", DoubleType, nullable = true)
    ))

    val frame: DataFrame = spark.createDataFrame(rowRDD, structType)
    frame.show()
    frame.printSchema()
    frame.createOrReplaceTempView("people")

//    /**
//      * Building the schema dynamically from a string of field names
//      */
//    val schemaString = "id name age score"
//    val fields: Array[StructField] = schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, nullable = true))
//    val schema: StructType = StructType(fields)
//
//    val rowRDD: RDD[Row] = peopleRDD
//      .map(_.split(","))
//      .map(attributes => Row(attributes(0).trim, attributes(1).trim, attributes(2).trim, attributes(3).trim))
//
//    // create the DataFrame
//    import spark.implicits._
//    val peopleDF: DataFrame = spark.createDataFrame(rowRDD, schema)
//    peopleDF.show()
//    peopleDF.printSchema()
//
//    // register a temporary view
//    peopleDF.createOrReplaceTempView("people")
    import spark.implicits._
    val results: DataFrame = spark.sql("SELECT name FROM people")
    results.map(attributes => "Name: " + attributes(0)).as("myCol").show()
  }
}
```
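
The schema does not have to be spelled out as a `List[StructField]`: `StructType` also exposes an `add` method that appends a nullable field and reads a bit more compactly. A minimal variation of the listing above (the object and app names are made up for this sketch):

```scala
package cn.com.bigDataTest0427

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

object CreateDataFrameWithAddSchema {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").appName("createDataFrameWithAddSchema").getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")

    val rowRDD: RDD[Row] = spark.sparkContext.textFile("./data/people.txt").map(one => {
      val arr = one.split(",")
      Row(arr(0).toInt, arr(1), arr(2).toInt, arr(3).toDouble)
    })

    // add() appends a StructField with nullable = true by default;
    // field order here must still match the value order in each Row
    val schema: StructType = new StructType()
      .add("id", IntegerType)
      .add("name", StringType)
      .add("age", IntegerType)
      .add("score", DoubleType)

    val frame: DataFrame = spark.createDataFrame(rowRDD, schema)
    frame.show()
    frame.printSchema()
  }
}
```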


**people.txt**
```txt
1,zhangsan,18,100
2,lisi,19,200
3,wangwu,20,300
4,zhaoliu,30,300
```
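
For reference, with these four rows the query in the first listing (`age BETWEEN 13 AND 19`) matches only zhangsan and lisi, so `teenagersDF.show()` prints output along these lines (exact column widths may vary):

```txt
+--------+---+
|    name|age|
+--------+---+
|zhangsan| 18|
|    lisi| 19|
+--------+---+
```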
