Three Ways to Create a DataFrame/DataSet

The complete code for each method is given below and can be copied directly.
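
All three examples read a small space-separated text file. The original post doesn't show the contents of tt.txt, but judging from how each line is parsed (id, name, age), it presumably looks something like this (hypothetical sample data):

1 zhangsan 20
2 lisi 25
3 wangwu 30
4 zhaoliu 35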

**Method 1: Specify the Schema via StructType**

package spark_sql


import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

object Spark_Sql_Schema {
  def main(args: Array[String]): Unit = {
    // Create the SparkSession
    // and obtain the SparkContext from it
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("zhiDingSchema").getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")

    // Read and parse the data
    val ttRDD: RDD[String] = sc.textFile("D:\\大数据\\学期文档\\spark\\资料\\tt.txt")
    val lineDatas: RDD[Array[String]] = ttRDD.map(a => a.split(" "))
    // createDataFrame requires an RDD[Row] here
    val rowRDD: RDD[Row] = lineDatas.map(x => Row(x(0).toInt, x(1), x(2).toInt))

    // Define the schema (table structure); the third StructField argument marks the column as nullable
    val schema: StructType = StructType(
      Seq(
        StructField("id", IntegerType, true),
        StructField("name", StringType, true),
        StructField("age", IntegerType, true)
      )
    )

    // Create the DataFrame
    val df: DataFrame = spark.createDataFrame(rowRDD, schema)

    // Inspect the data
    df.show()
    df.printSchema()

    // To query with SQL, first register df as a temporary view:
    //  df.createOrReplaceTempView("person")
    //  spark.sql("select * from person").show()

    // Stop the SparkContext and SparkSession
    sc.stop()
    spark.stop()
  }

}
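
With the hypothetical sample data above, df.show() and df.printSchema() would print roughly the following (the nullable flags come from the StructField definitions):

+--+--------+---+
|id|    name|age|
+--+--------+---+
| 1|zhangsan| 20|
| 2|    lisi| 25|
| 3|  wangwu| 30|
| 4| zhaoliu| 35|
+--+--------+---+

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)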

**Method 2: Specify Column Names to Add the Schema (toDF)**

package spark_sql

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}

object SparkSqlZhiDinSchema {
  def main(args: Array[String]): Unit = {
    // Create the SparkSession
    // and obtain the SparkContext from it
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("zhiDingSchema").getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")

    // Read and parse the data
    val ttRDD: RDD[String] = sc.textFile("D:\\大数据\\学期文档\\spark\\资料\\tt.txt")
    val lineDatas: RDD[Array[String]] = ttRDD.map(a => a.split(" "))
    val tupleRDD: RDD[(Int, String, Int)] = lineDatas.map(x => (x(0).toInt, x(1), x(2).toInt))

    // Convert to DF/DS
    // Import the implicit conversions first
    import spark.implicits._
    val rddDF: DataFrame = tupleRDD.toDF("id", "name", "age")

    // Inspect the data
    rddDF.show()
    rddDF.printSchema()

//    // To query with SQL, register a temp view first:
//    rddDF.createOrReplaceTempView("tt")
//    spark.sql("select * from tt").show()

    // Note: with the StructType approach (Method 1), the DSL-style $ column syntax can't be used, since spark.implicits._ isn't imported there

    // Stop the SparkContext and SparkSession
    sc.stop()
    spark.stop()

  }
}
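
Because spark.implicits._ is imported in this version, the $ column syntax does work on rddDF. A minimal sketch of DSL-style queries (it would go before the stop calls; column names are the ones passed to toDF above):

    // DSL-style queries using the $ column syntax
    rddDF.select($"name", $"age").filter($"age" > 25).show()
    rddDF.groupBy($"age").count().show()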

**Method 3: Define a Case Class and Infer the Schema via Reflection**

package spark_sql

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

object SparkSqlFanShe {
  // Define a case class; its field names and types become the inferred schema
  case class Person(id: Int, name: String, age: Int)

  def main(args: Array[String]): Unit = {
    // Create the SparkSession
    // and obtain the SparkContext from it
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("zhiDingSchema").getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")

    // Read and parse the data
    val ttRDD: RDD[String] = sc.textFile("D:\\大数据\\学期文档\\spark\\资料\\tt.txt")
    val lineDatas: RDD[Array[String]] = ttRDD.map(a => a.split(" "))
    // Map each line into an instance of the case class
    val personRDD: RDD[Person] = lineDatas.map(line => Person(line(0).toInt, line(1), line(2).toInt))

    // Convert to DF/DS
    // Import the implicit conversions
    import spark.implicits._
    val df: DataFrame = personRDD.toDF()

    // Inspect the data
    df.show()
    df.printSchema()

    // Register df as a temporary view
    df.createOrReplaceTempView("person")

    // SQL query
    //spark.sql("select * from person").show()

    // The DSL style also works
    df.select("id", "name").filter($"id" > 3).show()

    /**
     * Notes
     * Conversions among RDD, DataFrame, and Dataset
     */

    // RDD -> DF / DS
    personRDD.toDF() // requires the implicits import above
    val DS: Dataset[Person] = personRDD.toDS()

    // DF -> RDD / DS
    df.rdd
    val DS1: Dataset[Person] = df.as[Person]

    // DS -> RDD / DF
    DS.rdd
    DS.toDF()

    /**
     * Summary
     * 1. To an RDD: just call .rdd
     * 2. To a DataFrame: just call toDF()
     * 3. To a Dataset:
     *    3.1 RDD -> DS: toDS()
     *    3.2 DF  -> DS: as[Person]
     */

    // Stop the SparkContext and SparkSession
    sc.stop()
    spark.stop()

  }

}
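
One practical difference between the DS and df values above: a Dataset[Person] keeps the element type, so operations can use compile-time-checked lambdas instead of column-name strings. A minimal sketch (it would go inside main, after DS is created):

    // Typed operations on Dataset[Person]; the encoders come from spark.implicits._
    DS.filter(p => p.age > 25).map(p => p.name).show()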
