Three Ways to Create a DataFrame in Spark

Scala

package blog

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

/**
  * @Author Daniel
  * @Description Three ways to create a DataFrame in Scala
  **/

object CreateDataFrame {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("CreateDataFrame")
      .getOrCreate()

    import spark.implicits._
    // Method 1: create via toDF on a local Seq (enabled by the implicits import above)
    val df1 = Seq(
      (1, "Karol", 19),
      (2, "Abby", 20),
      (3, "Zena", 18)
    ).toDF("id", "name", "age")
    df1.show()
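    // Expected output of df1.show(), approximately:
    // +---+-----+---+
    // | id| name|age|
    // +---+-----+---+
    // |  1|Karol| 19|
    // |  2| Abby| 20|
    // |  3| Zena| 18|
    // +---+-----+---+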

    // Method 2: create via spark.createDataFrame from an RDD[Row] plus an explicit schema
    val schema = StructType(List(
      StructField("id", IntegerType, nullable = false),
      StructField("name", StringType, nullable = true),
      StructField("age", IntegerType, nullable = true)
    ))

    val rdd = spark
      .sparkContext
      .parallelize(Seq(
        Row(1, "Karol", 19),
        Row(2, "Abby", 20),
        Row(3, "Zena", 18)
      ))

    val df2 = spark
      .createDataFrame(rdd, schema)
    df2.show()

    // Method 3: create by reading a file
    val df3 = spark
      .read
      .schema(schema)
      .csv("file:///C:/info.txt")
    df3.show()

    spark.stop()
  }
}
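
Both versions read a local info.txt whose layout must match the schema: three comma-separated columns (id, name, age) and no header row, since neither reader sets a header option. A sample file consistent with the data above might look like:

1,Karol,19
2,Abby,20
3,Zena,18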

Python

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, LongType, StringType, IntegerType

spark = SparkSession.builder \
    .master("local") \
    .appName("create_DataFrame") \
    .getOrCreate()
# Table schema
schema = StructType([StructField("id", LongType(), True),
                     StructField("name", StringType(), True),
                     StructField("age", IntegerType(), True)])


# Method 1: create an RDD via sparkContext.parallelize, then convert it to a DataFrame
def rdd_create():
    rdd = spark.sparkContext.parallelize([
        (1, "Karol", 19),
        (2, "Abby", 20),
        (3, "Zena", 18)])

    # Build the DataFrame from the RDD and the schema
    df = spark.createDataFrame(rdd, schema)
    df.show()


# Method 2: create a DataFrame from a CSV (plain-text) file
def csv_create():
    # Set the separator and the file path
    df = spark \
        .read \
        .option("sep", ",") \
        .csv('info.txt', schema=schema)
    df.show()


# Method 3: create a DataFrame via pandas (there are many options; two common ones are shown)
def pandas_create():
    # Source data as a list of tuples
    res = [(1, 'Karol', 19), (2, 'Abby', 20), (3, 'Zena', 18)]
    # 1. Build a pandas DataFrame from a dict or a list
    # dict:
    # df = pd.DataFrame({'id': (1, 2, 3), 'name': ('Karol', 'Abby', 'Zena'), 'age': (19, 20, 18)})
    # list: the column names must go through the columns keyword, because the
    # second positional argument of pd.DataFrame is the index, not the columns
    df = pd.DataFrame(res, columns=['id', 'name', 'age'])
    print(df)
    # 2. Convert a pandas DataFrame into a Spark DataFrame
    pdf = pd.DataFrame(res)
    df = spark.createDataFrame(pdf, schema)
    df.show()


if __name__ == '__main__':
    rdd_create()
    csv_create()
    pandas_create()
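
In PySpark, spark.createDataFrame also accepts a local list of tuples directly, so the explicit parallelize step in rdd_create above is optional. A minimal sketch of this shortcut (list_create is my own name, not from the original):

def list_create():
    # createDataFrame parallelizes local data itself when given a plain list
    df = spark.createDataFrame([
        (1, "Karol", 19),
        (2, "Abby", 20),
        (3, "Zena", 18)], schema)
    df.show()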
