Scala
package blog

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object CreateDataFrame {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("CreateDataFrame")
      .getOrCreate()
    import spark.implicits._

    // 1. From a local Seq of tuples via toDF (requires spark.implicits._)
    val df1 = Seq(
      (1, "Karol", 19),
      (2, "Abby", 20),
      (3, "Zena", 18)
    ).toDF("id", "name", "age")
    df1.show()

    // Explicit schema, shared by the RDD and CSV variants below
    val schema = StructType(List(
      StructField("id", IntegerType, nullable = false),
      StructField("name", StringType, nullable = true),
      StructField("age", IntegerType, nullable = true)
    ))

    // 2. From an RDD[Row] plus the schema
    val rdd = spark
      .sparkContext
      .parallelize(Seq(
        Row(1, "Karol", 19),
        Row(2, "Abby", 20),
        Row(3, "Zena", 18)
      ))
    val df2 = spark.createDataFrame(rdd, schema)
    df2.show()

    // 3. From a CSV file, applying the same schema
    val df3 = spark
      .read
      .schema(schema)
      .csv("file:///C:/info.txt")
    df3.show()

    spark.stop()
  }
}
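Both CSV variants (the Scala df3 above and the Python csv_create below) expect a plain comma-separated file with no header row. The file contents are not part of the listing itself; a matching info.txt would look like this (illustrative values, chosen to mirror the in-memory data):

1,Karol,19
2,Abby,20
3,Zena,18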
Python
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, LongType, StringType, IntegerType

spark = SparkSession.builder \
    .master("local") \
    .appName("create_DataFrame") \
    .getOrCreate()

# Explicit schema, shared by all three variants
schema = StructType([StructField("id", LongType(), True),
                     StructField("name", StringType(), True),
                     StructField("age", IntegerType(), True)])


def rdd_create():
    # 1. From an RDD of tuples plus the schema
    rdd = spark.sparkContext.parallelize([
        (1, "Karol", 19),
        (2, "Abby", 20),
        (3, "Zena", 18)])
    df = spark.createDataFrame(rdd, schema)
    df.show()


def csv_create():
    # 2. From a CSV file, applying the same schema
    df = spark \
        .read \
        .option("sep", ",") \
        .csv('info.txt', schema=schema)
    df.show()


def pandas_create():
    # 3. From a pandas DataFrame
    res = [(1, 'Karol', 19), (2, 'Abby', 20), (3, 'Zena', 18)]
    # Note: column names must be passed as columns=...; the second
    # positional argument of pd.DataFrame is the index, not the columns.
    pdf = pd.DataFrame(res, columns=['id', 'name', 'age'])
    print(pdf)
    df = spark.createDataFrame(pdf, schema)
    df.show()


if __name__ == '__main__':
    rdd_create()
    csv_create()
    pandas_create()
    spark.stop()
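As an aside, spark.createDataFrame in PySpark also accepts a local list of tuples directly, so the intermediate RDD (or pandas DataFrame) is not strictly required. A minimal sketch, reusing the schema defined above (the function name list_create is just for illustration):

def list_create():
    # createDataFrame works on a plain Python list; no parallelize needed
    data = [(1, "Karol", 19), (2, "Abby", 20), (3, "Zena", 18)]
    df = spark.createDataFrame(data, schema)
    df.show()

Whichever path is used, show() prints the same three-row table:

+---+-----+---+
| id| name|age|
+---+-----+---+
|  1|Karol| 19|
|  2| Abby| 20|
|  3| Zena| 18|
+---+-----+---+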