PySpark: How to Build a Spark DataFrame

1. From an RDD, Method 1

Use the SparkSession object's createDataFrame method to convert an RDD into a DataFrame. Here only the column names are passed in; the column types are inferred from the RDD, and nullability defaults to allowed (True).

from pyspark.sql import SparkSession
import os

os.environ["SPARK_HOME"] = '/export/server/spark'
PYSPARK_PYTHON = '/root/anaconda3/envs/pyspark_env/bin/python'
os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON
os.environ["PYSPARK_DRIVER_PYTHON"] = PYSPARK_PYTHON

if __name__ == '__main__':
   spark = SparkSession.builder.\
       appName("create df").\
       master("local[*]").\
       getOrCreate()

   sc = spark.sparkContext

   # First, build an RDD
   rdd = sc.textFile("file:///export/data/sql/people.txt").\
       map(lambda x:x.split(',')).\
       map(lambda x:[x[0],int(x[1])]) # cast age to int explicitly, since types are inferred from the RDD data

   # Build the DF, method 1: pass only column names
   df = spark.createDataFrame(rdd,schema=['name','age'])
   # Print the schema
   df.printSchema()
   # Show the first 20 rows
   df.show()

   df.createTempView("ttt")
   spark.sql("select * from ttt where age<30").show()
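
A small side note (a sketch, not part of the original example): createDataFrame also accepts a DDL-formatted string as the schema, which pins the column types explicitly instead of relying on inference. Reusing the spark session and rdd from the script above:

# Hedged variant: schema given as a DDL string instead of a list of column names
df2 = spark.createDataFrame(rdd, schema="name STRING, age INT")
df2.printSchema()   # age is int here; inferred from Python ints it would be long
df2.show()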

2. From an RDD, Method 2

Define the DataFrame's schema (its "table structure") with a StructType object, then use it to convert the RDD.

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StringType,IntegerType
import os


os.environ["SPARK_HOME"] = '/export/server/spark'
PYSPARK_PYTHON = "/root/anaconda3/envs/pyspark_env/bin/python"
os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON
os.environ["PYSPARK_DRIVER_PYTHON"] = PYSPARK_PYTHON

if __name__ == '__main__':
    spark = SparkSession.builder.\
        appName("create_df").\
        master("local[*]").\
        config("spark.sql.shuffle.partitions","4").\
        getOrCreate()
    # The SparkContext is also available from the SparkSession object
    sc = spark.sparkContext
    # Build the DF: first create an RDD, then convert the RDD to a DF
    rdd = sc.textFile("file:///export/data/sql/stu_score.txt").\
        map(lambda x:x.split(',')).\
        map(lambda x:(int(x[0]),x[1],int(x[2])))
    # The StructType class
    # defines the schema of the entire DataFrame
    schema = StructType().\
        add("id",IntegerType(),nullable=False).\
        add("name",StringType(),nullable=True).\
        add("score",IntegerType(),nullable=False)
    # Each add() call describes one column; for three columns, chain three add() calls
    # add() arguments: 1. column name, 2. column type, 3. whether nulls are allowed
    df = spark.createDataFrame(rdd,schema)
    df.printSchema()
    df.show()
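
Equivalently (shown here as a reference sketch with the same columns assumed), the schema can be built by passing a list of StructField objects to the StructType constructor instead of chaining add():

from pyspark.sql.types import StructField

# The same schema as above, built from StructField objects
schema = StructType([
    StructField("id", IntegerType(), nullable=False),
    StructField("name", StringType(), nullable=True),
    StructField("score", IntegerType(), nullable=False),
])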

3. From an RDD, Method 3

Convert the RDD with the RDD's toDF method.

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StringType,IntegerType
import os

os.environ["SPARK_HOME"] = '/export/server/spark'
PYSPARK_PYTHON = "/root/anaconda3/envs/pyspark_env/bin/python"
os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON
os.environ["PYSPARK_DRIVER_PYTHON"] = PYSPARK_PYTHON


if __name__ == '__main__':
    spark = SparkSession.builder.\
        appName("create_df").\
        master("local[*]").\
        config("spark.sql.shuffle.partitions","4").\
        getOrCreate()
    # The SparkContext is also available from the SparkSession object
    sc = spark.sparkContext
    # Build the DF: first create an RDD, then convert the RDD to a DF
    rdd = sc.textFile("file:///export/data/sql/stu_score.txt"). \
        map(lambda x: x.split(',')). \
        map(lambda x: (int(x[0]), x[1], int(x[2])))

    # The StructType class
    # defines the schema of the entire DataFrame
    schema = StructType().\
        add("id",IntegerType(),nullable=False).\
        add("name",StringType(),nullable=True).\
        add("score",IntegerType(),nullable=False)
    # Each add() call describes one column; for three columns, chain three add() calls
    # add() arguments: 1. column name, 2. column type, 3. whether nulls are allowed
    # Option 1: pass only column names; types are inferred and columns default to nullable (True)
    df = rdd.toDF(['id', 'subject', 'score'])

    df.printSchema()
    df.show()
    # Option 2: pass a full schema description as a StructType object
    df = rdd.toDF(schema)
    df.printSchema()
    df.show()
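
To see which schema actually took effect, the resulting types can be inspected (a small sketch reusing the df from the last toDF call above):

print(df.schema)   # the StructType that was applied
print(df.dtypes)   # e.g. [('id', 'int'), ('name', 'string'), ('score', 'int')]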

4. From a Pandas DataFrame

Convert a Pandas DataFrame object into a distributed Spark SQL DataFrame.

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StringType,IntegerType
import pandas as pd
import os

os.environ['SPARK_HOME'] = '/export/server/spark'
PYSPARK_PYTHON = '/root/anaconda3/envs/pyspark_env/bin/python'
os.environ['PYSPARK_PYTHON'] = PYSPARK_PYTHON
os.environ['PYSPARK_DRIVER_PYTHON'] = PYSPARK_PYTHON

if __name__ == '__main__':
    spark = SparkSession.builder.\
        appName("create df").\
        master("local[*]").\
        getOrCreate()
    sc = spark.sparkContext
    # Build a Pandas DataFrame
    pdf = pd.DataFrame({
        "id":[1,2,3],
        "name":["张大仙","王晓晓","王大锤"],
        "age":[11,11,11]
    })
    # Convert the Pandas DataFrame into a Spark DataFrame
    df = spark.createDataFrame(pdf)
    df.printSchema()
    df.show()       
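
Two related points, offered as a hedged sketch rather than as part of the original example: on Spark 3.x the Pandas conversion can be accelerated with Apache Arrow, and the conversion also works in the other direction via toPandas(), which collects all rows to the driver and therefore only suits small results.

# Assumed Spark 3.x setting: enable Arrow-based Pandas <-> Spark conversion
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

sdf = spark.createDataFrame(pdf)   # Pandas -> Spark (distributed)
pdf2 = sdf.toPandas()              # Spark -> Pandas (collected to the driver)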

5. Reading External Data

  • Unified API
spark.read.format("text|csv|json|parquet|orc|avro|jdbc|...")
    .option("K","V")                # option() calls are optional
    .schema(StructType|String)      # the String form is a DDL string, e.g. .schema("name STRING, age INT")
    .load("path of the file(s) to read; both the local file system and HDFS are supported")
  • Reading a text data source. With format("text"), the resulting DataFrame has only one column, named value by default.
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StringType,IntegerType
import os

os.environ['SPARK_HOME'] = '/export/server/spark'
PYSPARK_PYTHON = '/root/anaconda3/envs/pyspark_env/bin/python'
os.environ['PYSPARK_PYTHON'] = PYSPARK_PYTHON
os.environ['PYSPARK_DRIVER_PYTHON'] = PYSPARK_PYTHON

if __name__ == '__main__':
    spark = SparkSession.builder.\
        appName("create df").\
        master("local[*]").\
        getOrCreate()
    sc = spark.sparkContext
    schema = StructType().add("data",StringType(),nullable=True)
    df = spark.read.format("text")\
        .schema(schema)\
        .load("file:///export/data/sql/people.txt")
    df.printSchema()
    df.show()
  • Reading a JSON data source with format("json").
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StringType,IntegerType
import os

os.environ['SPARK_HOME'] = '/export/server/spark'
PYSPARK_PYTHON = '/root/anaconda3/envs/pyspark_env/bin/python'
os.environ['PYSPARK_PYTHON'] = PYSPARK_PYTHON
os.environ['PYSPARK_DRIVER_PYTHON'] = PYSPARK_PYTHON

if __name__ == '__main__':
    spark = SparkSession.builder.\
        appName("create df").\
        master("local[*]").\
        getOrCreate()
    sc = spark.sparkContext
    df = spark.read.format("json")\
        .load("file:///export/data/sql/people.json")
    # For JSON you generally do not need .schema(): the JSON itself carries the column names and types (string / number)
    df.printSchema()
    df.show()
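
Note that format("json") expects line-delimited JSON by default, i.e. one JSON object per line. If a file instead holds a single pretty-printed, multi-line JSON document, the reader's multiLine option handles it; the path below is a hypothetical placeholder:

# Hedged sketch: reading one multi-line JSON document per file
df = spark.read.format("json")\
    .option("multiLine", True)\
    .load("file:///export/data/sql/people_multiline.json")  # hypothetical path
df.show()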
  • Reading a CSV data source with format("csv").
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StringType,IntegerType
import os

os.environ['SPARK_HOME'] = '/export/server/spark'
PYSPARK_PYTHON = '/root/anaconda3/envs/pyspark_env/bin/python'
os.environ['PYSPARK_PYTHON'] = PYSPARK_PYTHON
os.environ['PYSPARK_DRIVER_PYTHON'] = PYSPARK_PYTHON

if __name__ == '__main__':
    spark = SparkSession.builder.\
        appName("create df").\
        master("local[*]").\
        getOrCreate()
    sc = spark.sparkContext
    # sep: column separator; header: whether the CSV has a header row;
    # encoding: file encoding; schema: column names and types; load: the path
    df = spark.read.format("csv")\
        .option("sep",";")\
        .option("header",False)\
        .option("encoding","utf-8")\
        .schema("name STRING,age INT,job STRING")\
        .load("file:///export/data/sql/people.csv")
    df.printSchema()
    df.show()
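
The same unified API covers the columnar formats listed at the top of this section. A parquet read, for example, needs neither .schema() nor options, because the column names and types are stored inside the file itself (a sketch; the path is a placeholder, not from the original text):

# Hedged sketch: parquet files carry their own schema
df = spark.read.format("parquet")\
    .load("file:///export/data/sql/users.parquet")  # hypothetical path
df.printSchema()
df.show()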
