Use the createDataFrame method of the SparkSession object to convert an RDD into a DataFrame. Here only the column names are passed in: the column types are inferred from the RDD, and nullability defaults to allowed (True).
from pyspark.sql import SparkSession
import os

os.environ["SPARK_HOME"] = '/export/server/spark'
PYSPARK_PYTHON = '/root/anaconda3/envs/pyspark_env/bin/python'
os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON
os.environ["PYSPARK_DRIVER_PYTHON"] = PYSPARK_PYTHON

if __name__ == '__main__':
    spark = SparkSession.builder.\
        appName("create df").\
        master("local[*]").\
        getOrCreate()
    sc = spark.sparkContext

    # First build an RDD
    rdd = sc.textFile("file:///export/data/sql/people.txt").\
        map(lambda x: x.split(',')).\
        map(lambda x: [x[0], int(x[1])])  # Explicit conversion is needed because the types are inferred from the RDD

    # Build the DataFrame, way 1
    df = spark.createDataFrame(rdd, schema=['name', 'age'])

    # Print the table schema
    df.printSchema()
    # Print (up to) 20 rows of data
    df.show()

    # Register the DataFrame as a temporary view so it can be queried with SQL
    df.createTempView("ttt")
    spark.sql("select * from ttt where age < 30").show()
Convert an RDD by defining the DataFrame's table schema with a StructType object.
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, IntegerType
import os

os.environ["SPARK_HOME"] = '/export/server/spark'
PYSPARK_PYTHON = "/root/anaconda3/envs/pyspark_env/bin/python"
os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON
os.environ["PYSPARK_DRIVER_PYTHON"] = PYSPARK_PYTHON

if __name__ == '__main__':
    spark = SparkSession.builder.\
        appName("create_df").\
        master("local[*]").\
        config("spark.sql.shuffle.partitions", "4").\
        getOrCreate()
    # The SparkContext can also be obtained from the SparkSession object
    sc = spark.sparkContext

    # To create the DataFrame, first build an RDD and then convert it
    rdd = sc.textFile("file:///export/data/sql/stu_score.txt").\
        map(lambda x: x.split(',')).\
        map(lambda x: (int(x[0]), x[1], int(x[2])))

    # The StructType class defines the Schema of the whole DataFrame
    schema = StructType().\
        add("id", IntegerType(), nullable=False).\
        add("name", StringType(), nullable=True).\
        add("score", IntegerType(), nullable=False)
    # Each add call defines one column; for 3 columns, write three add calls
    # add method: arg 1: column name, arg 2: column type, arg 3: whether nulls are allowed

    df = spark.createDataFrame(rdd, schema)
    df.printSchema()
    df.show()
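As an equivalent alternative (a sketch, not taken from the example above), the same schema can be built by passing a list of StructField objects to the StructType constructor:

from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Equivalent to the chained add() calls above
schema = StructType([
    StructField("id", IntegerType(), nullable=False),
    StructField("name", StringType(), nullable=True),
    StructField("score", IntegerType(), nullable=False)
])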
Convert an RDD with the RDD's toDF method.
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, IntegerType
import os

os.environ["SPARK_HOME"] = '/export/server/spark'
PYSPARK_PYTHON = "/root/anaconda3/envs/pyspark_env/bin/python"
os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON
os.environ["PYSPARK_DRIVER_PYTHON"] = PYSPARK_PYTHON

if __name__ == '__main__':
    spark = SparkSession.builder.\
        appName("create_df").\
        master("local[*]").\
        config("spark.sql.shuffle.partitions", "4").\
        getOrCreate()
    # The SparkContext can also be obtained from the SparkSession object
    sc = spark.sparkContext

    # To create the DataFrame, first build an RDD and then convert it
    rdd = sc.textFile("file:///export/data/sql/stu_score.txt"). \
        map(lambda x: x.split(',')). \
        map(lambda x: (int(x[0]), x[1], int(x[2])))

    # The StructType class defines the Schema of the whole DataFrame
    schema = StructType().\
        add("id", IntegerType(), nullable=False).\
        add("name", StringType(), nullable=True).\
        add("score", IntegerType(), nullable=False)
    # Each add call defines one column; for 3 columns, write three add calls
    # add method: arg 1: column name, arg 2: column type, arg 3: whether nulls are allowed

    # Way 1: pass only column names; types are inferred and columns are nullable (True)
    df = rdd.toDF(['id', 'subject', 'score'])
    df.printSchema()
    df.show()

    # Way 2: pass a complete Schema description as a StructType object
    df = rdd.toDF(schema)
    df.printSchema()
    df.show()
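A further sketch, not part of the example above: if the RDD holds pyspark.sql.Row objects, toDF can infer both the column names and the types without any arguments:

from pyspark.sql import Row

# Wrap each tuple in a Row so that column names travel with the data
row_rdd = rdd.map(lambda x: Row(id=x[0], name=x[1], score=x[2]))
df = row_rdd.toDF()   # no schema argument needed
df.printSchema()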
Convert a Pandas DataFrame object into a distributed Spark SQL DataFrame.
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, IntegerType
import pandas as pd
import os

os.environ['SPARK_HOME'] = '/export/server/spark'
PYSPARK_PYTHON = '/root/anaconda3/envs/pyspark_env/bin/python'
os.environ['PYSPARK_PYTHON'] = PYSPARK_PYTHON
os.environ['PYSPARK_DRIVER_PYTHON'] = PYSPARK_PYTHON

if __name__ == '__main__':
    spark = SparkSession.builder.\
        appName("create df").\
        master("local[*]").\
        getOrCreate()
    sc = spark.sparkContext

    # Build a Pandas DataFrame
    pdf = pd.DataFrame({
        "id": [1, 2, 3],
        "name": ["张大仙", "王晓晓", "王大锤"],
        "age": [11, 11, 11]
    })

    # Convert the Pandas DataFrame into a Spark DataFrame
    df = spark.createDataFrame(pdf)
    df.printSchema()
    df.show()
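The conversion also works in the opposite direction; a minimal sketch (note that toPandas collects every row to the driver, so it only suits small results):

# Bring the distributed Spark DataFrame back as a local Pandas DataFrame
pdf2 = df.toPandas()
print(pdf2)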
sparksession.read.format("text|csv|json|parquet|orc|avro|jdbc|.......")
    .option("K", "V")            # option is optional
    .schema(StructType | String) # String uses DDL syntax, e.g. .schema("name STRING, age INT")
    .load("path of the file to read; both the local file system and HDFS are supported")
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, IntegerType
import os

os.environ['SPARK_HOME'] = '/export/server/spark'
PYSPARK_PYTHON = '/root/anaconda3/envs/pyspark_env/bin/python'
os.environ['PYSPARK_PYTHON'] = PYSPARK_PYTHON
os.environ['PYSPARK_DRIVER_PYTHON'] = PYSPARK_PYTHON

if __name__ == '__main__':
    spark = SparkSession.builder.\
        appName("create df").\
        master("local[*]").\
        getOrCreate()
    sc = spark.sparkContext

    # The text data source reads each line of the file into a single string column
    schema = StructType().add("data", StringType(), nullable=True)
    df = spark.read.format("text")\
        .schema(schema)\
        .load("file:///export/data/sql/people.txt")
    df.printSchema()
    df.show()
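Because the text source yields a single string column, any parsing happens afterwards with DataFrame operations; a sketch, assuming each line of people.txt looks like "name, age":

from pyspark.sql import functions as F

# Split the single "data" column into name and age columns
parsed = df.withColumn("name", F.split("data", ",").getItem(0))\
           .withColumn("age", F.trim(F.split("data", ",").getItem(1)).cast("int"))\
           .drop("data")
parsed.printSchema()
parsed.show()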
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, IntegerType
import os

os.environ['SPARK_HOME'] = '/export/server/spark'
PYSPARK_PYTHON = '/root/anaconda3/envs/pyspark_env/bin/python'
os.environ['PYSPARK_PYTHON'] = PYSPARK_PYTHON
os.environ['PYSPARK_DRIVER_PYTHON'] = PYSPARK_PYTHON

if __name__ == '__main__':
    spark = SparkSession.builder.\
        appName("create df").\
        master("local[*]").\
        getOrCreate()
    sc = spark.sparkContext

    df = spark.read.format("json")\
        .load("file:///export/data/sql/people.json")
    # For JSON, .schema is usually unnecessary: JSON carries the column names,
    # and the column types (string and number) can be inferred from the data
    df.printSchema()
    df.show()
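The DataFrameReader also offers shorthand methods, so the same read can be written in one call (a sketch of the equivalent form):

# Equivalent shorthand for format("json").load(...)
df = spark.read.json("file:///export/data/sql/people.json")
df.printSchema()
df.show()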
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, IntegerType
import os

os.environ['SPARK_HOME'] = '/export/server/spark'
PYSPARK_PYTHON = '/root/anaconda3/envs/pyspark_env/bin/python'
os.environ['PYSPARK_PYTHON'] = PYSPARK_PYTHON
os.environ['PYSPARK_DRIVER_PYTHON'] = PYSPARK_PYTHON

if __name__ == '__main__':
    spark = SparkSession.builder.\
        appName("create df").\
        master("local[*]").\
        getOrCreate()
    sc = spark.sparkContext

    df = (spark.read.format("csv")
          .option("sep", ";")                           # column separator
          .option("header", False)                      # whether the CSV has a header row
          .option("encoding", "utf-8")                  # file encoding
          .schema("name STRING, age INT, job STRING")   # column names and types
          .load("file:///export/data/sql/people.csv"))  # file path
    df.printSchema()
    df.show()
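Likewise, the CSV read above can be written with the reader's csv shorthand, which accepts the same options as keyword arguments (a sketch of the equivalent form):

# Equivalent shorthand for the format("csv") chain above
df = spark.read.csv("file:///export/data/sql/people.csv",
                    sep=";", header=False, encoding="utf-8",
                    schema="name STRING, age INT, job STRING")
df.show()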