PS: I'm using standalone Spark 3.0.
import socket
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import pandas as pd
localIpAddress = socket.gethostbyname(socket.gethostname())
# Create the Spark configuration
sparkConf = SparkConf()
# Initialize our Spark cluster; this is what actually spawns the worker nodes.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)
spark
There are two common ways to create a new Spark DataFrame:
① createDataFrame()
② toDF()
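As a minimal sketch (the single row here is just a placeholder), the two approaches look like this:
# ① build directly from a list of tuples plus column names
spark.createDataFrame([("a1", "小明", 12, 56.5)], ["user_id", "name", "age", "score"])
# ② parallelize the data into an RDD first, then call toDF() with the column names
sc.parallelize([("a1", "小明", 12, 56.5)]).toDF(["user_id", "name", "age", "score"])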
from pyspark.sql.types import *
schema = StructType([
StructField("user_id", StringType(), True),
StructField("name", StringType(), True),
StructField("age", IntegerType(), True),
StructField("score", FloatType(), True)
])
empty_dataframes = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)
sdf = sqlContext.createDataFrame([("a1", "小明", 12, 56.5), ("a2", "小红", 15, 23.0),\
("a3", "小强", 23, 84.0), ("a3","小小",9,93.5)],\
("user_id", "name", "age", "score"))
from pyspark.sql import Row
row = Row("user_id","name","age","score")
row_user_id = ['a1','a2','a3','a4']
row_name = ['小明','小红','小强','小小']
row_age = [12,15,23,9]
row_score = [56.5,23.0,84.0,93.5]
sdf1 = sc.parallelize([row(row_user_id[i],row_name[i],row_age[i],row_score[i]) for i in range(len(row_user_id))]).toDF()
withColumn() returns a new DataFrame by adding a column, or by replacing an existing column that has the same name.
sdf2 = sdf1.withColumn('score_new',sdf1.score/2.0)
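If the name passed to withColumn() already exists, the column is replaced instead of added; for example (a quick sketch reusing sdf1):
sdf1.withColumn('score', sdf1.score * 2)  # 'score' already exists, so it is overwritten rather than appended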
If, instead of deriving a new column from the existing ones, you want to attach a completely new column that comes from outside the DataFrame, consider using join():
import numpy as np
sdf3 = sqlContext.createDataFrame([("a1",3.0), ("a2",3.0), ("a3",np.nan)], ("user_id_class", "class"))
sdf4 = sdf2.join(sdf3,sdf2.user_id==sdf3.user_id_class,'left').drop('user_id_class')
sdf5 = sdf4.withColumn('score_new',sdf4.score_new/2)
sdf6 = sdf4.withColumn('score_new',sdf4.score_new.cast("Int"))
sdf6 = sdf6.withColumnRenamed("score_new","new_score")
sdf7 = sdf6.filter(sdf6.age>10)
sdf7 = sdf6.where(sdf6.age>9).where(sdf6.age<23)
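filter() and where() are aliases, and the two chained where() calls above AND the conditions together. Equivalently (a small sketch), the conditions can be combined explicitly with & / |:
from pyspark.sql.functions import col
sdf8 = sdf6.filter((col("age") > 9) & (col("age") < 23))  # same result as the chained where() calls above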
from pyspark.sql.functions import isnan, isnull
sdf9 = sdf6.filter(isnull("class"))  # select the rows where class is null (Python's None)
sdf10 = sdf6.filter(isnan("class"))  # select the rows where class is NaN (Not a Number)
sdf6.fillna(-1)  # fill null/NaN values in numeric columns with -1 (returns a new DataFrame; sdf6 itself is unchanged)
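fillna() also accepts a dict to fill different columns with different values; for instance (a small sketch using sdf6's columns):
sdf6.fillna({'class': -1.0, 'new_score': 0})  # fill nulls in class with -1.0 and in new_score with 0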
sdf.show(5)
sdf.printSchema()
head_rows = sdf.head(3)  # first 3 rows, returned as a list of Row objects
sdf_num = sdf.count()
# Give the age column the alias age_value
sdf.select('user_id',sdf.age.alias('age_value'),'name').show()
from pyspark.sql.functions import isnull
# Select the rows where the class column is null
sdf11 = sdf4.filter(isnull("class"))
rows = sdf.collect()  # collect() pulls every row back to the driver as a list of Row objects
sdf.describe()
sdf.select(['user_id','name','age','score']).distinct()
sdf.select(['user_id','name','age','score']).dropDuplicates()
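dropDuplicates() can also deduplicate on a subset of columns, keeping one row per key; for example (a quick sketch):
sdf.dropDuplicates(['user_id'])  # keep only one row per user_id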
sample = sdf.sample(False,0.5,2)  # randomly sample about 50% of the rows without replacement, using 2 as the random seed
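For stratified sampling by the values of a column, sampleBy() takes a per-value fraction (a minimal sketch; the fractions here are arbitrary):
sdf.sampleBy('name', fractions={'小明': 1.0, '小红': 0.5}, seed=2)  # keep all of 小明's rows, ~50% of 小红's; unlisted names default to 0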
sdf.columns
sdf['age']
sdf.age
sdf.select('age').show()  # select the age column of sdf
sdf.select(sdf.user_id,sdf.age,sdf.name).show()  # select the user_id, age and name columns of sdf
sdf.orderBy(sdf.age).show()  # sort by age in ascending order
sdf.orderBy(sdf.age.desc()).show()  # sort by age in descending order
sdf12 = sdf.drop('age')
sdf13 = sdf.drop(sdf.age)
sdf14 = sdf5.na.drop()
# Drop rows where either user_id or class is null
sdf15 = sdf5.dropna(subset=['user_id','class'])
sdf_union = sdf.union(sdf1)
sdf16 = sdf5.join(sdf14, sdf5.user_id == sdf14.user_id,'inner')
PS: the join type (how) can be inner, outer, left_outer, right_outer, or leftsemi.
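For example (a quick sketch reusing sdf5 and sdf14), the same join can be written with other join types, or with the shared column name so the join key is not duplicated:
sdf5.join(sdf14, on='user_id', how='left_outer')  # keep all rows of sdf5, joining on the shared user_id column
sdf5.join(sdf14, on='user_id', how='leftsemi')    # keep only sdf5 rows that have a match in sdf14 (sdf14's columns are not returned)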
sdf17 = sdf5.subtract(sdf14)
sdf18 = sdf5.intersect(sdf14)
# Take the union and then deduplicate
sdf19 = sdf5.union(sdf14).distinct()
# Cross-tabulate name against the distinct values of class (a count for each combination)
sdf19.crosstab('name','class')
First, create a sample DataFrame:
from pyspark.sql import Row
row = Row("user_id","product_id","name","money")
row_user_id = ['a1','a2','a3','a3','a1','a2']
row_product_id = ['b1','b2','b3','b1','b2','b3']
row_name = ['小明','小红','小强','小强','小明','小红']
row_money = [56.5, 23.0, 84.0, 93.5, 12.7, 43.5]
sdf_gb = sc.parallelize([row(row_user_id[i],row_product_id[i],row_name[i],row_money[i]) for i in range(len(row_user_id))]).toDF()
# Group by name and compute each person's average spend
sdf_gb.groupby('name').agg({'money':'mean'})
# Group by name and compute each person's largest single purchase
sdf_gb.groupby('name').agg({'money':'max'})
# Group by name and compute each person's smallest single purchase
sdf_gb.groupby('name').agg({'money':'min'})
# Group by name and compute each person's total spend
sdf_gb.groupby('name').agg({'money':'sum'})
# Group by name and count how many purchases each person made
sdf_gb.groupby('name').count()
PS: the GroupedData object produced by groupby() offers agg(), avg(), count(), max(), mean(), min(), and sum(), all of which return a DataFrame. They can also be combined with the helpers in pyspark.sql.functions:
from pyspark.sql import functions
sdf_gb.groupby('name').agg(functions.avg('money'),functions.min('money'),functions.max('money'),functions.sum('money'),functions.count('money')).show()
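To give the aggregated columns readable names instead of the auto-generated avg(money), max(money), etc., alias() can be chained onto each aggregate (a small sketch):
sdf_gb.groupby('name').agg(functions.sum('money').alias('total_money'), functions.count('money').alias('num_orders')).show()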
# Spark DataFrame <-> pandas DataFrame
pandas_df = sdf.toPandas()
spark_df = spark.createDataFrame(pandas_df)
# Spark DataFrame <-> Koalas DataFrame
import databricks.koalas as ks
koalas_df = spark_df.to_koalas()
spark_df = koalas_df.to_spark()
# Spark DataFrame <-> RDD
rdd_df = spark_df.rdd
rdd_df.collect()
spark_df = rdd_df.toDF()
Create a temporary view over the Spark DataFrame so it can be queried with SQL:
sdf.createOrReplaceTempView("sdf_SQL")
Once the DataFrame is registered under that view name, you can run SQL queries against it (the result is again a DataFrame):
select_sql = "select * from sdf_SQL where name like '%{}%' and score>{}".format('小',60)
Spark_dataframe = spark.sql(select_sql)
sdf_date = sqlContext.createDataFrame([("a1", "小明","2020-09-01 23:00:00"),\
("a2", "小红","2020-09-02 13:00:00"),\
("a3", "小强", "2020-09-03 03:00:00"),\
("a4","小小","2020-09-04 23:00:00")],\
("user_id", "name","date_time"))
import pyspark.sql.functions as F
sdf_date1 = sdf_date.select('user_id','name',F.date_format('date_time','yyyy-MM-dd')).withColumnRenamed('date_format(date_time, yyyy-MM-dd)','date_time')
Similarly, the day-precision column can be formatted back to second precision (the time-of-day defaults to midnight, 00:00:00):
sdf_date2 = sdf_date1.select('user_id','name',F.date_format('date_time','yyyy-MM-dd HH:mm:ss')).withColumnRenamed('date_format(date_time, yyyy-MM-dd HH:mm:ss)','date_time')
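The withColumnRenamed() calls above work because date_format() produces a column literally named date_format(date_time, yyyy-MM-dd); a slightly cleaner equivalent is to alias the expression directly, and pyspark.sql.functions offers other date helpers as well (a quick sketch, reusing sdf_date from above):
sdf_date.select('user_id', 'name', F.date_format('date_time', 'yyyy-MM-dd').alias('date_time'))  # same result, no rename needed
sdf_date.select(F.to_date('date_time').alias('day'), F.year('date_time').alias('year'), F.hour('date_time').alias('hour'))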
# Save the Spark DataFrame as CSV
sdf.write.csv("sdf.csv",header=True,sep=",",mode='overwrite')
# Read the CSV back into a Spark DataFrame
sdf_spark = spark.read.csv("sdf.csv",header=True, inferSchema=True)
Here sdf.csv ends up under the current working directory (note that Spark writes it as a directory of part files rather than a single CSV file); to write somewhere else, just include the full path.
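For example (a sketch; the path and the coalesce(1) are only illustrations), you can write to an explicit directory, coalescing to one partition if you want a single output part file:
sdf.coalesce(1).write.csv("/tmp/sdf_out", header=True, sep=",", mode="overwrite")
spark.read.csv("/tmp/sdf_out", header=True, inferSchema=True)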
# Save the Spark DataFrame as Parquet
sdf.write.parquet("sdf.parquet",mode='overwrite')
# Read the Parquet data back into a Spark DataFrame
sdf_spark = spark.read.parquet("sdf.parquet")
Finally, if you're not going to run anything else, remember to stop Spark!
spark.stop()