pyspark join

from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession 

sparkConf = SparkConf()

# Memory for the driver process
sparkConf.set('spark.driver.memory', '8G')
# Number of CPU cores for the driver
sparkConf.set('spark.driver.cores', '2')
# Total number of executor processes for the Spark job
sparkConf.set("spark.executor.instances", "3")
# Number of CPU cores per executor process
sparkConf.set("spark.executor.cores", "2")
# Memory per executor process
sparkConf.set("spark.executor.memory", "4G")
# Name of the Spark application
sparkConf.set("spark.app.name", "pyspark-test")

# CPU core limit for each executor pod when running on Kubernetes
# Note: make sure "spark.kubernetes.executor.limit.cores" >= "spark.executor.cores",
# otherwise the executors will fail to start
sparkConf.set("spark.kubernetes.executor.limit.cores", "2")

spark = SparkSession.builder.config(conf=sparkConf).enableHiveSupport().getOrCreate()
sc = spark.sparkContext
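To confirm the settings took effect, you can read them back from the running context. A minimal sketch using the standard SparkConf.get API, checking only keys set above:

# Print the effective values of the keys configured above
for key in ("spark.executor.instances", "spark.executor.memory", "spark.app.name"):
    print(key, "=", sc.getConf().get(key))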

# HiveContext is deprecated since Spark 2.0; a SparkSession built with
# enableHiveSupport() already covers the same functionality. Kept here for legacy code.
from pyspark.sql import HiveContext
hiveContext = HiveContext(sc)
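With Hive support enabled on the session, the metastore can be queried through spark.sql directly, so the legacy context is only needed for old code. A minimal sketch, assuming a reachable Hive metastore:

# Equivalent query through the SparkSession; no HiveContext required
spark.sql("SHOW DATABASES").show()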

# ML imports (not used by the join examples below)
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import DecisionTreeClassifier, DecisionTreeClassificationModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler

 

1. Joining on a single column

from pyspark.sql import Row
rdd1 = sc.parallelize([Row(name='Alice', age=5, height=80),
                       Row(name='Alice', age=10, height=80),
                       Row(name='Alice11', age=10, height=80)])
df1 = rdd1.toDF()

rdd2 = sc.parallelize([Row(name='Alice', weight=45)])
df2 = rdd2.toDF()

df_join = df1.join(df2, "name", "left")
df_join.show()

(Screenshot of the df_join.show() output: both Alice rows pick up weight 45, and the Alice11 row gets a null weight.)
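The third argument of join selects the join type. A minimal sketch of other common values (these are standard Spark join types, not covered in the original post):

# Keep only names present in both DataFrames
df1.join(df2, "name", "inner").show()
# Keep rows from both sides, filling gaps with null
df1.join(df2, "name", "full").show()
# Rows of df1 that have no match in df2
df1.join(df2, "name", "left_anti").show()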

2. Joining on multiple columns

from pyspark.sql import Row
rdd1 = sc.parallelize([Row(name='Alice', age=5, height=80),
                       Row(name='Alice', age=10, height=80),
                       Row(name='Alice11', age=10, height=80)])
df1 = rdd1.toDF()

rdd2 = sc.parallelize([Row(name='Alice', age=5, weight=45)])
df2 = rdd2.toDF()

df_join = df1.join(df2, ["name", "age"], "left")
df_join.show()

(Screenshot of the df_join.show() output: only the Alice row with age 5 matches both keys and receives weight 45; the other rows get null.)
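When the key columns are named differently on the two sides, pass a join expression instead of a column list. A minimal sketch; the person and weight_kg names are hypothetical, chosen for illustration:

# Hypothetical DataFrame whose key column is named differently
rdd3 = sc.parallelize([Row(person='Alice', weight_kg=45)])
df3 = rdd3.toDF()

# Join on an explicit expression, then drop the duplicated key column
df_expr = df1.join(df3, df1.name == df3.person, "left").drop("person")
df_expr.show()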
