Join types: inner, cross, outer, full, full_outer, left, left_outer, right, right_outer
from pyspark.sql import Row
from pyspark.sql import SparkSession
spark = (SparkSession.builder
         .appName('my_app_name')
         .enableHiveSupport()
         .getOrCreate())
df = spark.createDataFrame([('Alice', 5, 80),
                            ('Alice', 10, 80)],
                           ['name', 'age', 'height'])
df2 = spark.createDataFrame([('Alice', 5, 45),
                             ('Alice', 8, 45)],
                            ['name', 'age', 'weight'])
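Before joining, it can help to sanity-check both frames; a quick look:
df.printSchema()
df2.show()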
(1) left join
df.join(df2, ['name'], 'left').show()
(2) inner join: only rows with matching values in the shared column are kept
df.join(df2, ['name'], 'inner').show()
(3) outer join; joining on multiple key columns works the same way
df.join(df2, ['name', 'age'], 'left').show()
df.join(df2, ['name', 'age'], 'outer').show()
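The other types in the list at the top use the same call; a brief sketch of right and cross joins (cross takes no join key):
df.join(df2, ['name', 'age'], 'right').show()   # keep every row of df2
df.crossJoin(df2).show()                        # Cartesian product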
from pyspark.sql.functions import *
from pyspark.sql import functions as F
## Data processing
df = spark.createDataFrame([('a', 5), ('b', 7), ('c', 8), ('d', 1)], ['Id', 'Rank'])
(df.withColumn('Id_New', when(df.Rank <= 5, df.Id).otherwise('other'))
   .drop(df.Id)
   .select(col('Id_New').alias('Id'), col('Rank'))
   .show())
df.withColumn('Id', F.lit(1))   # a constant must be wrapped in lit(), not passed as a bare value
df = df.withColumn('new_id', F.lit(0))
df.show()
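when() branches can also be chained before the final otherwise(); a small sketch on the same frame (the bucket labels are just illustrative):
df.withColumn('bucket',
              F.when(df.Rank <= 3, 'low')
               .when(df.Rank <= 7, 'mid')
               .otherwise('high')).show()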
from pyspark.sql.functions import current_date
spark.range(3).withColumn('date', current_date()).show()
## Spark date and time
from pyspark.sql.functions import current_timestamp
spark.range(3).withColumn('datetime', current_timestamp()).show()
from pyspark.sql.functions import date_format
df = spark.createDataFrame([('2020-04-18',)], ['a'])
df.select(date_format('a', 'MM/dd/yyyy').alias('date')).show()
df.select(date_format('a', 'yyyyMMdd').alias('date')).show()
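Going the other way, unix_timestamp parses a date string into epoch seconds; a minimal sketch on the same frame:
from pyspark.sql.functions import unix_timestamp
df.select(unix_timestamp('a', 'yyyy-MM-dd').alias('epoch')).show()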
from pyspark.sql.functions import to_date, to_timestamp
(1) Convert to a date
df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
df.select(to_date(df.t).alias('date')).collect()
# [Row(date=datetime.date(1997, 2, 28))]
(2) Keep the time component (timestamp)
df.select(to_timestamp(df.t).alias('dt')).collect()
# [Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]
(3) The date format can also be given explicitly
df.select(to_timestamp(df.t, 'yyyy-MM-dd HH:mm:ss').alias('dt')).show()
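A parsed timestamp can be formatted back into a string with date_format; a sketch (the output pattern here is just an example):
df.select(date_format(to_timestamp(df.t), 'yyyy/MM/dd HH:mm').alias('s')).show()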
import datetime
b = datetime.datetime.now()
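strftime turns this Python datetime into the compact 'YYYYMMDD' string that the snippet further below expects; a sketch (the name data_date matches that later code):
data_date = b.strftime('%Y%m%d')   # e.g. '20200421'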
from pyspark.sql.functions import date_add, date_sub
df = spark.createDataFrame([('2015-04-08',)], ['d'])
df.select(date_add(df.d, 1).alias('d_add'),
          date_sub(df.d, 1).alias('d_sub')
          ).show()
(4) Extract the year, month, and day from a date
from pyspark.sql.functions import year, month, dayofmonth
df = spark.createDataFrame([('2015-04-08',)], ['a'])
df.select(year('a').alias('year'),
          month('a').alias('month'),
          dayofmonth('a').alias('day')
          ).show()
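Related extractors follow the same pattern; a brief sketch with two more of them:
from pyspark.sql.functions import dayofweek, weekofyear
df.select(dayofweek('a').alias('dow'), weekofyear('a').alias('week')).show()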
# Convert 'YYYYMMDD' to 'YYYY-MM-DD', step back 3 days, then rebuild the
# compact form so the two endpoints make a valid date-range predicate
date_str = data_date[:-4] + '-' + data_date[-4:-2] + '-' + data_date[-2:]
date_p = datetime.datetime.strptime(date_str, '%Y-%m-%d').date()
cal_p = str(date_p - datetime.timedelta(days=3))
sql_str = 'data_date<=' + str(data_date) + ' and data_date>=' + cal_p[:-6] + cal_p[-5:-3] + cal_p[-2:]
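The assembled string is a plain SQL predicate, so it can feed a DataFrame filter directly; a sketch, assuming the frame actually has a data_date column:
df.filter(sql_str).show()   # assumes df has a data_date column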
(1) A DataFrame must first be registered as a (global) temporary view before it can be queried with SQL
df_C.createGlobalTempView('feature')        # global: shared across sessions, under the global_temp database
spark.sql('select * from global_temp.feature').show()
df_C.createOrReplaceTempView('feature')     # session-scoped temporary view
spark.sql('select * from feature').show()
spark.sql("select * from feature where data_date >= '20200421'").show()
from pyspark.sql.functions import lit
df.withColumn('your_col_name', lit(your_const_var))
spark-sql --master yarn
http://140.143.236.62:30001/emr-yarn/cluster/apps/RUNNING