[root@master ~]# hive --service metastore
Starting Hive Metastore Server
[root@master ~]# spark-sql
spark-sql> show databases;
22/06/08 14:55:31 INFO codegen.CodeGenerator: Code generated in 133.316964 ms
dal
default
dim
dwi
dws
ods
Time taken: 0.241 seconds, Fetched 6 row(s)
22/06/08 14:55:31 INFO thriftserver.SparkSQLCLIDriver: Time taken: 0.241 seconds, Fetched 6 row(s)
spark-sql> use default;
Time taken: 1.042 seconds
22/06/08 16:51:23 INFO thriftserver.SparkSQLCLIDriver: Time taken: 1.042 seconds
spark-sql> create external table if not exists merge(
> mdn string
> ,sdate string
> ,grid string
> )
> row format delimited fields terminated by ','
> stored as inputformat 'org.apache.hadoop.mapred.TextInputFormat'
> outputformat 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
> location '/data/merge';
Time taken: 0.96 seconds
22/06/08 16:58:27 INFO thriftserver.SparkSQLCLIDriver: Time taken: 0.96 seconds
[root@master ~]# cd /usr/local/soft/data
[root@master data]# ls
ctyun score.sql testArray2.txt
data_skew.txt scoreStruct testLieToLine.txt
deal_tb.txt score.txt theZenOfPython.txt
dept.txt students_dt.txt theZen.txt
DIANXIN.csv student.sql udtfData.txt
DIANXIN.sql students.txt wordcount
emp.txt students_year_month.txt words.txt
new_db.sql subject.txt yiqing.csv
new_score.txt ${system:java.io.tmpdir}
scoreMap testArray
[root@master data]# vim merge.txt
176******23,202205201530,19560032075040
176******23,202205201531,19560032075040
176******23,202205201532,19560032075040
176******23,202205201533,19560032075040
176******23,202205201534,19560032075041
176******23,202205201535,19560032075042
176******23,202205201536,19560032075043
176******23,202205201537,19560032075040
176******23,202205201538,19560032075040
176******23,202205201539,19560032075040
spark-sql> dfs -ls /;
Found 9 items
drwxr-xr-x - lenovo supergroup 0 2022-03-23 10:04 /TestAPI
drwxr-xr-x - root supergroup 0 2022-05-31 14:58 /daas
drwxr-xr-x - root supergroup 0 2022-06-08 16:58 /data
drwxr-xr-x - root supergroup 0 2022-04-16 09:43 /hbase
drwxr-xr-x - root supergroup 0 2022-04-18 19:47 /sqoop
drwxrw-r-x - root supergroup 0 2022-04-06 19:33 /student
drwxrwxrwx - root supergroup 0 2022-04-02 16:34 /tmp
drwxrwxrwx+ - root supergroup 0 2022-05-15 20:25 /user
drwxr-xr-x - root supergroup 0 2022-04-28 17:53 /zzj
spark-sql> load data local inpath "/usr/local/soft/data/merge.txt" into table merge;
22/06/08 17:12:04 ERROR hdfs.KeyProviderCache: Could not find uri with key [dfs.encryption.key.provider.uri] to create a keyProvider !!
22/06/08 17:12:04 INFO metadata.Hive: Renaming src: file:/usr/local/soft/data/merge.txt, dest: hdfs://master:9000/data/merge/merge.txt, Status:true
Time taken: 0.334 seconds
22/06/08 17:12:04 INFO thriftserver.SparkSQLCLIDriver: Time taken: 0.334 seconds
spark-sql> select * from merge;
176******23 202205201530 19560032075040
176******23 202205201531 19560032075040
176******23 202205201532 19560032075040
176******23 202205201533 19560032075040
176******23 202205201534 19560032075041
176******23 202205201535 19560032075042
176******23 202205201536 19560032075043
176******23 202205201537 19560032075040
176******23 202205201538 19560032075040
176******23 202205201539 19560032075040
Time taken: 0.099 seconds, Fetched 10 row(s)
spark-sql> select mdn
> ,sdate
> ,grid
> ,lag(grid,1) over (partition by mdn order by sdate) as last_grid
> from merge;
176******23 202205201530 19560032075040 NULL
176******23 202205201531 19560032075040 19560032075040
176******23 202205201532 19560032075040 19560032075040
176******23 202205201533 19560032075040 19560032075040
176******23 202205201534 19560032075041 19560032075040
176******23 202205201535 19560032075042 19560032075041
176******23 202205201536 19560032075043 19560032075042
176******23 202205201537 19560032075040 19560032075043
176******23 202205201538 19560032075040 19560032075040
176******23 202205201539 19560032075040 19560032075040
Time taken: 0.56 seconds, Fetched 10 row(s)
spark-sql> select t1.mdn
> ,t1.sdate
> ,t1.grid
> ,t1.last_grid
> ,case when t1.grid = t1.last_grid then 0
> else 1
> end as flag
> from(
> select mdn
> ,sdate
> ,grid
> ,lag(grid,1) over (partition by mdn order by sdate) as last_grid
> from merge
> )t1;
176******23 202205201530 19560032075040 NULL 1
176******23 202205201531 19560032075040 19560032075040 0
176******23 202205201532 19560032075040 19560032075040 0
176******23 202205201533 19560032075040 19560032075040 0
176******23 202205201534 19560032075041 19560032075040 1
176******23 202205201535 19560032075042 19560032075041 1
176******23 202205201536 19560032075043 19560032075042 1
176******23 202205201537 19560032075040 19560032075043 1
176******23 202205201538 19560032075040 19560032075040 0
176******23 202205201539 19560032075040 19560032075040 0
Time taken: 0.493 seconds, Fetched 10 row(s)
spark-sql> select tt1.mdn
> ,tt1.sdate
> ,tt1.grid
> ,tt1.last_grid
> ,tt1.flag
> ,sum(tt1.flag) over (partition by mdn order by sdate) as grp
> from(
> select t1.mdn
> ,t1.sdate
> ,t1.grid
> ,t1.last_grid
> ,case when t1.grid = t1.last_grid then 0
> else 1
> end as flag
> from(
> select mdn
> ,sdate
> ,grid
> ,lag(grid,1) over (partition by mdn order by sdate) as last_grid
> from merge
> )t1
> )tt1;
176******23 202205201530 19560032075040 NULL 1 1
176******23 202205201531 19560032075040 19560032075040 0 1
176******23 202205201532 19560032075040 19560032075040 0 1
176******23 202205201533 19560032075040 19560032075040 0 1
176******23 202205201534 19560032075041 19560032075040 1 2
176******23 202205201535 19560032075042 19560032075041 1 3
176******23 202205201536 19560032075043 19560032075042 1 4
176******23 202205201537 19560032075040 19560032075043 1 5
176******23 202205201538 19560032075040 19560032075040 0 5
176******23 202205201539 19560032075040 19560032075040 0 5
Time taken: 0.709 seconds, Fetched 10 row(s)
spark-sql> select ttt1.mdn
> ,ttt1.grp
> ,ttt1.grid
> ,min(ttt1.sdate)as start_t
> ,max(ttt1.sdate)as end_t
> from(
> select tt1.mdn
> ,tt1.sdate
> ,tt1.grid
> ,tt1.last_grid
> ,tt1.flag
> ,sum(tt1.flag) over (partition by mdn order by sdate) as grp
> from(
> select t1.mdn
> ,t1.sdate
> ,t1.grid
> ,t1.last_grid
> ,case when t1.grid = t1.last_grid then 0
> else 1
> end as flag
> from(
> select mdn
> ,sdate
> ,grid
> ,lag(grid,1) over (partition by mdn order by sdate) as last_grid
> from merge
> )t1
> )tt1
> )ttt1 group by ttt1.mdn,ttt1.grp,ttt1.grid;
176******23 1 19560032075040 202205201530 202205201533
176******23 2 19560032075041 202205201534 202205201534
176******23 3 19560032075042 202205201535 202205201535
176******23 4 19560032075043 202205201536 202205201536
176******23 5 19560032075040 202205201537 202205201539
Time taken: 0.611 seconds, Fetched 5 row(s)
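The nested queries above are the classic gaps-and-islands pattern: lag() marks where the grid changes, the running sum of that change flag numbers each block of consecutive identical grids, and the final group by collapses every block to its first and last timestamp. As a minimal illustrative sketch (not part of the session above), the same steps can be expressed with the DataFrame API, assuming a SparkSession named spark (e.g. in spark-shell) and the default.merge table created earlier:

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import spark.implicits._

val w = Window.partitionBy("mdn").orderBy("sdate")

spark.table("default.merge")
  // 1 when the grid differs from the previous record's grid, 0 otherwise
  .withColumn("flag", when($"grid" === lag($"grid", 1).over(w), 0).otherwise(1))
  // running sum of the flag numbers each block of consecutive identical grids
  .withColumn("grp", sum($"flag").over(w))
  // collapse every block to its first and last timestamp
  .groupBy("mdn", "grp", "grid")
  .agg(min("sdate") as "start_t", max("sdate") as "end_t")
  .show()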
spark-sql> use dwi;
Time taken: 0.013 seconds
22/06/08 20:15:30 INFO thriftserver.SparkSQLCLIDriver: Time taken: 0.013 seconds
spark-sql> show tables;
22/06/08 20:15:42 INFO codegen.CodeGenerator: Code generated in 8.425467 ms
dwi dwi_res_regn_mergelocation_msk_d false
Time taken: 0.029 seconds, Fetched 1 row(s)
22/06/08 20:15:42 INFO thriftserver.SparkSQLCLIDriver: Time taken: 0.029 seconds, Fetched 1 row(s)
spark-sql> desc dwi_res_regn_mergelocation_msk_d
> ;
22/06/08 20:16:05 INFO codegen.CodeGenerator: Code generated in 5.516328 ms
mdn string phone number
start_date string start time
end_date string end time
county_id string county code
longi string longitude
lati string latitude
bsid string base station id
grid_id string grid id
day_id string day partition
# Partition Information
# col_name data_type comment
day_id string day partition
Time taken: 0.046 seconds, Fetched 12 row(s)
22/06/08 20:16:05 INFO thriftserver.SparkSQLCLIDriver: Time taken: 0.046 seconds, Fetched 12 row(s)
The same merge logic, applied to the oidd data in the ods layer to build the dwi merged-location table, is implemented in the following Spark program:

package com.ctyun.dwi

import org.apache.spark.sql.expressions.{UserDefinedFunction, Window}
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import com.shujia.utils.Geography

object DwiResRegnMergelocationMskDay {
  def main(args: Array[String]): Unit = {
    // Build the merged-location table from the ods-layer data
    val spark: SparkSession = SparkSession
      .builder()
      .appName("DwiResRegnMergelocationMskDay")
      .enableHiveSupport() // enable Hive support
      .config("spark.sql.shuffle.partitions", "20")
      .getOrCreate()

    // Import implicit conversions and the built-in SQL functions
    import spark.implicits._
    import org.apache.spark.sql.functions._

    // Register a user-defined function for use in Spark SQL:
    // given the longitudes and latitudes of two points, compute the distance between them
    val calculateLength: UserDefinedFunction = udf((longi1: Double, lati1: Double, longi2: Double, lati2: Double) => {
      Geography.calculateLength(longi1, lati1, longi2, lati2)
    })

    // Read the oidd data from the ods layer in Hive
    val oidd: DataFrame = spark.table("ods.ods_oidd")

    // 1. Split the start time and end time into two columns
    oidd
      // withColumn adds a new column
      .withColumn("start_t", split($"start_time", ",")(1)) // extract the start time of the record
      .withColumn("end_t", split($"start_time", ",")(0))   // extract the end time of the record
      // Cluster along the time axis, mainly to resolve time overlaps between adjacent records within the same grid:
      // "group" by user, order by start time, and take the grid id of the previous record
      .withColumn("last_grid", lag("grid_id", 1).over(Window.partitionBy("mdn").orderBy("start_t")))
      // Compare the current grid_id with the previous one: 0 if they match, 1 otherwise
      .withColumn("flag", when($"grid_id" === $"last_grid", 0).otherwise(1))
      // Running sum over the flag column
      .withColumn("grp", sum($"flag").over(Window.partitionBy("mdn").orderBy("start_t")))
      // Group by grp and take the minimum and maximum time within each group
      .groupBy("mdn", "county_id", "longi", "lati", "bsid", "grid_id", "grp")
      .agg(min("start_t") as "start_t", max("end_t") as "end_t")
      // 2. Order by start time and use lag to take the previous record's values as new columns
      .withColumn("last_lg", lag($"longi", 1).over(Window.partitionBy($"mdn").orderBy($"start_t")))  // previous record's longitude
      .withColumn("last_lat", lag($"lati", 1).over(Window.partitionBy($"mdn").orderBy($"start_t"))) // previous record's latitude
      // Take the previous record's start_t rather than its end_t, mainly to resolve time overlaps
      // between adjacent records in different grids
      .withColumn("last_end_time", lag($"start_t", 1).over(Window.partitionBy($"mdn").orderBy($"start_t"))) // treated as the previous record's end time
      // 3. Compute the time gap between two adjacent location records
      .withColumn("diff_time", unix_timestamp($"start_t", "yyyyMMddHHmmss") - unix_timestamp($"last_end_time", "yyyyMMddHHmmss"))
      // 4. Compute the distance from the longitudes and latitudes
      .withColumn("distance", when($"last_lg".isNull, 1).otherwise(calculateLength($"longi", $"lati", $"last_lg", $"last_lat")))
      // 5. Compute the speed from the distance and the time gap
      .withColumn("speed", round($"distance" / $"diff_time", 3))
      // Save the result to files
      .write
      .format("csv")
      .option("sep", "\t")
      .mode(SaveMode.Overwrite)
      .save("/daas/motl/dwi/dwi_res_regn_mergelocation_msk_d/")
  }

  /**
    * 1. Package the code into a jar with Maven and upload it.
    * 2. If the Spark history server is enabled, grant permissions on the
    *    /user/spark/applicationHistory directory via an ACL:
    *    hdfs dfs -setfacl -R -m user:dwi:rwx /user/spark/applicationHistory
    * 3. Submit the job with spark-submit:
    *    spark-submit --master yarn-client --class com.ctyun.dwi.DwiResRegnMergelocationMskDay --jars common-1.0.jar dwi-1.0.jar 20220527
    * 4. Check the file sizes in the output directory:
    *    hdfs dfs -du -h /daas/motl/dwi/dwi_res_regn_mergelocation_msk_d/
    */
}
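The Geography.calculateLength helper belongs to com.shujia.utils (packaged in common-1.0.jar) and is not shown here. As a rough, hypothetical stand-in, a haversine-style distance in meters could be computed as below; the real utility may use a different formula or unit:

object GeographySketch {
  // Illustrative stand-in for com.shujia.utils.Geography.calculateLength (assumption, not the actual implementation)
  def calculateLength(longi1: Double, lati1: Double, longi2: Double, lati2: Double): Double = {
    val r = 6371000.0 // mean Earth radius in meters
    val dLat = math.toRadians(lati2 - lati1)
    val dLon = math.toRadians(longi2 - longi1)
    // haversine formula
    val a = math.pow(math.sin(dLat / 2), 2) +
      math.cos(math.toRadians(lati1)) * math.cos(math.toRadians(lati2)) * math.pow(math.sin(dLon / 2), 2)
    2 * r * math.asin(math.sqrt(a))
  }
}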