Clustering along the time axis

The goal is to merge a user's consecutive location records that fall in the same grid into a single record with a start time and an end time. The queries below demonstrate the classic lag + flag + running-sum ("gaps and islands") pattern on a small sample table in spark-sql, and the Spark job at the end applies the same logic to the full ods-layer location data.

[root@master ~]# hive --service metastore
Starting Hive Metastore Server
[root@master ~]# spark-sql
spark-sql> show databases;
22/06/08 14:55:31 INFO codegen.CodeGenerator: Code generated in 133.316964 ms
dal
default
dim
dwi
dws
ods
Time taken: 0.241 seconds, Fetched 6 row(s)
22/06/08 14:55:31 INFO thriftserver.SparkSQLCLIDriver: Time taken: 0.241 seconds, Fetched 6 row(s)
spark-sql> use default;
Time taken: 1.042 seconds
22/06/08 16:51:23 INFO thriftserver.SparkSQLCLIDriver: Time taken: 1.042 seconds
spark-sql> create external table if not exists merge(
         > mdn string
         > ,sdate string
         > ,grid string
         > )
         > row format delimited fields terminated by ','
         > stored as inputformat 'org.apache.hadoop.mapred.TextInputFormat'
         > outputformat 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
         > location '/data/merge';
Time taken: 0.96 seconds
22/06/08 16:58:27 INFO thriftserver.SparkSQLCLIDriver: Time taken: 0.96 seconds

[root@master ~]# cd /usr/local/soft/data
[root@master data]# ls
ctyun          score.sql                 testArray2.txt
data_skew.txt  scoreStruct               testLieToLine.txt
deal_tb.txt    score.txt                 theZenOfPython.txt
dept.txt       students_dt.txt           theZen.txt
DIANXIN.csv    student.sql               udtfData.txt
DIANXIN.sql    students.txt              wordcount
emp.txt        students_year_month.txt   words.txt
new_db.sql     subject.txt               yiqing.csv
new_score.txt  ${system:java.io.tmpdir}
scoreMap       testArray
[root@master data]# vim merge.txt
176******23,202205201530,19560032075040
176******23,202205201531,19560032075040
176******23,202205201532,19560032075040
176******23,202205201533,19560032075040
176******23,202205201534,19560032075041
176******23,202205201535,19560032075042
176******23,202205201536,19560032075043
176******23,202205201537,19560032075040
176******23,202205201538,19560032075040
176******23,202205201539,19560032075040
spark-sql> dfs -ls /;
Found 9 items
drwxr-xr-x   - lenovo supergroup          0 2022-03-23 10:04 /TestAPI
drwxr-xr-x   - root   supergroup          0 2022-05-31 14:58 /daas
drwxr-xr-x   - root   supergroup          0 2022-06-08 16:58 /data
drwxr-xr-x   - root   supergroup          0 2022-04-16 09:43 /hbase
drwxr-xr-x   - root   supergroup          0 2022-04-18 19:47 /sqoop
drwxrw-r-x   - root   supergroup          0 2022-04-06 19:33 /student
drwxrwxrwx   - root   supergroup          0 2022-04-02 16:34 /tmp
drwxrwxrwx+  - root   supergroup          0 2022-05-15 20:25 /user
drwxr-xr-x   - root   supergroup          0 2022-04-28 17:53 /zzj
spark-sql> load data local inpath "/usr/local/soft/data/merge.txt" into table merge;
22/06/08 17:12:04 ERROR hdfs.KeyProviderCache: Could not find uri with key [dfs.encryption.key.provider.uri] to create a keyProvider !!
22/06/08 17:12:04 INFO metadata.Hive: Renaming src: file:/usr/local/soft/data/merge.txt, dest: hdfs://master:9000/data/merge/merge.txt, Status:true
Time taken: 0.334 seconds
22/06/08 17:12:04 INFO thriftserver.SparkSQLCLIDriver: Time taken: 0.334 seconds
spark-sql> select * from merge;
176******23	202205201530	19560032075040
176******23	202205201531	19560032075040
176******23	202205201532	19560032075040
176******23	202205201533	19560032075040
176******23	202205201534	19560032075041
176******23	202205201535	19560032075042
176******23	202205201536	19560032075043
176******23	202205201537	19560032075040
176******23	202205201538	19560032075040
176******23	202205201539	19560032075040
Time taken: 0.099 seconds, Fetched 10 row(s)
spark-sql> select mdn
         > ,sdate
         > ,grid
         > ,lag(grid,1) over (partition by mdn order by sdate) as last_grid
         > from merge;
176******23	202205201530	19560032075040	NULL
176******23	202205201531	19560032075040	19560032075040
176******23	202205201532	19560032075040	19560032075040
176******23	202205201533	19560032075040	19560032075040
176******23	202205201534	19560032075041	19560032075040
176******23	202205201535	19560032075042	19560032075041
176******23	202205201536	19560032075043	19560032075042
176******23	202205201537	19560032075040	19560032075043
176******23	202205201538	19560032075040	19560032075040
176******23	202205201539	19560032075040	19560032075040
Time taken: 0.56 seconds, Fetched 10 row(s)
spark-sql> select t1.mdn
         >         ,t1.sdate
         >         ,t1.grid
         >         ,t1.last_grid
         >         ,case when t1.grid = t1.last_grid then 0
         >         else 1
         >         end as flag
         > from(
         > select mdn
         >           ,sdate
         >           ,grid
         >           ,lag(grid,1) over (partition by mdn order by sdate) as last_grid
         >           from merge
         > )t1;
176******23	202205201530	19560032075040	NULL	1
176******23	202205201531	19560032075040	19560032075040	0
176******23	202205201532	19560032075040	19560032075040	0
176******23	202205201533	19560032075040	19560032075040	0
176******23	202205201534	19560032075041	19560032075040	1
176******23	202205201535	19560032075042	19560032075041	1
176******23	202205201536	19560032075043	19560032075042	1
176******23	202205201537	19560032075040	19560032075043	1
176******23	202205201538	19560032075040	19560032075040	0
176******23	202205201539	19560032075040	19560032075040	0
Time taken: 0.493 seconds, Fetched 10 row(s)
spark-sql> select tt1.mdn
         >         ,tt1.sdate
         >         ,tt1.grid
         >         ,tt1.last_grid
         >         ,tt1.flag
         >         ,sum(tt1.flag) over (partition by mdn order by sdate) as grp
         > from(
         > select t1.mdn
         >         ,t1.sdate
         >         ,t1.grid
         >         ,t1.last_grid
         >         ,case when t1.grid = t1.last_grid then 0
         >         else 1
         >         end as flag
         > from(
         > select mdn
         >           ,sdate
         >           ,grid
         >           ,lag(grid,1) over (partition by mdn order by sdate) as last_grid
         >           from merge
         > )t1
         > )tt1;
176******23	202205201530	19560032075040	NULL	1	1
176******23	202205201531	19560032075040	19560032075040	0	1
176******23	202205201532	19560032075040	19560032075040	0	1
176******23	202205201533	19560032075040	19560032075040	0	1
176******23	202205201534	19560032075041	19560032075040	1	2
176******23	202205201535	19560032075042	19560032075041	1	3
176******23	202205201536	19560032075043	19560032075042	1	4
176******23	202205201537	19560032075040	19560032075043	1	5
176******23	202205201538	19560032075040	19560032075040	0	5
176******23	202205201539	19560032075040	19560032075040	0	5
Time taken: 0.709 seconds, Fetched 10 row(s)
spark-sql> select ttt1.mdn
         >         ,ttt1.grp
         >         ,ttt1.grid
         >         ,min(ttt1.sdate)as start_t
         >         ,max(ttt1.sdate)as end_t
         > from(
         >     select tt1.mdn
         >         ,tt1.sdate
         >         ,tt1.grid
         >         ,tt1.last_grid
         >         ,tt1.flag
         >         ,sum(tt1.flag) over (partition by mdn order by sdate) as grp
         > from(
         > select t1.mdn
         >         ,t1.sdate
         >         ,t1.grid
         >         ,t1.last_grid
         >         ,case when t1.grid = t1.last_grid then 0
         >         else 1
         >         end as flag
         > from(
         > select mdn
         >           ,sdate
         >           ,grid
         >           ,lag(grid,1) over (partition by mdn order by sdate) as last_grid
         >           from merge
         > )t1
         > )tt1
         > )ttt1 group by ttt1.mdn,ttt1.grp,ttt1.grid;
176******23	1	19560032075040	202205201530	202205201533
176******23	2	19560032075041	202205201534	202205201534
176******23	3	19560032075042	202205201535	202205201535
176******23	4	19560032075043	202205201536	202205201536
176******23	5	19560032075040	202205201537	202205201539
Time taken: 0.611 seconds, Fetched 5 row(s)
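
For reference, the four nested steps above can be folded into a single statement. The following is a sketch of that combined query (it was not run as part of this session, but it is just the queries shown above nested together and should return the same five rows):

select t3.mdn
      ,t3.grp
      ,t3.grid
      ,min(t3.sdate) as start_t
      ,max(t3.sdate) as end_t
from(
    select t2.mdn
          ,t2.sdate
          ,t2.grid
          ,sum(t2.flag) over (partition by t2.mdn order by t2.sdate) as grp
    from(
        select t1.mdn
              ,t1.sdate
              ,t1.grid
              ,case when t1.grid = t1.last_grid then 0 else 1 end as flag
        from(
            select mdn
                  ,sdate
                  ,grid
                  ,lag(grid,1) over (partition by mdn order by sdate) as last_grid
            from merge
        )t1
    )t2
)t3
group by t3.mdn, t3.grp, t3.grid;
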
spark-sql> use dwi;
Time taken: 0.013 seconds
22/06/08 20:15:30 INFO thriftserver.SparkSQLCLIDriver: Time taken: 0.013 seconds
spark-sql> show tables;
22/06/08 20:15:42 INFO codegen.CodeGenerator: Code generated in 8.425467 ms
dwi	dwi_res_regn_mergelocation_msk_d	false
Time taken: 0.029 seconds, Fetched 1 row(s)
22/06/08 20:15:42 INFO thriftserver.SparkSQLCLIDriver: Time taken: 0.029 seconds, Fetched 1 row(s)
spark-sql> desc dwi_res_regn_mergelocation_msk_d
         > ;
22/06/08 20:16:05 INFO codegen.CodeGenerator: Code generated in 5.516328 ms
mdn	string	mobile number
start_date	string	start time
end_date	string	end time
county_id	string	county code
longi	string	longitude
lati	string	latitude
bsid	string	base station id
grid_id	string	grid id
day_id	string	day partition
# Partition Information		
# col_name	data_type	comment
day_id	string	day partition
Time taken: 0.046 seconds, Fetched 12 row(s)
22/06/08 20:16:05 INFO thriftserver.SparkSQLCLIDriver: Time taken: 0.046 seconds, Fetched 12 row(s)
The same clustering logic is then applied to the full ods-layer location data with the Spark DataFrame API. The job below reads ods.ods_oidd and writes the merged records to the dwi_res_regn_mergelocation_msk_d directory.

package com.ctyun.dwi

import org.apache.spark.sql.expressions.{UserDefinedFunction, Window}
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import com.shujia.utils.Geography

object DwiResRegnMergelocationMskDay {
  def main(args: Array[String]): Unit = {
    // Build the merged-location table from the ods-layer data

    val spark: SparkSession = SparkSession
      .builder()
      .appName("DwiResRegnMergelocationMskDay")
      .enableHiveSupport() // enable Hive support
      .config("spark.sql.shuffle.partitions","20")
      .getOrCreate()
    // import implicit conversions and the built-in functions
    import spark.implicits._
    import org.apache.spark.sql.functions._

    // Register a user-defined function for use in Spark SQL
    /**
     * Given the longitude and latitude of two points, compute the distance between them
     */
    val calculateLength: UserDefinedFunction = udf((longi1: Double, lati1: Double, longi2: Double, lati2: Double) => {
      Geography.calculateLength(longi1, lati1, longi2, lati2)
    })
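
    // Geography.calculateLength is provided by the common utils jar (common-1.0.jar) and its
    // implementation is not shown here. Purely as an illustration of what such a helper might
    // do (an assumption, not the actual implementation), a haversine-based distance in metres
    // could be computed like this:
    val calculateLengthApprox: UserDefinedFunction = udf((longi1: Double, lati1: Double, longi2: Double, lati2: Double) => {
      val r = 6371000.0 // assumed mean Earth radius in metres
      val dLat = math.toRadians(lati2 - lati1)
      val dLon = math.toRadians(longi2 - longi1)
      val a = math.pow(math.sin(dLat / 2), 2) +
        math.cos(math.toRadians(lati1)) * math.cos(math.toRadians(lati2)) * math.pow(math.sin(dLon / 2), 2)
      2 * r * math.asin(math.sqrt(a))
    })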

    // Read the oidd data of the ods layer from Hive
    val oidd: DataFrame = spark.table("ods.ods_oidd")

    // 1. Split the start time and end time into two separate columns
    oidd
      // withColumn adds a new column to the data
      .withColumn("start_t", split($"start_time", ",")(1)) // extract the business start time
      .withColumn("end_t", split($"start_time", ",")(0)) // extract the business end time
      // Cluster along the time axis; this mainly resolves time overlap between adjacent records within the same grid
      // Partition by user, order by start time, and fetch the previous record's grid id
      .withColumn("last_grid", lag("grid_id", 1) over Window.partitionBy("mdn").orderBy("start_t"))
      // Compare the current record's grid_id with the previous record's grid_id: 0 if they match, 1 otherwise
      .withColumn("flag", when($"grid_id" === $"last_grid", 0).otherwise(1))
      // Running (cumulative) sum over the flag column
      .withColumn("grp", sum($"flag") over Window.partitionBy("mdn").orderBy("start_t"))
      // Group by grp and take the minimum and maximum time within each group
      .groupBy("mdn", "county_id", "longi", "lati", "bsid", "grid_id", "grp")
      .agg(min("start_t") as "start_t", max("end_t") as "end_t")
      // 2. Order by start time and pull fields of the previous record into new columns via lag
      .withColumn("last_lg", lag($"longi", 1) over Window.partitionBy($"mdn").orderBy($"start_t")) // previous record's longitude
      .withColumn("last_lat", lag($"lati", 1) over Window.partitionBy($"mdn").orderBy($"start_t")) // previous record's latitude
      // Take the previous record's start_t instead of its end_t, mainly to resolve time overlap between adjacent records in different grids
      .withColumn("last_end_time", lag($"start_t", 1) over Window.partitionBy($"mdn").orderBy($"start_t")) // previous record's end time
      // 3. Compute the time gap between two adjacent location records
      .withColumn("diff_time", unix_timestamp($"start_t", "yyyyMMddHHmmss") - unix_timestamp($"last_end_time", "yyyyMMddHHmmss"))
      // 4. Compute the distance from the longitude/latitude pairs
      .withColumn("distance", when($"last_lg".isNull, 1).otherwise(calculateLength($"longi", $"lati", $"last_lg", $"last_lat")))
      // 5. Compute the speed from the distance and the time gap
      .withColumn("speed", round($"distance" / $"diff_time", 3))
      // Save the result to files
      .write
      .format("csv")
      .option("sep","\t")
      .mode(SaveMode.Overwrite)
      .save("/daas/motl/dwi/dwi_res_regn_mergelocation_msk_d/")
  }
  /**
   * 1. Package the code into a jar with maven and upload it.
   * 2. If the Spark history server is enabled, grant the dwi user access to /user/spark/applicationHistory via an ACL:
   *    hdfs dfs -setfacl -R -m user:dwi:rwx /user/spark/applicationHistory
   * 3. Submit the job with spark-submit:
   *    spark-submit --master yarn-client --class com.ctyun.dwi.DwiResRegnMergelocationMskDay --jars common-1.0.jar dwi-1.0.jar 20220527
   * 4. Check the size of the files in the output directory:
   *    hdfs dfs -du -h /daas/motl/dwi/dwi_res_regn_mergelocation_msk_d/
   */
}
