2024.1.20 User profile tag development, procedural approach

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @desc : claims-activity (理赔活跃度) tag computation job
__author__ = "bytedance"

from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import StringType
from com.bytedance.tag.base.parse_rule import EsMeta
import pyspark.sql.functions as F
import os

# Point Spark / PySpark at the local installation and Python interpreter
os.environ['SPARK_HOME'] = '/export/server/spark'
os.environ["PYSPARK_PYTHON"] = '/root/anaconda3/envs/pyspark_env/bin/python3'

if __name__ == '__main__':
    # 1. Create the SparkSession
    spark = SparkSession.builder \
        .config("spark.sql.shuffle.partitions", 1) \
        .appName("claims_tag_job") \
        .master('local[*]') \
        .getOrCreate()
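    # Note: spark.sql.shuffle.partitions is set to 1 only to keep this local[*] demo fast;
    # on a real cluster you would leave it at the default (200) or tune it for your data
    # volume, e.g. .config("spark.sql.shuffle.partitions", 200)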

    # ID of the level-4 "claims activity" tag
    four_tag_id = 33


    # 2. Read the tag metadata and business data
    # 2.1- Read all tags from the MySQL tag table
    tag_df = spark.read.jdbc(
        url="jdbc:mysql://192.168.88.166:3306/tags_new?useUnicode=true&characterEncoding=UTF-8&serverTimezone=UTC&useSSL=false",
        table="insurance_basic_tag",
        properties={'user': 'root', 'password': '123456'}
    )
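    # Environment assumption: spark.read.jdbc against MySQL needs the MySQL JDBC driver
    # (mysql-connector-java) on the Spark classpath, e.g. placed in $SPARK_HOME/jars or
    # passed with --jars when submitting the job.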
    # 2.2- Filter out the level-4 tag row (id = four_tag_id) and keep its rule column
    four_tag_df: DataFrame = tag_df.where(f"id={four_tag_id}").select("rule")

    # 2.3- Extract the rule string
    rule_str = four_tag_df.first()['rule']

    # 2.4- Parse the rule string into an EsMeta object
    rule_obj = EsMeta.parse_rule(rule_str)
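    # EsMeta.parse_rule comes from project code that is not shown here. Purely as an
    # illustration (assumed format, not the actual stored value), the rule string is
    # typically a "##"-separated key=value list such as:
    #   inType=Elasticsearch##esNodes=192.168.88.166:9200##esIndex=claim_info##esType=_doc##selectFields=user_id,pol_no,buy_datetime,claim_date,claim_mnt
    # from which parse_rule builds an object exposing esNodes / esIndex / selectFields.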

    # 2.5- Read the business data from Elasticsearch according to the rule
    business_df = spark.read.format("es") \
        .option("es.nodes", rule_obj.esNodes) \
        .option("es.resource", f"{rule_obj.esIndex}") \
        .option("es.read.field.include", f"{rule_obj.selectFields}") \
        .option("es.mapping.date.rich", False) \
        .load()
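    # Environment assumption: the "es" data source is provided by the elasticsearch-hadoop
    # (elasticsearch-spark) connector, whose jar must be on the Spark classpath
    # (e.g. via --jars or the spark.jars config).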

    '''
    business_df.show()
    +------------+----------+---------+-------------+-------+
    |buy_datetime|claim_date|claim_mnt|       pol_no|user_id|
    +------------+----------+---------+-------------+-------+
    |  2019-08-18|2021-02-08|     5386|P000000200945|  1-422|
    |  2017-10-01|2019-03-16|     6589|   P000054052|  1-423|
    |  2020-09-15|2021-11-27|     6801|   P000062516|  1-424|
    |  2022-06-06|2022-06-18|     8736|P000000165361|  1-425|
    '''
    # 2.6- From the tag data, filter out the level-5 tags (children of the level-4 tag)
    five_tag_df: DataFrame = tag_df.where(f"pid={four_tag_id}").select("id", "rule", "name")

    '''
    five_tag_df.show()
    +---+----------+--------+
    | id|      rule|    name|
    +---+----------+--------+
    | 34|     0-365|非常活跃|
    | 35|   366-999|    活跃|
    | 36|1000-36500|  不活跃|
    +---+----------+--------+
    '''

    # 3. Data processing

    # 3.1- Compute each user's average number of days between purchase and claim
    new_business_df = business_df.groupby('user_id').agg(
        F.round(
            F.sum(F.datediff('claim_date', 'buy_datetime')) / F.count('pol_no'), 2
        ).alias('day')
    )

    '''
    new_business_df.show()
    +-------+------+
    |user_id|   day|
    +-------+------+
    |  1-422| 291.5|
    |  1-423| 531.0|
    |  1-424| 438.0|
    |  1-425|  27.5|
    |  1-426| 656.0|
    |  1-427| 596.0|
    '''
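    # An equivalent formulation, assuming pol_no and both date columns are never null,
    # would use the built-in average instead of sum/count:
    #   new_business_df = business_df.groupby('user_id').agg(
    #       F.round(F.avg(F.datediff('claim_date', 'buy_datetime')), 2).alias('day'))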
    # 3.2- Split each level-5 tag's rule ("start-end") into its bounds
    new_five_tag_df = five_tag_df.select(
        'id',
        F.split('rule', '-')[0].alias('start'),
        F.split('rule', '-')[1].alias('end')
    )
    '''
    new_five_tag_df.show()
    +---+-----+-----+
    | id|start|  end|
    +---+-----+-----+
    | 34|    0|  365|
    | 35|  366|  999|
    | 36| 1000|36500|
    +---+-----+-----+
    '''
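    # Note: F.split returns string columns, so 'start' and 'end' above are strings and the
    # BETWEEN filter below relies on Spark's implicit casts. An explicit (optional) cast
    # would look like:
    #   F.split('rule', '-')[0].cast('int').alias('start'),
    #   F.split('rule', '-')[1].cast('int').alias('end')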
    # 3.3- Match each user's 'day' value against the tag ranges. The unconditioned join is
    # a cross join between users and the (tiny) tag-range table; the BETWEEN filter then
    # keeps the single matching range per user.
    result_df: DataFrame = new_business_df.join(new_five_tag_df) \
        .where('day between start and end') \
        .select(
            new_business_df['user_id'],
            new_five_tag_df['id'].cast(StringType()).alias('tag_ids')
        )
    result_df.show()
    # 3.4- Merge the new tags with the existing (old) tags
    # 3.4.1- Read the old tag data from Elasticsearch
    old_tag_df = spark.read.format("es") \
        .option("es.nodes", "192.168.88.166:9200") \
        .option("es.resource", "insurance_result") \
        .load()


    # 3.4.2- Full join of old and new tag data
    # 3.4.2.1- Custom Python UDF that merges and de-duplicates old and new tags
    @F.udf(returnType=StringType())
    def merge_old_and_new_tag(old_tag, new_tag):
        # Robustness: handle None / empty values before any string processing
        if old_tag is None or len(old_tag) == 0:
            return new_tag
        if new_tag is None or len(new_tag) == 0:
            return old_tag

        # Turn the comma-separated tag strings into lists
        old_tag_list = str(old_tag).split(",")
        new_tag_list = str(new_tag).split(",")

        # Merge old and new tags, dropping duplicates while preserving order
        new_list = []
        for i in old_tag_list + new_tag_list:
            if i not in new_list:
                new_list.append(i)

        # Return the merged, de-duplicated result
        return ",".join(new_list)
    # 3.4.2.2- Full outer join of old and new tags, merged with the UDF
    merge_tag_df = result_df.join(old_tag_df, result_df["user_id"] == old_tag_df["user_id"], how="full").select(
        F.coalesce(result_df["user_id"], old_tag_df["user_id"]).alias("user_id"),
        merge_old_and_new_tag(old_tag_df["tag_ids"], result_df["tag_ids"]).alias("tag_ids")
    )
    '''
    +-------+-----------+
    |user_id|    tag_ids|
    +-------+-----------+
    |    1-1|6,16,64,124|
    |   1-10|6,19,63,123|
    |  1-100|6,16,64,125|
    |  1-101|6,19,63,125|
    '''
    # 4. Write the result back to Elasticsearch
    merge_tag_df.write.format("es") \
        .option("es.nodes", "192.168.88.166:9200") \
        .option("es.resource", "insurance_result") \
        .option("es.mapping.id", "user_id") \
        .option("es.index.auto.create", "True") \
        .option("es.write.operation", "upsert") \
        .mode("append") \
        .save()
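    # With es.write.operation=upsert and es.mapping.id=user_id, existing documents are
    # updated in place by user_id rather than duplicated, which keeps reruns of this job
    # idempotent for unchanged users; mode("append") is the usual mode for incremental
    # tag refreshes with this connector.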
    print('Job finished', '-' * 100)
    # Release resources
    spark.stop()
