2024.1.25 Object_basic: User Profile Tag Development, Defining a Base Class with Object-Oriented Programming

Most of the tag development process is repetitive, so we take an object-oriented approach: extract the common computation steps into methods and collect them in a single base class.

#!/usr/bin/env python
# @desc : 
__coding__ = "utf-8"
__author__ = "bytedance"

from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import StringType
from com.bytedance.tag.base.parse_rule import EsMeta
import os
import pyspark.sql.functions as F
import logging

os.environ["SPARK_HOME"] = "/export/server/spark"
os.environ["PYSPARK_PYTHON"] = "/root/anaconda3/envs/pyspark_env/bin/python3"

Sort the tag ids stored on ES

# UDF: sort a comma-separated string of tag ids numerically
@F.udf(returnType=StringType())
def tag_sort_udf(tags: str):
    tags_list = tags.split(',')
    # sort by numeric value rather than lexicographically
    tags_int = sorted(int(tag) for tag in tags_list)
    return ",".join(str(tag) for tag in tags_int)
 

Merge and deduplicate the old and new tags, and harden the code with None-value checks

 

# Custom Python UDF: merge the old and new tags and deduplicate
@F.udf(returnType=StringType())
def merge_old_and_new_tag(old_tag, new_tag):
    # Robustness: handle None / empty values before doing any string work
    if old_tag is None or len(str(old_tag)) == 0:
        return new_tag
    if new_tag is None or len(str(new_tag)) == 0:
        return old_tag

    old_tag = str(old_tag)
    new_tag = str(new_tag)

    # Turn the old and new tags from comma-separated strings into lists
    old_tag_list = old_tag.split(",")
    new_tag_list = new_tag.split(",")

    # Merge old and new tags, keeping order and dropping duplicates
    new_list = []
    for i in old_tag_list + new_tag_list:
        if i not in new_list:
            new_list.append(i)

    # Strip 'None' / empty entries (build a new list instead of removing while iterating)
    cleaned_list = [tag_id for tag_id in new_list if tag_id not in ('None', '')]
    if len(cleaned_list) != len(new_list):
        logging.warning('A None value appeared in the tags; check the upstream data pipeline')

    # Final deduplicated result
    return ",".join(cleaned_list)
 

1. Create the object

class AbstractBaseModel:
    # 1- Create the SparkSession object
    def __init__(self, partitions, app_name):
        logging.warning(f"1----Parent class __init__ called, parameters: partitions={partitions}, app_name={app_name}")

        self.__spark = SparkSession.builder \
            .config("spark.sql.shuffle.partitions", partitions) \
            .appName(app_name) \
            .master("local[*]") \
            .getOrCreate()
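
spark.sql.shuffle.partitions defaults to 200; for the small, locally run datasets in this project it is lowered through the partitions argument of the constructor.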

2. Read the tag configuration data from the MySQL database

    # 2- Read all tags
    def __read_all_tag(self):
        logging.warning("2----Read the tag configuration data: read the tag configuration content from the MySQL table")

        tag_df = self.__spark.read.jdbc(
            url="jdbc:mysql://192.168.88.166:3306/tags_new?useUnicode=true&characterEncoding=UTF-8&serverTimezone=UTC&useSSL=false",
            table="insurance_basic_tag",
            properties={'user': 'root', 'password': '123456'}
        )
        return tag_df

3. Use the previously defined parse_rule to parse the rule and locate the business data on ES

    # 3- Parse the rule
    def __get_parse_rule(self, tag_df, four_tag_id):
        logging.warning("3----Parse the rule: get the rule from the level-4 tag and parse it to locate where the business data is stored")

        # 3.1- Filter out the target level-4 tag
        four_tag_df: DataFrame = tag_df.where(f"id={four_tag_id}").select("rule")

        # 3.2- Get the rule content from the level-4 tag
        rule_str = four_tag_df.first()["rule"]

        # 3.3- Parse the rule
        rule_obj = EsMeta.parse_rule(rule_str)

        return rule_obj
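
For reference, EsMeta.parse_rule comes from the earlier parse_rule section; the rule string format sketched below is an assumption inferred from the attributes used in the next step (esNodes, esIndex, selectFields), so adapt it to your own rule definition:

# Assumed rule format (hedged; follow your own parse_rule implementation)
# "inType=Elasticsearch##esNodes=192.168.88.166:9200##esIndex=insurance##esType=_doc##selectFields=user_id,pol_no"
rule_obj = EsMeta.parse_rule(rule_str)
# rule_obj.esNodes, rule_obj.esIndex and rule_obj.selectFields are read in step 4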

4. Read the business data from ES

    # 4- Read the business data from ES
    def __get_business_data(self, rule_obj):
        logging.warning("4----Read the business data from ES: read the business data stored in ES according to the rule")

        business_df = self.__spark.read.format("es") \
            .option("es.nodes", rule_obj.esNodes) \
            .option("es.resource", f"{rule_obj.esIndex}") \
            .option("es.read.field.include", rule_obj.selectFields) \
            .option("es.mapping.date.rich", False) \
            .load()
        return business_df
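
Note: the "es" data source is provided by the elasticsearch-hadoop (elasticsearch-spark) connector, so the connector jar must be on Spark's classpath (for example via spark.jars or by copying it into Spark's jars directory); otherwise every format("es") read and write in this class will fail.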

5. Filter out the level-5 tags

    def __get_five_tag(self, tag_df, four_tag_id):
        logging.warning("5----Filter the level-5 tags: from the level-4 and level-5 tag content, keep only the level-5 tags, i.e. the rows whose pid equals the level-4 tag id")

        five_tag_df: DataFrame = tag_df.where(f"pid={four_tag_id}").select("id", "rule")
        return five_tag_df

6. Add a method for reading any other table that needs to be joined in

    # New: read any other table that needs to be joined
    def read_es_business(self, table_name: str, field_name: str) -> DataFrame:
        """
        Read the given table (index) from ES
        :param table_name: table name
        :param field_name: fields to read, comma separated
        :return: DataFrame
        """
        logging.warning('Extra method running: read data from another table to fill in missing users and avoid None')
        user_business_df = self.__spark.read.format("es") \
            .option("es.nodes", '192.168.88.166:9200') \
            .option("es.resource", f"{table_name}") \
            .option("es.read.field.include", f"{field_name}") \
            .option("es.mapping.date.rich", False) \
            .load()
        return user_business_df

7. Compute the new tags: each tag model implements its own Spark SQL logic, i.e. the class's abstract method

 """
        类中方法没有方法体,也就是没有具体的代码实现逻辑,只有一个pass,就叫做抽象方法
    """

    def get_new_result(self, five_tag_df, business_df):
        pass
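
This is the template-method pattern: execute() below drives the fixed pipeline, and only get_new_result varies between tag models. A minimal sketch of a subclass (the class name and body here are hypothetical; the real example follows in step 13):

class SomeTagModel(AbstractBaseModel):  # hypothetical subclass, for illustration only
    def get_new_result(self, five_tag_df, business_df):
        # join business_df with five_tag_df and return a DataFrame of user_id / tag_ids
        ...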

8. Read the old tag data from ES

    def __get_old_result(self):
        logging.warning("7----Read the old tag data: read the users' historical tag data from ES")

        old_tag_df = self.__spark.read.format("es") \
            .option("es.nodes", "192.168.88.166:9200") \
            .option("es.resource", "insurance_result") \
            .load()
        return old_tag_df

9. Merge the old and new tags

    def __merge_tag(self, result_df, old_tag_df):
        logging.warning("8----合并新旧标签: 将新标签和旧标签合并,并且要去重。最终得到结果")

        merge_result_df = result_df.join(old_tag_df, result_df["user_id"] == old_tag_df["user_id"], how="full").select(
            F.coalesce(result_df["user_id"], old_tag_df["user_id"]).alias("user_id"),
            merge_old_and_new_tag(result_df["tag_ids"], old_tag_df["tag_ids"]).alias("tag_ids")
        )
        return merge_result_df
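
The full join keeps users that appear on either side; with purely illustrative values, the behaviour looks like this:

new result_df           old_tag_df              merged
user_id  tag_ids        user_id  tag_ids        user_id  tag_ids
1-422    34             1-422    35             1-422    34,35
1-900    36             (no row)                1-900    36
(no row)                1-423    35             1-423    35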

10. Store the result data and write it to ES

    # 9- Store the result data

    def __write_2_es(self, merge_result_df):
        logging.warning("9----Store the result data: write the users' tag results into ES")
        # Sort the tags
        result_df = merge_result_df.select(
            "user_id",
            tag_sort_udf("tag_ids").alias("tag_ids"))

        result_df.write.format("es") \
            .option("es.nodes", "192.168.88.166:9200") \
            .option("es.resource", "insurance_result") \
            .option("es.mapping.id", "user_id") \
            .option("es.index.auto.create", "True") \
            .option("es.write.operation", "upsert") \
            .mode("append") \
            .save()
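
es.mapping.id makes user_id the Elasticsearch document _id, and es.write.operation=upsert updates an existing document instead of inserting a duplicate, so re-running the job for the same users simply refreshes their tag_ids.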

11. Provide a public method that chains the steps together

    # 10- Provide a public method that chains the steps together
    def execute(self, four_tag_id):
        logging.warning("10----Public method that strings the individual steps together")

        # 2- Read all tags
        tag_df = self.__read_all_tag()

        # 3- Parse the rule
        rule_obj = self.__get_parse_rule(tag_df, four_tag_id)

        # 4- Read the business data from ES
        business_df = self.__get_business_data(rule_obj)

        # 5- Filter out the level-5 tags
        five_tag_df = self.__get_five_tag(tag_df, four_tag_id)

        # 6- Compute the new result
        result_df = self.get_new_result(five_tag_df, business_df)

        try:
            # 7- Read the old tag data
            old_tag_df = self.__get_old_result()

            # 8- Full join of the old and new tag data
            merge_result_df = self.__merge_tag(result_df, old_tag_df)
        except Exception as e:
            logging.error("----The result index does not exist yet: %s", e)
            merge_result_df = result_df

        # 9- Write the output
        self.__write_2_es(merge_result_df)

        # 10- Release resources
        logging.warning('10----Job finished')
        self.__spark.stop()
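
On the very first run the insurance_result index does not exist yet, so reading the old tags raises an exception; the except branch then writes the freshly computed result directly. On later runs the old and new tags are merged before writing.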

12. Call the parent class and override it in a subclass

#!/usr/bin/env python
# @desc : 
__coding__ = "utf-8"
__author__ = "bytedance"

from com.bytedance.tag.base.object_basic import AbstractBaseModel
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import StringType
 

13. Override the abstract method in the subclass


# Define the subclass
class ClaimsActivityModel(AbstractBaseModel):
    def get_new_result(self, five_tag_df, business_df):
        new_business_df = business_df.groupby('user_id') \
            .agg(
            F.round(F.sum(F.datediff('claim_date', 'buy_datetime'))
                    / F.count('pol_no'), 2).alias('day')
        )
        '''
        new_business_df.show()
        +-------+------+
        |user_id|   day|
        +-------+------+
        |  1-422| 291.5|
        |  1-423| 531.0|
        |  1-424| 438.0|
        |  1-425|  27.5|
        |  1-426| 656.0|
        |  1-427| 596.0|
        '''
        new_five_tag_df = five_tag_df.select(
            'id',
            F.split('rule', '-')[0].alias('start'),
            F.split('rule', '-')[1].alias('end')
        )
        '''
        new_five_tag_df.show()
        +---+-----+-----+
        | id|start|  end|
        +---+-----+-----+
        | 34|    0|  365|
        | 35|  366|  999|
        | 36| 1000|36500|
        +---+-----+-----+
        '''
   
        # 3.3- Join the level-5 tags with the business data
        '''
        +-------+-------+
        |user_id|tag_ids|
        +-------+-------+
        |  1-422|     34|
        |  1-423|     35|
        |  1-424|     35|
        |  1-425|     34|
        |  1-426|     35|
        |  1-427|     35|
        '''
        old_result_df: DataFrame = new_business_df.join(
            new_five_tag_df,
            [new_business_df.day <= new_five_tag_df.end,
             new_business_df.day >= new_five_tag_df.start]
        ).select(
            new_business_df['user_id'],
            new_five_tag_df['id'].cast(StringType()).alias('tag_ids')
        )
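        # The join above is a range join: each user's average interval `day` falls into exactly one
        # [start, end] range from the level-5 tags, so an average of 0-365 days maps to tag 34,
        # 366-999 days to tag 35, and 1000+ days to tag 36 (inactive).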

        # Read user_id from the policy_client table and join it in, to cover users with no claim records; tag them 36: inactive
        user_business_df = self.read_es_business("policy_client", "user_id")

        result_df = user_business_df.join(old_result_df, user_business_df.user_id == old_result_df.user_id, 'left_outer') \
            .select(user_business_df['user_id'], F.expr("if(tag_ids is null, '36', tag_ids)").alias('tag_ids'))

        return result_df


if __name__ == '__main__':
    # The level-4 tag id; the number of shuffle partitions and the app name go to the constructor
    four_tag_id = 33
    claims_obj = ClaimsActivityModel(app_name='claims_tag_job', partitions=2)
    claims_obj.execute(four_tag_id)
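
Because the SparkSession is created with master("local[*]") and the interpreter is set through the os.environ lines at the top of the file, the script can be run directly with the pyspark_env Python interpreter for local testing.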
