Most of the tag-development workflow is repetitive, so we take an object-oriented approach: extract the shared computation steps into methods and collect them in a base class.
#!/usr/bin/env python
# @desc :
__coding__ = "utf-8"
__author__ = "bytedance"from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import StringType
from com.bytedance.tag.base.parse_rule import EsMeta
import os
import pyspark.sql.functions as F
import logging

os.environ["SPARK_HOME"] = "/export/server/spark"
os.environ["PYSPARK_PYTHON"] = "/root/anaconda3/envs/pyspark_env/bin/python3"
Sort the tag ids stored in ES
# Custom UDF: sort the comma-separated tag ids numerically
@F.udf(returnType=StringType())
def tag_sort_udf(tags: str):
    tags_list = tags.split(',')
    tags_int = sorted(list(map(lambda tag: int(tag), tags_list)))
    tags_str = list(map(lambda tag: str(tag), tags_int))
    return ",".join(tags_str)
Merge and deduplicate the old and new tags, and make the code more robust by checking for None values
# Custom Python UDF: merge the old and new tags and deduplicate them
@F.udf(returnType=StringType())
def merge_old_and_new_tag(old_tag, new_tag):
    # Robustness: check for None / empty values before converting anything
    if old_tag is None or len(str(old_tag)) == 0:
        return new_tag
    if new_tag is None or len(str(new_tag)) == 0:
        return old_tag
    old_tag = str(old_tag)
    new_tag = str(new_tag)
    # Turn the comma-separated tag strings into lists
    old_tag_list = old_tag.split(",")
    new_tag_list = new_tag.split(",")
    # Merge the old and new tags, keeping order and dropping duplicates
    new_list = []
    for i in old_tag_list + new_tag_list:
        if i not in new_list:
            new_list.append(i)
    # Drop 'None' / empty entries (build a new list instead of removing
    # items from the list while iterating over it)
    cleaned_list = [tag_id for tag_id in new_list if tag_id not in ('None', '')]
    if len(cleaned_list) != len(new_list):
        logging.warning('The new tags contained None, please check the data processing pipeline')
    # Deduplicated final result
    return ",".join(cleaned_list)
1. Create the object
class AbstractBaseModel:
    # 1- Create the SparkSession object
    def __init__(self, partitions, app_name):
        logging.warning(f"1----Parent class __init__ called with parameters: partitions={partitions}, app_name={app_name}")
self.__spark = SparkSession.builder \
.config("spark.sql.shuffle.partitions", partitions) \
.appName(app_name) \
.master("local[*]") \
.getOrCreate()
2. Read the tag configuration table from the MySQL database
    # 2- Read all tags
    def __read_all_tag(self):
        logging.warning("2----Read the tag configuration data: read the tag configuration from the MySQL table")
tag_df = self.__spark.read.jdbc(
url="jdbc:mysql://192.168.88.166:3306/tags_new?useUnicode=true&characterEncoding=UTF-8&serverTimezone=UTC&useSSL=false",
table="insurance_basic_tag",
properties={'user': 'root', 'password': '123456'}
)
return tag_df
3. Use the previously defined parse_rule to work out where the business data is stored in ES
    # 3- Parse the rule
    def __get_parse_rule(self, tag_df, four_tag_id):
        logging.warning("3----Parse the rule: take the rule from the level-4 tag and parse it to get the storage location of the business data")
        # 3.1- Filter out the level-4 tag
four_tag_df: DataFrame = tag_df.where(f"id={four_tag_id}").select("rule")
        # 3.2- Get the rule content from the level-4 tag
rule_str = four_tag_df.first()["rule"]
        # 3.3- Parse the rule
rule_obj = EsMeta.parse_rule(rule_str)
return rule_obj
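EsMeta and its parse_rule method live in com.bytedance.tag.base.parse_rule and are not shown in this section. Purely as a hedged sketch of what they might look like: the code above only relies on the attributes esNodes, esIndex and selectFields, and the sketch below assumes the rule is stored as a "key=value##key=value" string, which is an assumption, not the project's actual format:

from dataclasses import dataclass

@dataclass
class EsMeta:
    inType: str = None
    esNodes: str = None
    esIndex: str = None
    esType: str = None
    selectFields: str = None

    @staticmethod
    def parse_rule(rule_str: str) -> "EsMeta":
        # Assumed rule format, e.g. "inType=es##esNodes=host:9200##esIndex=some_index##selectFields=f1,f2"
        rule_dict = dict(kv.split("=", 1) for kv in rule_str.split("##"))
        known = {"inType", "esNodes", "esIndex", "esType", "selectFields"}
        return EsMeta(**{k: v for k, v in rule_dict.items() if k in known})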
4. Read the business data from ES
    # 4- Read the business data from ES
    def __get_business_data(self, rule_obj):
        logging.warning("4----Read the business data from ES: read the business data stored in ES according to the parsed rule")
business_df = self.__spark.read.format("es") \
.option("es.nodes", rule_obj.esNodes) \
.option("es.resource", f"{rule_obj.esIndex}") \
.option("es.read.field.include", rule_obj.selectFields) \
.option("es.mapping.date.rich", False) \
.load()
return business_df
5. Filter out the level-5 tags
def __get_five_tag(self, tag_df, four_tag_id):
logging.warning("5----过滤5级标签: 从4级和5级标签内容中过滤出5级标签内容,也就是pid=4级标签的ID")
five_tag_df: DataFrame = tag_df.where(f"pid={four_tag_id}").select("id", "rule")
return five_tag_df
6. Add a method so callers can read any additional ES tables they need to join
    # New: read any other ES table that needs to be joined
    def read_es_business(self, table_name: str, field_name: str) -> DataFrame:
        """
        Read the given table from ES
        :param table_name: table (index) name
        :param field_name: fields to read, comma separated
        :return: DataFrame
        """
        logging.warning('Extra method: read another table to fill in missing users and avoid None values')
user_business_df = self.__spark.read.format("es") \
.option("es.nodes", '192.168.88.166:9200') \
.option("es.resource", f"{table_name}") \
.option("es.read.field.include", f"{field_name}") \
.option("es.mapping.date.rich", False) \
.load()
return user_business_df
7. Compute the new tags with business-specific Spark SQL logic, i.e. the abstract method of the class
"""
类中方法没有方法体,也就是没有具体的代码实现逻辑,只有一个pass,就叫做抽象方法
"""
def get_new_result(self, five_tag_df, business_df):
pass
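The base class leaves get_new_result as a plain method with a pass body. If you want Python itself to refuse to instantiate a subclass that forgets to implement it, one option (a sketch, not part of the original code; named AbstractTagModel here only to avoid clashing with the class above) is the standard abc module:

from abc import ABC, abstractmethod

class AbstractTagModel(ABC):
    @abstractmethod
    def get_new_result(self, five_tag_df, business_df):
        """Subclasses must implement the business-specific tag computation."""
        ...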
8. Read the old tag data from ES
def __get_old_result(self):
logging.warning("7----读取旧的标签数据: 从ES中将用户的历史标签数据读取出来")
old_tag_df = self.__spark.read.format("es") \
.option("es.nodes", "192.168.88.166:9200") \
.option("es.resource", "insurance_result") \
.load()
return old_tag_df
9. Merge the old and new tags
def __merge_tag(self, result_df, old_tag_df):
logging.warning("8----合并新旧标签: 将新标签和旧标签合并,并且要去重。最终得到结果")
merge_result_df = result_df.join(old_tag_df, result_df["user_id"] == old_tag_df["user_id"], how="full").select(
F.coalesce(result_df["user_id"], old_tag_df["user_id"]).alias("user_id"),
merge_old_and_new_tag(result_df["tag_ids"], old_tag_df["tag_ids"]).alias("tag_ids")
)
return merge_result_df
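Because the join is a full join, a user that exists on only one side gets NULL columns from the other side; F.coalesce keeps whichever user_id is not NULL, and the merge UDF handles the NULL tag side. A tiny hypothetical illustration (the user ids u1/u2/u3 are made up, and it assumes spark, F and merge_old_and_new_tag are in scope):

new_df = spark.createDataFrame([("u1", "34"), ("u2", "35")], ["user_id", "tag_ids"])
old_df = spark.createDataFrame([("u2", "3,35"), ("u3", "36")], ["user_id", "tag_ids"])
new_df.join(old_df, new_df["user_id"] == old_df["user_id"], how="full").select(
    F.coalesce(new_df["user_id"], old_df["user_id"]).alias("user_id"),
    merge_old_and_new_tag(new_df["tag_ids"], old_df["tag_ids"]).alias("tag_ids")
).show()
# Expected: u1 -> "34", u2 -> "35,3" (sorted later by tag_sort_udf), u3 -> "36"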
10. Store the result data and write it to ES
    # 9- Store the result data
    def __write_2_es(self, merge_result_df):
        logging.warning("9----Store the result data: write the user tag results to ES")
        # Sort the tags
result_df = merge_result_df.select(
"user_id",
tag_sort_udf("tag_ids").alias("tag_ids"))
result_df.write.format("es") \
.option("es.nodes", "192.168.88.166:9200") \
.option("es.resource", "insurance_result") \
.option("es.mapping.id", "user_id") \
.option("es.index.auto.create", "True") \
.option("es.write.operation", "upsert") \
.mode("append") \
.save()
11. Provide a public method that chains the steps together
    # 10- Public method that chains the individual steps together
def execute(self, four_tag_id):
logging.warning("10----提供一个公共方法,里面用来将多个步骤串到一起")
# 2- 读取所有标签
tag_df = self.__read_all_tag()
# 3- 解析得到rule规则
rule_obj = self.__get_parse_rule(tag_df, four_tag_id)
# 4- 读取ES的业务数据
business_df = self.__get_business_data(rule_obj)
# 5- 过滤出五级标签
five_tag_df = self.__get_five_tag(tag_df, four_tag_id)
# 6- 数据处理的结果
result_df = self.get_new_result(five_tag_df, business_df)
try:
# 7- 查询旧标签的数据
old_tag_df = self.__get_old_result()
# 3.4.2- 新旧标签数据full join
merge_result_df = self.__merge_tag(result_df, old_tag_df)
except Exception as e:
logging.error("----索引库不存在", e)
merge_result_df = result_df
# 9- 数据输出
self.__write_2_es(merge_result_df)
# 10- 释放资源
logging.warning('10----程序运行结束')
self.__spark.stop()
12. Subclass the base class and override the abstract method
#!/usr/bin/env python
# @desc :
__coding__ = "utf-8"
__author__ = "bytedance"from com.bytedance.tag.base.object_basic import AbstractBaseModel
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import StringType
13. Override get_new_result in the subclass
# Define the subclass
class ClaimsActivityModel(AbstractBaseModel):
def get_new_result(self, five_tag_df, business_df):
new_business_df = business_df.groupby('user_id') \
.agg(
F.round(F.sum(F.datediff('claim_date', 'buy_datetime'))
/ F.count('pol_no'), 2).alias('day')
)
'''
new_business_df.show()
+-------+------+
|user_id| day|
+-------+------+
| 1-422| 291.5|
| 1-423| 531.0|
| 1-424| 438.0|
| 1-425| 27.5|
| 1-426| 656.0|
| 1-427| 596.0|
'''
new_five_tag_df = five_tag_df.select(
'id',
F.split('rule', '-')[0].alias('start'),
F.split('rule', '-')[1].alias('end')
)
'''
new_five_tag_df.show()
+---+-----+-----+
| id|start| end|
+---+-----+-----+
| 34| 0| 365|
| 35| 366| 999|
| 36| 1000|36500|
+---+-----+-----+
'''
        # 3.3- Join the level-5 tags with the business data
'''
+-------+-------+
|user_id|tag_ids|
+-------+-------+
| 1-422| 34|
| 1-423| 35|
| 1-424| 35|
| 1-425| 34|
| 1-426| 35|
| 1-427| 35|
'''
old_result_df: DataFrame = new_business_df\
.join(new_five_tag_df,
[new_business_df.day <= new_five_tag_df.end,
new_business_df.day >= new_five_tag_df.start]
) \
.select(
new_business_df['user_id'], new_five_tag_df['id'] \
.cast(StringType()).alias('tag_ids')
)
        # Read user_id from the policy_client table and left-join it, filling in users that have no claims and marking them with tag 36: inactive
user_business_df = self.read_es_business("policy_client", "user_id")
result_df = user_business_df.join(old_result_df, user_business_df.user_id == old_result_df.user_id, 'left_outer') \
.select(user_business_df['user_id'], F.expr("if(tag_ids is null, '36', tag_ids)").alias('tag_ids'))
return result_df
if __name__ == '__main__':
    # id of the level-4 tag; the number of partitions and the app name are passed to the constructor
four_tag_id = 33
claims_obj = ClaimsActivityModel(app_name='claims_tag_job', partitions=2)
claims_obj.execute(four_tag_id)
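Finally, to show the reuse the base class buys us: a second model only needs to override get_new_result, and inherits reading MySQL, parsing the rule, reading ES, merging and writing from AbstractBaseModel. The sketch below is purely hypothetical (the gender field, the rule matching and the level-4 tag id 4 are made up for illustration):

class GenderModel(AbstractBaseModel):
    def get_new_result(self, five_tag_df, business_df):
        # Hypothetical matching logic: a user's gender value equals the level-5 tag rule
        return business_df.join(
            five_tag_df, business_df['gender'] == five_tag_df['rule']
        ).select(
            business_df['user_id'],
            five_tag_df['id'].cast(StringType()).alias('tag_ids')
        )

# Running it looks the same as the claims model above, e.g.:
# GenderModel(app_name='gender_tag_job', partitions=2).execute(four_tag_id=4)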