# Gini系数 (Gini coefficient)

# 建议在jupyter中运行
import os, glob
import numpy as np
import pandas as pd

# Spark initialisation: findspark.init() runs before pyspark is imported
import findspark
findspark.init()
print("Init!")

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("get_gini_index")
    .config('spark.yarn.queue', 'root.recommend_prod')
    .enableHiveSupport()
    .getOrCreate()
)
sc = spark.sparkContext

# Plotting
import matplotlib.pyplot as plt
from matplotlib import font_manager

# Render the minus sign correctly on the axes
plt.rcParams['axes.unicode_minus'] = False
# Font file used so that the Chinese axis labels can be displayed
zhfont = font_manager.FontProperties(fname="../simsun.ttc")

# Colour palette used by the plots (orange tones first)
colors = ["#E1701A", "#F7A440", "#AAAAAA", "#343F56"]

# ======================== Func ========================

def floatrange(start, stop, steps):
    """Return ``steps`` evenly spaced floats from ``start`` to ``stop`` inclusive.

    Args:
        start: First value of the sequence.
        stop: Last value of the sequence (included when ``steps`` >= 2).
        steps: Number of values to generate.

    Returns:
        A list of ``steps`` floats. ``steps <= 0`` gives an empty list and
        ``steps == 1`` gives ``[start]`` as a float.
    """
    if steps == 1:
        # A single point has no interval to divide; without this guard the
        # general formula below would divide by zero.
        return [start + 0.0]
    span = stop - start
    intervals = float(steps) - 1
    return [start + float(i) * span / intervals for i in range(steps)]

def compute_gini(data):
    """Compute the Gini index from a cumulative-share column ``y``.

    The value is ``1 - (2 * sum(y[:-1]) + 1) / n`` where ``n`` is the number
    of rows. This is one minus twice the trapezoid-rule area under the curve
    for ``n`` equally wide intervals; the constant ``+ 1`` term assumes the
    last value of ``y`` equals 1.

    Args:
        data: pandas DataFrame with a numeric column ``'y'`` holding the
            cumulative share, ordered along the x axis.

    Returns:
        The Gini index as a float, or NaN when ``data`` has no rows.
    """
    n = data.shape[0]
    if n == 0:
        # No observations: the index is undefined (previously this path
        # failed with a bare ZeroDivisionError).
        return float('nan')
    # Vectorised positional sum of every value except the last one.
    partial_sum = data['y'].iloc[:-1].sum()
    return float(1 - 1 / n * (2 * partial_sum + 1))

# Draw the cumulative-share curve and annotate it with the Gini index
def plot_gini(cum_x, cum_y, data, x_name, y_name):
    """Plot the cumulative share of ``cum_y`` over rows sorted by ``cum_x``.

    Rows are sorted ascending by ``cum_x``; the x axis is the row position
    divided by the row count and the y axis is the running sum of ``cum_y``
    divided by its maximum. The diagonal from (0, 0) to (1, 1) is drawn as a
    reference and the value of ``compute_gini`` is written on the figure.

    Args:
        cum_x: Name of the column used for sorting.
        cum_y: Name of the column that is accumulated.
        data: pandas DataFrame containing both columns; it is not modified.
        x_name: Display name of the sort column, inserted in the x label.
        y_name: Display name of the accumulated column, inserted in the y label.

    Raises:
        ValueError: If ``data`` has no rows.
    """
    if data.shape[0] == 0:
        raise ValueError("plot_gini() requires at least one row of data")

    # sort_values returns a new frame, so the caller's data stays untouched;
    # drop=True avoids carrying the old index along as an extra column.
    dat = data.sort_values(by=cum_x).reset_index(drop=True)

    dat['cum_y'] = dat[cum_y].cumsum()

    dat['x'] = dat.index / dat.shape[0]
    # If the column total is 0 this division yields NaN and the curve is empty.
    dat['y'] = dat['cum_y'] / dat['cum_y'].max()

    # Plot
    plt.figure(figsize=(5, 5), dpi=100)
    plt.plot(dat['x'], dat['y'], color=colors[0])
    # Two end points are enough to draw the straight reference diagonal.
    plt.plot([0, 1], [0, 1], color=colors[3])

    plt.xlabel(f'累计商品百分比(%)(按{x_name}排序)', fontproperties=zhfont)
    plt.ylabel(f'累计{y_name}百分比(%)', fontproperties=zhfont)
    plt.text(0.1, 0.8, 'Gini Index: %.4f ' % compute_gini(dat))

    plt.show()


# ======================== Data ========================

# TODO: set the run parameters here
dt = "2021-07-01"
calltype = 'a0.b1.c13'
calltype_name = "bottom"
oz_id = 17

# Derived configuration: the three dot-separated parts of calltype ...
source_spm_id, page_spm_id, module_spm_id = calltype.split(".")
# ... and the date rewritten with "_" and "/" as separators
_date_parts = dt.split("-")
dt_h = "_".join(_date_parts)
dt_x = "/".join(_date_parts)
del _date_parts

# Build the dataset: one row per good with its show / click / buy counts.
#   base : distinct goods_id from wujie.dim_cx_gos_category_df for the given
#          dt and oz_id with is_td_sale = 1
#   spmm : one row per SPM log record matching the source/page/module ids,
#          flagged as click (event type '1' + 'jump_goods_detail') and/or
#          show (event type '2')
#   buyy : per good, the number of order-scene rows with cart_order_number > 0
#          plus the number with instant_order_number > 0
# Both joins are left joins from base, so goods without log or order rows are
# kept and their counts become 0 through ifnull.
# NOTE(review): buy_cnt is aggregated with max() rather than sum(), presumably
# because the join with spmm repeats the single buyy row per log record -- confirm.
sql_text = f"""
    -- 获得商品spm信息
    select 
        base.good_id
        ,ifnull(sum(is_show),0) as show_cnt
        ,ifnull(sum(is_click),0) as click_cnt
        ,ifnull(max(buy_cnt),0) as buy_cnt
    from(
        -- 当天在售商品
        select 
            goods_id as good_id
        from 
            wujie.dim_cx_gos_category_df 
        where 
            dt='{dt}' 
            and oz_id = {oz_id}
            and is_td_sale = 1
        group by goods_id
    ) base
    left join (
        -- 曝光/点击信息
        select 
            good_id
            ,case when spm_event_type = '1' and action_type = 'jump_goods_detail' then 1 else 0 end as is_click
            ,case when spm_event_type = '2' then 1 else 0 end as is_show
        from wujie.dwd_cx_log_spm_df
        where 
            dt = '{dt}'
            and source_spm_id = '{source_spm_id}'
            and page_spm_id = '{page_spm_id}'
            and module_spm_id = '{module_spm_id}'
            and user_id not in ('undefined', 'null', 'none')
            and good_id is not null
    ) as spmm
    on base.good_id = spmm.good_id

    left join(
        -- 购买信息
        select 
            good_id
            ,sum(cart_order_number) + sum(instant_order_number) as buy_cnt
        from(
            select 
                goods_id as good_id
                ,if(cart_order_number>0, 1, 0) as cart_order_number
                ,if(instant_order_number>0, 1, 0) as instant_order_number
            from wujie_flow.dwd_cx_log_order_scene_di
            where 
                dt = '{dt}'
                and a = '{source_spm_id}'
                and b = '{page_spm_id}'
                and c = '{module_spm_id}'
                and length(goods_id) > 0
                and nvl(cart_order_number,0)+nvl(instant_order_number,0) > 0
            )
        group by good_id
    ) as buyy
    on base.good_id = buyy.good_id

    group by base.good_id
"""

# Echo the rendered query so the substituted parameters can be checked
print(sql_text)
# Run the query on Spark, then convert the result to a pandas DataFrame
spm_cnt = spark.sql(sql_text)
data = spm_cnt.toPandas()

# ======================== Plot ========================

# Sort goods by exposure count and accumulate their click counts
cum_x, cum_y = "show_cnt", "click_cnt"
plot_gini(
    cum_x=cum_x,
    cum_y=cum_y,
    data=data,
    x_name="曝光量",
    y_name="点击量",
)

# 你可能感兴趣的:(码农日常,spark)