提取session内有效访问路径

背景

用户访问APP内页面的顺序是不固定的。为了专注于研究用户转化,只需要研究符合正常产品流程的用户行为,因此需要对用户的操作日志记录做清洗,只保留符合产品使用流程的访问记录。

规定用户的访问1->2->3->4->5是有效的,则实际记录1->2->3->3->2中1->2->3->3是有效的,最后的2则剔除,只保留1->2->3->3。
问题一:得到规定的访问序列
问题二:根据规定的访问序列提取有效访问记录
实际业务中,页面不只是一级页面,还有二级页面,同时二级页面还有顺序,相当于是一个树的结构

问题一:得到规定的访问序列

原始数据配置如下

WITH event_road (event_key, event_id, p_event_id, group_id) AS (
    /*
     * Mock step-mapping data: one row per page.
     *   event_key  - page name
     *   event_id   - page id; the ids also encode the expected visit order
     *   p_event_id - event_id of the parent page, 0 for first-level pages
     *   group_id   - group number of a second-level page, 0 for first-level pages
     */
    VALUES
        ('page1',     1, 0, 0),
        ('page2',     2, 0, 0),
        ('page2-1-1', 3, 2, 1),
        ('page2-1-2', 4, 2, 1),
        ('page2-2-1', 5, 2, 2),
        ('page2-2-2', 6, 2, 2),
        ('page3',     7, 0, 0),
        ('page4',     8, 0, 0),
        ('page5',     9, 0, 0)
)
/* Preview the mapping rows. */
SELECT
    event_key,
    event_id,
    p_event_id,
    group_id
FROM event_road

图一

event_id有顺序的含义;p_event_id为二级页面独有,值为上一级页面的event_id;group_id为二级页面的组号。此表规定的正确路径有3条:两条二级路径1->2->3->4、1->2->5->6,以及一条一级路径1->2->7->8->9。

/*
 * Build every allowed visit sequence from the event_road mapping.
 *
 * Expects a relation or CTE named event_road with integer columns
 * event_id, p_event_id and group_id to be in scope.
 *
 * Output: one row per allowed path, as a text[] of event ids (event_id_seq).
 *   - the first-level path is the ordered list of all rows with p_event_id = 0;
 *   - each second-level path is the first-level prefix up to the branch point,
 *     followed by the ordered ids of one (p_event_id, group_id) group.
 *
 * event_id is cast to TEXT before aggregation because PostgreSQL only defines
 * string_agg(text, text); the ORDER BY keeps the uncast column so ids sort
 * numerically rather than lexically.
 */
SELECT
    string_to_array(
        CASE
            /* Second-level path: prepend the first-level prefix. */
            WHEN p1.event_id_str IS NOT NULL
                THEN p1.event_id_str || ',' || p0.event_id_str
            /* First-level path: no prefix to prepend. */
            ELSE p0.event_id_str
        END,
        ','
    ) AS event_id_seq
FROM
    (
        /* One distinct id list per parent page. */
        SELECT
            m0.p_event_id,
            m0.event_id_str
        FROM
            (
                /* Ordered ids of each (parent, group) pair. */
                SELECT
                    p_event_id,
                    group_id,
                    string_agg(event_id::TEXT, ',' ORDER BY event_id) AS event_id_str
                FROM
                    event_road
                GROUP BY
                    p_event_id,
                    group_id
            ) m0
        GROUP BY
            m0.p_event_id,
            m0.event_id_str
    ) p0
LEFT JOIN
    (
        /* Branch point: first-level steps that precede each parent's children. */
        SELECT
            t1.p_event_id,
            string_agg(t0.event_id::TEXT, ',' ORDER BY t0.event_id) AS event_id_str
        FROM
            (
                /* First-level steps. */
                SELECT
                    event_id
                FROM
                    event_road
                WHERE
                    p_event_id = 0
            ) t0
        INNER JOIN
            (
                /* Smallest child id per parent page. */
                SELECT
                    p_event_id,
                    MIN(event_id) AS min_event_id
                FROM
                    event_road
                WHERE
                    p_event_id != 0
                GROUP BY
                    p_event_id
            ) t1
            ON t0.event_id <= t1.min_event_id
        GROUP BY
            t1.p_event_id
    ) p1
    ON p0.p_event_id = p1.p_event_id
图二

问题二:根据规定的访问序列提取有效访问记录

用户的访问记录如下:

/*
 * Mock visit log, one row per page view.
 *   session_id         - session identifier
 *   session_action_seq - visit sequence number within the session
 *   event_key          - page name
 *   event_id           - page id
 */
WITH event_data (session_id, session_action_seq, event_key, event_id) AS (
    /*
     * NOTE(review): the second row pairs 'page3' with event_id 3, while the
     * step mapping assigns 3 to 'page2-1-1' and 7 to 'page3'. Kept as in the
     * original data; confirm which value was intended.
     */
    VALUES
        ('123', '1', 'page1',     1),
        ('123', '2', 'page3',     3),
        ('123', '3', 'page2',     2),
        ('123', '4', 'page2-1-1', 3),
        ('123', '5', 'page2-1-2', 4),
        ('123', '6', 'page3',     7),
        ('123', '7', 'page5',     9),
        ('123', '8', 'page4',     8),
        ('123', '9', 'page6',     10)
)
/* Preview the visit rows. */
SELECT
    session_id,
    session_action_seq,
    event_key,
    event_id
FROM event_data
图三

按照规定的访问顺序,提取有效的路径应为(event_id):1->2->3->4和1->2->7->8,其中 session_action_seq 为2、7、9的记录无效,应剔除
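由于用纯SQL实现这个逻辑太复杂,需要定义一个函数(即后文调用的 public.get_inter_arr:第一个参数 arr 为实际访问的 event_id 数组,第二个参数 arr_rn 为规定的访问序列),返回 arr 中有效记录的下标数组: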

/*
 * get_inter_arr(arr, arr_rn)
 *
 * Returns the 1-based positions of the entries of arr (the visited event ids,
 * in visit order) that follow the allowed step sequence arr_rn.
 *
 * arr is scanned once from left to right while tracking the last step of
 * arr_rn that has been reached:
 *   - an entry equal to the next step advances the path and is kept;
 *   - an entry equal to the current step is a repeat and is kept;
 *   - anything else is skipped, including a return to an earlier step
 *     (for 1,2,3,3,2 against 1,2,3,4,5 the trailing 2 is dropped).
 *
 * Parameters:
 *   arr    - visited event ids; assumed to be a 1-based one-dimensional array
 *   arr_rn - allowed step sequence; assumed to hold distinct ids
 *
 * Returns: a single row holding the INT[] of kept positions in ascending
 * order. The array is NULL when nothing matches or an input is NULL/empty,
 * so unnest() over the result yields no rows.
 *
 * NOTE(review): the signature is reconstructed from the body and from the
 * call public.get_inter_arr(event_id_arr, event_id_seq::INT[]) - confirm
 * against the deployed definition.
 */
CREATE OR REPLACE FUNCTION public.get_inter_arr(arr INT[], arr_rn INT[])
RETURNS SETOF INT[]
LANGUAGE plpgsql
AS $$
DECLARE
    /* COALESCE keeps the FOR bounds non-NULL for NULL or empty inputs. */
    arr_len INT := COALESCE(array_upper(arr, 1), 0);
    rn_len  INT := COALESCE(array_upper(arr_rn, 1), 0);
    /* Position in arr_rn of the last step reached; 0 means not started. */
    step    INT := 0;
    /* Kept positions of arr; stays NULL until the first match. */
    val     INT[];
BEGIN
    FOR idx IN 1..arr_len
    LOOP
        IF step < rn_len AND arr[idx] = arr_rn[step + 1] THEN
            /* Next expected step: advance along the path. */
            step := step + 1;
            val := array_append(val, idx);
        ELSIF step > 0 AND arr[idx] = arr_rn[step] THEN
            /* Repeat of the step we are currently on. */
            val := array_append(val, idx);
        END IF;
    END LOOP;

    RETURN NEXT val;
END;
$$;

问题转化为,从[1,3,2,3,4,7,9,8,10]中找出符合[1,2,3,4],[1,2,5,6],[1,2,7,8,9]三种顺序的序列,结果为[1,2,3,4],[1,2],[1,2,7,8],去重之后的有效访问为[1,2,3,4,7,8]

/*
 * Extract the valid visit records of every session.
 *
 * Expects event_data (session_id, session_action_seq, event_id) and
 * event_road (event_id, p_event_id, group_id) to be in scope, plus the
 * function public.get_inter_arr(INT[], INT[]).
 *
 * Each session's visited ids are paired with every allowed path; the function
 * result is unnested into one row per kept record. session_action_rank is the
 * 1-based position of that record inside the session's event_id_arr.
 * A position can appear more than once when several paths keep the same
 * record; deduplicate downstream if needed.
 */
SELECT
    t1.session_id,
    /* Drop the records that do not follow the allowed path. */
    unnest(public.get_inter_arr(t1.event_id_arr, t2.event_id_seq::INT[]))::BIGINT AS session_action_rank
FROM
    (
        /*
         * Visited ids per session. The explicit ORDER BY makes array positions
         * follow the visit order; the cast avoids lexical ordering ('10' < '2')
         * when session_action_seq is stored as text.
         */
        SELECT
            session_id,
            ARRAY_AGG(event_id ORDER BY session_action_seq::INT) AS event_id_arr
        FROM
            event_data
        GROUP BY
            session_id
    ) t1
CROSS JOIN
    (
        /* Allowed paths generated from the mapping table. */
        SELECT
            string_to_array(
                CASE
                    /* Second-level path: prepend the first-level prefix. */
                    WHEN p1.event_id_str IS NOT NULL
                        THEN p1.event_id_str || ',' || p0.event_id_str
                    /* First-level path: no prefix to prepend. */
                    ELSE p0.event_id_str
                END,
                ','
            ) AS event_id_seq
        FROM
            (
                /* One distinct id list per parent page. */
                SELECT
                    m0.p_event_id,
                    m0.event_id_str
                FROM
                    (
                        /*
                         * Ordered ids of each (parent, group) pair.
                         * string_agg needs text input, hence the cast; the
                         * ORDER BY uses the uncast column for numeric order.
                         */
                        SELECT
                            p_event_id,
                            group_id,
                            string_agg(event_id::TEXT, ',' ORDER BY event_id) AS event_id_str
                        FROM
                            event_road
                        GROUP BY
                            p_event_id,
                            group_id
                    ) m0
                GROUP BY
                    m0.p_event_id,
                    m0.event_id_str
            ) p0
        LEFT JOIN
            (
                /* Branch point: first-level steps that precede each parent's children. */
                SELECT
                    b1.p_event_id,
                    string_agg(b0.event_id::TEXT, ',' ORDER BY b0.event_id) AS event_id_str
                FROM
                    (
                        /* First-level steps. */
                        SELECT
                            event_id
                        FROM
                            event_road
                        WHERE
                            p_event_id = 0
                    ) b0
                INNER JOIN
                    (
                        /* Smallest child id per parent page. */
                        SELECT
                            p_event_id,
                            MIN(event_id) AS min_event_id
                        FROM
                            event_road
                        WHERE
                            p_event_id != 0
                        GROUP BY
                            p_event_id
                    ) b1
                    ON b0.event_id <= b1.min_event_id
                GROUP BY
                    b1.p_event_id
            ) p1
            ON p0.p_event_id = p1.p_event_id
    ) t2
图四

图四中的session_action_rank就是[1,3,2,3,4,7,9,8,10]的下标(从1开始),对应有效访问为[1,2,3,4,7,8]。至此,得到了所有用户的有效访问记录,经过一系列的整理之后,再通过简单的统计就可以得到每一个步骤的UV转化数据。

你可能感兴趣的:(提取session内有效访问路径)