背景
用户访问APP内页面的顺序是不固定的,为了专注于研究用户转化,只需要研究符合正常产品流程的用户行为即可,则需要对用户的操作日志记录做清洗,只保留符合产品使用流程的访问记录。
规定用户的访问1->2->3->4->5是有效的,则实际记录1->2->3->3->2中1->2->3->3是有效的,最后的2则剔除,只保留1->2->3->3。
问题一:得到规定的访问序列
问题二:根据规定的访问序列提取有效访问记录
实际业务中,页面不只是一级页面,还有二级页面,同时二级页面还有顺序,相当于是一个树的结构
问题一:得到规定的访问序列
原始数据配置如下
WITH event_road (event_key, event_id, p_event_id, group_id) AS (
    /*
     * Mock step-mapping data, one row per page.
     * Rows with p_event_id = 0 / group_id = 0 are top-level pages; the
     * remaining rows are second-level pages that name event_id 2 as parent
     * and are split into groups 1 and 2.
     */
    VALUES
        ('page1',     1, 0, 0),
        ('page2',     2, 0, 0),
        ('page2-1-1', 3, 2, 1),
        ('page2-1-2', 4, 2, 1),
        ('page2-2-1', 5, 2, 2),
        ('page2-2-2', 6, 2, 2),
        ('page3',     7, 0, 0),
        ('page4',     8, 0, 0),
        ('page5',     9, 0, 0)
)
SELECT
    event_key,
    event_id,
    p_event_id,
    group_id
FROM event_road
event_id有顺序的含义;p_event_id为二级页面独有,值为其上一级页面的event_id(一级页面取0);group_id为二级页面的组号(一级页面取0)。此表规定的正确路径有3条:两条经过二级页面的路径1->2->3->4、1->2->5->6,以及一条只含一级页面的路径1->2->7->8->9
/*
 * Builds every legal visit path from the event_road mapping.
 * Output: one row per path, event_id_seq = the path as an array of event_id
 * strings (text[]).
 * A group of second-level pages is prefixed with the top-level pages whose
 * event_id does not exceed the group's smallest event_id; the top-level
 * group (p_event_id = 0) has no prefix and is emitted as-is.
 */
SELECT
    string_to_array(
        CASE
            WHEN prefix.event_id_str IS NOT NULL
                THEN prefix.event_id_str || ',' || grp.event_id_str
            ELSE grp.event_id_str
        END,
        ','
    ) AS event_id_seq
FROM
    (
        /* One ordered, comma-separated step list per (parent page, group). */
        SELECT
            p_event_id,
            event_id_str
        FROM
            (
                SELECT
                    p_event_id,
                    group_id,
                    /*
                     * string_agg only accepts text/bytea, so the integer id is
                     * cast for aggregation; the ORDER BY stays on the original
                     * column so the step order is not turned into a string sort.
                     */
                    string_agg(event_id::TEXT, ',' ORDER BY event_id) AS event_id_str
                FROM
                    event_road
                GROUP BY
                    p_event_id,
                    group_id
            ) AS steps_by_group
        /* Collapses groups that produce an identical step list. */
        GROUP BY
            p_event_id,
            event_id_str
    ) AS grp
LEFT JOIN
    (
        /* Find where the top-level flow forks into second-level steps. */
        SELECT
            child.p_event_id,
            string_agg(
                top_level.event_id::TEXT,
                ','
                ORDER BY
                    top_level.event_id
            ) AS event_id_str
        FROM
            (
                /* Top-level steps */
                SELECT
                    *
                FROM
                    event_road
                WHERE
                    p_event_id = 0
            ) AS top_level
        INNER JOIN
            (
                /* Second-level steps: smallest event_id under each parent */
                SELECT
                    p_event_id,
                    MIN(event_id) AS min_event_id
                FROM
                    event_road
                WHERE
                    p_event_id != 0
                GROUP BY
                    p_event_id
            ) AS child
            ON top_level.event_id <= child.min_event_id
        GROUP BY
            child.p_event_id
    ) AS prefix
    ON grp.p_event_id = prefix.p_event_id
问题二:根据规定的访问序列提取有效访问记录
用户的访问记录如下:
/*
Fields:
  session_id
  session_action_seq : visit sequence number
  event_key          : page
*/
WITH event_data (session_id, session_action_seq, event_key, event_id) AS (
    /* Mock visit data for a single session */
    VALUES
        ('123', '1', 'page1',     1),
        /* NOTE(review): the step mapping assigns event_id 3 to 'page2-1-1' and 7 to 'page3' - confirm this key/id pair is intended */
        ('123', '2', 'page3',     3),
        ('123', '3', 'page2',     2),
        ('123', '4', 'page2-1-1', 3),
        ('123', '5', 'page2-1-2', 4),
        ('123', '6', 'page3',     7),
        ('123', '7', 'page5',     9),
        ('123', '8', 'page4',     8),
        ('123', '9', 'page6',     10)
)
SELECT
    session_id,
    session_action_seq,
    event_key,
    event_id
FROM event_data
按照规定的访问顺序,提取有效的路径应为(event_id):1->2->3->4和1->2->7->8,其中 session_action_seq 为2、7、9的记录无效,应剔除
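由于用纯SQL实现这个逻辑过于复杂,这里定义一个PL/pgSQL函数来完成。下面只给出函数体(DECLARE ... END),未包含CREATE FUNCTION头部;从后面的查询可知,它以 public.get_inter_arr(arr, arr_rn) 的形式被调用,其中arr应为用户实际访问的event_id数组,arr_rn应为规定的访问序列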
DECLARE
/*
 * Function body only; the CREATE FUNCTION header is not part of this snippet.
 *
 * Inputs (names as used below):
 *   arr    - the event ids actually visited, in visit order (1-based array)
 *   arr_rn - the prescribed visit sequence
 * Result: a single row holding the 1-based positions in arr that follow
 *   arr_rn, in ascending order; NULL when the first step of arr_rn never
 *   occurs (or either array is empty/NULL).
 *
 * A position is kept when its value repeats the step currently reached
 * (1->2->3->3 keeps both 3s) or is the next step of arr_rn. Anything else,
 * such as going back to an earlier step or jumping ahead, is skipped, so for
 * 1->2->3->3->2 the trailing 2 is dropped.
 */
arr_len INT := COALESCE(array_upper(arr, 1), 0);       /* 0 keeps the loop from raising on an empty/NULL arr */
seq_len INT := COALESCE(array_upper(arr_rn, 1), 0);
cur_step INT := 0;                                     /* index in arr_rn of the step reached so far; 0 = none yet */
val INT[];                                             /* accepted positions; stays NULL until the first match */
BEGIN
FOR idx IN 1..arr_len
LOOP
    IF cur_step > 0 AND arr[idx] = arr_rn[cur_step] THEN
        /* Repeat visit of the step we are currently on. */
        val := array_append(val, idx);
    ELSIF cur_step < seq_len AND arr[idx] = arr_rn[cur_step + 1] THEN
        /* Advance to the next prescribed step. */
        cur_step := cur_step + 1;
        val := array_append(val, idx);
    END IF;
END LOOP;
RETURN NEXT val;
END;
问题转化为,从[1,3,2,3,4,7,9,8,10]中找出符合[1,2,3,4],[1,2,5,6],[1,2,7,8,9]三种顺序的序列,结果为[1,2,3,4],[1,2],[1,2,7,8],去重之后的有效访问为[1,2,3,4,7,8]
/*
 * Keeps only the visits of each session that follow a legal path.
 * Every session is paired with every legal path (join on 1=1) and
 * public.get_inter_arr returns the accepted array positions for that pair;
 * unnest emits one row per accepted position. The same position can
 * therefore appear once per path that accepts it.
 */
SELECT
    session_id,
    /* Drop illegal records: only positions accepted by the path survive. */
    unnest(public.get_inter_arr(event_id_arr, event_id_seq::INT[]))::BIGINT AS session_action_rank
FROM
    (
        SELECT
            session_id,
            /*
             * Array positions must follow the visit order, so the aggregate is
             * ordered explicitly. session_action_seq is a string in the sample
             * data, hence the cast (otherwise '10' would sort before '2').
             */
            ARRAY_AGG(event_id ORDER BY session_action_seq::INT) AS event_id_arr
        FROM
            event_data
        GROUP BY
            session_id
    ) AS visits
LEFT JOIN
    (
        /* Generate the legal paths from the mapping table. */
        SELECT
            string_to_array(
                CASE
                    WHEN prefix.event_id_str IS NOT NULL
                        THEN prefix.event_id_str || ',' || grp.event_id_str
                    ELSE grp.event_id_str
                END,
                ','
            ) AS event_id_seq
        FROM
            (
                /* One ordered, comma-separated step list per (parent page, group). */
                SELECT
                    p_event_id,
                    event_id_str
                FROM
                    (
                        SELECT
                            p_event_id,
                            group_id,
                            /* string_agg only accepts text/bytea; order by the original column. */
                            string_agg(event_id::TEXT, ',' ORDER BY event_id) AS event_id_str
                        FROM
                            event_road
                        GROUP BY
                            p_event_id,
                            group_id
                    ) AS steps_by_group
                GROUP BY
                    p_event_id,
                    event_id_str
            ) AS grp
        LEFT JOIN
            (
                /* Find where the top-level flow forks into second-level steps. */
                SELECT
                    child.p_event_id,
                    string_agg(
                        top_level.event_id::TEXT,
                        ','
                        ORDER BY
                            top_level.event_id
                    ) AS event_id_str
                FROM
                    (
                        /* Top-level steps */
                        SELECT
                            *
                        FROM
                            event_road
                        WHERE
                            p_event_id = 0
                    ) AS top_level
                INNER JOIN
                    (
                        /* Second-level steps: smallest event_id under each parent */
                        SELECT
                            p_event_id,
                            MIN(event_id) AS min_event_id
                        FROM
                            event_road
                        WHERE
                            p_event_id != 0
                        GROUP BY
                            p_event_id
                    ) AS child
                    ON top_level.event_id <= child.min_event_id
                GROUP BY
                    child.p_event_id
            ) AS prefix
            ON grp.p_event_id = prefix.p_event_id
    ) AS legal_paths
    ON 1=1
图四中的session_action_rank就是[1,3,2,3,4,7,9,8,10]的下标(从1开始),去重后为1、3、4、5、6、8,对应有效访问为[1,2,3,4,7,8]。至此,得到了所有用户的有效访问记录,经过一系列的整理之后,再通过简单的统计就可以得到每一个步骤的UV转化数据。