kmeans_random( rel_source,
expr_point,
k,
fn_dist,
agg_centroid,
max_num_iterations,
min_frac_reassigned
)
kmeanspp( rel_source,
expr_point,
k,
fn_dist,
agg_centroid,
max_num_iterations,
min_frac_reassigned,
seeding_sample_ratio
)
kmeans( rel_source,
expr_point,
rel_initial_centroids,
expr_centroid,
fn_dist,
agg_centroid,
max_num_iterations,
min_frac_reassigned
)
kmeans( rel_source,
expr_point,
initial_centroids,
fn_dist,
agg_centroid,
max_num_iterations,
min_frac_reassigned
)
closest_column( m, x )
(2)参数
simple_silhouette( rel_source,
expr_point,
centroids,
fn_dist
)
(2)参数
-- 创建原始数据表
drop table if exists t_source;
create table t_source
(cust_id int,
amount decimal(10 , 2 ),
quantity int,
dt date);
-- 添加100条数据
insert into t_source (cust_id,amount,quantity,dt) values
(567,1100.51,2,'2017-07-20'),(568,2003.47,2,'2017-07-20'),(569,297.91,2,'2017-07-14'),
(570,300.02,2,'2017-07-12'),(571,198.48,2,'2017-07-19'),(572,4003.07,3,'2017-07-20'),
(573,4003.07,3,'2017-07-20'),(574,393.39,2,'2017-06-29'),(575,611.3,3,'2017-07-13'),
(576,597.73,2,'2017-07-13'),(577,399.32,2,'2017-06-29'),(578,20026.55,2,'2017-07-18'),
(579,1997.57,2,'2017-06-22'),(580,3202.77,3,'2017-07-13'),(581,597.72,2,'2017-06-29'),
(582,600.04,3,'2017-07-14'),(583,3995.11,2,'2017-06-22'),(584,3995.11,2,'2017-06-22'),
(585,3984.27,2,'2017-06-22'),(586,7003.05,3,'2017-06-26'),(587,2001.23,2,'2017-06-27'),
(588,2001.23,2,'2017-06-19'),(589,2094.91,3,'2017-06-19'),(590,2964.5,4,'2017-06-26'),
(591,1982.58,2,'2017-06-09'),(592,3000.62,2,'2017-06-20'),(593,4000,2,'2017-06-22'),
(594,5003.06,2,'2017-06-19'),(595,2098.71,3,'2017-06-27'),(596,196.44,2,'2017-06-20'),
(597,401.46,2,'2017-07-11'),(598,20007.34,5,'2017-07-20'),(599,2001.23,2,'2017-06-22'),
(600,2961.03,2,'2017-06-21'),(601,3997.09,2,'2017-06-20'),(602,1491.21,2,'2017-07-13'),
(603,4105.47,5,'2017-07-18'),(604,1998.17,2,'2017-06-13'),(605,497.28,2,'2017-06-09'),
(606,2306.61,2,'2017-06-27'),(607,9006.76,5,'2017-07-21'),(608,5982.51,3,'2017-06-29'),
(609,2199.46,2,'2017-07-20'),(610,1088.6,2,'2017-06-20'),(611,3991.01,3,'2017-06-20'),
(612,2000.01,2,'2017-06-08'),(613,501.79,3,'2017-07-17'),(614,15002.45,5,'2017-07-13'),
(615,601.1,2,'2017-07-11'),(616,2986.69,2,'2017-06-19'),(617,2012.68,2,'2017-06-30'),
(618,1500.5,2,'2017-06-07'),(619,3988.51,3,'2017-06-19'),(620,20010.44,2,'2017-06-05'),
(621,20002.57,2,'2017-06-02'),(622,5266.72,4,'2017-07-17'),(623,5266.72,4,'2017-07-17'),
(624,7801.58,2,'2017-07-11'),(625,294.18,2,'2017-05-29'),(626,972.54,2,'2017-05-24'),
(627,1978.62,2,'2017-06-08'),(628,694.73,2,'2017-06-05'),(629,1196.04,2,'2017-06-08'),
(630,4451.68,4,'2017-06-20'),(631,2010.49,2,'2017-06-19'),(632,2994.46,2,'2017-06-30'),
(633,1000.37,2,'2017-05-29'),(634,3199.49,3,'2017-06-22'),(635,6023.75,3,'2017-06-08'),
(636,1296.24,12,'2017-07-19'),(637,4003.67,3,'2017-06-26'),(638,4001.54,2,'2017-06-19'),
(639,4000.61,3,'2017-06-23'),(640,4001.83,3,'2017-06-19'),(641,5999.42,3,'2017-06-21'),
(642,4975.83,4,'2017-06-27'),(643,12052.96,5,'2017-07-21'),(644,5010.82,2,'2017-06-07'),
(645,3001.22,2,'2017-07-13'),(646,2992.68,3,'2017-06-23'),(647,4002.45,2,'2017-06-19'),
(648,5938.52,2,'2017-05-22'),(649,4001.83,3,'2017-06-19'),(650,7141.16,2,'2017-06-28'),
(651,26010.8,16,'2017-07-20'),(652,9102.11,7,'2017-07-19'),(653,1225.07,2,'2017-05-31'),
(654,6168.28,3,'2017-06-19'),(655,2997.94,3,'2017-07-11'),(656,2972.38,2,'2017-06-07'),
(657,4303.51,2,'2017-05-23'),(658,4100.16,4,'2017-07-18'),(659,2001.23,2,'2017-06-19'),
(660,11594.24,10,'2017-07-20'),(661,12039.49,2,'2017-06-22'),(662,1494.97,2,'2017-06-13'),
(663,954.77,2,'2017-06-27'),(664,6006.78,3,'2017-06-22'),(665,25755.7,2,'2017-06-06'),
(666,60201.48,2,'2017-07-11');
-- 去掉异常值
drop table if exists t_source_change;
create table t_source_change
(row_id serial,
cust_id int,
amount decimal(10 , 2 ),
quantity int,
dt int);
insert into t_source_change (cust_id,amount,quantity,dt)
select cust_id,
amount,
quantity,
current_date-dt dt
from t_source
where amount < (select percentile_cont (0.99) within group (order by amount)
from t_source);
select * from t_source_change order by cust_id;
查询结果为:
...
94 | 660 | 11594.24 | 10 | 2
95 | 661 | 12039.49 | 2 | 30
96 | 662 | 1494.97 | 2 | 39
97 | 663 | 954.77 | 2 | 25
98 | 664 | 6006.78 | 3 | 30
99 | 665 | 25755.70 | 2 | 46
(99 rows)
可以看到,因为cust_id=666用户的金额不在99%的范围内,所以t_source_change表中去掉了该条记录。在此去除异常并非这个用户异常,而是为了改善聚类结果。最后需要给这些“异常用户”做业务解释。
-- PCA去掉相关性
drop table if exists mat;
create table mat (id integer,
row_vec double precision[]
);
insert into mat
select row_id,
string_to_array(amount||','||quantity||','||dt,',')::double precision[] row_vec
from t_source_change;
drop table if exists result_table, result_table_mean;
select madlib.pca_train('mat', -- source table
'result_table', -- output table
'id', -- row id of source table
3 -- number of principal components
);
drop table if exists residual_table, result_summary_table, out_table;
select madlib.pca_project( 'mat',
'result_table',
'out_table',
'id',
'residual_table',
'result_summary_table'
);
-- 0-1归一化
drop table if exists t_source_change_nor;
create table t_source_change_nor
as
select row_id,
string_to_array(amount_nor||','||quantity_nor||','||dt_nor,',')::double precision[] row_vec
from
(
select row_id,
(row_vec[1] - min_amount)/(max_amount - min_amount) amount_nor,
(row_vec[2] - min_quantity)/(max_quantity - min_quantity) quantity_nor,
(max_dt - row_vec[3])/(max_dt - min_dt) dt_nor
from out_table,
(select max(row_vec[1]) max_amount,
min(row_vec[1]) min_amount,
max(row_vec[2]) max_quantity,
min(row_vec[2]) min_quantity,
max(row_vec[3]) max_dt,
min(row_vec[3]) min_dt
from out_table) t) t;
select * from t_source_change_nor order by row_id;
查询结果为:
...
94 | {0.558470357737996,0.954872666162949,0.296935710714377}
95 | {0.54122257689463,0.482977156688704,0.81244230552888}
96 | {0.949697477408967,0.385844448834949,0.65901807391295}
97 | {0.970623648952883,0.62014760223173,0.704941708880569}
98 | {0.774918367989914,0.513405499602443,0.666993533505089}
99 | {0.00988267286683593,0.150872332720288,0.908966781310526}
(99 rows)
drop table if exists km_result;
create table km_result as
select * from madlib.kmeanspp( 't_source_change_nor', -- table of source data
'row_vec', -- column containing point co-ordinates
3, -- number of centroids to calculate
'madlib.squared_dist_norm2', -- distance function
'madlib.avg', -- aggregate function
20, -- number of iterations
0.001 -- fraction of centroids reassigned to keep iterating
);
\x on;
select * from km_result;
结果如下:
-[ RECORD 1 ]----+-----------------------------------------------------------------------------------------------------------------------------------------------
centroids | {{0.791217523987,0.920651641252,0.673871940211},{0.874057597294,0.532762557118,0.682720362738},{0.796875366696,0.204531299723,0.663443078965}}
cluster_variance | {3.49163639093,0.657987496465,1.91771776225}
objective_fn | 6.06734164965
frac_reassigned | 0
num_iterations | 3
select * from madlib.simple_silhouette( 't_source_change_nor',
'row_vec',
(select centroids from
madlib.kmeanspp('t_source_change_nor',
'row_vec',
3,
'madlib.squared_dist_norm2',
'madlib.avg',
20,
0.001)),
'madlib.dist_norm2'
);
结果如下:
-[ RECORD 1 ]-----+------------------
simple_silhouette | 0.640471849127657
\x off;
select cluster_id,
round(count(cust_id)/99.0,4) pct,
round(avg(amount),4) avg_amount,
round(avg(quantity),4) avg_quantity,
round(avg(dt),2) avg_dt
from
(
select t2.*,
(madlib.closest_column(centroids, row_vec)).column_id as cluster_id
from t_source_change_nor as t1, km_result, t_source_change t2
where t1.row_id = t2.row_id) t
group by cluster_id;
查询结果为:
cluster_id | pct | avg_amount | avg_quantity | avg_dt
------------+--------+------------+--------------+--------
2 | 0.1919 | 5439.9795 | 2.0526 | 48.79
1 | 0.4848 | 3447.5631 | 2.4375 | 29.56
0 | 0.3232 | 5586.0203 | 4.0313 | 5.56
(3 rows)
类别 |
占比 |
描述 |
第一类:高价值用户 |
32.3% |
购买频率高(平均4次);消费金额较高(平均5586元);最近一周有过购买行为,这部分用户需要大力发展。 |
第二类:中价值用户 |
48.5% |
购买频率中等(平均2.4次);消费金额不高(平均3447);最近一个月有个购买行为,这部分用户可以适当诱导购买。 |
第三类:高价值挽留用户 |
19.2 |
购买频率一般(平均2次);消费金额较高(平均5439元);较长时间没有购买行为,这部分客户需要尽量挽留。 |