在数据挖掘中经常会遇到多个变量的问题,而且在多数情况下,多个变量之间常常存在一定的相关性。例如,网站的“浏览量”和“访客数”往往具有较强的相关关系,而电商应用中的“下单数”和“成交数”也具有较强的相关关系。这里的相关关系可以直观理解为当浏览量较高(或较低)时,应该很大程度上认为访客数也较高(或较低)。这个简单的例子中只有两个变量,当变量个数较多且变量之间存在复杂关系时,会显著增加分析问题的复杂性。主成分分析方法可以将多个变量综合为少数几个代表性变量,使这些变量既能够代表原始变量的绝大多数信息又互不相关,这种方法有助于对问题的分析和建模。
MADlib提供了两个主成分分析函数:训练函数与投影函数。训练函数以原始数据为输入,输出主成分。投影函数将原始数据投影到主成分上,实现线性无关降维,输出降维后的数据矩阵。
pca_train( source_table,
out_table,
row_id,
components_param,
grouping_cols,
lanczos_iter,
use_correlation,
result_summary_table
)
pca_sparse_train( source_table,
out_table,
row_id,
col_id, -- Sparse matrices only
val_id, -- Sparse matrices only
row_dim, -- Sparse matrices only
col_dim, -- Sparse matrices only
components_param,
grouping_cols,
lanczos_iter,
use_correlation,
result_summary_table
)
{TABLE|VIEW} source_table (
row_id INTEGER,
row_vec FLOAT8[]
)
或
{TABLE|VIEW} source_table (
row_id INTEGER,
col1 FLOAT8,
col2 FLOAT8,
...
)
注意row_id作为入参是输入矩阵的行标识,必须是从1开始且连续的整数。PCA的稀疏矩阵输入表的格式为:
{TABLE|VIEW} source_table (
...
row_id INTEGER,
col_id INTEGER,
val_id FLOAT8,
...
)
row_id和col_id列指示矩阵下标,是正整数,val_id列定义非0的矩阵元素值。
-- Call each training function (dense, then sparse) with the single argument
-- 'usage'; presumably this returns the function's built-in help text.
select madlib.pca_train('usage');
select madlib.pca_sparse_train('usage');
madlib.pca_project( source_table,
pc_table,
out_table,
row_id,
residual_table,
result_summary_table
)
稀疏矩阵的投影函数为:
madlib.pca_sparse_project( source_table,
pc_table,
out_table,
row_id,
col_id, -- Sparse matrices only
val_id, -- Sparse matrices only
row_dim, -- Sparse matrices only
col_dim, -- Sparse matrices only
residual_table,
result_summary_table
)
-- Call each projection function (dense, then sparse) with the single argument
-- 'usage'; presumably this returns the function's built-in help text.
select madlib.pca_project('usage');
select madlib.pca_sparse_project('usage');
企业编号 | 净利润率(%) | 固定资产利润率(%) | 总产值利润率(%) | 销售收入利润率(%) | 产品成本利润率(%) | 物耗利润率(%) | 人均利润(千元/人) | 流动资产利润率(%)
---------+-------------+-------------------+-----------------+-------------------+-------------------+---------------+-------------------+------------------
1  | 40.4 | 24.7 | 7.2  | 6.1  | 8.3  | 8.7  | 2.442 | 20
2  | 25   | 12.7 | 11.2 | 11   | 12.9 | 20.2 | 3.542 | 9.1
3  | 13.2 | 3.3  | 3.9  | 4.3  | 4.4  | 5.5  | 0.578 | 3.6
4  | 22.3 | 6.7  | 5.6  | 3.7  | 6    | 7.4  | 0.176 | 7.3
5  | 34.3 | 11.8 | 7.1  | 7.1  | 8    | 8.9  | 1.726 | 27.5
6  | 35.6 | 12.5 | 16.4 | 16.7 | 22.8 | 29.3 | 3.017 | 26.6
7  | 22   | 7.8  | 9.9  | 10.2 | 12.6 | 17.6 | 0.847 | 10.6
8  | 48.4 | 13.4 | 10.9 | 9.9  | 10.9 | 13.9 | 1.772 | 17.8
9  | 40.6 | 19.1 | 19.8 | 19   | 29.7 | 39.6 | 2.449 | 35.8
10 | 24.8 | 8    | 9.8  | 8.9  | 11.9 | 16.2 | 0.789 | 13.7
11 | 12.5 | 9.7  | 4.2  | 4.2  | 4.6  | 6.5  | 0.874 | 3.9
12 | 1.8  | 0.6  | 0.7  | 0.7  | 0.8  | 1.1  | 0.056 | 1
13 | 32.3 | 13.9 | 9.4  | 8.3  | 9.8  | 13.3 | 2.126 | 17.1
14 | 38.5 | 9.1  | 11.3 | 9.5  | 12.2 | 16.4 | 1.327 | 11.6
15 | 26.2 | 10.1 | 5.6  | 15.6 | 7.7  | 30.1 | 0.126 | 25.9
-- Sample data set: one row per enterprise, eight profitability indicators each.
-- Re-runnable: any earlier copy of the table is dropped first.
drop table if exists mat;
create table mat (
    id      integer,            -- enterprise number (1..15); passed to PCA as the row id
    row_vec double precision[]  -- the eight indicator values of that enterprise
);

-- Explicit column list so the load does not depend on the physical column order.
insert into mat (id, row_vec) values
    (1,  '{40.4, 24.7, 7.2, 6.1, 8.3, 8.7, 2.442, 20}'),
    (2,  '{25, 12.7, 11.2, 11, 12.9, 20.2, 3.542, 9.1}'),
    (3,  '{13.2, 3.3, 3.9, 4.3, 4.4, 5.5, 0.578, 3.6}'),
    (4,  '{22.3, 6.7, 5.6, 3.7, 6, 7.4, 0.176, 7.3}'),
    (5,  '{34.3, 11.8, 7.1, 7.1, 8, 8.9, 1.726, 27.5}'),
    (6,  '{35.6, 12.5, 16.4, 16.7, 22.8, 29.3, 3.017, 26.6}'),
    (7,  '{22, 7.8, 9.9, 10.2, 12.6, 17.6, 0.847, 10.6}'),
    (8,  '{48.4, 13.4, 10.9, 9.9, 10.9, 13.9, 1.772, 17.8}'),
    (9,  '{40.6, 19.1, 19.8, 19, 29.7, 39.6, 2.449, 35.8}'),
    (10, '{24.8, 8, 9.8, 8.9, 11.9, 16.2, 0.789, 13.7}'),
    (11, '{12.5, 9.7, 4.2, 4.2, 4.6, 6.5, 0.874, 3.9}'),
    (12, '{1.8, 0.6, 0.7, 0.7, 0.8, 1.1, 0.056, 1}'),
    (13, '{32.3, 13.9, 9.4, 8.3, 9.8, 13.3, 2.126, 17.1}'),
    (14, '{38.5, 9.1, 11.3, 9.5, 12.2, 16.4, 1.327, 11.6}'),
    (15, '{26.2, 10.1, 5.6, 15.6, 7.7, 30.1, 0.126, 25.9}');
-- Clear the outputs of any previous training run so the call below can
-- create them again. NOTE(review): result_table_mean is presumably a
-- companion table written by pca_train; confirm against the MADlib docs.
drop table if exists
    result_table,
    result_table_mean;

-- Train PCA on the dense matrix stored in mat, keeping three components.
select madlib.pca_train(
    'mat',           -- source_table: input matrix
    'result_table',  -- out_table: receives the principal components
    'id',            -- row_id: row identifier column of the source table
    3                -- components_param: number of principal components
);

-- One row per principal component, in component order.
select
    row_id,
    principal_components,
    std_dev,
    proportion
from result_table
order by row_id;
row_id | principal_components | std_dev | proportion
--------+------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+--------------------
1 | {0.54951113056651,0.22059946589484,0.221798212432593,0.234122486371152,0.323859692705692,0.460000381165952,0.0350228413699574,0.477130544733463} | 19.4865050268107 | 0.743985609635922
2 | {-0.679390383883709,-0.258058109137903,0.0922351692999137,0.224015136531732,0.25512757908769,0.589575496706845,-0.0194312272521382,0.00881420572763583} | 9.06653318232267 | 0.161056826858628
3 | {-0.293070645355269,0.116853642654378,-0.357654514354919,-0.0363160573731402,-0.370902632819521,-0.0699941930259179,-0.0562050276014156,0.790943904553534} | 5.06251973016091 | 0.0502146089885736
(3 rows)
可以看到,主成分数量为3时,累积方差比例约为0.955(0.744+0.161+0.050),即反映了约95.5%的原始信息,而维度已经从8个降低为3个。
-- Clear the three output tables of any previous projection run.
drop table if exists
    residual_table,
    result_summary_table,
    out_table;

-- Project the original rows of mat onto the trained principal components.
select madlib.pca_project(
    'mat',                  -- source_table: original data
    'result_table',         -- pc_table: principal components from training
    'out_table',            -- out_table: projected (reduced) data
    'id',                   -- row_id: row identifier column of the source table
    'residual_table',       -- residual_table: per-row residuals
    'result_summary_table'  -- result_summary_table: execution summary
);
dm=# select * from out_table order by row_id;
row_id | row_vec
--------+---------------------------------------------------------
1 | {7.08021173666004,-17.6113408380368,3.62504928877283}
2 | {-0.377499074086432,5.25083911315586,-6.06667264391957}
3 | {-24.3659516926199,2.69294552046529,-0.854680487274521}
4 | {-15.235298685282,-2.7756923532236,-1.48789032627869}
5 | {4.64264829479036,-9.80192214158058,9.97441166441563}
6 | {23.6146598612176,7.91277187194797,-1.70125446716029}
7 | {-4.25445515316499,6.71053107113929,-3.63489574437095}
8 | {12.8547303317577,-15.2151276724561,-4.53202062778529}
9 | {40.4531114732088,11.566606363421,0.333514089765778}
10 | {-2.3918721025776,3.48063922820141,-1.53633678788746}
11 | {-22.6173674430242,2.15970955881415,0.0711392924992467}
12 | {-37.2273102800874,6.50778045364591,3.06216108712083}
13 | {2.45676837959725,-5.55018275237518,0.715863146049784}
14 | {5.05828673790116,-5.6726215744102,-7.79762716115412}
15 | {10.3093376158273,10.3450641508798,9.82923967771456}
(15 rows)
dm=# select * from result_summary_table;
exec_time | residual_norm | relative_residual_norm
---------------+---------------+------------------------
7834.64002609 | 17.8804330669 | 0.0999460602087
(1 row)
dm=# select * from residual_table order by row_id;
row_id | row_vec
--------+----------------------------------------------------------------------------------------------------------------------------------------------------------
1 | {-2.25323523421767,7.27642620892876,-0.31614472682197,-0.494115688859853,1.00468388047691,0.433380657606633,0.599100230104459,-1.52349927341588}
2 | {-0.863154123125062,3.95387717098127,-0.237022939622535,0.678462615857513,-1.40752199307202,1.2065851169394,1.85980716819093,-1.40110122200098}
3 | {0.308441190889249,-1.42340844662228,-0.116406757628345,0.356984675694431,0.447101713452669,-0.585836964458252,-0.0208119097031971,0.444674616668891}
4 | {0.490130533527738,-1.37485909630399,-0.163638890332356,-1.17864453350125,0.250392381501536,0.293913050865083,-0.884445243345333,0.337196335237827}
5 | {0.152688703669817,-3.81251089756285,1.67507754859814,-0.442271462365288,1.85670956345357,-2.40516264125975,0.477083304830599,2.04871340760574}
6 | {-0.3592450632185,-1.36196195708871,0.957346795421167,0.323580259078128,1.66239742146767,-1.99367354425669,0.791616661159451,1.17524319516153}
7 | {-0.0283404999568164,0.00165496908974205,0.057980039554014,0.54746466878389,0.0776125903500138,-0.300401958384908,-0.5343691201399,0.0124478027687696}
8 | {1.81096896189017,-3.72588393721226,-1.03535063544512,1.12091902152632,-1.90226870931166,0.993403809233695,-0.685046795881378,-0.0480344625583826}
9 | {-1.53345069357698,2.22861611294688,1.01334044488596,-2.06329935977889,2.93160872351416,-1.45115700607537,-0.180751047312987,0.699510168883641}
10 | {0.168817950757148,-1.28795389715317,0.593331153806535,-0.388851856449367,0.376792544638479,-0.506038843322491,-0.602413269186595,0.592379053435519}
11 | {-1.44337010134268,4.34506352414871,0.176067114438398,0.000676834542009175,0.560237949521619,-0.011001520447941,0.255621998627472,-0.817199798912856}
12 | {-0.284425653421206,-0.759447727683328,0.585233711708927,-0.944218598761936,1.49187460452629,-1.04458414728561,0.201902525223109,0.849594937231783}
13 | {-0.470963886374596,0.948783991627835,0.756380230030218,-0.0191957806666807,-0.154128037749362,-0.154423041481078,0.515878569618902,-0.0228197077983638}
14 | {1.72123496250158,-3.46187354552954,-0.954226967261204,0.289982019939877,-1.72309342301114,1.22516956751968,-0.855114089441233,-0.0293111133050323}
15 | {2.58390295180211,-1.54652247225864,-2.99196612118835,2.21252718509424,-5.47239920950344,4.29982746453179,-0.938058982777885,-2.31779393895638}
(15 rows)
out_table为降维后,投影到主成分的数据表。residual_table中的数据表示与每个原始数据项对应的误差,越接近零说明误差越小。result_summary_table表中包含函数执行概要信息。
-- Rank the rows of out_table by the sum of their projected scores, highest first.
select
    row_id,
    row_vec,
    madlib.array_sum(row_vec) as r
from out_table
order by r desc;
row_id | row_vec | r
--------+----------------------------------------------------------+-------------------
9 | {40.4531114732088,11.566606363421,-0.333514089765778} | 51.686203746864
6 | {23.6146598612176,7.91277187194796,1.70125446716029} | 33.2286862003258
2 | {-0.377499074086431,5.25083911315585,6.06667264391957} | 10.940012682989
15 | {10.3093376158273,10.3450641508798,-9.82923967771456} | 10.8251620889925
14 | {5.05828673790117,-5.6726215744102,7.79762716115411} | 7.18329232464508
7 | {-4.254455153165,6.71053107113929,3.63489574437095} | 6.09097166234524
10 | {-2.3918721025776,3.48063922820141,1.53633678788746} | 2.62510391351128
8 | {12.8547303317577,-15.2151276724561,4.53202062778528} | 2.17162328708682
13 | {2.45676837959725,-5.55018275237518,-0.715863146049783} | -3.80927751882771
1 | {7.08021173666005,-17.6113408380368,-3.62504928877282} | -14.1561783901495
5 | {4.64264829479035,-9.80192214158058,-9.97441166441563} | -15.1336855112059
4 | {-15.235298685282,-2.7756923532236,1.48789032627869} | -16.5231007122269
11 | {-22.6173674430242,2.15970955881415,-0.0711392924992431} | -20.5287971767093
3 | {-24.3659516926199,2.69294552046528,0.85468048727452} | -20.8183256848801
12 | {-37.2273102800874,6.50778045364591,-3.06216108712083} | -33.7816909135623
(15 rows)
从该结果可知,第9家企业的综合实力最强,第12家企业的综合实力最弱。row_vec中的三列为各主成分的得分。需要注意,主成分的方向(符号)并不唯一:此处输出中第三主成分得分的符号与前面out_table查询结果中的符号相反,而按得分直接求和的排序会受符号影响,使用时应先确认各主成分的方向。以上应用示例比较简单,真实场景中,PCA方法还要根据实际问题和需求灵活使用。