原创: 牛超 2017-02-08 微信:ncfire QQ10867910
说到SQL,很多朋友都认为无非就是增删改查,简单,不值得投入过多的精力。于是乎遇到复杂的数据处理,便借助PLSQL、T-SQL、JAVA等语言们来实现主体功能,逐渐出现弱化SQL的趋势。
本人从业以来,经历过大大小小的一些项目,而上述情况非常普遍。实在看不下去SQL被拆解得支离破碎的现状,个人感觉数据处理,SQL为王。
如何将问题转为SQL问题,即以SQL来思考问题就非常关键,这也是本文的目的。
SQL处理的对象是集合,即SET BY SET,区别于其它语言的ROW BY ROW甚至POINT BY POINT,处理能力以及性能不可等视。因此,如何合理的构造集合是SQL处理问题的关键。
用一个例子来说明:
在5X5的方格棋盘中(如图),每行、列、斜线(斜线不仅仅包括对角线)最多可以放两个球,如何摆放才能放置最多的球,这样的摆法总共有几种?输出所有的摆法。
要求: 用一句SQL实现。
输出格式: 从方格棋盘第一行至第5行,每行从第一列到第5列依次输出,0表示不放球,1表示放球。例如:1001000000000000000000000。一行输出一个行号和一个解,按解所在的列字符串顺序从大到小排序。
个人的环境是ORACLE,因不能使用PLSQL和自定义函数,既然构造集合是关键,那就头脑风暴一下:
1. 构造所有的单行集合
2. 摆列组合,为防止集合过大,可以提前行冲突检查
3. 冲突检查,包括纵向与斜线检查,斜线冲突采用移位法
解法一:最直接的方式,也是最容易想到的,但不灵活,不易扩展,
WITH BIN_LIST AS --构造单行所有可能的摆位
(
SELECT
CASE WHEN ROWNUM >= 16 THEN 1 ELSE 0 END ||
CASE WHEN MOD(ROWNUM,16) >= 8 THEN 1 ELSE 0 END ||
CASE WHEN MOD(MOD(ROWNUM,16),8) >= 4 THEN 1 ELSE 0 END ||
CASE WHEN MOD(MOD(MOD(ROWNUM,16),8), 4) >= 2 THEN 1 ELSE 0 END ||
MOD(ROWNUM,2) BIN
FROM DUAL
CONNECT BY ROWNUM <= POWER(2,5)-1
) ,
LINES AS (--最多可以放两个球--
SELECT BIN , ROWNUM NUM --BIN的索引号--
FROM BIN_LIST
WHERE REGEXP_COUNT(BIN,'1') <= 2
),
LINES_MOVE AS --移位准备--
(SELECT LINES.*, ROWNUM - (NUM - 1) * 5 RNUM
FROM LINES
START WITH NUM = 1
CONNECT BY ROWNUM <= NUM * 5),
BIN_NUM_ALL AS --移位后集合,用于斜线冲突计算--
(SELECT NUM,
BIN SBIN,--原BIN--
RPAD(BIN, 5 + (RNUM-1), '0') BINL, --左移,升序补0--
RPAD(BIN, 5 + 4 - (RNUM-1), '0') BINR, --右移,降序补0--
RNUM --1~5移位--
FROM LINES_MOVE
ORDER BY BIN, RNUM) ,
REPORT AS (
SELECT LN1.BIN BIN1,
LN1.NUM NUM1,
LN2.BIN BIN2,
LN2.NUM NUM2,
LN3.BIN BIN3,
LN3.NUM NUM3,
LN4.BIN BIN3,
LN4.NUM NUM3,
LN5.BIN BIN3,
LN5.NUM NUM3,
LN1.BIN || LN2.BIN || LN3.BIN || LN4.BIN || LN5.BIN BIN_LIST,
TO_NUMBER(LN1.BIN) + TO_NUMBER(LN2.BIN) + TO_NUMBER(LN3.BIN) + TO_NUMBER(LN4.BIN) + TO_NUMBER(LN5.BIN) SUM_VERTICAL, --纵向合计--
TO_NUMBER(MLN1.BINL) + TO_NUMBER(MLN2.BINL) + TO_NUMBER(MLN3.BINL) + TO_NUMBER(MLN4.BINL) + TO_NUMBER(MLN5.BINL) SUM_LEFT, --斜角 左下->右上 合计--
TO_NUMBER(MLN1.BINR) + TO_NUMBER(MLN2.BINR) + TO_NUMBER(MLN3.BINR) + TO_NUMBER(MLN4.BINR) + TO_NUMBER(MLN5.BINR) SUM_RIGHT --斜角 左上->右下 合计--
FROM LINES LN1 --L1
JOIN BIN_NUM_ALL MLN1
ON MLN1.NUM = LN1.NUM
AND MLN1.RNUM = 1
CROSS JOIN LINES LN2 --L2
JOIN BIN_NUM_ALL MLN2
ON MLN2.NUM = LN2.NUM
AND MLN2.RNUM = 2
JOIN LINES LN3 --L3
ON TO_CHAR(TO_NUMBER(LN1.BIN) + TO_NUMBER(LN2.BIN) + TO_NUMBER(LN3.BIN)) NOT LIKE '%3%'--纵向冲突检查--
JOIN BIN_NUM_ALL MLN3
ON MLN3.NUM = LN3.NUM
AND MLN3.RNUM = 3
AND TO_CHAR(TO_NUMBER(MLN1.BINL) + TO_NUMBER(MLN2.BINL) + TO_NUMBER(MLN3.BINL)) NOT LIKE '%3%'--斜向冲突检查--
AND TO_CHAR(TO_NUMBER(MLN1.BINR) + TO_NUMBER(MLN2.BINR) + TO_NUMBER(MLN3.BINR)) NOT LIKE '%3%'--斜向冲突检查--
JOIN LINES LN4 --L4
ON TO_CHAR(TO_NUMBER(LN1.BIN) + TO_NUMBER(LN2.BIN) + TO_NUMBER(LN3.BIN) + TO_NUMBER(LN4.BIN)) NOT LIKE '%3%'
JOIN BIN_NUM_ALL MLN4
ON MLN4.NUM = LN4.NUM
AND MLN4.RNUM = 4
AND TO_CHAR(TO_NUMBER(MLN1.BINL) + TO_NUMBER(MLN2.BINL) + TO_NUMBER(MLN3.BINL) + TO_NUMBER(MLN4.BINL)) NOT LIKE '%3%'
AND TO_CHAR(TO_NUMBER(MLN1.BINR) + TO_NUMBER(MLN2.BINR) + TO_NUMBER(MLN3.BINR) + TO_NUMBER(MLN4.BINR)) NOT LIKE '%3%'
JOIN LINES LN5 --L5
ON TO_CHAR(TO_NUMBER(LN1.BIN) + TO_NUMBER(LN2.BIN) + TO_NUMBER(LN3.BIN) + TO_NUMBER(LN4.BIN) + TO_NUMBER(LN5.BIN)) NOT LIKE '%3%'
JOIN BIN_NUM_ALL MLN5
ON MLN5.NUM = LN5.NUM
AND MLN5.RNUM = 5
AND TO_CHAR(TO_NUMBER(MLN1.BINL) + TO_NUMBER(MLN2.BINL) + TO_NUMBER(MLN3.BINL) + TO_NUMBER(MLN4.BINL) + TO_NUMBER(MLN5.BINL)) NOT LIKE '%3%'
AND TO_CHAR(TO_NUMBER(MLN1.BINR) + TO_NUMBER(MLN2.BINR) + TO_NUMBER(MLN3.BINR) + TO_NUMBER(MLN4.BINR) + TO_NUMBER(MLN5.BINR)) NOT LIKE '%3%'
)
SELECT ROWNUM, BIN_LIST
FROM (
SELECT BIN_LIST ,RANK() OVER(ORDER BY REGEXP_COUNT(BIN_LIST,'1') DESC ) RK
FROM REPORT ORDER BY BIN_LIST
)
WHERE RK = 1 ;--取摆球最多的集合--
解法二:通过对功能重新梳理,可以采用正则表达式与标准的XML函数来实现简单的表达式累加运算,同时引入参数化,灵活易扩展,但速度非常慢(因排列组合生成的笛卡尔集无法及时过滤无用行)
WITH
PARAMS AS (--参数化
SELECT 2 BOLLS_LIMIT ,
5 MATRIX_LINE_LIMIT
FROM DUAL
) ,
TESTM AS (
SELECT RPAD('1',BOLLS_LIMIT,'1') EN,
RPAD( RPAD('1',BOLLS_LIMIT,'1'),MATRIX_LINE_LIMIT,'0') LN1 , BOLLS_LIMIT, MATRIX_LINE_LIMIT
FROM PARAMS
) , TA AS (
SELECT ROWNUM RNUM , EN,
LN1,
BOLLS_LIMIT,
MATRIX_LINE_LIMIT,
LPAD(LN1,LENGTH(LN1)+BOLLS_LIMIT*(ROWNUM-1),'0') LNS,
MOD(BOLLS_LIMIT * (ROWNUM-1),MATRIX_LINE_LIMIT) LIDX
FROM TESTM
CONNECT BY ROWNUM <= MATRIX_LINE_LIMIT
) , TB AS (
SELECT BOLLS_LIMIT, MATRIX_LINE_LIMIT , LISTAGG(
RPAD( RPAD(SUBSTR( LPAD( LN1,LENGTH(LN1)+LIDX), MATRIX_LINE_LIMIT+1 , BOLLS_LIMIT),LIDX,'0') || EN ,MATRIX_LINE_LIMIT,'0')
) WITHIN GROUP (ORDER BY RNUM) LN
FROM TA
GROUP BY BOLLS_LIMIT, MATRIX_LINE_LIMIT
),
TEST_RESULT AS (--提前测试计算出每行的最大球数
SELECT LN , REGEXP_COUNT(LN,'1') F_CNT ,
BOLLS_LIMIT, MATRIX_LINE_LIMIT ,
REGEXP_COUNT(LN,'1') / MATRIX_LINE_LIMIT BOLLS_LINE_MAX
FROM TB
) ,
BIN_LIST AS ( --构造摆放集合,换一种方式
SELECT REPLACE(SYS_CONNECT_BY_PATH(BIT_NUM,','),',') BIN ,BOLLS_LIMIT , MATRIX_LINE_LIMIT,BOLLS_LINE_MAX
FROM (
SELECT ROWNUM -1 BIT_NUM , BOLLS_LIMIT , MATRIX_LINE_LIMIT,BOLLS_LINE_MAX
FROM TEST_RESULT
CONNECT BY ROWNUM <= 2
)
WHERE LEVEL = MATRIX_LINE_LIMIT
CONNECT BY LEVEL <= MATRIX_LINE_LIMIT
) ,
LINES AS ( --根据测试的最大球数构造行集合--
SELECT /*+MATERIALIZE*/
BIN_LIST.BIN , ROWNUM NUM, --BIN的索引号--
BOLLS_LIMIT , MATRIX_LINE_LIMIT
FROM BIN_LIST
WHERE REGEXP_COUNT(BIN_LIST.BIN,'1') = BOLLS_LINE_MAX
) ,
BIN_NUM_ALL AS(--构造所有可能出现行的组合
SELECT
REPLACE( SYS_CONNECT_BY_PATH( LINES.BIN,',') ,',') BIN_CONCAT,
SYS_CONNECT_BY_PATH( LINES.BIN,' 1#') BIN_EXP,
SYS_CONNECT_BY_PATH( RPAD(BIN, MATRIX_LINE_LIMIT + (LEVEL-1), '0') ,' 2#' ) BINL_EXP, --左移,升序补0--
SYS_CONNECT_BY_PATH( RPAD(BIN, MATRIX_LINE_LIMIT + (MATRIX_LINE_LIMIT - 1) - (LEVEL-1), '0') ,' 3#' ) BINR_EXP, --右移,降序补0--
'['|| (BOLLS_LIMIT + 1)||'-9]' REG_EXP
FROM LINES
WHERE LEVEL = MATRIX_LINE_LIMIT
CONNECT BY LEVEL <= MATRIX_LINE_LIMIT
) ,
RESULT1 AS (
SELECT BXC.*
,
(SELECT LISTAGG(SUM(NUM)) WITHIN GROUP(ORDER BY 1)
FROM XMLTABLE('DATA/ROW' PASSING
XMLTYPE('' ||
REPLACE(REGEXP_REPLACE(BIN_EXP || ',' || BINL_EXP || ',' ||
BINR_EXP,
'([[:digit:]]+)#([[:digit:]]+)',
'\1 \2
'),
',') || '') COLUMNS ID NUMBER PATH 'ID',
NUM NUMBER PATH 'NUM')
GROUP BY ID ) SUM_NUM_LIST --利用正则表达式转成XML后解析求和,但速度不理想--
FROM BIN_NUM_ALL BXC
) ,
RESULT2 AS (
SELECT BIN_CONCAT ,REGEXP_COUNT(BIN_CONCAT,'1') NUM_COUNT ,SUM_NUM_LIST
FROM RESULT1
WHERE NOT REGEXP_LIKE(SUM_NUM_LIST,REG_EXP) --过滤掉不满足条件的
)
SELECT * FROM RESULT2 A;
WITH PARAMS AS --参数化
(SELECT 2 BOLLS_LIMIT, 5 MATRIX_LINE_LIMIT FROM DUAL),
BIN_LIST AS --构造单行所有可能的摆位
(SELECT REPLACE(SYS_CONNECT_BY_PATH(BIT_NUM, ','), ',') BIN
FROM (SELECT ROWNUM - 1 BIT_NUM, MATRIX_LINE_LIMIT
FROM PARAMS
CONNECT BY ROWNUM <= 2)
WHERE LEVEL = MATRIX_LINE_LIMIT
CONNECT BY LEVEL <= MATRIX_LINE_LIMIT) ,
LINES AS --最多可以放两个球--
(SELECT /*+MATERIALIZE*/
BIN_LIST.BIN, ROWNUM NUM, BOLLS_LIMIT, MATRIX_LINE_LIMIT --BIN的索引号--
FROM BIN_LIST
JOIN PARAMS
ON REGEXP_COUNT(BIN_LIST.BIN, '1') BETWEEN 1 AND BOLLS_LIMIT),
LINES_REV(BIN,
NUM,
BOLLS_LIMIT,
MATRIX_LINE_LIMIT,
LVL,
BIN_LIST,
BIN_SUM1,
BIN_SUM2,
BIN_SUM3) AS --关键处理,WITH递归构造行间排列组合
(SELECT BIN,
NUM,
BOLLS_LIMIT,
MATRIX_LINE_LIMIT,
1 LVL,
BIN BIN_LIST,
TO_NUMBER(BIN) BIN_SUM1,
TO_NUMBER(RPAD(BIN, MATRIX_LINE_LIMIT + 0, '0')) BIN_SUM2,
TO_NUMBER(RPAD(BIN,
MATRIX_LINE_LIMIT + (MATRIX_LINE_LIMIT - 1) - 0,
'0')) BIN_SUM3
FROM LINES
UNION ALL
SELECT LN.BIN,
LN.NUM,
LN.BOLLS_LIMIT,
LN.MATRIX_LINE_LIMIT,
LNR.LVL + 1 LVL,
LNR.BIN_LIST || LN.BIN BIN_LIST,
LNR.BIN_SUM1 + TO_NUMBER(LN.BIN) BIN_SUM1,
LNR.BIN_SUM2 +
TO_NUMBER(RPAD(LN.BIN, LNR.MATRIX_LINE_LIMIT + LNR.LVL, '0')) BIN_SUM2,
LNR.BIN_SUM3 + TO_NUMBER(RPAD(LN.BIN,
LNR.MATRIX_LINE_LIMIT +
(LNR.MATRIX_LINE_LIMIT - 1) - LNR.LVL,
'0')) BIN_SUM3
FROM LINES_REV LNR
JOIN LINES LN
ON LNR.LVL < LNR.MATRIX_LINE_LIMIT
AND TO_CHAR(LNR.BIN_SUM1 + TO_NUMBER(LN.BIN)) NOT LIKE
'%' || TO_CHAR(LN.BOLLS_LIMIT + 1) || '%'
AND LNR.BIN_SUM2 +
TO_NUMBER(RPAD(LN.BIN, LNR.MATRIX_LINE_LIMIT + LNR.LVL, '0')) NOT LIKE
'%' || TO_CHAR(LN.BOLLS_LIMIT + 1) || '%'
AND LNR.BIN_SUM3 + TO_NUMBER(RPAD(LN.BIN,
LNR.MATRIX_LINE_LIMIT +
(LNR.MATRIX_LINE_LIMIT - 1) - LNR.LVL,
'0')) NOT LIKE
'%' || TO_CHAR(LN.BOLLS_LIMIT + 1) || '%') --及时进行冲突检查收敛集合达到有效提速的目的
,
RESULT AS
(SELECT T.*, RANK() OVER(ORDER BY BIN_SUM1 DESC) RK
FROM LINES_REV T
WHERE T.LVL = MATRIX_LINE_LIMIT)
SELECT * FROM RESULT WHERE RK = 1;
至此,这个棋盘摆放问题的几种实现方法已经介绍完,通过循序渐进的总结修正,不断地深入使用与理解SQL,特别是第三种解法,ORACLE 11G的RSF特性使SQL具有了递归的能力。
总之,Thinking in SQL,各类复杂数据处理问题都会迎刃而解。