正则清洗数据

测试样例

WITH t as(SELECT '96COTTON ,4SP' fabric FROM dummy)
SELECT REPLACE_REGEXPR('(\d+\.?\d*)'  IN fabric 
    WITH '\1%' OCCURRENCE ALL) FROM t 
;
WITH t as(SELECT 'N/R/SP 45/50/5' fabric FROM dummy)
SELECT fabric,REPLACE_REGEXPR('(\w+)/(\w+)/(\w+) (\d+)/(\d+)/(\d+)'  IN fabric 
    WITH '\4%\1,\5%\2,\6%\3' OCCURRENCE ALL) FROM t 
WHERE FABRIC like_regexpr '\w+/\w+/\w+\s+\d+/\d+/\d+'
;
  1. 预处理
UPDATE LINEZONEDEV.DIM_SKU2 SET FABRIC =replace(FABRIC ,',',',');
UPDATE LINEZONEDEV.DIM_SKU2 SET FABRIC =replace(FABRIC ,'%','%');
  1. 只有1种成分(没有数据or只有1个单词)
UPDATE dim_sku2 SET ftype=5 ,FABRIC2 ='100%'||fabric --select distinct fabric from dim_sku2
where (fabric like_regexpr '^\w+$' or fabric NOT like_regexpr '\d') 
AND fabric NOT like_regexpr '\+|/'
  1. 比例换算型(数字小于10)
select *,REPLACE_REGEXPR('(\w+)/(\w+)\s+(\d)/(\d)' IN fabric 
  WITH '\3%\1,\4%\2' OCCURRENCE ALL)
from dim_sku2 
where fabric like_regexpr '\w+/\w+\s+\d/\d'
-- 97/3 T/R
UPDATE DIM_SKU2 
SET fabric=REPLACE_REGEXPR('(\d+)/(\d+)\s+(\w+)/(\w+)'  IN fabric 
    WITH '\1%\3,\2%\4' OCCURRENCE ALL)  
WHERE FABRIC like_regexpr '\d+/\d+\s+\w+/\w+'
-- N/R/SP 45/50/5
UPDATE DIM_SKU2 
SET fabric=REPLACE_REGEXPR('(\w+)/(\w+)/(\w+)\s+(\d+)/(\d+)/(\d+)'  IN fabric 
    WITH '\4%\1,\5%\2,\6%\3' OCCURRENCE ALL)  
WHERE FABRIC like_regexpr '\w+/\w+/\w+\s+\d+/\d+/\d+'
;
  1. 补逗号,补百分比
UPDATE dim_sku2 SET ftype=2 ,FABRIC2 =REPLACE_REGEXPR('(\d+)' IN fabric 
  WITH ',\1%' OCCURRENCE ALL)
where fabric NOT like_regexpr ',' AND fabric NOT like_regexpr '%' AND FABRIC2 =''
  1. 有逗号,补百分比 196rows
UPDATE dim_sku2 SET ftype=3 ,FABRIC2 =REPLACE_REGEXPR('(\d+\.?\d*)' IN fabric 
  WITH '\1%' OCCURRENCE ALL)
where fabric like_regexpr ',' AND fabric NOT like_regexpr '%'
AND FABRIC2 =''
  1. 有百分比,补逗号 154401rows
UPDATE dim_sku2 SET ftype=4 ,FABRIC2 =REPLACE_REGEXPR('(\d+\.?\d*%)' IN fabric 
  WITH ',\1' OCCURRENCE ALL)
where fabric NOT like_regexpr ',' AND fabric like_regexpr '%'
AND FABRIC2 =''

克重清洗

  1. 单位是盎司的需转化
  • 已填写的单位情况(没有盎司)
SELECT DISTINCT uom_of_weight 
FROM LINEZONEUSER.DIM_SKU 
WHERE uom_of_weight IS NOT null

METER
KG
sqft
KGS
SF
YARD
PC

  • 盎司的换算
LINEZONEUSER.DIM_SKU 
set weight=
where uom_of_weight like_regexpr '[oz|OZ]'
  • 克重里填了盎司单位的
-- 7.5OZ 8.9/9.2 OZ
select weight,REPLACE_REGEXPR('(\d[\x\d]*)' IN fabric
WITH '' OCCURRENCE ALL)*28.35
from LINEZONEUSER.DIM_SKU 
where weight like_regexpr '(oz|OZ)'
and weight like_regexpr '%'

修复

  1. 100%置空
update LINEZONEUSER.DIM_SKU 
SET weight=0
weight not like_regexpr '100%'
  1. 盎司转克
-- 8.9/9.2 OZ
update LINEZONEUSER.DIM_SKU 
SET weight= 
    REPLACE_REGEXPR('(\d+[\.\d]*)(.*)' IN weight WITH '\1' OCCURRENCE ALL)*28.35
where weight like_regexpr '(oz|OZ)' 
--  AND  weight not like_regexpr '100%' --忽略填成100%
  1. 带g或G的去掉无关字符
select weight,  
    REPLACE_REGEXPR('(^|约)(\d+[\.\d]*)(.*)' IN weight WITH '\2' OCCURRENCE ALL)--*28.35
from LINEZONEUSER.DIM_SKU 
where weight like_regexpr '[gG]' 
  AND  weight LIKE '约%'
update LINEZONEUSER.DIM_SKU 
SET weight= 
    REPLACE_REGEXPR('(^|约)(\d+[\.\d]*)(.*)' IN weight WITH '\2' OCCURRENCE ALL)*28.35
where weight like_regexpr '[gG]' 
  AND  weight LIKE '约%'
  1. 区间的简单取左点(-~/隔开)
update LINEZONEUSER.DIM_SKU 
SET weight= 
    REPLACE_REGEXPR('(\d+[\.\d]*)(.*)' IN weight WITH '\1' OCCURRENCE ALL)
where weight like_regexpr  '\d[\/\-~]'
  1. 剩余没法处理:20种

/
103士
12"
155CM
185士
210SQ.M
50Z
MAKE IT LIGHTER
N/A
N/a
n/a
NA
NO
No
no
NO INFORMATION
none
·

你可能感兴趣的:(正则清洗数据)