1 返回字符串A从start位置到结尾的字符串
select substring('abcde',3);
cde
select substring('abcde',-2);
de
2 返回字符串A从start位置开始,长度为len的字符串
select substring('abcde',3,2);
cd
select substring('abcde',-2,2);
de
3 注意事项
1) 开始位置的索引是从1开始的
// 1 构造测试数据
val arr = Array("abcde")
val df = spark.sparkContext.makeRDD(arr).toDF("id")
// 2 操作举例
val res = df
.withColumn("id2", substring($"id", 2, 2)) // 结果bc, 注意截取起始位置索引是从1开始的
.withColumn("id3", substring($"id", -2, 2)) // 结果de, 截取字符串的后两位
.withColumn("id4", $"id".substr(2, 2)) // 结果bc, 注意截取起始位置索引是从1开始的
.withColumn("id5", $"id".substr(-2, 2)) // 结果de, 截取字符串的后两位
// 3 展示结果
res.show()
+-----+---+---+---+---+
| id|id2|id3|id4|id5|
+-----+---+---+---+---+
|abcde| bc| de| bc| de|
+-----+---+---+---+---+
1. 分割成单字符数组
-- 方式01
select filter(split('abc', ''), x -> x!='') arr
+---------+
| arr|
+---------+
|[a, b, c]|
+---------+
-- 方式02
select array_remove(split('abc', ''), '') arr
+---------+
| arr|
+---------+
|[a, b, c]|
+---------+
-- ltrim(trimStr, str) - Removes the leading string contains the characters from the trim string
SELECT ltrim('Sp', 'SSparkSQLS'); -- 删除字符串开头连续出现的'S'或'p'字符, 直到遇到其他字符为止
arkSQLS
regexp_extract(str, regexp[, idx]) - Extract the first string in the str that match the regexp expression and corresponding to the regex group index.
> SELECT regexp_extract('100-200', '(\\d+)-(\\d+)', 1);
100
1) 解析参数--URL不规范也可以解析
select regexp_extract('https://rasre.com?pageId=385','pageId=([^&]*)');
385
parse_url(url, partToExtract[, key]) - Extracts a part from a URL.
Examples:
SELECT parse_url('http://spark.apache.org/path?query=1', 'HOST');
spark.apache.org
SELECT parse_url('http://spark.apache.org/path?query=1', 'QUERY');
query=1
SELECT parse_url('http://spark.apache.org/path?query=1', 'QUERY', 'query');
1
1 concat和concat_ws对null的不同处理
1) concat只要有一个参数为null,结果就为null
select concat('620',null,'12','25');
NULL
2) concat_ws遇到为null的参数时会直接跳过(视为该元素不存在), 不会导致结果为null
select concat_ws('-','620',null,'12','25');
620-12-25
1) 参考链接
正则表达式参考01
2)regexp_extract
regexp_extract(str, regexp[, idx]) - Extracts a group that matches regexp.
Examples:
SELECT regexp_extract('100-200', '(\\d+)-(\\d+)', 1);
100
3)regexp_replace
regexp_replace(str, regexp, rep) - Replaces all substrings of str that match regexp with rep.
Examples:
> SELECT regexp_replace('100-200', '(\\d+)', 'num');
num-num
4)replace 非正则,精确的替换
replace(str, search[, replace]) - Replaces all occurrences of search with replace.
Examples:
> SELECT replace('ABCabc', 'abc', 'DEF');
ABCDEF
5)translate 多个单个字符的替换
translate(input, from, to) - Translates the input string by replacing the characters present in the from string with the corresponding characters in the to string.
Examples:
> SELECT translate('AaBbCc', 'abc', '123');
A1B2C3