Java 原生实现 URL Decode 功能(从 Byte 字节数组转 Unicode 字符)

Java 实现 URL Decode

背景:

ElasticSearch 自定义脚本 Painless 当中可以支持 Java 的部分 API,但 6.x 版本没有对 URLDecode 的支持(发现 7.0 已经支持),而分析用户搜索当中需要对 URL 进行 Decode

因而需要自己实现

/**
 * Hand-written URL decoder for {@code application/x-www-form-urlencoded} text whose escaped
 * bytes are UTF-8.
 *
 * <p>The percent-decoded bytes are converted to characters with plain bit arithmetic instead of
 * {@code new String(byte[], charset)}, so the logic relies only on basic string and integer
 * operations.
 */
public class MyURLDecode {

    /** Code point appended for every byte that is not part of a well-formed UTF-8 sequence. */
    private static final int REPLACEMENT_CODE_POINT = 0xFFFD;

    /**
     * Small demo: decodes a request path whose {@code s} parameter holds eight percent-encoded
     * CJK characters and prints the result.
     */
    public static void main(String[] args) {
        String str = "/controller/action?&s=%e8%ad%ac%e5%a6%82%e6%9c%9d%e9%9c%b2%e5%8e%bb%e6%97%a5%e8%8b%a6%e5%a4%9a&page=1&page_size=30";

        String decodedStr = URLDecode(str);
        // Prints the same path with the s parameter replaced by the decoded characters
        // U+8B6C U+5982 U+671D U+9732 U+53BB U+65E5 U+82E6 U+591A.
        System.out.println(decodedStr);
    }

    /**
     * Decodes a URL-encoded string: {@code '+'} becomes a space and every run of {@code %XX}
     * escapes is interpreted as UTF-8 bytes (1 to 4 bytes per character).
     *
     * @param s the encoded text; must not be {@code null}
     * @return the decoded text, or {@code s} itself when it contains neither {@code '+'} nor
     *         {@code '%'}; bytes that do not form valid UTF-8 are replaced with U+FFFD
     * @throws NullPointerException if {@code s} is {@code null}
     * @throws NumberFormatException if a {@code '%'} is followed by two characters that are not
     *         both hexadecimal digits
     * @throws IllegalArgumentException if a {@code '%'} is followed by fewer than two characters
     */
    public static String URLDecode(String s) {
        int numChars = s.length();
        StringBuilder sb = new StringBuilder(numChars);
        boolean needToChange = false;
        byte[] bytes = null;
        int i = 0;

        while (i < numChars) {
            char c = s.charAt(i);
            if (c == '+') {
                sb.append(' ');
                i++;
                needToChange = true;
            } else if (c == '%') {
                // Allocated once, at the first escape: no later run can hold more bytes than
                // the remainder of the string measured from here.
                if (bytes == null) {
                    bytes = new byte[(numChars - i) / 3];
                }
                int pos = 0;
                // Gather the whole run of consecutive escapes so that multi-byte characters
                // are decoded together.
                while (i < numChars && s.charAt(i) == '%') {
                    if (i + 2 >= numChars) {
                        throw new IllegalArgumentException(
                                "URLDecode: incomplete trailing escape (%) pattern at index " + i);
                    }
                    int hi = hexValue(s.charAt(i + 1));
                    int lo = hexValue(s.charAt(i + 2));
                    if (hi < 0 || lo < 0) {
                        throw new NumberFormatException(
                                "URLDecode: illegal hex characters in escape (%) pattern at index "
                                        + i + ": \"" + s.substring(i, i + 3) + "\"");
                    }
                    bytes[pos++] = (byte) ((hi << 4) | lo);
                    i += 3;
                }
                appendUtf8(sb, bytes, pos);
                needToChange = true;
            } else {
                sb.append(c);
                i++;
            }
        }
        return needToChange ? sb.toString() : s;
    }

    /** Returns the value of an ASCII hexadecimal digit, or -1 if {@code ch} is not one. */
    private static int hexValue(char ch) {
        if (ch >= '0' && ch <= '9') {
            return ch - '0';
        }
        if (ch >= 'a' && ch <= 'f') {
            return ch - 'a' + 10;
        }
        if (ch >= 'A' && ch <= 'F') {
            return ch - 'A' + 10;
        }
        return -1;
    }

    /** Decodes {@code bytes[0..len)} as UTF-8 and appends the resulting characters to {@code sb}. */
    private static void appendUtf8(StringBuilder sb, byte[] bytes, int len) {
        int p = 0;
        while (p < len) {
            int b0 = bytes[p] & 0xFF;
            int need = -1; // number of continuation bytes; -1 marks an invalid lead byte
            int cp = 0;    // code point being assembled
            int min = 0;   // smallest code point allowed for this length (rejects overlong forms)
            if (b0 < 0x80) {
                need = 0;
                cp = b0;
            } else if ((b0 & 0xE0) == 0xC0) {
                need = 1;
                cp = b0 & 0x1F;
                min = 0x80;
            } else if ((b0 & 0xF0) == 0xE0) {
                need = 2;
                cp = b0 & 0x0F;
                min = 0x800;
            } else if ((b0 & 0xF8) == 0xF0) {
                need = 3;
                cp = b0 & 0x07;
                min = 0x10000;
            }

            boolean valid = need >= 0 && p + need < len;
            for (int k = 1; valid && k <= need; k++) {
                int b = bytes[p + k] & 0xFF;
                if ((b & 0xC0) == 0x80) {
                    cp = (cp << 6) | (b & 0x3F);
                } else {
                    valid = false;
                }
            }

            boolean surrogate = cp >= 0xD800 && cp <= 0xDFFF;
            if (valid && cp >= min && cp <= 0x10FFFF && !surrogate) {
                sb.appendCodePoint(cp);
                p += need + 1;
            } else {
                // Skip only the offending byte so decoding can resynchronise on the next one.
                sb.appendCodePoint(REPLACEMENT_CODE_POINT);
                p++;
            }
        }
    }
}

Lucene Query

{
	"size": 0,
	"query": {
		"bool": {
		  // filter conditions go here
		}
	},
	"aggs": {
		"result": {
			"terms": {
				"size": 2,
				"script": {
					"lang": "painless",
					"source": "def m = /^\\/controller\\/action.+?s=(.+?)&.*?$/.matcher(doc['nginx.access.url'].value);\nif (!m.matches()) {\n    return \"N/A\";\n}\n// Percent-decode the captured value and turn its UTF-8 bytes into characters by hand.\nString s = m.group(1);\nint numChars = s.length();\nStringBuilder sb = new StringBuilder();\nbyte[] bytes = new byte[numChars / 3];\nint i = 0;\nwhile (i < numChars) {\n    int c = (int)s.charAt(i);\n    if (c == 43) {\n        // 43 is '+', which encodes a space.\n        sb.append(' ');\n        i++;\n        continue;\n    }\n    if (c != 37) {\n        sb.append(s.charAt(i));\n        i++;\n        continue;\n    }\n    // 37 is '%': gather the run of consecutive %XX escapes starting at i.\n    int pos = 0;\n    while (i + 2 < numChars && (int)s.charAt(i) == 37) {\n        int hi = Character.digit((int)s.charAt(i + 1), 16);\n        int lo = Character.digit((int)s.charAt(i + 2), 16);\n        if (hi < 0 || lo < 0) {\n            break;\n        }\n        bytes[pos++] = (byte)((hi << 4) | lo);\n        i += 3;\n    }\n    if (pos == 0) {\n        // Truncated or non-hex escape: keep the '%' as a literal character.\n        sb.append(s.charAt(i));\n        i++;\n        continue;\n    }\n    // Decode bytes[0..pos) as UTF-8; malformed bytes become U+FFFD.\n    int p = 0;\n    while (p < pos) {\n        int b0 = bytes[p] & 0xFF;\n        int need = -1;\n        int cp = 0;\n        int min = 0;\n        if (b0 < 0x80) {\n            need = 0;\n            cp = b0;\n        } else if ((b0 & 0xE0) == 0xC0) {\n            need = 1;\n            cp = b0 & 0x1F;\n            min = 0x80;\n        } else if ((b0 & 0xF0) == 0xE0) {\n            need = 2;\n            cp = b0 & 0x0F;\n            min = 0x800;\n        } else if ((b0 & 0xF8) == 0xF0) {\n            need = 3;\n            cp = b0 & 0x07;\n            min = 0x10000;\n        }\n        boolean valid = need >= 0 && p + need < pos;\n        for (int k = 1; valid && k <= need; k++) {\n            int b = bytes[p + k] & 0xFF;\n            if ((b & 0xC0) == 0x80) {\n                cp = (cp << 6) | (b & 0x3F);\n            } else {\n                valid = false;\n            }\n        }\n        if (valid && cp >= min && cp <= 0x10FFFF && (cp < 0xD800 || cp > 0xDFFF)) {\n            sb.appendCodePoint(cp);\n            p += need + 1;\n        } else {\n            sb.appendCodePoint(0xFFFD);\n            p++;\n        }\n    }\n}\nreturn sb.toString().toUpperCase();"
				}
			}
		}
	}
}

输出

{
	"took": 2600,
	"timed_out": false,
	"_shards": {
		"total": 278,
		"successful": 278,
		"skipped": 276,
		"failed": 0
	},
	"hits": {
		"total": 476944,
		"max_score": 0,
		"hits": []
	},
	"aggregations": {
		"result": {
			"doc_count_error_upper_bound": 1419,
			"sum_other_doc_count": 359784,
			"buckets": [
				{
					"key": "秋以为期",
					"doc_count": 6514
				},
				{
					"key": "原野苍茫",
					"doc_count": 4704
				}
			]
		}
	}
}

参考

  • java.net.URLDecoder.decode() 方法(原有方法中 new String() 构造函数 ElasticSearch 只支持无参的形式,因此需要手工进行从字节数组转为 Unicode 字符)
  • https://blog.csdn.net/hezh1994/article/details/78899683
  • https://www.elastic.co/guide/en/elasticsearch/painless/6.7/painless-api-reference.html

你可能感兴趣的:(数据库)