HTTP URI编解码
字符集合
依据 RFC 3986 第 2 节的规范,HTTP URI 中允许出现的 US-ASCII 字符子集可以分成保留、未保留及转义这几类,每类的全部字符列表如下:
● 保留: : / ? # [ ] @ ! $ & ' ( ) * + , ; = 共18个,一般用作URI各部件的分隔符。
● 未保留: a-z A-Z 0-9 - . _ ~共66个,一般用于部件内数据。
● 转义: %HEXHEX,HEX表示一个十六进制数字[0-9A-F]或[0-9a-f],通常采用大写,这两个HEX就表示一个US-ASCII字符代码,转义用于在URI内部插入保留字符及原本不支持的字符。
编码原理
当构建URI的部件时,如果某个八位字节对应的字符超出了允许的字符集合,或者该字符正被用作分隔符,就需要对它进行编码;此时由具体实现决定哪些保留字符被用作子部件分隔符,哪些可以安全地作为数据使用。一个经过百分号编码的八位字节由三个字符组成:百分号字符"%",以及随后表示该八位字节数值的两个十六进制数字。
字符映射表
用于快速判断一个字符是否为未保留字符,定义如下
/*
 * Lookup table indexed by octet value (0-255).
 *
 * An entry is 1 when the octet is an RFC 3986 "unreserved" character
 * (a-z, A-Z, 0-9, '-', '.', '_', '~' -- 66 characters in total) and 0
 * otherwise. Every octet >= 128 is therefore 0.
 */
const char http_uri_table[256] =
{
    /* 0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,   /* 32:  '-' '.'      */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,   /* 48:  '0'-'9'      */
    /* 64 */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 64:  'A'-'O'      */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,   /* 80:  'P'-'Z' '_'  */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 96:  'a'-'o'      */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0,   /* 112: 'p'-'z' '~'  */
    /* 128 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 192 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};

/*
 * Non-zero when c is an unreserved URI character. The cast to unsigned char
 * keeps the index inside 0-255 even when plain char is signed and c is a
 * high-bit (negative) value.
 */
#define HTTP_CHAR_IS_UNRESERVED(c) (http_uri_table[(unsigned char)(c)])
接口实现
http_uri_encode有2个版本:一个带uri长度参数len,另一个则不带。
1
void
http_uri_encode(
const
char
*
uri, size_t len, std::
string
&
str,
bool
space_as_plus
/**/
/*=false*/
)
2 {
3 char c,buf[4];
4
5 for (size_t i = 0; i < len; i++) {
6 c = uri[i];
7 if (HTTP_CHAR_IS_UNRESERVED(c)) {
8 str.push_back(c);
9 }else if(c == ' ' && space_as_plus) {
10 str.push_back('+');
11 }else{
12 sprintf(buf,"%%%02X",(unsigned char)c);
13 str.append(buf);
14 }
15 }
16}
17
18 void http_uri_encode( const char * uri, std:: string & str, bool space_as_plus /**/ /*=false*/ )
19 {
20 char c,buf[4];
21
22 for (; c=*uri; ++uri) {
23 if (HTTP_CHAR_IS_UNRESERVED(c)) {
24 str.push_back(c);
25 }else if(c == ' ' && space_as_plus) {
26 str.push_back('+');
27 }else{
28 sprintf(buf,"%%%02X",(unsigned char)c);
29 str.append(buf);
30 }
31 }
32}
2 {
3 char c,buf[4];
4
5 for (size_t i = 0; i < len; i++) {
6 c = uri[i];
7 if (HTTP_CHAR_IS_UNRESERVED(c)) {
8 str.push_back(c);
9 }else if(c == ' ' && space_as_plus) {
10 str.push_back('+');
11 }else{
12 sprintf(buf,"%%%02X",(unsigned char)c);
13 str.append(buf);
14 }
15 }
16}
17
18 void http_uri_encode( const char * uri, std:: string & str, bool space_as_plus /**/ /*=false*/ )
19 {
20 char c,buf[4];
21
22 for (; c=*uri; ++uri) {
23 if (HTTP_CHAR_IS_UNRESERVED(c)) {
24 str.push_back(c);
25 }else if(c == ' ' && space_as_plus) {
26 str.push_back('+');
27 }else{
28 sprintf(buf,"%%%02X",(unsigned char)c);
29 str.append(buf);
30 }
31 }
32}
解码原理
当解析URI的时候,首先要根据HTTP协议分离各个部件,再将各部件内可能的转义数据进行反转义以还原。
接口实现
http_uri_decode 有两类版本:一类通过参数 str 输出解码后的结果;另一类不带 str,直接在原 uri 缓冲区上就地解码,并返回解码后的字节数。每一类又各有两个重载:一个带 uri 长度参数 len,另一个不带(此时 uri 必须以 '\0' 结尾)。
/**
 * Decodes the first `len` bytes of `uri` and appends the result to `str`.
 *
 * A "%XY" triple with two hex digits is replaced by the byte it encodes.
 * A '%' that is not followed by two hex digits inside the first `len` bytes
 * is copied through unchanged.
 *
 * @param uri          bytes to decode; need not be NUL-terminated
 * @param len          number of bytes of `uri` to decode
 * @param str          output string; decoded bytes are appended
 * @param decode_plus  > 0: turn every '+' into a space;
 *                     0:   leave '+' alone;
 *                     < 0: turn '+' into a space only after the first '?'
 *                          (i.e. inside the query part)
 */
void http_uri_decode(const char* uri, size_t len, std::string& str, int decode_plus /*= 0*/)
{
    for (size_t i = 0; i < len; ++i) {
        char c = uri[i];
        if (c == '?') {
            // Entering the query part: a negative mode switches '+' decoding on.
            if (decode_plus < 0)
                decode_plus = 1;
        } else if (c == '+' && decode_plus > 0) {
            // "> 0" matters: a negative mode must not decode '+' before '?'.
            c = ' ';
        } else if (c == '%' && i + 2 < len      // both hex digits must lie inside the buffer
                   && isxdigit(static_cast<unsigned char>(uri[i + 1]))
                   && isxdigit(static_cast<unsigned char>(uri[i + 2]))) {
            // isxdigit is only defined for unsigned char values (and EOF),
            // hence the casts above.
            const char hex[3] = { uri[i + 1], uri[i + 2], '\0' };
            c = static_cast<char>(strtol(hex, NULL, 16));
            i += 2;
        }
        str.push_back(c);
    }
}

/**
 * Decodes the NUL-terminated string `uri` and appends the result to `str`.
 *
 * A "%XY" triple with two hex digits is replaced by the byte it encodes.
 * A '%' that is not followed by two hex digits is copied through unchanged.
 *
 * @param uri          NUL-terminated input; decoding stops at the terminator
 * @param str          output string; decoded bytes are appended
 * @param decode_plus  > 0: turn every '+' into a space;
 *                     0:   leave '+' alone;
 *                     < 0: turn '+' into a space only after the first '?'
 *                          (i.e. inside the query part)
 */
void http_uri_decode(const char* uri, std::string& str, int decode_plus /*=0*/)
{
    for (; *uri != '\0'; ++uri) {
        char c = *uri;
        if (c == '?') {
            // Entering the query part: a negative mode switches '+' decoding on.
            if (decode_plus < 0)
                decode_plus = 1;
        } else if (c == '+' && decode_plus > 0) {
            // "> 0" matters: a negative mode must not decode '+' before '?'.
            c = ' ';
        } else if (c == '%'
                   // Short-circuiting keeps the reads in bounds: uri[2] is only
                   // inspected once uri[1] is a hex digit, i.e. not the terminator.
                   // isxdigit is only defined for unsigned char values (and EOF).
                   && isxdigit(static_cast<unsigned char>(uri[1]))
                   && isxdigit(static_cast<unsigned char>(uri[2]))) {
            const char hex[3] = { uri[1], uri[2], '\0' };
            c = static_cast<char>(strtol(hex, NULL, 16));
            uri += 2;
        }
        str.push_back(c);
    }
}

/**
 * In-place decode of the first `len` bytes of `uri`.
 *
 * A "%XY" triple with two hex digits is replaced by the byte it encodes.
 * A '%' that is not followed by two hex digits inside the first `len` bytes
 * is kept as is. The decoded text is never longer than the input, so it is
 * written over the front of the same buffer and then NUL-terminated.
 *
 * @param uri          buffer to decode; the terminator is written at an index
 *                     <= len, so the buffer must hold at least len + 1 bytes
 * @param len          number of input bytes to decode
 * @param decode_plus  > 0: turn every '+' into a space;
 *                     0:   leave '+' alone;
 *                     < 0: turn '+' into a space only after the first '?'
 *                          (i.e. inside the query part)
 * @return number of decoded bytes, not counting the terminator
 */
size_t http_uri_decode(char* uri, size_t len, int decode_plus /*=0*/)
{
    size_t out = 0;

    for (size_t in = 0; in < len; ++in) {
        char c = uri[in];
        if (c == '?') {
            // Entering the query part: a negative mode switches '+' decoding on.
            if (decode_plus < 0)
                decode_plus = 1;
        } else if (c == '+' && decode_plus > 0) {
            // "> 0" matters: a negative mode must not decode '+' before '?'.
            c = ' ';
        } else if (c == '%' && in + 2 < len     // both hex digits must lie inside the buffer
                   && isxdigit(static_cast<unsigned char>(uri[in + 1]))
                   && isxdigit(static_cast<unsigned char>(uri[in + 2]))) {
            // isxdigit is only defined for unsigned char values (and EOF),
            // hence the casts above.
            const char hex[3] = { uri[in + 1], uri[in + 2], '\0' };
            c = static_cast<char>(strtol(hex, NULL, 16));
            in += 2;
        }
        // out never overtakes in, so this only overwrites bytes already read.
        uri[out++] = c;
    }

    uri[out] = '\0';
    return out;
}

/**
 * In-place decode of the NUL-terminated string `uri`.
 *
 * A "%XY" triple with two hex digits is replaced by the byte it encodes.
 * A '%' that is not followed by two hex digits is kept as is. The decoded
 * text is never longer than the input, so it is written over the front of
 * the same buffer and then NUL-terminated.
 *
 * @param uri          NUL-terminated buffer, decoded in place
 * @param decode_plus  > 0: turn every '+' into a space;
 *                     0:   leave '+' alone;
 *                     < 0: turn '+' into a space only after the first '?'
 *                          (i.e. inside the query part)
 * @return number of decoded bytes, not counting the terminator
 */
size_t http_uri_decode(char* uri, int decode_plus /*=0*/)
{
    const char* src = uri;
    char* dst = uri;

    for (; *src != '\0'; ++src) {
        char c = *src;
        if (c == '?') {
            // Entering the query part: a negative mode switches '+' decoding on.
            if (decode_plus < 0)
                decode_plus = 1;
        } else if (c == '+' && decode_plus > 0) {
            // "> 0" matters: a negative mode must not decode '+' before '?'.
            c = ' ';
        } else if (c == '%'
                   // Short-circuiting keeps the reads in bounds: src[2] is only
                   // inspected once src[1] is a hex digit, i.e. not the terminator.
                   // isxdigit is only defined for unsigned char values (and EOF).
                   && isxdigit(static_cast<unsigned char>(src[1]))
                   && isxdigit(static_cast<unsigned char>(src[2]))) {
            const char hex[3] = { src[1], src[2], '\0' };
            c = static_cast<char>(strtol(hex, NULL, 16));
            src += 2;
        }
        // dst never overtakes src, so this only overwrites bytes already read.
        *dst++ = c;
    }

    *dst = '\0';
    return static_cast<size_t>(dst - uri);
}