Modified UTF-8 与 UTF-32 相互转换
自己的实现,经过一定的测试。
头文件
1
/**/
/*
2Convert Modified UTF-8 <==> UTF-32.
3*/
4
5
6 /**/ /*
7function : Convert Modified UTF-8 to UTF-32.
8input : str_mutf8, a null terminated string in Modified UTF-8.
9output : str_utf32, a null terminated string in UTF-32.
10input : str_utf32_limit, the max length(character count)
11 of str_utf32 plus one(for 'null'), str_utf32 must have enough space
12 for str_utf32_limit characters.
13return : -1 for errors;
14 else the length(character count) of str_utf32,
15 maybe larger than (str_utf32_limit-1) if the space
16 of str_utf32 isn't enough.
17note : convert 0xc080 to U+0000 (this does not end the string)
18 convert 0x00 to U+0000 (this ends the string)
19*/
20 int mutf8_to_utf32( const unsigned char * str_mutf8,
21 unsigned int * str_utf32, int str_utf32_limit );
22
23 /**/ /*
24function : Convert UTF-32 to Modified UTF-8.
25input : str_utf32, a null terminated string in UTF-32.
26output : str_mutf8, a null terminated string in Modified UTF-8.
27input : str_mutf8_limit, the max length(byte count)
28 of str_mutf8 plus one(for 'null'), str_mutf8 must have enough space
29 for str_mutf8_limit bytes.
30return : -1 for errors;
31 else the length(byte count) of str_mutf8,
32 maybe larger than (str_mutf8_limit-1) if the space
33 of str_mutf8 isn't enough.
34note : convert U+0000 to 0x00, not 0xc080 (U+0000 ends the string)
35*/
36 int utf32_to_mutf8( const unsigned int * str_utf32,
37 unsigned char * str_mutf8, int str_mutf8_limit );
38
39
2Convert Modified UTF-8 <==> UTF-32.
3*/
4
5
6 /**/ /*
7function : Convert Modified UTF-8 to UTF-32.
8input : str_mutf8, a null terminated string in Modified UTF-8.
9output : str_utf32, a null terminated string in UTF-32.
10input : str_utf32_limit, the max length(character count)
11 of str_utf32 plus one(for 'null'), str_utf32 must have enough space
12 for str_utf32_limit characters.
13return : -1 for errors;
14 else the length(character count) of str_utf32,
15 maybe larger than (str_utf32_limit-1) if the space
16 of str_utf32 isn't enough.
17note : convert 0xc080 to U+0000 (this does not end the string)
18 convert 0x00 to U+0000 (this ends the string)
19*/
20 int mutf8_to_utf32( const unsigned char * str_mutf8,
21 unsigned int * str_utf32, int str_utf32_limit );
22
23 /**/ /*
24function : Convert UTF-32 to Modified UTF-8.
25input : str_utf32, a null terminated string in UTF-32.
26output : str_mutf8, a null terminated string in Modified UTF-8.
27input : str_mutf8_limit, the max length(byte count)
28 of str_mutf8 plus one(for 'null'), str_mutf8 must have enough space
29 for str_mutf8_limit bytes.
30return : -1 for errors;
31 else the length(byte count) of str_mutf8,
32 maybe larger than (str_mutf8_limit-1) if the space
33 of str_mutf8 isn't enough.
34note : convert U+0000 to 0x00, not 0xc080 (U+0000 ends the string)
35*/
36 int utf32_to_mutf8( const unsigned int * str_utf32,
37 unsigned char * str_mutf8, int str_mutf8_limit );
38
39
C代码
1
/**/
/*
2Convert Modified UTF-8 <==> UTF-32.
3*/
4
5
6 #include " cvt_mutf8_utf32.h "
7 #include < stdio.h >
8
9
10 /**/ /*
11A U+0001 to U+007F
120+++ ++++ u &0x80 => 0x00
13
14B U+0080 to U+07FF, and null character (U+0000)
15110+ ++++ u &0xe0 => 0xc0
1610++ ++++ v &0xc0 => 0x80
17((u & 0x1f) << 6) + (v & 0x3f)
18
19C U+0800 to U+FFFF
201110 ++++ u &0xf0 => 0xe0
2110++ ++++ v &0xc0 => 0x80
2210++ ++++ w &0xc0 => 0x80
23((u & 0xf) << 12) + ((v & 0x3f) << 6) + (w & 0x3f)
24
25D above U+FFFF (U+10000 to U+10FFFF)
261110 1101 u &0xff => 0xed
271010 ++++ v &0xf0 => 0xa0
2810++ ++++ w &0xc0 => 0x80
291110 1101 x &0xff => 0xed
301011 ++++ y &0xf0 => 0xb0
3110++ ++++ z &0xc0 => 0x80
320x10000+((v&0x0f)<<16)+((w&0x3f)<<10)+((y&0x0f)<<6)+(z&0x3f)
33*/
34
/*
 * Convert a null terminated Modified UTF-8 string to UTF-32.
 *
 * str_mutf8       : input bytes; decoding stops at the first 0x00 byte.
 * str_utf32       : output buffer, or NULL to only count the code points.
 * str_utf32_limit : capacity of str_utf32 in elements, terminator included.
 *
 * Returns -1 when str_mutf8 is NULL, when str_utf32_limit is negative or
 * when a byte sequence is not accepted; otherwise the number of code points
 * the whole input decodes to.  That count can exceed (str_utf32_limit - 1),
 * in which case only the code points that fit were stored.
 *
 * Whenever str_utf32 is non-NULL and str_utf32_limit is at least 1 the output
 * is zero terminated.  With a limit of 0 nothing at all is written.
 *
 * The two byte sequence 0xc0 0x80 decodes to an embedded U+0000 and does not
 * end the input.  Overlong encodings are not rejected.
 */
int mutf8_to_utf32( const unsigned char * str_mutf8,
        unsigned int * str_utf32, int str_utf32_limit ) {
    int len32 = 0;

    if ( (NULL == str_mutf8) || (0 > str_utf32_limit) ) {
        return (-1);
    }

    for ( ; ; ) {
        unsigned int cod;
        unsigned int u = *str_mutf8++;

        if ( 0 == u ) {
            break;
        }

        if ( 0x00 == (0x80 & u) ) {
            /* 0xxxxxxx : one byte, U+0001 to U+007F */
            cod = u;
        }
        else if ( 0xc0 == (0xe0 & u) ) {
            /* 110xxxxx 10xxxxxx : two bytes */
            unsigned int v = *str_mutf8++;
            if ( 0x80 != (0xc0 & v) ) {
                return (-1);
            }
            cod = ((u & 0x1f) << 6) | (v & 0x3f);
        }
        else if ( 0xe0 == (0xf0 & u) ) {
            /*
             * 1110xxxx 10xxxxxx 10xxxxxx : three bytes.  Each byte is checked
             * before the next one is read, so a 0x00 terminator inside a
             * sequence is reported as an error and never read past.
             */
            unsigned int v, w;

            v = *str_mutf8++;
            if ( 0x80 != (0xc0 & v) ) {
                return (-1);
            }
            w = *str_mutf8++;
            if ( 0x80 != (0xc0 & w) ) {
                return (-1);
            }

            if ( (0xed == u) && (0xa0 == (0xf0 & v)) ) {
                /*
                 * 0xed 0xa? 0x?? is the first half of a six byte pair; it
                 * must be followed by 0xed 0xb? 0x??.
                 */
                unsigned int x, y, z;

                x = *str_mutf8++;
                if ( 0xed != x ) {
                    return (-1);
                }
                y = *str_mutf8++;
                if ( 0xb0 != (0xf0 & y) ) {
                    return (-1);
                }
                z = *str_mutf8++;
                if ( 0x80 != (0xc0 & z) ) {
                    return (-1);
                }
                cod = 0x10000 + (
                        ((v & 0x0f) << 16) |
                        ((w & 0x3f) << 10) |
                        ((y & 0x0f) << 6) |
                        (z & 0x3f) );
            }
            else {
                cod = ((u & 0x0f) << 12) |
                        ((v & 0x3f) << 6) |
                        (w & 0x3f);
            }
        }
        else {
            /* a stray continuation byte or a lead byte of 0xf0 and above */
            return (-1);
        }

        /* Store only while there is room, but always keep counting. */
        if ( (NULL != str_utf32) && (len32 < str_utf32_limit) ) {
            str_utf32[ len32 ] = cod;
        }
        ++len32;
    }

    /*
     * Terminate inside the buffer.  A limit of 0 leaves no slot for the
     * terminator, so nothing may be written in that case.
     */
    if ( (NULL != str_utf32) && (0 < str_utf32_limit) ) {
        if ( len32 < str_utf32_limit ) {
            str_utf32[ len32 ] = 0;
        }
        else {
            str_utf32[ str_utf32_limit - 1 ] = 0;
        }
    }

    return len32;
}
129
/*
 * Convert a null terminated UTF-32 string to Modified UTF-8.
 *
 * str_utf32       : input code points; encoding stops at the first 0.
 * str_mutf8       : output buffer, or NULL to only count the bytes.
 * str_mutf8_limit : capacity of str_mutf8 in bytes, terminator included.
 *
 * Returns -1 when str_utf32 is NULL, when str_mutf8_limit is negative or
 * when a code point is above U+10FFFF; otherwise the number of bytes the
 * whole input encodes to.  That count can exceed (str_mutf8_limit - 1),
 * in which case the stored output is truncated.
 *
 * Truncation happens on a character boundary: a character is stored only
 * when all of its bytes plus the terminator still fit, and nothing is stored
 * after the first character that did not fit.  Whenever str_mutf8 is
 * non-NULL and str_mutf8_limit is at least 1 the output is zero terminated.
 * With a limit of 0 nothing at all is written.
 *
 * U+0000 ends the input, so the 0xc0 0x80 form is never produced.  Code
 * points in U+D800..U+DFFF are not rejected; they take the three byte form.
 */
int utf32_to_mutf8( const unsigned int * str_utf32,
        unsigned char * str_mutf8, int str_mutf8_limit ) {
    int len8 = 0;       /* bytes needed by the complete conversion */
    int stored = 0;     /* bytes actually placed in str_mutf8 */
    int out_full = 0;   /* set once a character did not fit */

    if ( (NULL == str_utf32) || (0 > str_mutf8_limit) ) {
        return (-1);
    }

    for ( ; ; ) {
        unsigned char seq[ 6 ];
        int seq_len = 0;
        int i;
        unsigned int cod = *str_utf32++;

        if ( 0 == cod ) {
            break;
        }

        if ( 0x007f >= cod ) {
            seq[ seq_len++ ] = (unsigned char)cod;
        }
        else if ( 0x07ff >= cod ) {
            seq[ seq_len++ ] = (unsigned char)(0xc0 | ((cod >> 6) & 0x1f));
            seq[ seq_len++ ] = (unsigned char)(0x80 | (cod & 0x3f));
        }
        else if ( 0xffff >= cod ) {
            seq[ seq_len++ ] = (unsigned char)(0xe0 | ((cod >> 12) & 0x0f));
            seq[ seq_len++ ] = (unsigned char)(0x80 | ((cod >> 6) & 0x3f));
            seq[ seq_len++ ] = (unsigned char)(0x80 | (cod & 0x3f));
        }
        else if ( 0x10ffff >= cod ) {
            /* six bytes: two three byte halves carrying 10 bits each */
            cod -= 0x10000;
            seq[ seq_len++ ] = 0xed;
            seq[ seq_len++ ] = (unsigned char)(0xa0 | ((cod >> 16) & 0x0f));
            seq[ seq_len++ ] = (unsigned char)(0x80 | ((cod >> 10) & 0x3f));
            seq[ seq_len++ ] = 0xed;
            seq[ seq_len++ ] = (unsigned char)(0xb0 | ((cod >> 6) & 0x0f));
            seq[ seq_len++ ] = (unsigned char)(0x80 | (cod & 0x3f));
        }
        else {
            return (-1);
        }

        if ( (NULL != str_mutf8) && !out_full ) {
            /* strict '<' keeps one byte in reserve for the terminator */
            if ( seq_len < str_mutf8_limit - stored ) {
                for ( i = 0; i < seq_len; ++i ) {
                    str_mutf8[ stored++ ] = seq[ i ];
                }
            }
            else {
                out_full = 1;
            }
        }
        len8 += seq_len;
    }

    /*
     * stored is always below str_mutf8_limit here, so the terminator lands
     * inside the buffer.  A limit of 0 leaves no slot for it.
     */
    if ( (NULL != str_mutf8) && (0 < str_mutf8_limit) ) {
        str_mutf8[ stored ] = 0;
    }

    return len8;
}
196
197
2Convert Modified UTF-8 <==> UTF-32.
3*/
4
5
6 #include " cvt_mutf8_utf32.h "
7 #include < stdio.h >
8
9
10 /**/ /*
11A U+0001 to U+007F
120+++ ++++ u &0x80 => 0x00
13
14B U+0080 to U+07FF, and null character (U+0000)
15110+ ++++ u &0xe0 => 0xc0
1610++ ++++ v &0xc0 => 0x80
17((u & 0x1f) << 6) + (v & 0x3f)
18
19C U+0800 to U+FFFF
201110 ++++ u &0xf0 => 0xe0
2110++ ++++ v &0xc0 => 0x80
2210++ ++++ w &0xc0 => 0x80
23((u & 0xf) << 12) + ((v & 0x3f) << 6) + (w & 0x3f)
24
25D above U+FFFF (U+10000 to U+10FFFF)
261110 1101 u &0xff => 0xed
271010 ++++ v &0xf0 => 0xa0
2810++ ++++ w &0xc0 => 0x80
291110 1101 x &0xff => 0xed
301011 ++++ y &0xf0 => 0xb0
3110++ ++++ z &0xc0 => 0x80
320x10000+((v&0x0f)<<16)+((w&0x3f)<<10)+((y&0x0f)<<6)+(z&0x3f)
33*/
34
/*
 * Convert a null terminated Modified UTF-8 string to UTF-32.
 *
 * str_mutf8       : input bytes; decoding stops at the first 0x00 byte.
 * str_utf32       : output buffer, or NULL to only count the code points.
 * str_utf32_limit : capacity of str_utf32 in elements, terminator included.
 *
 * Returns -1 when str_mutf8 is NULL, when str_utf32_limit is negative or
 * when a byte sequence is not accepted; otherwise the number of code points
 * the whole input decodes to.  That count can exceed (str_utf32_limit - 1),
 * in which case only the code points that fit were stored.
 *
 * Whenever str_utf32 is non-NULL and str_utf32_limit is at least 1 the output
 * is zero terminated.  With a limit of 0 nothing at all is written.
 *
 * The two byte sequence 0xc0 0x80 decodes to an embedded U+0000 and does not
 * end the input.  Overlong encodings are not rejected.
 */
int mutf8_to_utf32( const unsigned char * str_mutf8,
        unsigned int * str_utf32, int str_utf32_limit ) {
    int len32 = 0;

    if ( (NULL == str_mutf8) || (0 > str_utf32_limit) ) {
        return (-1);
    }

    for ( ; ; ) {
        unsigned int cod;
        unsigned int u = *str_mutf8++;

        if ( 0 == u ) {
            break;
        }

        if ( 0x00 == (0x80 & u) ) {
            /* 0xxxxxxx : one byte, U+0001 to U+007F */
            cod = u;
        }
        else if ( 0xc0 == (0xe0 & u) ) {
            /* 110xxxxx 10xxxxxx : two bytes */
            unsigned int v = *str_mutf8++;
            if ( 0x80 != (0xc0 & v) ) {
                return (-1);
            }
            cod = ((u & 0x1f) << 6) | (v & 0x3f);
        }
        else if ( 0xe0 == (0xf0 & u) ) {
            /*
             * 1110xxxx 10xxxxxx 10xxxxxx : three bytes.  Each byte is checked
             * before the next one is read, so a 0x00 terminator inside a
             * sequence is reported as an error and never read past.
             */
            unsigned int v, w;

            v = *str_mutf8++;
            if ( 0x80 != (0xc0 & v) ) {
                return (-1);
            }
            w = *str_mutf8++;
            if ( 0x80 != (0xc0 & w) ) {
                return (-1);
            }

            if ( (0xed == u) && (0xa0 == (0xf0 & v)) ) {
                /*
                 * 0xed 0xa? 0x?? is the first half of a six byte pair; it
                 * must be followed by 0xed 0xb? 0x??.
                 */
                unsigned int x, y, z;

                x = *str_mutf8++;
                if ( 0xed != x ) {
                    return (-1);
                }
                y = *str_mutf8++;
                if ( 0xb0 != (0xf0 & y) ) {
                    return (-1);
                }
                z = *str_mutf8++;
                if ( 0x80 != (0xc0 & z) ) {
                    return (-1);
                }
                cod = 0x10000 + (
                        ((v & 0x0f) << 16) |
                        ((w & 0x3f) << 10) |
                        ((y & 0x0f) << 6) |
                        (z & 0x3f) );
            }
            else {
                cod = ((u & 0x0f) << 12) |
                        ((v & 0x3f) << 6) |
                        (w & 0x3f);
            }
        }
        else {
            /* a stray continuation byte or a lead byte of 0xf0 and above */
            return (-1);
        }

        /* Store only while there is room, but always keep counting. */
        if ( (NULL != str_utf32) && (len32 < str_utf32_limit) ) {
            str_utf32[ len32 ] = cod;
        }
        ++len32;
    }

    /*
     * Terminate inside the buffer.  A limit of 0 leaves no slot for the
     * terminator, so nothing may be written in that case.
     */
    if ( (NULL != str_utf32) && (0 < str_utf32_limit) ) {
        if ( len32 < str_utf32_limit ) {
            str_utf32[ len32 ] = 0;
        }
        else {
            str_utf32[ str_utf32_limit - 1 ] = 0;
        }
    }

    return len32;
}
129
/*
 * Convert a null terminated UTF-32 string to Modified UTF-8.
 *
 * str_utf32       : input code points; encoding stops at the first 0.
 * str_mutf8       : output buffer, or NULL to only count the bytes.
 * str_mutf8_limit : capacity of str_mutf8 in bytes, terminator included.
 *
 * Returns -1 when str_utf32 is NULL, when str_mutf8_limit is negative or
 * when a code point is above U+10FFFF; otherwise the number of bytes the
 * whole input encodes to.  That count can exceed (str_mutf8_limit - 1),
 * in which case the stored output is truncated.
 *
 * Truncation happens on a character boundary: a character is stored only
 * when all of its bytes plus the terminator still fit, and nothing is stored
 * after the first character that did not fit.  Whenever str_mutf8 is
 * non-NULL and str_mutf8_limit is at least 1 the output is zero terminated.
 * With a limit of 0 nothing at all is written.
 *
 * U+0000 ends the input, so the 0xc0 0x80 form is never produced.  Code
 * points in U+D800..U+DFFF are not rejected; they take the three byte form.
 */
int utf32_to_mutf8( const unsigned int * str_utf32,
        unsigned char * str_mutf8, int str_mutf8_limit ) {
    int len8 = 0;       /* bytes needed by the complete conversion */
    int stored = 0;     /* bytes actually placed in str_mutf8 */
    int out_full = 0;   /* set once a character did not fit */

    if ( (NULL == str_utf32) || (0 > str_mutf8_limit) ) {
        return (-1);
    }

    for ( ; ; ) {
        unsigned char seq[ 6 ];
        int seq_len = 0;
        int i;
        unsigned int cod = *str_utf32++;

        if ( 0 == cod ) {
            break;
        }

        if ( 0x007f >= cod ) {
            seq[ seq_len++ ] = (unsigned char)cod;
        }
        else if ( 0x07ff >= cod ) {
            seq[ seq_len++ ] = (unsigned char)(0xc0 | ((cod >> 6) & 0x1f));
            seq[ seq_len++ ] = (unsigned char)(0x80 | (cod & 0x3f));
        }
        else if ( 0xffff >= cod ) {
            seq[ seq_len++ ] = (unsigned char)(0xe0 | ((cod >> 12) & 0x0f));
            seq[ seq_len++ ] = (unsigned char)(0x80 | ((cod >> 6) & 0x3f));
            seq[ seq_len++ ] = (unsigned char)(0x80 | (cod & 0x3f));
        }
        else if ( 0x10ffff >= cod ) {
            /* six bytes: two three byte halves carrying 10 bits each */
            cod -= 0x10000;
            seq[ seq_len++ ] = 0xed;
            seq[ seq_len++ ] = (unsigned char)(0xa0 | ((cod >> 16) & 0x0f));
            seq[ seq_len++ ] = (unsigned char)(0x80 | ((cod >> 10) & 0x3f));
            seq[ seq_len++ ] = 0xed;
            seq[ seq_len++ ] = (unsigned char)(0xb0 | ((cod >> 6) & 0x0f));
            seq[ seq_len++ ] = (unsigned char)(0x80 | (cod & 0x3f));
        }
        else {
            return (-1);
        }

        if ( (NULL != str_mutf8) && !out_full ) {
            /* strict '<' keeps one byte in reserve for the terminator */
            if ( seq_len < str_mutf8_limit - stored ) {
                for ( i = 0; i < seq_len; ++i ) {
                    str_mutf8[ stored++ ] = seq[ i ];
                }
            }
            else {
                out_full = 1;
            }
        }
        len8 += seq_len;
    }

    /*
     * stored is always below str_mutf8_limit here, so the terminator lands
     * inside the buffer.  A limit of 0 leaves no slot for it.
     */
    if ( (NULL != str_mutf8) && (0 < str_mutf8_limit) ) {
        str_mutf8[ stored ] = 0;
    }

    return len8;
}
196
197