Modified UTF-8 与 UTF-32 相互转换

Modified UTF-8 与 UTF-32 相互转换


自己的实现,经过一定的测试。

头文件
 1 /**/ /*
 2Convert Modified UTF-8  <==>  UTF-32.
 3*/

 4
 5
 6 /**/ /*
 7function : Convert Modified UTF-8 to UTF-32.
 8input : str_mutf8, a null terminated string in Modified UTF-8.
 9output : str_utf32, a null terminated string in UTF-32.
10input : str_utf32_limit, the max length(character count) 
11        of str_utf32 plus one(for 'null'), str_utf32 must have enough space 
12        for str_utf32_limit characters.
13return : -1 for errors; 
14        else the length(character count) of str_utf32, 
15                maybe larger than (str_utf32_limit-1) if the space 
16                of str_utf32 isn't enougn.
17note : convert 0xc080 to U+0000 字符串未结束
18        convert 0x00 to U+0000 字符串结束
19*/

20 int  mutf8_to_utf32(  const  unsigned  char   * str_mutf8, 
21                 unsigned  int   * str_utf32,  int  str_utf32_limit );
22
23 /**/ /*
24function : Convert UTF-32 to Modified UTF-8.
25input : str_utf32, a null terminated string in UTF-32.
26output : str_mutf8, a null terminated string in Modified UTF-8.
27input : str_mutf8_limit, the max length(byte count) 
28        of str_mutf8 plus one(for 'null'), str_mutf8 must have enough space 
29        for str_mutf8_limit bytes.
30return : -1 for errors; 
31        else the length(byte count) of str_mutf8, 
32                maybe larger than (str_mutf8_limit-1) if the space 
33                of str_mutf8 isn't enougn.
34note : convet U+0000 to 0x00, not 0xc080 字符串结束
35*/

36 int  utf32_to_mutf8(  const  unsigned  int   * str_utf32, 
37                 unsigned  char   * str_mutf8,  int  str_mutf8_limit );
38
39


C代码
  1 /**/ /*
  2Convert Modified UTF-8  <==>  UTF-32.
  3*/

  4
  5
#include "cvt_mutf8_utf32.h"

#include <stddef.h>
#include <stdio.h>
  8
  9
 10 /**/ /*
 11A U+0001 to U+007F
 120+++ ++++ u &0x80 => 0x00
 13
 14B U+0080 to U+07FF, and null character (U+0000)
 15110+ ++++ u &0xe0 => 0xc0
 1610++ ++++ v &0xc0 => 0x80
 17((u & 0x1f) << 6) + (v & 0x3f)
 18
 19C U+0800 to U+FFFF
 201110 ++++ u &0xf0 => 0xe0
 2110++ ++++ v &0xc0 => 0x80
 2210++ ++++ w &0xc0 => 0x80
 23((u & 0xf) << 12) + ((v & 0x3f) << 6) + (w & 0x3f)
 24
 25D above U+FFFF (U+10000 to U+10FFFF)
 261110 1101 u &0xff => 0xed
 271010 ++++ v &0xf0 => 0xa0
 2810++ ++++ w &0xc0 => 0x80
 291110 1101 x &0xff => 0xed
 301011 ++++ y &0xf0 => 0xb0
 3110++ ++++ z &0xc0 => 0x80
 320x10000+((v&0x0f)<<16)+((w&0x3f)<<10)+((y&0x0f)<<6)+(z&0x3f) 
 33*/

 34
 35 int  mutf8_to_utf32(  const  unsigned  char   * str_mutf8, 
 36                 unsigned  int   * str_utf32,  int  str_utf32_limit )  {
 37        unsigned int cod, u, v, w, x, y, z;
 38        int len32 = 0;
 39        if ( (NULL == str_mutf8) || (0 > str_utf32_limit) ) {
 40                return (-1);
 41        }

 42
 43#define  __ADD_UTF32_COD_Z__   do {\
 44                if ( (NULL != str_utf32) && (len32 < str_utf32_limit) ) {\
 45                        str_utf32[ len32 ] = cod;\
 46                }
\
 47                ++len32;\
 48        }
  while  (  0  )
 49
 50          for  ( ; ; )  {
 51                u = *str_mutf8++;
 52
 53                if ( 0 == u ) {
 54                        break;
 55                }

 56
 57                if ( 0x00 == (0x80 & u)  ) {
 58                        cod = u;
 59                        __ADD_UTF32_COD_Z__;
 60                        continue;
 61                }

 62
 63                if ( 0xc0 == (0xe0 & u) ) {
 64                        v = *str_mutf8++;
 65                        if ( 0x80 != (0xc0 & v) ) {
 66                                return (-1);
 67                        }

 68                        cod =   ((u&0x1f)<<6| 
 69                                (v&0x3f);
 70                        __ADD_UTF32_COD_Z__;
 71                        continue;
 72                }

 73
 74                if ( 0xe0 == (0xf0 & u) ) {
 75                        v = *str_mutf8++;
 76                        if ( 0x80 != (0xc0 & v) ) {
 77                                return (-1);
 78                        }

 79                        w = *str_mutf8++;
 80                        if ( 0x80 != (0xc0 & w) ) {
 81                                return (-1);
 82                        }

 83                        if (    (0xed == (0xff & u)) && 
 84                                (0xa0 == (0xf0 & v)) && 
 85                                (0x80 == (0xc0 & w)) 
 86                        ) {
 87                                x = *str_mutf8++;
 88                                if ( 0xed != (0xff & x) ) {
 89                                        return (-1);
 90                                }

 91                                y = *str_mutf8++;
 92                                if ( 0xb0 != (0xf0 & y) ) {
 93                                        return (-1);
 94                                }

 95                                z = *str_mutf8++;
 96                                if ( 0x80 != (0xc0 & z) ) {
 97                                        return (-1);
 98                                }

 99                                cod =   0x10000 + (
100                                        ((v&0x0f)<<16| 
101                                        ((w&0x3f)<<10| 
102                                        ((y&0x0f)<<6| 
103                                        (z&0x3f) );
104                                __ADD_UTF32_COD_Z__;
105                                continue;
106                        }

107                        cod =   ((u&0xf)<<12| 
108                                ((v&0x3f)<<6| 
109                                (w&0x3f);
110                        __ADD_UTF32_COD_Z__;
111                        continue;
112                }

113
114                return (-1);
115        }

116
117          if  ( NULL  ==  str_utf32 )  {
118        }

119          else   if  ( len32  <  str_utf32_limit )  {
120                str_utf32[ len32 ] = 0;
121        }

122          else   {
123                str_utf32[ str_utf32_limit-1 ] = 0;
124        }

125
126          return  len32;
127 #undef  __ADD_UTF32_COD_Z__
128 }
129
/*
 * Convert a null-terminated UTF-32 string to Modified UTF-8.
 *
 * str_utf32       - input code points; U+0000 ends the string and is
 *                   written as a single 0x00 terminator, never as 0xC0 0x80.
 * str_mutf8       - output buffer, or NULL to only count bytes.
 * str_mutf8_limit - capacity of str_mutf8 in bytes, terminator included.
 *
 * Code points above U+FFFF are written as a surrogate pair, each half as a
 * three-byte sequence (six bytes total).  Values in U+D800..U+DFFF are not
 * rejected; they are written as ordinary three-byte sequences.
 *
 * Returns -1 if str_utf32 is NULL, if str_mutf8_limit is negative, or if a
 * code point exceeds U+10FFFF.  Otherwise returns the number of bytes of the
 * full conversion, terminator excluded; a value >= str_mutf8_limit means the
 * stored output was truncated (possibly in the middle of a multi-byte
 * sequence).  On success with str_mutf8_limit > 0 the output is always
 * zero-terminated; with a limit of 0 nothing is written.  On error the
 * output may hold a partial, unterminated result.
 */
int utf32_to_mutf8(const unsigned int *str_utf32,
                   unsigned char *str_mutf8, int str_mutf8_limit)
{
    unsigned int cod;
    int len8 = 0;

    if ((NULL == str_utf32) || (0 > str_mutf8_limit)) {
        return -1;
    }

/* Store one byte if it still fits; always count it. */
#define UTF32_PUT_MUTF8(b) do { \
        if ((NULL != str_mutf8) && (len8 < str_mutf8_limit)) { \
            str_mutf8[len8] = (unsigned char)(b); \
        } \
        ++len8; \
    } while (0)

    for (;;) {
        cod = *str_utf32++;

        if (0 == cod) {
            break;
        }

        /* U+0001..U+007F: one byte */
        if (0x007f >= cod) {
            UTF32_PUT_MUTF8(cod);
            continue;
        }

        /* U+0080..U+07FF: two bytes */
        if (0x07ff >= cod) {
            UTF32_PUT_MUTF8(0xc0 | ((cod >> 6) & 0x1f));
            UTF32_PUT_MUTF8(0x80 | (cod & 0x3f));
            continue;
        }

        /* U+0800..U+FFFF: three bytes */
        if (0xffff >= cod) {
            UTF32_PUT_MUTF8(0xe0 | ((cod >> 12) & 0x0f));
            UTF32_PUT_MUTF8(0x80 | ((cod >> 6) & 0x3f));
            UTF32_PUT_MUTF8(0x80 | (cod & 0x3f));
            continue;
        }

        /* U+10000..U+10FFFF: surrogate pair, three bytes per half */
        if (0x10ffff >= cod) {
            cod -= 0x10000;
            UTF32_PUT_MUTF8(0xed);
            UTF32_PUT_MUTF8(0xa0 | ((cod >> 16) & 0x0f));
            UTF32_PUT_MUTF8(0x80 | ((cod >> 10) & 0x3f));
            UTF32_PUT_MUTF8(0xed);
            UTF32_PUT_MUTF8(0xb0 | ((cod >> 6) & 0x0f));
            UTF32_PUT_MUTF8(0x80 | (cod & 0x3f));
            continue;
        }

        /* Beyond the Unicode range. */
        return -1;
    }

    /*
     * Terminate the output.  A limit of 0 means there is no room even for
     * the terminator, so nothing may be written (index limit-1 would be -1).
     */
    if ((NULL != str_mutf8) && (0 < str_mutf8_limit)) {
        if (len8 < str_mutf8_limit) {
            str_mutf8[len8] = 0;
        } else {
            str_mutf8[str_mutf8_limit - 1] = 0;
        }
    }

    return len8;
#undef UTF32_PUT_MUTF8
}
196
197

你可能感兴趣的:(Modified UTF-8 与 UTF-32 相互转换)