UNICODE(UTF-16)与UTF-8编码的相互转换

在 Windows 等平台的习惯用语中,通常所说的“UNICODE”编码实际上是指 UTF-16。下面这几个函数实现 UNICODE(UTF-16)与 UTF-8 编码的相互转换。

/**
 * This file implements:
 *
 * 1. Unicode code point to UTF-8 character conversion.
 * 2. UTF-8 character to Unicode code point conversion.
 *
 * 3. UTF-16 string to UTF-8 string conversion.
 * 4. UTF-8 string to UTF-16 string conversion.
 */

#include <stddef.h>
#include <string.h>

/* Size of the scratch buffer for one utf-8 character (at most 6 bytes used) */
#define MAX_CHARACTER_SIZE 8

/**
 * UnicodeToUTF8 - convert a unicode code point to a UTF-8 character
 * @unicode: a code point in the range 0 .. 0x7fffffff
 * @p: a buffer to receive the utf-8 bytes (up to 6 bytes are written,
 *     no terminator is appended)
 *
 * @return: One step over the end of the written utf-8 character.
 *          Returns @p unchanged (nothing written) when @p is NULL or
 *          @unicode is negative.
 */
unsigned char *
UnicodeToUTF8(int unicode, unsigned char *p)
{
    unsigned char *e = p;

    if (e == NULL || unicode < 0) {
        return e;
    }

    if (unicode < 0x80) {
        /* 1 byte: 0xxxxxxx */
        *e++ = (unsigned char)unicode;
    } else if (unicode < 0x800) {
        /* 2 bytes: 110xxxxx 10xxxxxx */
        *e++ = (unsigned char)(((unicode >> 6) & 0x1f) | 0xc0);
        *e++ = (unsigned char)((unicode & 0x3f) | 0x80);
    } else if (unicode < 0x10000) {
        /* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
        *e++ = (unsigned char)(((unicode >> 12) & 0x0f) | 0xe0);
        *e++ = (unsigned char)(((unicode >> 6) & 0x3f) | 0x80);
        *e++ = (unsigned char)((unicode & 0x3f) | 0x80);
    } else if (unicode < 0x200000) {
        /* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        *e++ = (unsigned char)(((unicode >> 18) & 0x07) | 0xf0);
        *e++ = (unsigned char)(((unicode >> 12) & 0x3f) | 0x80);
        *e++ = (unsigned char)(((unicode >> 6) & 0x3f) | 0x80);
        *e++ = (unsigned char)((unicode & 0x3f) | 0x80);
    } else if (unicode < 0x4000000) {
        /* 5 bytes (legacy form): 111110xx + 4 continuation bytes */
        *e++ = (unsigned char)(((unicode >> 24) & 0x03) | 0xf8);
        *e++ = (unsigned char)(((unicode >> 18) & 0x3f) | 0x80);
        *e++ = (unsigned char)(((unicode >> 12) & 0x3f) | 0x80);
        *e++ = (unsigned char)(((unicode >> 6) & 0x3f) | 0x80);
        *e++ = (unsigned char)((unicode & 0x3f) | 0x80);
    } else {
        /* 6 bytes (legacy form): 1111110x + 5 continuation bytes */
        *e++ = (unsigned char)(((unicode >> 30) & 0x01) | 0xfc);
        *e++ = (unsigned char)(((unicode >> 24) & 0x3f) | 0x80);
        *e++ = (unsigned char)(((unicode >> 18) & 0x3f) | 0x80);
        *e++ = (unsigned char)(((unicode >> 12) & 0x3f) | 0x80);
        *e++ = (unsigned char)(((unicode >> 6) & 0x3f) | 0x80);
        *e++ = (unsigned char)((unicode & 0x3f) | 0x80);
    }

    /* One step over the end of the utf-8 character */
    return e;
}

/**
 * UTF8ToUnicode - convert a UTF-8 character to a unicode code point
 * @ch: A buffer containing a utf-8 character
 * @unicode: Receives the decoded code point (untouched on failure)
 *
 * @return: Byte count of the utf-8 character (1 ~ 6), usable to step to
 *          the next character of a utf-8 string. Returns 0 when an
 *          argument is NULL, the lead byte is not a valid lead byte, or
 *          a continuation byte is missing (e.g. a truncated sequence).
 */
int
UTF8ToUnicode(unsigned char *ch, int *unicode)
{
    int value;
    int n;
    int i;
    unsigned char lead;

    if (ch == NULL || unicode == NULL) {
        return 0;
    }

    lead = ch[0];
    if (lead < 0x80) {            /* 1: 0xxxxxxx */
        value = lead;
        n = 1;
    } else if (lead < 0xc0) {     /* stray continuation byte */
        return 0;
    } else if (lead < 0xe0) {     /* 2: 110xxxxx */
        value = lead & 0x1f;
        n = 2;
    } else if (lead < 0xf0) {     /* 3: 1110xxxx */
        value = lead & 0x0f;
        n = 3;
    } else if (lead < 0xf8) {     /* 4: 11110xxx */
        value = lead & 0x07;
        n = 4;
    } else if (lead < 0xfc) {     /* 5: 111110xx */
        value = lead & 0x03;
        n = 5;
    } else if (lead < 0xfe) {     /* 6: 1111110x */
        value = lead & 0x01;
        n = 6;
    } else {                      /* 0xfe / 0xff never start a character */
        return 0;
    }

    /*
     * Every following byte must look like 10xxxxxx. Checking them in
     * order stops at a string terminator, so a truncated sequence is
     * never read past its NUL byte.
     */
    for (i = 1; i < n; i++) {
        if ((ch[i] & 0xc0) != 0x80) {
            return 0;
        }
        value = (value << 6) | (ch[i] & 0x3f);
    }

    *unicode = value;
    return n;
}

/**
 * UnicodeStrToUTF8Str - Convert a utf-16 string to a utf-8 string
 * @unicode_str: A zero-terminated utf-16 string
 * @utf8_str: A buffer to receive the utf-8 string
 * @utf8_str_size: Size of the utf-8 buffer in bytes, terminator included
 *
 * A high surrogate followed by a low surrogate is combined into one code
 * point; an unpaired surrogate is encoded as-is. The output is always
 * NUL-terminated when @utf8_str_size is positive. Conversion stops at the
 * first character that does not fit.
 *
 * @return: One step over the end of the last utf-8 character written
 *          (i.e. a pointer to the terminator). Returns @utf8_str when an
 *          argument is NULL or @utf8_str_size is not positive.
 */
unsigned char *
UnicodeStrToUTF8Str(unsigned short *unicode_str,
                    unsigned char *utf8_str, int utf8_str_size)
{
    unsigned char *s = utf8_str;
    unsigned char utf8_ch[MAX_CHARACTER_SIZE];

    if (unicode_str == NULL || s == NULL || utf8_str_size <= 0) {
        return s;
    }

    *s = '\0';
    while (*unicode_str != 0) {
        int unicode = *unicode_str++;
        unsigned char *e;
        size_t len;

        /* Combine a surrogate pair into a single code point */
        if (unicode >= 0xd800 && unicode <= 0xdbff
            && *unicode_str >= 0xdc00 && *unicode_str <= 0xdfff) {
            unicode = 0x10000 + ((unicode - 0xd800) << 10)
                      + (*unicode_str - 0xdc00);
            unicode_str++;
        }

        e = UnicodeToUTF8(unicode, utf8_ch);
        if (e <= utf8_ch) {
            /* Converting error occurs */
            return s;
        }
        len = (size_t)(e - utf8_ch);

        /* Keep one byte free for the terminator */
        if ((size_t)(s - utf8_str) + len >= (size_t)utf8_str_size) {
            return s;
        }

        memcpy(s, utf8_ch, len);
        s += len;
        *s = '\0';
    }

    return s;
}

/**
 * UTF8StrToUnicodeStr - Convert a utf-8 string to a utf-16 string
 * @utf8_str: A NUL-terminated utf-8 string
 * @unicode_str: A buffer to receive the utf-16 string
 * @unicode_str_size: Size of the utf-16 buffer in 16-bit units,
 *                    terminator included
 *
 * Code points from 0x10000 to 0x10ffff are written as a surrogate pair.
 * The output is always zero-terminated when @unicode_str_size is
 * positive. Conversion stops at the first malformed utf-8 character, at
 * a code point above 0x10ffff (not representable in utf-16), or when the
 * next character does not fit.
 *
 * @return: Number of utf-16 units written, terminator excluded
 */
int
UTF8StrToUnicodeStr(unsigned char *utf8_str,
                    unsigned short *unicode_str, int unicode_str_size)
{
    unsigned char *s = utf8_str;
    unsigned short *e = unicode_str;
    int count = 0;

    if (utf8_str == NULL || unicode_str == NULL || unicode_str_size <= 0) {
        return 0;
    }

    *e = 0;
    while (*s != 0) {
        int unicode = 0;
        int n = UTF8ToUnicode(s, &unicode);
        int units;

        if (n <= 0 || unicode > 0x10ffff) {
            /* Converting error occurs */
            return count;
        }

        units = (unicode >= 0x10000) ? 2 : 1;

        /* Keep one unit free for the terminator */
        if (units >= unicode_str_size - count) {
            return count;
        }

        if (units == 2) {
            unicode -= 0x10000;
            *e++ = (unsigned short)(0xd800 | (unicode >> 10));
            *e++ = (unsigned short)(0xdc00 | (unicode & 0x3ff));
        } else {
            *e++ = (unsigned short)unicode;
        }
        *e = 0;
        count += units;

        /* Step to next utf-8 character */
        s += n;
    }

    return count;
}

你可能感兴趣的:(Algorithm)