UCS-UNICODE-UTF-8编码

对每一个字符采用四个8比特字节编码的称为UCS-4,对每一个字符采用两个8比特字节编码的称为UCS-2。

 

UTF-8定义:
在UTF-8中,字符采用1到6个8比特字节的序列进行编码。仅仅一个8比特字节的一个序列中,字节的高位为0,其他的7位用于字符值编码。n(n>1)个8比特字节的一个序列中,初始的8比特字节中高n位为1,接着一位为0,此字节余下的位包含被编码字符值的位。接着的所有8比特字节的最高位为1,接着下一位为0,余下每个字节6位包含被编码字符的位。
下表总结了这些不同的8比特字节类型格式。字母x指出此位来自于进行编码的UCS-4字符值。

UCS-4范围(16进制) UTF-8 序列(二进制)

 0000 0000<->0000 007F 0xxxxxxx

0000 0080<->0000 07FF 110xxxxx 10xxxxxx

0000 0800<->0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx

0001 0000<->001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

0020 0000<->03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

 0400 0000<->7FFF FFFF 1111110x 10xxxxxx ... 10xxxxxx

 

/*
 * Encode one UCS-2 code unit as UTF-8.
 *
 * ucs2   - the value to encode.
 * buffer - output. Its first 4 bytes are cleared before anything is
 *          written, so it must have room for at least 4 bytes; the bytes
 *          after the encoded sequence (up to index 3) are left as 0.
 *
 * Returns the number of bytes written (1, 2 or 3), or 0 when buffer is
 * NULL or the value lies outside 0x0000..0xFFFF.
 *
 * NOTE(review): surrogate code units (0xD800..0xDFFF) are encoded like any
 * other three-byte value. RFC 3629 does not permit them in UTF-8, so a
 * caller holding UTF-16 text should combine surrogate pairs itself.
 */
int UnicodeToUTF8(WCHAR ucs2, unsigned char *buffer)
{
    /* Work on an unsigned copy so the range checks below behave the same
     * whether WCHAR is signed or unsigned: a negative input becomes a
     * huge value and falls through to "return 0". */
    unsigned long cp = (unsigned long)ucs2;

    if (!buffer) {
        return 0;
    }

    memset(buffer, 0, 4);

    if (cp <= 0x007fUL) {
        /* 0xxxxxxx */
        buffer[0] = (unsigned char)cp;
        return 1;
    }

    if (cp <= 0x07ffUL) {
        /* 110xxxxx 10xxxxxx */
        buffer[0] = (unsigned char)(0xc0 | ((cp >> 6) & 0x1f));
        buffer[1] = (unsigned char)(0x80 | (cp & 0x3f));
        return 2;
    }

    if (cp <= 0xffffUL) {
        /* 1110xxxx 10xxxxxx 10xxxxxx -- the lead byte carries 4 payload
         * bits, hence the 0x0f mask. */
        buffer[0] = (unsigned char)(0xe0 | ((cp >> 12) & 0x0f));
        buffer[1] = (unsigned char)(0x80 | ((cp >> 6) & 0x3f));
        buffer[2] = (unsigned char)(0x80 | (cp & 0x3f));
        return 3;
    }

    return 0;
}

以下是UTF8->unicode:

 

WCHAR UTF8ToUnicode(unsigned char *buffer)
{
WCHAR temp = 0;
if (buffer[0] < 0x80) // one char of UTF8
{
temp = buffer[0];
}
if ((0xc0 <= buffer[0]) && (buffer[0] < 0xe0)) // two char of UTF8
{
temp = buffer[0] & 0x1f;
temp = temp << 6;
temp = temp | (buffer[1] & 0x3f);
}
if ((0xe0 <= buffer[0]) && (buffer[0] < 0xf0)) // three char of UTF8
{
temp = buffer[0] & 0x0f;
temp = temp << 6;
temp = temp | (buffer[1] & 0x3f);
temp = temp << 6;
temp = temp | (buffer[2] & 0x3f);
}
if ((0x80 <= buffer[0]) && (buffer[0] < 0xc0)) // not the first byte of UTF8 character
return 0xfeff; // 0xfeff will never appear in usual

return temp; // more than 3-bytes return 0
}

UNICODE 0x678 = 110 0111 1000 = UTF8 1101 1001 1011 1000  第二个字节以10开头,其后六位存放原值的低六位

你可能感兴趣的:(buffer,character,byte)