在Windows下打开记事本,保存文件时,有以下四种编码可供选择:
①ANSI,也就是多字节字符集,在中文系统下约等于GB2312。
②Unicode,就是UTF16 LE,小端序存储的UTF16。
③Unicode big endian ,就是UTF16 BE,大端序存储的UTF16。
④UTF8,网页上很多都是用UTF8。UTF8用1-4个字节来编码所有的字符:英文只需要1个字节,中文需要3-4个字节。与UTF16相比,UTF8可以尽可能地节省网络带宽,因为在网络上传输的字符大部分以英文为主;而UTF16至少需要2个字节,部分字符需要4个字节。
如果我们写一个VC程序,从Web上获取到了网页内容,这些数据的编码大多是UTF8的,获取到我们VC程序中的char字符数组中时就会发现,英文可以正常显示,中文全部乱码了。因为VC把char数组中的字符串当做多字节字符集来处理。为了能正常显示UTF8编码的字符串,我们需要进行一些转换。将其转换为多字节字符集的字符串(ANSI),或是转换为宽字节字符串(UTF16)。
我们可以使用Windows API中的MultiByteToWideChar和WideCharToMultiByte来进行转换。示例代码如下:
std::wstring UTF8_To_UTF16(const std::string &source) { unsigned long len = ::MultiByteToWideChar(CP_UTF8, NULL, source.c_str(), -1, NULL, NULL); if (len == 0) return std::wstring(); wchar_t *buffer = new wchar_t[len]; ::MultiByteToWideChar(CP_UTF8, NULL, source.c_str(), -1, buffer, len); std::wstring dest(buffer); delete [] buffer; return dest; } std::string UTF16_To_UTF8(const std::wstring &source) { unsigned long len = ::WideCharToMultiByte(CP_UTF8, NULL, source.c_str(), -1, NULL, NULL, NULL, NULL); if (len == 0) return std::string(); char *buffer = new char[len]; ::WideCharToMultiByte(CP_UTF8, NULL, source.c_str(), -1, buffer, len, NULL, NULL); std::string dest(buffer); delete [] buffer; return dest; } #endif std::wstring GBK_To_UTF16(const std::string &source) { enum {GB2312 = 936}; unsigned long len = ::MultiByteToWideChar(GB2312, NULL, source.c_str(), -1, NULL, NULL); if (len == 0) return std::wstring(); wchar_t *buffer = new wchar_t[len]; ::MultiByteToWideChar(GB2312, NULL, source.c_str(), -1, buffer, len); std::wstring dest(buffer); delete [] buffer; return dest; } std::string UTF16_To_GBK(const std::wstring &source) { enum {GB2312 = 936}; unsigned long len = ::WideCharToMultiByte(GB2312, NULL, source.c_str(), -1, NULL, NULL, NULL, NULL); if (len == 0) return std::string(); char *buffer = new char[len]; ::WideCharToMultiByte(GB2312, NULL, source.c_str(), -1, buffer, len, NULL, NULL); std::string dest(buffer); delete [] buffer; return dest; } std::string GBK_To_UTF8(const std::string &source) { enum {GB2312 = 936}; unsigned long len = ::MultiByteToWideChar(GB2312, NULL, source.c_str(), -1, NULL, NULL); if (len == 0) return std::string(); wchar_t *wide_char_buffer = new wchar_t[len]; ::MultiByteToWideChar(GB2312, NULL, source.c_str(), -1, wide_char_buffer, len); len = ::WideCharToMultiByte(CP_UTF8, NULL, wide_char_buffer, -1, NULL, NULL, NULL, NULL); if (len == 0) { delete [] wide_char_buffer; return std::string(); } char *multi_byte_buffer = new char[len]; 
::WideCharToMultiByte(CP_UTF8, NULL, wide_char_buffer, -1, multi_byte_buffer, len, NULL, NULL); std::string dest(multi_byte_buffer); delete [] wide_char_buffer; delete [] multi_byte_buffer; return dest; } std::string UTF8_To_GBK(const std::string &source) { enum {GB2312 = 936}; unsigned long len = ::MultiByteToWideChar(CP_UTF8, NULL, source.c_str(), -1, NULL, NULL); if (len == 0) return std::string(); wchar_t *wide_char_buffer = new wchar_t[len]; ::MultiByteToWideChar(CP_UTF8, NULL, source.c_str(), -1, wide_char_buffer, len); len = ::WideCharToMultiByte(GB2312, NULL, wide_char_buffer, -1, NULL, NULL, NULL, NULL); if (len == 0) { delete [] wide_char_buffer; return std::string(); } char *multi_byte_buffer = new char[len]; ::WideCharToMultiByte(GB2312, NULL, wide_char_buffer, -1, multi_byte_buffer, len, NULL, NULL); std::string dest(multi_byte_buffer); delete [] wide_char_buffer; delete [] multi_byte_buffer; return dest; }
实际上C++标准库也提供了一些转换方法,不过标准库只提供了UTF8与UTF16之间的转换,使用标准库可以跨平台使用,但是需要C++11的支持。示例代码如下:
#include <codecvt>
#include <locale>
#include <stdexcept>
#include <string>

// Converts a UTF-8 encoded string to UTF-16 code units using only the C++11
// standard library.
// Returns an empty string if the conversion throws std::range_error.
std::wstring UTF8_To_UTF16(const std::string &source)
{
    try {
        // A fresh converter per call: a function-local static would be one
        // mutable object shared by every caller, so concurrent calls would
        // race on it.
        std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> cvt;
        return cvt.from_bytes(source);
    } catch (const std::range_error &) {
        return std::wstring();
    }
}

// Converts UTF-16 code units to a UTF-8 encoded string using only the C++11
// standard library.
// Returns an empty string if the conversion throws std::range_error.
std::string UTF16_To_UTF8(const std::wstring &source)
{
    try {
        // Per-call converter for the same reason as in UTF8_To_UTF16.
        std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> cvt;
        return cvt.to_bytes(source);
    } catch (const std::range_error &) {
        return std::string();
    }
}
另外,如果需要在GCC上实现的话,可以用标准库中的mbstowcs代替MultiByteToWideChar,用wcstombs代替WideCharToMultiByte(这两个函数按当前locale的编码进行转换,使用前需要先调用setlocale设置合适的locale),当然用iconv更方便。
在VC上也可以使用mbstowcs和wcstombs,因为其内部就是用MultiByteToWideChar和WideCharToMultiByte实现的。