关于unicode,各种编码等国际化的技术原理,可以参看我blog上的文章。最近的项目里要用到GBK->wchar_t,wchar_t->UTF8的转换,所以对这部分功能做了些简单封装。其实对于国际化技术的封装,无非就是:
DBCS <=> wchar_t .
wchar_t <=> 各种unicode编码 比如说UTF8,UTF16等。
这样的转换都是绝对可以成功的。像GBK<->BIG5这种dbcs<=>dbcs的转换就不一定能成功了。
wchar_t作为C++的字符串内部处理用类型,主要原因是各种字符串函数都有以wchar_t作为接口的版本,方便使用。wchar_t这个东西的长度是由编译器,平台实现决定的,所以请记住,处理wchar_t的时候,千万不要关心它的长度。如果你的代码对wchar_t的长度特别关心,这个时候你需要的应该是一种标准的unicode编码。
(Linux下的wchar_t长度为4byte,好心痛)
罗嗦这么多,各位看官久等了。上代码。
/**
 * string_util - static helpers converting text between a multi-byte (DBCS)
 * encoding, wchar_t and UTF-8.
 *
 * Two back ends are selected at compile time:
 *   - _UNIX defined:     POSIX iconv, using the "WCHAR_T" pseudo-encoding.
 *   - _UNIX not defined: Win32 MultiByteToWideChar / WideCharToMultiByte.
 *
 * Conventions shared by all pointer-level functions:
 *   - Every length and capacity argument is expressed in BYTES, including
 *     the ones that describe wchar_t buffers.
 *   - Return value with iconv: whatever iconv() returned (number of
 *     non-reversible conversions, normally 0), or -1 on any failure.
 *   - Return value with Win32: whatever the API returned (number of output
 *     units written), which is 0 on failure.
 *
 * The string-level overloads return the same value as the pointer-level
 * function they wrap and leave `out` empty when the conversion failed.
 * The converted text is assigned as a C string, so it ends at the first
 * NUL character produced by the conversion.
 */
class string_util
{
public:
#ifndef _UNIX
    /**
     * Map an encoding name to a Windows code page.
     *
     * Only GBK is supported: the argument is ignored and 936 is always
     * returned. Extend this with a table mapping iconv-style encoding names
     * to Windows code pages if more encodings are needed.
     */
    static inline int codepage(const char* code_page)
    {
        (void)code_page;
        return 936; // "GBK"
    }
#endif

    /**
     * Convert in_len bytes of text in encoding code_page to wchar_t.
     *
     * @param code_page encoding name of the input (iconv name on UNIX;
     *                  ignored on Windows, see codepage()).
     * @param in        input bytes.
     * @param in_len    number of input bytes.
     * @param out       destination buffer.
     * @param out_max   capacity of out in bytes.
     * @return see the class comment.
     */
    static inline int dbcs2wchar(const char* code_page, /*in*/const char* in, int in_len,
                                 /*out*/wchar_t* out, int out_max)
    {
#ifdef _UNIX
        return iconv_convert("WCHAR_T", code_page, in, in_len, out, out_max);
#else
        // The API wants the capacity in wchar_t elements, out_max is bytes.
        return ::MultiByteToWideChar(codepage(code_page), 0, in, in_len,
                                     out, out_max / (int)sizeof(wchar_t));
#endif
    }

    /**
     * Convert a whole string in encoding code_page to a wstring.
     * On failure out is cleared.
     */
    static inline int dbcs2wchar(const char* code_page, /*in*/const string& in, /*out*/wstring& out)
    {
        // One wide character per input byte, plus one slot that stays L'\0'.
        const size_t slots = in.length() + 1;
        wstring buffer(slots, L'\0');
        const int result = dbcs2wchar(code_page, in.c_str(), (int)in.length(),
                                      &buffer[0], (int)(slots * sizeof(wchar_t)));
        if (conversion_ok(result, in.empty()))
        {
            out = buffer.c_str();
        }
        else
        {
            out.clear();
        }
        return result;
    }

    /**
     * Convert wchar_t text to the multi-byte encoding code_page.
     *
     * @param code_page encoding name of the output (iconv name on UNIX;
     *                  ignored on Windows, see codepage()).
     * @param in        input wide characters.
     * @param in_len    size of the input in bytes (not characters).
     * @param out       destination buffer.
     * @param out_max   capacity of out in bytes.
     * @return see the class comment.
     */
    static inline int wchar2dbcs(const char* code_page, /*in*/const wchar_t* in, int in_len,
                                 /*out*/char* out, int out_max)
    {
#ifdef _UNIX
        return iconv_convert(code_page, "WCHAR_T", in, in_len, out, out_max);
#else
        // Characters that cannot be represented are replaced by "?";
        // whether that happened is not reported to the caller.
        BOOL use_def_char = FALSE;
        return ::WideCharToMultiByte(codepage(code_page), 0,
                                     in, in_len / (int)sizeof(wchar_t),
                                     out, out_max, "?", &use_def_char);
#endif
    }

    /**
     * Convert a whole wstring to the multi-byte encoding code_page.
     * On failure out is cleared.
     */
    static inline int wchar2dbcs(const char* code_page, /*in*/const wstring& in, /*out*/string& out)
    {
        const size_t capacity = (in.length() + 1) * kMaxBytesPerWchar;
        string buffer(capacity, '\0');
        const int result = wchar2dbcs(code_page, in.c_str(),
                                      (int)(in.length() * sizeof(wchar_t)),
                                      &buffer[0], (int)capacity);
        if (conversion_ok(result, in.empty()))
        {
            out = buffer.c_str();
        }
        else
        {
            out.clear();
        }
        return result;
    }

    /**
     * Convert wchar_t text to UTF-8.
     *
     * @param in      input wide characters.
     * @param in_len  size of the input in bytes (not characters).
     * @param out     destination buffer.
     * @param out_max capacity of out in bytes.
     * @return see the class comment.
     */
    static inline int wchar2utf8(/*in*/const wchar_t* in, int in_len,
                                 /*out*/char* out, int out_max)
    {
#ifdef _UNIX
        return iconv_convert("UTF8", "WCHAR_T", in, in_len, out, out_max);
#else
        // CP_UTF8 requires the default-char arguments to be NULL.
        return ::WideCharToMultiByte(CP_UTF8, 0,
                                     in, in_len / (int)sizeof(wchar_t),
                                     out, out_max, NULL, NULL);
#endif
    }

    /**
     * Convert a whole wstring to a UTF-8 string.
     * On failure out is cleared.
     */
    static inline int wchar2utf8(/*in*/const wstring& in, /*out*/string& out)
    {
        // A code point outside the BMP takes 4 UTF-8 bytes, and with a
        // 32-bit wchar_t that is a single input element.
        const size_t capacity = (in.length() + 1) * kMaxBytesPerWchar;
        string buffer(capacity, '\0');
        const int result = wchar2utf8(in.c_str(),
                                      (int)(in.length() * sizeof(wchar_t)),
                                      &buffer[0], (int)capacity);
        if (conversion_ok(result, in.empty()))
        {
            out = buffer.c_str();
        }
        else
        {
            out.clear();
        }
        return result;
    }

    /**
     * Convert in_len bytes of UTF-8 text to wchar_t.
     *
     * @param in      input bytes.
     * @param in_len  number of input bytes.
     * @param out     destination buffer.
     * @param out_max capacity of out in bytes.
     * @return see the class comment.
     */
    static inline int utf82wchar(/*in*/const char* in, int in_len,
                                 /*out*/wchar_t* out, int out_max)
    {
#ifdef _UNIX
        return iconv_convert("WCHAR_T", "UTF8", in, in_len, out, out_max);
#else
        // The API wants the capacity in wchar_t elements, out_max is bytes.
        return ::MultiByteToWideChar(CP_UTF8, 0, in, in_len,
                                     out, out_max / (int)sizeof(wchar_t));
#endif
    }

    /**
     * Convert a whole UTF-8 string to a wstring.
     * On failure out is cleared.
     */
    static inline int utf82wchar(/*in*/const string& in, /*out*/wstring& out)
    {
        // One wide character per input byte, plus one slot that stays L'\0'.
        const size_t slots = in.length() + 1;
        wstring buffer(slots, L'\0');
        const int result = utf82wchar(in.c_str(), (int)in.length(),
                                      &buffer[0], (int)(slots * sizeof(wchar_t)));
        if (conversion_ok(result, in.empty()))
        {
            out = buffer.c_str();
        }
        else
        {
            out.clear();
        }
        return result;
    }

private:
    // Output bytes reserved per input wchar_t by the string-level overloads.
    enum { kMaxBytesPerWchar = 4 };

    /**
     * Decide whether a pointer-level result means success.
     * iconv reports failure as -1; the Win32 calls report it as 0, which is
     * also what they return for an empty input.
     */
    static inline bool conversion_ok(int result, bool input_empty)
    {
#ifdef _UNIX
        (void)input_empty;
        return result >= 0;
#else
        return result > 0 || input_empty;
#endif
    }

#ifdef _UNIX
    /**
     * Run one iconv conversion from from_code to to_code.
     *
     * Lengths are copied into real size_t objects before being handed to
     * iconv, which reads and updates them through size_t pointers.
     *
     * @return iconv()'s return value, or -1 when a length is negative, the
     *         conversion descriptor cannot be opened, or iconv() fails.
     */
    static inline int iconv_convert(const char* to_code, const char* from_code,
                                    const void* in, int in_bytes,
                                    void* out, int out_bytes)
    {
        if (in_bytes < 0 || out_bytes < 0)
        {
            return -1;
        }
        iconv_t env = iconv_open(to_code, from_code);
        if (env == (iconv_t)-1)
        {
            return -1;
        }
        // iconv's prototype takes a non-const input pointer.
        char* src = (char*)in;
        char* dst = (char*)out;
        size_t src_left = (size_t)in_bytes;
        size_t dst_left = (size_t)out_bytes;
        const size_t result = iconv(env, &src, &src_left, &dst, &dst_left);
        iconv_close(env);
        return (int)result;
    }
#endif
};