使用vs选择字符集,有
Not Set
Use Multi-Byte Character Set
Use Unicode Character Set
三个选项,这个在msdn上有解释http://msdn.microsoft.com/zh-cn/library/ey142t48(v=VS.80).aspx
Not Set 和Use Multi-Byte Character Set是char型的
Use Unicode Character Set是wchar_t型的
" If the symbol _UNICODE is defined for a build of your program, TCHAR is defined as type wchar_t, a 16-bit character encoding type; "
"If the symbol _MBCS is defined for a build of your program, type TCHAR, on which CString is based, maps to char. "
这里说的多字节( Multi-Byte Character Set)实际就是双字节
“The class library is also enabled for multibyte character sets — specifically for double-byte character sets (DBCS).”
两个概念:字符编码,字符集
字符集,比如unicode,ascii, gbk
字符编码,比如utf-8,utf-16
utf的编码只针对unicode
windows的txt文件保存为utf-8时头部有 0xef ,0xbb, 0xbf作为标识,保存为ansi时就没有任何标识,保存为unicode时头部为0xff, 0xfe
例如"1我",
保存为utf-8时为:ef bb bf 31 e6 88 91 //"1"是一个字节,"我"是三个字节
保存为ansi为:31 ce d2 //"1"是一个字节,"我"是两个字节
保存为unicode(utf-16)时为:ff fe 31 00 11 62 //"1""我"分别是两个字节
保存为unicode(utf-16)大端格式时为:fe ff 00 31 62 11
BOM: Byte Order Mark
BOM用来标识字节顺序,即大端和小端。utf-16以两个字节为一个编码单元,需要确定是大端还是小端;utf-8以单个字节为编码单元(一个字符占1到4个字节),不存在字节序问题,所以不需要
在unicode中,有一个名为 "ZERO WIDTH NO-BREAK SPACE"的字符 : U+FEFF
他其实不代表任何字符,也就是上面出现的头部了,大端为fe ff,小端为ff fe,
utf-8的前面为什么是ef bb bf呢,因为U+FEFF的utf-8编码就是ef bb bf,我们可以通过ef bb bf来获知该txt是以utf-8来编码的
以下是我用在项目的一段代码
#include "stdafx.h"
#include "misc.h"

namespace dsmisc
{
    /**
     * Guesses the text encoding of a file by inspecting its leading
     * byte-order mark (BOM).
     *
     * @param FileName  Path of the file to inspect.
     * @return UTF8        when the file starts with EF BB BF,
     *         UTF16BIG    when it starts with FE FF,
     *         UTF16LITTLE when it starts with FF FE,
     *         ANSI        when at least two bytes were read and no BOM matched,
     *         UNKNOWN     when the file cannot be opened or fewer than two
     *                     bytes could be read.
     */
    WORD TextEncoding(const TCHAR* FileName)
    {
        // Only read access is requested: sniffing the BOM never writes, and
        // asking for full access makes the open fail on read-only files.
        HANDLE handle = CreateFile(FileName,
                                   GENERIC_READ,
                                   FILE_SHARE_READ | FILE_SHARE_WRITE,
                                   NULL,
                                   OPEN_EXISTING,
                                   FILE_ATTRIBUTE_NORMAL,
                                   NULL);
        if (handle == INVALID_HANDLE_VALUE)
        {
            return UNKNOWN;
        }

        BYTE bom[3] = { 0, 0, 0 };
        DWORD BytesReaded = 0;
        const BOOL readOk = ReadFile(handle, bom, sizeof(bom), &BytesReaded, NULL);
        CloseHandle(handle);

        if (!readOk || BytesReaded < 2)
        {
            return UNKNOWN;
        }

        // The UTF-8 signature is three bytes long, so it needs a full read.
        if (BytesReaded > 2 && bom[0] == 0xef && bom[1] == 0xbb && bom[2] == 0xbf)
        {
            return UTF8;
        }
        if (bom[0] == 0xfe && bom[1] == 0xff)
        {
            return UTF16BIG;
        }
        if (bom[0] == 0xff && bom[1] == 0xfe)
        {
            return UTF16LITTLE;
        }
        return ANSI;
    }

    /**
     * Converts a null-terminated wide string to the system ANSI code page
     * (CP_ACP) in a newly allocated buffer.
     *
     * The buffer is allocated as a char array and stored through a TCHAR
     * pointer, so this only compiles in builds where TCHAR is char.
     *
     * @param pSrc   Null-terminated wide string to convert.
     * @param pDest  Receives the buffer allocated with new[]; the caller
     *               releases it with delete[]. The buffer is always
     *               null-terminated and is an empty string when the
     *               conversion fails.
     * @return Number of bytes written including the terminating null, or 0
     *         when the conversion fails or pDest is NULL.
     */
    int CastWideChar2MultiChar(const WCHAR* pSrc, TCHAR** pDest)
    {
        if (pDest == NULL)
        {
            return 0;
        }

        // First pass: ask how many bytes the converted string needs.
        int needCnt = 0;
        if (pSrc != NULL)
        {
            needCnt = WideCharToMultiByte(CP_ACP, 0, pSrc, -1, NULL, 0, NULL, NULL);
        }
        if (needCnt < 0)
        {
            needCnt = 0;
        }

        char* pAllocDest = new char[needCnt + 2];
        // Start with an empty string so a failed conversion never exposes
        // uninitialised memory to the caller.
        pAllocDest[0] = '\0';

        int castCnt = 0;
        if (needCnt > 0)
        {
            castCnt = WideCharToMultiByte(CP_ACP, 0, pSrc, -1, pAllocDest, needCnt, NULL, NULL);
            if (castCnt <= 0)
            {
                pAllocDest[0] = '\0';
                castCnt = 0;
            }
        }

        *pDest = pAllocDest;
        return castCnt;
    }

    /**
     * Converts a null-terminated UTF-8 string to wide characters.
     *
     * @param a_szSrc      Null-terminated UTF-8 input (without BOM).
     * @param a_szDest     Destination buffer.
     * @param a_nDestSize  Capacity of a_szDest in wide characters.
     * @return The value returned by MultiByteToWideChar: the number of wide
     *         characters written including the terminating null, or 0 on
     *         failure.
     */
    int UTF82WConvert(const char* a_szSrc, wchar_t* a_szDest, int a_nDestSize)
    {
        return MultiByteToWideChar(CP_UTF8, 0, a_szSrc, -1, a_szDest, a_nDestSize);
    }

    /**
     * Copies little-endian UTF-16 bytes into a wide-character buffer.
     *
     * @param a_szSrc      Raw UTF-16LE bytes (without BOM).
     * @param a_nSize      Number of source bytes; a trailing odd byte cannot
     *                     form a code unit and is ignored.
     * @param a_szDest     Destination buffer; no terminator is appended.
     * @param a_nDestSize  Capacity of a_szDest in bytes.
     * @return The memcpy_s result: 0 on success, an error code otherwise.
     */
    int UTF16LE2WConvert(const char* a_szSrc, int a_nSize, wchar_t* a_szDest, int a_nDestSize)
    {
        // Round down to whole 2-byte code units so a buffer sized from
        // a_nSize / 2 is never overrun by one stray byte.
        const int wholeBytes = a_nSize - (a_nSize % 2);
        return memcpy_s(a_szDest, a_nDestSize, a_szSrc, wholeBytes);
    }

    /**
     * Converts big-endian UTF-16 bytes to wide characters by swapping the
     * bytes of every code unit.
     *
     * @param a_szSrc      Raw UTF-16BE bytes (without BOM).
     * @param a_nSize      Number of source bytes.
     * @param a_szDest     Destination buffer; no terminator is appended.
     * @param a_nDestSize  Capacity of a_szDest in bytes.
     * @return The value returned by LCMapStringW: the number of wide
     *         characters written, or 0 on failure.
     */
    int UTF16BE2WConvert(const char* a_szSrc, int a_nSize, wchar_t* a_szDest, int a_nDestSize)
    {
        return LCMapStringW(LOCALE_USER_DEFAULT,
                            LCMAP_BYTEREV,
                            reinterpret_cast<const wchar_t*>(a_szSrc),
                            a_nSize / 2,
                            a_szDest,
                            a_nDestSize / 2);
    }
}
TCHAR thumbnailName[256]; sprintf(thumbnailName, "%stn%s.txt", m_FtpFolder.GetBuffer(), ResourceInfo.guid.c_str()); int resc = dsmisc::TextEncoding(thumbnailName); locale loc = std::locale::global(std::locale("")); std::ifstream is(thumbnailName); locale::global(loc); if(!is) { TTRACE("[%s]读取文本失败\n", __FUNCTION__); ResourceInfo.thumbnail = ""; ResourceInfo.thumbnailsize = 0; } else { std::string ctx; std::stringstream streamIn; streamIn<<is.rdbuf(); streamIn>>ctx; if(resc==dsmisc::ANSI) { ResourceInfo.thumbnailsize = ctx.size(); ResourceInfo.thumbnail.assign(ctx.c_str(), ctx.size()); } else if(resc==dsmisc::UTF8) { int a_nDestSize = ctx.size()+1; wchar_t* a_szDest = new wchar_t[a_nDestSize]; //utf-8到wchar_t int res = dsmisc::UTF82WConvert(ctx.c_str()+3, a_szDest, a_nDestSize); //wchar_t 到 TCHAR TCHAR *pDest; res = dsmisc::CastWideChar2MultiChar(a_szDest, &pDest); ResourceInfo.thumbnailsize = _tcslen(pDest); ResourceInfo.thumbnail.assign(pDest, ResourceInfo.thumbnailsize); delete pDest; //!!! delete a_szDest; //!!! } else if(resc==dsmisc::UTF16BIG) { int a_nDestSize = (ctx.size()-2)/2;//去除头部后的长度 wchar_t* a_szDest = new wchar_t[a_nDestSize+1]; //加一个结束符'\0'' a_szDest[a_nDestSize] = 0; //memset(a_szDest, 0 , a_nDestSize+2); int res = dsmisc::UTF16BE2WConvert(ctx.c_str()+2, ctx.size()-2, a_szDest, a_nDestSize*sizeof(wchar_t)); //wchar_t 到 TCHAR TCHAR *pDest; res = dsmisc::CastWideChar2MultiChar(a_szDest, &pDest); ResourceInfo.thumbnailsize = _tcslen(pDest); ResourceInfo.thumbnail.assign(pDest, ResourceInfo.thumbnailsize); delete pDest; //!!! delete a_szDest; //!!! 
} else if(resc==dsmisc::UTF16LITTLE) { int a_nDestSize = (ctx.size()-2)/2;//去除头部后的长度 wchar_t* a_szDest = new wchar_t[a_nDestSize+1]; //加一个结束符'\0'' a_szDest[a_nDestSize] = 0; //unicode到wchar_t int res = dsmisc::UTF16LE2WConvert(ctx.c_str()+2, ctx.size()-2, a_szDest, a_nDestSize*sizeof(wchar_t)); //wchar_t 到 TCHAR TCHAR *pDest; res = dsmisc::CastWideChar2MultiChar(a_szDest, &pDest); ResourceInfo.thumbnailsize = _tcslen(pDest); ResourceInfo.thumbnail.assign(pDest, ResourceInfo.thumbnailsize); delete pDest; //!!! delete a_szDest; //!!! } }