字符集

使用vs选择字符集,有

Not Set

Use Multi-Byte Character Set

Use Unicode Character Set

三个选项,这个在msdn上有解释http://msdn.microsoft.com/zh-cn/library/ey142t48(v=VS.80).aspx

Not Set 和Use Multi-Byte Character Set是char型的

Use Unicode Character Set是wchar_t型的

" If the symbol _UNICODE is defined for a build of your program, TCHAR is defined as type wchar_t, a 16-bit character encoding type; "

"If the symbol _MBCS is defined for a build of your program, type TCHAR, on which CString is based, maps to char"


这里说的多字节( Multi-Byte Character Set)实际就是双字节

The class library is also enabled for multibyte character sets — specifically for double-byte character sets (DBCS).


两个概念:字符编码,字符集

字符集,比如unicode,ascii,gbk

字符编码,比如utf-8,utf-16

utf的编码只针对unicode


windows的txt文件保存为utf-8时头部有 0xef ,0xbb, 0xbf作为标识,保存为ansi时就没有任何标识,保存为unicode时头部为0xff, 0xfe

例如"1我",

保存为utf-8时为:ef bb bf 31 e6 88 91    //"1"是一个字节,"我"是三个字节

保存为ansi为:31 ce d2                           //"1"是一个字节,"我"是两个字节

保存为unicode(utf-16)时为:ff fe 31 00 11 62   //"1""我"分别是两个字节

保存为unicode(utf-16)大端格式时为:fe ff 00 31 62 11


BOM: Byte Order Mark

BOM即字节顺序标记,用来标明数据是大端还是小端。utf-16以双字节为编码单元,需要确定是大端或小端;utf-8以单字节为编码单元(一个字符占1~4个字节),不存在字节序问题,所以不需要


在unicode中,有一个名为 "ZERO WIDTH NO-BREAK SPACE"的字符 : U+FEFF

它其实不代表任何可见字符,也就是上面出现的头部了,按大端保存为fe ff,按小端保存为ff fe。

utf-8的前面为什么是ef bb bf呢,因为U+FEFF的utf-8编码就是ef bb bf,我们可以通过ef bb bf来获知该txt是以utf-8来编码的


以下是我用在项目的一段代码


#include "stdafx.h"
#include "misc.h"

#include <errno.h>
#include <string.h>

namespace dsmisc
{


// Guesses the text encoding of a file from its byte-order mark (BOM).
//
// Reads at most the first three bytes of FileName and classifies them:
//   EF BB BF -> UTF8
//   FE FF    -> UTF16BIG
//   FF FE    -> UTF16LITTLE
//   other    -> ANSI (no BOM recognised)
// Returns UNKNOWN when the file cannot be opened, the read fails, or fewer
// than two bytes are available. The result constants are declared outside
// this function (presumably in misc.h).
//
// The file is opened with GENERIC_READ only: nothing is written, and asking
// for broader access makes the open fail on read-only or restricted files.
WORD TextEncoding(const TCHAR* FileName)
{
	HANDLE handle = CreateFile(FileName,
								GENERIC_READ,
								FILE_SHARE_READ|FILE_SHARE_WRITE,
								NULL,
								OPEN_EXISTING,
								FILE_ATTRIBUTE_NORMAL,
								NULL);

	if(handle==INVALID_HANDLE_VALUE)
	{
		return UNKNOWN;
	}

	BYTE bom[3] = { 0, 0, 0 };
	DWORD BytesReaded=0;
	const BOOL readOk = ReadFile(handle, bom, sizeof(bom), &BytesReaded, NULL);

	// The handle is no longer needed once the leading bytes are in memory.
	CloseHandle(handle);

	if(!readOk || BytesReaded<2)
	{
		return UNKNOWN;
	}

	// The UTF-8 mark is three bytes long, so it needs a full read.
	if(BytesReaded>2 && bom[0]==0xef && bom[1]==0xbb && bom[2]==0xbf)
	{
		return UTF8;
	}
	if(bom[0]==0xfe && bom[1]==0xff)
	{
		return UTF16BIG;
	}
	if(bom[0]==0xff && bom[1]==0xfe)
	{
		return UTF16LITTLE;
	}

	return ANSI;
}

// Converts a NUL-terminated wide string to the system ANSI code page (CP_ACP).
//
// pSrc  : NUL-terminated wide string to convert.
// pDest : receives a buffer allocated with new[]; the caller owns it and
//         must release it with delete[] (not scalar delete).
//
// Returns the number of bytes written including the terminating NUL, or 0
// when the conversion fails or an argument is NULL. Whenever pDest itself is
// non-NULL, *pDest is set to a valid NUL-terminated buffer (an empty string
// on failure), so callers may safely measure it.
int CastWideChar2MultiChar(const WCHAR* pSrc,TCHAR** pDest)
{
	if (pDest == NULL)
	{
		return 0;
	}

	// First pass: ask how many bytes (terminator included) are required.
	int needCnt = 0;
	if (pSrc != NULL)
	{
		needCnt = WideCharToMultiByte (CP_ACP,0,pSrc,-1,NULL,0,NULL,NULL);
		if (needCnt < 0)
		{
			needCnt = 0;
		}
	}

	// new[] throws on failure, so the pointer needs no NULL check.
	char* pAllocDest = new char[needCnt+2];
	pAllocDest[0] = '\0';

	int castCnt = 0;
	if (needCnt > 0)
	{
		castCnt = WideCharToMultiByte (CP_ACP,0,pSrc,-1,pAllocDest,needCnt,NULL,NULL);
		if (castCnt <= 0)
		{
			// Do not hand back partially written data.
			castCnt = 0;
			pAllocDest[0] = '\0';
		}
	}

	*pDest = pAllocDest;
	return castCnt;
}

// Converts a NUL-terminated UTF-8 string to wide characters.
//
// a_szSrc     : NUL-terminated UTF-8 input.
// a_szDest    : output buffer for the wide characters.
// a_nDestSize : capacity of a_szDest, in wide characters.
//
// Forwards to MultiByteToWideChar with CP_UTF8 and returns its result
// unchanged (per the Win32 documentation: characters written, terminator
// included, or 0 on failure).
int UTF82WConvert( const char* a_szSrc, wchar_t* a_szDest, int a_nDestSize )
{
	const int wholeString = -1;	// let the API stop at the terminating NUL
	const int written = MultiByteToWideChar( CP_UTF8,
											 0,
											 a_szSrc,
											 wholeString,
											 a_szDest,
											 a_nDestSize );
	return written;
}

// Copies little-endian UTF-16 bytes into a wchar_t buffer unchanged.
//
// a_szSrc     : raw bytes, without BOM.
// a_nSize     : number of source bytes; a trailing partial code unit (odd
//               byte) is ignored rather than treated as an error.
// a_szDest    : destination buffer.
// a_nDestSize : capacity of a_szDest in BYTES.
//
// Returns 0 on success, EINVAL for a NULL pointer or negative size, and
// ERANGE when the whole code units do not fit in the destination. NOTE: this
// is an error code, unlike the sibling converters, which return a character
// count. On EINVAL caused by a NULL source, and on ERANGE, the destination is
// zero-filled. No terminator is appended.
int UTF16LE2WConvert( const  char* a_szSrc, int a_nSize, wchar_t* a_szDest, int a_nDestSize )
{
	if (a_szDest == NULL || a_nDestSize < 0 || a_nSize < 0)
	{
		return EINVAL;
	}

	const size_t destBytes = static_cast<size_t>(a_nDestSize);
	const size_t srcBytes = static_cast<size_t>(a_nSize);

	// Only whole code units are copied; a dangling odd byte would otherwise
	// push the request past a destination sized in whole units.
	const size_t copyBytes = srcBytes - (srcBytes % sizeof(wchar_t));
	if (copyBytes == 0)
	{
		return 0;
	}

	if (a_szSrc == NULL)
	{
		memset(a_szDest, 0, destBytes);
		return EINVAL;
	}

	if (copyBytes > destBytes)
	{
		memset(a_szDest, 0, destBytes);
		return ERANGE;
	}

	memcpy(a_szDest, a_szSrc, copyBytes);
	return 0;
}

// Converts big-endian UTF-16 bytes to wide characters by swapping the two
// bytes of every code unit (LCMapStringW with LCMAP_BYTEREV).
//
// a_szSrc     : raw big-endian bytes, without BOM.
// a_nSize     : number of source bytes.
// a_szDest    : destination buffer.
// a_nDestSize : capacity of a_szDest in BYTES.
//
// Both byte counts are halved to obtain the character counts the API
// expects. Returns the LCMapStringW result unchanged (per the Win32
// documentation: characters written, or 0 on failure).
int UTF16BE2WConvert( const  char* a_szSrc, int a_nSize, wchar_t* a_szDest, int a_nDestSize )
{
	const int srcChars = a_nSize/2;
	const int destChars = a_nDestSize/2;
	const wchar_t* const srcAsWide = reinterpret_cast<const wchar_t*>(a_szSrc);

	return LCMapStringW (LOCALE_USER_DEFAULT,
						 LCMAP_BYTEREV,
						 srcAsWide,
						 srcChars,
						 a_szDest,
						 destChars);
}

}



		// Build the thumbnail file name. The write is bounded; if the name does
		// not fit it is cleared so that the open below fails cleanly instead of
		// overrunning the buffer or opening a truncated path.
		TCHAR thumbnailName[256];
		if(_snprintf_s(thumbnailName, sizeof(thumbnailName)/sizeof(thumbnailName[0]), _TRUNCATE,
					   "%stn%s.txt", m_FtpFolder.GetBuffer(), ResourceInfo.guid.c_str()) < 0)
		{
			thumbnailName[0] = 0;
		}
		int resc = dsmisc::TextEncoding(thumbnailName);

		// The user locale is made global only while the stream is constructed,
		// then the previous one is restored.
		// Binary mode is required: in text mode the CRT collapses CR/LF pairs
		// and stops at byte 0x1A, both of which occur inside ordinary UTF-16
		// characters (e.g. U+4E1A).
		locale loc = std::locale::global(std::locale(""));
		std::ifstream is(thumbnailName, std::ios::in | std::ios::binary);
		locale::global(loc);
		if(!is)
		{
			TTRACE("[%s]读取文本失败\n", __FUNCTION__);
			ResourceInfo.thumbnail = "";
			ResourceInfo.thumbnailsize = 0;
		}
		else
		{
			std::stringstream streamIn;
			streamIn<<is.rdbuf();

			if(resc==dsmisc::ANSI)
			{
				// Byte-wise token extraction is safe here: the first
				// whitespace-delimited token is the content.
				std::string ctx;
				streamIn>>ctx;

				ResourceInfo.thumbnailsize = ctx.size();
				ResourceInfo.thumbnail.assign(ctx.c_str(), ctx.size());
			}
			else if(resc==dsmisc::UTF8 || resc==dsmisc::UTF16BIG || resc==dsmisc::UTF16LITTLE)
			{
				// Step 1: decode the file content (BOM skipped) to wide characters.
				std::wstring wide;

				if(resc==dsmisc::UTF8)
				{
					// UTF-8 bytes of multi-byte characters are never ASCII
					// whitespace, so the byte-wise token is still correct.
					std::string ctx;
					streamIn>>ctx;

					const size_t kUtf8BomBytes = 3;
					if(ctx.size() > kUtf8BomBytes)
					{
						// One wide character per input byte is always enough,
						// plus one for the terminator the API writes.
						wide.resize(ctx.size() - kUtf8BomBytes + 1);
						const int written = dsmisc::UTF82WConvert(ctx.c_str() + kUtf8BomBytes,
																  &wide[0],
																  static_cast<int>(wide.size()));
						wide.resize(written > 0 ? written - 1 : 0);
					}
				}
				else
				{
					// UTF-16 must be taken from the raw bytes: extracting with
					// >> would stop at any byte that merely looks like
					// whitespace (e.g. 0x20 inside U+5220).
					const std::string raw = streamIn.str();
					const size_t kUtf16BomBytes = 2;
					const size_t units = (raw.size() > kUtf16BomBytes)
										 ? (raw.size() - kUtf16BomBytes) / sizeof(wchar_t)
										 : 0;
					if(units > 0)
					{
						// Zero-filled, so a failed conversion yields an empty token.
						wide.resize(units);
						const int payloadBytes = static_cast<int>(units * sizeof(wchar_t));

						if(resc==dsmisc::UTF16BIG)
						{
							dsmisc::UTF16BE2WConvert(raw.c_str() + kUtf16BomBytes, payloadBytes, &wide[0], payloadBytes);
						}
						else
						{
							dsmisc::UTF16LE2WConvert(raw.c_str() + kUtf16BomBytes, payloadBytes, &wide[0], payloadBytes);
						}

						// Keep the single-token behaviour of the other branches:
						// the content ends at the first whitespace (or NUL)
						// character, now decided per character, not per byte.
						size_t tokenEnd = 0;
						while(tokenEnd < wide.size())
						{
							const wchar_t ch = wide[tokenEnd];
							if(ch == L'\0' || ch == L' ' || (ch >= L'\t' && ch <= L'\r'))
							{
								break;
							}
							++tokenEnd;
						}
						wide.resize(tokenEnd);
					}
				}

				// Step 2: wide characters to TCHAR (ANSI code page).
				TCHAR *pDest = NULL;
				const int converted = dsmisc::CastWideChar2MultiChar(wide.c_str(), &pDest);
				if(converted > 0 && pDest != NULL)
				{
					ResourceInfo.thumbnailsize = _tcslen(pDest);
					ResourceInfo.thumbnail.assign(pDest, ResourceInfo.thumbnailsize);
				}
				else
				{
					ResourceInfo.thumbnail = "";
					ResourceInfo.thumbnailsize = 0;
				}

				// The buffer comes from new[], so it must be released with delete[].
				delete[] pDest;
			}
		}




你可能感兴趣的:(character,byte)