Unicode与多字节转换
*****************************************************************************
* Class : Global Function
* Function : to_MultiByte
* Description: 把Unicode转换成多字节
* Parameters : char* strSM -- 要转换的Unicode缓冲
* int nLength -- Unicode缓冲的长度
* Return : 转换后的多字节字串
* Input :
* Output :
* History : Created by Rongdian Monday, April 29, 2002 11:03:15
* Process :
******************************************************************************/
CString to_MultiByte(char* strSM, int nLength){
UINT nLen = 0;
PBYTE lpszM;
PBYTE lpszW = new BYTE[nLength];
memcpy(lpszW, strSM, nLength);
for(int i = 0; i < nLength/2; i++)
*((unsigned short*)lpszW + i) = ntohs(*((unsigned short*)lpszW + i));
nLen = WideCharToMultiByte(936, WC_COMPOSITECHECK,
(const unsigned short*)lpszW, nLength/2, NULL, 0, NULL, NULL);
lpszM = new BYTE[nLen+1];
nLen = WideCharToMultiByte(936, WC_COMPOSITECHECK,
(const unsigned short*)lpszW, nLength/2, (char*)lpszM, nLen, NULL, NULL);
lpszM[nLen] = 0;
CString csSM((LPCTSTR)lpszM, nLen);
delete lpszM;
delete lpszW;
return csSM;
}
/*****************************************************************************
* Class       : Global Function
* Function    : to_UCS2
* Description : Converts a multi-byte string in code page 936 (Simplified
*               Chinese GBK) into 16-bit Unicode code units stored in network
*               byte order.
* Parameters  : char* strSM -- multi-byte buffer to convert
*               int nLength -- length of the multi-byte buffer in bytes
* Return      : a CString used as a raw byte container: two bytes per
*               converted code unit, each unit swapped with htons(); an empty
*               string when the input is NULL/empty or the conversion fails
******************************************************************************/
CString to_UCS2(char* strSM, int nLength){
    if (strSM == NULL || nLength <= 0)
        return CString();

    // First call only asks how many wide characters the result needs.
    int nLen = MultiByteToWideChar(936, MB_PRECOMPOSED,
                                   strSM, nLength, NULL, 0);
    if (nLen <= 0)
        return CString();

    WCHAR* lpszW = new WCHAR[nLen];
    nLen = MultiByteToWideChar(936, MB_PRECOMPOSED,
                               strSM, nLength, lpszW, nLen);

    // Put every code unit into network byte order.
    for (int i = 0; i < nLen; i++)
        lpszW[i] = (WCHAR)htons((unsigned short)lpszW[i]);

    // NOTE(review): the cast and the nLen * 2 length assume a non-UNICODE
    // (MBCS) build where TCHAR is char -- confirm against the project settings.
    CString csRet((LPCTSTR)lpszW, nLen * 2);

    delete[] lpszW;   // allocated with new[], so it must be delete[]
    return csRet;
}
/******************************************************************************/
WideCharToMultiByte是一个功能很强大的函数,它有很多参数,允许你控制转换的许多方面,像代码页、默认字符等等。下面是一个例子:
char* GetAnsicString(const CString &s, UINT nCodePage)
{
int nSize = s.GetLength();
char *pAnsicString = new char[(nSize + 1) * sizeof(TCHAR)];
ZeroMemory(pAnsicString, [(nSize + 1) * sizeof(TCHAR));
WideCharToMultiByte(nCodePage, 0, s, nSize, pAnsicString, (nSize + 1) * sizeof(TCHAR), NULL, NULL);
return pAnsicString;
}
// Example: convert a CString to the ANSI code page.
CString strUnicode = _T("Some test string");
// GetAnsicString returns a buffer allocated with new[]; release it with
// delete[] when it is no longer needed.
char *pAnsicString = GetAnsicString(strUnicode, CP_ACP);
************************************************************
MSDN:
WideCharToMultiByte
The WideCharToMultiByte function maps a wide-character string to a new character string. The new character string is not necessarily from a multibyte character set.
int WideCharToMultiByte(
UINT CodePage, // code page used to perform the conversion
DWORD dwFlags, // flags for handling unmapped and composite characters
LPCWSTR lpWideCharStr, // wide-character string to convert
int cchWideChar, // number of wide chars in string, or -1 if null-terminated
LPSTR lpMultiByteStr, // buffer that receives the translated string
int cbMultiByte, // size of buffer in bytes; 0 returns the required size
LPCSTR lpDefaultChar, // default for unmappable chars; NULL uses a system default
LPBOOL lpUsedDefaultChar // set to TRUE when a default char was used; may be NULL
);
Parameters
CodePage
[in] Specifies the code page used to perform the conversion. This parameter can be given the value of any code page that is installed or available in the system. You can also specify one of the following values. Value Meaning
CP_ACP ANSI code page
CP_MACCP Macintosh code page
CP_OEMCP OEM code page
CP_SYMBOL Windows 2000/XP: Symbol code page (42)
CP_THREAD_ACP Windows 2000/XP: Current thread's ANSI code page
CP_UTF7 Windows 98/Me, Windows NT 4.0 and later: Translate using UTF-7. When this is set, lpDefaultChar and lpUsedDefaultChar must be NULL
CP_UTF8 Windows 98/Me, Windows NT 4.0 and later: Translate using UTF-8. When this is set, dwFlags must be zero and both lpDefaultChar and lpUsedDefaultChar must be NULL.
Windows 95: Under the Microsoft Layer for Unicode, WideCharToMultiByte also supports CP_UTF7 and CP_UTF8.
dwFlags
[in] Specifies the handling of unmapped characters. The function performs more quickly when none of these flags is set. The following flag constants are defined. Value Meaning
WC_NO_BEST_FIT_CHARS Windows 2000/XP: Any Unicode characters that do not translate directly to multibyte equivalents will be translated to the default character (see lpDefaultChar parameter). In other words, if translating from Unicode to multibyte and back to Unicode again does not yield the exact same Unicode character, the default character is used.
This flag may be used by itself or in combination with the other dwFlag options.
WC_COMPOSITECHECK Convert composite characters to precomposed characters.
WC_DISCARDNS Discard nonspacing characters during conversion.
WC_SEPCHARS Generate separate characters during conversion. This is the default conversion behavior.
WC_DEFAULTCHAR Replace exceptions with the default character during conversion.
When WC_COMPOSITECHECK is specified, the function converts composite characters to precomposed characters. A composite character consists of a base character and a nonspacing character, each having different character values. A precomposed character has a single character value for a base/nonspacing character combination. In the character è, the e is the base character, and the accent grave mark is the nonspacing character.
When an application specifies WC_COMPOSITECHECK, it can use the last three flags in this list (WC_DISCARDNS, WC_SEPCHARS, and WC_DEFAULTCHAR) to customize the conversion to precomposed characters. These flags determine the function's behavior when there is no precomposed mapping for a base/nonspace character combination in a wide-character string. These last three flags can only be used if the WC_COMPOSITECHECK flag is set.
The function's default behavior is to generate separate characters (WC_SEPCHARS) for unmapped composite characters.
lpWideCharStr
[in] Points to the wide-character string to be converted.
cchWideChar
[in] Specifies the number of wide characters in the string pointed to by the lpWideCharStr parameter. If this value is –1, the string is assumed to be null-terminated and the length is calculated automatically. The length will include the null-terminator.
lpMultiByteStr
[out] Points to the buffer to receive the translated string.
cbMultiByte
[in] Specifies the size, in bytes, of the buffer pointed to by the lpMultiByteStr parameter. If this value is zero, the function returns the number of bytes required for the buffer. (In this case, the lpMultiByteStr buffer is not used.)
lpDefaultChar
[in] Points to the character used if a wide character cannot be represented in the specified code page. If this parameter is NULL, a system default value is used. The function is faster when both lpDefaultChar and lpUsedDefaultChar are NULL.
If CodePage is either CP_UTF7 or CP_UTF8, this parameter must be NULL.
lpUsedDefaultChar
[in] Points to a flag that indicates whether a default character was used. The flag is set to TRUE if one or more wide characters in the source string cannot be represented in the specified code page. Otherwise, the flag is set to FALSE. This parameter may be NULL. The function is faster when both lpDefaultChar and lpUsedDefaultChar are NULL.
If CodePage is either CP_UTF7 or CP_UTF8, this parameter must be NULL.
Return Values
If the function succeeds, and cbMultiByte is nonzero, the return value is the number of bytes written to the buffer pointed to by lpMultiByteStr. The number includes the byte for the null terminator.
If the function succeeds, and cbMultiByte is zero, the return value is the required size, in bytes, for a buffer that can receive the translated string.
If the function fails, the return value is zero. To get extended error information, call GetLastError. GetLastError may return one of the following error codes:
ERROR_INSUFFICIENT_BUFFER
ERROR_INVALID_FLAGS
ERROR_INVALID_PARAMETER
Remarks
The lpMultiByteStr and lpWideCharStr pointers must not be the same. If they are the same, the function fails, and GetLastError returns ERROR_INVALID_PARAMETER.
If CodePage is CP_SYMBOL and cbMultiByte is less than cchWideChar, no characters are written to lpMultiByte. Otherwise, if cbMultiByte is less than cchWideChar, cbMultiByte characters are copied to the buffer pointed to by lpMultiByte.
An application can use the lpDefaultChar parameter to change the default character used for the conversion.
As noted earlier, the WideCharToMultiByte function operates most efficiently when both lpDefaultChar and lpUsedDefaultChar are NULL. The following table shows the behavior of WideCharToMultiByte for the four combinations of lpDefaultChar and lpUsedDefaultChar.
lpDefaultChar lpUsedDefaultChar Result
NULL NULL No default checking. This is the most efficient way to use this function.
non-NULL NULL Uses the specified default character, but does not set lpUsedDefaultChar.
NULL non-NULL Uses the system default character and sets lpUsedDefaultChar if necessary.
non-NULL non-NULL Uses the specified default character and sets lpUsedDefaultChar if necessary.
Windows 95/98/Me: WideCharToMultiByte is supported by the Microsoft Layer for Unicode. To use this, you must add certain files to your application, as outlined in Microsoft Layer for Unicode on Windows 95/98/Me Systems.
Example Code
For an example, see Looking Up a User's Full Name.
Requirements
Windows NT/2000/XP: Included in Windows NT 3.1 and later.
Windows 95/98/Me: Included in Windows 95 and later.
Header: Declared in Winnls.h; include Windows.h.
Library: Use Kernel32.lib.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// String conversion helpers.
//
// The ...N macros take the destination capacity n explicitly and pass -1 as
// the source length, so the source must be NUL-terminated:
//   _WStr2StrN(s, w, n)  wide   -> char   (n = size of s in bytes)
//   _Str2WStrN(w, s, n)  char   -> wide   (n = size of w in wide characters)
// The _TStr.../...TStr... macros convert to or from TCHAR strings and reduce
// to a bounded copy (lstrcpyn) when TCHAR already has the target width.
#define _WStr2StrN(s, w, n) WideCharToMultiByte(CP_ACP, 0, (w), -1, (s), (n), NULL, NULL)
#define _Str2WStrN(w, s, n) MultiByteToWideChar(CP_ACP, 0, (s), -1, (w), (n))
#ifdef UNICODE
#define _TStr2StrN(s, t, n) _WStr2StrN((s), (t), (n))
#define _TStrWStrN(w, t, n) lstrcpyn((w), (t), (n))
#define _Str2TStrN(t, s, n) _Str2WStrN((t), (s), (n))
#define _WStrTStrN(t, w, n) lstrcpyn((t), (w), (n))
#else // UNICODE
#define _TStr2StrN(s, t, n) lstrcpyn((s), (t), (n))
#define _TStrWStrN(w, t, n) _Str2WStrN((w), (t), (n))
#define _Str2TStrN(t, s, n) lstrcpyn((t), (s), (n))
#define _WStrTStrN(t, w, n) _WStr2StrN((t), (w), (n))
#endif // UNICODE
// Convenience forms that derive the capacity from the destination itself,
// which therefore must be an array and not a pointer.
// NOTE(review): _TSizeOf is not defined in this section, and this block
// previously spelled it both `_TSizeof` and `_TSizeOf`. The spelling is now
// uniform -- confirm it matches the helper's actual definition.
#define _WStr2Str(s, w) _WStr2StrN((s), (w), _TSizeOf(s))
#define _Str2WStr(w, s) _Str2WStrN((w), (s), _TSizeOf(w))
#define _TStr2Str(s, t) _TStr2StrN((s), (t), _TSizeOf(s))
#define _TStr2WStr(w, t) _TStrWStrN((w), (t), _TSizeOf(w))
#define _Str2TStr(t, s) _Str2TStrN((t), (s), _TSizeOf(t))
#define _WStrTStr(t, w) _WStrTStrN((t), (w), _TSizeOf(t))
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
你需要的是:#define _WStr2StrN(s, w, n) WideCharToMultiByte(CP_ACP, 0, w, -1, s, n, NULL, NULL)
要分析URL字符串,并把URL中的UTF8字符串转换成ANSI字符串:
转换各种字符编码,例如转换UTF8字符到ANSI字符方法:
先将UTF8字符转换成UNICODE
用MultiByteToWideChar(CP_UTF8, ...)
然后再将UNICODE转换成ANSI
用WideCharToMultiByte(CP_ACP, ..)
====================================
UNICODE字符串与ANSIC字符串的转换
从ANSIC到UNICODE比较简单
可以用CString的Format函数或默认转换:
// A string literal is read-only, so it is bound to a pointer-to-const.
const char *pAnsicString = "Some test string";
// CString converts the narrow string on construction.
CString strUnicode = pAnsicString;
还有MultiByteToWideChar函数。
从UNICODE到ANSIC也不难:
可以使用wcstombs函数:
/*
 * Converts the wide-character contents of s to a newly allocated,
 * NUL-terminated char string using wcstombs (current C locale).
 *
 * s -- source string; it is passed to wcstombs as the wide-character input,
 *      so this only compiles in a UNICODE build
 *
 * Returns a zero-initialised buffer of (length + 1) * sizeof(TCHAR) bytes
 * allocated with new[]; the caller must release it with delete[].
 * The return value of wcstombs is not inspected: if a character cannot be
 * converted, whatever was written before the failure is returned.
 */
char* GetAnsicString(const CString &s)
{
    const int nSize = (int)((s.GetLength() + 1) * sizeof(TCHAR));
    char *pAnsicString = new char[nSize];
    ZeroMemory(pAnsicString, nSize);
    // Let wcstombs fill at most nSize - 1 bytes: it does not write a
    // terminator when the output limit is reached, so the last (zeroed)
    // byte is reserved to keep the result NUL-terminated.
    wcstombs(pAnsicString, s, nSize - 1);
    return pAnsicString;
}
// Example: convert a CString with the wcstombs-based helper.
CString strUnicode = _T("Some test string");
// GetAnsicString returns a buffer allocated with new[]; release it with
// delete[] when it is no longer needed.
char *pAnsicString = GetAnsicString(strUnicode);
请仔细阅读论坛发帖的要求,不准使用特殊符号。
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
2010-05-12 17:23:56| 分类: C++语法 | 标签: |字号大中小 订阅