1. 了解下这个 API
//z 2014-03-25 08:18:41 IS2120@BG57IV3 T3343244181.K.F1434403198[T1,L68,R2,V15]
// Converts a NUL-terminated wide (UTF-16) string to an ANSI string in
// the system's active code page.
//
// in     - NUL-terminated wide source string (must not be NULL)
// out    - destination buffer; receives the converted bytes incl. the NUL
// cchout - capacity of `out`, in bytes
//
// Terminates the process via ErrorExit() if the conversion fails
// (e.g. destination buffer too small, or an invalid character).
void UnicodeToAnsi(WCHAR *in, char *out, int cchout)
{
// Convert wcslen(in)+1 characters so the terminating NUL is converted
// and written to `out` as well.
int len = WideCharToMultiByte(CP_ACP,
0,
in,
static_cast<int>(wcslen(in)) + 1,
out,
cchout,
NULL,
NULL) ;
if (!len)
// BUGFIX: the original message said "out of memory", but a zero
// return here means the conversion itself failed (most commonly
// an undersized destination buffer), not an allocation failure.
ErrorExit("UnicodeToAnsi: WideCharToMultiByte failed") ;
}
//z 2014-04-14 22:04:51 IS2120@BG57IV3 T1381068076.K.F1547169058[T4,L105,R3,V66]
2. 一个例子,将文件自动转换为 utf-8
// ChangeFileEncoding.cpp : 定义控制台应用程序的入口点。
//
#include "stdafx.h"
#include "ChangeFileEncoding.h"
#include <string>
#ifdef _DEBUG
#define new DEBUG_NEW
#endif
// 唯一的应用程序对象
CWinApp theApp;
using namespace std;
void recursiveFile(CString strFileType);
void convertGBToUTF8(CString strWritePath, const char* gb2312);
// Console entry point: takes one argument (the root directory of a
// source tree) and recursively converts its .h/.cpp files to UTF-8.
int _tmain(int argc, TCHAR* argv[], TCHAR* envp[])
{
int nRetCode = 0;
// Initialize MFC and report an error on failure.
if (!AfxWinInit(::GetModuleHandle(NULL), NULL, ::GetCommandLine(), 0))
{
_tprintf(_T("错误: MFC 初始化失败\n"));
nRetCode = 1;
}
else
{
// BUGFIX: argv[1] used to be dereferenced unconditionally, so
// running the tool without a directory argument read past the
// end of argv. Validate the argument count first.
if (argc != 2)
{
_tprintf(_T("usage: %s dir\n"), argv[0]);
return 1;
}
// The single argument is the root directory of the source tree.
TCHAR *lpszDirName = argv[1];
CString strFileType;
strFileType.Format(_T("%s\\*.*"), lpszDirName);
// Recursively convert every .h/.cpp file that is not UTF-8 yet.
recursiveFile(strFileType);
}
return nRetCode;
}
// Recursively walks the directory pattern `strFileType` (e.g. "dir\\*.*")
// and converts every .cpp/.h file that does not already start with a
// UTF-8 BOM from the active code page (GB2312/ANSI) to UTF-8.
void recursiveFile( CString strFileType)
{
CFileFind finder;
BOOL isFinded = finder.FindFile(strFileType); // locate the first entry
while(isFinded)
{
isFinded = finder.FindNextFile();
if(finder.IsDots())              // skip "." and ".."
continue;
CString strFoundFile = finder.GetFilePath();
if(finder.IsDirectory())         // recurse into subdirectories
{
CString strNextFileType;
strNextFileType.Format(_T("%s\\*.*"), strFoundFile);
recursiveFile(strNextFileType);
}
else if(strFoundFile.Right(4) == _T(".cpp") || strFoundFile.Right(2) == _T(".h"))
{
// Binary mode: the BOM probe and content read must see raw bytes.
CFile fileReader(strFoundFile, CFile::modeRead | CFile::typeBinary);
// Skip files that already carry a UTF-8 BOM.
// BUGFIX: check how many bytes were actually read — a file
// shorter than 3 bytes used to be compared against
// uninitialized stack bytes.
BYTE head[3] = {0, 0, 0};
UINT nHead = fileReader.Read(head, 3);
if(nHead == 3 && head[0] == 0xef && head[1] == 0xbb && head[2] == 0xbf)
{
fileReader.Close();
continue;
}
fileReader.SeekToBegin();
// Slurp the whole file into strContent.
// BUGFIX: the original released a new[]'d buffer with plain
// `delete` (undefined behavior); a stack buffer avoids the
// manual memory management entirely.
const int bufLength = 256;
char buf[bufLength];
UINT nReadLength;
std::string strContent;
while((nReadLength = fileReader.Read(buf, bufLength)) != 0)
{
strContent.append(buf, nReadLength);
}
fileReader.Close();
convertGBToUTF8(strFoundFile, strContent.c_str());
}
}
finder.Close();
}
// Re-writes the file at `strWritePath` as UTF-8 with BOM, converting
// `gb2312` (text in the active code page) to UTF-8 via UTF-16.
void convertGBToUTF8(CString strWritePath, const char* gb2312)
{
// ANSI -> UTF-16 using explicit lengths.
// BUGFIX: the original passed -1 (NUL-terminated) and then wrote
// `len` bytes, which included the terminating NUL in the output
// file; it also zeroed only len+1 *bytes* of the wchar_t buffer and
// poked a stray '\n' past the converted data that was never written.
const int gbLen = static_cast<int>(strlen(gb2312));
int wideLen = MultiByteToWideChar(CP_ACP, 0, gb2312, gbLen, NULL, 0);
std::wstring wstr(wideLen, L'\0');
if (wideLen > 0)
MultiByteToWideChar(CP_ACP, 0, gb2312, gbLen, &wstr[0], wideLen);
// UTF-16 -> UTF-8.
int utf8Len = WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), wideLen, NULL, 0, NULL, NULL);
std::string utf8(utf8Len, '\0');
if (utf8Len > 0)
WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), wideLen, &utf8[0], utf8Len, NULL, NULL);
// Open (and truncate) the destination only after the conversion has
// produced a result, and bail out if the open fails instead of
// writing through an unopened CFile.
CFile fp;
if (!fp.Open(strWritePath, CFile::modeCreate | CFile::modeWrite | CFile::typeBinary, NULL))
return;
const unsigned char aryBOM[] = {0xEF, 0xBB, 0xBF};
fp.Write(aryBOM, sizeof(aryBOM));
fp.Write(utf8.data(), static_cast<UINT>(utf8.size()));
fp.Close();
}
//z 2014-04-14 22:04:51 IS2120@BG57IV3 T1381068076.K.F1547169058[T4,L105,R3,V66]
http://blog.csdn.net/visualcatsharp/article/details/7345854
//z 2014-05-06 12:00:46 L.239'43154 BG57IV3@XCL T1109932947.K.F253293061 [T409,L5358,R263,V7006]
3. v2
// ConvertZ.cpp : 定义控制台应用程序的入口点。
//
#include "stdafx.h"
#include "ConvertZ.h"
#include <string>
using namespace std;
#ifdef _DEBUG
#define new DEBUG_NEW
#endif
// 唯一的应用程序对象
CWinApp theApp;
void recursiveFile(CString strFileType);
void convertGBToUTF8(CString strWritePath, const char* gb2312);
int _tmain(int argc, TCHAR* argv[], TCHAR* envp[])
{
int nRetCode = 0;
// 初始化 MFC 并在失败时显示错误
if (!AfxWinInit(::GetModuleHandle(NULL), NULL, ::GetCommandLine(), 0))
{
// TODO: 更改错误代码以符合您的需要
_tprintf(_T("错误: MFC 初始化失败\n"));
nRetCode = 1;
}
else
{
/*for(int i = 0; i < argc; i++)
{
MessageBox(NULL, argv[i], L"Arglist contents", MB_OK);
}*/
//声明一个CFileFind类变量,以用来搜索
if(argc != 2)
{
CString strUsage;
strUsage.Format(_T("usage : \n %s dir\n dir [sample] : c:\\src\n"),argv[0]);
_tprintf(strUsage.GetBuffer());
strUsage.ReleaseBuffer();
return nRetCode;
}
//接受一个参数作为源代码文件的根目录
TCHAR *lpszDirName = argv[1];
CString strFileType;
strFileType.Format(_T("%s\\*.*"), lpszDirName);
//递归此目录下的.h文件和.cpp文件,如果发现不是utf8编码则转换为utf8编码
recursiveFile(strFileType);
}
return nRetCode;
}
bool isSrcType(const CString strFileType)
{
CString strExt_R4 = strFileType.Right(4);
CString strExt_R2 = strFileType.Right(2);
if ((strExt_R4.CompareNoCase(_T(".cpp")) == 0)
|| (strExt_R2.CompareNoCase(_T(".c")) == 0)
|| (strExt_R2.CompareNoCase(_T(".h")) == 0)
|| (strExt_R4.CompareNoCase(_T(".cxx")) == 0)
|| (strExt_R4.CompareNoCase(_T(".hpp")) == 0)
)
{
return true;
}
return false;
}
// Recursively walks the directory pattern `strFileType` (e.g. "dir\\*.*")
// and converts every recognized source file (see isSrcType) that does
// not already start with a UTF-8 BOM from the active code page to UTF-8.
void recursiveFile( CString strFileType)
{
CFileFind finder;
BOOL isFinded = finder.FindFile(strFileType); // locate the first entry
while(isFinded)
{
isFinded = finder.FindNextFile();
if(finder.IsDots())              // skip "." and ".."
continue;
CString strFoundFile = finder.GetFilePath();
if(finder.IsDirectory())         // recurse into subdirectories
{
CString strNextFileType;
strNextFileType.Format(_T("%s\\*.*"), strFoundFile);
recursiveFile(strNextFileType);
}
else if(isSrcType(strFoundFile))
{
// Binary mode: the BOM probe and content read must see raw bytes.
CFile fileReader(strFoundFile, CFile::modeRead | CFile::typeBinary);
// Skip files that already carry a UTF-8 BOM.
// BUGFIX: verify 3 bytes were actually read — a file shorter
// than 3 bytes used to be compared against uninitialized bytes.
BYTE head[3] = {0, 0, 0};
UINT nHead = fileReader.Read(head, 3);
if(nHead == 3 && head[0] == 0xef && head[1] == 0xbb && head[2] == 0xbf)
{
fileReader.Close();
continue;
}
fileReader.SeekToBegin();
// Slurp the whole file into strContent.
// BUGFIX: the original released a new[]'d buffer with plain
// `delete` (undefined behavior); a stack buffer avoids the
// manual memory management entirely.
const int bufLength = 256;
char buf[bufLength];
UINT nReadLength;
std::string strContent;
while((nReadLength = fileReader.Read(buf, bufLength)) != 0)
{
strContent.append(buf, nReadLength);
}
fileReader.Close();
convertGBToUTF8(strFoundFile, strContent.c_str());
}
}
finder.Close();
}
// Re-writes the file at `strWritePath` as UTF-8 with BOM, converting
// `gb2312` (text in the active code page) to UTF-8 via UTF-16.
void convertGBToUTF8(CString strWritePath, const char* gb2312)
{
// ANSI -> UTF-16 with explicit lengths, so no NUL terminator ends up
// in the output. std::wstring/std::string replace the manually
// new[]/delete[]-managed buffers (no leak on early exit).
const int ngblen = static_cast<int>(strlen(gb2312));
const int len = MultiByteToWideChar(CP_ACP, 0, gb2312, ngblen, NULL, 0);
std::wstring wstr(len, L'\0');
if (len > 0)
MultiByteToWideChar(CP_ACP, 0, gb2312, ngblen, &wstr[0], len);
// UTF-16 -> UTF-8.
const int newLen = WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), len, NULL, 0, NULL, NULL);
std::string utf8(newLen, '\0');
if (newLen > 0)
WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), len, &utf8[0], newLen, NULL, NULL);
// BUGFIX: the file used to be created (truncated) before conversion
// with the Open() result ignored — a failed open crashed on Write,
// and a failed conversion still destroyed the original file.
// Open only after the conversion produced a result, and check it.
CFile fp;
if (!fp.Open(strWritePath, CFile::modeCreate | CFile::modeWrite | CFile::typeBinary, NULL))
return;
const unsigned char aryBOM[] = {0xEF, 0xBB, 0xBF};
fp.Write(aryBOM, sizeof(aryBOM));
fp.Write(utf8.data(), static_cast<UINT>(utf8.size()));
fp.Close();
}
//z 2014-05-22 16:55:50 L.223'25450 BG57IV3 T427209771 .K.F253293061 [T484,L6693,R325,V8206]
Introduction
One very commonly asked question in programming is how to detect the character encoding of a string. Well, I'm going to share a cool method I came up with that can detect whether a string is UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, or UTF-32LE in just 4 lines of code.
Explanation
We'll be working with null-terminated strings, so the first rule is that we must terminate all strings with a quadruple null, regardless of encoding. You may wish to add a definition such as the following:
Collapse
| Copy Code
#define NT "\0\0\0"
char *exampleString = "This is UTF-8" NT;
Next is an explanation of how the checking works.
Collapse
| Copy Code
1.===== If a string doesn't contain nulls, it's UTF-8
:
else
:
2:===== If a string doesn't contain double nulls, it's UTF-16
:--.
: 3:== If the nulls are on odd numbered indices, it's UTF-16LE
: :
: else
: :
: 4'== The string defaults to UTF-16BE
:
else
:
5:===== If the index modulo 4 is 0 and the character is greater than
: 0x7F, the string is UTF-32LE. This is because the range of
: UTF-32 only goes up to 0x7FFFFFFF, meaning approximately 22%
: of the characters that can be represented will validate that
: the string is not big endian; including a BOM.
:
else
:
6'===== The string defaults to UTF-32BE
The Code
We check every byte until we reach a quadruple null:
Collapse
| Copy Code
// Guesses the encoding of a string that the caller has terminated with
// at least FOUR NUL bytes (see the article's "quadruple null" rule).
// Returns: 0 = UTF-8, 1 = UTF-16BE, 2 = UTF-16LE, 3 = UTF-32BE, 4 = UTF-32LE.
// NOTE(review): heuristic from the surrounding article; it admits the
// UTF-32 byte-order guess is unreliable without a BOM.
int String_GetEncoding(char *string)
{
// Flag bits accumulated while scanning:
//   bit 0: some NUL byte seen                     -> not UTF-8
//   bit 1: a NUL at an odd 0-based index          -> little-endian UTF-16
//   bit 2: two consecutive NULs                   -> UTF-32 family
//   bit 3: a byte > 0x7F seen while bits 0-1 are
//          still clear                            -> UTF-32LE (per article)
unsigned c, i = 0, flags = 0;
// Loop until four consecutive NUL bytes (the quadruple terminator).
// Each iteration: non-NUL byte may set bit 3; a NUL byte sets bit 0,
// conditionally bit 1 (odd index; note i was already incremented) and
// bit 2 (next byte is also NUL).
while (string[i] | string[i + 1] | string[i + 2] | string[i + 3])
flags = (c = string[i++]) ? flags | ((!(flags % 4) &&
c > 0x7F) << 3) : flags | 1 | (!(i & 1) << 1)
| ((string[i] == 0) << 2);
// The count of set flag bits maps onto the 0..4 result codes above.
return (flags & 1) + ((flags & 2) != 0) +
((flags & 4) != 0) + ((flags & 8) != 0);
}
The output:
Collapse
| Copy Code
0 = UTF-8
1 = UTF-16BE
2 = UTF-16LE
3 = UTF-32BE
4 = UTF-32LE
Notes
Since UTF-32 encoding can contain several null bytes, its byte order checking is done through an alternative method that doesn't work 100% of the time, e.g., if all the characters are within the ASCII range and there isn't a BOM, it'll return UTF-32BE when it might actually be UTF-32LE.
This isn't really a big issue since UTF-32 is never used for storage, so chances are anyone that might use it will already know the byte ordering without having to check. However, if you're OCD, you could perform an additional check by treating UTF-32BE as UTF-16 and determining that string's byte ordering.
License
This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)
About the Author
Ghosuwa Wogomon
United States
IsTextUnicode function
Determines if a buffer is likely to contain a form of Unicode text.
Syntax
BOOL IsTextUnicode(
_In_ const VOID *lpv,
_In_ int iSize,
_Inout_opt_ LPINT lpiResult
);
Detect encoding of a string in C/C++
Assuming you know the length of the input array, you can make the following guesses:
- First, check to see if the first few bytes match any well know byte order marks (BOM) for Unicode. If they do, you're done!
- Next, search for '\0' before the last byte. If you find one, you might be dealing with UTF-16 or UTF-32. If you find multiple consecutive '\0's, it's probably UTF-32.
- If any character is from 0x80 to 0xff, it's certainly not ASCII or UTF-7. If you are restricting your input to some variant of Unicode, you can assume it's UTF-8. Otherwise, you have to do some guessing to determine which multi-byte character set it is. That will not be fun.
- At this point it is either: ASCII, UTF-7, Base64, or ranges of UTF-16 or UTF-32 that just happen to not use the top bit and do not have any null characters.
|
answered
Sep 23 '11 at 1:42
MSN
29.8k
2
36
61
|
It's not an easy problem to solve, and generally relies on heuristics to take a best guess at what the input encoding is, which can be tripped up by relatively innocuous inputs - for example, take a look at this Wikipedia article and The Notepad file encoding Redux for more details.
If you're looking for a Windows-only solution with minimal dependencies, you can look at using a combination of IsTextUnicode and MLang's DetectInputCodePage to attempt character set detection.
If you are looking for portability, but don't mind taking on a fairly large dependency in the form of ICU then you can make use of its character set detection routines to achieve the same thing in a portable manner.
|
answered
Sep 23 '11 at 1:49
russw_uk
531
3
4
|
The Notepad file encoding problem, redux
17 Apr 2007 10:00 AM
About every ten months, somebody new discovers the Notepad file encoding problem. Let's see what else there is to say about it.
First of all, can we change Notepad's detection algorithm? The problem is that there are a lot of different text files out there. Let's look just at the ones that Notepad supports.
- 8-bit ANSI (of which 7-bit ASCII is a subset). These have no BOM; they just dive right in with bytes of text. They are also probably the most common type of text file.
- UTF-8. These usually begin with a BOM but not always.
- Unicode big-endian (UTF-16BE). These usually begin with a BOM but not always.
- Unicode little-endian (UTF-16LE). These usually begin with a BOM but not always.
If a BOM is found, then life is easy, since the BOM tells you what encoding the file uses. The problem is when there is no BOM. Now you have to guess, and when you guess, you can guess wrong. For example, consider this file:
D0 AE
Depending on which encoding you assume, you get very different results.
- If you assume 8-bit ANSI (with code page 1252), then the file consists of the two characters
U+00D0 U+00AE
, or "Ð®" (code page 1252 maps D0 to Ð and AE to ®; the original text was garbled here). Sure this looks strange, but maybe it's part of the word VATNIÐ® which might be the name of an Icelandic hotel.
- If you assume UTF-8, then the file consists of the single Cyrillic character
U+042E
, or "Ю".
- If you assume Unicode big-endian, then the file consists of the Korean Hangul syllable
U+D0AE
, or "킮".
- If you assume Unicode little-endian, then the file consists of the Korean Hangul syllable
U+AED0
, or "껐".
Okay, so this file can be interpreted in four different ways. Are you going to use the "try to guess" algorithm from IsTextUnicode
? (Michael Kaplan has some thoughts on this subject.) If so, then you are right where Notepad is today. Notice that all four interpretations are linguistically plausible.
Some people might say that the rule should be "All files without a BOM are 8-bit ANSI." In that case, you're going to misinterpret all the files that use UTF-8 or UTF-16 and don't have a BOM. Note that the Unicode standard even advises against using a BOM for UTF-8, so you're already throwing out everybody who follows the recommendation.
Okay, given that the Unicode folks recommend against using a BOM for UTF-8, maybe your rule is "All files without a BOM are UTF-8." Well, that messes up all 8-bit ANSI files that use characters above 127.
Maybe you're willing to accept that ambiguity, and use the rule, "If the file looks like valid UTF-8, then use UTF-8; otherwise use 8-bit ANSI, but under no circumstances should you treat the file as UTF-16LE or UTF-16BE." In other words, "never auto-detect UTF-16". First, you still have ambiguous cases, like the file above, which could be either 8-bit ANSI or UTF-8. And second, you are going to be flat-out wrong when you run into a Unicode file that lacks a BOM, since you're going to misinterpret it as either UTF-8 or (more likely) 8-bit ANSI. You might decide that programs that generate UTF-16 files without a BOM are broken, but that doesn't mean that they don't exist. For example,
cmd /u /c dir >results.txt
This generates a UTF-16LE file without a BOM. If you poke around your Windows directory, you'll probably find other Unicode files without a BOM. (For example, I found COM+.log
.) These files still "worked" under the old IsTextUnicode
algorithm, but now they are unreadable. Maybe you consider that an acceptable loss.
The point is that no matter how you decide to resolve the ambiguity, somebody will win and somebody else will lose. And then people can start experimenting with the "losers" to find one that makes your algorithm look stupid for choosing "incorrectly".
//////////////////////////////////////////////////////////////////////////
//
// FILE: utf8conv.h
//
// Header file defining helper functions for converting strings
// between Unicode UTF-8 and UTF-16.
//
// UTF-8 is stored in std::string; UTF-16 is stored in std::wstring.
//
// This code just uses Win32 Platform SDK and C++ standard library;
// so it can be used also with the Express editions of Visual Studio.
//
//
// February 4th, 2011
//
// by Giovanni Dicanio <[email protected]>
//
//////////////////////////////////////////////////////////////////////////
#pragma once
//------------------------------------------------------------------------
// INCLUDES
//------------------------------------------------------------------------
#include <stdarg.h> // variable argument lists...
#include <stdio.h> // ...and vsprintf_s
#include <exception> // std::exception
#include <string> // STL string classes
#include <Windows.h> // Win32 Platform SDK main header
namespace utf8util {
//------------------------------------------------------------------------
// Exception class representing an error occurred during UTF-8 conversion.
//------------------------------------------------------------------------
// Exception thrown on UTF-8 <-> UTF-16 conversion failures. Carries a
// printf-style formatted message in a fixed-size internal buffer.
class utf8_error
    : public std::exception
{
public:

    // Constructs an utf8_error with a message string that can use a
    // printf-like syntax for formatting.
    explicit utf8_error(const char * format, ...);

    // Override from std::exception::what().
    // BUGFIX: must be noexcept — std::exception::what() is declared
    // noexcept since C++11, and overriding it with a looser exception
    // specification is ill-formed.
    const char * what() const noexcept override;

    //
    // IMPLEMENTATION
    //
private:
    char m_message[512];    // buffer for error message
};

inline utf8_error::utf8_error(const char * format, ...)
{
    // Format the error message into the fixed-size buffer.
    // BUGFIX/portability: vsnprintf (standard C/C++) replaces the
    // MSVC-only vsprintf_s, matching the header's stated goal of using
    // only the Platform SDK and the standard library. Output is always
    // NUL-terminated and truncated to fit the buffer.
    va_list args;
    va_start(args, format);
    vsnprintf(m_message, sizeof(m_message), format, args);
    va_end(args);
}

inline const char * utf8_error::what() const noexcept
{
    return m_message;
}
//------------------------------------------------------------------------
//------------------------------------------------------------------------
// Converts a string from UTF-8 to UTF-16.
// On error, can throw an utf8_error exception.
//------------------------------------------------------------------------
inline std::wstring utf16_from_utf8(const std::string & utf8)
{
    //
    // Special case of empty input string
    //
    if (utf8.empty())
        return std::wstring();

    // Explicit int length: MultiByteToWideChar takes int, and passing
    // size_t (string::length) directly is a narrowing conversion.
    const int utf8_length = static_cast<int>(utf8.length());

    //
    // Get length (in wchar_t's) of resulting UTF-16 string
    //
    const int utf16_length = ::MultiByteToWideChar(
        CP_UTF8,        // convert from UTF-8
        0,              // default flags
        utf8.data(),    // source UTF-8 string
        utf8_length,    // length (in chars) of source UTF-8 string
        NULL,           // unused - no conversion done in this step
        0               // request size of destination buffer, in wchar_t's
        );
    if (utf16_length == 0)
    {
        // Error
        DWORD error = ::GetLastError();
        throw utf8_error(
            "Can't get length of UTF-16 string (MultiByteToWideChar set last error to %lu).",
            error);
    }

    //
    // Allocate destination buffer for UTF-16 string
    //
    std::wstring utf16;
    utf16.resize(utf16_length);

    //
    // Do the conversion from UTF-8 to UTF-16
    //
    if ( ! ::MultiByteToWideChar(
        CP_UTF8,        // convert from UTF-8
        0,              // default flags
        utf8.data(),    // source UTF-8 string
        utf8_length,    // length (in chars) of source UTF-8 string
        &utf16[0],      // destination buffer
        utf16_length    // size of destination buffer, in wchar_t's
        ) )
    {
        // Error
        DWORD error = ::GetLastError();
        throw utf8_error(
            "Can't convert string from UTF-8 to UTF-16 (MultiByteToWideChar set last error to %lu).",
            error);
    }

    //
    // Return resulting UTF-16 string
    //
    return utf16;
}
//------------------------------------------------------------------------
// Converts a string from UTF-16 to UTF-8.
// On error, can throw an utf8_error exception.
//------------------------------------------------------------------------
inline std::string utf8_from_utf16(const std::wstring & utf16)
{
    //
    // Special case of empty input string
    //
    if (utf16.empty())
        return std::string();

    // Explicit int length: WideCharToMultiByte takes int, and passing
    // size_t (wstring::length) directly is a narrowing conversion.
    const int utf16_length = static_cast<int>(utf16.length());

    //
    // Get length (in chars) of resulting UTF-8 string
    //
    const int utf8_length = ::WideCharToMultiByte(
        CP_UTF8,        // convert to UTF-8
        0,              // default flags
        utf16.data(),   // source UTF-16 string
        utf16_length,   // source string length, in wchar_t's
        NULL,           // unused - no conversion required in this step
        0,              // request buffer size
        NULL, NULL      // unused
        );
    if (utf8_length == 0)
    {
        // Error
        DWORD error = ::GetLastError();
        throw utf8_error(
            "Can't get length of UTF-8 string (WideCharToMultiByte set last error to %lu).",
            error);
    }

    //
    // Allocate destination buffer for UTF-8 string
    //
    std::string utf8;
    utf8.resize(utf8_length);

    //
    // Do the conversion from UTF-16 to UTF-8
    //
    if ( ! ::WideCharToMultiByte(
        CP_UTF8,        // convert to UTF-8
        0,              // default flags
        utf16.data(),   // source UTF-16 string
        utf16_length,   // source string length, in wchar_t's
        &utf8[0],       // destination buffer
        utf8_length,    // destination buffer size, in chars
        NULL, NULL      // unused
        ) )
    {
        // Error
        DWORD error = ::GetLastError();
        throw utf8_error(
            "Can't convert string from UTF-16 to UTF-8 (WideCharToMultiByte set last error to %lu).",
            error);
    }

    //
    // Return resulting UTF-8 string
    //
    return utf8;
}
} // namespace utf8util
//////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////
//
// FILE: TestUTF8Conversion.cpp
//
// Defines the entry point for the console test application.
//
// By Giovanni Dicanio <[email protected]>
//
//////////////////////////////////////////////////////////////////////////
#include "stdafx.h" // precompiled headers
#include "utf8conv.h" // UTF-8 conversion helpers
using namespace std;
using namespace utf8util;
//------------------------------------------------------------------------
// Some tests for UTF-8 <-> UTF-16 conversion.
//------------------------------------------------------------------------
void test()
{
//
// Test a simple UTF-16 <-> UTF-8 conversion
//
// Source UTF-16 string
wstring utf16(L"Euro sign (U+20AC): \x20AC");
// Convert from UTF-16 to UTF-8
string utf8 = utf8_from_utf16(utf16);
// Convert back from UTF-8 to UTF-16
wstring utf16_new = utf16_from_utf8(utf8);
// Check conversion result
if (utf16_new != utf16)
throw runtime_error("UTF-16 <-> UTF-8 conversion failed.");
//
// Test with empty strings
//
if (! utf16_from_utf8("").empty())
throw runtime_error("Empty UTF-8 string not converted to empty UTF-16 string.");
if (! utf8_from_utf16(L"").empty())
throw runtime_error("Empty UTF-16 string not converted to empty UTF-8 string.");
//
// Test with invalid UTF-8 bytes
//
// 0xC0 0xAF UTF-8 sequence is discussed in "Writing Secure Code"
// (Chapter 11, "How UTF-8 Encodes Data", page 380)
char utf8_invalid[] = "UTF-8 invalid sequence: \xC0\xAF";
wstring utf16_invalid = utf16_from_utf8(utf8_invalid);
//
// Unicode UTF-16 'REPLACEMENT CHARACTER' (U+FFFD)
// is used for the invalid UTF-8 bytes.
//
// http://www.fileformat.info/info/unicode/char/fffd/index.htm
//
}
//------------------------------------------------------------------------
// Entry-point.
//------------------------------------------------------------------------
int wmain(int argc, wchar_t* argv[])
{
    // Exit codes: 0 on success, 1 when any test throws.
    static const int ok = 0;
    static const int fail = 1;

    try
    {
        cout << "*** Testing UTF-8 <-> UTF-16 Conversion ***" << endl;
        test();
        cout << "All right." << endl;
        return ok;
    }
    catch(const exception & e)
    {
        // Report the failure on stderr and signal it via the exit code.
        cerr << "*** ERROR: " << e.what() << endl;
        return fail;
    }
}
//////////////////////////////////////////////////////////////////////////