最近在考虑写一个可以跨平台的通用字符串类,首先需要搞定的就是编码转换问题。
VS(Visual Studio)默认使用本地代码页(code page)保存代码文件(简体中文系统下为GBK,日文系统下为Shift-JIS),也可以使用带BOM的UTF-8。
gcc则是UTF-8,有无BOM均可(源代码的字符集可以由参数-finput-charset指定)。
那么源代码可以采用带BOM的UTF-8来保存。而windows下的unicode是UTF-16编码;linux则使用UTF-8或UTF-32。因此不论在哪种系统里,程序在处理字符串时都需要考虑UTF编码之间的相互转换。
下面直接贴出算法代码。算法上我借鉴了秦建辉(http://blog.csdn.net/jhqin)的UnicodeConverter,只是在外面增加了一些泛型处理,让使用相对简单。
核心算法(来自UnicodeConverter):
namespace transform
{
/*
    UTF-32 to UTF-8

    Encodes the single code point `src` as a UTF-8 byte sequence.
    src : code point to encode; 0 is treated as "nothing to encode".
    des : output buffer, or a null pointer to only measure the length.
          No terminator is written.
    Returns the length of the sequence in bytes (1..6), or 0 when src is 0
    or is 0x80000000 or larger.
    The legacy 5- and 6-byte forms (values above 0x1FFFFF) are still produced.
*/
inline static size_t utf(uint32 src, uint8* des)
{
    if (src == 0) return 0;
    // Lead-byte marker for a sequence of (index + 1) bytes.
    static const uint8 PREFIX[] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
    // Exclusive upper bound of the values that fit in (index + 1) bytes.
    static const uint32 CODE_UP[] =
    {
        0x80,       // U+00000000 - U+0000007F
        0x800,      // U+00000080 - U+000007FF
        0x10000,    // U+00000800 - U+0000FFFF
        0x200000,   // U+00010000 - U+001FFFFF
        0x4000000,  // U+00200000 - U+03FFFFFF
        0x80000000  // U+04000000 - U+7FFFFFFF
    };
    size_t i, len = sizeof(CODE_UP) / sizeof(uint32);
    for (i = 0; i < len; ++i)
        if (src < CODE_UP[i]) break;
    if (i == len) return 0; // the src is invalid
    len = i + 1;
    if (des)
    {
        // Fill the continuation bytes from the last one backwards,
        // consuming 6 payload bits per byte.
        for (; i > 0; --i)
        {
            des[i] = static_cast<uint8>((src & 0x3F) | 0x80);
            src >>= 6;
        }
        // The remaining high bits share the lead byte with the length marker.
        des[0] = static_cast<uint8>(src | PREFIX[len - 1]);
    }
    return len;
}
/*
UTF-8 to UTF-32
*/
inline static size_t utf(const uint8* src, uint32& des)
{
if (!src || (*src) == 0) return 0;
uint8 b = *(src++);
if (b < 0x80)
{
des = b;
return 1;
}
if (b < 0xC0 || b > 0xFD) return 0; // the src is invalid
size_t len;
if (b < 0xE0)
{
des = b & 0x1F;
len = 2;
}
else
if (b < 0xF0)
{
des = b & 0x0F;
len = 3;
}
else
if (b < 0xF8)
{
des = b & 0x07;
len = 4;
}
else
if (b < 0xFC)
{
des = b & 0x03;
len = 5;
}
else
{
des = b & 0x01;
len = 6;
}
size_t i = 1;
for (; i < len; ++i)
{
b = *(src++);
if (b < 0x80 || b > 0xBF) return 0; // the src is invalid
des = (des << 6) + (b & 0x3F);
}
return len;
}
/*
    UTF-32 to UTF-16

    Encodes the single code point `src` as one or two UTF-16 units.
    src : code point to encode; 0 is treated as "nothing to encode".
    des : output buffer with room for 2 units, or a null pointer to only
          measure the length. No terminator is written.
    Returns 1 for values up to 0xFFFF, 2 (a surrogate pair) for values up
    to 0x10FFFF, and 0 when src is 0 or above 0x10FFFF.
*/
inline static size_t utf(uint32 src, uint16* des)
{
    if (src == 0) return 0;
    if (src <= 0xFFFF)
    {
        if (des) (*des) = static_cast<uint16>(src);
        return 1;
    }
    // Everything up to U+10FFFF is representable as a surrogate pair.
    if (src <= 0x10FFFF)
    {
        if (des)
        {
            // (src >> 10) - 0x40 is the same as (src - 0x10000) >> 10.
            des[0] = static_cast<uint16>(0xD800 + (src >> 10) - 0x40); // high
            des[1] = static_cast<uint16>(0xDC00 + (src & 0x03FF));     // low
        }
        return 2;
    }
    return 0; // the src is invalid
}
/*
UTF-16 to UTF-32
*/
inline static size_t utf(const uint16* src, uint32& des)
{
if (!src || (*src) == 0) return 0;
uint16 w1 = src[0];
if (w1 >= 0xD800 && w1 <= 0xDFFF)
{
if (w1 < 0xDC00)
{
uint16 w2 = src[1];
if (w2 >= 0xDC00 && w2 <= 0xDFFF)
{
des = (w2 & 0x03FF) + (((w1 & 0x03FF) + 0x40) << 10);
return 2;
}
}
return 0; // the src is invalid
}
else
{
des = w1;
return 1;
}
}
}
namespace transform
{
/*
    UTF-16 to UTF-8

    Encodes the single UTF-16 unit `src` as UTF-8.
    des : output buffer, or a null pointer to only measure the length.
    Returns the number of UTF-8 bytes, or 0 when src is 0 or a surrogate
    (a lone surrogate does not form a code point).
*/
inline static size_t utf(uint16 src, uint8* des)
{
    // make utf-16 to utf-32
    // The decoder looks at the following unit when it sees a high
    // surrogate, so hand it a zero-terminated copy rather than the
    // address of the lone parameter.
    const uint16 unit[2] = { src, 0 };
    uint32 tmp;
    if (utf(unit, tmp) != 1) return 0;
    // make utf-32 to utf-8
    return utf(tmp, des);
}
/*
    UTF-8 to UTF-16

    Decodes one UTF-8 sequence at `src` into the single UTF-16 unit `des`.
    Returns the number of UTF-8 bytes consumed, or 0 when the sequence is
    invalid or the code point does not fit into one UTF-16 unit.
*/
inline static size_t utf(const uint8* src, uint16& des)
{
    // make utf-8 to utf-32
    uint32 tmp;
    size_t len = utf(src, tmp);
    if (len == 0) return 0;
    // A code point above U+FFFF needs a surrogate pair; des holds only one
    // unit, so reject it before the encoder can write a second unit.
    if (tmp > 0xFFFF) return 0;
    // make utf-32 to utf-16
    if (utf(tmp, &des) != 1) return 0;
    return len;
}
}
namespace transform
{
/*
    UTF-X: string to string
*/
/*
    UTF-32 to UTF-X(8/16)

    Converts the zero-terminated UTF-32 string `src`.
    des : output buffer, or a null pointer to only count the output units.
    Returns the number of output units produced, not counting the
    terminator. Conversion stops at the first code point that cannot be
    encoded. When des is non-null a 0 terminator is appended, so the
    buffer needs room for the returned count plus one.
    A null or empty src returns 0 without touching des.
*/
template <typename T>
size_t utf(const uint32* src, T* des)
{
    if (!src || (*src) == 0) return 0;
    size_t num = 0;
    for (; *src; ++src)
    {
        size_t len = utf(*src, des);
        if (len == 0) break;
        if (des) des += len;
        num += len;
    }
    if (des) (*des) = 0;
    return num;
}
template
size_t utf(const T* src, uint32* des) // UTF-X(8/16) to UTF-32
{
if (!src || (*src) == 0) return 0;
size_t num = 0;
while(*src)
{
uint32 tmp;
size_t len = utf(src, tmp);
if (len == 0) break;
if (des)
{
(*des) = tmp;
++des;
}
src += len;
num += 1;
}
if (des) (*des) = 0;
return num;
}
template
size_t utf(const T* src, U* des) // UTF-X(8/16) to UTF-Y(16/8)
{
if (!src || (*src) == 0) return 0;
size_t num = 0;
while(*src)
{
// make utf-x to ucs4
uint32 tmp;
size_t len = utf(src, tmp);
if (len == 0) break;
src += len;
// make ucs4 to utf-y
len = utf(tmp, des);
if (len == 0) break;
if (des) des += len;
num += len;
}
if (des) (*des) = 0;
return num;
}
}
// Two-pass conversion of a UTF-8 literal into a wchar_t buffer:
// the first call (null destination) only counts the output units,
// the second call writes them.
const uint8* c = (const uint8*)"こんにちわ、世界";
size_t n = (sizeof(wchar_t) == 2) ?
    transform::utf(c, (uint16*)0) :
    transform::utf(c, (uint32*)0);
// The converters append a 0 terminator that n does not include, so reserve
// one extra element; value-initialize so the buffer is terminated even
// when nothing gets converted.
wchar_t* s = new wchar_t[n + 1]();
if (sizeof(wchar_t) == 2)
    transform::utf(c, (uint16*)s);
else
    transform::utf(c, (uint32*)s);
显然这里可以通过泛型来让算法更好用。
首先,需要被抽离出来的就是参数的类型大小和类型本身的依赖关系:
template struct utf_type;
template <> struct utf_type<1> { typedef uint8 type_t; };
template <> struct utf_type<2> { typedef uint16 type_t; };
template <> struct utf_type<4> { typedef uint32 type_t; };
template
struct check
{
static const bool value =
((sizeof(T) == sizeof(typename utf_type::type_t)) && !is_pointer::value);
};
type_t utf(T src, U* des)
type_t utf(const T* src, U* des)
type_t utf(const T* src, U& des)
type_t utf(const T* src, U* des)
/*
    Primary template. X and Y are the code-unit sizes (in bytes) of the
    source and the destination; the two defaulted flags select the
    specialization: (X > Y) picks the direction, (X != Y) is false when no
    conversion is needed.
*/
template <size_t X, size_t Y, bool = (X > Y), bool = (X != Y)>
struct detail;
/*
UTF-X(32/16) to UTF-Y(16/8)
*/
template
struct detail
{
typedef typename utf_type::type_t src_t;
typedef typename utf_type::type_t des_t;
template
static typename enable_if::value && check::value,
size_t>::type_t utf(T src, U* des)
{
return transform::utf((src_t)(src), (des_t*)(des));
}
template
static typename enable_if::value,
size_t>::type_t utf(T src)
{
return transform::utf((src_t)(src), (des_t*)(0));
}
template
static typename enable_if::value && check::value,
size_t>::type_t utf(const T* src, U* des)
{
return transform::utf((const src_t*)(src), (des_t*)(des));
}
template
static typename enable_if::value,
size_t>::type_t utf(const T* src)
{
return transform::utf((src_t)(src), (des_t*)(0));
}
};
/*
UTF-X(16/8) to UTF-Y(32/16)
*/
template
struct detail
{
typedef typename utf_type::type_t src_t;
typedef typename utf_type::type_t des_t;
template
static typename enable_if::value && check::value,
size_t>::type_t utf(const T* src, U& des)
{
des_t tmp; // for disable the warning strict-aliasing from gcc 4.4
size_t ret = transform::utf((const src_t*)(src), tmp);
des = tmp;
return ret;
}
template
static typename enable_if::value && check::value,
size_t>::type_t utf(const T* src, U* des)
{
return transform::utf((const src_t*)(src), (des_t*)(des));
}
template
static typename enable_if::value,
size_t>::type_t utf(const T* src)
{
return transform::utf((const src_t*)(src), (des_t*)(0));
}
};
template
struct converter
: detail
{};
const char* c = "こんにちわ、世界";
wstring s;
size_t n; wchar_t w;
while (!!(n = converter::utf(c, w))) // 这里的!!是为了屏蔽gcc的警告
{
s.push_back(w);
c += n;
}
FILE* fp = fopen("test_converter.txt", "wb");
fwrite(s.c_str(), sizeof(wchar_t), s.length(), fp);
fclose(fp);
上面这一小段代码是将一段UTF-8的文字逐字符转换为wchar_t,并一个个push_back到wstring里,最后把转换完毕的字符串输出到test_converter.txt里。
其实上面的泛型还是显得累赘了。为什么不直接在transform::utf上使用泛型参数呢?
一开始只想到上面那个方法,自然是由于惯性的想要手动指定如何转换编码的缘故,比如最开始的想法,是想做成类似这样的模板:utf<8, 32>(s1, s2),指定两个数字,来决定输入和输出的格式。
后来发现,直接指定字符串/字符的类型或许更加直接些。
现在回头再看看,其实转换所需要的字长(8、16、32)已经在参数的类型中指定了:8 bits的char或byte类型肯定不会用来存放UTF-32。
所以只需要把上面核心算法的参数泛型化就可以了。这时代码就会写成下面这个样子:
namespace transform
{
namespace private_
{
template struct utf_type;
template <> struct utf_type<1> { typedef uint8 type_t; };
template <> struct utf_type<2> { typedef uint16 type_t; };
template <> struct utf_type<4> { typedef uint32 type_t; };
template
struct check
{
static const bool value =
((sizeof(T) == sizeof(typename utf_type::type_t)) && !is_pointer::value);
}
}
using namespace transform::private_;
/*
UTF-32 to UTF-8
*/
template
typename enable_if::value && check::value,
size_t>::type_t utf(T src, U* des)
{
if (src == 0) return 0;
static const byte PREFIX[] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
static const uint32 CODE_UP[] =
{
0x80, // U+00000000 - U+0000007F
0x800, // U+00000080 - U+000007FF
0x10000, // U+00000800 - U+0000FFFF
0x200000, // U+00010000 - U+001FFFFF
0x4000000, // U+00200000 - U+03FFFFFF
0x80000000 // U+04000000 - U+7FFFFFFF
};
size_t i, len = sizeof(CODE_UP) / sizeof(uint32);
for(i = 0; i < len; ++i)
if (src < CODE_UP[i]) break;
if (i == len) return 0; // the src is invalid
len = i + 1;
if (des)
{
for(; i > 0; --i)
{
des[i] = static_cast((src & 0x3F) | 0x80);
src >>= 6;
}
des[0] = static_cast(src | PREFIX[len - 1]);
}
return len;
}
/*
UTF-8 to UTF-32
*/
template
typename enable_if::value && check::value,
size_t>::type_t utf(const T* src, U& des)
{
if (!src || (*src) == 0) return 0;
uint8 b = *(src++);
if (b < 0x80)
{
des = b;
return 1;
}
if (b < 0xC0 || b > 0xFD) return 0; // the src is invalid
size_t len;
if (b < 0xE0)
{
des = b & 0x1F;
len = 2;
}
else
if (b < 0xF0)
{
des = b & 0x0F;
len = 3;
}
else
if (b < 0xF8)
{
des = b & 0x07;
len = 4;
}
else
if (b < 0xFC)
{
des = b & 0x03;
len = 5;
}
else
{
des = b & 0x01;
len = 6;
}
size_t i = 1;
for (; i < len; ++i)
{
b = *(src++);
if (b < 0x80 || b > 0xBF) return 0; // the src is invalid
des = (des << 6) + (b & 0x3F);
}
return len;
}
/*
UTF-32 to UTF-16
*/
template
typename enable_if::value && check::value,
size_t>::type_t utf(T src, U* des)
{
if (src == 0) return 0;
if (src <= 0xFFFF)
{
if (des) (*des) = static_cast(src);
return 1;
}
else
if (src <= 0xEFFFF)
{
if (des)
{
des[0] = static_cast(0xD800 + (src >> 10) - 0x40); // high
des[1] = static_cast(0xDC00 + (src & 0x03FF)); // low
}
return 2;
}
return 0;
}
/*
UTF-16 to UTF-32
*/
template
typename enable_if::value && check::value,
size_t>::type_t utf(const T* src, U& des)
{
if (!src || (*src) == 0) return 0;
uint16 w1 = src[0];
if (w1 >= 0xD800 && w1 <= 0xDFFF)
{
if (w1 < 0xDC00)
{
uint16 w2 = src[1];
if (w2 >= 0xDC00 && w2 <= 0xDFFF)
{
des = (w2 & 0x03FF) + (((w1 & 0x03FF) + 0x40) << 10);
return 2;
}
}
return 0; // the src is invalid
}
else
{
des = w1;
return 1;
}
}
/*
UTF-16 to UTF-8
*/
template
typename enable_if::value && check::value,
size_t>::type_t utf(T src, U* des)
{
// make utf-16 to utf-32
uint32 tmp;
if (utf(&src, tmp) != 1) return 0;
// make utf-32 to utf-8
return utf(tmp, des);
}
/*
UTF-8 to UTF-16
*/
template
typename enable_if::value && check::value,
size_t>::type_t utf(const T* src, U& des)
{
// make utf-8 to utf-32
uint32 tmp;
size_t len = utf(src, tmp);
if (len == 0) return 0;
// make utf-32 to utf-16
if (utf(tmp, &des) != 1) return 0;
return len;
}
/*
UTF-X: string to string
*/
template
typename enable_if::value && (check::value || check::value),
size_t>::type_t utf(const T* src, U* des) // UTF-32 to UTF-X(8/16)
{
if (!src || (*src) == 0) return 0;
size_t num = 0;
for(; *src; ++src)
{
size_t len = utf(*src, des);
if (len == 0) break;
if (des) des += len;
num += len;
}
if (des) (*des) = 0;
return num;
}
template
typename enable_if<(check::value || check::value) && check::value,
size_t>::type_t utf(const T* src, U* des) // UTF-X(8/16) to UTF-32
{
if (!src || (*src) == 0) return 0;
size_t num = 0;
while(*src)
{
uint32 tmp;
size_t len = utf(src, tmp);
if (len == 0) break;
if (des)
{
(*des) = tmp;
++des;
}
src += len;
num += 1;
}
if (des) (*des) = 0;
return num;
}
template
typename enable_if<(check::value && check::value) ||
(check::value && check::value),
size_t>::type_t utf(const T* src, U* des) // UTF-X(8/16) to UTF-Y(16/8)
{
if (!src || (*src) == 0) return 0;
size_t num = 0;
while(*src)
{
// make utf-x to utf-32
uint32 tmp;
size_t len = utf(src, tmp);
if (len == 0) break;
src += len;
// make utf-32 to utf-y
len = utf(tmp, des);
if (len == 0) break;
if (des) des += len;
num += len;
}
if (des) (*des) = 0;
return num;
}
}
// Counting pass: with a null destination the call only reports how many
// wchar_t units the converted text needs (terminator not included).
const char* c = "你好世界";
size_t n = nx::transform::utf(c, static_cast<wchar_t*>(0));
完整代码请参考:
https://code.google.com/p/nixy/source/browse/trunk/nixycore/string/transform.h
更多内容请访问:http://darkc.at