C++中string / char* ,wstring / wchar_t*
Windows 下
char* cName = "北京市"; // 多字节转化成宽字符字符串! unsigned short wsName[50] = {0}; int wideCharCount = MultiByteToWideChar(CP_ACP, 0, (LPSTR)cName, -1, NULL, 0) - 1; MultiByteToWideChar(CP_ACP, 0, (LPSTR)cName, -1, (LPWSTR)wsName, wideCharCount + 1); for (int i=0; i<wideCharCount; i++) { printf("%d ", wsName[i]); }printf("\n");输出
21271 20140 24066
Linux 下
#include <stdlib.h> #include <stdio.h> #include <string.h> #include <locale.h> #include <iostream> #include <string> using namespace std; void multibyte_to_widechar_test(); void read_file(const char* fname); void dump_uchar(unsigned char ch); int main() { multibyte_to_widechar_test(); read_file("chs"); printf("any key pressed to exit...\n"); getchar(); return 0; } void multibyte_to_widechar_test() { typedef string str_t; str_t cur_loc = setlocale(LC_ALL, NULL); printf("cur_locale = %s\n", cur_loc.c_str()); setlocale(LC_ALL, "zh_CN.GBK"); char mb_buf[100]; strcpy(mb_buf, "北京市"); int mbstr_len = strlen(mb_buf); wchar_t* wcstr = NULL; int wcstr_len = mbstowcs(wcstr, mb_buf, 0) + 1; printf("mb_len = %d, wc_len = %d\n", mbstr_len, wcstr_len); wcstr = new wchar_t[wcstr_len]; int ret = mbstowcs(wcstr, mb_buf, mbstr_len); if (ret <= 0) { printf("转化失败\n"); } else { printf("转化成功\n"); // wsprintf(L"%ls\n", wcstr); printf("view1 =====\n"); for (int i=0; i<wcstr_len - 1; i++) { int code = (int)wcstr[i]; printf("%d\t", code); } printf("\n"); printf("view2 =====\n"); for (int i=0; i<wcstr_len - 1; i++) { int code = (int)wcstr[i]; dump_uchar( (unsigned char)(code/256) ); dump_uchar( (unsigned char)(code%256) ); } printf("\n"); } setlocale(LC_ALL, cur_loc.c_str()); } void dump_uchar(unsigned char ch) { const char* str = "0123456789abcdef"; printf("0x%c%c\t", str[ch/16], str[ch%16]); } void read_file(const char* fname) { FILE* fp = fopen(fname, "r"); if (!fp) { return; } printf("===============\n"); char buffer[100] = {0}; fgets(buffer, 100, fp); printf("%s", buffer); printf("view1 =========== \n"); int len = strlen(buffer) - 1; for (int i=0; i<len; i++) { dump_uchar((unsigned char)buffer[i]); }printf("\n"); printf("view2 =========== \n"); for (int i=0; i<len; i+=2) { unsigned char down = (unsigned char)buffer[i]; unsigned char high = (unsigned char)buffer[i+1]; printf("%d ", (high<<8)|down); } printf("\n"); fclose(fp); }multibyte_to_widechar_test函数将多字节编码转化成unicode编码。然后输出unicode串内容。
系统环境中设置了 export LC_ALL="zh_CN.GBK",所以 chs 文件的编码默认是 GBK。
root@h10-xx-xx-xx:~/peteryfren/cpp/encode_app> ./app_test
cur_locale = C
mb_len = 6, wc_len = 4
转化成功
view1 =====
21271 20140 24066
view2 =====
0x53 0x17 0x4e 0xac 0x5e 0x02
===============
北京市
view1 ===========
0xb1 0xb1 0xbe 0xa9 0xca 0xd0
view2 ===========
45489 43454 53450
any key pressed to exit...
“北京市”的 Unicode 编码值与 Windows 上的输出一致。“北京市”的 GBK(GB2312)编码字节为 0xB1B1、0xBEA9、0xCAD0;view2 把每两个字节按小端序(第二个字节作为高 8 位)拼成一个 16 位整数,因此输出为 45489、43454、53450。
iconv -f UTF-8 -t GBK test.txt -o pp.txt
>>> s = u'北京市' >>> s u'\u5317\u4eac\u5e02' >>> gbks = '北京市' >>> gbks '\xb1\xb1\xbe\xa9\xca\xd0' >>> s.encode('utf-8') '\xe5\x8c\x97\xe4\xba\xac\xe5\xb8\x82'
1. http://blog.csdn.net/xiaobai1593/article/details/7063535
2. GBK/GB2312 编码表参见:http://ff.163.com/newflyff/gbk-list/
3. Unicode 编码表参见:http://jlqzs.blog.163.com/blog/static/2125298320070101826277/