1 关于多字节与宽字符的解释
在中文Windows中,多字节(ANSI)内码为GBK(CP936),宽字符采用UTF-16;而在Linux中,宽字符采用UTF-32。因此VC在中文Windows系统下的wchar_t为2字节,GCC在Linux系统下的wchar_t为4字节。
2 代码中的中文字符编码
2.1 C++源码中的中文
2.2 Python源码中的中文
#-*- coding:utf8 -*-
"""Print the interpreter/console encodings and the UTF-8 and GBK bytes of a sample string."""
import sys
import os

# Default encoding used by str.encode()/bytes.decode() when none is given.
print(sys.getdefaultencoding())
# Encodings of the attached console streams (platform dependent).
print(sys.stdout.encoding)
print(sys.stdin.encoding)

# Two CJK characters, half-width "AB", then full-width A and B (U+FF21, U+FF22).
# The full-width letters are written as escapes so that they cannot be silently
# flattened to ASCII "AB" when the text is copied or normalized; they are what
# produce the 3-byte UTF-8 and 2-byte GBK sequences at the end of the output.
s = "中国AB\uff21\uff22"
print(s)
print(s.encode("utf8"))
print(s.encode("gbk"))
utf-8
cp936
cp936
中国ABＡＢ
b'\xe4\xb8\xad\xe5\x9b\xbdAB\xef\xbc\xa1\xef\xbc\xa2'
b'\xd6\xd0\xb9\xfaAB\xa3\xc1\xa3\xc2'
2.3 Java源码中的中文
类似于 Python
/**
 * Prints each code point of a sample string, followed by the string's bytes
 * in UTF-8, GBK and UTF-16, each section preceded by a separator line.
 *
 * <p>The sample consists of two CJK characters, half-width "AB" and
 * full-width A and B (U+FF21, U+FF22), so the dump shows one-, two- and
 * three-byte sequences side by side.
 */
public static void testSrcCode() {
    // The full-width letters are written as escapes so they cannot be
    // silently flattened to ASCII "AB" when the source is copied around.
    String s = "中国AB\uFF21\uFF22";

    // Advance by code point rather than by char, so a supplementary
    // character (surrogate pair) would be reported once with its real value.
    for (int i = 0; i < s.length(); ) {
        int codePoint = s.codePointAt(i);
        System.out.print(new String(Character.toChars(codePoint)));
        System.out.printf(" code point: 0x%1$x\n", codePoint);
        i += Character.charCount(codePoint);
    }

    printEncodedBytes("UTF8: ", s, "utf8");
    printEncodedBytes("GBK: ", s, "gbk");
    printEncodedBytes("UTF16: ", s, "utf16");
}

/**
 * Prints a separator line, then {@code label} followed by the bytes of
 * {@code text} in the named charset, each formatted as {@code 0x..}.
 * An unsupported charset name is reported via a stack trace and otherwise
 * ignored, so the remaining sections are still printed.
 */
private static void printEncodedBytes(String label, String text, String charsetName) {
    System.out.println("-----------------------------------");
    try {
        System.out.print(label);
        byte[] encoded = text.getBytes(charsetName);
        for (byte b : encoded) {
            // %x on a boxed Byte prints negative values as unsigned (0x80..0xff).
            System.out.printf("0x%1$x ", b);
        }
        System.out.println();
    } catch (UnsupportedEncodingException e) {
        e.printStackTrace();
    }
}
中 code point: 0x4e2d
国 code point: 0x56fd
A code point: 0x41
B code point: 0x42
Ａ code point: 0xff21
Ｂ code point: 0xff22
-----------------------------------
UTF8: 0xe4 0xb8 0xad 0xe5 0x9b 0xbd 0x41 0x42 0xef 0xbc 0xa1 0xef 0xbc 0xa2
-----------------------------------
GBK: 0xd6 0xd0 0xb9 0xfa 0x41 0x42 0xa3 0xc1 0xa3 0xc2
-----------------------------------
UTF16: 0xfe 0xff 0x4e 0x2d 0x56 0xfd 0x0 0x41 0x0 0x42 0xff 0x21 0xff 0x22
2.4 BOM
文本文件开头用于标识编码方式及字节序的标记,称为BOM(Byte Order Mark)。存储的时候可以指定为NO BOM,即去掉此文件头。
| UTF编码 | BOM |
|---|---|
| UTF-8 | EF BB BF |
| UTF-16LE | FF FE |
| UTF-16BE | FE FF |
| UTF-32LE | FF FE 00 00 |
| UTF-32BE | 00 00 FE FF |
3 中文编码历史
在中文编码的历史上,在GB出现以前,已经存在一些汉字内码表示。所谓内码,即机内编码汉字的方法。为了便于不同内码系统之间的信息交换,国家制定了交换码。
后续1995年又制定了GBK(与CP936稍有差异)标准,为两字节编码,兼容GB2312,收集了2W多个汉字。同时也将ASCII字母及标点符号进行了编码,这样导致这些符号是两字节的,在显示的时候就是所谓的全角符号,直观看起来就是比较大。而英文模式下的字母和符号即为半角。
在Unicode 3.1标准出现之后,国家又制定了GB18030,其继续兼容GBK,并采用变长编码,将所有Unicode字符均映射了一遍。和UTF-8一样,采用了多字节编码。
4 中文显示原理
Charset AutoDetection (InputText)
{
    if (all characters in InputText are ASCII)
    {
        if InputText contains ESC or "~{"
        {
            call ISO-2022 and HZ detector with InputText;
            if one of them succeed, return that charset, otherwise return ASCII;
        }
        else
            return ASCII;
    }
    else if (InputText start with BOM)
    {
        return UCS2;
    }
    else
    {
        Call all multi-byte detectors and single-byte detectors;
        Return the one with best confidence;
    }
}
5.4 字符探测实践
import java.io.*; import java.net.*; import info.monitorenter.cpdetector.io.*; class ChardetWrapper { // Create the proxy: CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance(); // A singleton. // constructor: public ChardetWrapper() { // Add the implementations of // info.monitorenter.cpdetector.io.ICodepageDetector: // This one is quick if we deal with unicode codepages: detector.add(new ByteOrderMarkDetector()); // The first instance delegated to tries to detect the meta charset // attribut in html pages. detector.add(new ParsingDetector(false)); // be verbose about parsing. // This one does the tricks of exclusion and frequency detection, if // first implementation is // unsuccessful: detector.add(JChardetFacade.getInstance()); // Another singleton. detector.add(ASCIIDetector.getInstance()); // Fallback, see javadoc. } public boolean parseUrl(String url) throws MalformedURLException, IOException { boolean ret = false; // Work with the configured proxy: java.nio.charset.Charset charset = null; charset = detector.detectCodepage(new URL(url)); if(charset == null) { System.out.println("bogus document"); } else { System.out.println(charset.toString()); // Open the document in the given code page: // Read from it, do sth., whatever you desire. The character are now - hopefully - correct.. ret = true; } return ret; } public boolean parseFile(String file) throws MalformedURLException, IOException { boolean ret = false; // Work with the configured proxy: java.nio.charset.Charset charset = null; FileInputStream fis = new FileInputStream(file); BufferedInputStream bis = new BufferedInputStream(fis); charset = detector.detectCodepage(bis,10240000); if(charset == null) { System.out.println("bogus document"); } else { System.out.println(charset.toString()); // Open the document in the given code page: // Read from it, do sth., whatever you desire. The character are now - hopefully - correct.. 
ret = true; } return ret; } } public class CpdetTest { public static void main(String[] args) { ChardetWrapper u = new ChardetWrapper(); String url = "http://www.sina.com"; //String path = "C:/字幕/007:来自俄罗斯的爱情.ass"; try { //u.parseFile(path); u.parseUrl(url); } catch (Exception e) { e.printStackTrace(); } } }也可以直接使用jchardec来进行探测,其要求继承ChardetObserver接口,并实现notify方法,库会在探测成功后通过notify方法将结果传出来。
import java.io.*; import java.net.*; import java.util.*; import org.mozilla.intl.chardet.*; class myChardecObserver implements nsICharsetDetectionObserver { public void Notify(String charset) { CharsetDetectorTest.found = true; System.out.println("CHARSET = " + charset); } } public class CharsetDetectorTest { public static boolean found = false; public static void main(String argv[]) throws Exception { // Initalize the nsDetector() ; int lang = (argv.length == 2) ? Integer.parseInt(argv[1]): nsPSMDetector.ALL; nsDetector detector = new nsDetector(nsPSMDetector.ALL); // Set an observer... // The Notify() will be called when a matching charset is found. detector.Init(new myChardecObserver()); URL url = new URL("http://www.sina.com//"); BufferedInputStream bis = new BufferedInputStream(url.openStream()); //FileInputStream fis = new FileInputStream("C:/字幕/007:来自俄罗斯的爱情.ass"); //BufferedInputStream bis = new BufferedInputStream(fis); byte[] buf = new byte[512]; int len; boolean done = false; boolean isAscii = true; while ((len = bis.read(buf, 0, buf.length)) != -1 ) { //System.out.printf("Read byte %d\n",len); // Check if the stream is only ascii. if (isAscii) isAscii = detector.isAscii(buf, len); // DoIt if non-ascii and not done yet. if (!isAscii && !done) done = detector.DoIt(buf, len, false); } detector.DataEnd(); if (isAscii) { System.out.println("CHARSET = ASCII"); found = true; } if (!found) { String prob[] = detector.getProbableCharsets(); for (int i = 0; i < prob.length; i++) { System.out.println("Probable Charset = " + prob[i]); } } } }