package example.encoding;

import java.io.UnsupportedEncodingException;

/**
 * Demonstrates how the characters of a string change when the bytes produced
 * by {@link String#getBytes()} are decoded again with a different encoding.
 */
public class GetCharTest {

    /**
     * Prints the default encoding and language system properties, then dumps
     * the sample text unchanged and after re-decoding it as ISO-8859-1, GBK
     * and UTF-8.
     *
     * @param args the command-line arguments (unused)
     */
    public static void main(String[] args) {
        String content = "中文";

        String defaultEncoding = System.getProperty("file.encoding");
        String defaultLanguage = System.getProperty("user.language");
        System.out.println("System default encoding --- " + defaultEncoding);
        System.out.println("System default language --- " + defaultLanguage);

        GetCharTest tester = new GetCharTest();
        tester.getCharWithDefaultEncoding(content);
        tester.getCharWithGivenEncoding(content, "ISO-8859-1");
        tester.getCharWithGivenEncoding(content, "GBK");
        tester.getCharWithGivenEncoding(content, "UTF-8");
    }

    /**
     * Prints the characters of the string exactly as given, without any
     * re-encoding step.
     *
     * @param content the text to dump
     */
    public void getCharWithDefaultEncoding(String content) {
        System.out.println("\nGet characters with default encoding\n");
        printCharArray(content);
    }

    /**
     * Encodes the string with the platform default charset, decodes the
     * resulting bytes with the given encoding, and prints the characters of
     * the decoded string.
     *
     * <p>If the encoding name is not supported, the stack trace is printed and
     * nothing else is dumped.
     *
     * @param content the text to re-decode
     * @param encoding the name of the charset used to decode the bytes
     */
    public void getCharWithGivenEncoding(String content, String encoding) {
        System.out.println("\nGet characters with given encoding : " + encoding + "\n");
        try {
            // getBytes() without arguments uses the platform default charset;
            // the byte array is handed to the decoder as-is, with no
            // conversion between the two encodings.
            String encodedString = new String(content.getBytes(), encoding);
            printCharArray(encodedString);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints one line per character showing the character itself, its value
     * narrowed to {@code byte} and to {@code short} (decimal and hexadecimal),
     * and its Unicode block, followed by the total number of characters.
     *
     * @param inStr the string whose characters are dumped
     */
    public void printCharArray(String inStr) {
        char[] charArray = inStr.toCharArray();
        for (int i = 0; i < charArray.length; i++) {
            char c = charArray[i];
            // Narrowing keeps only the low 8 / 16 bits of the char value.
            byte b = (byte) c;
            short s = (short) c;
            // Integer.toHexString receives the value widened back to int with
            // sign extension, so negative values print as FFFFFFxx.
            String hexB = Integer.toHexString(b).toUpperCase();
            String hexS = Integer.toHexString(s).toUpperCase();

            StringBuilder sb = new StringBuilder();
            // print char
            sb.append("char[").append(i).append("]='").append(c).append("'\t");
            // byte value
            sb.append("byte=").append(b).append(" \\u").append(hexB).append('\t');
            // short value
            sb.append("short=").append(s).append(" \\u").append(hexS).append('\t');
            // Unicode Block
            sb.append(Character.UnicodeBlock.of(c));

            System.out.println(sb.toString());
        }
        System.out.println("\nCharacters length: " + charArray.length);
    }
}
System default encoding --- GBK
System default language --- zh
Get characters with default encoding
char[0]='中' byte=45 \u2D short=20013 \u4E2D CJK_UNIFIED_IDEOGRAPHS
char[1]='文' byte=-121 \uFFFFFF87 short=25991 \u6587 CJK_UNIFIED_IDEOGRAPHS
Characters length: 2
Get characters with given encoding : ISO-8859-1
char[0]='?' byte=-42 \uFFFFFFD6 short=214 \uD6 LATIN_1_SUPPLEMENT
char[1]='?' byte=-48 \uFFFFFFD0 short=208 \uD0 LATIN_1_SUPPLEMENT
char[2]='?' byte=-50 \uFFFFFFCE short=206 \uCE LATIN_1_SUPPLEMENT
char[3]='?' byte=-60 \uFFFFFFC4 short=196 \uC4 LATIN_1_SUPPLEMENT
Characters length: 4
Get characters with given encoding : GBK
char[0]='中' byte=45 \u2D short=20013 \u4E2D CJK_UNIFIED_IDEOGRAPHS
char[1]='文' byte=-121 \uFFFFFF87 short=25991 \u6587 CJK_UNIFIED_IDEOGRAPHS
Characters length: 2
Get characters with given encoding : UTF-8
char[0]='?' byte=-3 \uFFFFFFFD short=-3 \uFFFFFFFD SPECIALS
char[1]='?' byte=-3 \uFFFFFFFD short=-3 \uFFFFFFFD SPECIALS
char[2]='?' byte=-3 \uFFFFFFFD short=-3 \uFFFFFFFD SPECIALS
char[3]='?' byte=-3 \uFFFFFFFD short=-3 \uFFFFFFFD SPECIALS
Characters length: 4
System default encoding --- Cp1252
System default language --- en
Get characters with default encoding
char[0]='?' byte=45 \u2D short=20013 \u4E2D CJK_UNIFIED_IDEOGRAPHS
char[1]='?' byte=-121 \uFFFFFF87 short=25991 \u6587 CJK_UNIFIED_IDEOGRAPHS
Characters length: 2
Get characters with given encoding : ISO-8859-1
char[0]='?' byte=63 \u3F short=63 \u3F BASIC_LATIN
char[1]='?' byte=63 \u3F short=63 \u3F BASIC_LATIN
Characters length: 2
Get characters with given encoding : GBK
char[0]='?' byte=63 \u3F short=63 \u3F BASIC_LATIN
char[1]='?' byte=63 \u3F short=63 \u3F BASIC_LATIN
Characters length: 2
Get characters with given encoding : UTF-8
char[0]='?' byte=63 \u3F short=63 \u3F BASIC_LATIN
char[1]='?' byte=63 \u3F short=63 \u3F BASIC_LATIN
Characters length: 2
String encodedString = new String(content.getBytes(), encoding);
char[] charArray = inStr.toCharArray();
byte[0] = -42 hex string = ffffffd6
byte[1] = -48 hex string = ffffffd0
byte[2] = -50 hex string = ffffffce
byte[3] = -60 hex string = ffffffc4
char[0]='中' --- byte[0] + byte[1]
char[1]='文' --- byte[2] + byte[3]
char[0]='?' ---- byte[0]
char[1]='?' ---- byte[1]
char[2]='?' ---- byte[2]
char[3]='?' ---- byte[3]
char[0]='?' ---- byte[0]
char[1]='?' ---- byte[1]
char[2]='?' ---- byte[2]
char[3]='?' ---- byte[3]
这个方法再次证明了String的getBytes()方法的危险性:当我们使用new String(str.getBytes(), encoding)对字符串进行重新编码解码时,一定要清楚str.getBytes()方法返回的字节数组的长度、内容到底是什么。因为在接下来使用新的encoding进行解码时,Java并不会自动地对字节数组进行扩展以适应新的encoding,而是按照新的编码方法直接对该字节数组进行解析。