Java读带有BOM的UTF-8文件乱码

用Java写出的UTF-8文件,再用Java读取时一切正常;但如果用记事本把相同的内容以UTF-8格式保存,记事本会在文件开头写入BOM(字节为EF BB BF),程序读取时就会在文件开头多读出一个不可见字符(U+FEFF),打印出来表现为多了一个“?”。

实例:

用记事本新建一个内容为“测试BOM”的txt文本,并另存为UTF-8编码。

处理带BOM的类UnicodeReader

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
package com.java.io;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.io.Reader;
/**
version: 1.1 / 2007-01-25
- changed BOM recognition ordering (longer boms first)
Source: http://koti.mbnet.fi/akini/java/unicodereader/UnicodeReader.java.txt
Original pseudocode   : Thomas Weidenfeller
Implementation tweaked: Aki Nieminen
http://www.unicode.org/unicode/faq/utf_bom.html
BOMs:
00 00 FE FF    = UTF-32, big-endian
FF FE 00 00    = UTF-32, little-endian
EF BB BF       = UTF-8,
FE FF          = UTF-16, big-endian
FF FE          = UTF-16, little-endian
Win2k Notepad:
Unicode format = UTF-16LE
***/
/**
 * Generic Unicode text reader that looks for a byte order mark (BOM) at the
 * start of a stream and uses it to pick the decoding charset.
 *
 * <p>Recognized marks, tested longest first:
 * <pre>
 * 00 00 FE FF = UTF-32BE
 * FF FE 00 00 = UTF-32LE
 * EF BB BF    = UTF-8
 * FE FF       = UTF-16BE
 * FF FE       = UTF-16LE
 * </pre>
 * The BOM bytes themselves are skipped; every other byte that was read ahead
 * is pushed back so the decoder sees it. If no BOM is found, the default
 * encoding given to the constructor is used, or the platform default when
 * that is {@code null}.
 */
public class UnicodeReader extends Reader {
    /** Raw input wrapped so that look-ahead bytes can be pushed back. */
    PushbackInputStream internalIn;
    /** Decoding reader; stays {@code null} until {@link #init()} has run. */
    InputStreamReader   internalIn2 = null;
    /** Encoding used when the stream carries no BOM; may be {@code null}. */
    String              defaultEnc;

    /** Length in bytes of the longest BOM that is recognized. */
    private static final int BOM_SIZE = 4;

    /**
     * Creates a reader over the given stream. Nothing is read until the
     * first call to {@link #init()} or {@link #read(char[], int, int)}.
     *
     * @param in         input stream to be read
     * @param defaultEnc default encoding if the stream does not have a BOM
     *                   marker; give {@code null} to use the system-level default
     */
    public UnicodeReader(InputStream in, String defaultEnc) {
        internalIn = new PushbackInputStream(in, BOM_SIZE);
        this.defaultEnc = defaultEnc;
    }

    /**
     * @return the default encoding passed to the constructor (may be {@code null})
     */
    public String getDefaultEncoding() {
        return defaultEnc;
    }

    /**
     * Returns the encoding name reported by the underlying
     * {@link InputStreamReader}, or {@code null} if the stream has not been
     * initialized yet. Call {@link #init()} or a read method to initialize it.
     *
     * @return encoding name, or {@code null} before initialization
     */
    public String getEncoding() {
        if (internalIn2 == null) {
            return null;
        }
        return internalIn2.getEncoding();
    }

    /**
     * Reads ahead up to four bytes and checks them for a BOM. Only the BOM
     * bytes are skipped; any extra bytes are unread back to the stream.
     * Subsequent calls are no-ops.
     *
     * @throws IOException if reading from or pushing back to the stream fails,
     *                     or if the selected encoding is not supported
     */
    protected void init() throws IOException {
        if (internalIn2 != null) {
            return;
        }

        // A single read() may legally return fewer bytes than requested, so
        // keep reading until the look-ahead buffer is full or the stream ends.
        byte[] bom = new byte[BOM_SIZE];
        int n = 0;
        while (n < bom.length) {
            int count = internalIn.read(bom, n, bom.length - n);
            if (count <= 0) {
                break;
            }
            n += count;
        }

        // Each check also requires that enough bytes were actually read;
        // otherwise the zero-filled tail of the buffer could fake a match
        // (e.g. a two-byte "FF FE" input looking like the UTF-32LE mark).
        String encoding;
        int bomLength;
        if (hasPrefix(bom, n, 0x00, 0x00, 0xFE, 0xFF)) {
            encoding = "UTF-32BE";
            bomLength = 4;
        } else if (hasPrefix(bom, n, 0xFF, 0xFE, 0x00, 0x00)) {
            encoding = "UTF-32LE";
            bomLength = 4;
        } else if (hasPrefix(bom, n, 0xEF, 0xBB, 0xBF)) {
            encoding = "UTF-8";
            bomLength = 3;
        } else if (hasPrefix(bom, n, 0xFE, 0xFF)) {
            encoding = "UTF-16BE";
            bomLength = 2;
        } else if (hasPrefix(bom, n, 0xFF, 0xFE)) {
            encoding = "UTF-16LE";
            bomLength = 2;
        } else {
            // Unicode BOM mark not found, unread all bytes
            encoding = defaultEnc;
            bomLength = 0;
        }

        // Give back everything that was read past the BOM.
        int unread = n - bomLength;
        if (unread > 0) {
            internalIn.unread(bom, bomLength, unread);
        }

        // Use the detected/given encoding, or the platform default if none.
        if (encoding == null) {
            internalIn2 = new InputStreamReader(internalIn);
        } else {
            internalIn2 = new InputStreamReader(internalIn, encoding);
        }
    }

    /**
     * Closes the reader and the underlying stream. If nothing has been read
     * yet, the stream is closed directly without probing it for a BOM.
     *
     * @throws IOException if closing the underlying stream fails
     */
    @Override
    public void close() throws IOException {
        if (internalIn2 != null) {
            internalIn2.close();
        } else {
            internalIn.close();
        }
    }

    /**
     * Reads decoded characters into a portion of an array, initializing the
     * BOM detection on first use.
     *
     * @param cbuf destination buffer
     * @param off  offset at which to start storing characters
     * @param len  maximum number of characters to read
     * @return number of characters read, or -1 at end of stream
     * @throws IOException if an I/O error occurs
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        init();
        return internalIn2.read(cbuf, off, len);
    }

    /**
     * Tells whether the first {@code length} valid bytes of {@code buffer}
     * start with the given byte values.
     */
    private static boolean hasPrefix(byte[] buffer, int length, int... prefix) {
        if (length < prefix.length) {
            return false;
        }
        for (int i = 0; i < prefix.length; i++) {
            if (buffer[i] != (byte) prefix[i]) {
                return false;
            }
        }
        return true;
    }
}

测试类


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
package com.java.io;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
public class BomRead {
/**
* 读带有BOM的UTF-8文件乱码
* @param args
*/
public static void main(String[] args) throws Exception {
File file  = new File( "E:\\JS_Exercise\\JavaExercise\\BOM.txt" );
FileInputStream in = new FileInputStream(file);
BufferedReader br = new BufferedReader( new InputStreamReader(in, "UTF-8" ));
String line = null ;
System.out.println( "处理前:" );
while ((line = br.readLine()) != null ){
System.out.println(line);
}
File file2  = new File( "E:\\JS_Exercise\\JavaExercise\\BOM.txt" );
FileInputStream in2 = new FileInputStream(file2);
BufferedReader br2 = new BufferedReader( new UnicodeReader(in2, "UTF-8" ));
String line2 = null ;
System.out.println( "处理后:" );
while ((line2 = br2.readLine()) != null ){
System.out.println(line2);
}
}
}

输出结果


处理前:

?测试BOM

处理后:

测试BOM


另一种解决方式

从目前来看,JDK 1.6只是解决了读取带有BOM的文件失败的问题,还是不能区别处理有BOM和无BOM的UTF-8编码文件。从Bug ID:4508058里的描述可以看出,这个问题将作为一个不会修改的问题关闭,对于BOM编码的识别将由应用程序自己来处理,原因可从另外一个bug处查看到:因为Unicode对于BOM的编码规定可能发生变化。也就是说,对于一个UTF-8的文件,应用程序需要知道这个文件有没有写BOM,然后自己决定处理BOM的方式。

因此在遇到此问题的时候可以特殊问题特殊处理。

你可能感兴趣的:(java,记事本)