通过 Java 写出的 UTF-8 文件,使用 Java 可以正确地读取;但是如果用记事本将相同的内容以 UTF-8 格式保存,则程序读取时会从文件开头多读出一个不可见字符(即 BOM,U+FEFF),输出到控制台时通常显示为一个“?”。
实例:
新建一个内容为“测试BOM”的 txt 文本文件,用记事本另存为 UTF-8 编码(记事本会在文件开头写入 BOM)。
处理带BOM的类UnicodeReader
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
|
package
com.java.io;
import
java.io.IOException;
import
java.io.InputStream;
import
java.io.InputStreamReader;
import
java.io.PushbackInputStream;
import
java.io.Reader;
/**
version: 1.1 / 2007-01-25
- changed BOM recognition ordering (longer boms first)
网络地址:http://koti.mbnet.fi/akini/java/unicodereader/UnicodeReader.java.txt
Original pseudocode : Thomas Weidenfeller
Implementation tweaked: Aki Nieminen
http://www.unicode.org/unicode/faq/utf_bom.html
BOMs:
00 00 FE FF = UTF-32, big-endian
FF FE 00 00 = UTF-32, little-endian
EF BB BF = UTF-8,
FE FF = UTF-16, big-endian
FF FE = UTF-16, little-endian
Win2k Notepad:
Unicode format = UTF-16LE
***/
/**
 * Generic Unicode text reader that inspects the first bytes of a stream for a
 * byte order mark (BOM) and uses it to select the character encoding. If no
 * BOM is found, the given default encoding (or the platform default when the
 * default is {@code null}) is used. BOM bytes are consumed; every other byte
 * is handed to the decoder untouched.
 *
 * <p>Detection is lazy: it happens on the first call to {@link #init()} or
 * {@link #read(char[], int, int)}.
 */
public class UnicodeReader extends Reader {

    /** Raw input, wrapped so that non-BOM bytes read during detection can be pushed back. */
    PushbackInputStream internalIn;

    /** Decoding reader; stays {@code null} until {@link #init()} has run. */
    InputStreamReader internalIn2 = null;

    /** Encoding to use when the stream has no BOM; {@code null} means platform default. */
    String defaultEnc;

    /** Length in bytes of the longest recognised BOM (UTF-32). */
    private static final int BOM_SIZE = 4;

    /**
     * Creates a reader over the given byte stream.
     *
     * @param in         input stream to be read
     * @param defaultEnc default encoding if the stream does not have a BOM
     *                   marker; pass {@code null} to use the system-level default
     */
    public UnicodeReader(InputStream in, String defaultEnc) {
        this.internalIn = new PushbackInputStream(in, BOM_SIZE);
        this.defaultEnc = defaultEnc;
    }

    /**
     * Returns the default encoding supplied to the constructor.
     *
     * @return the default encoding name, possibly {@code null}
     */
    public String getDefaultEncoding() {
        return defaultEnc;
    }

    /**
     * Returns the encoding reported by the underlying {@link InputStreamReader},
     * or {@code null} if the stream is uninitialized. Call {@link #init()} or
     * {@code read()} to initialize it.
     *
     * @return the encoding name, or {@code null} before initialization
     */
    public String getEncoding() {
        if (internalIn2 == null) {
            return null;
        }
        return internalIn2.getEncoding();
    }

    /**
     * Reads ahead up to four bytes and checks them for a BOM. Bytes that are
     * not part of a BOM are pushed back onto the stream; only the BOM itself
     * is skipped. Does nothing if detection has already happened.
     *
     * @throws IOException if reading from or pushing back to the stream fails,
     *                     or if the selected encoding is not supported
     */
    protected void init() throws IOException {
        if (internalIn2 != null) {
            return;
        }

        // A single read() may return fewer bytes than requested, so keep
        // reading until the look-ahead buffer is full or the stream ends.
        byte[] bom = new byte[BOM_SIZE];
        int n = 0;
        while (n < bom.length) {
            int count = internalIn.read(bom, n, bom.length - n);
            if (count < 0) {
                break;
            }
            n += count;
        }

        // Longer signatures are tested first because the UTF-32LE BOM starts
        // with the UTF-16LE BOM. Each test is bounded by the number of bytes
        // actually read, so unfilled (zero) buffer slots can never match.
        String encoding;
        int bomLength;
        if (startsWith(bom, n, 0x00, 0x00, 0xFE, 0xFF)) {
            encoding = "UTF-32BE";
            bomLength = 4;
        } else if (startsWith(bom, n, 0xFF, 0xFE, 0x00, 0x00)) {
            encoding = "UTF-32LE";
            bomLength = 4;
        } else if (startsWith(bom, n, 0xEF, 0xBB, 0xBF)) {
            encoding = "UTF-8";
            bomLength = 3;
        } else if (startsWith(bom, n, 0xFE, 0xFF)) {
            encoding = "UTF-16BE";
            bomLength = 2;
        } else if (startsWith(bom, n, 0xFF, 0xFE)) {
            encoding = "UTF-16LE";
            bomLength = 2;
        } else {
            // Unicode BOM mark not found, unread all bytes
            encoding = defaultEnc;
            bomLength = 0;
        }

        int unread = n - bomLength;
        if (unread > 0) {
            internalIn.unread(bom, bomLength, unread);
        }

        // Use given encoding
        if (encoding == null) {
            internalIn2 = new InputStreamReader(internalIn);
        } else {
            internalIn2 = new InputStreamReader(internalIn, encoding);
        }
    }

    /** Returns true if the first {@code available} bytes of {@code buf} begin with {@code signature}. */
    private static boolean startsWith(byte[] buf, int available, int... signature) {
        if (available < signature.length) {
            return false;
        }
        for (int i = 0; i < signature.length; i++) {
            if (buf[i] != (byte) signature[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Closes the reader and the underlying stream. Closing never triggers BOM
     * detection: if nothing has been read yet, the raw stream is closed directly.
     *
     * @throws IOException if closing the underlying stream fails
     */
    @Override
    public void close() throws IOException {
        if (internalIn2 != null) {
            internalIn2.close();
        } else {
            internalIn.close();
        }
    }

    /**
     * Reads decoded characters into a portion of an array, detecting the
     * encoding first if that has not happened yet.
     *
     * @param cbuf destination buffer
     * @param off  offset at which to start storing characters
     * @param len  maximum number of characters to read
     * @return the number of characters read, or -1 at end of stream
     * @throws IOException if an I/O error occurs
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        init();
        return internalIn2.read(cbuf, off, len);
    }
}
|
测试类
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
|
package
com.java.io;
import
java.io.BufferedReader;
import
java.io.File;
import
java.io.FileInputStream;
import
java.io.InputStreamReader;
import
java.nio.charset.Charset;
/**
 * Demo that prints the same file twice: once decoded with a plain UTF-8
 * {@link InputStreamReader} and once through {@code UnicodeReader}, so the
 * two outputs can be compared for a file that starts with a BOM.
 */
public class BomRead {

    /** File read when no path is given on the command line. */
    private static final String DEFAULT_PATH = "E:\\JS_Exercise\\JavaExercise\\BOM.txt";

    /**
     * Prints the file's lines before and after BOM handling.
     *
     * @param args optional; {@code args[0]} overrides the default file path
     * @throws Exception if the file cannot be opened or read
     */
    public static void main(String[] args) throws Exception {
        File file = new File(args.length > 0 ? args[0] : DEFAULT_PATH);

        // Plain UTF-8 decoding.
        BufferedReader plain = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), "UTF-8"));
        try {
            System.out.println("处理前:");
            printLines(plain);
        } finally {
            // Closing the BufferedReader also closes the wrapped file stream.
            plain.close();
        }

        // Same file, decoded through UnicodeReader.
        BufferedReader bomAware = new BufferedReader(
                new UnicodeReader(new FileInputStream(file), "UTF-8"));
        try {
            System.out.println("处理后:");
            printLines(bomAware);
        } finally {
            bomAware.close();
        }
    }

    /** Writes every remaining line of the reader to standard output. */
    private static void printLines(BufferedReader reader) throws java.io.IOException {
        String line;
        while ((line = reader.readLine()) != null) {
            System.out.println(line);
        }
    }
}
|
输出结果
处理前:
?测试BOM
处理后:
测试BOM
另一种解决方式
从目前来看,JDK 1.6 只是解决了读取带有 BOM 的文件失败的问题,仍然不能区别处理有 BOM 和无 BOM 的 UTF-8 编码文件。从 Bug ID: 4508058 的描述可以看出,这个问题将作为“不予修复”的问题关闭,BOM 的识别交由应用程序自己处理;原因可以在另外一个相关 bug 的说明中查到:Unicode 对 BOM 的规定可能发生变化。也就是说,对于一个 UTF-8 文件,应用程序需要自己知道该文件是否写有 BOM,然后自行决定处理 BOM 的方式。
因此在遇到此问题的时候可以特殊问题特殊处理。