Well, I don’t know if this is the best solution, but we can test the file against various CharsetDecoders and see if any of them reports no errors. Here is a class implementing this behaviour (note: the code below will open and read the file and test it against the decoder until EOF is reached – if an error occurs it proceeds to the next decoder, and so on – so if you specify a great number of charsets to be tested, or test large files, it will be slow):
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
|
/*
* Copyright 2010 Georgios Migdos .
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
/**
 * Guesses the character encoding of a file by trial decoding: each candidate
 * charset is used to decode the complete file, and the first candidate that
 * decodes it without a malformed-input or unmappable-character error wins.
 *
 * <p>The file is re-read from the start for every candidate, so the cost grows
 * with both the file size and the number of candidates.
 *
 * @author Georgios Migdos
 */
public class CharsetDetector {

    /** Size, in chars, of the scratch buffer that receives (and discards) decoded text. */
    private static final int BUFFER_SIZE = 4096;

    /**
     * Returns the first candidate charset that decodes the whole file cleanly.
     *
     * @param f the file to examine
     * @param charsets candidate charset names, tried in the given order
     * @return the first charset that decodes the entire file without error, or
     *         {@code null} if no candidate does or the file cannot be read
     * @throws java.nio.charset.IllegalCharsetNameException if a name is syntactically illegal
     * @throws java.nio.charset.UnsupportedCharsetException if a named charset is not available
     */
    public Charset detectCharset(File f, String[] charsets) {
        for (String charsetName : charsets) {
            Charset charset = detectCharset(f, Charset.forName(charsetName));
            if (charset != null) {
                return charset;
            }
        }
        return null;
    }

    /**
     * Checks whether the complete content of a file is valid in one charset.
     *
     * @param f the file to examine
     * @param charset the charset to test
     * @return {@code charset} if every byte up to EOF decodes without error
     *         (an empty file is valid in any charset); {@code null} otherwise,
     *         including when the file cannot be opened or read
     */
    private Charset detectCharset(File f, Charset charset) {
        try {
            FileInputStream fileInput = new FileInputStream(f);
            try {
                // Passing a CharsetDecoder (rather than a Charset) keeps the
                // decoder's default REPORT action, so invalid input raises a
                // CharacterCodingException instead of being silently replaced.
                // The reader also carries decoder state across reads, so a
                // multi-byte character split between two reads is handled.
                InputStreamReader reader = new InputStreamReader(
                        new BufferedInputStream(fileInput), charset.newDecoder());
                char[] sink = new char[BUFFER_SIZE];
                while (reader.read(sink) != -1) {
                    // Decoded text is discarded; only decoding errors matter.
                }
                return charset;
            } finally {
                // Closing the underlying stream releases the file handle even
                // when decoding failed part-way through.
                fileInput.close();
            }
        } catch (Exception ignored) {
            // Deliberate best-effort: a decoding error means "not this charset",
            // and any other failure (missing/unreadable file, charset without
            // decoding support) is likewise reported as "no match".
            return null;
        }
    }

    /**
     * Demo entry point: detects the charset of {@code example.txt} among a fixed
     * list of candidates and prints the decoded file content to standard output.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        File f = new File("example.txt");
        String[] charsetsToBeTested = {"UTF-8", "windows-1253", "ISO-8859-7"};

        CharsetDetector cd = new CharsetDetector();
        Charset charset = cd.detectCharset(f, charsetsToBeTested);
        if (charset == null) {
            System.out.println("Unrecognized charset.");
            return;
        }

        try {
            InputStreamReader reader = new InputStreamReader(new FileInputStream(f), charset);
            try {
                int c;
                while ((c = reader.read()) != -1) {
                    System.out.print((char) c);
                }
            } finally {
                // Release the file handle even if reading fails.
                reader.close();
            }
        } catch (IOException ioe) {
            // Also covers FileNotFoundException, which is an IOException.
            ioe.printStackTrace();
        }
    }
}
|