JAVA: HOW TO AUTO-DETECT A FILE'S ENCODING


April 8, 2010 · by  George  · in  java programming ·  3 Comments  

Well, I don’t know if this is the best solution, but we can test the file against various CharsetDecoders and see if any of them reports no errors. Here is a class implementing this behaviour. Note: the code below opens and reads the file, testing it against a decoder until EOF is reached; if an error occurs, it proceeds to the next decoder, and so on. If you specify a large number of charsets to be tested, or test large files, it will be slow:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
/*
 *  Copyright 2010 Georgios Migdos .
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *  under the License.
 */
 
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
 
/**
 * Guesses the character encoding of a file by trial decoding.
 *
 * <p>Each candidate charset is tried in the order given; the first one whose
 * decoder consumes the <em>entire</em> file without reporting malformed or
 * unmappable input is returned. This is a heuristic: a charset whose decoder
 * accepts every possible byte sequence will always match, so candidates
 * should be listed from the strictest to the most permissive.
 *
 * <p>The file is re-read from the start for every candidate, so the cost grows
 * with both the file size and the number of charsets tested.
 *
 * @author Georgios Migdos
 */
public class CharsetDetector {

    /** Size, in bytes and chars, of the scratch buffers used while decoding. */
    private static final int BUFFER_SIZE = 512;

    /**
     * Returns the first of the given charsets that decodes the whole file
     * without errors.
     *
     * @param f        the file to examine
     * @param charsets candidate charset names, tried in order
     * @return the first matching charset, or {@code null} if none of the
     *         candidates decodes the file cleanly or the file cannot be read
     * @throws java.nio.charset.IllegalCharsetNameException if a candidate name is illegal
     * @throws java.nio.charset.UnsupportedCharsetException if a candidate is not supported by this JVM
     */
    public Charset detectCharset(File f, String[] charsets) {
        for (String charsetName : charsets) {
            Charset charset = detectCharset(f, Charset.forName(charsetName));
            if (charset != null) {
                return charset;
            }
        }
        return null;
    }

    /**
     * Tests a single charset against the file.
     *
     * @return {@code charset} if the whole file decodes cleanly (an empty file
     *         decodes cleanly under any charset), otherwise {@code null};
     *         {@code null} is also returned when the file is {@code null} or
     *         cannot be opened or read
     */
    private Charset detectCharset(File f, Charset charset) {
        if (f == null) {
            return null;
        }

        // REPORT makes the decoder signal bad input instead of silently
        // substituting a replacement character.
        CharsetDecoder decoder = charset.newDecoder()
                .onMalformedInput(CodingErrorAction.REPORT)
                .onUnmappableCharacter(CodingErrorAction.REPORT);

        try (InputStream input = new BufferedInputStream(new FileInputStream(f))) {
            return decodesCleanly(input, decoder) ? charset : null;
        } catch (IOException | SecurityException e) {
            // An unreadable file cannot be matched against any charset.
            return null;
        }
    }

    /**
     * Streams {@code input} through {@code decoder} until end of stream.
     *
     * <p>Bytes the decoder could not yet consume (an incomplete multi-byte
     * sequence at the end of a chunk) are carried over to the next read, so a
     * character split across two reads is not mistaken for malformed input.
     *
     * @return {@code true} if the decoder reported no error for the whole
     *         stream, {@code false} on the first malformed or unmappable input
     * @throws IOException if reading from {@code input} fails
     */
    private static boolean decodesCleanly(InputStream input, CharsetDecoder decoder)
            throws IOException {
        ByteBuffer in = ByteBuffer.allocate(BUFFER_SIZE);
        CharBuffer out = CharBuffer.allocate(BUFFER_SIZE);

        boolean endOfInput = false;
        while (!endOfInput) {
            int read = input.read(in.array(), in.position(), in.remaining());
            if (read == -1) {
                endOfInput = true;
            } else {
                in.position(in.position() + read);
            }

            in.flip();
            CoderResult result;
            do {
                // The decoded characters themselves are not needed; only the result is.
                out.clear();
                result = decoder.decode(in, out, endOfInput);
                if (result.isError()) {
                    return false;
                }
            } while (result.isOverflow());
            // Keep any trailing partial sequence for the next round.
            in.compact();
        }

        CoderResult flushed;
        do {
            out.clear();
            flushed = decoder.flush(out);
            if (flushed.isError()) {
                return false;
            }
        } while (flushed.isOverflow());

        return true;
    }

    /**
     * Demo: detects the charset of {@code example.txt} among a few candidates
     * and prints the decoded file contents to standard output.
     */
    public static void main(String[] args) {
        File f = new File("example.txt");

        String[] charsetsToBeTested = {"UTF-8", "windows-1253", "ISO-8859-7"};

        CharsetDetector cd = new CharsetDetector();
        Charset charset = cd.detectCharset(f, charsetsToBeTested);

        if (charset != null) {
            try (InputStreamReader reader = new InputStreamReader(new FileInputStream(f), charset)) {
                int c;
                while ((c = reader.read()) != -1) {
                    System.out.print((char) c);
                }
            } catch (IOException ioe) {
                // Also covers FileNotFoundException, which is an IOException.
                ioe.printStackTrace();
            }
        } else {
            System.out.println("Unrecognized charset.");
        }
    }
}

你可能感兴趣的:(编码,自动识别文件编码)