Java 识别文件编码

import java.io.*;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.util.Arrays;
import java.util.List;

/**
 * Guesses the character encoding of a file by trial decoding.
 *
 * <p>Each candidate charset is tried in a fixed order and the first one that
 * decodes the entire file without a malformed-input or unmappable-character
 * error is returned. This is a heuristic: a successful decode only proves the
 * bytes are <em>valid</em> in that charset, not that the file was actually
 * written in it, so the candidate order matters.
 */
public class CharsetDetector {
    /** Candidate charset names, tried in this order. */
    private static final String[] CHARSETS_TO_BE_TESTED = {"UTF-8", "GB18030", "UTF-16"};

    /** Number of bytes requested from the file per read. */
    private static final int BUFFER_SIZE = 1024;

    /**
     * Detects the charset of the given file.
     *
     * @param f the file to inspect
     * @return the first candidate charset that decodes the whole file cleanly,
     *         or {@code null} if {@code f} is {@code null}, cannot be read, or
     *         matches none of the candidates
     */
    public static Charset detectCharset(File f) {
        if (f == null) {
            return null;
        }

        for (String charsetName : CHARSETS_TO_BE_TESTED) {
            // Skip candidates this runtime does not provide instead of
            // aborting the whole detection with UnsupportedCharsetException.
            if (!Charset.isSupported(charsetName)) {
                continue;
            }
            Charset charset = detectCharset(f, Charset.forName(charsetName));
            if (charset != null) {
                return charset;
            }
        }

        return null;
    }

    /**
     * Checks whether the whole content of {@code f} is valid in {@code charset}.
     *
     * <p>The file is decoded as one continuous stream: bytes of a multi-byte
     * sequence that is split across two reads are carried over to the next
     * read, and only the bytes actually read are ever handed to the decoder.
     *
     * @param f       the file to inspect
     * @param charset the candidate charset
     * @return {@code charset} if the file decodes without error, otherwise
     *         {@code null} (also when the file cannot be opened or read)
     */
    private static Charset detectCharset(File f, Charset charset) {
        CharsetDecoder decoder = charset.newDecoder()
                .onMalformedInput(CodingErrorAction.REPORT)
                .onUnmappableCharacter(CodingErrorAction.REPORT);

        try (InputStream input = new BufferedInputStream(new FileInputStream(f))) {
            ByteBuffer pending = ByteBuffer.allocate(BUFFER_SIZE);
            // Decoded characters are not needed; this is only a sink for the decoder.
            CharBuffer scratch = CharBuffer.allocate(BUFFER_SIZE);

            int read;
            while ((read = input.read(pending.array(), pending.position(), pending.remaining())) != -1) {
                pending.position(pending.position() + read);
                pending.flip();
                if (!decodes(decoder, pending, scratch, false)) {
                    return null;
                }
                // Keep the undecoded tail (an incomplete multi-byte sequence)
                // at the front of the buffer for the next read.
                pending.compact();
            }

            // Signal end of input so that a truncated trailing sequence is
            // reported as malformed instead of being silently accepted.
            pending.flip();
            return decodes(decoder, pending, scratch, true) ? charset : null;
        } catch (IOException | SecurityException e) {
            // An unreadable file cannot be confirmed as any charset.
            return null;
        }
    }

    /**
     * Feeds the readable bytes of {@code in} to {@code decoder}.
     *
     * @param decoder    the decoder carrying state across calls
     * @param in         buffer in read mode holding the bytes to validate
     * @param out        scratch buffer whose content is discarded
     * @param endOfInput {@code true} if no further bytes will follow
     * @return {@code false} as soon as the decoder reports an error,
     *         {@code true} once all currently decodable bytes are consumed
     */
    private static boolean decodes(CharsetDecoder decoder, ByteBuffer in, CharBuffer out,
                                   boolean endOfInput) {
        while (true) {
            out.clear();
            CoderResult result = decoder.decode(in, out, endOfInput);
            if (result.isError()) {
                return false;
            }
            if (result.isUnderflow()) {
                if (!endOfInput) {
                    return true;
                }
                // Complete the decoding operation as the decoder protocol requires.
                out.clear();
                return !decoder.flush(out).isError();
            }
            // Overflow: the scratch buffer is full; clear it and keep decoding.
        }
    }

    /**
     * Prints the detected charset of each file.
     *
     * @param args paths of the files to inspect; when empty, a built-in set of
     *             sample paths is used
     * @throws IOException declared for compatibility with existing callers
     */
    public static void main(String[] args) throws IOException {
        List<File> files;
        if (args != null && args.length > 0) {
            File[] fromArgs = new File[args.length];
            for (int i = 0; i < args.length; i++) {
                fromArgs[i] = new File(args[i]);
            }
            files = Arrays.asList(fromArgs);
        } else {
            files = Arrays.asList(
                    new File("/home/leen/Downloads/192.167.72.156_Ubuntu16.txt"),
                    new File("/home/leen/Downloads/test.txt"),
                    new File("/home/leen/Downloads/test2.txt"));
        }

        for (File file : files) {
            Charset charset = CharsetDetector.detectCharset(file);

            if (charset != null) {
                System.out.println(String.format("file name = '%s', charset = '%s'", file.getName(), charset.name()));
            } else {
                System.out.println(String.format("file name = '%s', charset = 'unknown'", file.getName()));
            }
        }
    }
}

 

 

你可能感兴趣的:(积累分享,编码识别,encoding,java)