Java 读取文件并判断其编码格式

读取文件,判断文件编码格式

FileInputStream 本身不支持 mark、reset 功能，需要用 BufferedInputStream 包装之后才能使用 mark、reset。

package sample.test.name;

import df.util.Util;
import df.util.type.StringUtil;
import df.util.type.SysLog;

import java.io.*;
import java.util.Arrays;

/**
 * Demo program: reads every {@code .txt} file in a fixed directory, guesses its
 * encoding from a leading byte-order mark (BOM) and logs the decoded content.
 *
 * <p>Recognised BOMs are UTF-8 ({@code EF BB BF}), UTF-16LE ({@code FF FE}) and
 * UTF-16BE ({@code FE FF}). A file without one of these is decoded with
 * {@link #DEFAULT_CHARSET}.
 *
 * <p>Created by andrew on 2015/7/12.
 */
public class NameApp {
    private static final String TAG = Util.toTAG(NameApp.class);

    /** Charset assumed when the file does not start with a recognised BOM. */
    private static final String DEFAULT_CHARSET = "gbk";

    /** Length in bytes of the longest BOM we look for (UTF-8). */
    private static final int MAX_BOM_LENGTH = 3;

    /** Byte-order marks this program can recognise, with the charset each one implies. */
    private enum Bom {
        UTF_8("UTF-8", (byte) 0xEF, (byte) 0xBB, (byte) 0xBF),
        UTF_16LE("UTF-16LE", (byte) 0xFF, (byte) 0xFE),
        UTF_16BE("UTF-16BE", (byte) 0xFE, (byte) 0xFF);

        private final String charset;
        private final byte[] signature;

        Bom(String charset, byte... signature) {
            this.charset = charset;
            this.signature = signature;
        }

        /** Returns true when the first {@code len} bytes of {@code head} start with this BOM. */
        private boolean matches(byte[] head, int len) {
            if (len < signature.length) {
                return false;
            }
            for (int i = 0; i < signature.length; i++) {
                if (head[i] != signature[i]) {
                    return false;
                }
            }
            return true;
        }

        /** Returns the BOM found at the start of {@code head}, or {@code null} if there is none. */
        static Bom detect(byte[] head, int len) {
            for (Bom candidate : values()) {
                if (candidate.matches(head, len)) {
                    return candidate;
                }
            }
            return null;
        }
    }

    /**
     * Reads into {@code target} until it is full or the stream ends.
     *
     * @return the number of bytes actually read (0 for an empty stream)
     * @throws IOException if the underlying read fails
     */
    private static int readUpTo(InputStream in, byte[] target) throws IOException {
        int total = 0;
        while (total < target.length) {
            int n = in.read(target, total, target.length - total);
            if (n < 0) {
                break;
            }
            total += n;
        }
        return total;
    }

    /**
     * Entry point. Scans the hard-coded directory and logs the decoded content of
     * each {@code .txt} file found there.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        File dir = new File("C:\\Users\\andrew\\name");
        if (!dir.isDirectory()) {
            return;
        }

        File[] files = dir.listFiles(new FileFilter() {
            @Override
            public boolean accept(File pathname) {
                return pathname.isFile()
                        && pathname.getName().toLowerCase().endsWith(".txt");
            }
        });
        // listFiles returns null (not an empty array) when the directory cannot be read.
        if (files == null) {
            return;
        }

        for (File f : files) {
            InputStream reader = null;
            try {
                // FileInputStream does not support mark/reset; the buffered wrapper does.
                reader = new BufferedInputStream(new FileInputStream(f));
                boolean isSupport = reader.markSupported();
                SysLog.v(TAG, " IS SUPPORT = ", isSupport);

                // Peek at the leading bytes, then rewind. The read limit must cover
                // every byte read before reset() for the mark to stay valid.
                reader.mark(MAX_BOM_LENGTH);
                byte[] head = new byte[MAX_BOM_LENGTH];
                int headLen = readUpTo(reader, head);
                reader.reset();

                String charset = DEFAULT_CHARSET;
                Bom bom = Bom.detect(head, headLen);
                if (bom != null) {
                    charset = bom.charset;
                    // Skip exactly the BOM bytes (2 for UTF-16, 3 for UTF-8) so no
                    // content byte is lost and UTF-16 code units stay aligned.
                    readUpTo(reader, new byte[bom.signature.length]);
                }

                // Collect all remaining bytes first and decode once: decoding each
                // chunk separately would split multi-byte characters that straddle
                // a chunk boundary.
                ByteArrayOutputStream content = new ByteArrayOutputStream();
                byte[] chunk = new byte[10];
                int len;
                while ((len = reader.read(chunk, 0, chunk.length)) != -1) {
                    // Log only the bytes actually read, not stale buffer contents.
                    SysLog.v(TAG, " LINE=", StringUtil.toHexString(Arrays.copyOf(chunk, len)));
                    content.write(chunk, 0, len);
                }

                String text = new String(content.toByteArray(), charset);
                SysLog.v(TAG, "filename=", f.getName(), charset, "end=", text.length(), text);
            } catch (Exception e) {
                SysLog.v(TAG, ", e=", e.getMessage());
                e.printStackTrace();
            } finally {
                if (null != reader) {
                    try {
                        reader.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
        }
    }
}


你可能感兴趣的:(java读取判断文件编码格式)