跳过UTF-8的BOM

/**
version: 1.1 / 2007-01-25
- changed BOM recognition ordering (longer boms first)

Original pseudocode   : Thomas Weidenfeller
Implementation tweaked: Aki Nieminen

http://www.unicode.org/unicode/faq/utf_bom.html
BOMs in byte length ordering:
  00 00 FE FF    = UTF-32, big-endian
  FF FE 00 00    = UTF-32, little-endian
  EF BB BF       = UTF-8
  FE FF          = UTF-16, big-endian
  FF FE          = UTF-16, little-endian

Win2k Notepad:
  Unicode format = UTF-16LE
***/

/**
 * An {@link InputStream} wrapper that recognises a leading Unicode byte order mark (BOM),
 * removes it from the stream and reports the encoding the BOM stands for.
 *
 * <p>Recognised BOMs, longest first: UTF-32BE, UTF-32LE, UTF-8, UTF-16BE, UTF-16LE. When no BOM
 * is present nothing is consumed and the default encoding passed to the constructor is reported.
 *
 * <p>Detection happens lazily on the first call to {@link #getEncoding()} or to one of the
 * {@code read} methods. This class is not synchronized.
 */
public class UnicodeInputStream extends InputStream
{
    /**
     * Demo: prints the detected encoding of {@code data.txt} followed by its decoded content in
     * chunks of at most ten characters.
     *
     * @param args ignored
     * @throws Exception if the file cannot be opened or read
     */
    public static void main(String[] args) throws Exception
    {
        UnicodeInputStream ui = new UnicodeInputStream(new FileInputStream("data.txt"), "UTF-8");
        try
        {
            String encoding = ui.getEncoding();
            System.out.println(encoding);
            InputStreamReader reader = new InputStreamReader(ui, encoding);
            CharBuffer cb = CharBuffer.allocate(10);
            while (reader.read(cb) != -1)
            {
                cb.flip();
                System.out.print("===" + cb + "===");
                cb.clear();
            }
        }
        finally
        {
            // Closing the wrapper closes the underlying file even when decoding fails.
            ui.close();
        }
    }

    /** Length in bytes of the longest BOM (UTF-32); also the push-back capacity needed. */
    private static final int BOM_SIZE = 4;

    private final PushbackInputStream internalIn;
    private final String defaultEnc;
    private boolean isInited = false;
    private String encoding;

    /**
     * Wraps {@code in}; no bytes are read until the encoding or data is first requested.
     *
     * @param in              the stream that may start with a BOM
     * @param defaultEncoding the encoding reported when no BOM is found
     */
    public UnicodeInputStream(InputStream in, String defaultEncoding)
    {
        internalIn = new PushbackInputStream(in, BOM_SIZE);
        defaultEnc = defaultEncoding;
    }

    /**
     * Reads up to {@link #BOM_SIZE} bytes, determines the encoding and pushes back every byte
     * that is not part of a BOM. Does nothing after the first successful call.
     *
     * @throws IOException if reading from or pushing back into the underlying stream fails
     */
    void init() throws IOException
    {
        if (isInited)
        {
            return;
        }

        // A single read() may return fewer bytes than requested before end of stream,
        // so keep reading until the buffer is full or the stream is exhausted.
        byte[] bom = new byte[BOM_SIZE];
        int n = 0;
        while (n < BOM_SIZE)
        {
            int count = internalIn.read(bom, n, BOM_SIZE - n);
            if (count <= 0)
            {
                break;
            }
            n += count;
        }

        // Only bytes actually read take part in the comparison; the zero padding of a
        // short buffer must not turn "FF FE" into the UTF-32LE mark "FF FE 00 00".
        int skip;
        if (startsWith(bom, n, 0x00, 0x00, 0xFE, 0xFF))
        {
            skip = 4;
            encoding = "UTF-32BE";
        }
        else if (startsWith(bom, n, 0xFF, 0xFE, 0x00, 0x00))
        {
            skip = 4;
            encoding = "UTF-32LE";
        }
        else if (startsWith(bom, n, 0xEF, 0xBB, 0xBF))
        {
            skip = 3;
            encoding = "UTF-8";
        }
        else if (startsWith(bom, n, 0xFE, 0xFF))
        {
            skip = 2;
            encoding = "UTF-16BE";
        }
        else if (startsWith(bom, n, 0xFF, 0xFE))
        {
            skip = 2;
            encoding = "UTF-16LE";
        }
        else
        {
            skip = 0;
            encoding = defaultEnc;
        }

        // Return the payload bytes that were read past the BOM.
        if (n > skip)
        {
            internalIn.unread(bom, skip, n - skip);
        }
        isInited = true;
    }

    /** Returns true when the first {@code available} bytes of {@code buf} begin with {@code signature}. */
    private static boolean startsWith(byte[] buf, int available, int... signature)
    {
        if (available < signature.length)
        {
            return false;
        }
        for (int i = 0; i < signature.length; i++)
        {
            if (buf[i] != (byte) signature[i])
            {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns the encoding indicated by the BOM, or the default encoding when there is none.
     *
     * @return the detected encoding name
     * @throws IllegalStateException if the underlying stream cannot be read
     */
    public String getEncoding()
    {
        try
        {
            init();
        }
        catch (IOException e)
        {
            throw new IllegalStateException(e);
        }
        return encoding;
    }

    /**
     * Returns the encoding supplied at construction time.
     *
     * @return the default encoding name
     */
    public String getDefaultEncoding()
    {
        return defaultEnc;
    }

    /**
     * Reads the next byte after the BOM, if any.
     *
     * @return the byte as an int in the range 0-255, or -1 at end of stream
     * @throws IOException if the underlying stream fails
     */
    @Override
    public int read() throws IOException
    {
        // Detect first so the BOM is never handed to the caller and a later
        // getEncoding() does not inspect bytes from the middle of the stream.
        init();
        return internalIn.read();
    }

    /**
     * Bulk read that bypasses the byte-at-a-time default of {@link InputStream}.
     *
     * @param b   destination buffer
     * @param off offset in {@code b} at which to start storing bytes
     * @param len maximum number of bytes to read
     * @return the number of bytes read, or -1 at end of stream
     * @throws IOException if the underlying stream fails
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException
    {
        init();
        return internalIn.read(b, off, len);
    }

    /**
     * Closes the underlying stream.
     *
     * @throws IOException if closing fails
     */
    @Override
    public void close() throws IOException
    {
        internalIn.close();
    }
}



/**
 * A {@link Reader} that recognises a leading Unicode byte order mark (BOM) in a byte stream,
 * removes it and decodes the remaining bytes with the encoding the BOM stands for.
 *
 * <p>Recognised BOMs, longest first: UTF-32BE, UTF-32LE, UTF-8, UTF-16BE, UTF-16LE. When no BOM
 * is present nothing is skipped and the default encoding passed to the constructor is used.
 * Detection happens eagerly in the constructor.
 */
public class UnicodeReader extends Reader
{
    /**
     * Demo: prints {@code data.txt} line by line, decoded according to its BOM (UTF-8 if none).
     *
     * @param args ignored
     * @throws Exception if the file cannot be opened or read
     */
    public static void main(String[] args) throws Exception
    {
        UnicodeReader reader = new UnicodeReader(new FileInputStream("data.txt"), "UTF-8");
        BufferedReader br = new BufferedReader(reader);
        try
        {
            String line;
            while ((line = br.readLine()) != null)
            {
                System.out.println(line);
            }
        }
        finally
        {
            // Closes the whole chain down to the file, even when reading fails.
            br.close();
        }
    }

    /** Length in bytes of the longest BOM (UTF-32); also the push-back capacity needed. */
    private static final int BOM_SIZE = 4;

    private final PushbackInputStream internalIn;
    private final String defaultEnc;
    private InputStreamReader reader;
    private String encoding;
    private boolean isInited = false;

    /**
     * Wraps {@code in}, detects the BOM immediately and sets up decoding.
     *
     * @param in              the stream that may start with a BOM
     * @param defaultEncoding the encoding used when no BOM is found
     * @throws ExceptionInInitializerError if the stream cannot be read or the encoding is
     *                                     not supported; the stream is closed in that case
     */
    public UnicodeReader(InputStream in, String defaultEncoding)
    {
        defaultEnc = defaultEncoding;
        internalIn = new PushbackInputStream(in, BOM_SIZE);
        try
        {
            init();
            reader = new InputStreamReader(internalIn, encoding);
        }
        catch (IOException e)
        {
            try
            {
                internalIn.close();
            }
            catch (IOException e1)
            {
                // Best-effort cleanup; the failure reported below is the one that matters.
                e1.printStackTrace();
            }

            // This error type cannot carry both a message and a cause, so the
            // triggering exception is described in the message instead of being dropped.
            throw new ExceptionInInitializerError("initialization failed: " + e);
        }
    }

    /**
     * Returns the encoding indicated by the BOM, or the default encoding when there is none.
     *
     * @return the encoding name used for decoding
     */
    public String getEncoding()
    {
        return encoding;
    }

    /**
     * Reads up to {@link #BOM_SIZE} bytes, determines the encoding and pushes back every byte
     * that is not part of a BOM. Does nothing after the first successful call.
     *
     * <p>Invoked from the constructor, so an overriding implementation runs before the
     * subclass's own fields are initialised.
     *
     * @throws IOException if reading from or pushing back into the underlying stream fails
     */
    protected void init() throws IOException
    {
        if (isInited)
        {
            return;
        }

        // A single read() may return fewer bytes than requested before end of stream,
        // so keep reading until the buffer is full or the stream is exhausted.
        byte[] bom = new byte[BOM_SIZE];
        int n = 0;
        while (n < BOM_SIZE)
        {
            int count = internalIn.read(bom, n, BOM_SIZE - n);
            if (count <= 0)
            {
                break;
            }
            n += count;
        }

        // Only bytes actually read take part in the comparison; the zero padding of a
        // short buffer must not turn "FF FE" into the UTF-32LE mark "FF FE 00 00".
        int skip;
        if (startsWith(bom, n, 0x00, 0x00, 0xFE, 0xFF))
        {
            skip = 4;
            encoding = "UTF-32BE";
        }
        else if (startsWith(bom, n, 0xFF, 0xFE, 0x00, 0x00))
        {
            skip = 4;
            encoding = "UTF-32LE";
        }
        else if (startsWith(bom, n, 0xEF, 0xBB, 0xBF))
        {
            skip = 3;
            encoding = "UTF-8";
        }
        else if (startsWith(bom, n, 0xFE, 0xFF))
        {
            skip = 2;
            encoding = "UTF-16BE";
        }
        else if (startsWith(bom, n, 0xFF, 0xFE))
        {
            skip = 2;
            encoding = "UTF-16LE";
        }
        else
        {
            skip = 0;
            encoding = defaultEnc;
        }

        // Return the payload bytes that were read past the BOM.
        if (n > skip)
        {
            internalIn.unread(bom, skip, n - skip);
        }
        isInited = true;
    }

    /** Returns true when the first {@code available} bytes of {@code buf} begin with {@code signature}. */
    private static boolean startsWith(byte[] buf, int available, int... signature)
    {
        if (available < signature.length)
        {
            return false;
        }
        for (int i = 0; i < signature.length; i++)
        {
            if (buf[i] != (byte) signature[i])
            {
                return false;
            }
        }
        return true;
    }

    /**
     * Closes the decoder and the underlying stream.
     *
     * @throws IOException if closing fails
     */
    @Override
    public void close() throws IOException
    {
        reader.close();
    }

    /**
     * Reads decoded characters into a portion of an array.
     *
     * @param cbuf destination buffer
     * @param off  offset in {@code cbuf} at which to start storing characters
     * @param len  maximum number of characters to read
     * @return the number of characters read, or -1 at end of stream
     * @throws IOException if the underlying stream fails
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException
    {
        return reader.read(cbuf, off, len);
    }
}


参考
http://koti.mbnet.fi/akini/java/unicodereader/

你可能感兴趣的:(unicode,bom)