Detecting File Encodings

Overview  

   Projects frequently need to accept uploaded text files. When a file contains Chinese text, garbled characters often appear, and the root cause is always the same: the encoding of the uploaded file does not match the encoding used to parse it. There are basically two solutions:

  1. Require users to upload files in a prescribed encoding
  2. Detect the file encoding automatically
         The first approach is simple and blunt: the usual practice is to provide a sample file in the required encoding for users to download. But it leaves a lot to chance, so a general-purpose automatic detection mechanism is worth considering too. There are many toolkits for automatic encoding detection; a few are sampled below for study.
         These toolkits all share the same basic principle: take a run of bytes and match it against the rules of each candidate encoding in turn. To keep the setup simple, the tests below use local files rather than a real web environment (a web upload arrives as a byte stream anyway; for an even simpler variant, test with a byte array directly).
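
As a minimal sketch of that principle, here is a hand-rolled BOM check (the BomSniffer class and its sniffBom helper are hypothetical, not taken from any of the libraries below; real detectors add statistical analysis for BOM-less input):

public class BomSniffer {
    // Returns the charset name implied by a leading byte order mark, or null if none.
    public static String sniffBom(byte[] head) {
        if (head.length >= 3 && (head[0] & 0xFF) == 0xEF
                && (head[1] & 0xFF) == 0xBB && (head[2] & 0xFF) == 0xBF) {
            return "UTF-8";
        }
        if (head.length >= 2 && (head[0] & 0xFF) == 0xFE && (head[1] & 0xFF) == 0xFF) {
            return "UTF-16BE";
        }
        if (head.length >= 2 && (head[0] & 0xFF) == 0xFF && (head[1] & 0xFF) == 0xFE) {
            return "UTF-16LE";
        }
        return null; // no BOM: statistical detection is needed
    }

    public static void main(String[] args) {
        byte[] utf8Bom = {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF, 'a'};
        System.out.println(sniffBom(utf8Bom)); // UTF-8
    }
}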

Example

         There are many detection utility classes; the ones shown here are examples for reference only.
         The tests cover ANSI, Unicode (UTF-16LE), Unicode big endian (UTF-16BE), and UTF-8, feeding each file in as a stream.
         A simplified alternative is to test with a byte array; to get the same behavior as a file stream, wrap the byte array in a stream:
		/* A stream that has to be read twice (once to detect the encoding, once to read
		   the content) must support reset.
		   Note: some detectors accept byte arrays directly, but processing a byte array
		   is not the same as processing a stream and may produce different results. */
		BufferedInputStream in = new BufferedInputStream(new ByteArrayInputStream(content.getBytes("GBK")));
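
		A sketch of the detect-then-read pattern this enables (the 64 KB mark limit is an
		arbitrary assumption; it must cover however many bytes the detector inspects):
		in.mark(64 * 1024);        // remember the start of the stream
		byte[] head = new byte[16];
		int n = in.read(head);     // first pass: hand these bytes to a detector
		in.reset();                // rewind so the content pass reads from the top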

tika

package charset;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;

import org.apache.tika.detect.AutoDetectReader;
import org.junit.Test;


public class ParseCharset {
	public static String content = "中国";
	
	@Test
	public void parseByTika() {
		AutoDetectReader detect = null;
		InputStream in = null;
		try {
			in = new FileInputStream("C:\\Users\\admin\\Desktop\\temp\\test.txt");
			//detect = new AutoDetectReader(getInputStream(charsetName));
			detect = new AutoDetectReader(in);
			Charset charset = detect.getCharset();
			//System.out.println(charset.name());
			String row = null;
			while ((row = detect.readLine()) != null) {
				// a non-UTF guess means the GBK bytes were decoded with the wrong
				// charset; round-trip back to bytes and re-decode them as GBK
				if (!charset.name().startsWith("UTF"))
					row = new String(row.getBytes(charset.name()), "GBK");
				System.out.println("charset : " + charset.name() + "; content : " + row);
			}
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			try {
				if (in != null)
					in.close();
				if (detect != null)
					detect.close();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
		/*************** Results ****************/
		/*        unicode big endian
		  charset : UTF-16BE; content : 中国
		  
		                ansi
		  charset : IBM855; content : 中国
		  
		                unicode
		 charset : UTF-16LE; content : 中国
		  
		                utf-8
		 charset : UTF-8; content : 中国
		 
		 Note: ANSI (GBK) text generally cannot be identified; undetectable input is
		 treated as ISO-8859-1 (a pure byte encoding, so no data is lost).
		*/
		
		
		/*************** Dependencies ****************/
		/*
		 * pom dependencies:
		 *
		 * <dependency>
		 *     <groupId>org.apache.tika</groupId>
		 *     <artifactId>tika-core</artifactId>
		 *     <version>1.16</version>
		 * </dependency>
		 * <dependency>
		 *     <groupId>org.apache.tika</groupId>
		 *     <artifactId>tika-parsers</artifactId>
		 *     <version>1.16</version>
		 * </dependency>
		 *
		 * Tika can parse essentially all common file formats, extracting metadata,
		 * content, and other structured information -- including the file format,
		 * file content, character encoding, and text language.
		 */
		
	}
}
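
A note on the getBytes/new String round-trip used above: it only restores the original text when the intermediate charset maps the mis-decoded bytes back losslessly. ISO-8859-1 guarantees this (each byte 0x00-0xFF maps one-to-one to a character); other single-byte fallbacks such as IBM855 may or may not round-trip. A minimal demonstration:

public class RoundTripDemo {
    public static void main(String[] args) throws Exception {
        byte[] gbkBytes = "中国".getBytes("GBK");
        // Decoding GBK bytes as ISO-8859-1 yields mojibake, but loses no information...
        String mangled = new String(gbkBytes, "ISO-8859-1");
        // ...so encoding back to ISO-8859-1 restores the exact bytes, which can then
        // be decoded with the correct charset.
        String fixed = new String(mangled.getBytes("ISO-8859-1"), "GBK");
        System.out.println(fixed); // 中国
    }
}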
The core of Tika's detection: AutoDetectReader is configured with three detectors (Icu4jEncodingDetector, UniversalEncodingDetector, HtmlEncodingDetector) and polls them in turn. Taking UniversalEncodingDetector as the example:
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.txt;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;

import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.metadata.Metadata;

public class UniversalEncodingDetector implements EncodingDetector {

    private static final int BUFSIZE = 1024;

    private static final int LOOKAHEAD = 16 * BUFSIZE;

    public Charset detect(InputStream input, Metadata metadata)
            throws IOException {
        if (input == null) {
            return null;
        }

        input.mark(LOOKAHEAD);
        try {
            UniversalEncodingListener listener =
                    new UniversalEncodingListener(metadata);

            byte[] b = new byte[BUFSIZE];
            int n = 0;
            int m = input.read(b);
            while (m != -1 && n < LOOKAHEAD && !listener.isDone()) {
                n += m;
                listener.handleData(b, 0, m);
                m = input.read(b, 0, Math.min(b.length, LOOKAHEAD - n));
            }

            return listener.dataEnd();
        } catch (LinkageError e) {
            return null; // juniversalchardet is not available
        } finally {
            input.reset();
        }
    }

}
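
A minimal direct call into this detector might look like the sketch below (assuming tika-parsers is on the classpath and the test file exists); the BufferedInputStream wrapper matters because detect() depends on mark/reset:

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.InputStream;
import java.nio.charset.Charset;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.txt.UniversalEncodingDetector;

public class UniversalDetectorDemo {
    public static void main(String[] args) throws Exception {
        // mark/reset support is required: detect() marks, reads up to LOOKAHEAD bytes, then resets
        try (InputStream in = new BufferedInputStream(
                new FileInputStream("C:\\Users\\admin\\Desktop\\temp\\test.txt"))) {
            Charset cs = new UniversalEncodingDetector().detect(in, new Metadata());
            System.out.println(cs); // null when nothing matched
        }
    }
}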

The part to focus on is listener.handleData:

        if (this.done) {
            return;
        }
        
        if (length > 0) {
            this.gotData = true;
        }
        
        if (this.start) {
            this.start = false;
            if (length > 3) {
                int b1 = buf[offset] & 0xFF;
                int b2 = buf[offset+1] & 0xFF;
                int b3 = buf[offset+2] & 0xFF;
                int b4 = buf[offset+3] & 0xFF;
                
                         // BOM matching rules
                switch (b1) {
                case 0xEF:
                    if (b2 == 0xBB && b3 == 0xBF) {
                        this.detectedCharset = Constants.CHARSET_UTF_8;
                    }
                    break;
                case 0xFE:
                    if (b2 == 0xFF && b3 == 0x00 && b4 == 0x00) {
                        this.detectedCharset = Constants.CHARSET_X_ISO_10646_UCS_4_3412;
                    } else if (b2 == 0xFF) {
                        this.detectedCharset = Constants.CHARSET_UTF_16BE;
                    }
                    break;
                case 0x00:
                    if (b2 == 0x00 && b3 == 0xFE && b4 == 0xFF) {
                        this.detectedCharset = Constants.CHARSET_UTF_32BE;
                    } else if (b2 == 0x00 && b3 == 0xFF && b4 == 0xFE) {
                        this.detectedCharset = Constants.CHARSET_X_ISO_10646_UCS_4_2143;
                    }
                    break;
                case 0xFF:
                    if (b2 == 0xFE && b3 == 0x00 && b4 == 0x00) {
                        this.detectedCharset = Constants.CHARSET_UTF_32LE;
                    } else if (b2 == 0xFE) {
                        this.detectedCharset = Constants.CHARSET_UTF_16LE;
                    }
                    break;
                } // switch end
                
                if (this.detectedCharset != null) {
                    this.done = true;
                    return;
                }
            }
        } // if (start) end
        
        int maxPos = offset + length;
        for (int i = offset; i < maxPos; i++) {
            // ... the rest of handleData scans each byte and feeds the statistical
            //     probers used for BOM-less input (elided here)
        }
         In fact the various frameworks all look alike at this level: each carries several encoding detectors, tries them one by one, and returns a default when nothing matches (AutoDetectReader falls back to ISO-8859-1, or to a default you configure). Since the interest here is the overall approach rather than the differences between individual encodings, the lowest-level matching rules are not examined further; a sketch of the polling idea follows.
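
Here is a minimal sketch of that polling-with-fallback shape against Tika's EncodingDetector interface (the detector list and the ISO-8859-1 fallback are illustrative assumptions, not AutoDetectReader's exact wiring):

import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.List;

import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.txt.Icu4jEncodingDetector;
import org.apache.tika.parser.txt.UniversalEncodingDetector;

public class PollingDetector {
    // Try each detector in order; the first non-null answer wins,
    // otherwise fall back to a default charset.
    static Charset detect(InputStream in, Metadata meta) throws Exception {
        List<EncodingDetector> detectors = Arrays.asList(
                new UniversalEncodingDetector(),
                new Icu4jEncodingDetector());
        for (EncodingDetector d : detectors) {
            Charset cs = d.detect(in, meta); // each detector marks/resets the stream itself
            if (cs != null) {
                return cs;
            }
        }
        return StandardCharsets.ISO_8859_1;
    }
}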

cpdetector

      cpdetector is an open-source charset detection tool (hosted on SourceForge; see the dependency note below).
	public void parseByIo() {
		try {
			File file = new File("C:\\Users\\admin\\Desktop\\temp\\test.txt");
			CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
			// register the individual detectors; they are tried in the order added
			detector.add(new ByteOrderMarkDetector());
			detector.add(JChardetFacade.getInstance());
			detector.add(new ParsingDetector(true));
	        detector.add(ASCIIDetector.getInstance());
	        detector.add(UnicodeDetector.getInstance());
	        
	        // detect the charset
	        java.nio.charset.Charset charset = null;
            charset = detector.detectCodepage(file.toURI().toURL());
            
            // read the text content with the detected charset
            BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), charset));
            String content = reader.readLine();
			// as before, re-decode non-UTF results as GBK via a byte round-trip
			if (!charset.name().startsWith("UTF"))
				content = new String(content.getBytes(charset.name()), "GBK");
            System.out.println("charset : " + charset.name() +"; content : "+ content);
            reader.close();
		/*************** Results ****************/
		/*        unicode big endian
		  charset : UTF-16BE; content : 中国
		  
		                ansi
		  charset : windows-1252; content : 中国
		  
		                unicode
		 charset : UTF-16LE; content : 中国
		  
		                utf-8
		 charset : UTF-8; content : 中国
		 
		*/
		
		/*************** Dependencies ****************/
		/* https://sourceforge.net/projects/cpdetector/files/cpdetector/javadoc/
		 * Download the following jars and add them to the project:
		 * antlr-2.7.4.jar
		 * chardet-1.0.jar
		 * cpdetector-1.0.10.jar
		 * jargs-1.0.jar
		 */
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
      The source for cpdetector is not available, so it can only be used as a black box; its internals are not examined here.

TikaEncodingDetector

	public void parseByany23() {

		InputStream in = null;
		try {
			in = new FileInputStream("C:\\Users\\admin\\Desktop\\temp\\test.txt");
			TikaEncodingDetector detector = new TikaEncodingDetector();
			String guessEncoding = detector.guessEncoding(in);
			String preGuessEncoding = guessEncoding;
			// as in the other examples, treat any non-UTF guess as GBK for reading
			if (!guessEncoding.startsWith("UTF")) {
				guessEncoding = "GBK";
			}
			in.close();
			
            // re-open the file and read the content with the guessed charset
            BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream("C:\\Users\\admin\\Desktop\\temp\\test.txt"), guessEncoding));
            String content = reader.readLine();
            System.out.println("charset : " + preGuessEncoding +"; content : "+ content);
            reader.close();
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			try {
				in.close();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
		/*************** Results ****************/
		/*        unicode big endian
		  charset : UTF-16BE; content : 中国
		  
		                ansi
		  charset : IBM420_ltr; content : 中国
		  
		                unicode
		 charset : UTF-16LE; content : 中国
		  
		                utf-8
		 charset : UTF-8; content : 中国
		 
		 Note: this call only yields the encoding; the same stream cannot then be used to
		 read the content, so the file has to be opened twice (FileInputStream does not
		 support reset).
		 */
		
		
		/*************** Dependencies ****************/
		/*
		 * pom dependency:
		 *
		 * <dependency>
		 *     <groupId>org.apache.any23</groupId>
		 *     <artifactId>apache-any23-encoding</artifactId>
		 *     <version>1.1</version>
		 * </dependency>
		 */
	}
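
The double open could be avoided with a mark/reset-capable wrapper, along these lines (a sketch; it assumes the mark limit covers everything guessEncoding reads):

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;

import org.apache.any23.encoding.TikaEncodingDetector;

public class SingleOpenDemo {
    public static void main(String[] args) throws Exception {
        try (InputStream in = new BufferedInputStream(
                new FileInputStream("C:\\Users\\admin\\Desktop\\temp\\test.txt"))) {
            in.mark(1024 * 1024); // generous lookahead; must cover what the detector reads
            String guess = new TikaEncodingDetector().guessEncoding(in);
            in.reset();           // rewind instead of re-opening the file
            String charset = guess.startsWith("UTF") ? guess : "GBK";
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, charset));
            System.out.println("charset : " + guess + "; content : " + reader.readLine());
        }
    }
}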
   The core detection logic (CharsetDetector.detectAll, borrowed from ICU):
    /**
     * Return an array of all charsets that appear to be plausible
     * matches with the input data.  The array is ordered with the
     * best quality match first.
     *
     * Raises an exception if
     *   - no charsets appear to match the input data, or
     *   - no input text has been provided.
     *
     * @return An array of CharsetMatch objects representing possibly matching charsets.
     * @stable ICU 3.4
     */
    public CharsetMatch[] detectAll() {
        CharsetRecognizer csr;
        int i;
        CharsetMatch charsetMatch;
        int confidence;
        ArrayList<CharsetMatch> matches = new ArrayList<CharsetMatch>();

        // Iterate over all possible charsets, remember all that
        // give a match quality > 0.
        for (i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
            csr = ALL_CS_RECOGNIZERS.get(i).recognizer;
            charsetMatch = csr.match(this);
            if (charsetMatch != null) {
                confidence = charsetMatch.getConfidence() & 0x000000ff;
                if (confidence > 0) {
                    // Just to be safe, constrain
                    confidence = Math.min(confidence, MAX_CONFIDENCE);

                    // Apply charset hint.
                    if ((fDeclaredEncoding != null) && (fDeclaredEncoding.equalsIgnoreCase(csr.getName()))) {
                        // Reduce lack of confidence (delta between "sure" and current) by 50%.
                        confidence += (MAX_CONFIDENCE - confidence) / 2;
                    }

                    CharsetMatch m = new CharsetMatch(this, csr, confidence,
                            charsetMatch.getName(), charsetMatch.getLanguage());
                    matches.add(m);
                }
            }
        }
        Collections.sort(matches);      // CharsetMatch compares on confidence
        Collections.reverse(matches);   // Put best match first.
        CharsetMatch[] resultArray = new CharsetMatch[matches.size()];
        resultArray = matches.toArray(resultArray);
        return resultArray;
    }
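
A sketch of inspecting every candidate and its confidence through this API, using the CharsetDetector bundled with Tika:

import java.io.BufferedInputStream;
import java.io.FileInputStream;

import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;

public class DetectAllDemo {
    public static void main(String[] args) throws Exception {
        try (BufferedInputStream in = new BufferedInputStream(
                new FileInputStream("C:\\Users\\admin\\Desktop\\temp\\test.txt"))) {
            CharsetDetector detector = new CharsetDetector();
            detector.setText(in); // requires a mark/reset-capable stream
            // detectAll() returns every plausible match, best confidence first
            for (CharsetMatch m : detector.detectAll()) {
                System.out.println(m.getName() + " : " + m.getConfidence());
            }
        }
    }
}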

   Tika also ships a detector class built on this same mechanism, which can be used directly:

	@Test
	public void parseByCharsetDetector () {
		try {
			BufferedInputStream in = new BufferedInputStream(new FileInputStream(new File("C:\\Users\\admin\\Desktop\\temp\\test.txt")));
			System.out.println(in.markSupported());
			CharsetDetector detector = new CharsetDetector();
			detector.setText(in);
			
			CharsetMatch cm = detector.detect();
			
			String charsetName = cm.getName();
			BufferedReader reader = null;
			if (!charsetName.startsWith("UTF")) {
				// cm.getReader() throws for wrong single-byte guesses (e.g. IBM420_ltr),
				// so re-read the resettable stream as GBK instead
				reader = new BufferedReader(new InputStreamReader(in, "GBK"));
			} else {
				reader = new BufferedReader(cm.getReader());
			}
			String content = reader.readLine();
			
			// print the detected charset and the text content
			System.out.println("charset : " + charsetName + "; content : " + content);
			reader.close();
			in.close();
		} catch (Exception e) {
			e.printStackTrace();
		} 
		/*************** Results ****************/
		/*        unicode big endian
		  charset : UTF-16LE; content : 中国
		  
		                ansi
		 charset : IBM420_ltr; content : 中国
		  
		                unicode
		 charset : UTF-16BE; content : 中国
		  
		                utf-8
		 charset : UTF-8; content : 中国
		 
		 Note: this approach requires a stream that supports reset. With the charset
		 guessed here as IBM420_ltr, reading via cm.getReader() throws an error, so the
		 stream is read a second time (as GBK) instead.
		 */
		
		
		/*************** Dependencies ****************/
		/*
		 * pom dependency:
		 *
		 * <dependency>
		 *     <groupId>org.apache.any23</groupId>
		 *     <artifactId>apache-any23-encoding</artifactId>
		 *     <version>1.1</version>
		 * </dependency>
		 */
	}


CharsetToolkit

	@Test
	public void parseByCharsetToolkit() {
		File file = new File("C:\\Users\\admin\\Desktop\\temp\\test.txt");
		try {
			CharsetToolkit detector = new CharsetToolkit(file);
			detector.setDefaultCharset(Charset.forName("GBK"));
			Charset charset = detector.getCharset();
			BufferedReader reader = detector.getReader();
			String content = reader.readLine();
			
			// re-decode non-UTF results as GBK, then print charset and content
			if (!charset.name().startsWith("UTF"))
				content = new String(content.getBytes(charset.name()), "GBK");
			System.out.println("charset : " + charset.name() + "; content : " + content);
			reader.close();
		} catch (Exception e) {
			e.printStackTrace();
		} 
		/*************** Results ****************/
		/*        unicode big endian
		  charset : UTF-16BE; content : 中国
		  
		                ansi
		 charset : GBK; content : 中国
		  
		                unicode
		 charset : UTF-16LE; content : 中国
		  
		                utf-8
		 charset : UTF-8; content : 中国
		 
		 Note: anything this toolkit cannot identify is reported as its default charset,
		 so be sure to set a sensible default (GBK here).
		 */
		
		
		/*************** Dependencies ****************/
		/*
		 * pom dependency (CharsetToolkit ships with Groovy):
		 *
		 * <dependency>
		 *     <groupId>org.codehaus.groovy</groupId>
		 *     <artifactId>groovy-all</artifactId>
		 *     <version>2.4.12</version>
		 * </dependency>
		 */
	}
Core source (CharsetToolkit.guessEncoding):
    /**
     * Guess the encoding of the provided buffer.
     * If Byte Order Markers are encountered at the beginning of the buffer, we immediately
     * return the charset implied by this BOM. Otherwise, the file would not be a human
     * readable text file.
     *
     * If there is no BOM, this method tries to discern whether the file is UTF-8 or not.
     * If it is not UTF-8, we assume the encoding is the default system encoding
     * (of course, it might be any 8-bit charset, but usually, an 8-bit charset is the default one).
     *
     * It is possible to discern UTF-8 thanks to the pattern of characters with a multi-byte sequence.
     *
     * UCS-4 range (hex.)        UTF-8 octet sequence (binary)
     * 0000 0000-0000 007F       0xxxxxxx
     * 0000 0080-0000 07FF       110xxxxx 10xxxxxx
     * 0000 0800-0000 FFFF       1110xxxx 10xxxxxx 10xxxxxx
     * 0001 0000-001F FFFF       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 0020 0000-03FF FFFF       111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 0400 0000-7FFF FFFF       1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     *
     * With UTF-8, 0xFE and 0xFF never appear.
     *
     * @return the Charset recognized.
     */
    private Charset guessEncoding() {
        // if the file has a Byte Order Marker, we can assume the file is in UTF-xx
        // otherwise, the file would not be human readable
        if (hasUTF8Bom())
            return Charset.forName("UTF-8");
        if (hasUTF16LEBom())
            return Charset.forName("UTF-16LE");
        if (hasUTF16BEBom())
            return Charset.forName("UTF-16BE");

        // if a byte has its most significant bit set, the file is in UTF-8 or in the default encoding
        // otherwise, the file is in US-ASCII
        boolean highOrderBit = false;

        // if the file is in UTF-8, high order bytes must have a certain value, in order to be valid
        // if it's not the case, we can assume the encoding is the default encoding of the system
        boolean validU8Char = true;

        // TODO the buffer is not read up to the end, but up to length - 6
        int length = buffer.length;
        int i = 0;
        while (i < length - 6) {
            byte b0 = buffer[i];
            byte b1 = buffer[i + 1];
            byte b2 = buffer[i + 2];
            byte b3 = buffer[i + 3];
            byte b4 = buffer[i + 4];
            byte b5 = buffer[i + 5];
            if (b0 < 0) {
                // a high order bit was encountered, thus the encoding is not US-ASCII
                // it may be either an 8-bit encoding or UTF-8
                highOrderBit = true;
                // a two-bytes sequence was encountered
                if (isTwoBytesSequence(b0)) {
                    // there must be one continuation byte of the form 10xxxxxx,
                    // otherwise the following character is not a valid UTF-8 construct
                    if (!isContinuationChar(b1))
                        validU8Char = false;
                    else
                        i++;
                }
                // a three-bytes sequence was encountered
                else if (isThreeBytesSequence(b0)) {
                    // there must be two continuation bytes of the form 10xxxxxx,
                    // otherwise the following character is not a valid UTF-8 construct
                    if (!(isContinuationChar(b1) && isContinuationChar(b2)))
                        validU8Char = false;
                    else
                        i += 2;
                }
                // a four-bytes sequence was encountered
                else if (isFourBytesSequence(b0)) {
                    // there must be three continuation bytes of the form 10xxxxxx,
                    // otherwise the following character is not a valid UTF-8 construct
                    if (!(isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3)))
                        validU8Char = false;
                    else
                        i += 3;
                }
                // a five-bytes sequence was encountered
                else if (isFiveBytesSequence(b0)) {
                    // there must be four continuation bytes of the form 10xxxxxx,
                    // otherwise the following character is not a valid UTF-8 construct
                    if (!(isContinuationChar(b1) && isContinuationChar(b2)
                            && isContinuationChar(b3) && isContinuationChar(b4)))
                        validU8Char = false;
                    else
                        i += 4;
                }
                // a six-bytes sequence was encountered
                else if (isSixBytesSequence(b0)) {
                    // there must be five continuation bytes of the form 10xxxxxx,
                    // otherwise the following character is not a valid UTF-8 construct
                    if (!(isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3)
                            && isContinuationChar(b4) && isContinuationChar(b5)))
                        validU8Char = false;
                    else
                        i += 5;
                }
                else
                    validU8Char = false;
            }
            if (!validU8Char)
                break;
            i++;
        }

        // if no byte with an high order bit set, the encoding is US-ASCII
        // (it might have been UTF-7, but this encoding is usually internally used only by mail systems)
        if (!highOrderBit) {
            // returns the default charset rather than US-ASCII if the enforce8Bit flag is set.
            if (this.enforce8Bit)
                return this.defaultCharset;
            else
                return Charset.forName("US-ASCII");
        }

        // if no invalid UTF-8 were encountered, we can assume the encoding is UTF-8,
        // otherwise the file would not be human readable
        if (validU8Char)
            return Charset.forName("UTF-8");

        // finally, if it's not UTF-8 nor US-ASCII, let's assume the encoding is the default encoding
        return this.defaultCharset;
    }
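
To see why this heuristic separates UTF-8 from GBK, consider the bytes of "中": in UTF-8 it is E4 B8 AD (a 1110xxxx lead followed by two 10xxxxxx continuation bytes, a valid three-byte sequence), while in GBK it is D6 D0, where D6 looks like a two-byte UTF-8 lead (110xxxxx) but D0 (11010000) is not a continuation byte. A minimal check along these lines (hypothetical helper names mirroring the source above):

public class Utf8HeuristicDemo {
    static boolean isContinuation(byte b) {
        return (b & 0xC0) == 0x80; // 10xxxxxx
    }
    static boolean isTwoByteLead(byte b) {
        return (b & 0xE0) == 0xC0; // 110xxxxx
    }
    static boolean isThreeByteLead(byte b) {
        return (b & 0xF0) == 0xE0; // 1110xxxx
    }

    public static void main(String[] args) throws Exception {
        byte[] utf8 = "中".getBytes("UTF-8"); // E4 B8 AD
        byte[] gbk  = "中".getBytes("GBK");   // D6 D0
        // UTF-8: a three-byte lead followed by two continuation bytes -> valid
        System.out.println(isThreeByteLead(utf8[0])
                && isContinuation(utf8[1]) && isContinuation(utf8[2])); // true
        // GBK: D6 looks like a two-byte lead, but D0 is not a continuation byte -> not valid UTF-8
        System.out.println(isTwoByteLead(gbk[0]) && isContinuation(gbk[1])); // false
    }
}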
