方法1:利用windows文本文件编码特点。
Windows下,记事本保存的Unicode(即UTF-16LE)、Unicode big endian(即UTF-16BE)和UTF-8编码的txt文件,开头会多出几个字节(即BOM,字节顺序标记),分别是FF、FE(Unicode),FE、FF(Unicode big endian),EF、BB、BF(UTF-8)。
/**
 * Guesses the character encoding of a text file.
 *
 * <p>The first bytes are checked for a byte-order mark: {@code FF FE} yields
 * {@code "UTF-16LE"}, {@code FE FF} yields {@code "UTF-16BE"} and
 * {@code EF BB BF} yields {@code "UTF-8"}. Without a BOM the content is
 * scanned from the beginning and reported as {@code "UTF-8"} once a
 * well-formed three-byte UTF-8 sequence is met; in every other case the
 * default {@code "GBK"} is returned.
 *
 * <p>This method is best-effort: it never throws. If the file cannot be
 * read, a message is written to standard error and {@code "GBK"} is returned.
 *
 * @param file the file to inspect
 * @return {@code "UTF-16LE"}, {@code "UTF-16BE"}, {@code "UTF-8"} or {@code "GBK"}
 */
public static String getCharset(File file) {
    final String defaultCharset = "GBK";
    // try-with-resources closes the stream on every exit path, including
    // the early returns below.
    try (BufferedInputStream bis = new BufferedInputStream(new FileInputStream(file))) {
        byte[] head = new byte[3];
        // The read limit must cover the bytes consumed before reset().
        bis.mark(head.length);
        int read = bis.read(head, 0, head.length);
        if (read == -1) {
            // Empty file: nothing to inspect.
            return defaultCharset;
        }
        if (head[0] == (byte) 0xFF && head[1] == (byte) 0xFE) {
            return "UTF-16LE";
        }
        if (head[0] == (byte) 0xFE && head[1] == (byte) 0xFF) {
            return "UTF-16BE";
        }
        if (head[0] == (byte) 0xEF && head[1] == (byte) 0xBB && head[2] == (byte) 0xBF) {
            return "UTF-8";
        }
        // No BOM: rewind and examine the content itself.
        bis.reset();
        return hasUtf8ThreeByteSequence(bis) ? "UTF-8" : defaultCharset;
    } catch (java.io.IOException | RuntimeException e) {
        // Deliberately best-effort: report the problem and fall back to the default.
        System.err.println("getCharset: cannot inspect " + file + ": " + e);
        return defaultCharset;
    }
}

/**
 * Scans the stream and reports whether a well-formed three-byte UTF-8
 * sequence is found before any byte pattern that rules UTF-8 out.
 */
private static boolean hasUtf8ThreeByteSequence(BufferedInputStream in) throws java.io.IOException {
    int b;
    while ((b = in.read()) != -1) {
        if (b >= 0xF0) {
            return false;
        }
        // A continuation byte appearing on its own is treated as GBK.
        if (isContinuationByte(b)) {
            return false;
        }
        if (0xC0 <= b && b <= 0xDF) {
            // Two-byte pattern (0xC0-0xDF)(0x80-0xBF): also possible in GB
            // encodings, so it is inconclusive and the scan continues.
            if (!isContinuationByte(in.read())) {
                return false;
            }
        } else if (0xE0 <= b && b <= 0xEF) {
            // Three-byte pattern: a lead byte followed by two continuation
            // bytes is accepted as UTF-8. This can still be wrong, but rarely.
            return isContinuationByte(in.read()) && isContinuationByte(in.read());
        }
    }
    return false;
}

/** Returns whether {@code b} lies in the range 0x80-0xBF. */
private static boolean isContinuationByte(int b) {
    return 0x80 <= b && b <= 0xBF;
}
缺点:不能这样去探测linux下的文件。
方法2:开源工程 JCharDet
http://www.iteye.com/topic/266501
package org.mozilla.intl.chardet;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
/**
* 借助JCharDet获取文件字符集
* @author icer
* PS:
* JCharDet 是mozilla自动字符集探测算法代码的java移植,其官方主页为:
* http://jchardet.sourceforge.net/
* @date 2008/11/13
*/
public class FileCharsetDetector {
private boolean found = false;
/**
* 如果完全匹配某个字符集检测算法, 则该属性保存该字符集的名称. 否则(如二进制文件)其值就为默认值 null, 这时应当查询属性
*/
private String encoding =null;
public static void main(String[] argv) throws Exception {
if (argv.length !=1 && argv.length !=2) {
System.out
.println("Usage: FileCharsetDetector
System.out.println("");
System.out.println("Where
System.out.println("For optional
System.out.println(" 1 => Japanese");
System.out.println(" 2 => Chinese");
System.out.println(" 3 => Simplified Chinese");
System.out.println(" 4 => Traditional Chinese");
System.out.println(" 5 => Korean");
System.out.println(" 6 => Dont know (default)");
return;
} else {
String encoding = null;
if (argv.length ==2) {
encoding = new FileCharsetDetector().guestFileEncoding(argv[0],
Integer.valueOf(argv[1]));
} else {
encoding = new FileCharsetDetector().guestFileEncoding(argv[0]);
}
System.out.println("文件编码:" + encoding);
}
}
/**
* 传入一个文件(File)对象,检查文件编码
*
* @param file
* File对象实例
* @return 文件编码,若无,则返回null
* @throws FileNotFoundException
* @throws IOException
*/
public String guestFileEncoding(File file)throws FileNotFoundException,
IOException {
return geestFileEncoding(file,new nsDetector());
}
/**
* 获取文件的编码
*
* @param file
* File对象实例
* @param languageHint
* 语言提示区域代码 eg:1 : Japanese; 2 : Chinese; 3 : Simplified Chinese;
* 4 : Traditional Chinese; 5 : Korean; 6 : Dont know (default)
* @return 文件编码,eg:UTF-8,GBK,GB2312形式,若无,则返回null
* @throws FileNotFoundException
* @throws IOException
*/
public String guestFileEncoding(File file,int languageHint)
throws FileNotFoundException, IOException {
return geestFileEncoding(file,new nsDetector(languageHint));
}
/**
* 获取文件的编码
*
* @param path
* 文件路径
* @return 文件编码,eg:UTF-8,GBK,GB2312形式,若无,则返回null
* @throws FileNotFoundException
* @throws IOException
*/
public String guestFileEncoding(String path)throws FileNotFoundException,
IOException {
return guestFileEncoding(new File(path));
}
/**
* 获取文件的编码
*
* @param path
* 文件路径
* @param languageHint
* 语言提示区域代码 eg:1 : Japanese; 2 : Chinese; 3 : Simplified Chinese;
* 4 : Traditional Chinese; 5 : Korean; 6 : Dont know (default)
* @return
* @throws FileNotFoundException
* @throws IOException
*/
public String guestFileEncoding(String path,int languageHint)
throws FileNotFoundException, IOException {
return guestFileEncoding(new File(path), languageHint);
}
/**
* 获取文件的编码
*
* @param file
* @param det
* @return
* @throws FileNotFoundException
* @throws IOException
*/
private String geestFileEncoding(File file, nsDetector det)
throws FileNotFoundException, IOException {
// Set an observer...
// The Notify() will be called when a matching charset is found.
det.Init(new nsICharsetDetectionObserver() {
public void Notify(String charset) {
found = true;
encoding = charset;
}
});
BufferedInputStream imp = new BufferedInputStream(new FileInputStream(
file));
byte[] buf =new byte[1024];
int len;
boolean done =false;
boolean isAscii =true;
while ((len = imp.read(buf,0, buf.length)) != -1) {
// Check if the stream is only ascii.
if (isAscii)
isAscii = det.isAscii(buf, len);
// DoIt if non-ascii and not done yet.
if (!isAscii && !done)
done = det.DoIt(buf, len,false);
}
det.DataEnd();
if (isAscii) {
encoding = "ASCII";
found = true;
}
if (!found) {
String prob[] = det.getProbableCharsets();
if (prob.length >0) {
// 在没有发现情况下,则取第一个可能的编码
encoding = prob[0];
} else {
return null;
}
}
return encoding;
}
}
jar包下载地址:http://download.csdn.net/detail/u012587637/8041169
方法3:开源工程juniversalchardet
http://code.google.com/p/juniversalchardet/
/**
 * Detects the encoding of a file with juniversalchardet's
 * {@code UniversalDetector}.
 *
 * <p>The file is read in 4096-byte chunks that are handed to the detector
 * until the file ends or the detector reports it is done. The outcome is
 * also printed to standard output.
 *
 * <p>This method is best-effort: a failure while reading or detecting is
 * reported on standard error and results in {@code null}.
 *
 * @param file the file to inspect
 * @return the detected charset name, or {@code null} if the file does not
 *         exist, no charset was detected, or reading failed
 */
public static String getFileIncode(File file) {
    if (!file.exists()) {
        System.err.println("getFileIncode: file not exists!");
        return null;
    }
    byte[] buf = new byte[4096];
    // try-with-resources closes the stream on the failure path as well.
    try (FileInputStream fis = new FileInputStream(file)) {
        UniversalDetector detector = new UniversalDetector(null);
        int nread;
        while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {
            detector.handleData(buf, 0, nread);
        }
        detector.dataEnd();
        String encoding = detector.getDetectedCharset();
        if (encoding != null) {
            System.out.println("Detected encoding = " + encoding);
        } else {
            System.out.println("No encoding detected.");
        }
        detector.reset();
        return encoding;
    } catch (java.io.IOException | RuntimeException e) {
        // Deliberately best-effort: report the failure and signal "unknown".
        System.err.println("getFileIncode: cannot inspect " + file + ": " + e);
    }
    return null;
}
jar包下载:http://download.csdn.net/detail/u012587637/8041181
说明:第三个方法要比第二个速度快些,也比较新,所以推荐使用第三个。