Java 字符集探测器 jchardet

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.charset.StandardCharsets;

import org.mozilla.intl.chardet.HtmlCharsetDetector;
import org.mozilla.intl.chardet.nsDetector;
import org.mozilla.intl.chardet.nsICharsetDetectionObserver;
import org.mozilla.intl.chardet.nsPSMDetector;
// jchardet, a character-set detector for Java: http://jchardet.sourceforge.net/
/**
 * Command-line demo that runs the jchardet detector over a URL or a sample string.
 *
 * <p>Usage: {@code JCharset [url [lang]]}. With no arguments a built-in sample
 * string is analysed; otherwise the content behind {@code url} is read and fed
 * to the detector. All results are printed to standard output.
 */
public class JCharset {

	/** Size, in bytes, of the chunks handed to the detector. */
	private static final int CHUNK_SIZE = 1024;

	/**
	 * Entry point.
	 *
	 * @param args {@code args[0]} is the URL to probe, e.g. {@code file:///D:/test.txt}
	 *             or {@code http://www.baidu.com/}. When exactly two arguments are
	 *             given, {@code args[1]} is parsed as the integer language hint passed
	 *             to {@code nsDetector}; otherwise {@code nsPSMDetector.ALL} is used.
	 * @throws Exception if {@code args[1]} is not an integer, the URL is malformed,
	 *                   or the stream cannot be opened or read
	 */
	public static void main(String[] args) throws Exception {
		if (args.length < 1) {
			System.err.println("args.length<1");
			// NOTE(review): this sample literal looks mis-encoded (it contains U+FFFD);
			// it is kept byte-for-byte — confirm the intended text.
			detectorString("This is a 涓枃鐨�String�");
			return;
		}

		// The detector is only needed on the URL path, so build it after the early return.
		int lang = (args.length == 2) ? Integer.parseInt(args[1])
				: nsPSMDetector.ALL;
		nsDetector det = new nsDetector(lang);
		det.Init(new nsICharsetDetectionObserver() {
			public void Notify(String charset) {
				// Raises the public flag on the library's demo class as well;
				// nothing in this class reads it.
				HtmlCharsetDetector.found = true;
				System.out.println("CHARSET = " + charset);
			}
		});

		URL url = new URL(args[0]);
		boolean isAscii;
		// try-with-resources closes the URL stream even if reading fails.
		try (InputStream in = new BufferedInputStream(url.openStream())) {
			isAscii = feed(det, in);
		}

		System.out.println(isAscii ? "CHARSET = ASCII" : "CHARSET != ASCII");
	}

	/**
	 * Runs a fresh detector over the UTF-8 encoding of {@code str} and prints the outcome.
	 *
	 * @param str text to analyse
	 * @throws IOException declared because the shared read loop works on a generic
	 *                     {@link InputStream}
	 */
	private static void detectorString(String str) throws IOException {
		nsDetector det = new nsDetector(nsPSMDetector.ALL);
		det.Init(new nsICharsetDetectionObserver() {
			public void Notify(String charset) {
				System.out.println("detectorString CHARSET = " + charset);
			}
		});

		// Encode explicitly: getBytes() without a charset uses the platform default,
		// which would make the detection result differ from machine to machine.
		byte[] bytes = str.getBytes(StandardCharsets.UTF_8);
		boolean isAscii = feed(det, new ByteArrayInputStream(bytes));

		System.out.println(isAscii ? "detectorString CHARSET = ASCII"
				: "detectorString CHARSET != ASCII");
	}

	/**
	 * Feeds {@code in} to {@code det} chunk by chunk, then signals end of data.
	 *
	 * @param det initialised detector that receives the bytes
	 * @param in  stream to read; the caller remains responsible for closing it
	 * @return {@code true} if {@code det.isAscii} accepted every chunk that was read
	 * @throws IOException if reading from {@code in} fails
	 */
	private static boolean feed(nsDetector det, InputStream in) throws IOException {
		byte[] buf = new byte[CHUNK_SIZE];
		boolean isAscii = true;
		boolean done = false;
		int len;

		// Once DoIt has returned true no further detector call would be made on
		// later chunks, so stop reading instead of draining the rest of the stream.
		while (!done && (len = in.read(buf, 0, buf.length)) != -1) {
			// Keep checking for pure ASCII until the first non-ASCII chunk shows up.
			if (isAscii) {
				isAscii = det.isAscii(buf, len);
			}
			// Only non-ASCII data is handed to the detector proper.
			if (!isAscii) {
				done = det.DoIt(buf, len, false);
			}
		}
		det.DataEnd();
		return isAscii;
	}
}

你可能感兴趣的:(Java 字符集探测器 jchardet)