java检测文件编码——cpdetector

cpdetector是一个可以自动检测文本编码格式的项目。

detector按照“谁最先返回非空的探测结果,就以该结果为准”的原则返回探测到的字符集编码。
使用需要用到三个第三方JAR包:antlr.jar、chardet.jar和cpdetector.jar
cpdetector是基于统计学原理的,不保证完全正确。

以下是读取xxx.txt文件中的内容,以html的方式返回给浏览器的简单servlet实例。在实现的过程中,遇到的最大问题就是浏览器打开时出现中文乱码,原因是.txt文件保存时的编码不统一,所以在“out.println(new String(buffer, charset));”时charset不能写死,而应该通过某种途径获取.txt文件的编码格式。获取的方式网上主要有以下三种,亲测第三种解决了问题,第一、第二种方法都不完善。

package com.hwc.a.servlet;

import info.monitorenter.cpdetector.io.ASCIIDetector;
import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
import info.monitorenter.cpdetector.io.JChardetFacade;
import info.monitorenter.cpdetector.io.ParsingDetector;
import info.monitorenter.cpdetector.io.UnicodeDetector;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;

import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

public class TxtToHtmlServlet extends HttpServlet {
    private static final long serialVersionUID = 1L;

    public void doGet(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        String path = request.getParameter("path");
        if (path != null && !"".equals(path)) {
            // TODO 开始下载
            path = new String(path.getBytes("ISO-8859-1"), "utf-8");
            InputStream fis = null;
            PrintWriter out = null;
            try {
                // path是指欲下载的文件的路径。
                // File file = new File(request.getRealPath("/")+"/"+path);
                File file = new File(path);
                // 取得文件名。
                String filename = file.getName();
                // 取得文件的后缀名。
                filename = filename.substring(0, filename.lastIndexOf("."));

                // 以流的形式下载文件。
                fis = new BufferedInputStream(new FileInputStream(file));
                byte[] buffer = new byte[fis.available()];
                fis.read(buffer);
                // 清空response
                response.reset();

                String charset = getFileEncode(path);
                System.out.println("============getFileEncode charset:" + charset);
                if (charset == null) {
                    charset = getCharset(path);
                    System.out.println("============getCharset charset:" + charset);
                }

                response.setHeader("Content-type", "text/html;charset="+ charset);
                response.setContentType("text/html;charset=" + charset);
                out = response.getWriter();
                out.println(new String(buffer, charset));
                out.flush();
            } catch (IOException ex) {
                ex.printStackTrace();
            } finally {
                if (fis != null) {
                    fis.close();
                }
                if (out != null) {
                    out.close();
                }
            }
        }
    }


    /** * 方法一: 仅作参考,不准确 * @param fileName * @return * @throws IOException */
    private String getCharset(String fileName) throws IOException {

        BufferedInputStream bin = new BufferedInputStream(new FileInputStream(
                fileName));
        int p = (bin.read() << 8) + bin.read();

        String code = null;

        switch (p) {
        case 0xefbb:
            code = "UTF-8";
            break;
        case 0xfffe:
            code = "Unicode";
            break;
        case 0xfeff:
            code = "UTF-16BE";
            break;
        default:
            code = "GB2312";
        }
        return code;
    }

    /** * 方法二: 仅作参考,不准确 * @param head * @return */
    private String codetype(byte[] head) {
        byte[] codehead = new byte[4];
        // 截取数组
        System.arraycopy(head, 0, codehead, 0, 4);
        String code = "";
        if (head[0] == -1 && head[1] == -2) {
            code = "UTF-16";
        } else if (head[0] == -2 && head[1] == -1) {
            code = "Unicode";
        } else if (head[0] == -17 && head[1] == -69 && head[2] == -65)
            code = "UTF-8";
        else {
            code = "gb2312";
        }
        return code;
    }

    /** * 方法三:比较准确,解决了实际问题 * @param filePath * @return */
    public static String getFileEncode(String filePath) {
        String charsetName = null;
        try {
            File file = new File(filePath);
            CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
            detector.add(new ParsingDetector(false));
            detector.add(JChardetFacade.getInstance());
            detector.add(ASCIIDetector.getInstance());
            detector.add(UnicodeDetector.getInstance());
            java.nio.charset.Charset charset = null;
            charset = detector.detectCodepage(file.toURI().toURL());
            if (charset != null) {
                charsetName = charset.name();
            } else {
                charsetName = "UTF-8";
            }
        } catch (Exception ex) {
            ex.printStackTrace();
            return null;
        }
        return charsetName;
    }

    public void doPost(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        doGet(request, response);
    }
}

你可能感兴趣的:(java检测文件编码——cpdetector)