Java 判断 txt 文件的编码格式(并将 txt 内容转换为 HTML)

 /**
     * txt转html
     * @param s
     * @return
     */
    public static String txtToHtml(String s) {
        try {
            StringBuilder builder = new StringBuilder();
            File file=new File(s);
            if(file.isFile() && file.exists()){ //判断文件是否存在
                String encoding=getFilecharset(new File(s));
                InputStreamReader read = new InputStreamReader(
                        new FileInputStream(file),encoding);//考虑到编码格式
                BufferedReader bufferedReader = new BufferedReader(read);
                String lineTxt = null;
                while((lineTxt = bufferedReader.readLine()) != null){
                    boolean previousWasASpace = false;
                    for (char c : (lineTxt+"\n").toCharArray()) {
                        if (c == ' ') {
                            if (previousWasASpace) {
                                builder.append(" ");
                                previousWasASpace = false;
                                continue;
                            }
                            previousWasASpace = true;
                        } else {
                            previousWasASpace = false;
                        }
                        switch (c) {
                            case '<':
                                builder.append("<");
                                break;
                            case '>':
                                builder.append(">");
                                break;
                            case '&':
                                builder.append("&");
                                break;
                            case '"':
                                builder.append("");
                                break;
                            case '\n':
                                builder.append("
"); break; // We need Tab support here, because we print StackTraces as HTML case '\t': builder.append(" "); break; default: builder.append(c); } } } read.close(); String converted = builder.toString(); String str = "(?i)\\b((?:https?://|www\\d{0,3}[.]|[a-z0-9.\\-]+[.][a-z]{2,4}/)(?:[^\\s()<>]+|\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\))+(?:\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\)|[^\\s`!()\\[\\]{};:\'\".,<>?«»“”‘’]))"; Pattern patt = Pattern.compile(str); Matcher matcher = patt.matcher(converted); converted = matcher.replaceAll("$1"); return converted; }else{ logger.error("找不到指定的文件"); return null; } } catch (Exception e) { logger.error("读取文件内容出错"); e.printStackTrace(); return null; } } //判断编码格式方法 private static String getFilecharset(File sourceFile) { String charset = "GBK"; byte[] first3Bytes = new byte[3]; try { boolean checked = false; BufferedInputStream bis = new BufferedInputStream(new FileInputStream(sourceFile)); bis.mark(0); int read = bis.read(first3Bytes, 0, 3); if (read == -1) { return charset; //文件编码为 ANSI } else if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) { charset = "UTF-16LE"; //文件编码为 Unicode checked = true; } else if (first3Bytes[0] == (byte) 0xFE && first3Bytes[1] == (byte) 0xFF) { charset = "UTF-16BE"; //文件编码为 Unicode big endian checked = true; } else if (first3Bytes[0] == (byte) 0xEF && first3Bytes[1] == (byte) 0xBB && first3Bytes[2] == (byte) 0xBF) { charset = "UTF-8"; //文件编码为 UTF-8 checked = true; } bis.reset(); if (!checked) { int loc = 0; while ((read = bis.read()) != -1) { loc++; if (read >= 0xF0) break; if (0x80 <= read && read <= 0xBF) // 单独出现BF以下的,也算是GBK break; if (0xC0 <= read && read <= 0xDF) { read = bis.read(); if (0x80 <= read && read <= 0xBF) // 双字节 (0xC0 - 0xDF) // (0x80 // - 0xBF),也可能在GB编码内 continue; else break; } else if (0xE0 <= read && read <= 0xEF) {// 也有可能出错,但是几率较小 read = bis.read(); if (0x80 <= read && read <= 0xBF) { read = bis.read(); if (0x80 <= read && read <= 0xBF) { charset = "UTF-8"; break; } 
else break; } else break; } } } bis.close(); } catch (Exception e) { e.printStackTrace(); } return charset; }


转自:http://blog.163.com/wf_shunqiziran/blog/static/176307209201258102217810/


你可能感兴趣的:(Java)