获取txt文件正确编码后读取文件内容,将内容中全角转半角

前段时间工作中需要从大量 txt 文件中抽取内容,但这些文件的编码格式不统一,内容中还夹杂着全角字符。查阅资料后,我编写了一个工具类,提供两个功能:检测 txt 文件的编码格式(默认按简体中文系统处理,其他语言的系统可修改默认编码后使用),以及将全角字符转换为半角字符。废话少说,看代码:

package demo.ok;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;

public class TextReaderEncode {
	
	public static void main(String[] args) throws Exception {
		//String path1="C:\\Users\\Administrator\\Desktop\\1\\ANSI.txt";
		//String path2="C:\\Users\\Administrator\\Desktop\\1\\Unicode.txt";
		//String path3="C:\\Users\\Administrator\\Desktop\\1\\Unicode_big.txt";
		String path4="C:\\Users\\Administrator\\Desktop\\1\\UTF-8.txt";
		
		//String str=getFilecharset(path4);
		//System.out.println(str);  //编码
		
		String results=getTextFromText(path4); //获取文件编码,根据编码读取文件内容,文件内容全角转成半角
		System.out.println(results); //结果
	}
	
	public static String getTextFromText(String filePath){  
        try {  
            InputStreamReader isr = new InputStreamReader(new FileInputStream(filePath),getFilecharset(filePath));  //getFilecharset(path):判断文件的编码格式 
            BufferedReader br = new BufferedReader(isr);  
              
            StringBuffer sb = new StringBuffer();     
            String temp = null;     
            while((temp = br.readLine()) != null){     
                sb.append(temp+"\n");     
            }
            
            String result=sb.toString();
            //判断字符串内容是否是全角半角混合都是全角,如果是则全角转为半角 
            if(result.getBytes().length > result.length() && result.getBytes().length != result.length()){ 
            	result=qToB(result); //全角转半角
    		}
            br.close();          
            return result; 
            
        } catch (FileNotFoundException e) {  
            // TODO Auto-generated catch block  
            e.printStackTrace();  
        }catch (IOException e) {  
            // TODO Auto-generated catch block  
            e.printStackTrace();  
        }     
        return null;  
	}  
	
	//全角字符串转换半角字符串
	public static String qToB(String fullWidthStr){
		if (null == fullWidthStr || fullWidthStr.length() <= 0) {
            return "";
        }
        char[] charArray = fullWidthStr.toCharArray();
        //对全角字符转换的char数组遍历
        for (int i = 0; i < charArray.length; ++i) {
            int charIntValue = (int) charArray[i];
            //如果符合转换关系,将对应下标之间减掉偏移量65248;如果是空格的话,直接做转换
            if (charIntValue >= 65281 && charIntValue <= 65374) {
                charArray[i] = (char) (charIntValue - 65248);
            } else if (charIntValue == 12288) {
                charArray[i] = (char) 32;
            }
        }
        return new String(charArray);
	}
	
	/**
	 * * 判断编码格式方法  *
	 * 新建txt默认为ANSI编码,但是ANSI编码的文件具体的编码格式根据系统语言决定,
	 * 中文简体的系统语言使用GBK读取ANSI的文件,繁体使用BIG5读取ANSI文件
	 * 一般都是使用简体系统环境,所以这里直接默认使用GBK
	 * */
	private static  String getFilecharset(String sourceFile) {  
		String charset = "GBK";   
       
		byte[] first3Bytes = new byte[3];  
		try {  
			boolean checked = false;  
			BufferedInputStream bis = new BufferedInputStream(new FileInputStream(sourceFile));  
			bis.mark(0);  
            int read = bis.read(first3Bytes, 0, 3);  
            if (read == -1) {  
                return charset; //文件编码为 ANSI ,简体默认使用GBK
            } else if (first3Bytes[0] == (byte) 0xFF  //判断头标识
            		&& first3Bytes[1] == (byte) 0xFE) {  
            	charset = "Unicode"; //文件编码为UTF-16LE即 Unicode  
                checked = true;  
            } else if (first3Bytes[0] == (byte) 0xFE  
                    && first3Bytes[1] == (byte) 0xFF) {  
                charset = "UTF-16BE"; //文件编码为 Unicode big endian  
                checked = true;  
            } else if (first3Bytes[0] == (byte) 0xEF  
                    && first3Bytes[1] == (byte) 0xBB  
                    && first3Bytes[2] == (byte) 0xBF) {  
                charset = "UTF-8"; //文件编码为 UTF-8  
                checked = true;  
            }  
            bis.reset();  //回位到上一个mark
            if (!checked) {  
                int loc = 0;  
                while ((read = bis.read()) != -1) {  
                    loc++;  
                    if (read >= 0xF0)  
                        break;  
                    if (0x80 <= read && read <= 0xBF) // 单独出现BF以下的,也算是GBK  
                        break;  
                    if (0xC0 <= read && read <= 0xDF) {  
                        read = bis.read();  
                        if (0x80 <= read && read <= 0xBF) // 双字节 (0xC0 - 0xDF)  
                            // (0x80 - 0xBF),也可能在GB编码内  
                            continue;  
                        else  
                            break;  
                    } else if (0xE0 <= read && read <= 0xEF) {// 也有可能出错,但是几率较小  
                        read = bis.read();  
                        if (0x80 <= read && read <= 0xBF) {  
                            read = bis.read();  
                            if (0x80 <= read && read <= 0xBF) {  
                                charset = "UTF-8";  
                                break;  
                            } else  
                                break;  
                        } else  
                            break;  
                    }  
                }  
            }  
            bis.close();  
        } catch (Exception e) {  
            e.printStackTrace();  
        }  
        return charset;  
	}  
}

 

你可能感兴趣的:(java,文件编码,全角转半角)