Java 从 txt 文件中提取 QQ 号码,解决各种编码格式下的中文乱码问题

需求:一个文件夹中有若干个txt文件,其中每个文件的格式大概为:


做现场土壤和地下水调查
(848479566)
群主

海冬清
(1136655133)
管理员

管理员潋滟
(1951098843)
管理员

无心即乐
(1207685)

�Y源���H
(1816377)

土星哥
(2186944)

~海~
(2862282)

海洋世界
(3253989)

沧海一粟
(3520672)

东方
(5391796)

上海斐斯热脱附
(6699554)


这些文件的编码并不统一。为了提取每个括号 () 中的号码并写入另一个文件,代码如下:

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Extracts QQ numbers from exported group-member text files.
 *
 * <p>Each input file is expected to contain the numbers wrapped in ASCII
 * parentheses, e.g. {@code (848479566)}. For every regular file in the input
 * folder, the numbers are collected and written, separated by single spaces,
 * to a file named {@code <name>1.txt} inside a sibling folder whose name is the
 * input folder's name with {@code "1"} appended.
 */
public class QQNumberExtract {

    /** Charset assumed for files that carry no recognized byte-order mark. */
    private static final String DEFAULT_CHARSET = "GBK";

    /** One run of digits enclosed in ASCII parentheses; group 1 is the digits. */
    private static final Pattern QQ_NUMBER = Pattern.compile("\\((\\d+)\\)");

    /**
     * Processes every regular file in the hard-coded input folder.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String folderPath = "F:" + File.separator + "土壤qq群好友";
        File fileFolder = new File(folderPath);

        // listFiles() returns null when the path is missing or not a directory.
        File[] txtFiles = fileFolder.listFiles();
        if (txtFiles == null) {
            System.err.println("Not a readable directory: " + folderPath);
            return;
        }

        for (File f : txtFiles) {
            if (!f.isFile()) {
                continue; // sub-directories cannot be opened as text
            }
            String charset = getCharset(f);
            if (charset == null) {
                continue; // the read failure has already been reported
            }
            extract(f, charset);
        }
    }

    /**
     * Reads {@code txtFile} with the given encoding, collects every
     * parenthesized run of digits, prints them to standard output and writes
     * them (each followed by one space) to {@code <parent>1/<name>1.txt}.
     *
     * <p>I/O failures are reported with a stack trace and are not rethrown.
     *
     * @param txtFile  the file to scan
     * @param encoding charset name used to decode the file; {@code null}
     *                 falls back to GBK
     */
    public static void extract(File txtFile, String encoding) {
        String charsetName = (encoding != null) ? encoding : DEFAULT_CHARSET;

        FileInputStream inputStream = null;
        BufferedReader bfReader = null;
        BufferedWriter bw = null;

        try {
            inputStream = new FileInputStream(txtFile);
            bfReader = new BufferedReader(new InputStreamReader(inputStream, charsetName));

            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = bfReader.readLine()) != null) {
                // Only "(digits)" is accepted, so malformed lines such as "()"
                // no longer leak the text "null" into the output.
                Matcher matcher = QQ_NUMBER.matcher(line);
                while (matcher.find()) {
                    sb.append(matcher.group(1)).append(' ');
                }
            }
            System.out.println(sb.toString());

            // Resolve against the absolute path so a bare relative file name
            // does not yield a null parent.
            File parentDir = txtFile.getAbsoluteFile().getParentFile();
            if (parentDir == null) {
                throw new IOException("Cannot determine parent folder of " + txtFile);
            }
            File saveDir = new File(parentDir.getPath() + "1");
            if (!saveDir.isDirectory() && !saveDir.mkdirs()) {
                throw new IOException("Cannot create output folder " + saveDir);
            }

            String newFileName = getNameWithOutSuffix(txtFile.getName()) + "1.txt";
            File outFile = new File(saveDir, newFileName);

            // Non-append mode truncates any previous result.
            bw = new BufferedWriter(new FileWriter(outFile, false));
            bw.write(sb.toString());
            bw.flush();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(bw);
            closeQuietly(bfReader);
            closeQuietly(inputStream);
        }
    }

    /**
     * Returns the file name without its last extension.
     *
     * @param name a file name such as {@code "group.txt"}; may be {@code null}
     * @return the text before the last {@code '.'}; the name itself when it
     *         contains no dot; {@code null} when {@code name} is {@code null}
     */
    public static String getNameWithOutSuffix(String name) {
        if (name == null) {
            return null;
        }
        int lastDot = name.lastIndexOf('.');
        return (lastDot < 0) ? name : name.substring(0, lastDot);
    }

    /**
     * Guesses the file's charset from its leading byte-order mark.
     *
     * @param file the file to inspect
     * @return {@code "UTF-8"} for an EF BB BF mark, {@code "UTF-16"} for an
     *         FF FE or FE FF mark, GBK otherwise (including files shorter than
     *         the mark); {@code null} if the file could not be read
     */
    private static String getCharset(File file) {
        BufferedInputStream bin = null;
        try {
            bin = new BufferedInputStream(new FileInputStream(file));
            int first = bin.read();
            int second = bin.read();

            if (first == 0xEF && second == 0xBB && bin.read() == 0xBF) {
                return "UTF-8";
            }
            // "UTF-16" decoding reads the mark itself to choose the byte order.
            if ((first == 0xFF && second == 0xFE) || (first == 0xFE && second == 0xFF)) {
                return "UTF-16";
            }
            return DEFAULT_CHARSET;
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        } finally {
            closeQuietly(bin);
        }
    }

    /** Closes {@code resource} if it is non-null, reporting but not rethrowing failures. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}


你可能感兴趣的:(java读取txt,java读txt中文乱码)