Java 文件处理测试

测试一下用 Java 处理文件。


package name;

import df.util.Util;
import df.util.type.RandomUtil;
import df.util.type.StringUtil;
import df.util.type.SysLog;

import java.io.*;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

/**
 * Generates random two-character "names" from a folder of text files.
 *
 * <p>{@link #run()} works relative to the parent of the current working
 * directory. It reads every {@code *.txt} file in {@code res/}, picks two random
 * characters that pass {@link #isLegal(char)} and are not listed in
 * {@code ignore_chars.txt}, and appends the pair (plus a little surrounding
 * context) to {@code names_N.txt}. Pairs that keep coming back are additionally
 * appended to {@code repeat_names.txt}. The scan repeats until the thread is
 * interrupted.
 *
 * Created by andrew on 2015/7/12.
 */
public class NameApp implements Runnable {
    private static final String TAG = Util.toTAG(NameApp.class);

    /** Charset assumed when a file carries no byte-order mark. */
    private static final String DEFAULT_CHARSET = "gbk";
    /** Longest byte-order mark that {@link #encodeType(InputStream)} looks for. */
    private static final int BOM_PROBE_LENGTH = 3;
    /** A names file is considered full once it reaches this size (4 MiB). */
    private static final long MAX_NAME_FILE_BYTES = 1024L * 1024 * 4;
    /** Highest index tried for {@code names_N.txt}. */
    private static final int MAX_NAME_FILE_INDEX = 99999;
    /** Number of characters taken on each side of a picked character as context. */
    private static final int CONTEXT_GAP = 3;
    /** A name is reported as a repeat once its repeat counter exceeds this value. */
    private static final int REPEAT_REPORT_THRESHOLD = 5;
    /** Upper bound on random picks per file, so unusable text cannot hang the loop. */
    private static final int MAX_PICK_ATTEMPTS = 100000;
    /** Pause between scans when a pass over {@code res/} produced nothing. */
    private static final long IDLE_SLEEP_MILLIS = 1000L;

    /**
     * Characters rejected by {@link #isLegal(char)} in addition to ASCII letters
     * and digits: space, ideographic full stop, comma, double quote, curly double
     * quotes, ideographic comma, question mark, full-width question mark and
     * ideographic space (code point 12288). Written with escapes so the list does
     * not depend on the encoding the source file is compiled with.
     *
     * NOTE(review): the original tested the ASCII question mark twice; the second
     * test is presumed to have been the full-width question mark (U+FF1F) that was
     * lost in transcoding - confirm against the author's copy.
     */
    private static final String EXCLUDED_PUNCTUATION =
            " \u3002,\"\u201C\u201D\u3001?\uFF1F\u3000";

    /** Accepts regular files whose name ends with {@code .txt} (case-insensitive). */
    private static final FileFilter TXT_FILES = new FileFilter() {
        @Override
        public boolean accept(File pathname) {
            return pathname.isFile()
                    && pathname.getName().toLowerCase().endsWith(".txt");
        }
    };

    /** Repeat counter per generated name; cleared whenever a repeat is reported. */
    Map<String, Integer> repeatNames = new HashMap<String, Integer>();
    /** Characters that must never be picked; loaded from {@code ignore_chars.txt}. */
    String               ignoreChars = "";

    /**
     * Scans {@code res/} over and over, generating one name per text file and
     * pass, until the running thread is interrupted. Returns immediately when
     * the working directory has no usable parent directory.
     */
    @Override
    public void run() {
        File workingDir = new File("").getAbsoluteFile();
        // getParentFile() yields null for a root directory instead of throwing.
        File parent = workingDir.getParentFile();
        if (null == parent || !parent.isDirectory()) {
            SysLog.v(TAG, " no usable parent directory for ", workingDir.getAbsolutePath());
            return;
        }

        // File(parent, child) inserts the platform separator; a literal "\\" only works on Windows.
        File res = new File(parent, "res");
        if (!res.isDirectory()) {
            res.mkdirs();
        }
        File repeatFile = new File(parent, "repeat_names.txt");
        loadIgnoreChars(new File(parent, "ignore_chars.txt"));

        int saveFileIndex = 0;
        while (!Thread.currentThread().isInterrupted()) {
            // listFiles returns null on an I/O error or when res/ disappears.
            File[] files = res.listFiles(TXT_FILES);
            boolean producedAny = false;
            if (null != files) {
                for (File f : files) {
                    saveFileIndex = nextSaveFileIndex(parent, saveFileIndex);
                    try {
                        producedAny |= generateName(f, nameFile(parent, saveFileIndex), repeatFile);
                    } catch (Exception e) {
                        // Boundary catch: one unreadable file must not stop the whole scan.
                        SysLog.v(TAG, ", e=", e.getMessage());
                        e.printStackTrace();
                    }
                }
            }
            if (!producedAny) {
                // Nothing to do right now; wait instead of spinning at full speed.
                idle();
            }
        }
    }


    public static void main(String[] args) {
        new Thread(new NameApp()).start();
    }

    /**
     * Detects the charset of a stream from its byte-order mark and leaves the
     * stream positioned on the first byte after the mark.
     *
     * <p>When the stream supports mark/reset, a stream without a BOM is rewound
     * to where it started and a stream with a BOM is advanced past exactly the
     * BOM bytes (2 for UTF-16, 3 for UTF-8). A stream without mark/reset support
     * cannot be rewound, so the probed bytes (up to three) stay consumed.
     *
     * @param reader stream positioned at the start of the content
     * @return {@code "UTF-16LE"}, {@code "UTF-16BE"}, {@code "UTF-8"}, or
     *         {@code "gbk"} when no BOM is present; the empty string when reading
     *         the stream failed before a charset could be determined
     */
    public static String encodeType(InputStream reader) {
        String type = "";
        try {
            boolean rewindable = reader.markSupported();
            if (rewindable) {
                // The read limit must cover every byte read before reset().
                reader.mark(BOM_PROBE_LENGTH);
            }

            // A single read() may return fewer bytes than requested, so loop.
            byte[] head = new byte[BOM_PROBE_LENGTH];
            int filled = 0;
            while (filled < head.length) {
                int n = reader.read(head, filled, head.length - filled);
                if (n < 0) {
                    break;
                }
                filled += n;
            }

            int bomLength = 0;
            String detected = DEFAULT_CHARSET;
            if (filled >= 2 && head[0] == (byte) 0xFF && head[1] == (byte) 0xFE) {
                detected = "UTF-16LE";
                bomLength = 2;
            } else if (filled >= 2 && head[0] == (byte) 0xFE && head[1] == (byte) 0xFF) {
                detected = "UTF-16BE";
                bomLength = 2;
            } else if (filled >= 3 && head[0] == (byte) 0xEF && head[1] == (byte) 0xBB
                    && head[2] == (byte) 0xBF) {
                detected = "UTF-8";
                bomLength = 3;
            }
            type = detected;

            if (rewindable) {
                // Rewind, then step over the BOM only. Keeping a third probed byte
                // consumed would shift every following UTF-16 code unit by one byte.
                reader.reset();
                for (int i = 0; i < bomLength; i++) {
                    reader.read();
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return type;
    }

    /**
     * Tells whether a character may be used in a generated name.
     *
     * @param c character to test
     * @return {@code false} for ASCII letters, ASCII digits and the punctuation
     *         and space characters listed in {@code EXCLUDED_PUNCTUATION};
     *         {@code true} for everything else
     */
    public static boolean isLegal(char c) {
        boolean asciiAlphanumeric = (c >= 'a' && c <= 'z')
                || (c >= 'A' && c <= 'Z')
                || (c >= '0' && c <= '9');
        return !asciiAlphanumeric && EXCLUDED_PUNCTUATION.indexOf(c) < 0;
    }

    /**
     * Creates {@code ignore_chars.txt} when missing and appends its content to
     * {@link #ignoreChars}; a trailing space is always added to the result.
     */
    private void loadIgnoreChars(File ignoreFile) {
        try {
            if (!ignoreFile.isFile()) {
                ignoreFile.createNewFile();
            }
            ignoreChars = (ignoreChars + readText(ignoreFile)).trim() + ' ';
            SysLog.v(TAG, " ignoreChars=", ignoreChars);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Picks one name from {@code source} and appends it to {@code saveNameFile};
     * a name whose repeat counter passes the threshold also goes to
     * {@code repeatFile}.
     *
     * @return {@code true} when a name was written, {@code false} when the file
     *         was skipped because it holds no usable pair of characters
     */
    private boolean generateName(File source, File saveNameFile, File repeatFile) throws IOException {
        String txt = readText(source).trim();
        if (txt.length() < 2) {
            SysLog.v(TAG, " skipped, fewer than two characters: ", source.getName());
            return false;
        }
        int[] picked = pickTwoIndexes(txt);
        if (null == picked) {
            SysLog.v(TAG, " skipped, no usable character pair: ", source.getName());
            return false;
        }

        String son = "" + txt.charAt(picked[0]) + txt.charAt(picked[1]);
        String nameContext = son + "  " + contextAround(txt, picked[0])
                + "  " + contextAround(txt, picked[1]);

        // First sighting stores 0; every later sighting stores previous + 1.
        Integer seen = repeatNames.get(son);
        int repeatCount = (null == seen) ? 0 : seen + 1;
        repeatNames.put(son, repeatCount);

        // getBytes() keeps the platform charset so appended lines match existing output files.
        appendTo(saveNameFile, nameContext + "  " + source.getName() + "\r\n");

        if (repeatCount > REPEAT_REPORT_THRESHOLD) {
            String repeatText = nameContext + "  " + repeatCount + "    " + source.getName() + "\r\n";
            appendTo(repeatFile, repeatText);
            repeatNames.clear();
            SysLog.v(TAG, repeatText, " , ", source.getName());
        }
        return true;
    }

    /**
     * Draws random index pairs until both characters are usable.
     *
     * <p>Every attempt starts from scratch; carrying a partial result over to the
     * next attempt could build a three-character string that is never recorded.
     *
     * @return the two indexes, or {@code null} when no usable pair turned up
     *         within {@code MAX_PICK_ATTEMPTS} draws
     */
    private int[] pickTwoIndexes(String txt) {
        // Same upper bound as before; whether it is inclusive is up to RandomUtil.
        int upper = txt.length() - 2;
        for (int attempt = 0; attempt < MAX_PICK_ATTEMPTS; attempt++) {
            int first = RandomUtil.toRandomInt(0, upper);
            int second = RandomUtil.toRandomInt(0, upper);
            char c1 = txt.charAt(first);
            char c2 = txt.charAt(second);
            if (ignoreChars.indexOf(c1) >= 0 || ignoreChars.indexOf(c2) >= 0) {
                continue;
            }
            if (!isLegal(c1) || !isLegal(c2)) {
                continue;
            }
            // trim() drops line breaks and other control characters that isLegal lets through.
            if (("" + c1 + c2).trim().length() == 2) {
                return new int[] {first, second};
            }
        }
        return null;
    }

    /** Returns up to {@code CONTEXT_GAP} characters on each side of {@code index}, without CR, LF or spaces. */
    private static String contextAround(String txt, int index) {
        int front = Math.max(index - CONTEXT_GAP, 0);
        int behind = Math.min(index + CONTEXT_GAP, txt.length());
        return txt.substring(front, behind)
                .replace("\r", "")
                .replace("\n", "")
                .replace(" ", "");
    }

    /** Returns the first index at or after {@code startIndex} whose names file is missing or not yet full. */
    private static int nextSaveFileIndex(File parent, int startIndex) {
        int index = startIndex;
        while (index < MAX_NAME_FILE_INDEX) {
            File candidate = nameFile(parent, index);
            if (!candidate.isFile() || candidate.length() < MAX_NAME_FILE_BYTES) {
                break;
            }
            index++;
        }
        return index;
    }

    /** Builds the path of {@code names_<index>.txt} inside {@code parent}. */
    private static File nameFile(File parent, int index) {
        return new File(parent, "names_" + index + ".txt");
    }

    /**
     * Reads a whole file as text, using the charset announced by its BOM and
     * falling back to {@code gbk}.
     *
     * <p>All bytes are collected first and decoded once, so only bytes actually
     * read are decoded and a multi-byte character split across two reads stays
     * intact.
     */
    private static String readText(File file) throws IOException {
        InputStream in = new BufferedInputStream(new FileInputStream(file));
        try {
            String charset = encodeType(in);
            if (StringUtil.empty(charset)) {
                charset = DEFAULT_CHARSET;
            }
            ByteArrayOutputStream bytes = new ByteArrayOutputStream();
            byte[] chunk = new byte[8192];
            int n;
            while ((n = in.read(chunk)) != -1) {
                bytes.write(chunk, 0, n);
            }
            return bytes.toString(charset);
        } finally {
            closeQuietly(in);
        }
    }

    /** Appends {@code text} to {@code target}, creating the file when needed. */
    private static void appendTo(File target, String text) throws IOException {
        OutputStream out = new BufferedOutputStream(new FileOutputStream(target, true));
        try {
            out.write(text.getBytes());
            out.flush();
        } finally {
            closeQuietly(out);
        }
    }

    /** Closes a stream, reporting but not propagating a failure; {@code null} is ignored. */
    private static void closeQuietly(Closeable resource) {
        if (null == resource) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Sleeps between idle scans; an interrupt is re-flagged so {@link #run()} can stop. */
    private static void idle() {
        try {
            Thread.sleep(IDLE_SLEEP_MILLIS);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }
}


你可能感兴趣的:(java文件处理测试)