// Test: processing files with Java.
package name;

import df.util.Util;
import df.util.type.RandomUtil;
import df.util.type.StringUtil;
import df.util.type.SysLog;

import java.io.*;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

/**
 * Created by andrew on 2015/7/12.
 *
 * <p>Generates two-character "names" by repeatedly picking two random characters from every
 * {@code .txt} file found in the {@code res} directory that sits next to the working directory.
 * Each generated name, together with a few characters of context around both picks, is appended
 * to {@code names_N.txt}; names that keep re-appearing are appended to {@code repeat_names.txt}.
 * Characters listed in {@code ignore_chars.txt} are never picked.
 */
public class NameApp implements Runnable {
    private static final String TAG = Util.toTAG(NameApp.class);

    /** Charset assumed when no byte-order mark identifies the encoding. */
    private static final String DEFAULT_CHARSET = "gbk";
    /** Number of bytes inspected when probing for a byte-order mark. */
    private static final int BOM_PROBE_LENGTH = 3;
    /** A names file is rotated once it reaches this size. */
    private static final long MAX_SAVE_FILE_BYTES = 1024 * 1024 * 4;
    /** Exclusive upper bound for the numeric suffix of the names files. */
    private static final int MAX_SAVE_FILE_INDEX = 99999;
    /** Number of characters taken on each side of a picked index as context. */
    private static final int CONTEXT_GAP = 3;
    /** Upper bound on random picks per file, so a text without usable characters cannot hang. */
    private static final int MAX_PICK_ATTEMPTS = 100000;

    /** Accepts regular entries whose name ends with ".txt" (case-insensitive). */
    private static final FileFilter TXT_FILTER = new FileFilter() {
        @Override
        public boolean accept(File pathname) {
            return pathname.getName().toLowerCase().endsWith(".txt");
        }
    };

    /** How many times each generated name has re-appeared since the map was last cleared. */
    Map<String, Integer> repeatNames = new HashMap<String, Integer>();
    /** Characters that must never be picked; loaded from ignore_chars.txt. */
    String ignoreChars = "";

    /** Suffix of the names file currently being appended to. */
    private int saveFileIndex = 0;

    /**
     * Loads the ignore list and then keeps generating names from every input file until the
     * thread is interrupted or no input file is available.
     */
    @Override
    public void run() {
        File workingDir = new File("").getAbsoluteFile();
        // getParentFile() is null at a filesystem root; the original built a File from a null
        // path there and failed with a NullPointerException.
        File parent = workingDir.getParentFile();
        if (null == parent || !parent.isDirectory()) {
            return;
        }

        // File(parent, child) uses the platform separator instead of a hard-coded "\\".
        File res = new File(parent, "res");
        if (!res.isDirectory()) {
            res.mkdirs();
        }

        ignoreChars = loadIgnoreChars(new File(parent, "ignore_chars.txt"));
        saveFileIndex = 0;

        while (!Thread.currentThread().isInterrupted()) {
            File[] inputs = res.listFiles(TXT_FILTER);
            // listFiles returns null on I/O error; an empty list would make this loop spin
            // forever without ever producing output.
            if (null == inputs || inputs.length == 0) {
                SysLog.v(TAG, " no .txt input found in ", res.getAbsolutePath());
                return;
            }
            for (File input : inputs) {
                processFile(input, parent);
            }
        }
    }

    public static void main(String[] args) {
        new Thread(new NameApp()).start();
    }

    /**
     * Detects the text encoding of a stream from its byte-order mark and leaves the stream
     * positioned on the first byte after the mark (or at the start when there is none).
     *
     * @param reader stream to probe; it has to support mark/reset for the rewind to work
     * @return "UTF-16LE", "UTF-16BE" or "UTF-8" when the matching byte-order mark is present,
     *         "gbk" otherwise, or an empty string when reading fails before a decision is made
     */
    public static String encodeType(InputStream reader) {
        String type = "";
        try {
            // The read limit must cover every byte read before reset(); the original passed 0.
            reader.mark(BOM_PROBE_LENGTH);
            byte[] head = new byte[BOM_PROBE_LENGTH];
            int read = 0;
            while (read < head.length) {
                int n = reader.read(head, read, head.length - read);
                if (n < 0) {
                    break;
                }
                read += n;
            }

            if (read >= 2 && head[0] == (byte) 0xFF && head[1] == (byte) 0xFE) {
                type = "UTF-16LE";
                skipBom(reader, 2);
            } else if (read >= 2 && head[0] == (byte) 0xFE && head[1] == (byte) 0xFF) {
                type = "UTF-16BE";
                skipBom(reader, 2);
            } else if (read == 3 && head[0] == (byte) 0xEF && head[1] == (byte) 0xBB
                    && head[2] == (byte) 0xBF) {
                // All three probed bytes belong to the UTF-8 mark, so nothing to rewind.
                type = "UTF-8";
            } else {
                type = DEFAULT_CHARSET;
                reader.reset();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return type;
    }

    /**
     * Tells whether a character may be used in a generated name.
     *
     * @param c character to test
     * @return false for ASCII letters and digits, the space, the listed punctuation marks and
     *         the ideographic space (U+3000, code 12288); true for every other character
     */
    public static boolean isLegal(char c) {
        boolean asciiLetterOrDigit = (c >= 'a' && c <= 'z')
                || (c >= 'A' && c <= 'Z')
                || (c >= '0' && c <= '9');
        if (asciiLetterOrDigit) {
            return false;
        }
        switch (c) {
            case ' ':
            case '。':
            case ',':
            case '"':
            case '“':
            case '”':
            case '、':
            // NOTE(review): the original tested '?' twice; one of them was presumably a
            // full-width mark lost to an encoding conversion - confirm against the author's copy.
            case '?':
            case 12288:
                return false;
            default:
                return true;
        }
    }

    /**
     * Rewinds to the mark and consumes only the byte-order mark, so that the payload that
     * follows stays aligned. The original left the stream one byte into the UTF-16 payload.
     */
    private static void skipBom(InputStream reader, int bomLength) throws IOException {
        reader.reset();
        for (int i = 0; i < bomLength; i++) {
            reader.read();
        }
    }

    /**
     * Reads the ignore list, creating an empty file when it does not exist yet.
     *
     * @return the trimmed file content followed by one space, or an empty string on failure
     */
    private String loadIgnoreChars(File ignoreFile) {
        if (!ignoreFile.isFile()) {
            try {
                ignoreFile.createNewFile();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        InputStream in = null;
        try {
            in = new BufferedInputStream(new FileInputStream(ignoreFile));
            String charset = encodeType(in);
            if (StringUtil.empty(charset)) {
                charset = DEFAULT_CHARSET;
            }
            // Decode the exact bytes read, once, so that stale buffer content and characters
            // split across chunk boundaries cannot corrupt the list.
            String loaded = new String(readAll(in), charset).trim() + ' ';
            SysLog.v(TAG, " ignoreChars=", loaded);
            return loaded;
        } catch (IOException e) {
            e.printStackTrace();
            return "";
        } finally {
            closeQuietly(in);
        }
    }

    /** Generates one name from the given input file and appends it to the output files. */
    private void processFile(File source, File outputDir) {
        InputStream reader = null;
        OutputStream out = null;
        OutputStream outRepeatName = null;
        try {
            out = new BufferedOutputStream(new FileOutputStream(nextSaveFile(outputDir), true));
            outRepeatName = new BufferedOutputStream(
                    new FileOutputStream(new File(outputDir, "repeat_names.txt"), true));
            reader = new BufferedInputStream(new FileInputStream(source));

            String charset = encodeType(reader);
            if (StringUtil.empty(charset)) {
                charset = DEFAULT_CHARSET;
            }
            String txt = new String(readAll(reader), charset).trim();
            if (txt.length() < 2) {
                SysLog.v(TAG, " skipped, text too short: ", source.getName());
                return;
            }

            int[] picked = pickIndices(txt);
            if (null == picked) {
                SysLog.v(TAG, " skipped, no usable characters: ", source.getName());
                return;
            }

            String son = "" + txt.charAt(picked[0]) + txt.charAt(picked[1]);
            String nameContext = son + " " + contextAround(txt, picked[0])
                    + " " + contextAround(txt, picked[1]);
            String allNames = nameContext + " " + source.getName() + "\r\n";

            int repeatCount = 0;
            String repeatText = "";
            if (repeatNames.containsKey(son)) {
                repeatCount = repeatNames.get(son) + 1;
                repeatText = nameContext + " " + repeatCount + " " + source.getName() + "\r\n";
            }
            repeatNames.put(son, repeatCount);

            // getBytes() keeps the platform default charset, as the original output did.
            out.write(allNames.getBytes());
            out.flush();

            // A count above zero implies the name was already known, so this single test is
            // equivalent to the original's nested checks.
            if (repeatCount > 5) {
                outRepeatName.write(repeatText.getBytes());
                outRepeatName.flush();
                repeatNames.clear();
                SysLog.v(TAG, repeatText, " , ", source.getName());
            }
        } catch (Exception e) {
            SysLog.v(TAG, ", e=", e.getMessage());
            e.printStackTrace();
        } finally {
            closeQuietly(reader);
            closeQuietly(out);
            closeQuietly(outRepeatName);
        }
    }

    /**
     * Returns the names file to append to, moving on to the next suffix while the current file
     * has reached the size limit. Never returns null, even when the suffix cap is reached.
     */
    private File nextSaveFile(File outputDir) {
        File candidate = new File(outputDir, "names_" + saveFileIndex + ".txt");
        while (candidate.isFile()
                && candidate.length() >= MAX_SAVE_FILE_BYTES
                && saveFileIndex < MAX_SAVE_FILE_INDEX - 1) {
            saveFileIndex++;
            candidate = new File(outputDir, "names_" + saveFileIndex + ".txt");
        }
        return candidate;
    }

    /**
     * Picks two random indices whose characters are both usable in a name.
     *
     * @return the two indices, or null when no usable pair was found within the attempt limit
     */
    private int[] pickIndices(String txt) {
        int upper = txt.length() - 2;
        for (int attempt = 0; attempt < MAX_PICK_ATTEMPTS; attempt++) {
            int first = RandomUtil.toRandomInt(0, upper);
            int second = RandomUtil.toRandomInt(0, upper);
            if (isUsable(txt.charAt(first)) && isUsable(txt.charAt(second))) {
                return new int[] {first, second};
            }
        }
        return null;
    }

    /**
     * A character is usable when it is not whitespace/control (anything trim() would strip),
     * is not on the ignore list and passes {@link #isLegal(char)}. Rejecting whitespace here
     * replaces the original's trim-and-accumulate logic, which could end up with a
     * three-character candidate and silently write nothing.
     */
    private boolean isUsable(char c) {
        return c > ' ' && ignoreChars.indexOf(c) < 0 && isLegal(c);
    }

    /** Returns the text around the index with carriage returns, line feeds and spaces removed. */
    private static String contextAround(String txt, int index) {
        int front = index > CONTEXT_GAP ? index - CONTEXT_GAP : 0;
        int behind = index + CONTEXT_GAP < txt.length() ? index + CONTEXT_GAP : txt.length();
        return txt.substring(front, behind)
                .replace("\r", "")
                .replace("\n", "")
                .replace(" ", "");
    }

    /** Reads the remaining bytes of the stream. */
    private static byte[] readAll(InputStream in) throws IOException {
        ByteArrayOutputStream sink = new ByteArrayOutputStream();
        byte[] chunk = new byte[8192];
        int n;
        while ((n = in.read(chunk)) != -1) {
            sink.write(chunk, 0, n);
        }
        return sink.toByteArray();
    }

    /** Closes the resource if it was opened; a failure to close is reported but not rethrown. */
    private static void closeQuietly(Closeable resource) {
        if (null == resource) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}