Java 实现大文件统计字母出现个数

简述:

使用多线程统计大文件中各字母的出现次数,并据此找出出现次数最多的字母

文件格式如,

Z
Q
S
D
N
O
E
U
...


所有的类,包图

Java 实现大文件统计字母出现个数_第1张图片


0. Constants.java

常量类

package com.anialy.test.io;

/**
 * Shared constants for the letter-counting demo: the alphabet the data
 * generator picks from and the names of the data and result files.
 *
 * All fields are {@code final} so that no class can accidentally re-point
 * them at runtime.
 */
public class Constants {
	/** The 26 upper-case letters, "A" through "Z", in alphabetical order. */
	public static final String[] words = new String[]{
		"A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K","L",
		"M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z"};
	
	/** Name of the randomly generated data file. */
	public static final String OUTPUT_FILE_NAME = "output.txt";
	
	/** Name of the file that receives the final statistics. */
	public static final String RESULTS_FILE = "results.txt";
}



1. DataProductor.java

数据生成代码,可以跑一段时间然后看输出的文件

运行main函数即可

package com.anialy.test.io;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.Random;

/**
 * Generates test data: a background thread keeps appending batches of random
 * upper-case letters, one letter per line, to the data file named by
 * {@code Constants.OUTPUT_FILE_NAME}. The thread never stops on its own; let
 * it run for a while and then terminate the process.
 */
public class DataProductor {

	/** Number of letters appended to the file per batch. */
	private static final int BATCH_SIZE = 10000;

	private static final File outputFile = new File(Constants.OUTPUT_FILE_NAME);

	public static void main(String[] args) {
		// Start generating random data.
		new DataProductor().produceData();
	}

	/**
	 * Starts the background thread that appends random letters to the data file.
	 * Returns immediately; the generator thread keeps running.
	 */
	public void produceData(){
		Thread thread = new Thread(new PrepareOutput());
		thread.start();
	}


	/**
	 * Worker that endlessly appends batches of random letters to the data file.
	 */
	private static class PrepareOutput implements Runnable{
		// One generator for the whole worker instead of a new one per letter.
		private final Random random = new Random();

		/** Returns one randomly chosen entry of {@code Constants.words}. */
		private String genWord(){
			int index = random.nextInt(Constants.words.length);
			return Constants.words[index];
		}

		public void run(){
			while(true){
				// Build a fresh batch every round. Reusing one ever-growing buffer
				// would re-write all earlier batches and exhaust memory.
				StringBuilder batch = new StringBuilder(BATCH_SIZE * 2);
				for(int i=0; i<BATCH_SIZE; i++){
					batch.append(genWord()).append('\n');
				}

				FileOutputStream outputFileStream = null;
				try{
					byte[] outputBytes = batch.toString().getBytes("UTF-8");
					// Append at the end of the file instead of overwriting it.
					outputFileStream = new FileOutputStream(outputFile,true);
					outputFileStream.write(outputBytes);
				} catch (IOException e) {
					// Also covers FileNotFoundException and UnsupportedEncodingException.
					e.printStackTrace();
				} finally{
					// The stream is null when opening the file failed.
					if(outputFileStream != null){
						try {
							outputFileStream.close();
						} catch (IOException e) {
							e.printStackTrace();
						}
					}
				}
			}
		}
	}
}


生成的数据文件



示例如下:左侧为 Notepad 中的行号,右侧为字母,每行一个,行与行之间以换行符分隔

Java 实现大文件统计字母出现个数_第2张图片



3. CalcDemo

分成多个子线程统计每个英文字符出现的次数

package com.anialy.test.io;

import java.io.File;

public class CalcDemo {
	// Number of calc threads.
	public static final int CALC_THREADS_NUM = 8;
	// The source data file.
	private static final File file = new File(Constants.OUTPUT_FILE_NAME);
	// Total length of the source file in bytes (File.length() yields 0 when the file is missing).
	private static final Long totalBytes = file.length();
	// Bytes handled by each calc thread.
	private static final Long bytesPerThread = totalBytes / CALC_THREADS_NUM;
	// Bytes left over after CALC_THREADS_NUM equal shares. Taking the remainder
	// by the thread count (not by bytesPerThread) avoids a division by zero for
	// files smaller than CALC_THREADS_NUM bytes and gives the true leftover when
	// the remainder is not smaller than one share.
	private static final Long bytesLeft = totalBytes % CALC_THREADS_NUM;
	
	/**
	 * Prints the total file size, the per-thread share and the leftover byte
	 * count to standard output, one line each.
	 */
	private static void initInfo(){
		System.out.format("file size: %d bytes\n", totalBytes);
		System.out.format("per thread: %d bytes\n", bytesPerThread);
		System.out.format("bytes left: %d bytes\n", bytesLeft);
	}
	
	public static void doCalc() {
		initInfo();
		// calc thread start
		for(int threadId=0; threadId


生成结果文件,

Java 实现大文件统计字母出现个数_第3张图片



4.CalcThread.java

3中使用的统计文件的线程类

package com.anialy.test.io;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;

public class CalcThread extends Thread{
	// Thread NO.
	private int NO = 0; 
	
	private RandomAccessFile raf;
	
	private Long start; // access start index;
	
	private Long end; // access end index;
	
	private Map map = new TreeMap();
	
	public CalcThread(File file, Long start, Long end, int NO) {
		try {
			raf = new RandomAccessFile(file, "rw");
			this.start = start;
			this.end = end;
			this.NO = NO;
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		}
		
		int letterSize = Constants.words.length;
		for(int i=0; i iter = map.keySet().iterator();
		FileOutputStream outputFileStream = null;
		try {
			// 将统计的结果导入文件result-{NO}
			File outputFile = new File("result-" + NO);
			StringBuffer sbf = new StringBuffer();
			while(iter.hasNext()){
				String key = iter.next();
				sbf.append(key + ":" + map.get(key) + "\n");
			}
			outputFileStream = new FileOutputStream(outputFile,true);
			outputFileStream.write(sbf.toString().getBytes("UTF-8"));
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (UnsupportedEncodingException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally{
			if(outputFileStream != null){
				try {
					outputFileStream.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}

	
	/**
	 * Manual test entry point: runs a single CalcThread, numbered 1, over the
	 * whole data file.
	 */
	public static void main(String[] args) {
		File source = new File(Constants.OUTPUT_FILE_NAME);
		long length = source.length();
		CalcThread worker = new CalcThread(source, 0L, length, 1);
		worker.start();
	}
}

5. Conclusion.java

对各线程分段统计后生成的结果小文件进行最终汇总

package com.anialy.test.io;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;

public class Conclusion {
	
	static Map resultMap = new TreeMap();
	static File file = new File("results.txt");
	
	/**
	 * 分别统计每个子结果文件
	 */
	@SuppressWarnings("resource")
	public static void sumUp() {
		// include the thread to deal with byte-left
		for(int i=0; i iter = resultMap.keySet().iterator();
		StringBuffer sbf = new StringBuffer();
		while(iter.hasNext()){
			String letter = iter.next();
			Integer cnt = resultMap.get(letter);
			String line = letter + ": " + cnt + "\n";
			sbf.append(line);
		}
		
		FileOutputStream fos;
		try {
			fos = new FileOutputStream(file);
			fos.write(sbf.toString().getBytes("UTF-8"));
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (UnsupportedEncodingException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
	
	/** Stand-alone entry point: runs the final summing-up step. */
	public static void main(String[] args) {
		Conclusion.sumUp();
	}
}


得到,



结果如下

A: 47177
B: 45772
C: 46418
D: 46785
E: 47498
F: 46146
G: 45693
H: 46705
I: 47734
J: 46402
K: 46519
L: 46679
M: 46958
N: 46577
O: 47473
P: 45797
Q: 46858
R: 46850
S: 47055
T: 46028
U: 45530
V: 45026
W: 46445
X: 46960
Y: 46718
Z: 46197

6. MainTest.java

主测试文件(主线程固定 sleep 20 秒以等待各统计线程结束;如果 output.txt 数据文件更大,则需要更长的等待时间)

package com.anialy.test.io;

import java.io.File;

/**
 * End-to-end driver: removes the output files of a previous run, starts the
 * calc threads, waits a fixed time and then sums up the partial results.
 *
 * Package: com.anialy.test.io
 *
 * File: MainTest.java 
 *
 * Author: anialy   Date: 2014-9-3
 * 
 */
public class MainTest {
	/** Fixed time to wait for the calc threads, in milliseconds. */
	private static final long WAIT_MILLIS = 20000;

	public static void main(String[] args) {
		// Clear old per-thread result files: result-0 .. result-CALC_THREADS_NUM
		// (the extra index belongs to the thread for the leftover bytes).
		for(int i=0; i<=CalcDemo.CALC_THREADS_NUM; i++){
			deleteIfPresent(new File("result-" + i));
		}

		// Clear the old summary file.
		deleteIfPresent(new File(Constants.RESULTS_FILE));


		// CALC_THREADS_NUM+1 (for the bytes-left) threads to analyze data
		CalcDemo.doCalc();

		// doCalc() returns nothing to wait on, so wait a fixed time for all threads.
		try {
			Thread.sleep(WAIT_MILLIS);
		} catch (InterruptedException e) {
			// Restore the interrupt status so it is not silently lost.
			Thread.currentThread().interrupt();
			e.printStackTrace();
		}

		// main thread to sum up data
		Conclusion.sumUp();
	}

	/**
	 * Deletes the given file if it exists as a regular file and reports a
	 * failed deletion on standard error. A leftover result file matters because
	 * the calc threads append to their result files rather than overwrite them.
	 *
	 * @param file the file to remove
	 */
	private static void deleteIfPresent(File file) {
		if(file.isFile() && !file.delete()){
			System.err.println("could not delete old file: " + file.getPath());
		}
	}
}














你可能感兴趣的:(Java)