简述:
使用多线程在大文件中查找出现次数最多的字母
文件格式如,
Z
Q
S
D
N
O
E
U
...
所有的类,包图
0. Constants.java
常量类
package com.anialy.test.io;
/**
 * Shared constants for the letter-frequency demo.
 *
 * <p>All fields are {@code final}: they are read from several threads and are
 * never meant to be reassigned at runtime.
 */
public class Constants {
    /** The alphabet the data generator draws from: the upper-case letters "A" to "Z". */
    public static final String[] words = new String[]{
        "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L",
        "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"};

    /** Name of the randomly generated data file. */
    public static final String OUTPUT_FILE_NAME = "output.txt";

    /** Name of the file that receives the final statistics. */
    public static final String RESULTS_FILE = "results.txt";
}
1. DataProductor.java
数据生成代码:程序会不断向文件追加数据,不会自行结束;运行一段时间后手动终止,再查看输出的文件
运行main函数即可
package com.anialy.test.io;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.Random;
/**
 * Generates the random input data for the letter-frequency demo.
 *
 * <p>A background thread keeps appending batches of random letters (one letter
 * per line) to {@link Constants#OUTPUT_FILE_NAME}. It has no stop condition
 * other than an I/O failure, so the process has to be terminated manually once
 * the file is large enough.
 */
public class DataProductor {
    /** Number of letters appended to the file per batch. */
    private static final int BATCH_SIZE = 10000;

    private static final File outputFile = new File(Constants.OUTPUT_FILE_NAME);

    public static void main(String[] args) {
        // generate random data
        new DataProductor().produceData();
    }

    /**
     * Starts the background thread that appends random letters to the data file.
     * Returns immediately; the generator thread keeps running.
     */
    public void produceData() {
        Thread thread = new Thread(new PrepareOutput());
        thread.start();
    }

    /**
     * Worker that appends random letters to the data file, one batch at a time.
     */
    private static class PrepareOutput implements Runnable {
        // one generator for the whole run instead of a new Random per letter
        private final Random random = new Random();

        /** Picks one random entry of {@link Constants#words}. */
        private String genWord() {
            int index = random.nextInt(Constants.words.length);
            return Constants.words[index];
        }

        public void run() {
            while (true) {
                // A fresh buffer per batch: each pass writes only its own
                // BATCH_SIZE letters, and memory use stays constant.
                StringBuilder batch = new StringBuilder(BATCH_SIZE * 2);
                for (int i = 0; i < BATCH_SIZE; i++) {
                    batch.append(genWord()).append("\n");
                }

                FileOutputStream stream = null;
                try {
                    byte[] outputBytes = batch.toString().getBytes("UTF-8");
                    // append at the end of the file, do not overwrite earlier batches
                    stream = new FileOutputStream(outputFile, true);
                    stream.write(outputBytes);
                } catch (IOException e) {
                    // also covers FileNotFoundException and UnsupportedEncodingException;
                    // stop generating instead of spinning on a file that cannot be written
                    e.printStackTrace();
                    return;
                } finally {
                    // the stream is null when opening the file failed
                    if (stream != null) {
                        try {
                            stream.close();
                        } catch (IOException e) {
                            e.printStackTrace();
                        }
                    }
                }
            }
        }
    }
}
生成的数据文件
示例如下:左侧为 Notepad 中的行号,右侧为字符,每行一个字符,以换行符分隔
3. CalcDemo.java
分成多个子线程统计每个英文字符出现的次数
package com.anialy.test.io;
import java.io.File;
public class CalcDemo {
// number of worker threads that each get an equal-sized slice of the file
public static final int CALC_THREADS_NUM = 8;
// the source data file
private static final File file = new File(Constants.OUTPUT_FILE_NAME);
// total size of the source file in bytes
private static final Long totalBytes = file.length();
// bytes per worker thread (integer division; the remainder goes to bytesLeft)
private static final Long bytesPerThread = totalBytes / CALC_THREADS_NUM;
// Trailing bytes not covered by the equal slices:
// totalBytes - CALC_THREADS_NUM * bytesPerThread, i.e. the remainder modulo the
// thread count. Taking it modulo bytesPerThread gives a different value
// (27 bytes / 8 threads: 0 instead of 3) and throws ArithmeticException in the
// static initializer when the file is smaller than CALC_THREADS_NUM bytes.
private static final Long bytesLeft = totalBytes % CALC_THREADS_NUM;
/**
 * Prints the file size, the slice size per thread and the leftover byte
 * count to standard output, one line each.
 */
private static void initInfo() {
    final String sizeLine = String.format("file size: %d bytes\n", totalBytes);
    System.out.print(sizeLine);

    final String sliceLine = String.format("per thread: %d bytes\n", bytesPerThread);
    System.out.print(sliceLine);

    final String leftoverLine = String.format("bytes left: %d bytes\n", bytesLeft);
    System.out.print(leftoverLine);
}
public static void doCalc() {
initInfo();
// calc thread start
for(int threadId=0; threadId
4.CalcThread.java
第 3 节(CalcDemo)中使用的文件统计线程类
package com.anialy.test.io;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
public class CalcThread extends Thread{
// Thread NO.
private int NO = 0;
private RandomAccessFile raf;
private Long start; // access start index;
private Long end; // access end index;
private Map map = new TreeMap();
public CalcThread(File file, Long start, Long end, int NO) {
try {
raf = new RandomAccessFile(file, "rw");
this.start = start;
this.end = end;
this.NO = NO;
} catch (FileNotFoundException e) {
e.printStackTrace();
}
int letterSize = Constants.words.length;
for(int i=0; i iter = map.keySet().iterator();
FileOutputStream outputFileStream = null;
try {
// 将统计的结果导入文件result-{NO}
File outputFile = new File("result-" + NO);
StringBuffer sbf = new StringBuffer();
while(iter.hasNext()){
String key = iter.next();
sbf.append(key + ":" + map.get(key) + "\n");
}
outputFileStream = new FileOutputStream(outputFile,true);
outputFileStream.write(sbf.toString().getBytes("UTF-8"));
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally{
if(outputFileStream != null){
try {
outputFileStream.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
// Manual test entry point: starts a single CalcThread (NO = 1) over the
// byte range from 0 to the length of the data file.
public static void main(String[] args) {
    final File source = new File(Constants.OUTPUT_FILE_NAME);
    final long length = source.length();
    final CalcThread worker = new CalcThread(source, 0L, length, 1);
    worker.start();
}
}
5. Conclusion.java
对各线程分段统计得到的小结果文件进行最后的汇总
package com.anialy.test.io;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
public class Conclusion {
static Map resultMap = new TreeMap();
static File file = new File("results.txt");
/**
* 分别统计每个子结果文件
*/
@SuppressWarnings("resource")
public static void sumUp() {
// include the thread to deal with byte-left
for(int i=0; i iter = resultMap.keySet().iterator();
StringBuffer sbf = new StringBuffer();
while(iter.hasNext()){
String letter = iter.next();
Integer cnt = resultMap.get(letter);
String line = letter + ": " + cnt + "\n";
sbf.append(line);
}
FileOutputStream fos;
try {
fos = new FileOutputStream(file);
fos.write(sbf.toString().getBytes("UTF-8"));
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
/**
 * Entry point for running the summation step on its own.
 * Simply delegates to {@link #sumUp()}.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
sumUp();
}
}
得到,
结果如下
A: 47177
B: 45772
C: 46418
D: 46785
E: 47498
F: 46146
G: 45693
H: 46705
I: 47734
J: 46402
K: 46519
L: 46679
M: 46958
N: 46577
O: 47473
P: 45797
Q: 46858
R: 46850
S: 47055
T: 46028
U: 45530
V: 45026
W: 46445
X: 46960
Y: 46718
Z: 46197
6. MainTest.java
主测试文件(主线程固定等待 20 秒;如果 output.txt 数据文件更大,则需要更长的等待时间)
package com.anialy.test.io;
import java.io.File;
/**
 * Package: com.anialy.test.io
 *
 * File: MainTest.java
 *
 * Author: anialy Date: 2014-9-3
 *
 * <p>Runs the whole pipeline: removes the output of a previous run, starts the
 * counting threads through {@code CalcDemo.doCalc()}, waits a fixed amount of
 * time and then lets {@code Conclusion.sumUp()} merge the partial results.
 */
public class MainTest {
    /**
     * Fixed time to wait for the counting threads, in milliseconds.
     * A larger data file needs a longer wait.
     */
    private static final long WAIT_MILLIS = 20000L;

    public static void main(String[] args) {
        // Clear the partial results of a previous run. The counting threads
        // append to "result-N", so a stale file would inflate the counts.
        for (int i = 0; i <= CalcDemo.CALC_THREADS_NUM; i++) {
            deleteIfPresent(new File("result-" + i));
        }
        deleteIfPresent(new File(Constants.RESULTS_FILE));

        // CALC_THREADS_NUM+1 (one for the bytes left) threads analyze the data
        CalcDemo.doCalc();

        // wait for the counting threads to finish
        try {
            Thread.sleep(WAIT_MILLIS);
        } catch (InterruptedException e) {
            // keep the interrupt visible to whoever runs this thread
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }

        // main thread sums up the partial results
        Conclusion.sumUp();
    }

    /**
     * Deletes {@code file} if it is an existing regular file and reports a
     * failed deletion on standard error instead of ignoring it.
     */
    private static void deleteIfPresent(File file) {
        if (file.isFile() && !file.delete()) {
            System.err.println("could not delete stale file: " + file.getPath());
        }
    }
}