统计一篇超过10G的文章中每个单词出现的次数,基本代码思路如下:(代码具体细节思路见以下代码的后续分析)
(实现过程中存在一定问题,后续完善)
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
 * Counts how often each word occurs in a (potentially very large) text file.
 *
 * <p>The file is read chunk by chunk through {@link MappedBiggerFileReader}; the counts of all
 * chunks are accumulated in a single map and printed once, most frequent word first.
 */
public class CountWordsOfArticle {

    /** Word separators: the punctuation of the original pattern plus any whitespace. */
    private static final Pattern WORD_SEPARATORS = Pattern.compile("[,.;!?\\s]+");

    /**
     * Charset used to decode the file. Fully qualified so that the import list of this snippet
     * does not have to change.
     */
    private static final java.nio.charset.Charset TEXT_CHARSET =
            java.nio.charset.StandardCharsets.UTF_8;

    /**
     * Counts the words of {@code fileName} (case-insensitively) and prints one
     * {@code "word count"} line per distinct word to standard output, ordered by descending
     * count and then alphabetically.
     *
     * @param fileName  path of the text file to analyse
     * @param arraySize number of bytes to read per chunk; must be positive
     * @throws IOException              if the file cannot be opened, mapped or read
     * @throws IllegalArgumentException if {@code arraySize} is not positive
     */
    public void countWordsOfArticle(String fileName, int arraySize) throws IOException {
        File file = new File(fileName);
        if (!file.exists()) {
            System.out.println("该文件不存在");
            return;
        }
        Map<String, Integer> counts = new ConcurrentHashMap<>();
        // Bytes of a word that was cut off at the end of the previous chunk.
        byte[] pending = new byte[0];
        try (MappedBiggerFileReader reader = new MappedBiggerFileReader(fileName, arraySize)) {
            while (reader.read() != -1) {
                pending = wordCount(join(pending, reader.getArray()), counts);
            }
        }
        // Whatever is left after the last chunk is the final word of the file.
        tally(pending, pending.length, counts);
        printByFrequency(counts);
    }

    /**
     * Counts all complete words in {@code data} and returns the trailing bytes that follow the
     * last separator, because they may be the beginning of a word continued in the next chunk.
     * All separators are single ASCII bytes, so cutting there never splits a UTF-8 character.
     */
    private static byte[] wordCount(byte[] data, Map<String, Integer> counts) {
        int lastSeparator = data.length - 1;
        while (lastSeparator >= 0 && !isSeparator(data[lastSeparator])) {
            lastSeparator--;
        }
        if (lastSeparator < 0) {
            return data; // no separator at all: the whole chunk is still an unfinished word
        }
        tally(data, lastSeparator, counts);
        int tailLength = data.length - lastSeparator - 1;
        byte[] tail = new byte[tailLength];
        System.arraycopy(data, lastSeparator + 1, tail, 0, tailLength);
        return tail;
    }

    /** Decodes the first {@code length} bytes of {@code data} and adds their words to {@code counts}. */
    private static void tally(byte[] data, int length, Map<String, Integer> counts) {
        if (length == 0) {
            return;
        }
        String text = new String(data, 0, length, TEXT_CHARSET);
        for (String token : WORD_SEPARATORS.split(text)) {
            if (!token.isEmpty()) { // split() yields an empty token when the text starts with a separator
                counts.merge(token.toLowerCase(), 1, Integer::sum);
            }
        }
    }

    /** Returns whether {@code b} is one of the bytes matched by {@link #WORD_SEPARATORS}. */
    private static boolean isSeparator(byte b) {
        switch (b) {
            case ',':
            case '.':
            case ';':
            case '!':
            case '?':
            case ' ':
            case '\t':
            case '\n':
            case 0x0B:
            case '\f':
            case '\r':
                return true;
            default:
                return false;
        }
    }

    /** Returns {@code head} followed by {@code tail}; avoids copying when {@code head} is empty. */
    private static byte[] join(byte[] head, byte[] tail) {
        if (head.length == 0) {
            return tail;
        }
        byte[] joined = new byte[head.length + tail.length];
        System.arraycopy(head, 0, joined, 0, head.length);
        System.arraycopy(tail, 0, joined, head.length, tail.length);
        return joined;
    }

    /** Prints the counts, highest count first; equal counts are ordered by word. */
    private static void printByFrequency(Map<String, Integer> counts) {
        Comparator<Map.Entry<String, Integer>> byCountThenWord = (left, right) -> {
            int byCount = right.getValue().compareTo(left.getValue());
            return byCount != 0 ? byCount : left.getKey().compareTo(right.getKey());
        };
        List<Map.Entry<String, Integer>> result = counts.entrySet().stream()
                .sorted(byCountThenWord)
                .collect(Collectors.toList());
        result.forEach(item -> System.out.println(item.getKey() + " " + item.getValue()));
    }

    /**
     * Reads a file of arbitrary size in chunks of at most {@code arraySize} bytes through a
     * series of memory-mapped regions (one region per {@code Integer.MAX_VALUE} bytes).
     *
     * <p>NOTE: the class extends {@link InputStream} only for backward compatibility.
     * {@link #read()} returns the size of the chunk that was just loaded, NOT a single byte, so
     * an instance must not be wrapped in a {@code Reader} or another stream; fetch the data with
     * {@link #getArray()} instead.
     */
    public class MappedBiggerFileReader extends InputStream {
        private MappedByteBuffer[] mappedBufArray;
        private int count = 0; // index of the mapped region currently being read
        private int number;    // total number of mapped regions
        private FileInputStream fileIn;
        private long fileLength;
        private int arraySize;
        private byte[] array;

        /**
         * Opens {@code fileName} and maps it into memory region by region.
         *
         * @param fileName  path of the file to read
         * @param arraySize maximum number of bytes returned per {@link #read()} call
         * @throws IOException              if the file cannot be opened or mapped
         * @throws IllegalArgumentException if {@code arraySize} is not positive
         */
        public MappedBiggerFileReader(String fileName, int arraySize) throws IOException {
            if (arraySize <= 0) {
                // A zero-sized chunk would make read() return 0 forever.
                throw new IllegalArgumentException("arraySize must be positive: " + arraySize);
            }
            this.arraySize = arraySize;
            this.fileIn = new FileInputStream(fileName);
            try {
                FileChannel fileChannel = fileIn.getChannel();
                this.fileLength = fileChannel.size();
                this.number = (int) Math.ceil((double) fileLength / (double) Integer.MAX_VALUE);
                this.mappedBufArray = new MappedByteBuffer[number]; // one entry per mapped region
                long preLength = 0;
                long regionSize = (long) Integer.MAX_VALUE; // size of the region to map
                for (int i = 0; i < number; i++) { // map consecutive regions of the file
                    if (fileLength - preLength < (long) Integer.MAX_VALUE) {
                        regionSize = fileLength - preLength; // size of the last region
                    }
                    mappedBufArray[i] =
                            fileChannel.map(FileChannel.MapMode.READ_ONLY, preLength, regionSize);
                    preLength += regionSize; // start of the next region
                }
            } catch (IOException | RuntimeException e) {
                fileIn.close(); // do not leak the descriptor when mapping fails
                throw e;
            }
        }

        /**
         * Loads the next chunk into the array returned by {@link #getArray()}.
         *
         * @return the number of bytes in the chunk, or {@code -1} when the file is exhausted
         */
        @Override
        public int read() throws IOException {
            if (count >= number) {
                return -1;
            }
            MappedByteBuffer current = mappedBufArray[count];
            int remaining = current.limit() - current.position();
            if (remaining > arraySize) {
                array = new byte[arraySize];
                current.get(array);
                return arraySize;
            }
            // Last read from this region: take what is left and move on to the next region.
            array = new byte[remaining];
            current.get(array);
            count++;
            return remaining;
        }

        /** Closes the underlying file and drops the reference to the last chunk. */
        @Override
        public void close() throws IOException {
            fileIn.close();
            array = null;
        }

        /** Returns the chunk loaded by the most recent {@link #read()} call. */
        public byte[] getArray() {
            return array;
        }

        /** Returns the length of the file in bytes. */
        public long getFileLength() {
            return fileLength;
        }
    }
}
Java 读取文件的一般操作如下,将文件数据全部读取到内存中,然后再对数据进行操作。
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
// Demonstrates the usual "load the whole file into memory" approach, which the
// surrounding article says only works for small files.
public class SmallFileTest {
public static void main(String[] args) throws IOException {
// "file path" is a placeholder for the path of the file to read.
Path path = Paths.get("file path");
// Reads the complete file content into one byte array.
byte[] data = Files.readAllBytes(path);
// The processing of the data is omitted in the article (placeholder line below).
...........(省略)
}
}
这对于小文件是没有问题的,但是对于稍大一些的文件就会抛出异常:
Exception in thread "main" java.lang.OutOfMemoryError: Required array size too large
at java.nio.file.Files.readAllBytes(Files.java:3156)
分析:从错误定位可以看出,Files.readAllBytes 方法最多只能读取 Integer.MAX_VALUE - 8 字节的文件,也就是最大约 2GB 的文件。一旦超过了这个限度,java 原生的这种一次性读入内存的方法就不能直接使用了,只能分块读取。
第一种分块读取的方式是使用 java.io.BufferedInputStream:每次调用 read() 方法时,会从文件中依次取出长度为 arraySize 的数据放到 array 中。这种方法可行,但是效率不高。
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.IOException;
/**
 * Reads a file sequentially in chunks of at most {@code arraySize} bytes using a
 * {@link BufferedInputStream}.
 */
public class StreamFileReader {
    private BufferedInputStream fileIn;
    private long fileLength;
    private int arraySize;
    private byte[] array;

    /**
     * Opens {@code fileName} for chunked reading.
     *
     * @param fileName  path of the file to read
     * @param arraySize chunk size in bytes; also used as the stream's buffer size
     * @throws IOException              if the file cannot be opened
     * @throws IllegalArgumentException if {@code arraySize} is not positive (thrown by
     *                                  {@link BufferedInputStream})
     */
    public StreamFileReader(String fileName, int arraySize) throws IOException {
        FileInputStream rawIn = new FileInputStream(fileName);
        try {
            // available() returns an int estimate and is capped for files larger than 2 GB;
            // the channel reports the real length as a long.
            this.fileLength = rawIn.getChannel().size();
            this.fileIn = new BufferedInputStream(rawIn, arraySize);
        } catch (IOException | RuntimeException e) {
            rawIn.close(); // do not leak the descriptor when construction fails
            throw e;
        }
        this.arraySize = arraySize;
    }

    /**
     * Reads the next chunk; afterwards {@link #getArray()} returns exactly the bytes read.
     *
     * @return the number of bytes read, or {@code -1} at end of file
     * @throws IOException if reading fails
     */
    public int read() throws IOException {
        byte[] tmpArray = new byte[arraySize];
        int bytes = fileIn.read(tmpArray); // stage the data in a full-size scratch array
        if (bytes == -1) {
            return -1;
        }
        array = new byte[bytes]; // the exposed array is exactly as long as the data read
        System.arraycopy(tmpArray, 0, array, 0, bytes);
        return bytes;
    }

    /** Closes the underlying stream and drops the reference to the last chunk. */
    public void close() throws IOException {
        fileIn.close();
        array = null;
    }

    /** Returns the chunk read by the most recent successful {@link #read()} call. */
    public byte[] getArray() {
        return array;
    }

    /** Returns the length of the file in bytes. */
    public long getFileLength() {
        return fileLength;
    }

    /** Benchmark: reads the sample file in 64 KiB chunks and prints the elapsed nanoseconds. */
    public static void main(String[] args) throws IOException {
        StreamFileReader reader = new StreamFileReader("/home/zfh/movie.mkv", 65536);
        long start;
        long end;
        try {
            start = System.nanoTime();
            while (reader.read() != -1) {
                // the data is discarded; only the read time is measured
            }
            end = System.nanoTime();
        } finally {
            reader.close();
        }
        System.out.println("StreamFileReader: " + (end - start));
    }
}
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
/**
 * Reads a file sequentially in chunks of at most {@code arraySize} bytes through the
 * file's channel and a reusable {@link ByteBuffer}.
 */
public class ChannelFileReader {
    private FileInputStream fileIn;
    private ByteBuffer byteBuf;
    private long fileLength;
    private int arraySize;
    private byte[] array;

    /**
     * Opens {@code fileName} and allocates a buffer of {@code arraySize} bytes.
     *
     * @param fileName  path of the file to read
     * @param arraySize capacity of the transfer buffer, i.e. the maximum chunk size
     * @throws IOException if the file cannot be opened or its size cannot be determined
     */
    public ChannelFileReader(String fileName, int arraySize) throws IOException {
        FileInputStream stream = new FileInputStream(fileName);
        this.fileIn = stream;
        this.fileLength = stream.getChannel().size();
        this.byteBuf = ByteBuffer.allocate(arraySize);
        this.arraySize = arraySize;
    }

    /**
     * Reads the next chunk; afterwards {@link #getArray()} returns exactly the bytes read.
     *
     * @return the number of bytes read, or {@code -1} at end of file
     * @throws IOException if reading fails
     */
    public int read() throws IOException {
        int bytesRead = fileIn.getChannel().read(byteBuf); // fill the ByteBuffer
        if (bytesRead == -1) {
            return -1;
        }
        array = new byte[bytesRead]; // exactly as long as the data read
        byteBuf.flip();              // switch the buffer from filling to draining
        byteBuf.get(array);          // copy the data out of the buffer
        byteBuf.clear();             // make the buffer ready for the next read
        return bytesRead;
    }

    /** Closes the underlying file and drops the reference to the last chunk. */
    public void close() throws IOException {
        fileIn.close();
        array = null;
    }

    /** Returns the chunk read by the most recent successful {@link #read()} call. */
    public byte[] getArray() {
        return array;
    }

    /** Returns the length of the file in bytes. */
    public long getFileLength() {
        return fileLength;
    }

    /** Benchmark: reads the sample file in 64 KiB chunks and prints the elapsed nanoseconds. */
    public static void main(String[] args) throws IOException {
        ChannelFileReader reader = new ChannelFileReader("/home/zfh/movie.mkv", 65536);
        long begin = System.nanoTime();
        int status;
        do {
            status = reader.read();
        } while (status != -1);
        long finish = System.nanoTime();
        reader.close();
        System.out.println("ChannelFileReader: " + (finish - begin));
    }
}
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
/**
 * Reads a file of arbitrary size in chunks of at most {@code arraySize} bytes through a
 * series of memory-mapped regions, one region per {@code Integer.MAX_VALUE} bytes of the file.
 */
public class MappedBiggerFileReader {
    private MappedByteBuffer[] mappedBufArray;
    private int count = 0;
    private int number;
    private FileInputStream fileIn;
    private long fileLength;
    private int arraySize;
    private byte[] array;

    /**
     * Opens {@code fileName} and maps it into memory region by region.
     *
     * @param fileName  path of the file to read
     * @param arraySize maximum number of bytes returned per {@link #read()} call
     * @throws IOException if the file cannot be opened or mapped
     */
    public MappedBiggerFileReader(String fileName, int arraySize) throws IOException {
        final long maxRegion = (long) Integer.MAX_VALUE;
        this.fileIn = new FileInputStream(fileName);
        FileChannel channel = fileIn.getChannel();
        this.fileLength = channel.size();
        this.number = (int) Math.ceil((double) fileLength / (double) Integer.MAX_VALUE);
        this.mappedBufArray = new MappedByteBuffer[number]; // one entry per mapped region
        long offset = 0; // start of the region mapped next
        for (int region = 0; region < number; region++) {
            // Every region is full-sized except the last, which takes whatever is left.
            long size = Math.min(maxRegion, fileLength - offset);
            mappedBufArray[region] = channel.map(FileChannel.MapMode.READ_ONLY, offset, size);
            offset += size;
        }
        this.arraySize = arraySize;
    }

    /**
     * Loads the next chunk into the array returned by {@link #getArray()}.
     *
     * @return the number of bytes in the chunk, or {@code -1} when all regions are exhausted
     */
    public int read() throws IOException {
        if (count >= number) {
            return -1;
        }
        MappedByteBuffer current = mappedBufArray[count];
        int remaining = current.limit() - current.position();
        // When no more than arraySize bytes are left, this is the last read from the region.
        boolean lastChunkOfRegion = remaining <= arraySize;
        int chunkSize = lastChunkOfRegion ? remaining : arraySize;
        array = new byte[chunkSize];
        current.get(array);
        if (lastChunkOfRegion) {
            count++; // continue with the next mapped region
        }
        return chunkSize;
    }

    /** Closes the underlying file and drops the reference to the last chunk. */
    public void close() throws IOException {
        fileIn.close();
        array = null;
    }

    /** Returns the chunk loaded by the most recent successful {@link #read()} call. */
    public byte[] getArray() {
        return array;
    }

    /** Returns the length of the file in bytes. */
    public long getFileLength() {
        return fileLength;
    }

    /** Benchmark: reads the sample file in 64 KiB chunks and prints the elapsed nanoseconds. */
    public static void main(String[] args) throws IOException {
        MappedBiggerFileReader reader = new MappedBiggerFileReader("/home/zfh/movie.mkv", 65536);
        long begin = System.nanoTime();
        int status;
        do {
            status = reader.read();
        } while (status != -1);
        long finish = System.nanoTime();
        reader.close();
        System.out.println("MappedBiggerFileReader: " + (finish - begin));
    }
}
用上面三种方法读取1GB文件,运行结果如下
StreamFileReader: 11494900386
ChannelFileReader: 11329346316
MappedFileReader: 11169097480
读取10GB文件,运行结果如下
StreamFileReader: 194579779394
ChannelFileReader: 190430242497
MappedBiggerFileReader: 186923035795
扫描文章,使用正则表达式分割出一个个单词,然后把每个单词及其出现次数放到 map 中:
import java.io.File;
import java.io.FileNotFoundException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Scanner;
import java.util.Set;
/**
 * Reads a text file line by line, echoes it, and prints how often each word occurs.
 * A word is a maximal run of regex word characters ({@code \w}).
 */
public class splitWords
{
	/**
	 * Counts the words of the hard-coded sample file and prints the result to standard output.
	 *
	 * @param args unused
	 * @throws FileNotFoundException if the file disappears between the existence check and opening it
	 */
	public static void main(String[] args) throws FileNotFoundException
	{
		File file=new File("C:\\Users\\Administrator\\Desktop\\English.txt");
		if(!file.exists())
		{
			System.out.println("文件不存在");
			return;
		}
		// Maps each word to the number of times it has been seen.
		HashMap<String,Integer> hashMap=new HashMap<String,Integer>();
		System.out.println("文章-----------------------------------");
		// try-with-resources closes the scanner (and the file) even if reading fails.
		try(Scanner scanner=new Scanner(file))
		{
			while(scanner.hasNextLine())
			{
				String line=scanner.nextLine();
				System.out.println(line);
				// \w+ matches a word, \W+ matches a run of non-word characters;
				// splitting on the latter leaves the individual words.
				String[] lineWords=line.split("\\W+");
				for(int i=0;i<lineWords.length;i++)
				{
					// A line that starts with a non-word character yields a leading empty token.
					if(lineWords[i].isEmpty())
					{
						continue;
					}
					hashMap.merge(lineWords[i],1,Integer::sum);
				}
			}
		}
		Iterator<String> iterator=hashMap.keySet().iterator();
		while(iterator.hasNext())
		{
			String word=iterator.next();
			System.out.printf("单词:%-12s 出现次数:%d\n",word,hashMap.get(word));
		}
		System.out.println("程序结束--------------------------------");
	}
}
采用多线程时:
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
 * Counts the words of a piece of text and prints them ordered by frequency.
 */
public class CountWordsOfArticle {

    /** Word separators: the punctuation of the original pattern plus any whitespace. */
    private static final Pattern WORD_SEPARATORS = Pattern.compile("[,.;!?\\s]+");

    /**
     * Counts the words of {@code request} case-insensitively and prints one
     * {@code "word count"} line per distinct word to standard output, ordered by descending
     * count and then alphabetically. A {@code null} or empty text prints nothing.
     *
     * @param request the text to analyse
     */
    private static void wordCount(String request) {
        if (request == null || request.isEmpty()) {
            return;
        }
        Map<String, Integer> counts = new ConcurrentHashMap<>();
        for (String token : WORD_SEPARATORS.split(request)) {
            if (!token.isEmpty()) { // split() yields an empty token when the text starts with a separator
                counts.merge(token.toLowerCase(), 1, Integer::sum);
            }
        }
        // Highest count first; ties are broken by the word itself so the output is deterministic.
        Comparator<Map.Entry<String, Integer>> byCountThenWord = (left, right) -> {
            int byCount = right.getValue().compareTo(left.getValue());
            return byCount != 0 ? byCount : left.getKey().compareTo(right.getKey());
        };
        List<Map.Entry<String, Integer>> result = counts.entrySet().stream()
                .sorted(byCountThenWord)
                .collect(Collectors.toList());
        result.forEach(item -> System.out.println(item.getKey() + " " + item.getValue()));
    }
}
【1】https://blog.csdn.net/zhufenghao/article/details/51192043.
【2】https://blog.csdn.net/qq_21808961/article/details/78857170.