问题说明:有一个无序的long型大文件,其大小超过系统内存,需要对其进行排序。
大文件排序,多线程并发处理:
import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import java.util.*;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicInteger;
import lombok.SneakyThrows;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
/**
 * External merge sort for a text file of space-separated {@code long} values that is too large
 * to be sorted in memory.
 *
 * <p>The pipeline is: {@link #splitFileAll(String, long)} cuts the input into small chunk files
 * concurrently, {@link #sortFile(String)} sorts each chunk in memory, and
 * {@link #mergeFileSortAll(File, String)} merges sorted chunks pairwise until one file is left.
 *
 * <p>An instance is meant to be driven from a single orchestrating thread; only the worker tasks
 * it starts internally run concurrently.
 */
public class BigFileSort {

    private static final Random RANDOM = new Random();

    /** Name prefix of every chunk file produced by the split methods. */
    private static final String SPLIT_FILE_PREFIX = "big-split-";

    /** First numeric suffix used when naming chunk files. */
    private static final int FIRST_FILE_INDEX = 10000;

    /** Fixed number of values per chunk used by {@link #splitFile(String)}. */
    private static final long SEQUENTIAL_SPLIT_LENGTH = 100000;

    /** Number of values per chunk file for the concurrent split; non-positive disables rotation. */
    private final long splitLength;

    /** Split tasks started by {@link #splitFileAll} that have not been awaited yet. */
    private final Queue<Future<?>> pendingSplits = new ArrayDeque<>();

    /**
     * @param splitLength number of values written to each chunk file by the concurrent split
     */
    public BigFileSort(long splitLength) {
        this.splitLength = splitLength;
    }

    /**
     * Creates (or overwrites) a file holding {@code amount} random non-negative longs, each
     * followed by a single space.
     *
     * @param fileName path of the file to create
     * @param amount   number of values to write
     * @throws IOException if the target is a directory, is not writable, its parent directory
     *                     cannot be created, or writing fails
     */
    public void createBigFile(String fileName, long amount) throws IOException {
        File file = new File(fileName);
        if (file.exists()) {
            if (file.isDirectory()) {
                throw new IOException("File '" + file + "' exists but is a directory");
            }
            if (!file.canWrite()) {
                throw new IOException("File '" + file + "' cannot be written to");
            }
        } else {
            File parent = file.getParentFile();
            // Re-check isDirectory() so a concurrent creation of the parent is not an error.
            if (parent != null && !parent.exists() && !parent.mkdirs() && !parent.isDirectory()) {
                throw new IOException("File '" + file + "' could not be created");
            }
        }
        try (Writer out = newWriter(file)) {
            for (long i = 0; i < amount; i++) {
                // Dropping the sign bit is always non-negative; negating Long.MIN_VALUE is not.
                out.write(Long.toString(RANDOM.nextLong() >>> 1));
                out.write(' ');
            }
        }
    }

    /**
     * Splits the file sequentially into chunk files named {@code big-split-<n>} (n starting at
     * 10000) in the same directory, 100000 values per chunk.
     *
     * @param fileName path of the file to split
     * @throws IOException if reading or writing fails
     */
    public void splitFile(String fileName) throws IOException {
        File file = new File(fileName).getAbsoluteFile();
        try (InputStream in = new BufferedInputStream(new FileInputStream(file));
             ChunkWriter out = new ChunkWriter(
                     file.getParentFile(), SPLIT_FILE_PREFIX, SEQUENTIAL_SPLIT_LENGTH)) {
            int b;
            while ((b = in.read()) != -1) {
                out.write(b);
            }
        }
    }

    /**
     * Splits the file concurrently: the byte range is divided into at most {@code threadAmount}
     * contiguous regions and each region is handed to its own worker, which runs
     * {@link #splitFile(String, long, long, long)}. Returns as soon as the workers are started;
     * call {@link #waitThreadEnd()} to wait for them.
     *
     * @param fileName     path of the file to split
     * @param threadAmount maximum number of regions/workers; must be positive
     * @throws IOException              if the file does not exist
     * @throws IllegalArgumentException if {@code threadAmount} is not positive
     */
    public void splitFileAll(String fileName, long threadAmount) throws IOException {
        if (threadAmount <= 0) {
            throw new IllegalArgumentException("threadAmount must be positive: " + threadAmount);
        }
        File file = new File(fileName);
        if (!file.isFile()) {
            throw new FileNotFoundException("File '" + file + "' does not exist or is not a file");
        }
        // File.length() is a long; InputStream.available() is an int and caps at 2 GB.
        long length = file.length();
        if (length == 0) {
            return;
        }
        // Ceiling division: never more than threadAmount regions and never a zero-sized region.
        long regionSize = length / threadAmount + (length % threadAmount == 0 ? 0 : 1);

        List<Callable<Void>> tasks = new ArrayList<>();
        long index = FIRST_FILE_INDEX;
        for (long start = 0; start < length; start += regionSize) {
            final long regionIndex = index++;
            final long regionStart = start;
            final long regionEnd = Math.min(start + regionSize, length);
            Callable<Void> task = () -> {
                splitFile(fileName, regionIndex, regionStart, regionEnd);
                return null;
            };
            tasks.add(task);
        }

        ExecutorService pool = Executors.newFixedThreadPool(tasks.size());
        try {
            for (Callable<Void> task : tasks) {
                pendingSplits.add(pool.submit(task));
            }
        } finally {
            // Already-submitted tasks keep running; the pool threads exit once they finish.
            pool.shutdown();
        }
    }

    /**
     * Blocks until every worker started by {@link #splitFileAll(String, long)} has finished.
     *
     * @throws InterruptedException if the calling thread is interrupted while waiting; workers
     *                              not yet awaited remain pending and can be awaited again
     * @throws UncheckedIOException if a worker failed with an {@link IOException}
     */
    public void waitThreadEnd() throws InterruptedException {
        Throwable failure = awaitAll(pendingSplits);
        if (failure != null) {
            throw asUnchecked(failure);
        }
    }

    /**
     * Splits one byte region of the file into chunk files named
     * {@code big-split-<index><n>} (n starting at 10000).
     *
     * <p>Regions are aligned to value boundaries so that adjacent regions neither lose nor
     * duplicate a value: a region starting at offset 0 copies from the first byte; any other
     * region discards bytes up to and including the first space at or after {@code skip}. Copying
     * stops after the first space located at or after offset {@code end}, or at end of file.
     *
     * @param fileName path of the file to split
     * @param index    region number, used in the chunk file names
     * @param skip     byte offset at which the region starts; must not be negative
     * @param end      byte offset at which the region ends (exclusive)
     * @throws IOException              if reading or writing fails
     * @throws IllegalArgumentException if {@code skip} is negative
     */
    public void splitFile(String fileName, long index, long skip, long end) throws IOException {
        if (skip < 0) {
            throw new IllegalArgumentException("skip must not be negative: " + skip);
        }
        File file = new File(fileName).getAbsoluteFile();
        try (FileInputStream raw = new FileInputStream(file);
             ChunkWriter out = new ChunkWriter(
                     file.getParentFile(), SPLIT_FILE_PREFIX + index, splitLength)) {
            // Position through the channel: InputStream.skip may skip fewer bytes than asked.
            raw.getChannel().position(skip);
            InputStream in = new BufferedInputStream(raw);

            // At offset 0 there is no partial value to discard, so copy from the first byte.
            boolean copying = skip == 0;
            long position = skip;
            int b;
            while ((b = in.read()) != -1) {
                position++;
                if (copying) {
                    out.write(b);
                }
                if (b == ' ') {
                    copying = true;
                    if (position > end) {
                        break;
                    }
                }
            }
        }
    }

    /**
     * Sorts the values of one chunk file in memory and rewrites the file with the values in
     * ascending order separated by single spaces. A file without any value is deleted instead.
     *
     * @param fileName path of the chunk file
     * @throws IOException           if reading or writing fails
     * @throws NumberFormatException if a token is not a valid {@code long}
     */
    public void sortFile(String fileName) throws IOException {
        File file = new File(fileName);
        String content = new String(Files.readAllBytes(file.toPath()), StandardCharsets.UTF_8);
        StringTokenizer tokens = new StringTokenizer(content);
        long[] values = new long[tokens.countTokens()];
        if (values.length == 0) {
            // Best effort, as before: an empty chunk that survives is harmless to the merge.
            file.delete();
            return;
        }
        for (int i = 0; i < values.length; i++) {
            values[i] = Long.parseLong(tokens.nextToken());
        }
        Arrays.sort(values);
        try (Writer out = newWriter(file)) {
            for (int i = 0; i < values.length; i++) {
                if (i > 0) {
                    out.write(' ');
                }
                out.write(Long.toString(values[i]));
            }
        }
    }

    /**
     * Merges all sorted files in {@code dir} whose name starts with {@code fileNamePrefix} until
     * a single sorted file remains. Each round merges neighbouring pairs concurrently.
     *
     * @param dir            directory holding the sorted chunk files
     * @param fileNamePrefix name prefix selecting the files to merge
     * @throws IOException          if the directory cannot be listed or a merge fails
     * @throws InterruptedException if the calling thread is interrupted while waiting
     */
    public void mergeFileSortAll(File dir, String fileNamePrefix)
            throws IOException, InterruptedException {
        File[] files = listChunks(dir, fileNamePrefix);
        if (files.length <= 1) {
            return;
        }
        int poolSize = Math.max(1,
                Math.min(files.length / 2, Runtime.getRuntime().availableProcessors()));
        ExecutorService pool = Executors.newFixedThreadPool(poolSize);
        try {
            while (files.length > 1) {
                Queue<Future<?>> round = new ArrayDeque<>();
                // With an odd count the last file is carried over to the next round.
                for (int i = 0; i + 1 < files.length; i += 2) {
                    final File left = files[i];
                    final File right = files[i + 1];
                    Callable<Void> task = () -> {
                        mergeFileSort(left, right);
                        return null;
                    };
                    round.add(pool.submit(task));
                }
                Throwable failure = awaitAll(round);
                if (failure instanceof IOException) {
                    throw (IOException) failure;
                }
                if (failure != null) {
                    throw asUnchecked(failure);
                }
                files = listChunks(dir, fileNamePrefix);
            }
        } finally {
            pool.shutdownNow();
        }
    }

    /**
     * Merges two sorted files. On success {@code src} holds the merged, sorted values (each
     * followed by a space) and {@code dest} has been deleted.
     *
     * @param src  first sorted file; replaced by the merge result
     * @param dest second sorted file; deleted after the merge
     * @throws IOException           if reading, writing, replacing or deleting fails
     * @throws NumberFormatException if a token is not a valid {@code long}
     */
    public void mergeFileSort(File src, File dest) throws IOException {
        File merged = new File(src.getAbsoluteFile().getParentFile(), UUID.randomUUID().toString());
        boolean replaced = false;
        try {
            try (LongTokenReader left = new LongTokenReader(src);
                 LongTokenReader right = new LongTokenReader(dest);
                 Writer out = newWriter(merged)) {
                boolean hasLeft = left.advance();
                boolean hasRight = right.advance();
                while (hasLeft && hasRight) {
                    if (left.value() <= right.value()) {
                        writeValue(out, left.value());
                        hasLeft = left.advance();
                    } else {
                        writeValue(out, right.value());
                        hasRight = right.advance();
                    }
                }
                for (; hasLeft; hasLeft = left.advance()) {
                    writeValue(out, left.value());
                }
                for (; hasRight; hasRight = right.advance()) {
                    writeValue(out, right.value());
                }
            }
            // Replace src before deleting dest so no value is lost if a step fails.
            Files.move(merged.toPath(), src.toPath(), StandardCopyOption.REPLACE_EXISTING);
            replaced = true;
            Files.delete(dest.toPath());
        } finally {
            if (!replaced) {
                // Best-effort cleanup of the partial result; the original failure propagates.
                merged.delete();
            }
        }
    }

    /**
     * Demo entry point: generates a file, splits it, sorts the chunks and merges them.
     *
     * @param args optional; {@code args[0]} overrides the default data file path
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        BigFileSort bigFileSort = new BigFileSort(10000);
        String fileName = args.length > 0 ? args[0] : "H:/big-file/big-file.txt";
        bigFileSort.createBigFile(fileName, 200000);
        bigFileSort.splitFileAll(fileName, 5);
        bigFileSort.waitThreadEnd();

        File workDir = new File(fileName).getAbsoluteFile().getParentFile();
        for (File chunk : listChunks(workDir, SPLIT_FILE_PREFIX)) {
            System.out.println(chunk.getName());
            bigFileSort.sortFile(chunk.getAbsolutePath());
        }
        bigFileSort.mergeFileSortAll(workDir, SPLIT_FILE_PREFIX);
    }

    /** Lists the regular files in {@code dir} whose name starts with {@code prefix}, by name. */
    private static File[] listChunks(File dir, String prefix) throws IOException {
        FileFilter filter = f -> f.isFile() && f.getName().startsWith(prefix);
        File[] files = dir.listFiles(filter);
        if (files == null) {
            throw new IOException("Cannot list directory '" + dir + "'");
        }
        // listFiles() guarantees no order; sort so that the pairing is deterministic.
        Arrays.sort(files);
        return files;
    }

    /**
     * Waits for every task in the queue, removing each one once it has completed.
     *
     * @return the first failure, with later failures attached as suppressed, or {@code null}
     */
    private static Throwable awaitAll(Queue<Future<?>> tasks) throws InterruptedException {
        Throwable failure = null;
        while (!tasks.isEmpty()) {
            try {
                // peek, not poll: if get() is interrupted the task stays queued.
                tasks.peek().get();
            } catch (ExecutionException e) {
                Throwable cause = e.getCause() != null ? e.getCause() : e;
                if (failure == null) {
                    failure = cause;
                } else if (failure != cause) {
                    failure.addSuppressed(cause);
                }
            }
            tasks.remove();
        }
        return failure;
    }

    /** Converts a worker failure into something throwable without a checked declaration. */
    private static RuntimeException asUnchecked(Throwable failure) {
        if (failure instanceof Error) {
            throw (Error) failure;
        }
        if (failure instanceof RuntimeException) {
            return (RuntimeException) failure;
        }
        if (failure instanceof IOException) {
            return new UncheckedIOException((IOException) failure);
        }
        return new IllegalStateException("Worker task failed", failure);
    }

    /** Opens a buffered UTF-8 writer that truncates {@code file}. */
    private static Writer newWriter(File file) throws IOException {
        return new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(file), StandardCharsets.UTF_8));
    }

    /** Writes one value followed by the separating space. */
    private static void writeValue(Writer out, long value) throws IOException {
        out.write(Long.toString(value));
        out.write(' ');
    }

    /**
     * Writes a byte stream into consecutively numbered files, starting a new file after a fixed
     * number of space-terminated values. Files are opened lazily, so no empty file is created.
     */
    private static final class ChunkWriter implements Closeable {
        private final File dir;
        private final String namePrefix;
        private final long valuesPerFile;
        private int nextIndex = FIRST_FILE_INDEX;
        private long valuesInCurrent;
        private OutputStream current;

        ChunkWriter(File dir, String namePrefix, long valuesPerFile) {
            this.dir = dir;
            this.namePrefix = namePrefix;
            this.valuesPerFile = valuesPerFile;
        }

        void write(int b) throws IOException {
            if (current == null) {
                File target = new File(dir, namePrefix + nextIndex);
                nextIndex++;
                current = new BufferedOutputStream(new FileOutputStream(target));
                valuesInCurrent = 0;
            }
            current.write(b);
            if (b == ' ' && ++valuesInCurrent == valuesPerFile) {
                closeCurrent();
            }
        }

        @Override
        public void close() throws IOException {
            closeCurrent();
        }

        private void closeCurrent() throws IOException {
            if (current != null) {
                OutputStream finished = current;
                current = null;
                finished.close();
            }
        }
    }

    /** Reads whitespace-separated {@code long} values from a file one at a time. */
    private static final class LongTokenReader implements Closeable {
        private final InputStream in;
        private final StringBuilder digits = new StringBuilder(20);
        private long value;

        LongTokenReader(File file) throws IOException {
            this.in = new BufferedInputStream(new FileInputStream(file));
        }

        /**
         * Moves to the next value, skipping any run of separators.
         *
         * @return {@code false} only when the end of the file has been reached
         */
        boolean advance() throws IOException {
            digits.setLength(0);
            int b;
            while ((b = in.read()) != -1) {
                if (Character.isWhitespace(b)) {
                    if (digits.length() > 0) {
                        break;
                    }
                    // Leading or repeated separator: keep looking for the next value.
                } else {
                    digits.append((char) b);
                }
            }
            if (digits.length() == 0) {
                return false;
            }
            value = Long.parseLong(digits.toString());
            return true;
        }

        /** The value found by the last successful {@link #advance()}. */
        long value() {
            return value;
        }

        @Override
        public void close() throws IOException {
            in.close();
        }
    }
}