排序-JAVA实现【十一】大文件排序

问题说明:有一个无序的 long 型大文件,其大小超过系统内存,需要对其进行排序。

大文件排序,多线程并发处理:


import lombok.SneakyThrows;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;

import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import java.util.*;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * External merge sort for a text file of space-separated {@code long} values that is too
 * large to be sorted in memory.
 *
 * <p>The intended pipeline is the one shown in {@link #main(String[])}:
 * <ol>
 *   <li>{@link #splitFileAll(String, long)} cuts the input into small split files, several
 *       byte ranges being processed concurrently;</li>
 *   <li>{@link #sortFile(String)} sorts each split file in memory;</li>
 *   <li>{@link #mergeFileSortAll(File, String)} merges the sorted split files pairwise until a
 *       single sorted file is left.</li>
 * </ol>
 *
 * <p>All files use the same format: decimal numbers separated by ASCII whitespace.
 */
public class BigFileSort {
    private static final Random RANDOM = new Random();

    /** Name prefix shared by every split file. */
    private static final String SPLIT_FILE_PREFIX = "big-split-";

    /** First value of the chunk index and of the per-chunk file counter. */
    private static final int FIRST_INDEX = 10000;

    /** Numbers per split file written by the single-threaded {@link #splitFile(String)}. */
    private static final long SEQUENTIAL_SPLIT_LENGTH = 100000;

    /** Numbers per split file written by the chunked {@link #splitFile(String, long, long, long)}. */
    private final long splitLength;

    /** Split tasks started by {@link #splitFileAll} and not yet awaited; guarded by itself. */
    private final List<Future<?>> pendingTasks = new ArrayList<>();

    /**
     * @param splitLength how many numbers each split file may hold; must be positive
     * @throws IllegalArgumentException if {@code splitLength} is not positive
     */
    public BigFileSort(long splitLength) {
        if (splitLength <= 0) {
            throw new IllegalArgumentException("splitLength must be positive: " + splitLength);
        }
        this.splitLength = splitLength;
    }

    /**
     * Creates (or overwrites) a file holding {@code amount} random non-negative numbers, each
     * followed by a single space.
     *
     * @param fileName path of the file to create; missing parent directories are created
     * @param amount   how many numbers to write
     * @throws IOException if the target is a directory, is not writable, or cannot be created
     */
    public void createBigFile(String fileName, long amount) throws IOException {
        File file = new File(fileName);
        if (file.exists()) {
            if (file.isDirectory()) {
                throw new IOException("File '" + file + "' exists but is a directory");
            }
            if (!file.canWrite()) {
                throw new IOException("File '" + file + "' cannot be written to");
            }
        } else {
            File parent = file.getParentFile();
            if (parent != null && !parent.exists() && !parent.mkdirs()) {
                throw new IOException("File '" + file + "' could not be created");
            }
        }

        try (Writer out = Files.newBufferedWriter(file.toPath(), StandardCharsets.UTF_8)) {
            for (long i = 0; i < amount; i++) {
                // Clearing the sign bit is safe for every input; negating Long.MIN_VALUE is not.
                out.write(Long.toString(RANDOM.nextLong() & Long.MAX_VALUE));
                out.write(' ');
            }
        }
    }

    /**
     * Splits a file sequentially into files named {@code big-split-10000},
     * {@code big-split-10001}, ... in the same directory, 100000 numbers per file.
     *
     * @param fileName path of the file to split
     * @throws IOException if reading or writing fails
     */
    public void splitFile(String fileName) throws IOException {
        File file = new File(fileName);
        try (InputStream in = new BufferedInputStream(new FileInputStream(file));
             SplitWriter out = new SplitWriter(parentDirectory(file), SPLIT_FILE_PREFIX,
                     SEQUENTIAL_SPLIT_LENGTH)) {
            int b;
            while ((b = in.read()) != -1) {
                out.write(b);
                if (b == ' ') {
                    out.numberCompleted();
                }
            }
        }
    }

    /**
     * Splits a file concurrently. The file is divided into at most {@code threadAmount} byte
     * ranges and each range is handed to {@link #splitFile(String, long, long, long)} on its
     * own worker thread. This method returns as soon as the work is scheduled; call
     * {@link #waitThreadEnd()} to wait for completion and learn about failures.
     *
     * @param fileName     path of the file to split
     * @param threadAmount maximum number of concurrent workers; must be positive
     * @throws IOException              if the file does not exist
     * @throws IllegalArgumentException if {@code threadAmount} is not positive
     */
    public void splitFileAll(String fileName, long threadAmount) throws IOException {
        if (threadAmount <= 0) {
            throw new IllegalArgumentException("threadAmount must be positive: " + threadAmount);
        }
        File file = new File(fileName);
        if (!file.isFile()) {
            throw new FileNotFoundException("File '" + file + "' does not exist");
        }
        long length = file.length();
        if (length == 0) {
            return;
        }

        // Ceiling division keeps the chunk size >= 1 (so the loop always advances) and the
        // number of chunks <= threadAmount.
        long chunkSize = length / threadAmount + (length % threadAmount == 0 ? 0 : 1);
        long chunkCount = length / chunkSize + (length % chunkSize == 0 ? 0 : 1);

        ExecutorService pool =
                Executors.newFixedThreadPool((int) Math.min(chunkCount, (long) Integer.MAX_VALUE));
        try {
            long index = FIRST_INDEX;
            for (long skip = 0; skip < length; skip += chunkSize, index++) {
                final long chunkIndex = index;
                final long chunkStart = skip;
                final long chunkEnd = Math.min(skip + chunkSize, length);
                Callable<Void> job = () -> {
                    splitFile(fileName, chunkIndex, chunkStart, chunkEnd);
                    return null;
                };
                Future<Void> task = pool.submit(job);
                synchronized (pendingTasks) {
                    pendingTasks.add(task);
                }
            }
        } finally {
            // Already submitted tasks still run; the worker threads exit once they are done.
            pool.shutdown();
        }
    }

    /**
     * Blocks until every task started by {@link #splitFileAll(String, long)} has finished.
     *
     * @throws InterruptedException if the calling thread is interrupted while waiting; the
     *                              tasks stay registered, so the call may be repeated
     * @throws UncheckedIOException if a task failed with an {@link IOException}
     */
    public void waitThreadEnd() throws InterruptedException {
        List<Future<?>> tasks;
        synchronized (pendingTasks) {
            tasks = new ArrayList<>(pendingTasks);
        }
        Throwable failure = firstFailure(tasks);
        synchronized (pendingTasks) {
            pendingTasks.removeAll(tasks);
        }
        if (failure instanceof IOException) {
            throw new UncheckedIOException((IOException) failure);
        }
        rethrowUnchecked(failure);
    }

    /**
     * Splits one byte range of a file into files named
     * {@code big-split-<index><counter>}, the counter starting at 10000, with at most
     * {@code splitLength} numbers per file.
     *
     * <p>Unless {@code skip} is 0, output starts after the first space found at or after
     * {@code skip}, because the number in progress there belongs to the previous range. Output
     * stops after the first space found at or after {@code end}. Adjacent ranges therefore
     * neither lose nor duplicate a number.
     *
     * @param fileName path of the file to split
     * @param index    chunk index that becomes part of the split file names
     * @param skip     byte offset where this range starts
     * @param end      byte offset where this range ends
     * @throws IOException if reading or writing fails
     */
    public void splitFile(String fileName, long index, long skip, long end) throws IOException {
        File file = new File(fileName);
        try (FileInputStream raw = new FileInputStream(file);
             SplitWriter out = new SplitWriter(parentDirectory(file), SPLIT_FILE_PREFIX + index,
                     splitLength)) {
            if (skip > 0) {
                // Unlike InputStream.skip, positioning the channel cannot fall short.
                raw.getChannel().position(skip);
            }
            InputStream in = new BufferedInputStream(raw);

            // A range starting at the very beginning of the file owns its first number.
            boolean started = skip <= 0;
            long position = Math.max(skip, 0);
            int b;
            while ((b = in.read()) != -1) {
                position++;
                if (started) {
                    out.write(b);
                }
                if (b == ' ') {
                    if (started) {
                        out.numberCompleted();
                    }
                    started = true;
                    if (position > end) {
                        break;
                    }
                }
            }
        }
    }

    /**
     * Sorts the numbers of one file in memory and rewrites the file with the numbers in
     * ascending order, separated by single spaces. A file without any number is deleted.
     *
     * @param fileName path of the file to sort; it must fit into memory
     * @throws IOException           if reading or writing fails
     * @throws NumberFormatException if the file contains a token that is not a {@code long}
     */
    public void sortFile(String fileName) throws IOException {
        File file = new File(fileName);
        String content =
                new String(Files.readAllBytes(file.toPath()), StandardCharsets.UTF_8).trim();
        if (content.isEmpty()) {
            // Best effort: an empty file that survives is harmless for the merge step.
            file.delete();
            return;
        }

        String[] tokens = content.split("\\s+");
        long[] values = new long[tokens.length];
        for (int i = 0; i < tokens.length; i++) {
            values[i] = Long.parseLong(tokens[i]);
        }
        Arrays.sort(values);

        try (Writer out = Files.newBufferedWriter(file.toPath(), StandardCharsets.UTF_8)) {
            for (int i = 0; i < values.length; i++) {
                if (i > 0) {
                    out.write(' ');
                }
                out.write(Long.toString(values[i]));
            }
        }
    }

    /**
     * Merges all sorted files in {@code dir} whose name starts with {@code fileNamePrefix}
     * until at most one is left. Files are merged pairwise in rounds; the merges of one round
     * run concurrently on a bounded thread pool.
     *
     * @param dir            directory containing the sorted files
     * @param fileNamePrefix name prefix selecting the files to merge
     * @throws IOException          if the directory cannot be listed or a merge fails
     * @throws InterruptedException if the calling thread is interrupted while waiting
     */
    public void mergeFileSortAll(File dir, String fileNamePrefix)
            throws IOException, InterruptedException {
        File[] files = listFilesWithPrefix(dir, fileNamePrefix);
        if (files.length <= 1) {
            return;
        }

        int poolSize = Math.max(1,
                Math.min(Runtime.getRuntime().availableProcessors(), files.length / 2));
        ExecutorService pool = Executors.newFixedThreadPool(poolSize);
        try {
            while (files.length > 1) {
                List<Future<?>> round = new ArrayList<>();
                // With an odd number of files the last one waits for the next round.
                for (int i = 0; i + 1 < files.length; i += 2) {
                    final File left = files[i];
                    final File right = files[i + 1];
                    Callable<Void> job = () -> {
                        mergeFileSort(left, right);
                        return null;
                    };
                    round.add(pool.submit(job));
                }

                Throwable failure = firstFailure(round);
                if (failure instanceof IOException) {
                    throw (IOException) failure;
                }
                rethrowUnchecked(failure);

                files = listFilesWithPrefix(dir, fileNamePrefix);
            }
        } finally {
            pool.shutdownNow();
        }
    }

    /**
     * Merges two sorted files. Afterwards {@code src} holds the sorted union of both inputs
     * (every number followed by a space) and {@code dest} no longer exists.
     *
     * @param src  first sorted file; replaced by the merge result
     * @param dest second sorted file; deleted after a successful merge
     * @throws IOException              if reading, writing, or replacing the files fails
     * @throws IllegalArgumentException if both arguments denote the same file
     */
    public void mergeFileSort(File src, File dest) throws IOException {
        if (src.getAbsoluteFile().equals(dest.getAbsoluteFile())) {
            throw new IllegalArgumentException("Cannot merge file '" + src + "' with itself");
        }

        File merged = new File(parentDirectory(src), UUID.randomUUID().toString());
        boolean replaced = false;
        try {
            try (InputStream srcIn = new BufferedInputStream(new FileInputStream(src));
                 InputStream destIn = new BufferedInputStream(new FileInputStream(dest));
                 Writer out = Files.newBufferedWriter(merged.toPath(), StandardCharsets.UTF_8)) {
                Long srcItem = readNumber(srcIn);
                Long destItem = readNumber(destIn);
                while (srcItem != null || destItem != null) {
                    boolean takeSrc = destItem == null
                            || (srcItem != null && srcItem.longValue() < destItem.longValue());
                    if (takeSrc) {
                        out.write(srcItem.toString());
                        srcItem = readNumber(srcIn);
                    } else {
                        out.write(destItem.toString());
                        destItem = readNumber(destIn);
                    }
                    out.write(' ');
                }
            }

            Files.move(merged.toPath(), src.toPath(), StandardCopyOption.REPLACE_EXISTING);
            replaced = true;
            // Must not fail silently: a surviving dest would be merged a second time.
            Files.deleteIfExists(dest.toPath());
        } finally {
            if (!replaced) {
                // Do not leave a half-written temporary file behind.
                merged.delete();
            }
        }
    }

    /**
     * Reads the next whitespace-delimited number from the stream.
     *
     * @return the number, or {@code null} when the stream holds no further number
     */
    private static Long readNumber(InputStream in) throws IOException {
        StringBuilder digits = new StringBuilder();
        int b;
        while ((b = in.read()) != -1) {
            if (Character.isWhitespace(b)) {
                if (digits.length() > 0) {
                    break;
                }
                // Leading or repeated separators must not be mistaken for end of input.
                continue;
            }
            digits.append((char) b);
        }
        return digits.length() > 0 ? Long.valueOf(digits.toString()) : null;
    }

    /** Returns the directory containing {@code file}, also for a path without parent part. */
    private static File parentDirectory(File file) {
        return file.getAbsoluteFile().getParentFile();
    }

    /** Lists the entries of {@code dir} whose name starts with {@code prefix}. */
    private static File[] listFilesWithPrefix(File dir, String prefix) throws IOException {
        File[] files = dir.listFiles((parent, name) -> name.startsWith(prefix));
        if (files == null) {
            throw new IOException("Cannot list files in '" + dir + "'");
        }
        return files;
    }

    /** Waits for all tasks and returns the cause of the first failed one, or {@code null}. */
    private static Throwable firstFailure(List<Future<?>> tasks) throws InterruptedException {
        Throwable first = null;
        for (Future<?> task : tasks) {
            try {
                task.get();
            } catch (ExecutionException e) {
                if (first == null) {
                    first = e.getCause();
                }
            }
        }
        return first;
    }

    /** Rethrows a non-null failure as an unchecked exception; does nothing for {@code null}. */
    private static void rethrowUnchecked(Throwable failure) {
        if (failure == null) {
            return;
        }
        if (failure instanceof RuntimeException) {
            throw (RuntimeException) failure;
        }
        if (failure instanceof Error) {
            throw (Error) failure;
        }
        throw new IllegalStateException(failure);
    }

    /**
     * Writes bytes to a sequence of numbered files, moving on to the next file once the
     * current one holds the configured amount of numbers. A file is only created when its
     * first byte is written, so no empty trailing file is produced.
     */
    private static final class SplitWriter implements Closeable {
        private final File directory;
        private final String namePrefix;
        private final long numbersPerFile;
        private int fileIndex = FIRST_INDEX;
        private long numbersInFile;
        private OutputStream out;

        SplitWriter(File directory, String namePrefix, long numbersPerFile) {
            this.directory = directory;
            this.namePrefix = namePrefix;
            this.numbersPerFile = numbersPerFile;
        }

        /** Writes one byte to the current file, creating that file if necessary. */
        void write(int b) throws IOException {
            if (out == null) {
                File target = new File(directory, namePrefix + fileIndex);
                out = new BufferedOutputStream(new FileOutputStream(target));
            }
            out.write(b);
        }

        /** Records that a number was completed and switches files when the current is full. */
        void numberCompleted() throws IOException {
            numbersInFile++;
            if (numbersInFile == numbersPerFile) {
                close();
                fileIndex++;
                numbersInFile = 0;
            }
        }

        @Override
        public void close() throws IOException {
            if (out != null) {
                OutputStream closing = out;
                out = null;
                closing.close();
            }
        }
    }

    /**
     * Demo: generates a file of random numbers, splits it, sorts the parts and merges them.
     *
     * @param args optional; {@code args[0]} overrides the default input file path
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        String fileName = args.length > 0 ? args[0] : "H:/big-file/big-file.txt";

        BigFileSort bigFileSort = new BigFileSort(10000);
        bigFileSort.createBigFile(fileName, 200000);
        bigFileSort.splitFileAll(fileName, 5);
        bigFileSort.waitThreadEnd();

        File directory = parentDirectory(new File(fileName));
        for (File file : listFilesWithPrefix(directory, SPLIT_FILE_PREFIX)) {
            System.out.println(file.getName());
            bigFileSort.sortFile(file.getAbsolutePath());
        }

        bigFileSort.mergeFileSortAll(directory, SPLIT_FILE_PREFIX);
    }
}

 

你可能感兴趣的:(算法排序)