哈夫曼编码解压缩文件 - Java实现

文章目录

    • 前言
    • 一、文件压缩
    • 二、文件解压
    • 结语

前言

不了解哈夫曼树的可以移步查看我的另一篇博客:哈夫曼树(最优二叉树)

使用哈夫曼编码压缩文件,其实就是先得到每个字符的哈夫曼编码,再把编码拼接起来,每8位转为一个字节存到文件中;解压缩的时候,再将字节转回二进制,根据哈夫曼编码找到对应的字符,这样即可完成文件的解压缩。

文件解压缩的方法:
①将每个字符对应的权值存入压缩文件,在解压时重新构建哈夫曼树,遍历哈夫曼树来获得对应的字符
②将每个字符对应的哈夫曼编码以及长度存入压缩文件,在解压时根据每个字符对应哈夫曼编码的长度,来截取每个字符对应的哈夫曼编码

本博客使用:方法②。
不使用方法①的原因:使用字节流传输时,如果某个字符对应的权值大于255,就会出现权值错误。这是由于Java的字节流在写入时会将int转为byte,只取int的低8位,而int为32位,所以超出低8位的部分就会丢失。
具体参考该博客
当然可以使用字符流来传输就可以解决这个问题。

一、文件压缩

大体步骤:

  1. 读取文件,统计每个字符出现的次数(权值)
  2. 根据权值,创建哈夫曼树
  3. 遍历哈夫曼树,得到每个字符的哈夫曼编码
  4. 再次读取文件,将每个字符对应的哈夫曼编码拼接,每8位编码转为一个字节写入压缩文件

注意:

  • 字符可能出现特殊字符,byte值小于0,需要特殊处理(加上256转为0~255的下标),代码中有
  • 需要将码表(每个字符对应的长度、字符对应的哈夫曼编码)写入压缩文件,用于文件解压
  • 每8位转一个字节,如果最后不够8位就需要补0,所以需要将最后一个字节补0的个数写入文件
  • 使用缓存机制,减少io次数,提高效率

Compress类
Compress.java

package com.kiger.fileDecompression;

import java.io.*;
import java.util.Arrays;
import java.util.Comparator;
import java.util.PriorityQueue;

/**
 * Compresses a file with Huffman coding.
 *
 * <p>Layout of the produced file:
 * <ol>
 *   <li>256 bytes: the code length of every byte value 0..255 (0 = value does not occur);</li>
 *   <li>the codes of all occurring values, concatenated in value order and packed
 *       most-significant-bit first, zero-padded to a whole byte;</li>
 *   <li>the encoded content, packed the same way, zero-padded to a whole byte;</li>
 *   <li>one trailer byte: the number of padding bits in the last content byte.</li>
 * </ol>
 *
 * <p>{@code Node} is the Huffman tree node type; it is not declared in this class.
 *
 * @author zk_kiger
 * @version 1.0 (2019/11/7 18:55)
 */
public class Compress {
    /** Number of distinct byte values, and therefore of entries in the code table. */
    static final int CHAR_INDEX = 256;
    /** Size in bytes of the read chunk and of the staging write buffer. */
    static final int BUFFER_SIZE = 128;

    // Occurrence count of every byte value, indexed by the unsigned byte value.
    private int[] times = new int[CHAR_INDEX];
    // Huffman code ('0'/'1' string) of every byte value; "" means the value does not occur.
    private String[] huffmanCodes = new String[CHAR_INDEX];
    // Min-heap ordered by weight, used to build the Huffman tree.
    private PriorityQueue<Node> queue = new PriorityQueue<>(new Comparator<Node>() {
        @Override
        public int compare(Node o1, Node o2) {
            // Integer.compare cannot overflow the way a subtraction can.
            return Integer.compare(o1.getWeight(), o2.getWeight());
        }
    });

    // Bit-packing state: the byte being assembled and the number of bits already placed in it.
    byte value = 0;
    int index = 0;
    // Staging buffer that collects packed bytes before they are handed to the stream.
    int writeBufferSize = 0;
    byte[] writeBuffer = new byte[BUFFER_SIZE];
    // Number of padding bits in the last content byte (value of the trailer byte).
    int lastIndex = 0;

    public Compress() {
        Arrays.fill(huffmanCodes, "");
    }

    /**
     * Compresses a file.
     *
     * @param fromPath path of the file to compress
     * @param toPath   path of the compressed file to create
     * @throws UncheckedIOException if reading the source or writing the target fails
     */
    public void compress(String fromPath, String toPath) {
        compress_(fromPath, toPath);
    }

    private void compress_(String fromPath, String toPath) {
        // Start from a clean slate so one instance can compress several files.
        resetState();

        // 1. Read the file and count how often each byte value occurs.
        statCharWeight(fromPath);

        // 2. Build the Huffman tree from the counts (null for an empty file).
        Node root = createHuffman();

        // 3. Derive the code table from the tree.
        if (root != null) {
            if (root.getLeftChild() == null && root.getRightChild() == null) {
                // Only one distinct byte value: the root is a leaf and would get the empty
                // code, which carries no information. Give it a one-bit code instead.
                huffmanCodes[root.getIndex()] = "0";
            } else {
                getHuffmanCode(root, "");
            }
        }

        System.out.println("正在压缩文件...");
        // 4. Encode the file with the code table.
        compressFile(fromPath, toPath);
        System.out.println("文件压缩完成...");
    }

    // Clears everything a previous compress() call may have left behind.
    private void resetState() {
        Arrays.fill(times, 0);
        Arrays.fill(huffmanCodes, "");
        queue.clear();
        value = 0;
        index = 0;
        writeBufferSize = 0;
        lastIndex = 0;
    }

    // Writes the code table, the encoded content and the trailer to the target file.
    private void compressFile(String fromPath, String toPath) {
        try (
             BufferedInputStream bis = new BufferedInputStream(new FileInputStream(fromPath));
             BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(toPath))
                ) {
            writeCodeTable(bos);
            writeContent(bis, bos, new File(fromPath).length());

            // Hand over whatever is still waiting in the staging buffer.
            bos.write(writeBuffer, 0, writeBufferSize);
            writeBufferSize = 0;
        } catch (IOException e) {
            throw new UncheckedIOException("Failed to compress " + fromPath + " to " + toPath, e);
        }
    }

    // Writes the 256 code lengths followed by the packed codes themselves.
    private void writeCodeTable(BufferedOutputStream bos) throws IOException {
        // A Huffman code over 256 symbols is at most 255 bits long, so a length fits in one byte.
        for (int symbol = 0; symbol < CHAR_INDEX; symbol++) {
            bos.write(huffmanCodes[symbol].length());
        }
        for (int symbol = 0; symbol < CHAR_INDEX; symbol++) {
            packBits(bos, huffmanCodes[symbol]);
        }
        // The table is padded to a whole byte; the content starts on a fresh byte.
        if (index != 0) {
            writeInBuffer(bos, value);
            value = 0;
            index = 0;
        }
    }

    // Encodes the source content and appends the trailer byte (padding bit count).
    private void writeContent(BufferedInputStream bis, BufferedOutputStream bos, long fileTotalSize)
            throws IOException {
        byte[] chunk = new byte[BUFFER_SIZE];
        long bytesRead = 0;
        int len;
        while ((len = bis.read(chunk)) != -1) {
            bytesRead += len;
            double jd = ((double) bytesRead / fileTotalSize) * 100;
            System.out.printf("压缩进度:%.2f%%\n", jd);
            for (int i = 0; i < len; i++) {
                // & 0xff maps the signed byte onto its 0..255 table index.
                packBits(bos, huffmanCodes[chunk[i] & 0xff]);
            }
        }

        if (index != 0) {
            // Last content byte is only partly filled: flush it and record the padding.
            lastIndex = 8 - index;
            writeInBuffer(bos, value);
            writeInBuffer(bos, (byte) lastIndex);
            value = 0;
            index = 0;
        } else if (bytesRead == 0) {
            // Empty input: emit one all-padding content byte so the payload still has the
            // "content byte + trailer byte" shape the decompressor expects.
            lastIndex = 8;
            writeInBuffer(bos, (byte) 0);
            writeInBuffer(bos, (byte) lastIndex);
        } else {
            lastIndex = 0;
            writeInBuffer(bos, (byte) lastIndex);
        }
    }

    // Appends a '0'/'1' string to the packed output, most significant bit first.
    private void packBits(BufferedOutputStream bos, String bits) throws IOException {
        for (int i = 0; i < bits.length(); i++) {
            if (bits.charAt(i) == '1') {
                value |= 1 << (7 - index);
            }
            if (++index == 8) {
                writeInBuffer(bos, value);
                // Reset so that padding bits of a later partial byte really are zero.
                value = 0;
                index = 0;
            }
        }
    }

    // Walks the tree and records the code of every leaf: left edge = '0', right edge = '1'.
    private void getHuffmanCode(Node root, String code) {
        Node left = root.getLeftChild();
        Node right = root.getRightChild();
        if (left == null && right == null) {
            huffmanCodes[root.getIndex()] = code;
            return;
        }
        if (left != null) {
            getHuffmanCode(left, code + "0");
        }
        if (right != null) {
            getHuffmanCode(right, code + "1");
        }
    }

    // Builds the Huffman tree from the counts; returns null when no byte value occurs.
    private Node createHuffman() {
        // One leaf per occurring byte value.
        for (int symbol = 0; symbol < times.length; symbol++) {
            if (times[symbol] != 0) {
                queue.add(new Node(symbol, times[symbol]));
            }
        }

        // Repeatedly merge the two lightest nodes until a single root remains.
        while (queue.size() > 1) {
            // Lightest node becomes the right child, the next lightest the left child.
            Node rightChild = queue.remove();
            Node leftChild = queue.remove();
            Node parent = new Node(-1, rightChild.getWeight() + leftChild.getWeight());
            parent.setLeftChild(leftChild);
            parent.setRightChild(rightChild);
            queue.add(parent);
        }

        // poll() leaves the queue empty and yields null for an empty input.
        return queue.poll();
    }

    // Counts the occurrences of every byte value in the source file.
    private void statCharWeight(String fromPath) {
        try (
                BufferedInputStream bis = new BufferedInputStream(new FileInputStream(fromPath))
                ) {
            byte[] chunk = new byte[BUFFER_SIZE];
            int len;
            while ((len = bis.read(chunk)) != -1) {
                for (int i = 0; i < len; i++) {
                    times[chunk[i] & 0xff]++;
                }
            }
        } catch (IOException e) {
            throw new UncheckedIOException("Failed to read " + fromPath, e);
        }
    }

    // Stages one byte; the staging buffer is written out whenever it becomes full.
    private void writeInBuffer(BufferedOutputStream bos, byte value) throws IOException {
        writeBuffer[writeBufferSize++] = value;
        if (writeBufferSize >= BUFFER_SIZE) {
            bos.write(writeBuffer);
            writeBufferSize = 0;
        }
    }

}

二、文件解压

大体步骤:

  1. 读取文件记录每个字符对应的哈夫曼编码长度
  2. 根据每个字符哈夫曼编码长度截取每个字符的哈夫曼编码
  3. 读取文件内容哈夫曼编码找到对应的字符,并写入解压文件中

注意:

  • 在读取字节转为二进制时,需要将字节先强转为整型(因为在字节中可能出现大于127的值,转为byte时,就会存入该值的补码,那么就需要先转为整型再转二进制,否则会出现错误的二进制转换)
int num = value&0xff;
  • 压缩文件的最后一个字节存储的是前一个字节(最后一个数据字节)末尾补0的个数,因此需要对最后2个字节做特殊处理

Decompress类
Decompress.java

package com.kiger.fileDecompression;

import java.io.*;
import java.util.*;

/**
 * Restores a file produced by the Huffman compressor of this package.
 *
 * <p>Expected layout of the compressed file:
 * <ol>
 *   <li>256 bytes: the code length of every byte value 0..255 (0 = value does not occur);</li>
 *   <li>the codes of all occurring values, concatenated in value order and packed
 *       most-significant-bit first, padded to a whole byte;</li>
 *   <li>the encoded content, packed the same way, padded to a whole byte;</li>
 *   <li>one trailer byte: the number of padding bits in the last content byte.</li>
 * </ol>
 *
 * @author zk_kiger
 * @version 1.0 (2019/11/7 22:14)
 */
public class Decompress {
    /** Number of distinct byte values, and therefore of entries in the code table. */
    static final int CHAR_INDEX = 256;
    /** Size in bytes of the read chunk and of the staging write buffer. */
    static final int BUFFER_SIZE = 128;

    // Code length of every byte value, as read from the file header.
    private int[] codelengths = new int[CHAR_INDEX];
    // Maps each Huffman code ('0'/'1' string) to the byte value it stands for.
    private Map<String, Integer> huffmanMap = new HashMap<>();
    // Longest code in the table; a longer undecodable bit run means the file is corrupt.
    private int maxCodeLength = 0;

    // Staging buffer that collects decoded bytes before they are handed to the stream.
    int writeBufferSize = 0;
    byte[] writeBuffer = new byte[BUFFER_SIZE];

    public Decompress() {}

    /**
     * Decompresses a file.
     *
     * @param fromPath path of the compressed file
     * @param toPath   path of the restored file to create
     * @throws UncheckedIOException if reading or writing fails, or the input is truncated or corrupt
     */
    public void deCompress(String fromPath, String toPath) {
        deCompress_(fromPath, toPath);
    }

    private void deCompress_(String fromPath, String toPath) {
        System.out.println("开始解压缩文件...");
        decompressFile(fromPath, toPath);
        System.out.println("解压缩文件完成...");
    }

    // Reads the code table, then decodes the content bit by bit into the target file.
    private void decompressFile(String fromPath, String toPath) {
        // Start from a clean slate so one instance can decompress several files.
        huffmanMap.clear();
        maxCodeLength = 0;
        writeBufferSize = 0;

        try (
                BufferedInputStream bis = new BufferedInputStream(new FileInputStream(fromPath));
                BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(toPath))
                ) {
            // 1. Header: 256 code lengths followed by the packed codes.
            readHuffmanCode(bis);

            // 2. Content. The final two bytes (last content byte + trailer) need special
            //    treatment, and a chunk boundary may fall anywhere, so the two most recent
            //    bytes are always held back until it is known whether more data follows.
            byte[] chunk = new byte[BUFFER_SIZE];
            byte[] held = new byte[2];
            int heldCount = 0;
            StringBuilder candidate = new StringBuilder();
            double length = 0;
            double fileTotalSize = (double) bis.available();
            int len;
            while ((len = bis.read(chunk)) != -1) {
                length += len;
                double jd = (length / fileTotalSize) * 100;
                System.out.printf("解压进度:%.2f%%\n", jd);
                for (int i = 0; i < len; i++) {
                    if (heldCount < 2) {
                        held[heldCount++] = chunk[i];
                        continue;
                    }
                    // A newer byte arrived, so held[0] is an ordinary full content byte.
                    decodeBits(bos, candidate, held[0], 8);
                    held[0] = held[1];
                    held[1] = chunk[i];
                }
            }

            // 3. Tail: held[0] is the last content byte, held[1] its padding bit count.
            //    With fewer than two payload bytes there is no content to decode.
            if (heldCount == 2) {
                int padding = held[1] & 0xff;
                if (padding > 8) {
                    throw new IOException("Invalid padding bit count in trailer: " + padding);
                }
                decodeBits(bos, candidate, held[0], 8 - padding);
            }

            bos.write(writeBuffer, 0, writeBufferSize);
            writeBufferSize = 0;
        } catch (IOException e) {
            throw new UncheckedIOException("Failed to decompress " + fromPath + " to " + toPath, e);
        }
    }

    // Feeds the top bitCount bits of one packed byte to the decoder, emitting every
    // byte value whose code is completed along the way.
    private void decodeBits(BufferedOutputStream bos, StringBuilder candidate, byte packed, int bitCount)
            throws IOException {
        for (int bit = 7; bit > 7 - bitCount; bit--) {
            candidate.append(((packed >> bit) & 1) == 1 ? '1' : '0');
            Integer symbol = huffmanMap.get(candidate.toString());
            if (symbol != null) {
                writeInBuffer(bos, symbol);
                candidate.setLength(0);
            } else if (candidate.length() > maxCodeLength) {
                // No code is this long, so the bit stream cannot belong to this table.
                throw new IOException("Corrupt data: bit sequence matches no Huffman code");
            }
        }
    }

    // Reads the file header and rebuilds the code -> byte value map.
    private void readHuffmanCode(BufferedInputStream bis) throws IOException {
        // The first 256 bytes hold the code length of each byte value.
        int codeTotalLength = 0;
        for (int i = 0; i < codelengths.length; i++) {
            int length = bis.read();
            if (length < 0) {
                throw new EOFException("Truncated code length table");
            }
            codelengths[i] = length;
            codeTotalLength += length;
        }

        // The codes follow, packed into ceil(total bits / 8) bytes. A single read() may
        // return fewer bytes than requested, so keep reading until the table is complete.
        byte[] packed = new byte[(codeTotalLength + 7) / 8];
        int filled = 0;
        while (filled < packed.length) {
            int n = bis.read(packed, filled, packed.length - filled);
            if (n < 0) {
                throw new EOFException("Truncated Huffman code table");
            }
            filled += n;
        }

        // Slice the bit stream into the individual codes, in byte-value order.
        int bitPos = 0;
        for (int symbol = 0; symbol < codelengths.length; symbol++) {
            int length = codelengths[symbol];
            if (length == 0) {
                continue;
            }
            StringBuilder code = new StringBuilder(length);
            for (int k = 0; k < length; k++, bitPos++) {
                int bit = (packed[bitPos >> 3] >> (7 - (bitPos & 7))) & 1;
                code.append(bit == 1 ? '1' : '0');
            }
            huffmanMap.put(code.toString(), symbol);
            maxCodeLength = Math.max(maxCodeLength, length);
        }
    }

    // Stages one decoded byte; the staging buffer is written out whenever it becomes full.
    private void writeInBuffer(BufferedOutputStream bos, int value) throws IOException {
        writeBuffer[writeBufferSize++] = (byte) value;
        if (writeBufferSize >= BUFFER_SIZE) {
            bos.write(writeBuffer);
            writeBufferSize = 0;
        }
    }

}

测试类
RunTest.java

package com.kiger.fileDecompression;

import java.io.IOException;

/**
 * Manual round-trip check: compresses a file and decompresses the result again.
 *
 * <p>Usage: {@code RunTest [sourcePath [compressPath [decompressPath]]]}. Every path
 * that is not given on the command line falls back to its built-in default.
 *
 * @author zk_kiger
 * @version 1.0 (2019/11/7 21:20)
 */
public class RunTest {
    // Default locations used when no command-line arguments are supplied.
    private static final String DEFAULT_SOURCE_PATH =
            "D:\\javaProject\\DataStructureAndAlgorithms\\Tree\\src\\com\\kiger\\fileDecompression\\test.txt";
    private static final String DEFAULT_COMPRESS_PATH =
            "D:\\javaProject\\DataStructureAndAlgorithms\\Tree\\src\\com\\kiger\\fileDecompression\\test2.huffmanZip";
    private static final String DEFAULT_DECOMPRESS_PATH =
            "D:\\javaProject\\DataStructureAndAlgorithms\\Tree\\src\\com\\kiger\\fileDecompression\\test3.txt";

    /**
     * Compresses the source file, then decompresses the compressed file.
     *
     * @param args optional overrides: source path, compressed path, decompressed path
     * @throws IOException declared for callers; kept from the original signature
     */
    public static void main(String[] args) throws IOException {
        String sourcePath = argOrDefault(args, 0, DEFAULT_SOURCE_PATH);
        String compressPath = argOrDefault(args, 1, DEFAULT_COMPRESS_PATH);
        String decompressPath = argOrDefault(args, 2, DEFAULT_DECOMPRESS_PATH);

        Compress compress = new Compress();
        compress.compress(sourcePath, compressPath);
        Decompress decompress = new Decompress();
        decompress.deCompress(compressPath, decompressPath);
    }

    // Returns args[position] when present, otherwise the given fallback.
    private static String argOrDefault(String[] args, int position, String fallback) {
        return args != null && args.length > position ? args[position] : fallback;
    }
}

结语

由于哈夫曼编码压缩文件的压缩率较低,只能达到80%~90%之间,而且还要额外存入码表,所以整体效率不高。

你可能感兴趣的:(数据结构)