不了解哈夫曼树的可以移步查看我的另一篇博客:哈夫曼树(最优二叉树)
使用哈夫曼编码压缩文件,其实就是先得到每个字符的哈夫曼编码,每8位转为一个字节存到文件中;解压缩的时候,再将字节转为二进制,重新找到哈夫曼编码对应的字符,这样即可完成文件的解压缩。
文件解压缩的方法:
①将每个字符对应的权值存入压缩文件,在解压时重新构建哈夫曼树,遍历哈夫曼树来获得对应的字符
②将每个字符对应的哈夫曼编码以及长度存入压缩文件,在解压时根据每个字符对应哈夫曼编码的长度,来截取每个字符对应的哈夫曼编码
本博客使用:方法②。
方法①:在使用字节流传输时,如果某个字符对应的权值大于255,就会出现权值错误。这是由于Java在字节流传输时会将int转为byte,只保留int的低8位,而int为32位,超出8位的数值就会丢失。
具体参考该博客
当然可以使用字符流来传输就可以解决这个问题。
大体步骤:
注意:
Compress类
Compress.java
package com.kiger.fileDecompression;
import java.io.*;
import java.util.Arrays;
import java.util.Comparator;
import java.util.PriorityQueue;
/**
* @ClassName Compress
* @Description 压缩文件类
* @Author zk_kiger
* @Date 2019/11/7 18:55
* @Version 1.0
*/
public class Compress {
// Number of distinct byte values (extended ASCII); arrays are indexed by unsigned byte value.
static final int CHAR_INDEX = 256;
// Size of the read/write buffers, in bytes.
static final int BUFFER_SIZE = 128;
// Occurrence count (weight) of each byte value in the input file.
private int[] times = new int[CHAR_INDEX];
// Huffman code ("0"/"1" string) for each byte value; empty string when the byte never occurs.
private String[] huffmanCodes = new String[CHAR_INDEX];
// Min-priority queue ordered by node weight, used to build the Huffman tree bottom-up.
private PriorityQueue<Node> queue = new PriorityQueue<>(new Comparator<Node>() {
@Override
public int compare(Node o1, Node o2) {
return o1.getWeight() - o2.getWeight();
}
});
public Compress() {
for (int i = 0; i < huffmanCodes.length; i++) {
huffmanCodes[i] = "";
}
}
/**
 * Compresses a file using Huffman coding.
 * @param fromPath path of the file to compress
 * @param toPath path of the compressed output file
 */
public void compress(String fromPath, String toPath) {
compress_(fromPath, toPath);
}
private void compress_(String fromPath, String toPath) {
// 1. Read the file and count how often each byte value occurs.
statCharWeight(fromPath);
// 2. Build the Huffman tree from those weights.
Node root = createHuffman();
// 3. Walk the tree (pre-order) to fill in the code table.
getHuffmanCode(root, "");
System.out.println("正在压缩文件...");
// 4. Encode the file using the code table.
compressFile(fromPath, toPath);
System.out.println("文件压缩完成...");
}
// Bit-packing state shared by compressFile and writeInBuffer.
byte value = 0; // byte currently being filled, one bit at a time
int index = 0; // next bit position (0..7, MSB first) within `value`
int writeBufferSize = 0; // number of bytes currently held in writeBuffer
byte[] writeBuffer = new byte[BUFFER_SIZE];
int lastIndex = 0; // number of zero bits padding the last data byte
// Writes the output file: code-length header, bit-packed code table, then the encoded body.
private void compressFile(String fromPath, String toPath) {
File toFile = new File(toPath);
try (
BufferedInputStream bis = new BufferedInputStream(new FileInputStream(fromPath));
BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(toFile))
) {
// Header part 1: one byte per symbol holding its code length (256 bytes total),
// while concatenating all codes for header part 2.
StringBuilder code = new StringBuilder();
for (int i = 0; i < CHAR_INDEX; i++) {
bos.write(huffmanCodes[i].length());
code.append(huffmanCodes[i]);
}
// Header part 2: the concatenated codes themselves, bit-packed 8 bits per byte.
char[] charArray = code.toString().toCharArray();
for (int i = 0; i < charArray.length; i++) {
if (charArray[i] == '0')
value = CLR_BYTE(value, index);
if (charArray[i] == '1')
value = SET_BYTE(value, index);
index++;
if (index >= 8) {
index = 0;
writeInBuffer(bos, value);
}
}
// Flush a final partial header byte (its trailing bits are leftovers; the
// reader only consumes ceil(totalCodeLength/8) bytes, so they are ignored).
if (index != 0) {
writeInBuffer(bos, value);
}
// Body: encode the file contents.
index = 0;
value = 0;
byte[] bytes = new byte[BUFFER_SIZE];
int len;
double length = 0;
double fileTotalSize = (double)bis.available();
while ((len = bis.read(bytes)) != -1) {
length += len;
double jd = (length/fileTotalSize)*100;
System.out.printf("压缩进度:%.2f%%\n",jd);
// Concatenate the codes for every byte in this chunk.
StringBuilder sb = new StringBuilder();
for (int i = 0; i < len; i++) {
int temp = bytes[i];
if (temp < 0) {
// Java bytes are signed; map negative values back to 128..255.
sb.append(huffmanCodes[CHAR_INDEX + temp]);
} else {
sb.append(huffmanCodes[temp]);
}
}
// Pack the '0'/'1' string into bytes, 8 bits at a time.
char[] chars = sb.toString().toCharArray();
for (int i = 0; i < chars.length; i++) {
if (chars[i] == '0')
value = CLR_BYTE(value, index);
if (chars[i] == '1')
value = SET_BYTE(value, index);
index++;
if (index >= 8) {
writeInBuffer(bos, value);
index = 0;
}
}
}
// Trailer: flush any partial last byte, then write one byte holding the number
// of zero bits that pad it (0 when the data ended exactly on a byte boundary).
if (index != 0) {
lastIndex = 8 - index;
writeInBuffer(bos, value);
writeInBuffer(bos, (byte) lastIndex);
} else {
writeInBuffer(bos, (byte) lastIndex);
}
// Flush whatever is still sitting in the write buffer.
byte[] data = Arrays.copyOfRange(writeBuffer, 0, writeBufferSize);
bos.write(data);
} catch (IOException e) {
e.printStackTrace();
}
}
// Pre-order traversal of the Huffman tree: left edge appends '0', right edge '1';
// each leaf stores the accumulated code for its symbol index.
private void getHuffmanCode(Node root, String code) {
if (root.getLeftChild() != null)
getHuffmanCode(root.getLeftChild(), code + "0");
if (root.getRightChild() != null)
getHuffmanCode(root.getRightChild(), code + "1");
if (root.getLeftChild() == null && root.getRightChild() == null) {
huffmanCodes[root.getIndex()] = code;
}
}
// Builds the Huffman tree from the recorded weights and returns its root.
private Node createHuffman() {
// Seed the priority queue with one leaf per byte value that actually occurs.
for (int i = 0; i < times.length; i++) {
if (times[i] != 0){
queue.add(new Node(i, times[i]));
}
}
// Repeatedly merge the two lightest nodes until a single root remains.
while (queue.size() > 1) {
// lightest node
Node rightChild = queue.remove();
// second-lightest node
Node leftChild = queue.remove();
Node newNode = new Node(-1, rightChild.getWeight() + leftChild.getWeight());
newNode.setLeftChild(leftChild);
newNode.setRightChild(rightChild);
queue.add(newNode);
}
// NOTE(review): returns null for an empty input file, which would NPE in
// getHuffmanCode — confirm empty files are out of scope.
return queue.peek();
}
// Counts how many times each byte value occurs in the input file.
private void statCharWeight(String fromPath) {
try (
BufferedInputStream bis = new BufferedInputStream(new FileInputStream(fromPath))
) {
byte[] bytes = new byte[BUFFER_SIZE];
int len;
while ((len = bis.read(bytes)) != -1) {
// Tally each byte of the chunk into its weight slot.
for (int i = 0; i < len; i++) {
int temp = bytes[i];
// Java bytes are signed; map negatives back to their unsigned value.
if (temp < 0)
times[CHAR_INDEX + temp]++;
else
times[temp]++;
}
}
} catch (IOException e) {
e.printStackTrace();
}
}
// Sets bit `index` of value to 1, counting from the most significant bit.
private byte SET_BYTE(byte value, int index){
return (value) |= (1 << ((index) ^ 7));
}
// Sets bit `index` of value to 0, counting from the most significant bit.
private byte CLR_BYTE(byte value, int index){
return (value) &= (~(1 << ((index) ^ 7)));
}
// Buffers one byte, flushing the buffer to the stream whenever it fills up.
private void writeInBuffer(BufferedOutputStream bos, byte value) throws IOException {
if (writeBufferSize < BUFFER_SIZE) {
writeBuffer[writeBufferSize] = value;
if (++writeBufferSize >= BUFFER_SIZE) {
bos.write(writeBuffer);
writeBufferSize = 0;
}
}
}
}
大体步骤:
注意:
int num = value&0xff;
Decompress类
Decompress.java
package com.kiger.fileDecompression;
import java.io.*;
import java.util.*;
/**
* @ClassName Decompress
* @Description 解压类
* @Author zk_kiger
* @Date 2019/11/7 22:14
* @Version 1.0
*/
public class Decompress {
// Number of distinct byte values; arrays are indexed by unsigned byte value.
static final int CHAR_INDEX = 256;
// Size of the read/write buffers, in bytes.
static final int BUFFER_SIZE = 128;
// Huffman code length for each symbol, read from the 256-byte header.
private int[] codelengths = new int[CHAR_INDEX];
// Maps each Huffman code string back to its symbol (unsigned byte value).
private Map<String, Integer> huffmanMap = new HashMap<>();
// Min-priority queue ordered by weight (kept for the tree-rebuilding variant;
// unused by this code-table-based decode path).
private PriorityQueue<Node> queue = new PriorityQueue<>(new Comparator<Node>() {
@Override
public int compare(Node o1, Node o2) {
return o1.getWeight() - o2.getWeight();
}
});
public Decompress() {}
/**
 * Decompresses a file previously produced by the Compress class.
 * @param fromPath path of the compressed file
 * @param toPath path of the decompressed output file
 */
public void deCompress(String fromPath, String toPath) {
deCompress_(fromPath, toPath);
}
/**
 * Decompression pipeline:
 * 1. read the code table from the header and rebuild the code-to-symbol map;
 * 2. unpack the payload bytes back into a bit string;
 * 3. decode the bit string and write the recovered bytes to the output file.
 */
private void deCompress_(String fromPath, String toPath) {
System.out.println("开始解压缩文件...");
decompressFile(fromPath, toPath);
System.out.println("解压缩文件完成...");
}
// Reads the compressed payload, rebuilds the bit string, and decodes it symbol by symbol.
private void decompressFile(String fromPath, String toPath) {
// Header layout: 256 code-length bytes, then the bit-packed code table
// (both consumed by readHuffmanCode before the payload is read here).
try (
BufferedInputStream bis = new BufferedInputStream(new FileInputStream(fromPath));
BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(toPath))
){
// 1. Rebuild the code table from the header.
readHuffmanCode(bis);
// 2. Read and decode the remaining payload.
byte[] bytes = new byte[BUFFER_SIZE];
int len;
int lastIndex = -1;
double length = 0;
double fileTotalSize = (double)bis.available();
String codeString = "";
while ((len = bis.read(bytes)) != -1) {
length += len;
double jd = (length/fileTotalSize)*100;
System.out.printf("解压进度:%.2f%%\n",jd);
StringBuilder sb = new StringBuilder();
// Final chunk: the very last byte stores the zero-padding count and the byte
// before it needs that padding stripped, so exclude both from the bulk loop.
// NOTE(review): assumes both trailer bytes arrive in the same read() chunk.
if (bis.available() == 0) {
lastIndex = len-1;
len -= 2;
}
for (int i = 0; i < len; i++) {
// Expand each byte into its 8-character binary string.
sb.append(tranIntToBin(bytes[i]));
}
// For the last data byte, drop the padding zeros appended during compression.
if (lastIndex != -1) {
byte value = bytes[lastIndex-1];
int lastLen = bytes[lastIndex]&0xff;
String s = tranIntToBin(value);
sb.append(s, 0, s.length()-lastLen);
}
// Decode: grow a prefix of the bit string until it matches a code, emit the
// symbol, cut the prefix off, and restart; leftovers carry into the next chunk.
codeString += sb.toString();
for (int i = 0; i < codeString.length(); i++) {
String s = codeString.substring(0, i+1);
if (huffmanMap.containsKey(s)) {
writeInBuffer(bos, huffmanMap.get(s));
codeString = codeString.substring(i+1);
i = -1;
}
}
}
// Flush whatever is left in the write buffer.
byte[] data = Arrays.copyOfRange(writeBuffer, 0, writeBufferSize);
bos.write(data);
} catch (IOException e) {
e.printStackTrace();
}
}
// Reads the header and rebuilds the code-string -> symbol map.
// (The original comment said the header holds weights; it actually holds code lengths.)
private void readHuffmanCode(BufferedInputStream bis) {
try {
int temp;
int codeTotalLength = 0;
// First 256 bytes: the code length of each symbol.
for (int i = 0; i < codelengths.length; i++) {
temp = bis.read();
codelengths[i] = temp;
codeTotalLength += codelengths[i];
}
// The total code length tells us how many following bytes hold the packed
// codes: ceil(codeTotalLength / 8).
int length = codeTotalLength / 8;
if ((codeTotalLength%8) != 0)
length++;
byte[] bytes = new byte[length];
int len;
// Loop body runs once (note the break); it exists only to reuse the read idiom.
// NOTE(review): assumes read() fills the whole array in one call — confirm.
while ((len = bis.read(bytes)) != -1) {
StringBuilder sb = new StringBuilder();
for (int i = 0; i < bytes.length; i++) {
// Expand each byte into its 8-character binary string.
sb.append(tranIntToBin(bytes[i]));
}
String code = sb.toString();
// Slice the concatenated bit string into per-symbol codes using the lengths.
for (int i = 0; i < codelengths.length; i++) {
if (codelengths[i] != 0) {
String s = code.substring(0, codelengths[i]);
huffmanMap.put(s, i);
code = code.substring(codelengths[i]);
}
}
break;
}
} catch (IOException e) {
e.printStackTrace();
}
}
// Output buffering state shared with writeInBuffer.
int writeBufferSize = 0;
byte[] writeBuffer = new byte[BUFFER_SIZE];
// Buffers one byte, flushing the buffer to the stream whenever it fills up.
private void writeInBuffer(BufferedOutputStream bos, int value) throws IOException {
if (writeBufferSize < BUFFER_SIZE) {
writeBuffer[writeBufferSize] = (byte)value;
if (++writeBufferSize >= BUFFER_SIZE) {
bos.write(writeBuffer);
writeBufferSize = 0;
}
}
}
// Renders a byte as its 8-character binary string, MSB first.
private static String tranIntToBin(byte value) {
// Crucial: &0xff widens the signed byte to its unsigned int value (0..255).
int num = value&0xff;
String s = "";
for (int i = 0; i < 8; i++) {
s = num%2 + s;
num = num / 2;
}
return s;
}
}
测试类
RunTest.java
package com.kiger.fileDecompression;
import java.io.IOException;
/**
* @ClassName RunTest
* @Description TODO
* @Author zk_kiger
* @Date 2019/11/7 21:20
* @Version 1.0
*/
public class RunTest {
// Round-trip demo: compress test.txt into a .huffmanZip archive, then
// decompress that archive into test3.txt for comparison with the original.
public static void main(String[] args) throws IOException {
String sourcePath = "D:\\javaProject\\DataStructureAndAlgorithms\\Tree\\src\\com\\kiger\\fileDecompression\\test.txt";
String compressPath = "D:\\javaProject\\DataStructureAndAlgorithms\\Tree\\src\\com\\kiger\\fileDecompression\\test2.huffmanZip";
String decompressPath = "D:\\javaProject\\DataStructureAndAlgorithms\\Tree\\src\\com\\kiger\\fileDecompression\\test3.txt";
// Compress first, then decompress the freshly written archive.
new Compress().compress(sourcePath, compressPath);
new Decompress().deCompress(compressPath, decompressPath);
}
}
由于哈夫曼编码的压缩率较低,通常只能达到80%~90%,而且还需要在压缩文件中额外存入码表,所以整体压缩效果并不理想。