最近闲来无事,在小破站看到了Huffman编码的原理,就想着用Java自己实现一波。Huffman编码的原理是:统计每个字符出现的次数作为权值来构建Huffman树,然后约定左分支为0、右分支为1,从根节点到叶子节点的路径就是该字符的编码。因为字符对应的节点都是叶子节点,所以任何一个字符的编码都不会是另一个字符编码的前缀(即前缀码),每个字符的编码唯一,并且可以无歧义地解码,以此实现对字符串的压缩编解码。话不多说,直接上代码:
首先定义树节点对象
public static class HTNode {
/**
* 左子节点
*/
private HTNode left;
/**
* 右子节点
*/
private HTNode right;
/**
* 权重
*/
private int power;
/**
* 对应的字符
*/
private E v;
public HTNode(int power){
this.setPower(power);
}
public HTNode(){
}
public HTNode(int power, E v){
this.power = power;
this.v = v;
}
public HTNode getLeft() {
return left;
}
public void setLeft(HTNode left) {
this.left = left;
}
public HTNode getRight() {
return right;
}
public void setRight(HTNode right) {
this.right = right;
}
public int getPower() {
return power;
}
public void setPower(int power) {
this.power = power;
}
public E getV() {
return v;
}
public void setV(E v) {
this.v = v;
}
/**
* 前序遍历函数
* @param node
*/
public void printfVLR(HTNode node){
if(node == null){
return;
}
System.out.println(node.power+" "+node.v);
//遍历左子树
printfVLR(node.left);
//遍历右子树
printfVLR(node.right);
}
}
接下来写个排序函数,就用快排吧
/**
 * Sorts {@code arr[start..end]} (both bounds inclusive) in place, in
 * ascending order of {@code getPower()}, using quicksort.
 *
 * @param arr   array to sort
 * @param start index of the first element of the range
 * @param end   index of the last element of the range
 * @param <T>   node type; must expose {@code getPower()}
 */
public static <T extends TestHuffmanTree.HTNode> void quickSortT(T[] arr, int start, int end) {
    // Ranges with fewer than two elements are already sorted.
    if (start >= end) {
        return;
    }
    int midIdx = subQuickSort(arr, start, end);
    // Recurse on the left part of THIS range. Starting from index 0 here
    // would re-sort the whole already-sorted prefix on every right-hand call.
    quickSortT(arr, start, midIdx - 1);
    quickSortT(arr, midIdx + 1, end);
}
/**
 * Partitions {@code arr[start..end]} (both bounds inclusive) around the
 * element initially at {@code arr[start]}.
 *
 * <p>After the call, elements left of the returned index have a weight not
 * greater than the pivot's, and elements right of it have a weight not
 * smaller than the pivot's.
 *
 * @param arr   array to partition
 * @param start index of the first element of the range; also the pivot position
 * @param end   index of the last element of the range
 * @param <T>   node type; must expose {@code getPower()}
 * @return the final index of the pivot element
 */
public static <T extends TestHuffmanTree.HTNode> int subQuickSort(T[] arr, int start, int end) {
    // Lifting the pivot out leaves a "hole" at arr[start] that the scans below fill.
    T mid = arr[start];
    while (start < end) {
        // Scan from the right for an element smaller than the pivot.
        while (start < end && arr[end].getPower() >= mid.getPower()) {
            end--;
        }
        arr[start] = arr[end];
        // Scan from the left for an element larger than the pivot.
        while (start < end && arr[start].getPower() <= mid.getPower()) {
            start++;
        }
        arr[end] = arr[start];
    }
    // start == end: this is the hole where the pivot belongs.
    arr[start] = mid;
    return start;
}
然后是构建Huffman树的函数:
/**
 * Builds a Huffman tree from a table of character frequencies.
 */
public class TestHuffmanTree {
    /** Root of the tree produced by the most recent successful build. */
    private static HTNode root;

    /**
     * Builds a Huffman tree for the given frequency table.
     *
     * @param map character to occurrence-count table
     * @return the root of the tree, or {@code null} when {@code map} is empty
     */
    public static HTNode getHuffmanTree(Map<Character, Integer> map) {
        return huffmanTree(transfer(map));
    }

    /**
     * Converts a frequency table into an array of leaf nodes, one per entry,
     * using the count as the node weight and the character as its symbol.
     *
     * @param map character to occurrence-count table
     * @return one leaf node per map entry, in the map's iteration order
     */
    public static HTNode[] transfer(Map<Character, Integer> map) {
        HTNode[] htNodes = new HTNode[map.size()];
        int i = 0;
        for (Map.Entry<Character, Integer> entry : map.entrySet()) {
            htNodes[i++] = new HTNode(entry.getValue(), entry.getKey());
        }
        return htNodes;
    }

    /**
     * Repeatedly merges the two lightest nodes until a single root remains,
     * then stores that root and prints the elapsed time in nanoseconds.
     *
     * @param arr nodes to combine
     * @return the root of the tree, or {@code null} for a null or empty array
     */
    private static HTNode huffmanTree(HTNode[] arr) {
        long start = System.nanoTime();
        if (arr == null || arr.length == 0) {
            return null;
        }
        while (arr.length > 1) {
            // Bring the two lightest nodes to the front.
            QuickSort.quickSortT(arr, 0, arr.length - 1);
            HTNode n1 = arr[0];
            HTNode n2 = arr[1];
            // The merged node carries the combined weight and no symbol.
            HTNode newNode = new HTNode(n1.getPower() + n2.getPower());
            newNode.setLeft(n1);
            newNode.setRight(n2);
            // Drop one slot and put the merged node in place of the other,
            // shrinking the working set by one node per round.
            arr = Arrays.copyOfRange(arr, 1, arr.length);
            arr[0] = newNode;
        }
        root = arr[0];
        System.out.println("耗时:" + (System.nanoTime() - start));
        return root;
    }
}
然后是对huffman树进行编码
/**
 * Demo program: counts the characters of a sample sentence, builds a Huffman
 * tree from the counts, derives a bit string per character and prints the
 * encoded sentence.
 */
public class TestHuffmanCode {
    /** Character to Huffman code table, filled by {@link #getHuffmanCode}. */
    static Map<Character, String> huffmanCodeMap = new HashMap<>();

    /** Path from the root to the node currently being visited ('0' = left, '1' = right). */
    static StringBuilder sb = new StringBuilder();

    /** Characters of the text being encoded. */
    static char[] chars;

    public static void main(String[] args) {
        String s = "i have a dream, this dream depends on american dream";
        chars = s.toCharArray();
        Map<Character, Integer> charMap = getCharMap(s);
        TestHuffmanTree.HTNode huffmanTree = TestHuffmanTree.getHuffmanTree(charMap);
        huffmanTree.printfVLR(huffmanTree);
        getHuffmanCode(huffmanTree, null);
        huffmanCodeMap.forEach((k, v) -> System.out.println(k + "==" + v));
        // Use a builder: repeated String concatenation in a loop is quadratic.
        StringBuilder result = new StringBuilder();
        for (char c : chars) {
            result.append(huffmanCodeMap.get(c));
        }
        System.out.println("最终编码为:" + result);
    }

    /**
     * Counts how many times each character occurs in {@code s}.
     *
     * @param s text to analyse
     * @return character to occurrence-count table
     */
    private static Map<Character, Integer> getCharMap(String s) {
        Map<Character, Integer> map = new HashMap<>();
        for (char c : s.toCharArray()) {
            map.merge(c, 1, Integer::sum);
        }
        return map;
    }

    /**
     * Walks the tree in pre-order and records the code of every node that
     * carries a symbol in {@link #huffmanCodeMap}.
     *
     * @param node      root of the subtree to visit; {@code null} is ignored
     * @param direction branch taken to reach {@code node}: 0 for left, 1 for
     *                  right, {@code null} for the root of the whole tree
     */
    private static void getHuffmanCode(TestHuffmanTree.HTNode node, Integer direction) {
        if (node == null) {
            return;
        }
        if (direction != null) {
            sb.append(direction);
        }
        if (node.getV() != null) {
            // A tree made of a single leaf has an empty path; give that lone
            // symbol the one-bit code "0" so the encoded text is not empty.
            String code = sb.length() == 0 ? "0" : sb.toString();
            huffmanCodeMap.put((Character) node.getV(), code);
        } else {
            getHuffmanCode(node.getLeft(), 0);
            getHuffmanCode(node.getRight(), 1);
        }
        // Backtrack: remove the bit appended for this node before returning
        // to the parent. The root appended nothing, so it removes nothing.
        if (direction != null) {
            sb.deleteCharAt(sb.length() - 1);
        }
    }
}
接下来是打印结果
耗时:698300
52 null
21 null
9
12 null
5 d
7 a
31 null
14 null
7 null
3 i
4 m
7 e
17 null
8 null
4 null
2 s
2 null
1 p
1 o
4 r
9 null
4 null
2 h
2 null
1 v
1 c
5 null
2 null
1 ,
1 t
3 n
==00
a==011
c==111011
d==010
e==101
h==11100
i==1000
,==111100
m==1001
n==11111
o==110011
p==110010
r==1101
s==11000
t==111101
v==111010
最终编码为:1000001110001111101010100011000101101101011100111110000111101111001000110000001011011010111001000101011100101011111101011000001100111111100011100110111011000111011011111110001011011010111001
完毕!