在数据膨胀,信息爆炸的今天,数据压缩的意义不言而喻。谈到数据压缩,就不能不提赫夫曼(Huffman)编码,赫夫曼编码是首个实用的压缩编码方案,即使在今天的许多知名压缩算法里,依然可以见到赫夫曼编码的影子。
另外,在数据通信中,用二进制给每个字符进行编码时,不得不面对的一个问题是:如何使电文总长度最短而又不产生二义性。根据字符出现的频率,利用赫夫曼树可以构造出一种不等长的二进制编码,使编码后的电文长度最短,同时保证不产生二义性。
赫夫曼树
以下程序在效率上有什么问题呢?
根据老师出的一份试卷各个分数段人数的比例,可以看出占70%的70-89分数段的学生要进行两次比较才能判断出成绩。
我们把上面的流程改为以下,效果可能有明显的改善:
我们先把这两棵二叉树简化成叶子结点带权的二叉树(注:与树结点间的连线相关的数叫做权,Weight)。
相关定义
1. 结点的路径长度: 从根结点到该结点的路径上的连接数。
2. 树的路径长度:树中每个叶子结点的路径长度之和。
3. 结点带权路径长度: 结点的路径长度与结点权值的乘积。
4. 树的带权路径长度: WPL(Weighted Path Length)是树中所有叶子结点的带权路径长度之和。
WPL的值越小,说明构造出来的二叉树性能越优。
下面演示了用Huffman算法构造一棵Huffman树的过程:
赫夫曼编码
赫夫曼编码可以很有效地压缩数据(通常可以节省20%-90%的空间,具体压缩率依赖于数据的特性)。
名词解释: 定长编码,变长编码,前缀码
定长编码: 像ASCII编码
变长编码: 单个编码的长度不一致,可以根据整体出现频率来调节
前缀码:所谓的前缀码,就是没有任何码字是其他码字的前缀
赫夫曼编码的整体结构示意图如下:
赫夫曼编码的编码和解码代码实现:
queue.h
#pragma once
#ifndef _PQUEUE_H
#define _PQUEUE_H
#include "huffman.h"
/* Payload type stored in each queue node: a pointer to a Huffman tree node. */
#define TYPE htNode *
/* Capacity limit: addPQueue refuses to insert once the queue size equals this value. */
#define MAX_SZ 256
/* One link of the singly linked list that backs the priority queue. */
typedef struct _pQueueNode {
    TYPE val;                  /* payload carried by this link */
    unsigned priority;         /* ordering key; addPQueue keeps the list sorted ascending */
    struct _pQueueNode *next;  /* following link, or NULL at the tail */
} pQueueNode;
/* Priority queue header: element count plus the head of the sorted list. */
typedef struct _pQueue {
    unsigned size;      /* number of links currently in the list */
    pQueueNode *first;  /* head of the list, NULL while the queue is empty */
} pQueue;
/* Allocates a queue header, marks it empty and stores it in *queue. */
void initPQueue(pQueue **queue);
/*
 * Inserts val into *queue, keeping the list ordered by ascending priority.
 * Prints a message and inserts nothing when the queue already holds MAX_SZ items.
 */
void addPQueue(pQueue **queue, TYPE val, unsigned int priority);
/*
 * Removes the first element of *queue and returns its value.
 * Prints a message and returns NULL when the queue is empty.
 */
TYPE getPQueue(pQueue **queue);
#endif // !_PQUEUE_H
queue.cpp
#include "stdafx.h"
#include "queue.h"
#include <stdio.h>
#include <stdlib.h>
/*
 * Creates an empty priority queue and stores it in *queue.
 *
 * queue: address of the caller's queue pointer; ignored when NULL.
 *
 * On allocation failure *queue is set to NULL so the caller can detect it.
 * The caller owns the returned queue and must eventually free() it.
 */
void initPQueue(pQueue **queue) {
    if (queue == NULL) {
        return;
    }

    /* The cast keeps the file compilable as C++ as well as C. */
    pQueue *fresh = (pQueue *)malloc(sizeof *fresh);
    if (fresh != NULL) {
        fresh->first = NULL;
        fresh->size = 0;
    }
    *queue = fresh;
}
/*
 * Inserts val into *queue so that the list stays ordered by ascending priority.
 *
 * queue:    address of the queue pointer; nothing happens when it or *queue is NULL.
 * val:      payload to store (ownership stays with the caller if the insert is refused).
 * priority: ordering key; a new element is placed in front of existing
 *           elements with the same priority.
 *
 * The insert is refused, with a message on stdout, when the queue already
 * holds MAX_SZ elements or when no memory is available for the new link.
 * Callers can detect a refused insert by the unchanged size field.
 */
void addPQueue(pQueue **queue, TYPE val, unsigned int priority) {
    if (queue == NULL || *queue == NULL) {
        return;
    }

    pQueue *q = *queue;
    if (q->size == MAX_SZ) {
        printf("\nQueue is full.\n");
        return;
    }

    pQueueNode *node = (pQueueNode *)malloc(sizeof *node);
    if (node == NULL) {
        printf("\nOut of memory.\n");
        return;
    }
    node->priority = priority;
    node->val = val;

    /*
     * Walk the chain of "next" pointers (starting with the head pointer itself)
     * and stop at the first link whose priority is not smaller than the new one.
     * This covers the empty list, insertion at the head, in the middle and at
     * the tail with a single code path.
     */
    pQueueNode **link = &q->first;
    while (*link != NULL && (*link)->priority < priority) {
        link = &(*link)->next;
    }
    node->next = *link;
    *link = node;
    q->size++;
}
/*
 * Removes the first (lowest-priority-value) element of *queue and returns
 * its payload.
 *
 * queue: address of the queue pointer.
 *
 * Returns NULL when queue or *queue is NULL. Returns NULL and prints a message
 * when the queue is empty. The list link that carried the payload is released
 * here; the payload itself now belongs to the caller.
 */
TYPE getPQueue(pQueue **queue) {
    TYPE returnValue = NULL;

    if (queue == NULL || *queue == NULL) {
        return returnValue;
    }

    pQueue *q = *queue;
    if (q->size > 0 && q->first != NULL) {
        pQueueNode *head = q->first;
        returnValue = head->val;
        q->first = head->next;
        q->size--;
        /* The link was allocated by addPQueue and is no longer reachable. */
        free(head);
    } else {
        printf("\nQueue is empty.\n");
    }
    return returnValue;
}
huffman.h
#pragma once
#ifndef _HUFFMAN_H
#define _HUFFMAN_H
/* Node of the Huffman tree; a node without children is a leaf. */
typedef struct _htNode {
    char symbol;            /* character stored in the node */
    struct _htNode *left;   /* subtree reached with code digit '0', or NULL */
    struct _htNode *right;  /* subtree reached with code digit '1', or NULL */
} htNode;
/* Handle for a whole Huffman tree. */
typedef struct _htTree {
    htNode *root;  /* topmost node of the tree */
} htTree;
/* Entry of the code table: one symbol together with its code string. */
typedef struct _hlNode {
    char symbol;           /* character this entry describes */
    char *code;            /* NUL-terminated string of '0'/'1' digits */
    struct _hlNode *next;  /* following entry, or NULL at the end of the table */
} hlNode;
/* Code table kept as a singly linked list with head and tail pointers. */
typedef struct _hlTable {
    hlNode *first;  /* first entry, NULL while the table is empty */
    hlNode *last;   /* last entry, used to append new entries */
} hlTable;
#endif // !_HUFFMAN_H
huffman.cpp
#include "stdafx.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "huffman.h"
#include "queue.h"
#pragma warning(disable:4996)
/*
 * Walks the subtree rooted at treeNode and appends one table entry per leaf.
 *
 * treeNode: current node; a NULL node is ignored.
 * table:    address of the table pointer that receives the entries.
 * k:        depth of treeNode, i.e. the number of code digits already in code.
 * code:     scratch buffer of 256 bytes holding the digits of the path so far;
 *           '0' is written for a step to the left, '1' for a step to the right.
 *
 * Each entry owns a heap copy of its code string. A leaf is skipped when
 * memory for its entry cannot be allocated. Paths that would not fit into
 * the 256-byte buffer are not followed.
 */
void traverseTree(htNode *treeNode, hlTable **table, int k, char code[256]) {
    if (treeNode == NULL || table == NULL || *table == NULL || code == NULL) {
        return;
    }
    if (k < 0 || k >= 256) {
        return;
    }

    if (treeNode->left == NULL && treeNode->right == NULL) {
        code[k] = '\0';

        hlNode *entry = (hlNode *)malloc(sizeof *entry);
        if (entry == NULL) {
            return;
        }
        /* k digits plus the terminator written just above. */
        entry->code = (char *)malloc((size_t)k + 1);
        if (entry->code == NULL) {
            free(entry);
            return;
        }
        memcpy(entry->code, code, (size_t)k + 1);
        entry->symbol = treeNode->symbol;
        entry->next = NULL;

        if ((*table)->first == NULL) {
            (*table)->first = entry;
        } else {
            (*table)->last->next = entry;
        }
        (*table)->last = entry;
        return;
    }

    if (treeNode->left != NULL) {
        code[k] = '0';
        traverseTree(treeNode->left, table, k + 1, code);
    }
    if (treeNode->right != NULL) {
        code[k] = '1';
        traverseTree(treeNode->right, table, k + 1, code);
    }
}
htTree * buildTree(char *inputString) {
int *probability = (int *)malloc(sizeof(int) * 256);
//初始化
for (int i = 0; i < 256; i++)
{
probability[i] = 0;
}
//统计待编码的字符串各个字符出现的次数
for (int j = 0; inputString[j] != '\0'; j++)
{
probability[(unsigned char)inputString[j]]++;
}
//pQueue队列的头指针
pQueue *huffmanQueue;
initPQueue(&huffmanQueue);
//填充队列
for (int k = 0; k < 256; k++)
{
if (probability[k] != 0)
{
htNode *aux = (htNode *)malloc(sizeof(htNode));
aux->left = NULL;
aux->right = NULL;
aux->symbol = (char)k;
addPQueue(&huffmanQueue, aux, probability[k]);
}
}
free(probability);
//生成huffman树
while (huffmanQueue->size != 1)
{
int priority = huffmanQueue->first->priority;
priority += huffmanQueue->first->next->priority;
htNode *left = getPQueue(&huffmanQueue);
htNode *right = getPQueue(&huffmanQueue);
htNode *newNode = (htNode *)malloc(sizeof(htNode));
newNode->left = left;
newNode->right = right;
addPQueue(&huffmanQueue, newNode, priority);
}
htTree *tree = (htTree *)malloc(sizeof(htTree));
tree->root = getPQueue(&huffmanQueue);
return tree;
}
/*
 * Builds the symbol-to-code table for huffmanTree.
 *
 * huffmanTree: tree to read the codes from.
 *
 * Returns a newly allocated table owned by the caller. The table is empty
 * when huffmanTree is NULL or has no root. Returns NULL when the table
 * itself cannot be allocated.
 */
hlTable *buildTable(htTree *huffmanTree) {
    hlTable *table = (hlTable *)malloc(sizeof *table);
    if (table == NULL) {
        return NULL;
    }
    table->first = NULL;
    table->last = NULL;

    if (huffmanTree == NULL || huffmanTree->root == NULL) {
        return table;
    }

    /* Scratch buffer for the path digits; traverseTree fills it as it descends. */
    char code[256];
    traverseTree(huffmanTree->root, &table, 0, code);
    return table;
}
/*
 * Prints stringToEncode followed by its Huffman encoding according to table.
 *
 * table:          code table to look symbols up in.
 * stringToEncode: NUL-terminated text to encode.
 *
 * Nothing is printed when either argument is NULL. When a character has no
 * entry in the table, an error message is printed and encoding stops.
 */
void encode(hlTable *table, char *stringToEncode) {
    if (table == NULL || stringToEncode == NULL) {
        return;
    }

    printf("Encoding...\n\nInput string : \n %s\n\nEncoding string : \n", stringToEncode);
    for (size_t i = 0; stringToEncode[i] != '\0'; i++) {
        /* Linear search for the table entry of the current character. */
        hlNode *entry = table->first;
        while (entry != NULL && entry->symbol != stringToEncode[i]) {
            entry = entry->next;
        }
        if (entry == NULL) {
            printf("\nSymbol '%c' has no code in the table!\n", stringToEncode[i]);
            return;
        }
        printf("%s", entry->code);
    }
    printf("\n");
}
/*
 * Prints stringToDecode followed by the text obtained by decoding it with tree.
 *
 * tree:           Huffman tree used for decoding.
 * stringToDecode: NUL-terminated string of '0' and '1' digits.
 *
 * Nothing is printed when an argument is NULL or the tree has no root.
 * An error message is printed, and decoding stops, when the input contains a
 * character other than '0'/'1', when a digit leads to a missing branch, or
 * when the input ends in the middle of a code.
 */
void decode(htTree *tree, char *stringToDecode) {
    if (tree == NULL || tree->root == NULL || stringToDecode == NULL) {
        return;
    }

    htNode *traversal = tree->root;
    printf("\n\nDecoding...\n\nInput string : \n%s\n\nDecoded string : \n", stringToDecode);
    for (size_t i = 0; stringToDecode[i] != '\0'; i++) {
        char digit = stringToDecode[i];
        htNode *next = NULL;

        if (digit == '0') {
            next = traversal->left;
        } else if (digit == '1') {
            next = traversal->right;
        }
        /* next stays NULL for an invalid digit and for a missing branch. */
        if (next == NULL) {
            printf("The input string is not coded correctly!\n");
            return;
        }
        traversal = next;

        /* Emit the symbol as soon as a leaf is reached, so the last one is not lost. */
        if (traversal->left == NULL && traversal->right == NULL) {
            printf("%c", traversal->symbol);
            traversal = tree->root;
        }
    }

    /* Not back at the root: the input stopped in the middle of a code. */
    if (traversal != tree->root) {
        printf("The input string is not coded correctly!\n");
    }
}
/*
 * Demo driver: builds the tree and code table for a sample sentence,
 * encodes one text, decodes one digit string, then waits for two
 * characters on stdin before exiting.
 */
int main() {
    htTree *tree = buildTree("I love you!");
    hlTable *table = buildTable(tree);

    encode(table, "I love you");
    decode(tree, "1011111");

    /* Two reads keep the console window open until the user reacts. */
    for (int pending = 2; pending > 0; pending--) {
        getchar();
    }
    return 0;
}