霍夫曼编码是一种无损数据压缩算法。其思想是为输入字符分配可变长度代码,分配代码的长度基于相应字符的频率。
分配给输入字符的可变长度代码是前缀代码,意味着代码(位序列)的分配方式是分配给一个字符的代码不是分配给任何其他字符的代码的前缀。这就是霍夫曼编码如何确保在解码生成的比特流时没有歧义。
让我们通过一个反例来理解前缀代码。设a、b、c、d四个字符,它们对应的变长码分别为00、01、0、1。这种编码会导致歧义,因为c的编码是a、b的编码前缀。如果压缩比特流是0001,解压输出可能是“cccd”或“ccb”或“acd”或“ab”。有关霍夫曼编码的应用,
霍夫曼编码主要有两大部分
用于构造最优前缀码的方法称为霍夫曼编码。
该算法以自下而上的方式构建树。我们可以用T来表示这棵树
让,|c| 是叶子的数量
|c| -1 是合并节点所需的操作数。Q 是构造二叉堆时可以使用的优先级队列。
Algorithm Huffman (c) { n= |c| Q = c for i<-1 to n-1 do { temp <- get node () left (temp] Get_min (Q) right [temp] Get Min (Q) a = left [templ b = right [temp] F [temp]<- f[a] + [b] insert (Q, temp) } return Get_min (0) }
构建哈夫曼树的步骤
输入是一组唯一字符及其出现频率,输出是哈夫曼树。
character Frequency a 5 b 9 c 12 d 13 e 16 f 45
步骤 1.构建一个包含 6 个节点的最小堆,其中每个节点代表具有单个节点的树的根。
步骤 2从最小堆中提取两个最小频率节点。添加频率为 5 + 9 = 14 的新内部节点。
现在最小堆包含 5 个节点,其中 4 个节点是每个具有单个元素的树的根,一个堆节点是具有 3 个元素的树的根
character Frequency c 12 d 13 Internal Node 14 e 16 f 45
第三步:从堆中提取两个最小频率节点。添加频率为 12 + 13 = 25 的新内部节点
现在最小堆包含 4 个节点,其中 2 个节点是每个具有单个元素的树的根,两个堆节点是具有多个节点的树的根
character Frequency Internal Node 14 e 16 Internal Node 25 f 45
第四步:提取两个最小频率节点。添加频率为 14 + 16 = 30 的新内部节点
现在最小堆包含 3 个节点。
character Frequency Internal Node 25 Internal Node 30 f 45
步骤5:提取两个最小频率节点。添加频率为 25 + 30 = 55 的新内部节点
现在最小堆包含 2 个节点。
character Frequency f 45 Internal Node 55
第六步:提取两个最小频率节点。添加频率为 45 + 55 = 100 的新内部节点
现在最小堆只包含一个节点。
character Frequency Internal Node 100
由于堆只包含一个节点,算法到此为止。
从哈夫曼树打印代码的步骤:
遍历从根开始形成的树。维护一个辅助数组。在移动到左孩子的同时,将 0 写入数组。在移动到右孩子的同时,将 1 写入数组。遇到叶节点时打印数组。
代码如下:
character code-word f 0 c 100 d 101 a 1100 b 1101 e 111
下面是上述方法的实现:
// C++ program for Huffman Coding
#include
#include
using namespace std;
// This constant can be avoided by explicitly
// calculating height of Huffman Tree
#define MAX_TREE_HT 100
// A Huffman tree node
struct MinHeapNode {
// One of the input characters
char data;
// Frequency of the character
unsigned freq;
// Left and right child of this node
struct MinHeapNode *left, *right;
};
// A Min Heap: Collection of
// min-heap (or Huffman tree) nodes
struct MinHeap {
// Current size of min heap
unsigned size;
// capacity of min heap
unsigned capacity;
// Array of minheap node pointers
struct MinHeapNode** array;
};
// A utility function allocate a new
// min heap node with given character
// and frequency of the character
struct MinHeapNode* newNode(char data, unsigned freq)
{
struct MinHeapNode* temp = (struct MinHeapNode*)malloc(
sizeof(struct MinHeapNode));
temp->left = temp->right = NULL;
temp->data = data;
temp->freq = freq;
return temp;
}
// A utility function to create
// a min heap of given capacity
struct MinHeap* createMinHeap(unsigned capacity)
{
struct MinHeap* minHeap
= (struct MinHeap*)malloc(sizeof(struct MinHeap));
// current size is 0
minHeap->size = 0;
minHeap->capacity = capacity;
minHeap->array = (struct MinHeapNode**)malloc(
minHeap->capacity * sizeof(struct MinHeapNode*));
return minHeap;
}
// A utility function to
// swap two min heap nodes
void swapMinHeapNode(struct MinHeapNode** a,
struct MinHeapNode** b)
{
struct MinHeapNode* t = *a;
*a = *b;
*b = t;
}
// The standard minHeapify function.
void minHeapify(struct MinHeap* minHeap, int idx)
{
int smallest = idx;
int left = 2 * idx + 1;
int right = 2 * idx + 2;
if (left < minHeap->size
&& minHeap->array[left]->freq
< minHeap->array[smallest]->freq)
smallest = left;
if (right < minHeap->size
&& minHeap->array[right]->freq
< minHeap->array[smallest]->freq)
smallest = right;
if (smallest != idx) {
swapMinHeapNode(&minHeap->array[smallest],
&minHeap->array[idx]);
minHeapify(minHeap, smallest);
}
}
// A utility function to check
// if size of heap is 1 or not
int isSizeOne(struct MinHeap* minHeap)
{
return (minHeap->size == 1);
}
// A standard function to extract
// minimum value node from heap
struct MinHeapNode* extractMin(struct MinHeap* minHeap)
{
struct MinHeapNode* temp = minHeap->array[0];
minHeap->array[0] = minHeap->array[minHeap->size - 1];
--minHeap->size;
minHeapify(minHeap, 0);
return temp;
}
// A utility function to insert
// a new node to Min Heap
void insertMinHeap(struct MinHeap* minHeap,
struct MinHeapNode* minHeapNode)
{
++minHeap->size;
int i = minHeap->size - 1;
while (i
&& minHeapNode->freq
< minHeap->array[(i - 1) / 2]->freq) {
minHeap->array[i] = minHeap->array[(i - 1) / 2];
i = (i - 1) / 2;
}
minHeap->array[i] = minHeapNode;
}
// A standard function to build min heap
void buildMinHeap(struct MinHeap* minHeap)
{
int n = minHeap->size - 1;
int i;
for (i = (n - 1) / 2; i >= 0; --i)
minHeapify(minHeap, i);
}
// A utility function to print an array of size n
void printArr(int arr[], int n)
{
int i;
for (i = 0; i < n; ++i)
cout << arr[i];
cout << "\n";
}
// Utility function to check if this node is leaf
int isLeaf(struct MinHeapNode* root)
{
return !(root->left) && !(root->right);
}
// Creates a min heap of capacity
// equal to size and inserts all character of
// data[] in min heap. Initially size of
// min heap is equal to capacity
struct MinHeap* createAndBuildMinHeap(char data[],
int freq[], int size)
{
struct MinHeap* minHeap = createMinHeap(size);
for (int i = 0; i < size; ++i)
minHeap->array[i] = newNode(data[i], freq[i]);
minHeap->size = size;
buildMinHeap(minHeap);
return minHeap;
}
// The main function that builds Huffman tree
struct MinHeapNode* buildHuffmanTree(char data[],
int freq[], int size)
{
struct MinHeapNode *left, *right, *top;
// Step 1: Create a min heap of capacity
// equal to size. Initially, there are
// modes equal to size.
struct MinHeap* minHeap
= createAndBuildMinHeap(data, freq, size);
// Iterate while size of heap doesn't become 1
while (!isSizeOne(minHeap)) {
// Step 2: Extract the two minimum
// freq items from min heap
left = extractMin(minHeap);
right = extractMin(minHeap);
// Step 3: Create a new internal
// node with frequency equal to the
// sum of the two nodes frequencies.
// Make the two extracted node as
// left and right children of this new node.
// Add this node to the min heap
// '$' is a special value for internal nodes, not
// used
top = newNode('$', left->freq + right->freq);
top->left = left;
top->right = right;
insertMinHeap(minHeap, top);
}
// Step 4: The remaining node is the
// root node and the tree is complete.
return extractMin(minHeap);
}
// Prints huffman codes from the root of Huffman Tree.
// It uses arr[] to store codes
void printCodes(struct MinHeapNode* root, int arr[],
int top)
{
// Assign 0 to left edge and recur
if (root->left) {
arr[top] = 0;
printCodes(root->left, arr, top + 1);
}
// Assign 1 to right edge and recur
if (root->right) {
arr[top] = 1;
printCodes(root->right, arr, top + 1);
}
// If this is a leaf node, then
// it contains one of the input
// characters, print the character
// and its code from arr[]
if (isLeaf(root)) {
cout << root->data << ": ";
printArr(arr, top);
}
}
// The main function that builds a
// Huffman Tree and print codes by traversing
// the built Huffman Tree
void HuffmanCodes(char data[], int freq[], int size)
{
// Construct Huffman Tree
struct MinHeapNode* root
= buildHuffmanTree(data, freq, size);
// Print Huffman codes using
// the Huffman tree built above
int arr[MAX_TREE_HT], top = 0;
printCodes(root, arr, top);
}
// Driver code
int main()
{
char arr[] = { 'a', 'b', 'c', 'd', 'e', 'f' };
int freq[] = { 5, 9, 12, 13, 16, 45 };
int size = sizeof(arr) / sizeof(arr[0]);
HuffmanCodes(arr, freq, size);
return 0;
}
// C++(STL) program for Huffman Coding with STL
#include
using namespace std;
// A Huffman tree node
struct MinHeapNode {
// One of the input characters
char data;
// Frequency of the character
unsigned freq;
// Left and right child
MinHeapNode *left, *right;
MinHeapNode(char data, unsigned freq)
{
left = right = NULL;
this->data = data;
this->freq = freq;
}
};
// For comparison of
// two heap nodes (needed in min heap)
struct compare {
bool operator()(MinHeapNode* l, MinHeapNode* r)
{
return (l->freq > r->freq);
}
};
// Prints huffman codes from
// the root of Huffman Tree.
void printCodes(struct MinHeapNode* root, string str)
{
if (!root)
return;
if (root->data != '$')
cout << root->data << ": " << str << "\n";
printCodes(root->left, str + "0");
printCodes(root->right, str + "1");
}
// The main function that builds a Huffman Tree and
// print codes by traversing the built Huffman Tree
void HuffmanCodes(char data[], int freq[], int size)
{
struct MinHeapNode *left, *right, *top;
// Create a min heap & inserts all characters of data[]
priority_queue,
compare>
minHeap;
for (int i = 0; i < size; ++i)
minHeap.push(new MinHeapNode(data[i], freq[i]));
// Iterate while size of heap doesn't become 1
while (minHeap.size() != 1) {
// Extract the two minimum
// freq items from min heap
left = minHeap.top();
minHeap.pop();
right = minHeap.top();
minHeap.pop();
// Create a new internal node with
// frequency equal to the sum of the
// two nodes frequencies. Make the
// two extracted node as left and right children
// of this new node. Add this node
// to the min heap '$' is a special value
// for internal nodes, not used
top = new MinHeapNode('$',
left->freq + right->freq);
top->left = left;
top->right = right;
minHeap.push(top);
}
// Print Huffman codes using
// the Huffman tree built above
printCodes(minHeap.top(), "");
}
// Driver Code
int main()
{
char arr[] = { 'a', 'b', 'c', 'd', 'e', 'f' };
int freq[] = { 5, 9, 12, 13, 16, 45 };
int size = sizeof(arr) / sizeof(arr[0]);
HuffmanCodes(arr, freq, size);
return 0;
}
// This code is contributed by Aditya Goel
Output
f: 0 c: 100 d: 101 a: 1100 b: 1101 e: 111
时间复杂度: O(nlogn),其中 n 是唯一字符的数量。如果有 n 个节点,则调用 extractMin() 2*(n – 1) 次。extractMin() 在调用 minHeapify() 时需要 O(logn) 时间。因此,整体复杂度为 O(nlogn)。
如果输入数组已排序,则存在线性时间算法。我们很快就会在下一篇文章中讨论这个问题。
霍夫曼编码的应用:
它在有一系列频繁出现的字符的情况下很有用。