写个 Huffman 编码玩一玩。Huffman 编码是很多压缩方法的基础,其中参考文献[1]非常详尽地介绍了 gzip 的算法,很有参考价值。
在 Huffman tree 的实现过程中,有一个比较让人困惑的地方:在建立 Huffman tree 的过程中,需要向最小堆中添加新的元素,同时新添加元素的左右孩子指针也需要设置好;但最小堆在更新的过程中可能不断交换数组中的元素,导致 Huffman tree 中记录的元素位置失效。解决方法非常简单:最小堆中保存的不是元素本身而是元素的指针,这样最小堆 Pop 时返回的不是位置而是节点指针,无论元素如何移动都不会影响树的构建。文献[2]的代码展示了这个特点,而文献[3]的构建过程更加简洁优美。
#include <stdio.h>
#include <stdlib.h>

#include <functional>
#include <string>
#include <vector>

// Number of distinct byte values; the size of every per-symbol table below.
const size_t kSymbolCount = 256;

// Binary min-heap stored in a std::vector.
//
// T       - element type; must be copyable and default-constructible.
// Compare - strict weak ordering; the element for which compare(x, other) is
//           true against all others is popped first (std::less => smallest).
template<typename T, typename Compare = std::less<T> >
class MinHeap {
 public:
  MinHeap() {}
  MinHeap(Compare compare) : compare_(compare) {}

  // Removes and returns the top (smallest) element.
  // Returns a value-initialised T when the heap is empty.
  T Pop() {
    if (data_.empty()) {
      return T();
    }
    T value = data_[0];
    const size_t last = data_.size() - 1;
    if (last > 0) {
      // Move the last element to the root, then restore the heap property.
      data_[0] = data_[last];
    }
    data_.pop_back();
    if (!data_.empty()) {
      Heapfy(0);
    }
    return value;
  }

  // Inserts a copy of value and sifts it up to its place.
  void Push(const T& value) {
    data_.push_back(value);
    size_t index = data_.size() - 1;
    while (index > 0) {
      const size_t parent = Parent(index);
      if (!compare_(data_[index], data_[parent])) {
        break;
      }
      Swap(data_[index], data_[parent]);
      index = parent;
    }
  }

  // Number of elements currently stored.
  size_t Size() const {
    return data_.size();
  }

 private:
  static void Swap(T& left, T& right) {
    T tmp = left;
    left = right;
    right = tmp;
  }

  static size_t Parent(size_t child) {
    return child == 0 ? 0 : (child - 1) / 2;
  }

  static size_t LeftChild(size_t parent) {
    return parent * 2 + 1;
  }

  static size_t RightChild(size_t parent) {
    return parent * 2 + 2;
  }

  // Sifts the element at index down until both children compare not-less.
  void Heapfy(size_t index) {
    const size_t count = data_.size();
    for (;;) {
      size_t smallest = index;
      const size_t left = LeftChild(index);
      const size_t right = RightChild(index);
      if (left < count && compare_(data_[left], data_[smallest])) {
        smallest = left;
      }
      if (right < count && compare_(data_[right], data_[smallest])) {
        smallest = right;
      }
      if (smallest == index) {
        return;
      }
      Swap(data_[index], data_[smallest]);
      index = smallest;
    }
  }

  Compare compare_;
  std::vector<T> data_;
};

// A byte value together with its occurrence count.
// The default count is 1 (not 0); MyCompress relies on this so that every
// byte value receives a code.
class CharFrequence {
 public:
  CharFrequence() : char_('0'), frequence_(1) {}
  CharFrequence(unsigned char character, unsigned int frequence)
      : char_(character), frequence_(frequence) {}

  unsigned char char_;
  unsigned int frequence_;
};

// Node of the code tree. Leaves carry a symbol; an internal node built from
// two children carries the sum of their frequencies. Ordered by frequency.
class HaffmanTreeNode : public CharFrequence {
 public:
  HaffmanTreeNode() : left_(NULL), right_(NULL) {}
  HaffmanTreeNode(unsigned char character, unsigned frequence)
      : CharFrequence(character, frequence), left_(NULL), right_(NULL) {}
  HaffmanTreeNode(HaffmanTreeNode* left_child, HaffmanTreeNode* right_child)
      : left_(left_child), right_(right_child) {
    if (left_child && right_child) {
      frequence_ = left_child->frequence_ + right_child->frequence_;
    }
  }

  friend bool operator<(const HaffmanTreeNode& left,
                        const HaffmanTreeNode& right) {
    return left.frequence_ < right.frequence_;
  }

  HaffmanTreeNode* left_;
  HaffmanTreeNode* right_;
};

// Read-only bit-level view over an encoded byte stream.
//
// Stream layout: payload bits (MSB first within each byte), then a single
// terminator bit 1, then zero bits up to the byte boundary. Trailing all-zero
// bytes are tolerated and treated as padding.
class EncodeByte {
 public:
  // Not implemented: the view is read-only. Kept as a no-op for interface
  // compatibility.
  void SetBit(size_t /*offset*/) {
  }

  // Returns the bit at the given bit offset (bit 0 is the MSB of byte 0).
  // Offsets past the end of the buffer read as false.
  bool GetBit(size_t offset) const {
    const size_t byte_num = offset / 8;
    const size_t byte_offset = offset % 8;
    if (byte_num >= data_.size()) {
      return false;
    }
    return ((data_[byte_num] >> (7 - byte_offset)) & 1) != 0;
  }

  // Iterator over the payload bits of an EncodeByte.
  // It does not own the EncodeByte and must not outlive it.
  class BitsIterator {
   public:
    BitsIterator(size_t offset, const EncodeByte* byte_manager)
        : offset_(offset), byte_manager_(byte_manager) {
    }

    bool operator*() const {
      return byte_manager_->GetBit(offset_);
    }

    BitsIterator& operator++() {
      ++offset_;
      return *this;
    }

    BitsIterator operator++(int) {
      BitsIterator tmp = *this;
      ++offset_;
      return tmp;
    }

    BitsIterator& operator--() {
      --offset_;
      return *this;
    }

    BitsIterator operator--(int) {
      BitsIterator tmp = *this;
      --offset_;
      return tmp;
    }

    friend bool operator==(const BitsIterator& left,
                           const BitsIterator& right) {
      return left.offset_ == right.offset_;
    }

    friend bool operator!=(const BitsIterator& left,
                           const BitsIterator& right) {
      return left.offset_ != right.offset_;
    }

   private:
    size_t offset_;
    const EncodeByte* byte_manager_;
  };

  // Copies encode and locates the terminator bit to find the payload length.
  // An empty or all-zero buffer has no terminator and yields zero payload bits.
  EncodeByte(const std::vector<unsigned char>& encode)
      : data_(encode), bits_length_(0) {
    // Skip trailing padding bytes so the terminator search cannot loop on 0.
    size_t used = data_.size();
    while (used > 0 && data_[used - 1] == 0) {
      --used;
    }
    if (used == 0) {
      return;
    }
    unsigned char tail = data_[used - 1];
    size_t filling_bits_num = 0;
    while ((tail & 1) == 0) {
      ++filling_bits_num;
      tail >>= 1;
    }
    // Everything before the terminator bit is payload.
    bits_length_ = (used - 1) * 8 + (7 - filling_bits_num);
  }

  BitsIterator Begin() const {
    return BitsIterator(0, this);
  }

  BitsIterator End() const {
    return BitsIterator(bits_length_, this);
  }

 private:
  std::vector<unsigned char> data_;
  size_t bits_length_;
};

// Prefix-code tree built from symbol frequencies with a min-heap.
// Owns all of its nodes; not copyable.
class HaffmanTree {
 public:
  HaffmanTree() : root_(NULL) {}

  ~HaffmanTree() {
    Destroy(root_);
  }

  // Rebuilds the tree and the code table from char_frequence.
  // Only char_ and frequence_ of each input node are used; child links of the
  // input nodes are ignored. Any previously built tree is released.
  // Empty input leaves the tree empty. A single symbol gets the code "0".
  void Build(const std::vector<HaffmanTreeNode>& char_frequence) {
    Clear();

    MinHeap<HaffmanTreeNode> heap;
    for (size_t i = 0; i < char_frequence.size(); ++i) {
      heap.Push(HaffmanTreeNode(char_frequence[i].char_,
                                char_frequence[i].frequence_));
    }
    if (heap.Size() == 0) {
      return;
    }

    // Repeatedly merge the two lightest subtrees. The heap holds node values;
    // children are moved to the free store so their addresses stay stable.
    while (heap.Size() > 1) {
      HaffmanTreeNode* left = new HaffmanTreeNode(heap.Pop());
      HaffmanTreeNode* right = new HaffmanTreeNode(heap.Pop());
      heap.Push(HaffmanTreeNode(left, right));
    }
    root_ = new HaffmanTreeNode(heap.Pop());

    if (IsLeaf(root_)) {
      // A lone symbol still needs a non-empty code to be representable.
      haffman_code[root_->char_] = "0";
    } else {
      std::string code;
      Trival(root_, &code);
    }
  }

  // Returns the code of character as a string of '0'/'1', or an empty string
  // when the symbol is not in the tree.
  std::string GetHaffmanCode(unsigned char character) const {
    return haffman_code[character];
  }

  // Decodes the payload bits of bytes and appends the symbols to *orginal.
  // Decoding stops early if the bit stream leaves the tree. Trailing bits that
  // do not complete a code are ignored.
  void Decode(const EncodeByte& bytes, std::vector<unsigned char>* orginal) {
    if (root_ == NULL || orginal == NULL) {
      return;
    }
    const EncodeByte::BitsIterator end = bytes.End();
    EncodeByte::BitsIterator it = bytes.Begin();

    if (IsLeaf(root_)) {
      // Single-symbol tree: every payload bit stands for that symbol.
      for (; it != end; ++it) {
        orginal->push_back(root_->char_);
      }
      return;
    }

    const HaffmanTreeNode* current = root_;
    for (; it != end; ++it) {
      current = *it ? current->right_ : current->left_;
      if (current == NULL) {
        return;  // Malformed stream for this tree.
      }
      if (IsLeaf(current)) {
        orginal->push_back(current->char_);
        current = root_;
      }
    }
  }

 private:
  // Non-copyable: the tree owns raw node pointers.
  HaffmanTree(const HaffmanTree&);
  HaffmanTree& operator=(const HaffmanTree&);

  static bool IsLeaf(const HaffmanTreeNode* node) {
    return node->left_ == NULL && node->right_ == NULL;
  }

  // Frees node and everything below it.
  static void Destroy(HaffmanTreeNode* node) {
    if (node == NULL) {
      return;
    }
    Destroy(node->left_);
    Destroy(node->right_);
    delete node;
  }

  // Releases the tree and empties the code table.
  void Clear() {
    Destroy(root_);
    root_ = NULL;
    for (size_t i = 0; i < kSymbolCount; ++i) {
      haffman_code[i].clear();
    }
  }

  // Depth-first walk recording the path to each leaf: '0' = left, '1' = right.
  void Trival(const HaffmanTreeNode* node, std::string* code) {
    if (node == NULL) {
      return;
    }
    if (IsLeaf(node)) {
      haffman_code[node->char_] = *code;
      return;
    }
    code->push_back('0');
    Trival(node->left_, code);
    code->erase(code->size() - 1, 1);
    code->push_back('1');
    Trival(node->right_, code);
    code->erase(code->size() - 1, 1);
  }

  HaffmanTreeNode* root_;
  std::string haffman_code[kSymbolCount];
};

// Compressor whose code tree is derived from the text passed to Compress().
// Decompress() only works on the same object after Compress(), because the
// tree itself is not stored in the encoded bytes.
class MyCompress {
 public:
  // Builds the code tree from orginal and appends the encoded bytes to
  // *encode. The stream ends with a terminator bit 1 followed by zero padding,
  // so the output is never empty.
  void Compress(const std::string& orginal,
                std::vector<unsigned char>* encode) {
    CharacterFrequenceCompute(orginal);

    int bits_count = 0;
    unsigned char code = 0;
    for (size_t i = 0; i < orginal.size(); ++i) {
      const std::string haffman_code =
          haffman_tree_.GetHaffmanCode(static_cast<unsigned char>(orginal[i]));
      for (size_t j = 0; j < haffman_code.size(); ++j) {
        const int bit = (haffman_code[j] == '1') ? 1 : 0;
        code = static_cast<unsigned char>((code << 1) | bit);
        if (++bits_count == 8) {
          encode->push_back(code);
          code = 0;
          bits_count = 0;
        }
      }
    }

    // Last byte: pending bits, terminator 1, then zeros (pattern 10...0).
    // bits_count is in [0, 7], so the terminator always fits in this byte.
    code = static_cast<unsigned char>(((code << 1) | 1) << (7 - bits_count));
    encode->push_back(code);
  }

  // Decodes encode and appends the resulting bytes to *orginal.
  void Decompress(const std::vector<unsigned char>& encode,
                  std::string* orginal) {
    if (orginal == NULL) {
      return;
    }
    std::vector<unsigned char> decoded;
    Decompress(encode, &decoded);
    orginal->append(decoded.begin(), decoded.end());
  }

  // Decodes encode and appends the resulting bytes to *orginal.
  void Decompress(const std::vector<unsigned char>& encode,
                  std::vector<unsigned char>* orginal) {
    EncodeByte encoded_bytes(encode);
    haffman_tree_.Decode(encoded_bytes, orginal);
  }

  // Prints the code currently assigned to c.
  void PrintHaffmanCode(unsigned char c) {
    printf("code %c : %s\n", c, haffman_tree_.GetHaffmanCode(c).c_str());
  }

 private:
  // Counts byte occurrences in text and rebuilds the tree.
  // Counts start at 1 (the CharFrequence default), so every byte value gets a
  // code even when it does not occur in text.
  void CharacterFrequenceCompute(const std::string& text) {
    std::vector<HaffmanTreeNode> tree_nodes(kSymbolCount);
    for (size_t i = 0; i < kSymbolCount; ++i) {
      tree_nodes[i].char_ = static_cast<unsigned char>(i);
    }
    for (size_t i = 0; i < text.size(); ++i) {
      // Index through unsigned char: plain char may be negative.
      ++tree_nodes[static_cast<unsigned char>(text[i])].frequence_;
    }
    haffman_tree_.Build(tree_nodes);
  }

  HaffmanTree haffman_tree_;
};

// Pushes ten random-keyed nodes and prints them in pop order.
void MinHeapTest() {
  MinHeap<HaffmanTreeNode> min_heap;
  const size_t kElementSize = 10;
  for (size_t i = 0; i < kElementSize; ++i) {
    const unsigned int key = static_cast<unsigned int>(rand() % 500);
    min_heap.Push(HaffmanTreeNode(static_cast<unsigned char>('0' + i), key));
  }
  while (min_heap.Size() > 0) {
    const HaffmanTreeNode current = min_heap.Pop();
    printf("pop: char=%c, frequence=%u\n", current.char_, current.frequence_);
  }
}

// Builds a tree over all 256 byte values with random frequencies.
void HaffmanTreeTest() {
  HaffmanTree haffman_tree;
  std::vector<HaffmanTreeNode> char_freq;
  const int kCharSize = 256;
  for (int i = 0; i < kCharSize; ++i) {
    const unsigned int freq = static_cast<unsigned int>(rand() % 10000);
    const unsigned char character = static_cast<unsigned char>(i);
    printf("char:%c\n", character);
    char_freq.push_back(HaffmanTreeNode(character, freq));
  }
  haffman_tree.Build(char_freq);
}

// Builds a small four-symbol tree.
void HaffmanTreeTest1() {
  HaffmanTree haffman_tree;
  std::vector<HaffmanTreeNode> char_freq;
  char_freq.push_back(HaffmanTreeNode('a', 1));
  char_freq.push_back(HaffmanTreeNode('b', 10));
  char_freq.push_back(HaffmanTreeNode('c', 2));
  char_freq.push_back(HaffmanTreeNode('d', 8));
  haffman_tree.Build(char_freq);
}

// Round-trips a sample string and prints the result and compression ratio.
void CompressTest() {
  std::string text ="aaaaaaaaaaaaaadsfadsfasdfasaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaadfasb";
  std::vector<unsigned char> encode;
  MyCompress compresser;
  compresser.Compress(text, &encode);

  std::vector<unsigned char> orginal;
  compresser.Decompress(encode, &orginal);

  printf("orginal:%s\n", text.c_str());
  printf("decoded:");
  for (size_t i = 0; i < orginal.size(); ++i) {
    printf("%c", orginal[i]);
  }
  printf("\n");
  printf("orginal bytes:%zu, compressed bytes:%zu\ncompressed ratio:%f\n",
         text.size(), encode.size(),
         static_cast<float>(encode.size()) / text.size());
}

int main(int argc, char** argv) {
  (void)argc;
  (void)argv;
  CompressTest();
  // HaffmanTreeTest();
  return 0;
}
1)EncodeByte::GetBit()函数的移位操作开始出错了,正确的移位应该是7 - offset,而不是offset,这个需要注意
2)Compress函数的一些功能应该实现在HaffmanTree中更合适,不想改了
3)使用EncodeByte管理bits简化了很多工作,使得bits可以像iterator一样来处理
4)利用最小堆构建Huffman tree的过程比较经典,涉及到对象的创建过程,具有很好的参考价值
5)还有困惑的地方:模板类使用比较器参数时,还有不太清楚之处,例如std::less<T>的代码还要再看看,如何让模板同时支持函数对象和普通函数也需要再了解一下
参考文献:
[1]http://www.360doc.com/content/11/0218/15/2150347_94086443.shtml
[2]http://cppgm.blogspot.com/2008/02/huffman-coding.html
[3]http://marknelson.us/1996/01/01/priority-queues/