哈夫曼树实现示例

写个 Huffman(哈夫曼)编码练练手。Huffman 编码是很多压缩方法的基础,其中参考文献[1]非常详尽地介绍了 gzip 的算法,很有参考价值。

在 Huffman 树的实现过程中,有一个比较让人困惑的地方:建树时需要不断向最小堆中添加新的元素,同时新元素的左右孩子指针也需要设置好;但最小堆在调整的过程中会不断交换数组中的元素,导致 Huffman 树中记录的元素位置失效。解决方法非常简单:最小堆中保存的不是元素本身,而是元素的指针,这样最小堆 Pop 时返回的不是位置而是节点指针,无论元素如何移动都不会影响树的构建。(下面的实现采用了等价的做法:堆中按值保存节点,Pop 出来的节点被拷贝到新分配的对象上,父节点再用指针指向这些对象。)文献[2]的代码展示了这个特点,而文献[3]的构建过程更加简洁优美。

#include <stdio.h>
#include <stdlib.h>

#include <functional>
#include <string>
#include <vector>
// Binary heap stored in a std::vector.
//
// The element that `Compare` orders first sits at index 0, so with the default
// std::less<T> the smallest element is the one returned by Pop().
//
// T has to be default-constructible, copy-constructible and copy-assignable.
template<typename T, typename Compare = std::less<T>  >
class MinHeap {
 public:
  // Creates an empty heap that orders elements with a default-constructed
  // comparator.
  MinHeap() {}
  // Creates an empty heap that orders elements with `compare`.
  MinHeap(Compare compare) : compare_(compare) {}
  // Removes the top element and returns it. When the heap is empty nothing is
  // removed and a default-initialized T is returned.
  T Pop() {
    T top;
    if (data_.empty()) {
      return top;
    }
    top = data_.front();
    // Move the last element into the vacated root, then restore heap order.
    data_.front() = data_.back();
    data_.pop_back();
    SiftDown(0);
    return top;
  }
  // Inserts a copy of `value`.
  void Push(const T& value) {
    data_.push_back(value);
    size_t child = data_.size() - 1;
    // Bubble the new element up while it orders before its parent.
    while (child > 0) {
      const size_t up = (child - 1) / 2;
      if (!compare_(data_[child], data_[up])) {
        break;
      }
      Exchange(data_[child], data_[up]);
      child = up;
    }
  }
  // Number of elements currently stored.
  size_t Size() {
    return data_.size();
  }
 private:
  // Swaps two elements through a temporary copy.
  void Exchange(T& first, T& second) {
    T saved = first;
    first = second;
    second = saved;
  }
  // Pushes the element at `hole` down until neither child orders before it.
  void SiftDown(size_t hole) {
    const size_t count = data_.size();
    for (;;) {
      const size_t lhs = 2 * hole + 1;
      const size_t rhs = lhs + 1;
      size_t best = hole;
      if (lhs < count && compare_(data_[lhs], data_[best])) {
        best = lhs;
      }
      if (rhs < count && compare_(data_[rhs], data_[best])) {
        best = rhs;
      }
      if (best == hole) {
        return;
      }
      Exchange(data_[hole], data_[best]);
      hole = best;
    }
  }
  Compare compare_;
  std::vector<T> data_;
};
// A byte value paired with its occurrence count.
class CharFrequence {
 public:
  // Default payload: character '0' with a count of 1.
  CharFrequence() {
    char_ = '0';
    frequence_ = 1;
  }
  // Payload for `character` with the given count.
  CharFrequence(unsigned char character, unsigned int frequence) {
    char_ = character;
    frequence_ = frequence;
  }
  unsigned char char_;      // the symbol
  unsigned int frequence_;  // how often the symbol occurs
};
// Huffman tree node: a CharFrequence payload plus links to two children.
// The node does not free its children.
class HaffmanTreeNode : public CharFrequence {
 public:
  // Childless node carrying the default CharFrequence payload.
  HaffmanTreeNode() : CharFrequence(), left_(NULL), right_(NULL) {}
  // Childless node for `character` with weight `frequence`.
  HaffmanTreeNode(unsigned char character, unsigned frequence)
      : CharFrequence(character, frequence), left_(NULL), right_(NULL) {}
  // Node linking the two given children. When both are non-null its weight is
  // the sum of their weights; otherwise the default payload is kept.
  HaffmanTreeNode( HaffmanTreeNode* left_child, HaffmanTreeNode* right_child)
      : CharFrequence(), left_(left_child), right_(right_child) {
    if (left_child == NULL || right_child == NULL) {
      return;
    }
    frequence_ = left_child->frequence_ + right_child->frequence_;
  }
  // Orders nodes by weight only; the character is ignored.
  friend bool operator<(const HaffmanTreeNode& left, const HaffmanTreeNode& right) {
    return right.frequence_ > left.frequence_;
  }
  HaffmanTreeNode* left_;
  HaffmanTreeNode* right_;
};
// Read-only, bit-level view over an encoded byte stream.
//
// Bits are numbered most-significant-bit first within each byte. The stream is
// expected to end with a terminator: one 1 bit followed by zero bits up to the
// byte boundary. Only the bits in front of that terminator are exposed through
// Begin()/End().
class EncodeByte {
 public:
  // Placeholder kept for interface compatibility; it currently does nothing.
  void SetBit(size_t offset) {
    (void)offset;
  }
  // Returns the bit at `offset` (0 = most significant bit of the first byte).
  // Offsets past the end of the stored bytes read as false.
  bool GetBit(size_t offset) const {
    const size_t byte_num = offset / 8;
    if (byte_num >= data_.size()) {
      return false;
    }
    const size_t byte_offset = offset % 8;
    // Shift by 7 - byte_offset because bit 0 is the MOST significant bit.
    return ((data_[byte_num] >> (7 - byte_offset)) & 1) != 0;
  }
  // Bidirectional cursor over the bits of an EncodeByte. It stores a pointer
  // to the EncodeByte, so it must not outlive it.
  class BitsIterator {
   public:
    BitsIterator(size_t offset, const EncodeByte* byte_manager)
        : offset_(offset), byte_manager_(byte_manager) { }
    // Value of the bit the iterator currently points at.
    bool operator*() const {
      return byte_manager_->GetBit(offset_);
    }
    BitsIterator& operator++() {
      offset_++;
      return *this;
    }
    BitsIterator operator++(int) {
      BitsIterator tmp = *this;
      offset_++;
      return tmp;
    }
    BitsIterator& operator--() {
      offset_--;
      return *this;
    }
    BitsIterator operator--(int) {
      BitsIterator tmp = *this;
      offset_--;
      return tmp;
    }
    // Iterators compare by bit offset only.
    friend bool operator!=(const BitsIterator& left, const BitsIterator& right) {
      return left.offset_ != right.offset_;
    }
   private:
    size_t offset_;
    const EncodeByte* byte_manager_;
  };
  // Copies `encode` and works out how many payload bits precede the
  // terminator. An empty stream has zero payload bits.
  EncodeByte(const std::vector<unsigned char>& encode) : data_(encode), bits_length_(0) {
    if (data_.empty()) {
      return;
    }
    unsigned char last_byte = data_[data_.size() - 1];
    if (last_byte == 0) {
      // No terminator bit in the final byte. Keep the length previously
      // reported for such input (all earlier bytes minus one bit), but never
      // let the subtraction wrap around for a single-byte stream.
      const size_t earlier_bits = (data_.size() - 1) * 8;
      bits_length_ = earlier_bits > 0 ? earlier_bits - 1 : 0;
      return;
    }
    // Count the zero padding bits that follow the terminating 1 bit.
    size_t filling_bits_num = 0;
    while ((last_byte & 1) == 0) {
      filling_bits_num++;
      last_byte >>= 1;
    }
    // Subtract the padding and the terminator bit itself.
    bits_length_ = data_.size() * 8 - filling_bits_num - 1;
  }
  // Iterator at the first payload bit.
  BitsIterator Begin() const {
    return BitsIterator(0, this);
  }
  // Iterator one past the last payload bit.
  BitsIterator End() const {
    return BitsIterator(bits_length_, this);
  }
 private:
  std::vector<unsigned char> data_;
  size_t bits_length_;  // number of payload bits in front of the terminator
};
class HaffmanTree {
 public:
  void Build(const std::vector<HaffmanTreeNode>& char_frequence) {
    for (int i = 0; i < char_frequence.size(); ++i) {
      min_heap_.Push(char_frequence[i]);
    }
    HaffmanTreeNode* left = NULL;
    HaffmanTreeNode* right = NULL;
    HaffmanTreeNode* parent = NULL;
    while (min_heap_.Size() > 1) {
      left = new HaffmanTreeNode(min_heap_.Pop());
      right = new HaffmanTreeNode(min_heap_.Pop());
      parent = new HaffmanTreeNode(left, right);
      min_heap_.Push(*parent);
    }
    root_ = new HaffmanTreeNode(min_heap_.Pop());
    std::string code;
    Trival(root_, &code);
  }
  std::string GetHaffmanCode(char character) {
    return haffman_code[character];
  }
  void Decode(const EncodeByte& bytes, std::vector<unsigned char>* orginal) {
    EncodeByte::BitsIterator it = bytes.Begin();
    HaffmanTreeNode* current = root_;
    while (it != bytes.End()) {
      if (current && current->left_ == NULL) {
        orginal->push_back(current->char_);
        current = root_;
      }
      if (*it) {
        current = current->right_;
      } else {
        current = current->left_;
      }
      ++it;
    }
    if (current && current->left_ == NULL) {
      orginal->push_back(current->char_);
    }

  }
 private:
  void Trival(HaffmanTreeNode* node, std::string* code) {
    if (node) {
      if (node->left_ == NULL && node->right_ == NULL) {
        haffman_code[node->char_] = *code;
      }
      (*code) += "0";
      Trival(node->left_, code);
      code->erase(code->size() - 1, 1);
      (*code) += "1";
      Trival(node->right_, code);
      code->erase(code->size() - 1, 1);
    }
  }
  HaffmanTreeNode* root_;
  MinHeap<HaffmanTreeNode> min_heap_;
  std::string haffman_code[256];
};
// Huffman compressor/decompressor for byte strings.
//
// Compress() builds the tree from the text it is given; Decompress() relies on
// that tree, so it only works on the same object after Compress() was called.
class MyCompress {
 public:
  // Encodes `orginal` and appends the packed bytes to `encode`.
  //
  // Bits are packed most-significant-bit first. The stream always ends with a
  // terminator: one 1 bit followed by zero bits up to the byte boundary, so
  // the decoder can tell where the payload stops.
  void Compress(const std::string& orginal, std::vector<unsigned char>* encode) {
    CharacterFrequenceCompute(orginal);
    int bits_count = 0;       // bits currently buffered in `code`
    unsigned char code = 0;   // partially filled output byte
    for (size_t i = 0; i < orginal.size(); ++i) {
      const std::string haffman_code = haffman_tree_.GetHaffmanCode(orginal[i]);
      for (size_t j = 0; j < haffman_code.size(); ++j) {
        code = static_cast<unsigned char>(code << 1);
        if (haffman_code[j] == '1') {
          code |= 1;
        }
        if (++bits_count == 8) {
          encode->push_back(code);
          code = 0;
          bits_count = 0;
        }
      }
    }
    // Terminator. bits_count is in [0, 7] here, so one formula covers every
    // case: 0 pending bits yields 1000 0000, 7 pending bits yields xxxxxxx1.
    code = static_cast<unsigned char>((code << 1) | 1);
    code = static_cast<unsigned char>(code << (8 - bits_count - 1));
    encode->push_back(code);
  }
  // Decodes `encode` and appends the recovered bytes to `orginal`.
  void Decompress(const std::vector<unsigned char>& encode, std::string* orginal) {
    std::vector<unsigned char> bytes;
    Decompress(encode, &bytes);
    orginal->append(bytes.begin(), bytes.end());
  }
  // Decodes `encode` and appends the recovered bytes to `orginal`.
  void Decompress(const std::vector<unsigned char>& encode, std::vector<unsigned char>* orginal) {
    EncodeByte encoded_bytes(encode);
    haffman_tree_.Decode(encoded_bytes, orginal);
  }
  // Prints the code assigned to `c` to stdout.
  void PrintHaffmanCode(unsigned char c) {
    printf("code %c : %s\n", c, haffman_tree_.GetHaffmanCode(c).c_str());
  }
 private:
  // Counts byte frequencies in `text` and rebuilds the Huffman tree.
  //
  // Default-constructed nodes start with a frequency of 1, so all 256 byte
  // values take part in the tree even when they do not occur in `text`.
  void CharacterFrequenceCompute(const std::string& text) {
    const size_t kSymbolCount = 256;
    HaffmanTreeNode tree_nodes[kSymbolCount];
    for (size_t i = 0; i < kSymbolCount; ++i) {
      tree_nodes[i].char_ = static_cast<unsigned char>(i);
    }
    for (size_t i = 0; i < text.size(); ++i) {
      // plain char may be signed; index by the byte value instead.
      const unsigned char symbol = static_cast<unsigned char>(text[i]);
      tree_nodes[symbol].frequence_++;
    }
    haffman_tree_.Build(std::vector<HaffmanTreeNode>(tree_nodes, tree_nodes + kSymbolCount));
  }
  HaffmanTree haffman_tree_;
};
void MinHeapTest() {
  MinHeap<HaffmanTreeNode> min_heap;
  const size_t kElementSize = 10;
  HaffmanTreeNode current;
  int key = 0;
  for (int i = 0; i < kElementSize; ++i) {
    key = rand() % 500;
    min_heap.Push(HaffmanTreeNode(static_cast<unsigned char>(i +'0'), key));
  }
  while (min_heap.Size() > 0) {
    current = min_heap.Pop();
    printf("pop: char=%c, frequence=%d\n", current.char_, current.frequence_);
  }
}
// Manual smoke test: builds a tree over all 256 byte values, each with a
// pseudo-random weight below 10000, printing every character on the way.
void HaffmanTreeTest() {
  HaffmanTree haffman_tree;
  std::vector<HaffmanTreeNode> char_freq;
  const int kCharSize = 256;
  for (int i = 0; i < kCharSize; ++i) {
    // Multiply as unsigned: a signed rand() * rand() can overflow.
    const unsigned int first = static_cast<unsigned int>(rand());
    const unsigned int second = static_cast<unsigned int>(rand());
    const unsigned int freq = first * second % 10000u;
    const unsigned char character = static_cast<unsigned char>(i);
    printf("char:%c\n", character);
    char_freq.push_back(HaffmanTreeNode(character, freq));
  }
  haffman_tree.Build(char_freq);
}
// Manual smoke test: builds a tree from four hand-picked symbols
// (a:1, b:10, c:2, d:8).
void HaffmanTreeTest1() {
  static const unsigned char kSymbols[] = {'a', 'b', 'c', 'd'};
  static const unsigned int kWeights[] = {1, 10, 2, 8};
  HaffmanTree haffman_tree;
  std::vector<HaffmanTreeNode> char_freq;
  for (size_t idx = 0; idx < sizeof(kSymbols) / sizeof(kSymbols[0]); ++idx) {
    char_freq.push_back(HaffmanTreeNode(kSymbols[idx], kWeights[idx]));
  }
  haffman_tree.Build(char_freq);
}
// Manual round-trip test: compresses a sample text, decompresses it again and
// prints both texts along with the achieved size ratio.
void CompressTest() {
  std::string text ="aaaaaaaaaaaaaadsfadsfasdfasaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaadfasb";
  std::vector<unsigned char> encode;
  MyCompress compresser;
  compresser.Compress(text, &encode);
  std::vector<unsigned char> orginal;
  compresser.Decompress(encode, &orginal);
  printf("orginal:%s\n", text.c_str());
  printf("decoded:");
  for (size_t i = 0; i < orginal.size(); ++i) {
    printf("%c", orginal[i]);
  }
  printf("\n");
  // size() returns size_t, which is printed with %zu.
  printf("orginal bytes:%zu, compressed bytes:%zu\ncompressed ratio:%f\n", text.size(), encode.size(), static_cast<float>(encode.size()) / text.size() );
}
// Entry point: runs the compression round-trip demo. The command-line
// arguments are not used.
int main(int argc, char** argv) {
  (void)argc;
  (void)argv;
  CompressTest();
  // HaffmanTreeTest();
  return 0;
}

实现中的心得:

1)EncodeByte::GetBit() 的移位操作一开始写错了:因为每个字节内是高位在前,正确的右移位数应该是 7 - byte_offset,而不是 byte_offset,这一点需要注意

2)Compress函数的一些功能应该实现在HaffmanTree中更合适,不想改了

3)使用 EncodeByte 管理 bits 简化了很多工作,使得 bits 可以像 iterator 一样来处理

4)利用最小堆构建 Huffman 树的过程比较经典,其中涉及到的对象创建过程具有很好的参考价值

5)还有一些困惑的地方:模板类使用比较器参数(Compare)时的细节还不是很清楚,例如 std::less<T> 的实现代码还要再看看,如何让模板同时支持函数对象和普通函数也需要再了解一下

参考文献:

[1]http://www.360doc.com/content/11/0218/15/2150347_94086443.shtml

[2]http://cppgm.blogspot.com/2008/02/huffman-coding.html

[3]http://marknelson.us/1996/01/01/priority-queues/

你可能感兴趣的:(哈夫曼树实现示例)