霍夫曼编码(英语:Huffman Coding),又译为哈夫曼编码、赫夫曼编码,是一种用于无损数据压缩的熵编码(权编码)算法。由美国计算机科学家大卫·霍夫曼(David Albert Huffman)在1952年发明。(来源于维基百科)
是一种变长的编码方式,对出现频率高的字符采用较短的编码,而对出现频率较低的字符采用较长的编码(在对字符进行变长编码时,注意要满足前缀码规则:任意字符的编码不能是其他字符编码的前缀)
要得到Huffman编码,首先需要构建Huffman树。我们首先来介绍一下Huffman树
在我们构造Huffman树之前,我们需要声明几个定义:
举个栗子:
给定权集:{2,3,4,7,8,9},构造一棵Huffman树:
构造过程如上所示。
我们介绍了Huffman树是如何来构造的,那么Huffman编码是如何应用到实际工作中的呢?
我们给定一个字符串,统计每个字符出现的频数,作为我们的权集,来构造一棵Huffman树,再通过我们构造的Huffman树来得到每个字符的编码,进而得到字符串的编码。
举个栗子,我们给定字符串"state,seat,act,tea,cat,set,a,eat"
通过Huffman树得到Huffman编码:
a:00
,:01
t:10
e:110
c:1110
s:1111
(整个字符串的编码)1111100010110011111110001001001110100110110000111100010011111110100100011100010
那么接下来我们来一探究竟,看看我们是如何通过一棵Huffman树来得到每个叶子结点的Huffman编码呢?
我们需要对我们构造的Huffman树做一个小小的操作:在所有的左分支上标 0 0 0,所有的右分支上标 1 1 1。(即在每个结点到左孩子的边上写上数字 0 0 0,到右孩子的边上写上数字 1 1 1)
通过这样的处理我们就能得到每个字符的编码了。
每个字符的编码即 根结点到与该字符对应的叶子结点 的路径上所有编码的连接体
对于上图中权值为 2 2 2的叶子结点,其编码就为 0100 0100 0100
介绍完了Huffman编码的基本原理,我们来给出他的代码实现:
///
/// 定义二叉树结点
///
/// 二叉树结点数据域的数据类型
public class BinTreeNode : IComparable> where T : IComparable
{
///
/// 结点数据域所含数据
///
private T _data;
///
/// 获取或设置结点数据
///
public T Data
{
get
{
return _data;
}
set
{
if (value == null)
throw new ArgumentNullException();
_data = value;
}
}
///
/// 获取或设置结点的左子树
///
public BinTreeNode LeftChild { get; set; }
///
/// 获取或设置结点的右子树
///
public BinTreeNode RightChild { get; set; }
///
/// 构造函数,初始化结点
///
/// 初始化结点的数据
public BinTreeNode(T data)
{
LeftChild = null;
_data = data;
RightChild = null;
}
///
/// 构造函数,初始化结点
///
/// 初始化结点的左子树
/// 初始化结点的数据
/// 初始化结点的右子树
public BinTreeNode(BinTreeNode lChild, T data, BinTreeNode rChild)
{
LeftChild = lChild;
_data = data;
RightChild = rChild;
}
public int CompareTo(BinTreeNode other)
{
if (other == null)
throw new ArgumentNullException();
return _data.CompareTo(other.Data);
}
}
///
/// Huffman树结点定义(base()向父类构造函数 传递参数)
///
public class HuffManTreeNode : BinTreeNode //继承了二叉树结点
{
///
/// Huffman树结点的权重
///
public int Weight { get; set; }
///
/// 构造Huffman树结点
///
///
///
public HuffManTreeNode(char data, int weight) : base(data)
{
Weight = weight;
}
///
/// 构造 Huffman树结点 所含数据为空
///
///
public HuffManTreeNode(int weight) : base('\0')
{
Weight = weight;
}
}
///
/// 构造一个数据类型,存储字符及字符的huffman编码
///
public class HuffmanDicItem
{
///
/// 字符
///
public char Character { get; set; }
///
/// 字符的huffman编码
///
public string Code { get; set; }
public HuffmanDicItem(char character, string code)
{
if (character == '\0')
throw new Exception("字符为空");
if (code == string.Empty)
throw new Exception("编码为空");
Character = character;
Code = code;
}
}
public class HuffManTree
{
///
/// 对字符串中的字符进行频数统计,构造森林
///
/// 待编码的字符串
///
private List CreateInitForest(string str)
{
if (string.IsNullOrEmpty(str))
throw new ArgumentNullException();
List result = new List(); //链表存储huffman树结点
char[] charArray = str.ToCharArray();
List> lst = charArray.GroupBy(a => a).ToList(); // lambda表达式
// ToList()?
foreach(IGrouping g in lst)
{
char data = g.Key;
int weight = g.ToList().Count;
HuffManTreeNode node = new HuffManTreeNode(data, weight);
result.Add(node);
}
return result;
}
///
/// 构造Huffman树
///
///
///
private HuffManTreeNode CreateHuffmanTree(List sources)
{
if (sources == null)
throw new ArgumentNullException();
if (sources.Count < 2)
throw new Exception("少于两个结点,无法构成Huffman树");
HuffManTreeNode root = default(HuffManTreeNode);
bool isNext = true;
while (isNext)
{
List lst = sources.OrderBy(a => a.Weight).ToList();
HuffManTreeNode first = lst[0];
HuffManTreeNode second = lst[1];
int Sum_weight = first.Weight + second.Weight;
HuffManTreeNode node = new HuffManTreeNode(Sum_weight);
node.LeftChild = first;
node.RightChild = second;
if (lst.Count == 2)
{
root = node;
isNext = false;
}
else
{
sources = lst.GetRange(2, sources.Count - 2);
sources.Add(node);
}
}
return root;
}
///
/// 构造Huffman编码字典
///
///
///
///
private List CreateHuffmanDict(string code, HuffManTreeNode current)
{
if (current == null)
throw new ArgumentNullException();
List result = new List();
if (current.LeftChild == null && current.RightChild == null)
{
result.Add(new HuffmanDicItem(current.Data, code));
}
else
{
List dictL = CreateHuffmanDict(code + "0", (HuffManTreeNode)current.LeftChild);
List dictR = CreateHuffmanDict(code + "1", (HuffManTreeNode)current.RightChild);
result.AddRange(dictL);
result.AddRange(dictR);
}
return result;
}
private List CreateHuffmanDict(HuffManTreeNode root)
{
if (root == null)
throw new ArgumentNullException();
return CreateHuffmanDict(string.Empty, root);
}
///
/// 对字符串进行编码
///
///
///
///
private string ToHuffmanCode(string source, List lst)
{
if (string.IsNullOrEmpty(source))
throw new ArgumentNullException();
if (lst == null)
throw new ArgumentNullException();
string result = string.Empty;
for (int i = 0; i < source.Length; i++)
{
result += lst.Single(a => a.Character == source[i]).Code;
}
return result;
}
public string StringToHuffmanCode(out List dict, string str)
{
List forest = CreateInitForest(str);
HuffManTreeNode root = CreateHuffmanTree(forest);
dict = CreateHuffmanDict(root);
string result = ToHuffmanCode(str, dict);
return result;
}
///
/// 对Huffman编码进行解码
///
///
///
///
public string HuffmanCodeToString(List dict, string code)
{
string result = string.Empty;
for (int i = 0; i < code.Length;)
{
foreach(HuffmanDicItem Item in dict)
{
if (code[i] == Item.Code[0] && Item.Code.Length + i <= code.Length)
{
string temp = code.Substring(i, Item.Code.Length);
if (temp == Item.Code)
{
result += Item.Character;
i += Item.Code.Length;
break;
}
}
}
}
return result;
}
}
给定字符串“state,seat,act,tea,cat,set,a,eat”,对其进行Huffman编码
static void Main(string[] args)
{
string str = "state,seat,act,tea,cat,set,a,eat";
Console.WriteLine(str);
HuffManTree huffmanTree = new HuffManTree();
List dic;
string code = huffmanTree.StringToHuffmanCode(out dic, str); // 对给定字符串进行编码
for (int i = 0; i < dic.Count; i++)
{
Console.WriteLine(dic[i].Character + ":" + dic[i].Code);
}
Console.WriteLine(code);
string decode = huffmanTree.HuffmanCodeToString(dic, code); // 对Huffman编码进行解码
Console.WriteLine(decode);
Console.ReadLine();
}
综上,我们便介绍完了Huffman树及Huffman编码,并给出了Huffman编码的实现代码。
可以看出Huffman编码与常规的定长编码相比,避免了多余的空间浪费,给出了一种更高效的变长编码方式