Range Coding 的 D 实现

Range Coding 是算术编码的变种,二者的效率几乎没有差别,Range Coding 速度更快,且没有专利问题。下面的程序移植和改进自一个非常清晰简洁的C++实现。当然,直接使用下面的代码去压缩文件效果并不好,速度慢压缩率也低,Range Coding 更适合作为其他算法的后端,比如 LZ77、Block Sorting。

如果你看到这里一头雾水的话,可以上 wikipedia 参考“算术编码”,不过更好的选择是找一篇名为《笨笨数据压缩教程》的系列文章来入门。

D1.0 Code
  1. /** Code for range coding, derived from public domain work by Dmitry Subbotin
  2. Modified to use 64-bit integer maths, for increased precision
  3. Authors: Dmitry Subbotin (initial author of the Carry-less Range Coder)
  4. Sachin Garg (C++)
  5. Wei Li (D language)
  6. Reference: http://sachingarg.com/compression/entropy_coding/64bit/
  7. */
  8. import std.stdio;
  9. template RangeCoding64Base()
  10. {
  11. const ulong Top = 1UL << 56UL;
  12. const ulong Bottom = 1UL << 48UL;
  13. const ulong MaxRange = Bottom;
  14. ulong m_low = 0;
  15. ulong m_range = ulong.max;
  16. }
  17. struct RangeEncoding64
  18. {
  19. mixin RangeCoding64Base;
  20. private bool m_flushed = false ;
  21. private void delegate(ubyte) m_sink = null;
  22. void init( void delegate(ubyte) sink)
  23. {
  24. assert(sink !is null);
  25. m_sink = sink;
  26. }
  27. void close()
  28. {
  29. if (!m_flushed)
  30. flush();
  31. }
  32. void encode(ulong symbolLow, ulong symbolHigh, ulong totalRange)
  33. {
  34. m_range /= totalRange;
  35. m_low += symbolLow * m_range;
  36. m_range *= symbolHigh - symbolLow;
  37. while ((m_low ^ (m_low + m_range)) < Top || m_range < Bottom && ((m_range = -m_low & (Bottom - 1)), true ))
  38. {
  39. ubyte b = m_low >> (m_low.sizeof * 8 - 8);
  40. m_sink(b);
  41. m_range <<= 8;
  42. m_low <<= 8;
  43. }
  44. }
  45. void flush()
  46. {
  47. if (!m_flushed)
  48. {
  49. for ( int i = 0; i < m_low. sizeof ; i++)
  50. {
  51. ubyte b = m_low >> (m_low.sizeof * 8 - 8);
  52. m_sink(b);
  53. m_low <<= 8;
  54. }
  55. m_flushed = true ;
  56. }
  57. }
  58. }
  59. struct RangeDecoding64
  60. {
  61. mixin RangeCoding64Base;
  62. private ulong m_code;
  63. private ubyte delegate() m_emitter;
  64. void init(ubyte delegate() emitter)
  65. {
  66. assert(emitter !is null);
  67. m_emitter = emitter;
  68. for ( size_t i = 0; i < m_code. sizeof ; i++)
  69. {
  70. m_code = (m_code << 8) | emitter();
  71. }
  72. }
  73. ulong currentCount(ulong totalRange) //积累频率
  74. {
  75. return (m_code - m_low) / (m_range /= totalRange);
  76. }
  77. void decode(ulong symbolLow, ulong symbolHigh, ulong totalRange)
  78. {
  79. m_low += symbolLow * m_range;
  80. m_range *= symbolHigh - symbolLow;
  81. while ((m_low ^ m_low + m_range) < Top || m_range < Bottom && ((m_range = -m_low & Bottom - 1), true ))
  82. {
  83. m_code= m_code << 8 | m_emitter(), m_range <<= 8, m_low <<= 8;
  84. }
  85. }
  86. }
  87. //简单的0阶自适应模型
  88. struct OrderZeroModel(uint SymMax = 255)
  89. {
  90. public const uint SymbolMax = SymMax;
  91. public const uint NoOfSymbols = SymbolMax + 2;
  92. //最后一个元素是积累频率
  93. private ulong[SymbolMax + 2] m_freq;
  94. void init()
  95. {
  96. for ( size_t i = 0; i < m_freq.length; i++)
  97. {
  98. m_freq[i] = i;
  99. }
  100. }
  101. private void rescale()
  102. { //用了64位以后似乎这是不可能的
  103. ulong newTotal = 0;
  104. for ( size_t i = 1; i < m_freq.length - 1; i++)
  105. {
  106. newTotal += ((m_freq[i] - m_freq[i - 1]) / 2) + 1;
  107. m_freq[i] = m_freq[i - 1] + ((m_freq[i] - m_freq[i - 1]) / 2) + 1;
  108. }
  109. m_freq[m_freq.length - 1] = newTotal;
  110. }
  111. void update(uint sym)
  112. {
  113. for ( size_t i = sym + 1; i < m_freq.length; i++) {
  114. m_freq[i]++;
  115. }
  116. if (total() >= RangeCoding64Base!().MaxRange)
  117. rescale();
  118. }
  119. uint getSymbol(ulong n)
  120. {
  121. uint sym = SymbolMax;
  122. while (m_freq[sym] > n) --sym;
  123. return sym;
  124. }
  125. uint total()
  126. {
  127. return m_freq[m_freq.length - 1];
  128. }
  129. ulong low(uint sym)
  130. {
  131. return m_freq[sym];
  132. }
  133. ulong high(uint sym)
  134. {
  135. return m_freq[sym + 1];
  136. }
  137. ulong opIndex(uint rhs)
  138. {
  139. return m_freq[rhs];
  140. }
  141. }
  142. // sample:
  143. int main(string[] args)
  144. {
  145. const uint EndOfStream = 256;
  146. ubyte[] compressed;
  147. void sink(ubyte u)
  148. {
  149. compressed ~= u;
  150. }
  151. ubyte[] origin;
  152. for ( int i = 0; i < 10000; i++)
  153. origin ~= ['A', 'B', 'C', 'D', 'E', 'F', 'G'];
  154. RangeEncoding64 encoder;
  155. encoder.init(&sink);
  156. OrderZeroModel!(EndOfStream) model; //我们把 256 作为 Range Coding 流结束符
  157. model.init();
  158. writefln("compression started..." );
  159. foreach(ubyte b; origin)
  160. {
  161. encoder.encode(model.low(b), model.high(b), model.total);
  162. model.update(b);
  163. }
  164. encoder.encode(model.low(EndOfStream), model.high(EndOfStream), model.total);
  165. model.update(EndOfStream);
  166. encoder.close();
  167. writefln("originial size: %d" , origin.length);
  168. writefln("compressed size: %d (%d%%)" , compressed.length,
  169. (origin.length - compressed.length) * 100 / origin.length);
  170. writefln("decoding...." );
  171. size_t pos = 0;
  172. ubyte delegate() emitter = {
  173. return compressed[pos++];
  174. };
  175. model.init();
  176. RangeDecoding64 dec;
  177. dec.init(emitter);
  178. ubyte[] decompressed;
  179. while ( true )
  180. {
  181. ulong count = dec.currentCount(model.total);
  182. uint sym = model.getSymbol(count);
  183. if (sym == 256) break ;
  184. decompressed ~= sym;
  185. dec.decode(model.low(sym), model.high(sym), model.total);
  186. model.update(sym);
  187. }
  188. writefln(decompressed.length);
  189. //确保解码无误
  190. assert(decompressed[] == origin[]);
  191. return 0;
  192. }


你可能感兴趣的:(C++,c,算法,linux,C#)