如标题所描述的,数据无损压缩极限从1948年香农提出“信息熵”以来就界定信源编码的理论极限。此次,基于我提出的加权概率模型,我给出可以无穷例举的实验。该实验的目的就是为了实现“无损降熵”和“无损增熵”。凡是学过信息论和编码技术的朋友,听到这个是不是觉得我在天方夜谭?
不急!!我先给出三组数据,每组256个(0-255的随机数字)字节,请针对下面三组数据进行无损压缩:
第一组:3,172,216,20,41,187,55,239,218,71,179,55,215,130,144,0,236,228,150,29,68,94,2,4,208,108,219,77,236,55,15,56,34,113,29,140,207,18,164,20,0,25,0,57,55,80,2,212,244,151,73,68,107,224,121,123,48,251,29,61,149,122,115,124,104,63,61,168,102,8,40,209,183,146,27,15,248,5,204,215,85,251,187,169,61,174,59,182,118,55,57,178,8,56,88,162,20,151,114,125,204,34,24,47,120,54,145,49,173,40,69,24,14,190,104,1,56,0,2,4,203,106,215,39,178,62,238,48,16,219,180,182,150,254,54,192,48,146,192,136,236,211,195,213,53,66,31,211,132,107,82,93,30,231,25,102,137,242,171,192,188,76,158,114,0,0,9,121,129,97,154,109,53,76,91,50,196,223,226,0,45,44,172,56,54,176,65,237,70,112,0,0,255,251,13,22,229,133,121,141,90,6,3,136,231,159,204,126,104,13,143,156,49,174,222,0,31,229,201,142,141,155,94,163,198,128,215,98,250,187,155,219,70,32,107,124,31,70,98,192,14,255,106,2,213,200,187,12,0,122,242,22,192,97,134,54,10,0,0,0
第二组:7,46,42,168,175,245,71,130,223,45,146,35,246,16,38,52,173,51,88,77,167,52,159,151,27,152,28,136,53,212,88,28,55,237,186,182,57,199,175,107,61,165,19,205,7,180,107,136,230,217,18,17,95,82,253,96,96,224,116,35,233,22,226,29,126,211,121,151,124,49,21,10,22,153,1,138,61,43,6,104,114,152,230,58,106,252,11,91,189,16,224,123,196,156,254,160,219,158,213,185,35,59,145,65,221,160,249,180,16,182,185,174,8,253,211,177,57,59,221,87,222,188,52,193,167,12,82,218,228,93,35,118,64,157,112,175,182,213,54,207,254,87,214,187,154,134,7,15,134,218,237,64,63,214,55,23,86,118,180,27,107,82,142,75,20,239,65,217,244,132,58,167,95,138,245,9,223,51,78,192,139,130,63,0,239,64,147,152,103,212,42,232,169,197,211,26,138,85,43,67,1,118,206,242,68,49,38,176,85,133,195,173,242,1,126,87,61,215,137,159,252,185,150,174,137,200,73,189,232,182,99,244,162,98,121,198,241,21,205,150,137,112,141,101,105,113,77,5,228,85,23,139,252,79,63,166,197,152,0,0
第三组:16,168,14,177,241,9,215,81,64,222,107,65,220,124,9,47,41,47,144,197,3,204,109,19,255,143,25,141,166,66,182,162,96,77,38,133,140,84,146,229,8,63,225,225,40,167,124,224,98,97,59,221,215,65,222,236,107,220,3,219,149,124,196,96,91,245,107,136,181,235,192,93,45,121,9,92,24,123,80,153,187,216,190,171,91,39,106,215,232,63,210,32,137,162,146,148,63,232,149,63,141,69,136,102,186,139,19,102,114,116,227,109,36,53,1,219,225,116,242,168,229,68,50,81,33,8,181,229,168,12,14,231,214,70,100,37,66,60,96,6,249,230,87,100,150,30,242,230,12,92,39,217,110,42,11,163,21,53,127,4,152,236,134,131,115,250,196,98,60,209,22,122,117,107,15,66,17,82,144,243,143,46,248,85,152,40,59,203,168,62,167,13,189,129,169,44,157,118,163,122,222,131,23,203,81,194,71,96,21,237,110,150,158,211,244,25,164,74,15,31,186,50,234,117,132,63,158,27,65,255,117,187,21,11,0,251,221,69,102,29,245,125,204,118,21,234,97,186,185,158,54,132,50,181,211,1,82,107,76,0
根据统计,我们很容易得出上述三组数据中1的概率分别为0.462981,0.504327,0.491346。根据信息论,想在这样的随机数中找出规律进行无损压缩基本不可能。
按照信息熵,上述三组数据将无法再压缩(这个结论我觉得不必要我去描述相关的定理了吧)。
但是基于我提出加权概率模型却能将三者全部进行无损降熵后再压缩。降熵后,第一组可无损压缩至32个字节,第二组可无损压缩至64个字节,第三组可无损压缩至128个字节。
看到这里,是不是嗤之以鼻,怎么可能?往往搞研究的意义就是你“认为”不可能!
接下来我再给出三组数据:
第A组:1,0,0,0,1,0,0,0,1,1,1,1,0,0,0,0,1,1,1,0,0,0,0,1,0,1,1,0,0,1,0,1,0,0,1,0,1,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,1,1,1,1,0,0,0,0,1,0,1,1,1,0,1,1,0,1,1,1,1,1,0,0,0,1,0,1,1,1,0,0,1,1,1,1,0,0,0,1,1,0,0,0,1,1,0,0,0,1,1,0,0,0,0,0,1,0,1,1,1,1,0,0,1,0,0,0,1,1,1,1,0,0,1,1,1,1,1,1,1,0,1,0,0,0,1,1,1,1,1,1,0,0,0,1,0,0,0,0,0,0,1,1,0,1,1,0,1,0,0,1,0,0,0,0,1,1,1,0,1,0,1,0,0,0,0,0,1,0,0,1,0,1,1,1,1,1,0,0,1,0,1,0,1,0,1,1,0,0,0,0,1,0,0,1,0,1,1,0,0,0,1,1,1,0,1,0,0,0,1,0,0,1,1,1,0,0,0,0,1,1,0,0,1,0,0,0,1,1,1,0,0
第B组:1,3,2,0,1,0,0,3,0,0,2,3,1,3,0,1,2,2,1,2,1,2,2,3,3,3,1,2,2,2,1,3,3,3,0,2,2,3,3,3,2,1,3,3,3,0,0,1,2,0,2,2,2,0,1,0,0,3,1,1,2,2,2,2,3,2,2,1,2,1,0,0,0,3,0,3,2,1,1,1,0,1,1,2,3,1,3,3,1,0,1,2,2,2,1,0,1,2,1,0,1,2,2,2,1,3,1,0,3,3,3,0,0,3,3,2,2,0,1,3,2,3,2,2,0,3,1,1,1,0,0,0,3,2,3,2,3,0,0,0,0,1,3,2,2,3,0,2,1,2,3,0,3,3,3,0,3,2,3,2,3,0,1,3,1,1,3,2,1,1,2,2,2,1,1,3,2,1,3,3,3,3,1,2,0,2,2,0,3,1,3,2,0,2,0,3,1,2,0,1,3,1,2,2,2,1,0,0,1,0,1,1,0,3,1,2,1,2,0,2,0,0,0,2,2,1,2,0,0,3,3,0,2,1,2,1,2,3,1,0,1,2,1,2,0,2,2,3,2,1,3,3,2,1,2,0
第C组:3,9,6,7,7,13,11,13,6,14,8,4,6,15,4,7,6,3,2,14,1,5,5,13,7,6,6,14,13,7,3,1,4,10,10,0,8,4,7,8,10,6,10,5,0,12,13,12,11,5,4,15,9,5,5,13,13,7,10,8,12,7,5,11,8,15,3,12,2,0,4,14,12,3,11,10,13,0,1,15,8,0,14,7,15,14,10,12,4,3,1,11,1,2,6,7,9,0,9,2,14,8,11,8,3,15,1,12,5,3,8,6,6,9,9,13,10,8,13,2,6,0,2,13,5,15,14,13,13,5,3,10,2,6,4,12,9,0,7,1,0,13,11,11,12,11,12,9,2,11,5,8,10,7,6,8,1,7,6,3,14,5,10,12,7,0,6,5,5,14,3,4,6,6,2,6,1,13,0,3,15,14,11,5,8,15,4,4,11,2,5,3,3,10,5,1,6,15,10,8,15,3,14,10,15,5,5,13,15,6,13,4,9,11,7,1,8,2,5,7,13,8,5,5,3,10,13,0,5,11,7,15,15,5,15,13,3,3,10,10,13,15,10,3,8,8,0,4,2,6,1,11,13,1,11,15
显然,A组按照8位合成一个字节,于是只需要32个字节;B组按照2位合成一个字节,需要64个字节;C组按照4位合成一个字节,需要128个字节。但是A组与第一组存在什么关系呢?
因为我通过加权概率模型将A组无损增熵得到的第一组数据,于是第一组数据可以被无损降熵为A。这也就是我说我可以将第一组数据无损压缩到32个字节,即无损压缩8倍。同理,也就说明了第二组和第三组均能无损压缩。
我可以明确的告诉大家,无论怎么穷举不同长度的数据,只要能通过无损增熵得到的随机数据,均能被无损降熵。下面给出一部分我的实验代码:
#include
#include
#include
#include
#include "WJLEntropyTransformation.h"
#ifdef WIN32
#define inline __inline
#endif // WIN32
// 每个字节中符号1的个数,方便统计
unsigned char CntOfOneSymboltemp[256]=
{
0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03,0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04,
0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04,0x02,0x03,0x03,0x04,0x03,0x04,0x04,0x05,
0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04,0x02,0x03,0x03,0x04,0x03,0x04,0x04,0x05,
0x02,0x03,0x03,0x04,0x03,0x04,0x04,0x05,0x03,0x04,0x04,0x05,0x04,0x05,0x05,0x06,
0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04,0x02,0x03,0x03,0x04,0x03,0x04,0x04,0x05,
0x02,0x03,0x03,0x04,0x03,0x04,0x04,0x05,0x03,0x04,0x04,0x05,0x04,0x05,0x05,0x06,
0x02,0x03,0x03,0x04,0x03,0x04,0x04,0x05,0x03,0x04,0x04,0x05,0x04,0x05,0x05,0x06,
0x03,0x04,0x04,0x05,0x04,0x05,0x05,0x06,0x04,0x05,0x05,0x06,0x05,0x06,0x06,0x07,
0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04,0x02,0x03,0x03,0x04,0x03,0x04,0x04,0x05,
0x02,0x03,0x03,0x04,0x03,0x04,0x04,0x05,0x03,0x04,0x04,0x05,0x04,0x05,0x05,0x06,
0x02,0x03,0x03,0x04,0x03,0x04,0x04,0x05,0x03,0x04,0x04,0x05,0x04,0x05,0x05,0x06,
0x03,0x04,0x04,0x05,0x04,0x05,0x05,0x06,0x04,0x05,0x05,0x06,0x05,0x06,0x06,0x07,
0x02,0x03,0x03,0x04,0x03,0x04,0x04,0x05,0x03,0x04,0x04,0x05,0x04,0x05,0x05,0x06,
0x03,0x04,0x04,0x05,0x04,0x05,0x05,0x06,0x04,0x05,0x05,0x06,0x05,0x06,0x06,0x07,
0x03,0x04,0x04,0x05,0x04,0x05,0x05,0x06,0x04,0x05,0x05,0x06,0x05,0x06,0x06,0x07,
0x04,0x05,0x05,0x06,0x05,0x06,0x06,0x07,0x05,0x06,0x06,0x07,0x06,0x07,0x07,0x08
};
int main(){
long cont1 = 0, cont2 = 0;
unsigned int enoutlength = 512, length = 512;
unsigned int deoutlength = 512;
int i = 0;
unsigned char *TestArray;
unsigned char *enoutArray;
unsigned char *deoutArray;
double ProbabilityofOne = 0.0;
// random datas
printf("the length is %d for random datas:\n",length);
TestArray = (unsigned char *)malloc(sizeof(unsigned char) * length);
srand(time(0));
for(i = 0; i < length; ++i) {
TestArray[i] = rand() % 4; // 这个地方可以修改成rand() % 2;rand() % 3;rand() % 16等等
printf("%d,",(unsigned char)TestArray[i]);
}
printf("\n");
printf("------------------------------------test Increase Entropy ------------------------------------\n");
enoutArray = WJLLosslessIncreaseEntropyCoding(TestArray, length, &enoutlength, &ProbabilityofOne);
for(i = 0; i < enoutlength; ++i) {
printf("%d,",(unsigned char)enoutArray[i]);
cont1 += CntOfOneSymboltemp[enoutArray[i]];
}
printf("\n");
printf("原始数据中符号1的概率:%f,",ProbabilityofOne);
printf("\n");
printf("增熵编码结果中符号1的概率:%f,",(double)cont1/ (enoutlength * 8.0));
printf("\n");
printf("\n");
printf("-------------------------------------test Drop Entropy ---------------------------------------\n");
deoutArray = WJLLosslessDropEntropyCoding(enoutArray, enoutlength, &deoutlength, &ProbabilityofOne);
for(i = 0; i < deoutlength; ++i) {
printf("%d,",(unsigned char)deoutArray[i]);
if(deoutArray[i] != TestArray[i]){
cont2 ++;
}
}
printf("\n差异个数为:%d\n",cont2);
printf("\n");
printf("\n");
system("pause");
return 0;
}
接下来,我再给几张控制台运行的图:
为了证明这一点,我又设计了实验,由随机函数生成的随机数是否能进行无损降熵。实验得出强制降熵是可以实现,降熵后再增熵时出现部分数值错误,而且降熵的越严重,增熵出现的错误也就越多。
同时,也会出现非常小的概率能使得当前的随机数能被无损降熵。下面是512个随机数形成的一个实验:
66,67,227,79,149,215,197,12,114,61,210,103,32,58,11,103,88,44,167,126,145,64,110,98,129,229,99,24,224,129,165,81,111,98,151,250,57,247,63,15,24,64,236,176,124,131,28,118,104,65,156,7,189,104,241,232,187,202,247,147,197,206,200,202,100,101,223,42,143,64,126,3,42,65,76,103,204,47,67,129,197,173,201,40,96,109,29,180,101,161,245,17,226,178,102,18,43,53,3,7,32,27,73,145,174,39,59,181,153,166,73,48,120,90,118,10,1,183,186,110,134,82,165,185,192,150,70,209,204,184,75,184,115,238,105,97,174,220,255,194,105,79,246,42,136,46,235,213,40,174,144,190,39,196,79,180,230,224,49,176,77,216,255,102,16,34,167,26,49,71,224,181,199,148,17,24,253,19,111,176,93,186,102,74,79,225,59,41,220,250,239,86,184,124,102,56,128,32,201,100,64,80,40,182,56,220,100,161,224,240,76,195,41,67,6,189,6,144,177,65,43,74,72,109,20,140,201,87,74,79,151,233,226,223,29,239,70,144,181,110,56,172,200,55,19,177,54,189,84,185,248,36,90,58,4,154,106,239,112,234,246,22,219,78,31,220,9,133,120,22,205,38,13,49,44,52,164,108,191,242,66,68,89,249,241,48,234,135,193,143,163,26,13,223,91,62,255,47,50,163,86,215,116,114,103,102,192,225,98,220,105,6,215,26,28,232,121,150,195,220,33,83,170,15,22,17,224,95,139,193,225,111,105,59,113,250,79,51,203,103,215,105,251,158,28,33,137,27,121,211,87,64,147,35,204,240,153,19,50,89,202,120,93,17,56,171,142,101,204,128,151,238,139,124,63,101,23,66,231,183,121,79,110,92,29,231,83,229,30,79,25,213,197,61,240,178,76,141,145,92,231,53,106,157,5,123,251,0,210,102,128,231,1,114,208,216,202,135,133,24,46,44,93,123,130,249,224,122,43,74,67,134,167,59,141,156,205,208,248,26,85,115,154,209,154,164,68,91,159,236,171,174,80,18,197,242,11,146,118,144,95,75,108,140,20,120,72,20,107,225,254,89,166,83,126,157,203,77,148,141,166,252,12,210,13,105,27,227,104,242,250,29,112,247,212,21,54,213,226,45,103,24,213,101,207,236,116,116,29,213,134,24
将符号1的概率强制设定为0.6进行降熵编码,得到降熵后的数据如下:
102,200,233,4,247,255,255,207,159,139,255,252,64,211,255,57,253,126,255,255,252,63,253,196,247,95,255,255,239,244,58,222,41,135,240,175,40,212,231,255,233,243,175,155,22,254,54,117,191,189,121,43,247,84,80,255,255,207,182,180,223,60,220,255,53,140,55,170,150,243,230,237,175,143,173,142,49,191,252,24,62,244,143,247,255,255,117,38,22,122,103,182,75,154,111,225,209,252,221,5,19,243,191,254,255,59,255,235,253,255,204,38,247,63,94,209,203,109,0,237,250,252,191,238,33,223,197,203,121,153,123,254,223,217,119,90,79,237,175,139,239,252,97,217,252,126,85,120,83,184,115,182,245,167,237,231,255,216,63,226,70,84,67,247,191,135,217,139,117,191,122,90,140,123,127,246,165,221,183,120,110,234,255,248,215,227,239,255,255,251,151,107,217,245,242,43,189,183,95,255,250,253,188,44,255,255,255,98,152,171,175,59,240,254,135,120,104,225,158,241,75,107,116,126,187,201,153,35,238,105,252,222,102,230,237,191,255,255,223,240,227,247,87,127,255,255,223,251,120,185,20,165,140,98,239,255,253,48,239,40,189,185,240,255,255,255,45,244,247,127,223,111,255,214,119,255,185,141,255,252,106,254,239,198,222,111,177,241,184,187,253,186,247,109,155,213,151,142,213,222,107,149,14,59,177,187,252,145,255,235,213,191,237,255,255,154,254,219,19,255,226,255,253,235,245,123,131,207,79,224,233,173,212,222,113,221,111,255,255,240,71,118,91,190,193,52,255,246,239,15,244,174,167,192,7,238,130,255,247,255,255,255,253,208,125,68,182,201,234,255,243,116,49,255,255,234,255,125,203,101,3,219,179,243,239,255,248,107,197,31,245,219,243,69,113,242,223,50,67,108,62,225,215,214,255,253,174,87,160,249,88,254,254,243,231,255,227,239,255,105,166,15,157,204,63,127,71,172,200,184,255,255,255,255,255,255,145,175,117,46,254,21,79,255,103,245,40,61,236,223,251,29,30,95,221,207,84,94,103,103,159,127,255,255,255,255,232,47,114,206,11,117,5,255,254,218,122,116,251,255,161,111,68,155,250,84,167,70,238,93,159,254,95,255,255,255,255,255,181,191,223,55,14,95,189,79,191,255,199,172,255,242
经过统计,实际降熵后符号1的概率为0.680176,不用说也知道降熵成功了。增熵时却得到如下的数据:
66,67,227,79,149,212***,165***,12,114,61,139***,103,32,58,11,101***,87***,43***,90***,126,134***,64,110,98,129,157***,96***,24,224,129,165,81,111,98,151,250,57,218***,63,15,24,64,236,176,124,131,28,118,104,65,156,7,189,95***,241,232,187,202,247,147,197,206,200,202,100,101,223,42,143,64,126,3,42,65,70***,103,204,47,67,128***,197,160***,201,40,96,109,29,180,101,161,245,17,226,177***,102,18,43,53,2***,231***,32,27,42***,145,174,39,59,181,153,165***,73,48,120,90,118,10,1,183,186,110,134,82,165,185,192,150,70,209,204,184,75,184,115,238,105,96***,174,220,255,194,105,79,246,41***,136,46,235,213,40,174,143***,190,39,196,79,180,230,224,49,176,77,216,255,101***,16,34,167,26,48***,71,224,181,199,148,17,24,207***,19,111,176,68***,169***,102,74,79,225,59,41,220,250,239,66***,184,124,102,56,127***,5***,201,100,64,80,40,182,55***,220,100,161,224,240,76,195,41,67,6,189,6,144,177,65,43,74,72,109,20,140,201,49***,3***,76***,150***,232***,226,222***,253***,219***,70,144,181,110,56,172,200,55,19,113***,54,189,84,185,248,36,90,45***,222***,154,106,239,110***,234,246,22,219,73***,31,219***,248***,133,120,22,205,38,13,49,44,52,164,108,191,242,66,68,89,249,241,48,234,135,193,143,163,26,13,223,91,61***,255,47,50,163,52***,215,116,114,103,85***,192,218***,98,220,105,6,215,26,28,232,121,150,195,220,33,83,125***,192***,22,17,224,95,139,193,225,102***,105,59,113,250,79,51,203,103,215,105,249***,155***,12***,245***,137,27,121,211,87,64,147,35,204,240,152***,242***,50,89,202,120,93,17,56,171,142,101,158***,128,151,238,137***,124,63,101,23,66,231,183,121,79,110,91***,29,231,83,211***,30,79,25,213,197,61,239***,178,76,135***,145,91***,231,53,106,157,5,123,251,0,210,102,128,230***,232***,26***,201***,167***,202,135,133,24,46,44,93,119***,130,249,224,122,43,74,67,134,167,57***,141,156,205,208,248,26,85,112***,100***,201***,154,164,68,91,159,236,171,174,80,18,197,242,11,145***,118,144,95,75,108,140,20,120,72,20,107,212***,254,27***,158***,2***,124***,157,202***,77,148,141,166,252,12,182***,13,105,24***,226***,227***,212***,50***
其中有90个字节在增熵时发生错误(如带***的数值是错误的)。
接下来,我说为什么这个实验可以撼动“信息论”。至少我能找到无穷种随机数据均能被无损降熵,那么就说明“信息论”的理论存在不完整性。
本次只是记录了我的实验发现,我会提供C的lib库供大家测试。
在我看来,这也许是一个新的研究方向,所以先把实验结果发出来。