一、说明。
所谓决策表,类似于关系数据库的二维数据表,形如:
4 3 0
1 0 1
8 1 0
1 2 0
1 2 1
7 3 1
7 4 0
排序后输出:
1 0 1
1 2 0
1 2 1
4 3 0
7 3 1
7 4 0
8 1 0
二、问题由来。
决策表约简是粗糙集的一个经典问题。
关于如何解释粗糙集约简问题,我有一个很简单的解释,不过不会在这里写出。
简而言之,约简就是在保持原有数据集分类能力的前提下删除冗余属性。
粗糙集的创始者Pawlak有着一个近乎偏执的理念:知识就是分类。
完成分类是进一步完成粗糙集约简的基础。
所以针对如何分类就有了各种各样的解法。
蛮力算法就是两两比较,完成分类,这个复杂度很高。
在这种情况下,先排序再分类是一个进步的方法。
当然排序的方法也很多,基数排序、快速排序都是排序,也的确都有人进行过尝试。
我这里的这个排序方法来自于《计算机学报》上的一篇《属性序下的快速约简算法》。
文章的作者当时发了两篇文章,这篇约简的文章建立在另外一篇《二维表快速排序的复杂度分析》之上。
三、实现代码,只是想重复这个实验,然后用我的方法与此相比较。
1 #include <stdlib.h> 2 #include <string.h> 3 #include <stdio.h> 4 #include <math.h> 5 #include <time.h> 6 #include <windows.h> 7 #include "decTable.h" 8 9 const int AttOrderTerminator = -1; 10 11 typedef struct tagConditionClass{ 12 int deputyRowNO; // start index NO. in tblIdx. 13 int terminalRowNO; // terminal index NO. in tblIdx. 14 bool available; // if cdncls has >= 2 decision value, it is not available. 15 }ConditionClass; 16 17 struct tagDecisionTableEX{ 18 DecisionTable * table; 19 int * tblIdx; 20 int from; 21 int to; 22 }; 23 typedef tagDecisionTableEX DecisionTableEX; 24 25 int partition(DecisionTable * table, int * tblIdx, int stage, int low, int high); 26 int TDQuicksort(DecisionTable * table, int * tblIdx, int stage, int low, int high); 27 int loadDecisionTablePositiveRegion(DecisionTable * table, int * tblIdx, bool * tblPositiveRegion); 28 int partitionMatrix(DecisionTableEX * tex, int * attOrder, int stage, int * nonEmptyLabel, bool * tblPositiveRegion); 29 int attOrderReduction(); 30 31 int partition(DecisionTable * table, int * tblIdx, int stage, int low, int high){ 32 TableElement * s = table->dataCenter; 33 int ext = table->extCdnAttribCount; 34 int t; 35 36 int mid = low; 37 int hiEnd = mid+1; 38 int counter = 0; 39 for(int i=low+1; i<=high; i++){ 40 int ref = s[tblIdx[low] * ext + stage]; 41 int element = s[tblIdx[i] * ext + stage]; 42 if (element < ref){ 43 mid++; 44 hiEnd++; 45 t = tblIdx[mid]; 46 tblIdx[mid] = tblIdx[i]; 47 tblIdx[i] = t; 48 } 49 if(element == ref){ 50 t = tblIdx[i]; 51 tblIdx[i] = tblIdx[hiEnd]; 52 tblIdx[hiEnd] = t; 53 hiEnd++; 54 counter++; 55 } 56 } 57 58 t = tblIdx[low]; 59 tblIdx[low] = tblIdx[mid]; 60 tblIdx[mid] = t; 61 62 if (mid == low) return mid + counter; 63 64 return mid-1; 65 } 66 67 int TDQuicksort(DecisionTable * table, int * tblIdx, int stage, int low, int high){ 68 TableElement * s = table->dataCenter; 69 int ext = table->extCdnAttribCount; 70 71 if (stage > table->cdnAttributeCount) return 0; 72 if (low 
>= high) return 0; 73 74 bool NextDemension = false; 75 for (int i=low+1; i<=high; i++) 76 if ( s[tblIdx[i] * ext + stage] != s[tblIdx[low] * ext + stage]){ 77 NextDemension = true; 78 break; 79 } 80 81 if (NextDemension){ 82 int mid = partition(table, tblIdx, stage, low, high); 83 TDQuicksort(table, tblIdx, stage, low, mid); 84 TDQuicksort(table, tblIdx, stage, mid+1, high); 85 } 86 87 if (!NextDemension){ 88 TDQuicksort(table, tblIdx, stage+1, low, high); 89 } 90 91 return 0; 92 } 93 94 int loadDecisionTablePositiveRegion(DecisionTable * table, int * tblIdx, bool * tblPositiveRegion){ 95 int CdnEquClsNO = table->elementCount+2; 96 int cdnClsPointer = 0; 97 98 int cdn = table->cdnAttributeCount; 99 int ext = table->extCdnAttribCount; 100 int tfsi = table->elementCount; 101 102 ConditionClass * cdnCls = NULL; 103 HANDLE heap = NULL; 104 105 int cc = table->cdnCmp; 106 107 heap = HeapCreate(HEAP_NO_SERIALIZE|HEAP_GENERATE_EXCEPTIONS, 1024*1024, 0); 108 if (heap != NULL){ 109 cdnCls = (ConditionClass * )HeapAlloc(heap, 0, CdnEquClsNO * sizeof(ConditionClass)); 110 } 111 MakeSure(cdnCls != NULL); 112 SecureZeroMemory(cdnCls, CdnEquClsNO * sizeof(ConditionClass)); 113 114 int from = 0; 115 while(from < tfsi){ 116 int duplicate = 0; 117 int i=from; 118 BigInt * src64 = (BigInt *)(table->dataCenter + tblIdx[from] * ext); 119 120 cdnCls[cdnClsPointer].deputyRowNO = from; // index NO. 
in tblIdx 121 cdnCls[cdnClsPointer].available = true; 122 123 // while (Line[from] == Line[i]) {...} 124 while (true){ 125 bool bird = true; 126 if (i == tfsi) break; 127 BigInt * dst64 = (BigInt *)(table->dataCenter + tblIdx[i] * ext); 128 for(int m=0; m<cc; m++) 129 if(src64[m]^dst64[m]){ 130 bird = false; 131 break; 132 } 133 if (!bird) break; 134 135 if (cdnCls[cdnClsPointer].available == true) 136 if (table->dcnElement[tblIdx[i]] != table->dcnElement[tblIdx[from]]){ 137 cdnCls[cdnClsPointer].available = false; 138 } 139 140 i++; 141 duplicate++; 142 } 143 144 from += duplicate; 145 cdnCls[cdnClsPointer].terminalRowNO = from - 1; 146 cdnClsPointer++; 147 } 148 149 for (int i=0; i<cdnClsPointer; i++){ 150 int start = cdnCls[i].deputyRowNO; 151 int terminal = cdnCls[i].terminalRowNO; 152 if (cdnCls[i].available){ 153 for (int m=start; m<=terminal; m++) tblPositiveRegion[tblIdx[m]] = true; 154 } 155 if (!cdnCls[i].available){ 156 for (int m=start; m<=terminal; m++) tblPositiveRegion[tblIdx[m]] = false; 157 } 158 } 159 160 HeapFree(heap, 0, cdnCls); 161 HeapDestroy(heap); 162 163 return 0; 164 } 165 166 int partitionMatrix(DecisionTableEX * tex, int * attOrder, int stage, int * nonEmptyLabel, bool * tblPositiveRegion){ 167 DecisionTable * table = tex->table; 168 int * tblIdx = tex->tblIdx; 169 int from = tex->from; 170 int to = tex->to; 171 172 while (attOrder[stage] != AttOrderTerminator){ 173 if (from >= to) return 0; 174 175 bool noPRelement = true; 176 for (int i=from; i<=to; i++) 177 if (tblPositiveRegion[tblIdx[i]]){ 178 noPRelement = false; 179 break; 180 } 181 if(noPRelement) return 0; 182 183 bool cannotDistinguishInStage = true; 184 int ext = table->extCdnAttribCount; 185 TableElement * s = table->dataCenter; 186 for (int i=from; i<=to; i++) 187 if (s[tblIdx[i] * ext + attOrder[stage]] != s[tblIdx[from] * ext + attOrder[stage]]){ 188 cannotDistinguishInStage = false; 189 break; 190 } 191 if (cannotDistinguishInStage) partitionMatrix(tex, attOrder, 
stage+1, nonEmptyLabel, tblPositiveRegion); 192 193 if (cannotDistinguishInStage == false){ 194 nonEmptyLabel[stage] = 1; 195 int sum = 0; 196 double avg = 0; 197 for (int i=from; i<=to; i++) sum += s[tblIdx[i] * ext + attOrder[stage]]; 198 avg = ((double)sum) / (to - from + 1); 199 int mid = from -1; 200 for (int i=from; i<=to; i++){ 201 if (s[tblIdx[i] * ext + attOrder[stage]] <= avg){ 202 mid++; 203 int t = tblIdx[mid]; 204 tblIdx[mid] = tblIdx[i]; 205 tblIdx[i] = t; 206 } 207 } 208 //if (mid==from || mid==to) __debugbreak(); 209 //int mid = partition(table, tblIdx, attOrder[stage], from, to); 210 211 tex->from = from; 212 tex->to = mid; 213 partitionMatrix(tex, attOrder, stage, nonEmptyLabel, tblPositiveRegion); 214 215 tex->from = mid+1; 216 tex->to = to; 217 partitionMatrix(tex, attOrder, stage, nonEmptyLabel, tblPositiveRegion); 218 } 219 220 //printf("%d stage completed.\n", stage); 221 stage++; 222 } 223 224 return 0; 225 } 226 227 int attOrderReduction(){ 228 DecisionTable table; 229 time_t timeBegin; 230 time_t timeEnd; 231 char fileName[MAX_STR]; 232 233 beginDecisionTable(&table); 234 235 printf("\nInput data file name : "); 236 scanf_s("%s", fileName, MAX_STR); 237 strcat_s(fileName, MAX_STR, ".txt"); 238 timeBegin = clock(); 239 fillTableWithFile(&table, fileName); 240 timeEnd = clock(); 241 printf("\n%f(s) consumed in reading from file", (double)(timeEnd-timeBegin)/CLOCKS_PER_SEC); 242 printf("\n"); 243 244 //reduction main 245 timeBegin = clock(); 246 int * tblIdx = (int *)malloc(table.elementCount * sizeof(int)); 247 bool * tblPositiveRegion = (bool *)malloc(table.elementCount * sizeof(bool)); 248 for (int i=0; i<table.elementCount; i++) tblIdx[i]=i; 249 250 TDQuicksort(&table, tblIdx, 0, 0, table.elementCount-1); 251 //TDQuicksort test code 252 //FILE * fp; 253 //fopen_s(&fp, "r8.txt", "w+"); 254 //for (int i=0; i<table.elementCount; i++){ 255 // for (int m=0; m<table.cdnAttributeCount; m++) 256 // fprintf(fp, "%8d", table.dataCenter[ 
tblIdx[i]*table.extCdnAttribCount + m ]); 257 // fprintf(fp, "\n"); 258 //} 259 //fclose(fp); 260 261 return 0; 262 } 263 264 int main(){ 265 attOrderReduction(); 266 267 return 0; 268 }
四、实验结果摘录
所有实验数据来自UCI数据库。
实验机器: i3 2100 + 4G + win7 32bit
VS 2012 32bit Release Mode
Forest CoverType
581012 条数据,每条数据 54 个条件属性。
0.234s
Poker Hand
1025010 条数据,每条数据 10 个条件属性。
0.359s