Candidate-Elimination算法是数据挖掘中的一种概念学习算法,部分解决Find-S的不足,可以输出所有与训练样本一致的概念,同时利用概念间偏序关系来指导搜索,其伪代码描述如下
Initialize Gto the set of most-general hypotheses in H Initialize Sto the set of most-specific hypotheses in H For each training example, d, do: If dis a positive example then: Remove from Gany hypotheses that do not match d For each hypothesis sin Sthat does not match d Remove sfrom S Add to S all minimal generalizations, h, of ssuch that: 1) h matches d 2) some member of Gis more general than h Remove fromSany hthat is more general than another hypothesis in S If dis a negative example then: Remove from Sany hypotheses that match d For each hypothesis gin Gthat matches d Remove gfrom G Add to G all minimal specializations, h, of gsuch that: 1) hdoes not match d 2) some member of Sis more specific than h Remove fromG any hthat is more specific than another hypothesis in G
【概念挖掘需求】
基本的算法思想是,泛化边界初始化为全?,特化边界特化为全空集(@),
1、若遇到正实例,首先从G中删除不包含该实例的概念,然后对S删除与实例不相符的概念,同时做最小泛化,使其包含该实例
2、若遇到负实例,首先从S中删除包含了该实例的概念,然后对G删除包含了该实例的概念,同时做最小特化,注意最小特化集是与当前S一一枚举出可能的特化概念,删除那些符合该实例的概念
C++源码
#include <iostream> #include <string> #include <vector> using namespace std; #define MAXLEN 7//输入每行的数据个数 //每行为标号 outlook Temperature Humidity Wind PlayTennis Swiming //采用二维动态数组Vector保存处理 vector <vector <string> > state;//实例集 vector <vector <string> > S;//特化边界 vector <vector <string> > G;//泛化边界 vector <string> item(MAXLEN);//对应一行实例集 string yes("Yes"); string all("?"); string white("@");//@表示空集 string end("end");//输入结束 //判定概念h是否包含实例d bool checkInclude(vector <string> h, vector <string> d){ bool res = true; for(int i = 1; i < MAXLEN-1; i++){ if(!h[i].compare(all) || !h[i].compare(d[i])) continue; else if(!h[i].compare(white) || h[i].compare(d[i])) res = false; else { cerr<<"error in checkInclude"<<endl; res = false; } } return res; } //对概念S[offset]做最小泛化,使得S[offset]包含d void doMinGeneral(int offset, vector <string> d){ for(int i = 1; i < MAXLEN-1; i++){ if(!S[offset][i].compare(white)) { S[offset][i] = d[i]; } else if(S[offset][i].compare(d[i])) S[offset][i] = all; else continue;//包括相等和h[i]为all的情况 } } //对概念G[offset]做最小特化,使得G[offset]包不含d,注意最小特化来自于和S的组合 void doMinSpecific(int offset, vector <string> d, vector <string> s){ for(int i = 1; i < MAXLEN-1; i++){ if(G[offset][i].compare(s[i]) && !G[offset][i].compare(all)){ if(s[i].compare(d[i])){ //产生新的泛化边界,注意不唯一 vector<string> temp (G[offset]);//先拷贝一份G[offset] temp[i] = s[i]; G.push_back(temp); } } if(G[offset][i].compare(s[i]) && G[offset][i].compare(all)){ cerr<<"doMinSpecific: error in data"<<endl;//G[offset][i]不为?且其与s[i]不同的情况 } } } void input(){ int i, j; string s; while(cin>>s,s.compare(end) != 0){//-1为输入结束 item[0] = s; for( i = 1;i < MAXLEN; i++){ cin>>item[i]; } state.push_back(item); } //初始化 for( i = 0;i < MAXLEN; i++){ item[i] = white; } S.push_back(item); for( i = 0;i < MAXLEN; i++){ item[i] = all; } G.push_back(item); } void output(){ int i,j; cout<<"The specific boundary is:"<<endl; for(i = 0; i < S.size(); i++){ for(j = 1;j < MAXLEN-1; j++){ cout<<S[i][j]<<" "; } cout<<endl; } cout<<endl; cout<<"The general boundary is:"<<endl; for(i = 0; i < G.size(); i++){ for(j = 1;j < MAXLEN-1; j++){ cout<<G[i][j]<<" "; } cout<<endl; } cout<<endl; } void solve(){ int i,j; for( i = 0; i < state.size(); i++){ if(!state[i][MAXLEN-1].compare(yes)){ //正实例的情况,首先从G中删除不包含该实例的概念, //然后对S删除与实例不相符的概念,同时做最小泛化,使其包含该实例 for(j = 0; j < G.size(); j++){ if(!checkInclude(G[j],state[i])) { G.erase(G.begin() + j); } } for(j = 0; j < S.size(); j++){ doMinGeneral(j,state[i]); } } else {//负实例的情况,首先从S中删除包含了该实例的概念, //然后对G做最小特化,与当前S一一枚举出可能的特化概念,删除那些符合该实例的概念 for(j = 0; j < S.size(); j++){ if(checkInclude(S[j],state[i])){ S.erase(S.begin() + j); } } int orginGSize = G.size();//先记下G原始大小 for(j = 0; j < orginGSize; j++){ doMinSpecific(j,state[i],S[0]); } G.erase(G.begin(), G.begin() + (orginGSize));//删除G[0]到G[orginGSize-1],但是要注意括号内的参数不同 } } } int main(){ input(); solve(); output(); return 0; }
测试数据一
0 Sunny Warm High Strong Warm Yes 1 Sunny Warm Normal Strong Warm No 2 Rainy Cold High Strong Warm No 3 Sunny Warm High Strong Cool Yes end
测试结果一
The specific boundary is: Sunny Warm High Strong ? The general boundary is: Sunny ? High ? ? ? Warm High ? ?
0 Sunny Warm High Strong Warm Yes 1 Sunny Warm Normal Strong Warm No 2 Rainy Cold High Strong Warm No 3 Sunny Warm High Strong Cool Yes end测试结果二
The specific boundary is: Sunny Warm High Strong ? The general boundary is: Sunny ? High ? ? ? Warm High ? ?
具体预测的方法是
1、如果给定的实例符合概念集合S,则一定去游泳
2、如果给定的实例不符合概念集合G,则一定不去游泳
3、如果给定的实例不符合概念集合S,但是符合概念集合G,则可能去游泳