paper:
Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution.
Feature selection method based on correlation measure and relevance redundancy analysis.
Use in conjunction with an attribute set evaluator
通过对特征集的相关性 以及 冗余分析做评价
//TODO 不明白
// m_attributeList 属性的索引, m_attributeList[2]表示待测属性集中的第2个属性在原数据中的索引位置。
// 简单起见, 可以认为m_attributeList[i] == i. 假设5个属性, index分别为0 1 2 3 4
// m_attributeMerit 属性的评价分. 假设分别为 2.1 2.3 1 1.2 0.5
// rank 评价分升序排列时的索引值。 即 4 2 3 0 1
/*
bestToWorst: 其实就是按merit从高到低排列其index
1 2.3
0 2.1
3 1.2
2 1
4 0.5
*/
/*
bestToWorst: 其实就是按merit从高到低排列其index
1 2.3
0 2.1
3 1.2
2 1
4 0.5
m_rankedFCBF[dimension.length][4]
1 2.3 -1
0 2.1 -1
3 1.2 -1
2 1 -1
4 0.5 -1
*/
FCBFElimination 就是
for(i = 0; i<dimension.length; i++)
{
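if(m_rankedFCBF[i][2] != -1) // 属性i已经被排在前面的属性淘汰
{continue;} // for循环本身会执行i++, 这里不能再写++i, 否则会跳过一个属性
m_rankedFCBF[i][2] = m_rankedFCBF[i][0]; // 属性i被选中: 第2列记为它自己的index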
for(j=i+1; j<dimension.length; j++)
{
if(m_rankedFCBF[j][2] == -1 && m_rankedFCBF[j][1] <= SUij) //属性j尚未被淘汰, 且其merit不大于属性i与属性j之间的SUij, 则置 m_rankedFCBF[j][2]=m_rankedFCBF[i][0]
{
m_rankedFCBF[j][2] = m_rankedFCBF[i][0];
m_rankedFCBF[j][3] = SUij;
}
}
}
然后保留m_rankedFCBF[i][0] == m_rankedFCBF[i][2]的属性
具体算法逻辑如下:
获取最优特征集:
// Score every candidate attribute on its own; merits are stored in the same
// order as m_attributeList.
for (i = 0; i < m_attributeList.length; i++) {
  m_attributeMerit[i] = ASEvaluator.evaluateAttribute(m_attributeList[i]);
}

// rankedAttributes() runs the FCBF elimination and hands back the surviving
// attributes as {attribute index, merit} rows.
double[][] survivors = rankedAttributes();

// Keep only the attribute index (column 0) of each surviving row.
int[] selectedIndexes = new int[m_selectedFeatures.length];
for (i = 0; i < m_selectedFeatures.length; i++) {
  selectedIndexes[i] = (int) survivors[i][0];
}
return selectedIndexes;
rankedAttributes:
public double[][] rankedAttributes () { int i, j; //m_attributeList index //m_attributeMerit 各index对应的merit //rank merit从小到大有序的index if (m_attributeList == null || m_attributeMerit == null) { throw new Exception(""); } //对属性的评价分排序,假设有n个属性. 索引排序, 得到merit从小到大的index int[] ranked = Utils.sort(m_attributeMerit); // reverse the order of the ranked indexes //bestToWorst是n*2数组 double[][] bestToWorst = new double[ranked.length][2]; for (i = ranked.length - 1, j = 0; i >= 0; i--) { bestToWorst[j++][0] = ranked[i]; //alan: means in the arrary ranked, varialbe is from ranked as from small to large } // convert the indexes to attribute indexes for (i = 0; i < bestToWorst.length; i++) { int temp = ((int)bestToWorst[i][0]); bestToWorst[i][0] = m_attributeList[temp]; //for the index bestToWorst[i][1] = m_attributeMerit[temp]; //for the value of the index } if (m_numToSelect > bestToWorst.length) { throw new Exception("More attributes requested than exist in the data"); } this.FCBFElimination(bestToWorst); if (m_numToSelect <= 0) { if (m_threshold == -Double.MAX_VALUE) { m_calculatedNumToSelect = m_selectedFeatures.length; } else { determineNumToSelectFromThreshold(m_selectedFeatures); } } return m_selectedFeatures; }
FCBFElimination:
/**
 * Walks the best-to-worst ranking and removes every attribute that is claimed
 * by a better-ranked, still-selected attribute; the survivors are stored in
 * m_selectedFeatures.
 *
 * <p>Layout of each m_rankedFCBF row:
 * <ul>
 *   <li>[0] attribute index</li>
 *   <li>[1] merit of the attribute</li>
 *   <li>[2] index of the attribute that claimed this row: -1 while unclaimed,
 *       the row's own index once it is selected</li>
 *   <li>[3] evaluator score against the claiming attribute (only written for
 *       eliminated rows)</li>
 * </ul>
 *
 * @param rankedFeatures rows of {attribute index, merit}, best merit first
 * @throws Exception propagated from the attribute set evaluator
 */
private void FCBFElimination(double[][]rankedFeatures) throws Exception {
  m_rankedFCBF = new double[m_attributeList.length][4];
  final int rowCount = m_rankedFCBF.length;
  final int[] keeper = new int[1];
  final int[] candidate = new int[1];
  final AttributeSetEvaluator pairEvaluator = (AttributeSetEvaluator) m_asEval;
  int keptCount = 0;

  // Copy the ranking in and mark every row as not yet claimed.
  for (int row = 0; row < rankedFeatures.length; row++) {
    m_rankedFCBF[row][0] = rankedFeatures[row][0];
    m_rankedFCBF[row][1] = rankedFeatures[row][1];
    m_rankedFCBF[row][2] = -1;
  }

  for (int pivot = 0; pivot < rankedFeatures.length; pivot++) {
    if (m_rankedFCBF[pivot][2] != -1) {
      continue; // already eliminated by a better-ranked attribute
    }

    // The best unclaimed attribute is kept: it claims itself.
    m_rankedFCBF[pivot][2] = m_rankedFCBF[pivot][0];
    keptCount++;

    for (int row = pivot + 1; row < rowCount; row++) {
      if (m_rankedFCBF[row][2] != -1) {
        continue;
      }

      keeper[0] = (int) m_rankedFCBF[pivot][0];
      candidate[0] = (int) m_rankedFCBF[row][0];
      // presumably the symmetrical uncertainty between the two attributes -
      // verify against the evaluator implementation
      final double pairScore = pairEvaluator.evaluateAttribute(keeper, candidate);
      final double candidateMerit = m_rankedFCBF[row][1];

      // Eliminate the candidate when its merit does not exceed the pair
      // score (values within 1E-8 count as equal).
      final boolean dominated =
          candidateMerit < pairScore || Math.abs(pairScore - candidateMerit) < 1E-8;
      if (dominated) {
        m_rankedFCBF[row][2] = m_rankedFCBF[pivot][0];
        m_rankedFCBF[row][3] = pairScore;
      }
    }
  }

  // Collect the rows that claimed themselves, preserving ranking order.
  m_selectedFeatures = new double[keptCount][2];
  int next = 0;
  for (int row = 0; row < rowCount; row++) {
    if (m_rankedFCBF[row][2] != m_rankedFCBF[row][0]) {
      continue;
    }
    m_selectedFeatures[next][0] = m_rankedFCBF[row][0];
    m_selectedFeatures[next][1] = m_rankedFCBF[row][1];
    next++;
  }
}