按类别DF特征词选择算法

声明:

 

按类别DF特征词选择算法声明
vector < pair < string , double >   > LocalDFFeatureSelectionForPerclass(DICTIONARY &  mymap,CONTINGENCY &  contingencyTable, string  classLabel); // Local DF method: scores every dictionary word for one class and returns the words sorted for that class
         void  DFFeatureSelection(vector < string >  classLabels,DICTIONARY  & mymap,CONTINGENCY &  contingencyTable, int  N, char   * address); // Driver: calls the per-class local DF selection for each label and writes the chosen feature words to the file at address

 

函数实现:

 

对词典中的每个词,统计其在某一个类别中出现的次数,并按该次数从大到小排序
/* ********************************************************************** */
/*   按类别的DF特征词选择法                                                */
/* ********************************************************************** */
vector
< pair < string , double >   >  Preprocess::LocalDFFeatureSelectionForPerclass(DICTIONARY &  mymap,CONTINGENCY &  contingencyTable , string  classLabel)
{
    
// int finalKeyWordsCount=0; // 计算共取了多少个关键词
    clock_t start,finish;
    
double  totaltime;
    start
= clock();
    vector
< pair < string , double >   > DFinfo;
    
for (map < string ,vector < pair < int , int >>> ::iterator it = mymap.begin();it != mymap.end(); ++ it)
    {
        
        pair
< string , string > compoundKey = make_pair(it -> first,classLabel);
        
double  classCount = ( double )contingencyTable[compoundKey].first;
        DFinfo.push_back(make_pair(it
-> first,classCount));
        
    }

    stable_sort(DFinfo.begin(),DFinfo.end(),isLarger);
    finish
= clock();
    totaltime
= ( double )(finish - start) / CLOCKS_PER_SEC;
    cout
<< " 为类别 " << classLabel << " 遴选特征词共用了 " << totaltime << endl;

    
return  DFinfo;


}

 

 

DF特征词选择法:

 

代码
/* ********************************************************************** */
/*  DF特征词选择法                                                                      */
/* ********************************************************************** */
void  Preprocess:: DFFeatureSelection(vector < string   > classLabels,DICTIONARY  & mymap,CONTINGENCY &  contingencyTable, int  N, char   * address)
{
    clock_t start,finish;
    
double  totaltime;
    
int  totalTraingingCorpus = endIndex - beginIndex + 1 ; // 训练语料库总共的文章数目
     set < string > finalKeywords; // 存放最终遴选出的特征词
    vector < pair < string , double >> DFInfo;
    start
= clock();
    
for (vector < string > ::iterator it = classLabels.begin();it != classLabels.end();it ++ )
    {
        
// 训练语料库中某个类别的文章数目
         int  N_subClassCnt = getCategorizationNum( * it, " TrainingCorpus " );
        
// threshold决定每个类别遴选多少个特征词
         int  threshold = N_subClassCnt * N / totalTraingingCorpus;
        DFInfo
= LocalDFFeatureSelectionForPerclass(mymap,contingencyTable, * it);
        
for (vector < pair < string , double >   > ::size_type j = 0 ;j < threshold;j ++ )
        {
            finalKeywords.insert(DFInfo[j].first);

        }
        DFInfo.clear();




    }


    ofstream outfile(address);
    
int  finalKeyWordsCount = finalKeywords.size();
    
for  ( set < string > ::iterator it = finalKeywords.begin();it != finalKeywords.end();it ++ )
    {
        outfile
<<* it << endl;

    }
    outfile.close();
    cout
<< " 最后共选择特征词 " << finalKeyWordsCount << endl;
    finish
= clock();
    totaltime
= ( double )(finish - start) / CLOCKS_PER_SEC;
    cout
<< " 遴选特征词共有了 " << totaltime << endl;

}

 主函数调用:

 

代码
// Load the dictionary file into mymap (LoadDictionary is defined elsewhere; presumably it fills mymap -- confirm).
p.LoadDictionary(mymap, " F:\\finallyliuyu\\dict.dat " );
    // Load the contingency table file (LoadContingencyTable is defined elsewhere; presumably it fills contingenyTable -- confirm).
    p.LoadContingencyTable(contingenyTable,
" F:\\finallyliuyu\\contingency.dat " );
    // Run DF feature selection with an overall budget of N = 2000 words; the selected words are written to keywords.dat.
    p.DFFeatureSelection(labels,mymap,contingenyTable,
2000 , " F:\\finallyliuyu\\keywords.dat " );

 

 

 

你可能感兴趣的:(算法)