C++处理reuters21578(二)

作者finallyliuyu 出处博客园

 通过《C++处理reuters21578(一)》中的代码,我们已初步生成了两张数据表,分别存放训练语料库和测试语料库。由于这两个语料库中有个别类别并不一致,需要先求出二者类别的交集,再据此筛选出最终用于文本分类的训练语料库和测试语料库。下面的主函数完成这一功能(示例代码处理的是测试语料表 ReteursTest,结果写入 ReteursTestingCorpus)。

 

 

// Unary predicate for STL algorithms such as count_if / find_if: it holds one
// reference string and reports whether a candidate element is equal to it.
class GT_clss
{
public:
    // Keeps a private copy of `s`, so the predicate does not depend on the
    // lifetime of the argument. The parameter is a const reference so that
    // temporaries and const strings can be used as well as plain lvalues.
    GT_clss(const std::string& s) : comparepart(s) {}

    // Returns true when `elem` is equal to the stored string.
    // Declared const because comparing does not modify the predicate; this
    // also allows calling it through a const object.
    bool operator()(const std::string& elem) const
    {
        return elem == comparepart;
    }

private:
    std::string comparepart;  // value every element is compared against
};

 

 

 

C++处理reuters21578(二):获取指定数据表中共有多少个不同的类别
vector < string > GetLabels( string  tablename)
 {   vector
< string > labels;
 
char   *  selectbySpecificId = new   char  [ 1000 ];
 memset(selectbySpecificId,
0 , 1000 );
 sprintf_s(selectbySpecificId,
1000 , " select Categorization from %s  " ,tablename.c_str());
 CoInitialize(NULL);
 _ConnectionPtr pConn(__uuidof(Connection));
 _RecordsetPtr pRst(__uuidof(Recordset));
 pConn
-> ConnectionString = " Provider=SQLOLEDB.1;Password=finally;Persist Security Info=True; User ID=sa;Initial Catalog=NewsInfo " ;
 pConn
-> Open( "" , "" , "" ,adConnectUnspecified);
 pRst
= pConn -> Execute(selectbySpecificId,NULL,adCmdText);
 
while ( ! pRst -> rsEOF)
 {
     
string  label = (_bstr_t)pRst -> GetCollect( " Categorization " );
     
if  ( ! count_if(labels.begin(),labels.end(),GT_clss(label)))
     {
         labels.push_back(label);
     }

     pRst
-> MoveNext();

 }
 pRst
-> Close();
 pConn
-> Close();
 pRst.Release();
 pConn.Release();
 CoUninitialize();
 delete []selectbySpecificId;

 
return  labels;



 }

 

 

 

C++处理reuters21578(二):主函数(求训练集与测试集类别标签的交集,并据此生成测试语料库)
int  _tmain( int  argc, _TCHAR *  argv[])
{
     
int  end;
    
// set<string>labels;
    vector < string > labelsTrain = GetLabels( " ReteursTrain " );
    vector
< string > labelsTest = GetLabels( " ReteursTest " );
    vector
< string > finalLabels;
    
for  (vector < string > ::iterator it = labelsTrain.begin();it != labelsTrain.end();it ++ )
    {
        trim(
* it, "   " );
    }
    
for (vector < string > ::iterator it = labelsTest.begin();it != labelsTest.end();it ++ )
    {
        trim(
* it, "   " );

    }
    
    
for  (vector < string > ::iterator it = labelsTrain.begin();it != labelsTrain.end();it ++ )
    {
        
if  (count_if(labelsTest.begin(),labelsTest.end(),GT_clss( * it)))
        {
            finalLabels.push_back(
* it);
        }
    }

    
char   *  selectbySpecificId = new   char  [ 1000 ];
    memset(selectbySpecificId,
0 , 1000 );
    sprintf_s(selectbySpecificId,
1000 , " select CArticleName,CAbstract,Categorization from ReteursTest " );
    CoInitialize(NULL);
    _ConnectionPtr pConn(__uuidof(Connection));
    _RecordsetPtr pRst(__uuidof(Recordset));
    _ConnectionPtr pConn2(__uuidof(Connection));
    pConn
-> ConnectionString = " Provider=SQLOLEDB.1;Password=finally;Persist Security Info=True; User ID=sa;Initial Catalog=NewsInfo " ;
    pConn2
-> ConnectionString = " Provider=SQLOLEDB.1;Password=finally;Persist Security Info=True; User ID=sa;Initial Catalog=FinallyCorpus " ;
    pConn
-> Open( "" , "" , "" ,adConnectUnspecified);
    pConn2
-> Open( "" , "" , "" ,adConnectUnspecified);
    pRst
= pConn -> Execute(selectbySpecificId,NULL,adCmdText);
    
while ( ! pRst -> rsEOF)
    {
        
string  label = (_bstr_t)pRst -> GetCollect( " Categorization " );
        trim(label,
"   " );

        
if  (count_if(finalLabels.begin(),finalLabels.end(),GT_clss(label)))
        {
            
string  ArticleTitle = (_bstr_t)pRst -> GetCollect( " CArticleName " );
            
string  ArticleText = (_bstr_t)pRst -> GetCollect( " CAbstract " );
            ArticleTitle
= ProcessforMSSQL(ArticleTitle);
            ArticleText
= ProcessforMSSQL(ArticleText);
            
char   * sqlInsert = new   char [ 1000000 ];
            _variant_t RecordsAffected;
            memset(sqlInsert,
0 , 1000000 );
            sprintf_s(sqlInsert,
1000000 , " insert into ReteursTestingCorpus(CArticleName,CAbstract,Categorization) values('%s','%s','%s') " ,ArticleTitle.c_str(),ArticleText.c_str(),label.c_str());
            pConn2
-> Execute(sqlInsert, & RecordsAffected, - 1 );
            delete []sqlInsert;

            


            
        }
        

        pRst
-> MoveNext();

    }
    pRst
-> Close();
    pConn
-> Close();
    pRst.Release();
    pConn.Release();
    pConn2
-> Close();
    pConn2.Release();
    CoUninitialize();
    delete []selectbySpecificId;

    
    cout
<< " 两标签集交集为 " << endl;

    cout
<< finalLabels.size() << endl;

    
// DictionaryToDataBase();
    
    
// FindFile(L"E:\\新闻语料\\reuters21578");
    

    
// pRst=pConn->Execute(,NULL,adCmdText);


    
   cout
<< " finish " << endl;
    
    
    cin
>> end;






}

 

 

你可能感兴趣的:(C++)