我的第一个C++程序,还像个C++程序的样子吧

目的:从数据库中抽取文章关键词,并统计这些关键词在哪些文章中出现、各出现多少次(算是词袋子模型吧);然后对每篇文章形成VSM模型,写成weka的数据格式,再调用weka对文章聚类。

目前“形成词袋子模型”这一块已经完毕。

其中词袋子的数据结构如下:

map<string, vector<pair<int,int>>>(键为关键词,值为若干 (文章编号, 出现次数) 对)

目前已经完成此部分的 serialize(save/load)以及 print 功能。

#include "stdafx.h"
#include<iostream>
#include<map>
#include<vector>
#include<string>
#include<iomanip>
#include<fstream>
//#include<boost/tokenizer.hpp>
using namespace std;

 

// Build the bag-of-words model.
nt ConstructMap(map < string ,vector < pair < int , int >>>& mymap)
{
    
    vector
< string >  mySplit( string  s);
    CoInitialize(NULL);
    _ConnectionPtr pConn(__uuidof(Connection));
    _RecordsetPtr pRst(__uuidof(Recordset));
    pConn
-> ConnectionString = " Provider=SQLOLEDB.1;Password=xxx;Persist Security Info=True; User ID=sa;Initial Catalog=ArticleCollection " ;
    pConn
-> Open( "" , "" , "" ,adConnectUnspecified);
    pRst
= pConn -> Execute( " select CKeyWord,ArticleId from Article order by ArticleId " ,NULL,adCmdText);
    
while ( ! pRst -> rsEOF)
    {    vector
< string > wordcollection;
        
string  keywordstr = (_bstr_t)pRst -> GetCollect( " CKeyWord " );
        
if (keywordstr != "" )
        {
                wordcollection
= mySplit(keywordstr);
                
string  tempid = (_bstr_t)pRst -> GetCollect( " ArticleId " );
                
int  articleid = atoi(tempid.c_str());
                
for (vector < string > ::iterator strit = wordcollection.begin();strit != wordcollection.end();strit ++ )
                {
                    vector
< pair < int , int >> ::iterator it;
                    
if (mymap[ * strit].empty())
                    {
                        pair
< int , int > mytemppair = make_pair(articleid, 1 );
                        mymap[
* strit].push_back(mytemppair);

                    }
                    
else
                    {
                        
for (it = mymap[ * strit].begin();it != mymap[ * strit].end();it ++ )
                        {  
                            
if (it -> first == articleid)
                            {
                                it
-> second =++ (it -> second);
                                
break ;
                            }
                    
                        }
                        
if (it == mymap[ * strit].end())
                        {
                            pair
< int , int > mytemppair = make_pair(articleid, 1 );
                            mymap[
* strit].push_back(mytemppair);
                        }

                    }

            }
            

        }
        
        
        pRst
-> MoveNext();
        wordcollection.clear();
    }
    pRst
-> Close();
    pConn
-> Close();
    pRst.Release();
    pConn.Release();
    CoUninitialize();
    
return   0 ;

}

 

 

// Load the bag-of-words model.
// Loads the bag-of-words model from the dictionary file into `mymap`.
//
// Expected text layout (as produced by save()):
//   <number of words>
//   <word>
//   <number of (article, count) pairs>
//   <articleId> <count> ; <articleId> <count> ; ...
//   ... repeated for every word ...
//
// @param mymap  Map that receives the entries. A word already present in the
//               map has its list replaced by the one read from the file.
//
// If the file cannot be opened the map is left untouched. Reading stops at the
// first record that cannot be parsed completely; such a partial record is not
// stored. Words are read with operator>>, so a word containing whitespace
// will not be read back correctly.
void load(std::map<std::string, std::vector<std::pair<int, int> > >& mymap)
{
    std::ifstream infile(" c:\\mydict.dat ", std::ios::binary);
    if (!infile)
    {
        // A stream that failed to open never reports eof(), so looping on
        // eof() would spin forever; bail out explicitly instead.
        return;
    }

    int lenMyMap = 0;  // dictionary size header; consumed but not otherwise needed
    if (!(infile >> lenMyMap))
    {
        return;
    }

    std::string key;    // word being read
    int lenVector = 0;  // number of articles the word occurs in

    // Loop on the extraction itself: the whitespace that follows the last
    // record means eof() is still false after the final successful read, so
    // an eof()-driven loop would run once more with stale values.
    while (infile >> key >> lenVector)
    {
        std::vector<std::pair<int, int> > temp;
        bool complete = true;

        for (int i = 0; i < lenVector; ++i)
        {
            int articleId = 0;      // article number
            int count = 0;          // occurrences of the word in that article
            std::string semicolon;  // the ";" separator written after each pair
            if (!(infile >> articleId >> count >> semicolon))
            {
                complete = false;
                break;
            }
            temp.push_back(std::make_pair(articleId, count));
        }

        if (!complete)
        {
            break;  // truncated record: do not store half-read data
        }
        mymap[key] = temp;
    }
}

 

// Save the bag-of-words model.
// Saves the bag-of-words model to the dictionary file as text.
//
// Layout written:
//   <number of words>
//   then, for every word in map order:
//     <word>
//     <number of (article, count) pairs>
//     <articleId>   <count>    ;    ... one group per pair, all on one line
//
// @param mymap  Model to write: word -> list of (articleId, count).
//
// If the file cannot be opened nothing is written. Lines end with '\n'
// rather than std::endl so the file is not flushed after every line; the
// bytes written are the same.
void save(std::map<std::string, std::vector<std::pair<int, int> > >& mymap)
{
    std::ofstream outfile(" c:\\mydict.dat ", std::ios::binary);
    if (!outfile)
    {
        return;
    }

    outfile << mymap.size() << '\n';

    for (const auto& entry : mymap)
    {
        outfile << entry.first << '\n';
        outfile << entry.second.size() << '\n';

        for (const auto& posting : entry.second)
        {
            outfile << posting.first << "   " << posting.second << "   " << " ; " << "   ";
        }
        outfile << '\n';
    }
    // The stream's destructor flushes and closes the file.
}
// Print the bag-of-words model.
// Prints the bag-of-words model to standard output.
//
// Output: the number of words, then for every word in map order the word, the
// number of (article, count) pairs, and the pairs themselves on one line as
// "<articleId> , <count> ; ".
//
// @param mymap  Model to print: word -> list of (articleId, count).
void print(std::map<std::string, std::vector<std::pair<int, int> > >& mymap)
{
    std::cout << mymap.size() << '\n';

    for (const auto& entry : mymap)
    {
        std::cout << entry.first << '\n';
        std::cout << entry.second.size() << '\n';

        for (const auto& posting : entry.second)
        {
            // The separator must be a string literal: ' , ' in single quotes is
            // a multi-character literal of type int and would print a number.
            std::cout << posting.first << " , " << posting.second << " ; ";
        }
        std::cout << '\n';
    }
}

 

 

 

你可能感兴趣的:(C++)