c++ boost 汉字和模式串混用的例子

c++ boost 汉字和模式串混用的例子
/*===============================================================

*   Copyright (C) 2013 All rights reserved.

*   

*   文件名称:StringProcess.cpp

*   创 建 者:

*   创建日期:2013年04月24日

*   描    述:

*   备    注: 

*   更新日志:

*

================================================================*/

#include <ctype.h>
#include <errno.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <iconv.h>
#include <sys/time.h>

#include <algorithm>
#include <string>
#include <vector>

#include "boost/regex.hpp"

// please add your code here!

using namespace std;

#define MAX_LINE_LENGTH 1048576

#define TAGLEN 50

/************************************************************
 * @brief <funcName:trim> Author: 刘禹 (finallyly), 2013-04-25.
 *        Removes leading and trailing whitespace from a C string, in place.
 *
 * Whitespace is whatever isspace() reports. The surviving characters are
 * shifted to the start of the buffer and re-terminated, so the result is never
 * longer than the input. A string that is empty or consists only of
 * whitespace becomes the empty string.
 *
 * @param s  Writable, NUL-terminated buffer. A null pointer is ignored.
 **********************************************************/
void trim(char *s)
{
    if (s == NULL)
    {
        return;
    }

    // isspace() is undefined for negative arguments, and the bytes of
    // multi-byte text (GBK, UTF-8) are negative when char is signed, so every
    // byte is inspected as unsigned char.
    const unsigned char *first = reinterpret_cast<const unsigned char *>(s);
    while (*first != '\0' && isspace(*first))
    {
        ++first;
    }

    // `last` points one past the final character to keep. Starting from the
    // terminator (rather than from s + len - 1) means an empty string never
    // looks at the byte in front of the buffer.
    const unsigned char *last = first + strlen(reinterpret_cast<const char *>(first));
    while (last > first && isspace(*(last - 1)))
    {
        --last;
    }

    const size_t kept = static_cast<size_t>(last - first);
    memmove(s, first, kept);   // source and destination may overlap
    s[kept] = '\0';
}



/**
 * Converts the ASCII capitals 'A'..'Z' of a C string to lower case, in place.
 * All other characters are left untouched.
 *
 * @param str  Writable, NUL-terminated string; may be null.
 * @return true when every character of the string is an ASCII letter (an
 *         empty string therefore yields true); false when str is null or the
 *         string contains any non-letter character.
 */
inline bool strTolower( char* str )
{
    if ( str == NULL )
    {
        return false;
    }

    bool lettersOnly = true;
    for ( char* cursor = str; *cursor != '\0'; ++cursor )
    {
        const char ch = *cursor;
        if ( ch >= 'A' && ch <= 'Z' )
        {
            // 32 is the distance between the upper- and lower-case ASCII rows.
            *cursor += 32;
            continue;
        }

        const bool isLower = ( ch >= 'a' && ch <= 'z' );
        lettersOnly = lettersOnly && isLower;
    }
    return lettersOnly;
}



/************************************************************
 * @brief <funcName:String2Wstring> Author: 刘禹 (finallyly)
 *        Converts a multibyte string in the system's default encoding
 *        (GBK on the author's machine) to a wide-character string.
 *
 * Side effect: every call runs setlocale(LC_CTYPE, ""), i.e. it switches the
 * process-wide LC_CTYPE category to the locale named by the environment.
 *
 * @param sToMatch  Multibyte text; only the part before the first NUL byte is
 *                  converted.
 * @return The wide-character equivalent, or an empty string when the input is
 *         empty or is not valid in the active locale's encoding.
 **********************************************************/
std::wstring String2Wstring(std::string sToMatch)
{
    // Essential: without this the conversion of non-ASCII text fails.
    setlocale( LC_CTYPE, "" );

    // Pass 1: count the wide characters needed (terminator not included).
    // mbstowcs() reports failure as (size_t)-1, so the result stays a size_t.
    const size_t wideLen = mbstowcs( NULL, sToMatch.c_str(), 0 );
    if ( wideLen == static_cast<size_t>( -1 ) || wideLen == 0 )
    {
        return std::wstring();
    }

    // Pass 2: convert straight into the string's own storage. mbstowcs() only
    // appends a terminator when the limit leaves room for one; std::wstring
    // keeps its own length, so no terminator is required here. (Limiting the
    // call to the byte count while relying on the terminator, as before, left
    // pure single-byte input unterminated.)
    std::wstring wsToMatch( wideLen, L'\0' );
    const size_t written = mbstowcs( &wsToMatch[0], sToMatch.c_str(), wideLen );
    if ( written == static_cast<size_t>( -1 ) )
    {
        return std::wstring();
    }
    wsToMatch.resize( written );
    return wsToMatch;
}

/************************************************************
 * @brief <funcName:Wstring2String> Author: 刘禹 (finallyly)
 *        Converts a wide-character string back to a multibyte string, for
 *        output.
 *
 * This function does not call setlocale() itself; the conversion is done by
 * wcstombs(), which uses whatever LC_CTYPE locale is active at the time.
 *
 * @param sToMatch  Wide text; only the part before the first L'\0' is
 *                  converted.
 * @return The multibyte equivalent, or an empty string when the input is
 *         empty or contains a character that cannot be converted.
 **********************************************************/
std::string Wstring2String(std::wstring sToMatch)
{
    // Pass 1: number of bytes needed (terminator not included). Failure is
    // reported as (size_t)-1, so the result is kept as a size_t.
    const size_t byteLen = wcstombs( NULL, sToMatch.c_str(), 0 );
    if ( byteLen == static_cast<size_t>( -1 ) || byteLen == 0 )
    {
        return std::string();
    }

    // Pass 2: convert directly into the result's storage; the string owns the
    // memory, so nothing leaks if anything below throws.
    std::string sResult( byteLen, '\0' );
    const size_t written = wcstombs( &sResult[0], sToMatch.c_str(), byteLen );
    if ( written == static_cast<size_t>( -1 ) )
    {
        return std::string();
    }
    sResult.resize( written );
    return sResult;
}

/************************************************************
 * @brief <funcName:toAnotherCode> Author: 刘禹 (finallyly)
 *        Converts a byte string from one character encoding to another with
 *        iconv.
 *
 * The conversion runs into a private scratch buffer of srclen * 5 bytes and
 * the result is then copied to deststr, so srcstr and deststr may be the same
 * buffer. Whatever was converted before an error is still delivered.
 *
 * @param toCode    iconv name of the target encoding.
 * @param fromCode  iconv name of the source encoding.
 * @param srcstr    Input bytes (not modified).
 * @param deststr   Receives the converted bytes followed by a NUL byte. The
 *                  caller must provide room for the whole result plus the
 *                  terminator (at most srclen * 5 + 1 bytes); no capacity is
 *                  passed in, so it cannot be checked here.
 * @param srclen    Number of input bytes to convert.
 * @param destlen   Output only: set to the number of bytes written to deststr,
 *                  terminator excluded. Its incoming value is never read. Left
 *                  untouched when the function fails before converting.
 * @return 0 on success; 1 when an argument is null, the encoding pair is not
 *         supported, or iconv reports a conversion error.
 *         NOTE(review): the previous code returned 0 exactly when iconv failed
 *         (or reported irreversible conversions) and 1 on a clean conversion,
 *         contradicting the "1 = error" convention of the iconv_open path.
 **********************************************************/
int toAnotherCode(const char *toCode,const char *fromCode,char *srcstr, char *deststr, size_t srclen,size_t &destlen)
{
    if (toCode == NULL || fromCode == NULL || srcstr == NULL || deststr == NULL)
    {
        return 1;
    }

    // Refuse lengths whose scratch size (srclen * 5 + 1) would overflow size_t.
    if (srclen > (static_cast<size_t>(-1) - 1) / 5)
    {
        return 1;
    }
    const size_t capacity = srclen * 5;

    // One spare byte keeps the buffer non-empty when srclen is 0. It is
    // allocated before the descriptor is opened so that an allocation failure
    // cannot leak the descriptor.
    std::vector<char> scratch(capacity + 1);

    iconv_t convertor = iconv_open(toCode, fromCode);
    if (convertor == iconv_t(-1))
    {
        fprintf(stderr,"convertor device initailization failed!\n");
        return 1;
    }

    // iconv advances the pointers and decrements the byte counters in place.
    char *input = srcstr;
    size_t inputLeft = srclen;
    char *output = &scratch[0];
    size_t outputLeft = capacity;
    const size_t rc = iconv(convertor, &input, &inputLeft, &output, &outputLeft);
    iconv_close(convertor);

    const size_t produced = capacity - outputLeft;
    memcpy(deststr, &scratch[0], produced);
    deststr[produced] = '\0';   // terminate at the real length, not at the caller's stale destlen
    destlen = produced;

    // iconv signals failure with (size_t)-1; any other value is a success.
    return (rc == static_cast<size_t>(-1)) ? 1 : 0;
}

/************************************************************
 * @brief <funcName:PrintUsage> Author: 刘禹 (finallyly), 2013-04-24.
 *        Prints the command-line synopsis to stderr.
 *
 * main() accepts exactly one argument, the input file, so the synopsis lists
 * only that (the earlier text advertised four file arguments).
 **********************************************************/
void PrintUsage()
{
    fprintf( stderr, "prog [IN]input_file\n" );
}

void testRegex()

{

    string s="刘禹,刘德华,刘佳佳。。。王大虎。。。刘长春,xixi";

    string t="刘[^刘]*?,";

    wstring p=String2Wstring(t);

    wstring ws=String2Wstring(s);

    boost::wregex wreg(p,boost::regbase::icase|boost::regex::perl);

    boost::wsmatch wm;

    vector<string> results;

    wstring::const_iterator  it=ws.begin();

    wstring::const_iterator  end=ws.end();

    while(boost::regex_search(it,end,wm,wreg))

    {

        wstring wtemp=wm[0];

        string temp=Wstring2String(wtemp);

        results.push_back(temp);

        it=wm[0].second;

    }

    fprintf(stdout,"输出正则匹配结果\n");

    for(vector<string>::iterator it=results.begin();it!=results.end();it++)

    {

            printf("%s\n",(*it).c_str());

    }

}

int LoadFile(char* inputfile)

{

    FILE *fin = NULL;

    char line[102400] = {0};

    char word[102400] = {0};

    int len = 0;

    fin = fopen(inputfile, "r");

    if (NULL == fin)

    {

        fprintf(stderr,"LoadAddress can not open inputfilename %s\n", inputfile);

        return 1;

    }

    

    while(true)

    {

        fgets(line, 102400, fin);

        if (feof(fin))

        {

            break;

        }

        len = strlen(line);

        if (0 == line[0] || '\n' != line[len - 1])

        {

            continue;

        }

        line[len - 1] = 0;

        string pattern ="首都或首府:";

        string p1="([\u2E80-\u9FFF])+";

        wstring wp1 = String2Wstring(p1);

        //wstring wpattern = L"([\u2E80-\u9FFF])+";

        wstring wpattern = L"([\u2E80-\u9FFF]+)"+String2Wstring(pattern)+L"([\u2E80-\u9FFF]+)";

        wstring winputstr = String2Wstring(line);

        boost::wregex wreg(wpattern, boost::regex::perl|boost::regbase::icase);

        boost::smatch what;

        boost::wsmatch wswhat;

        wstring::const_iterator wstrit = winputstr.begin();

        wstring::const_iterator wstrend = winputstr.end();

        while (boost::regex_search(wstrit, wstrend, wswhat, wreg))

        {

            wstring ws1 = wswhat[1];

            wstring ws2 = wswhat[2]; 

            string s1 = Wstring2String(ws1);

            string s2 = Wstring2String(ws2);

            fprintf(stdout, "%s\t%s\n", s1.c_str(), s2.c_str());

            wstrit=wswhat[0].second;  

        }

    }

    

    if (NULL != fin)

    {

        fclose(fin);

        fin = NULL;

    }

    return 0;

}

/**
 * Entry point: expects exactly one argument, the input file, runs LoadFile on
 * it and reports the elapsed wall-clock time on stderr.
 *
 * @return 0 on success; 1 when the argument count is wrong (after printing
 *         the usage text) or when LoadFile fails.
 */
int main( int argc, char *argv[] )
{
    timeval tv1, tv2;
    gettimeofday(&tv1, NULL);

    if ( 2 != argc )
    {
        PrintUsage();
        return 1;
    }

    // LoadFile has already reported the problem on stderr; pass the failure on
    // as the exit status instead of claiming success.
    if ( 0 != LoadFile(argv[1]) )
    {
        return 1;
    }

    gettimeofday(&tv2, NULL);
    fprintf(stderr,"%s has finished congratulations!\n",argv[0]);

    // Computed in double: a long microsecond product can overflow where long
    // is 32 bits, and float keeps only about seven significant digits.
    const double elapsedMs = (tv2.tv_sec - tv1.tv_sec) * 1000.0
                           + (tv2.tv_usec - tv1.tv_usec) / 1000.0;
    fprintf( stderr,"time elapsed: %.2f ms\n", elapsedMs);
    return 0;
}
View Code

 

你可能感兴趣的:(boost)