C++ 中文字符分割(UTF-8)

以下代码假定输入字符串采用 UTF-8 编码:
vector Similarity::s2v(string t_str)
{
boost::regex re("\d+");
//setup converter
vector wanted;
for(int i=0; i

    char c = t_str[i];
    unsigned short b = 0x80;
    int head = 0;
    while((c & (b>>head)) != 0){
        head += 1;
    }
    if(head == 0)head = 1;
    string candiate = t_str.substr(i, head);
    //is number
    if(!wanted.empty()){
        if(boost::regex_match(candiate, re) && boost::regex_match(*(wanted.end()-1), re)) {
            *(wanted.end() - 1) = *(wanted.end() - 1) + candiate;
            continue;
        }
        bool repeat = 0;
        for(auto item : wanted){
            if(item == candiate){
                repeat = 1;
                break;
            }
        }
        if(repeat)
            continue;
    }
    wanted.push_back(candiate);
    i+=head-1;
}
return wanted;

}

你可能感兴趣的:(c++ 中文字符分割)