C++ 中文字符分割

2020-01-21  本文已影响0人  思君颜如玉

以下代码假定输入字符串为 UTF-8 编码：
/**
 * Split a UTF-8 encoded string into a list of its characters.
 *
 * The input is walked one encoded character at a time. The length of each
 * character is taken from the number of leading 1 bits in its first byte
 * (0 leading ones means a single-byte ASCII character).
 *
 * Rules applied to every character, in this order:
 *  - If the character is a decimal digit and the last token collected so far
 *    consists only of digits, the digit is appended to that token, so a run
 *    of digits becomes one number token.
 *  - Otherwise, if an identical token is already in the result, the character
 *    is skipped.
 *  - Otherwise the character is appended as a new token.
 *
 * Because the duplicate check compares the single character against existing
 * tokens, a multi-digit number can appear more than once in the result.
 *
 * @param t_str  Text to split; expected to be UTF-8 encoded.
 * @return Tokens in order of first appearance.
 */
vector<string> Similarity::s2v(string t_str)
{
    // The backslash has to be doubled: "\d" is not a valid C++ escape
    // sequence, and compilers that accept it turn the pattern into "d+".
    static const boost::regex re("\\d+");

    vector<string> wanted;
    string::size_type i = 0;
    while (i < t_str.length()) {
        // Work on the unsigned value so bytes >= 0x80 are not sign-extended.
        const unsigned char c = static_cast<unsigned char>(t_str[i]);

        // Count the leading 1 bits of the first byte.
        string::size_type head = 0;
        while (head < 8 && (c & (0x80u >> head)) != 0) {
            ++head;
        }
        if (head == 0) {
            head = 1; // plain ASCII: one byte
        }

        const string candiate = t_str.substr(i, head);
        // Advance past the whole character before any `continue` below, so a
        // skipped multi-byte character is never re-entered in the middle.
        i += head;

        if (!wanted.empty()) {
            // Extend the previous token when both it and the candidate are numeric.
            if (boost::regex_match(candiate, re) && boost::regex_match(wanted.back(), re)) {
                wanted.back() += candiate;
                continue;
            }

            // Skip characters that were already collected.
            bool repeat = false;
            for (const auto& item : wanted) {
                if (item == candiate) {
                    repeat = true;
                    break;
                }
            }
            if (repeat) {
                continue;
            }
        }
        wanted.push_back(candiate);
    }
    return wanted;
}

上一篇 下一篇

猜你喜欢

热点阅读