C++ 实现删除 UTF-8 字符串中的多个字符

2021-11-14  本文已影响0人  一路向后

1.UTF-8.h

// Include guard. The name avoids a leading underscore followed by an
// uppercase letter, because such identifiers are reserved for the
// implementation.
#ifndef RAQUEL_UTF8_H
#define RAQUEL_UTF8_H

namespace Raquel {
    /// Stateless helpers for NUL-terminated, UTF-8 encoded C strings.
    ///
    /// Sequence lengths are deduced from the lead byte only; continuation
    /// bytes are not validated.
    class UTF8 {
    public:
        /// @brief Counts the bytes in @p buf before the terminating NUL.
        /// @param buf NUL-terminated string.
        /// @return Length in bytes, excluding the terminator.
        static int getLength(const char *buf);

        /// @brief Counts the characters (multi-byte sequences) in @p buf.
        /// @param buf NUL-terminated string.
        /// @return Number of complete characters. A trailing sequence that
        ///         is cut short by the terminating NUL is not counted.
        static int getULength(const char *buf);

        /// @brief Counts the characters in @p buf and reports where the
        ///        complete part of the string ends.
        /// @param buf NUL-terminated string.
        /// @param pos Output. Byte offset of the incomplete trailing
        ///            sequence, or the byte length of @p buf when the
        ///            string ends on a character boundary.
        /// @return Number of complete characters.
        static int getULength(const char *buf, int &pos);

        /// @brief Copies @p buf to @p out, leaving out every character that
        ///        is listed in @p del.
        /// @param buf NUL-terminated input string.
        /// @param out Output buffer for the kept characters; always
        ///            NUL-terminated. The caller provides the storage.
        /// @param rem Output buffer that receives an incomplete trailing
        ///            sequence of @p buf (NUL-terminated). It is written only
        ///            when such a sequence exists and was not deleted.
        /// @param del NUL-terminated list of characters to delete.
        /// @return Number of characters removed.
        static int delByChar(const char *buf, char *out, char *rem, const char *del);
    };
}

#endif // RAQUEL_UTF8_H

2.UTF-8.cpp

#include <iostream>
#include <cstring>
#include "UTF-8.h"

using namespace std;

// Returns the number of bytes in buf that precede the terminating NUL.
int Raquel::UTF8::getLength(const char *buf)
{
    const char *end = buf;

    // Walk forward until the terminator is reached.
    while(*end != 0x00)
    {
        ++end;
    }

    return static_cast<int>(end - buf);
}

int Raquel::UTF8::getULength(const char *buf)
{
    char firstByte;
    int offset = 1;
    int i = 0;
    int j = 0;
    int k = 0;
    int l = 0;

    while(buf[i])
    {
        firstByte = buf[i];

        offset = 1;
 
        if(firstByte & 128)
        {
            if(firstByte & 32)
            {
                if(firstByte & 16)
                {
                    offset = 4;

                    if(buf[i+1] == 0x00)
                    {
                        k = 3;
                        l = 1;
                    }
                    else if(buf[i+2] == 0x00)
                    {
                        k = 2;
                        l = 1;
                    }
                    else if(buf[i+3] == 0x00)
                    {
                        k = 1;
                        l = 1;
                    }
                }
                else
                {
                    offset = 3;

                    if(buf[i+1] == 0x00)
                    {
                        k = 2;
                        l = 1;
                    }
                    else if(buf[i+2] == 0x00)
                    {
                        k = 1;
                        l = 1;
                    }
                }
            }
            else
            {
                offset = 2;

                if(buf[i+1] == 0x00)
                {
                    k = 1;
                    l = 1;
                }
            }
        }

        i += offset - k;
        j++;
    }

    return j-l;
}

// Counts the complete characters in buf and reports, through pos, the byte
// offset at which the complete part of the string ends: the start of an
// incomplete trailing sequence, or the byte length of buf when there is none.
//
// The length of each sequence is taken from its lead byte alone:
//   0xxxxxxx -> 1 byte, 1x0xxxxx -> 2 bytes, 1x10xxxx -> 3 bytes,
//   1x11xxxx -> 4 bytes. Continuation bytes are not validated.
int Raquel::UTF8::getULength(const char *buf, int &pos)
{
    int cursor = 0;    // byte index of the sequence being examined
    int complete = 0;  // sequences whose bytes are all present

    while(buf[cursor] != 0x00)
    {
        const unsigned char lead = static_cast<unsigned char>(buf[cursor]);

        // Number of bytes the lead byte announces.
        int expected = 1;

        if(lead & 0x80)
        {
            if(!(lead & 0x20))
            {
                expected = 2;
            }
            else if(!(lead & 0x10))
            {
                expected = 3;
            }
            else
            {
                expected = 4;
            }
        }

        // Number of bytes actually available before the terminating NUL.
        int present = 1;

        while(present < expected && buf[cursor + present] != 0x00)
        {
            ++present;
        }

        if(present < expected)
        {
            // The string ends inside this sequence: leave cursor at its
            // first byte and do not count it.
            break;
        }

        cursor += present;
        ++complete;
    }

    pos = cursor;

    return complete;
}

// Copies buf to out while leaving out every character that also appears in
// del, and returns how many characters were removed.
//
// Parameters:
//   buf - NUL-terminated input string.
//   out - receives the kept characters and is always NUL-terminated. It must
//         have room for the byte length of buf plus the terminator.
//   rem - receives an incomplete trailing sequence of buf (one cut short by
//         the terminating NUL), NUL-terminated. It is written ONLY when such
//         a sequence exists and did not match an entry of del, so callers
//         should clear it before the call. Needs room for 3 bytes + NUL.
//   del - NUL-terminated list of characters to delete.
//
// Sequence lengths are derived from the lead byte only (0xxxxxxx -> 1,
// 1x0xxxxx -> 2, 1x10xxxx -> 3, 1x11xxxx -> 4); continuation bytes are not
// validated.
//
// Whether a character matched is tracked with an explicit flag that is reset
// for every input character. Inferring it from a loop counter shared between
// the comparison and the copy loops dropped characters whenever del held no
// entry of the same byte length as the current character (for example
// buf = "ab", del = "你" produced "a").
int Raquel::UTF8::delByChar(const char *buf, char *out, char *rem, const char *del)
{
    // Measures the sequence starting at s. Returns the number of bytes that
    // are actually present and sets truncated when the terminating NUL cuts
    // the sequence short of the length announced by its lead byte.
    const auto measure = [](const char *s, bool &truncated) -> int
    {
        const unsigned char lead = static_cast<unsigned char>(s[0]);
        int expected = 1;

        if(lead & 0x80)
        {
            if(!(lead & 0x20))
            {
                expected = 2;
            }
            else if(!(lead & 0x10))
            {
                expected = 3;
            }
            else
            {
                expected = 4;
            }
        }

        int present = 1;

        while(present < expected && s[present] != 0x00)
        {
            ++present;
        }

        truncated = (present < expected);

        return present;
    };

    int written = 0;  // bytes stored in out so far
    int removed = 0;  // characters that matched an entry of del

    for(int i = 0; buf[i] != 0x00; )
    {
        bool bufTruncated = false;
        const int len = measure(buf + i, bufTruncated);

        // Look for the current character among the entries of del.
        bool matched = false;

        for(int p = 0; del[p] != 0x00 && !matched; )
        {
            bool delTruncated = false;
            const int delLen = measure(del + p, delTruncated);

            matched = (delLen == len) && (std::memcmp(del + p, buf + i, len) == 0);
            p += delLen;
        }

        if(matched)
        {
            ++removed;
        }
        else if(!bufTruncated)
        {
            std::memcpy(out + written, buf + i, len);
            written += len;
        }
        else
        {
            // Incomplete tail: hand it back separately instead of emitting
            // a broken sequence into out.
            std::memcpy(rem, buf + i, len);
            rem[len] = 0x00;
        }

        i += len;
    }

    out[written] = 0x00;

    return removed;
}

// Demo: prints the sample string, the number of characters delByChar removed
// from it, and the string that is left.
int main()
{
    char buf[64] = "你4μ\U00010102";
    char out[64] = {0};  // filled by delByChar
    char rem[5] = {0};   // would receive an incomplete trailing sequence

    std::cout << buf << std::endl;
    std::cout << Raquel::UTF8::delByChar(buf, out, rem, "啊\U00010102你") << std::endl;
    std::cout << out << std::endl;

    return 0;
}

3.编译源码

$ g++ -o UTF-8 UTF-8.cpp -std=c++11

4.运行及其结果

$ ./UTF-8
你4μ𐄂
2
4μ
上一篇 下一篇

猜你喜欢

热点阅读