C++ 实现删除 UTF-8 字符串中的多个字符
2021-11-14 本文已影响0人
一路向后
1.UTF-8.h
// Include guard. The name deliberately avoids a leading underscore followed
// by an uppercase letter, because such identifiers are reserved for the
// implementation.
#ifndef RAQUEL_UTF8_H_
#define RAQUEL_UTF8_H_
namespace Raquel {
/// Static helpers for NUL-terminated, UTF-8 encoded C strings.
/// The class has no state; every member is a static function.
class UTF8 {
public:
    /// Returns the number of bytes in `buf` before the terminating NUL.
    static int getLength(const char *buf);

    /// Returns the number of complete UTF-8 characters in `buf`.
    /// A multi-byte sequence cut short by the terminating NUL is not counted.
    static int getULength(const char *buf);

    /// Same count as getULength(buf); additionally stores in `pos` the byte
    /// offset at which scanning stopped (the terminating NUL, or the start of
    /// a truncated trailing sequence).
    static int getULength(const char *buf, int &pos);

    /// Copies `buf` to `out` while dropping every character that also occurs
    /// in `del`; a truncated trailing sequence of `buf` that is kept goes to
    /// `rem` instead of `out`. Returns the number of characters dropped.
    static int delByChar(const char *buf, char *out, char *rem, const char *del);
};
} // namespace Raquel
#endif // RAQUEL_UTF8_H_
2.UTF-8.cpp
#include <iostream>
#include <cstring>
#include "UTF-8.h"
using namespace std;
/// Returns the length of `buf` in bytes, not counting the terminating NUL.
///
/// @param buf NUL-terminated byte string; a null pointer is treated as an
///            empty string.
/// @return Byte count converted to int (lengths above INT_MAX cannot be
///         represented by the return type).
int Raquel::UTF8::getLength(const char *buf)
{
    if (buf == nullptr)
    {
        return 0;
    }
    // The standard routine replaces the hand-written byte-counting loop.
    return static_cast<int>(std::strlen(buf));
}
/// Counts the complete UTF-8 characters in the NUL-terminated string `buf`.
///
/// The length of each character is taken from the high bits of its first
/// byte (bit 0x80 clear: 1 byte; otherwise bits 0x20 and 0x10 select 2, 3 or
/// 4 bytes). If the terminating NUL appears before a multi-byte sequence is
/// complete, that trailing sequence is not counted.
///
/// @param buf NUL-terminated byte string to scan.
/// @return Number of complete characters found.
int Raquel::UTF8::getULength(const char *buf)
{
    int bytePos = 0;
    int complete = 0;
    for (;;)
    {
        const unsigned char lead = static_cast<unsigned char>(buf[bytePos]);
        if (lead == 0x00)
        {
            break;
        }

        // Length announced by the lead byte.
        int declared = 1;
        if (lead & 0x80)
        {
            declared = (lead & 0x20) ? ((lead & 0x10) ? 4 : 3) : 2;
        }

        // Bytes really available; stop at the first NUL so that nothing past
        // the terminator is ever read.
        int present = 1;
        while (present < declared && buf[bytePos + present] != 0x00)
        {
            ++present;
        }
        if (present < declared)
        {
            // Truncated tail: the string ends here and the fragment is skipped.
            break;
        }

        bytePos += declared;
        ++complete;
    }
    return complete;
}
/// Counts the complete UTF-8 characters in `buf` and reports where the scan
/// stopped.
///
/// Character lengths come from the first byte of each sequence (bit 0x80
/// clear: 1 byte; otherwise bits 0x20 and 0x10 select 2, 3 or 4 bytes).
///
/// @param buf NUL-terminated byte string to scan.
/// @param pos Receives the byte offset just past the last complete character:
///            the offset of the terminating NUL, or of the first byte of a
///            trailing sequence that the NUL cut short.
/// @return Number of complete characters found (a truncated tail is excluded).
int Raquel::UTF8::getULength(const char *buf, int &pos)
{
    const char *cursor = buf;
    int count = 0;
    while (*cursor)
    {
        const int lead = static_cast<unsigned char>(*cursor);

        int width;
        if (!(lead & 0x80))
        {
            width = 1;
        }
        else if (!(lead & 0x20))
        {
            width = 2;
        }
        else if (!(lead & 0x10))
        {
            width = 3;
        }
        else
        {
            width = 4;
        }

        // Walk the continuation bytes, never stepping over the terminator.
        int have = 1;
        while (have < width && cursor[have])
        {
            ++have;
        }
        if (have != width)
        {
            // Incomplete final sequence: leave the cursor on its first byte.
            break;
        }

        cursor += width;
        ++count;
    }
    pos = static_cast<int>(cursor - buf);
    return count;
}
/// Copies `buf` to `out`, dropping every UTF-8 character that also occurs in
/// `del`.
///
/// Characters are compared byte-wise; two characters match when they occupy
/// the same number of bytes and all of those bytes are equal.
///
/// @param buf NUL-terminated input text.
/// @param out Receives the kept complete characters and is always
///            NUL-terminated. Needs room for strlen(buf) + 1 bytes. It may be
///            the same buffer as `buf` (copies are overlap-safe and never
///            write ahead of the read position).
/// @param rem Receives, NUL-terminated, a kept trailing sequence of `buf` that
///            the terminating NUL cut short (at most 3 bytes + NUL). It is
///            left untouched when there is no such fragment, so callers should
///            pre-initialise it. May be null, in which case the fragment is
///            discarded.
/// @param del NUL-terminated set of characters to remove; null means "remove
///            nothing".
/// @return Number of characters removed from `buf`.
int Raquel::UTF8::delByChar(const char *buf, char *out, char *rem, const char *del)
{
    // Returns how many bytes of the sequence starting at `s` are present
    // before a NUL; `declared` receives the length announced by the lead byte
    // (bit 0x80 clear: 1; otherwise bits 0x20 / 0x10 select 2, 3 or 4).
    const auto sequenceBytes = [](const char *s, int &declared) -> int {
        const unsigned char lead = static_cast<unsigned char>(*s);
        declared = 1;
        if (lead & 0x80)
        {
            declared = (lead & 0x20) ? ((lead & 0x10) ? 4 : 3) : 2;
        }
        int present = 1;
        while (present < declared && s[present] != 0x00)
        {
            ++present;
        }
        return present;
    };

    int removed = 0;
    int outLen = 0;
    int i = 0;
    while (buf[i] != 0x00)
    {
        int declared = 0;
        const int width = sequenceBytes(buf + i, declared);
        const bool truncated = width < declared;

        // An explicit flag records whether this character was found in `del`.
        // The previous code inferred it from a loop counter that kept its
        // value from earlier iterations, so a character could be dropped even
        // though `del` held no character of the same byte length.
        bool matched = false;
        if (del != nullptr)
        {
            int p = 0;
            while (del[p] != 0x00 && !matched)
            {
                int delDeclared = 0;
                const int delWidth = sequenceBytes(del + p, delDeclared);
                matched = (delWidth == width) &&
                          std::memcmp(del + p, buf + i, static_cast<std::size_t>(width)) == 0;
                p += delWidth;
            }
        }

        if (matched)
        {
            ++removed;
        }
        else if (!truncated)
        {
            // memmove rather than memcpy: `out` is allowed to alias `buf`.
            std::memmove(out + outLen, buf + i, static_cast<std::size_t>(width));
            outLen += width;
        }
        else if (rem != nullptr)
        {
            // Incomplete trailing sequence: hand it back separately.
            std::memmove(rem, buf + i, static_cast<std::size_t>(width));
            rem[width] = 0x00;
        }

        // A truncated sequence advances exactly onto the terminating NUL.
        i += width;
    }
    out[outLen] = 0x00;
    return removed;
}
/// Demo driver: prints the sample text, the number of characters removed by
/// Raquel::UTF8::delByChar, and the filtered text.
int main()
{
    // With a UTF-8 execution character set the sample holds a 3-byte, a
    // 1-byte, a 2-byte and a 4-byte character.
    char buf[64] = "你4μ\U00010102";
    // Output buffers start empty; delByChar writes `out` from its first byte.
    char out[64] = {0};
    // Room for an incomplete trailing sequence (at most 3 bytes) plus NUL.
    char rem[5] = {0};

    std::cout << buf << '\n';
    std::cout << Raquel::UTF8::delByChar(buf, out, rem, "啊\U00010102你") << '\n';
    std::cout << out << '\n';
    return 0;
}
3.编译源码
$ g++ -o UTF-8 UTF-8.cpp -std=c++11
4.运行及其结果
$ ./UTF-8
你4μ𐄂
2
4μ