文本文件的编码格式识别功能 (UTF-8, ANSI, UTF-16)
2013-01-29 本文已影响510人
剑舞春秋
代码:
/* Encoding verdicts reported by TellEncodeType. */
enum ENCODETYPE
{
    UNKNOW = 0, /* no decision reached; also the initial value (spelling kept for callers) */
    ANSI   = 1, /* bytes do not form valid UTF-8, so a legacy code page is assumed */
    UTF8   = 2, /* buffer contains well-formed UTF-8 multi-byte sequences */
    UTF16  = 3  /* buffer starts with a UTF-16 byte-order mark */
};
/*
 * Guess the text encoding of a raw byte buffer.
 *
 * pBuf   - bytes to inspect (typically the start of a file, or the whole file).
 * bufLen - number of valid bytes in pBuf.
 *
 * Returns:
 *   UTF16  - the buffer starts with a UTF-16 byte-order mark (FF FE or FE FF).
 *   UTF8   - every non-ASCII byte belongs to a well-formed UTF-8 sequence and
 *            at least one such multi-byte sequence is present.
 *   ANSI   - a byte >= 0x80 was found that is not part of a well-formed UTF-8
 *            sequence (including a sequence cut off by the end of the buffer).
 *   UNKNOW - the buffer is NULL/empty or contains only 7-bit ASCII, which is
 *            valid in both ANSI and UTF-8, so no decision can be made.
 *
 * Sequence lengths follow the classic UTF-8 table:
 *   00000000 - 0000007F  0xxxxxxx
 *   00000080 - 000007FF  110xxxxx 10xxxxxx
 *   00000800 - 0000FFFF  1110xxxx 10xxxxxx 10xxxxxx
 *   00010000 - 001FFFFF  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *   00200000 - 03FFFFFF  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *   04000000 - 7FFFFFFF  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * Only the lead-byte class and the 10xxxxxx shape of the continuation bytes
 * are checked; overlong forms and surrogate code points are not rejected.
 */
ENCODETYPE TellEncodeType(BYTE* pBuf,int bufLen)
{
    if (!pBuf || bufLen <= 0)
        return UNKNOW;

    /* A BOM needs two bytes; never read pBuf[1] of a one-byte buffer. */
    if (bufLen >= 2 &&
        ((pBuf[0] == 0xFF && pBuf[1] == 0xFE) ||
         (pBuf[0] == 0xFE && pBuf[1] == 0xFF)))
        return UTF16;

    int multiByteCount = 0; /* well-formed multi-byte UTF-8 sequences seen */
    int pos = 0;

    while (pos < bufLen)
    {
        const unsigned lead = pBuf[pos];
        int seqLen;

        if (lead < 0x80)
        {
            /* Plain ASCII carries no information about the encoding. */
            ++pos;
            continue;
        }

        if (lead < 0xC0)
            return ANSI;        /* continuation byte without a lead byte */
        else if (lead < 0xE0)
            seqLen = 2;
        else if (lead < 0xF0)
            seqLen = 3;
        else if (lead < 0xF8)
            seqLen = 4;
        else if (lead < 0xFC)
            seqLen = 5;
        else if (lead < 0xFE)
            seqLen = 6;
        else
            return ANSI;        /* 0xFE / 0xFF never occur in UTF-8 */

        /* The whole sequence must fit in the buffer before it is indexed. */
        if (seqLen > bufLen - pos)
            return ANSI;

        for (int k = 1; k < seqLen; ++k)
        {
            /* Continuation bytes must have the top two bits set to 10. */
            if ((pBuf[pos + k] & 0xC0) != 0x80)
                return ANSI;
        }

        ++multiByteCount;
        pos += seqLen;          /* resume after the sequence, not inside it */
    }

    return multiByteCount > 0 ? UTF8 : UNKNOW;
}
调用例子
/*
 * Usage example: command handler that lets the user pick a file, loads it
 * into memory and runs TellEncodeType on its contents.
 *
 * All four handler parameters are unused. Always returns 0, whether or not
 * a file was chosen or could be read.
 */
LRESULT OnOpenFile(WORD /*wNotifyCode*/, WORD /*wID*/, HWND /*hWndCtl*/, BOOL& /*bHandled*/)
{
    CFileDialog dlg(TRUE,NULL,NULL,OFN_FILEMUSTEXIST|OFN_DONTADDTORECENT,NULL,m_hWnd);
    if (dlg.DoModal() != IDOK)
        return 0;

    LPTSTR filepath = dlg.m_ofn.lpstrFile;
    ENCODETYPE filetype = UNKNOW;

    FILE *pFile = _tfopen(filepath, _T("rb"));
    if (pFile == NULL)
        return 0;

    /* Determine the file size; -1 marks "could not be determined". */
    long filesize = -1;
    if (fseek(pFile, 0, SEEK_END) == 0)
        filesize = ftell(pFile);

    /*
     * Skip empty files (nothing to classify) and files whose size does not
     * fit the int length parameter of TellEncodeType.
     */
    if (filesize > 0 && filesize <= 0x7FFFFFFFL && fseek(pFile, 0, SEEK_SET) == 0)
    {
        BYTE *pBuf = (BYTE*)malloc((size_t)filesize);
        if (pBuf != NULL)
        {
            /* Classify only the bytes that were actually read. */
            size_t bytesRead = fread(pBuf, 1, (size_t)filesize, pFile);
            if (bytesRead > 0)
                filetype = TellEncodeType(pBuf, (int)bytesRead);
            free(pBuf); /* the buffer is owned by this handler */
        }
    }
    fclose(pFile);

    /* filetype now holds the verdict; this example does not use it further. */
    (void)filetype;
    return 0;
}