判断txt文件是ANSI UTF-8 unicode编码c语言实现源码

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Print `len` bytes of `data` as two-digit uppercase hex values, each
 * followed by a space, then terminate the line. */
static void dump_hex(const unsigned char *data, size_t len)
{
    for (size_t i = 0; i < len; i++) {
        printf("%02X ", data[i]);
    }
    printf("\n");
}

/*
 * Guess the text encoding of the file named by argv[1].
 *
 * The whole file is read into memory, dumped as hex, classified, and dumped
 * again after any byte-order mark has been removed.  Classification looks at
 * the leading bytes only:
 *   - EF BB BF  -> "utf-8 bom"            (BOM stripped)
 *   - FF FE     -> little-endian UTF-16   (BOM stripped, byte pairs swapped
 *                                          so the buffer becomes big-endian)
 *   - FE FF     -> big-endian UTF-16      (BOM stripped)
 *   - first byte is ASCII, or the first 2/3 bytes form a well-shaped UTF-8
 *     sequence -> "utf-8 no bom"
 *   - first byte >= 0x81 and second byte >= 0x40 -> "ansi"
 * An empty file, or one matching none of the above, prints no classification.
 *
 * Sample inputs used by the original author:
 *   177.txt utf-8 bom, 277.txt ansi, 377.txt little unicode,
 *   477.txt big unicode, 577.txt utf-8 no bom
 *
 * Returns 0 on success, 1 on a usage, I/O or allocation error.
 */
int main(int argc, char **argv)
{
    if (argc < 2) {
        fprintf(stderr, "usage: %s <file>\n", argc > 0 ? argv[0] : "detect");
        return 1;
    }

    const char *fpath = argv[1];
    FILE *pFile = fopen(fpath, "rb");
    if (pFile == NULL) {
        perror(fpath);
        return 1;
    }

    /* Determine the file size by seeking to the end. */
    long fsize = -1;
    if (fseek(pFile, 0, SEEK_END) == 0) {
        fsize = ftell(pFile);
    }
    if (fsize < 0) {
        perror(fpath);
        fclose(pFile);
        return 1;
    }
    rewind(pFile);

    /* One extra zeroed byte keeps the buffer NUL-terminated so that it can be
     * handed to strlen() and "%s" below without reading out of bounds. */
    unsigned char *buffer = calloc((size_t)fsize + 1, 1);
    if (buffer == NULL) {
        perror("calloc");
        fclose(pFile);
        return 1;
    }

    /* Use the number of bytes actually read as the working length. */
    size_t lsize = fread(buffer, 1, (size_t)fsize, pFile);
    int read_failed = ferror(pFile);
    fclose(pFile);
    if (read_failed) {
        fprintf(stderr, "%s: read error\n", fpath);
        free(buffer);
        return 1;
    }

    printf("sizw=%zu   %zu\n %s\n", lsize, strlen((const char *)buffer),
           (const char *)buffer);
    dump_hex(buffer, lsize);

    if (lsize >= 3 && buffer[0] == 0xEF && buffer[1] == 0xBB && buffer[2] == 0xBF) {
        printf("utf-8 bom\n");
        /* Drop the 3-byte BOM; the length comes from the byte count, not
         * strlen(), because the payload may contain NUL bytes. */
        lsize -= 3;
        memmove(buffer, buffer + 3, lsize);
        buffer[lsize] = 0;
    } else if (lsize >= 2 && buffer[0] == 0xFF && buffer[1] == 0xFE) {
        printf("litter unicode\n");
        lsize -= 2;
        memmove(buffer, buffer + 2, lsize);
        buffer[lsize] = 0;
        /* Swap each complete byte pair (little-endian -> big-endian); a
         * trailing odd byte is left untouched. */
        for (size_t i = 0; i + 1 < lsize; i += 2) {
            unsigned char tmp = buffer[i];
            buffer[i] = buffer[i + 1];
            buffer[i + 1] = tmp;
        }
    } else if (lsize >= 2 && buffer[0] == 0xFE && buffer[1] == 0xFF) {
        printf("default big unicode\n");
        lsize -= 2;
        memmove(buffer, buffer + 2, lsize);
        buffer[lsize] = 0;
    } else if (lsize >= 1 &&
               ((buffer[0] & 0x80) == 0 ||
                (lsize >= 2 && (buffer[0] & 0xE0) == 0xC0 &&
                 (buffer[1] & 0xC0) == 0x80) ||
                (lsize >= 3 && (buffer[0] & 0xF0) == 0xE0 &&
                 (buffer[1] & 0xC0) == 0x80 && (buffer[2] & 0xC0) == 0x80))) {
        /* First character is ASCII or a well-shaped 2-/3-byte UTF-8 sequence. */
        printf("utf-8 no bom\n");
    } else if (lsize >= 2 && buffer[0] >= 0x81 && buffer[1] >= 0x40) {
        /* Lead/trail byte ranges of a double-byte "ANSI" code page. */
        printf("ansi\n");
    }

    dump_hex(buffer, lsize);

    free(buffer);
    return 0;
}

打开“记事本”程序 Notepad.exe,新建一个文本文件(内容为一个“严”字,其 Unicode 码点为 U+4E25),依次采用 ANSI、Unicode、Unicode big endian 和 UTF-8 编码方式保存。然后,用文本编辑软件 UltraEdit 中的“十六进制功能”,观察该文件的内部编码方式。

1)ANSI:GB2312编码,是采用大头方式存储的。

2)Unicode:编码是四个字节“FF FE 25 4E”,其中“FF FE”表明是小头方式存储,真正的编码是 4E25。

3)Unicode big endian:编码是四个字节“FE FF 4E 25”,其中“FE FF”表明是大头方式存储。

4)UTF-8:编码是六个字节“EF BB BF E4 B8 A5”,前三个字节“EF BB BF”表示这是 UTF-8 编码,后三个字节“E4 B8 A5”是码点 U+4E25 的 UTF-8 编码,它的存储顺序与编码顺序是一致的。

你可能感兴趣的:(判断txt文件是ANSI UTF-8 unicode编码c语言实现源码)