#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Print `len` bytes of `data` as space-separated two-digit hex values,
 * followed by a newline.
 * NOTE(review): the dump loops were truncated in the source as received;
 * a per-byte hex dump is the presumed intent -- confirm the exact format. */
static void dump_hex(const unsigned char *data, size_t len){
    for(size_t i=0;i<len;i++){
        printf("%02x ",data[i]);
    }
    printf("\n");
}

/* Remove the first `count` bytes of a NUL-terminated buffer holding `len`
 * payload bytes (count <= len) and return the new payload length. */
static size_t drop_prefix(unsigned char *data, size_t len, size_t count){
    /* +1 so the terminator stored at data[len] moves along with the payload. */
    memmove(data,data+count,len-count+1);
    return len-count;
}

/*
 * Guess the text encoding of the file named by argv[1] from its first bytes.
 *
 * The whole file is read into memory, printed as a string and as a hex dump,
 * then classified:
 *   EF BB BF  -> "utf-8 bom"           (BOM stripped)
 *   FF FE     -> "litter unicode"      (BOM stripped, byte pairs swapped)
 *   FE FF     -> "default big unicode" (BOM stripped)
 *   otherwise -> "utf-8 no bom" or "ansi", judged from the leading bytes only.
 * The (possibly modified) buffer is hex-dumped once more at the end.
 *
 * Sample inputs named by the original author:
 *   177.txt utf-8 bom, 277.txt ansi, 377.txt little unicode,
 *   477.txt big unicode, 577.txt utf-8 no bom
 *
 * Returns 0 on success, 1 if the argument is missing or the file cannot be
 * opened, sized, allocated for, or read.
 */
int main(int argc,char**argv){
    if(argc<2){
        fprintf(stderr,"usage: %s <file>\n",argc>0?argv[0]:"program");
        return 1;
    }
    const char *fpath=argv[1];
    FILE *pFile=fopen(fpath,"rb");
    if(pFile==NULL){
        perror(fpath);
        return 1;
    }

    /* Determine the file size by seeking to the end. */
    long fsize=-1;
    if(fseek(pFile,0,SEEK_END)==0){
        fsize=ftell(pFile);
    }
    if(fsize<0){
        perror(fpath);
        fclose(pFile);
        return 1;
    }
    rewind(pFile);

    /* One extra zeroed byte keeps the buffer a valid C string for the
     * strlen/%s uses below, and makes an empty file safe to inspect. */
    unsigned char *buffer=calloc((size_t)fsize+1,1);
    if(buffer==NULL){
        perror("calloc");
        fclose(pFile);
        return 1;
    }

    /* Work with the number of bytes actually read, not the size estimate. */
    size_t lsize=fread(buffer,1,(size_t)fsize,pFile);
    int read_failed=ferror(pFile);
    fclose(pFile);
    if(read_failed){
        fprintf(stderr,"%s: read error\n",fpath);
        free(buffer);
        return 1;
    }

    /* %zu matches size_t; the original passed a size_t to %d. */
    printf("sizw=%zu %zu\n %s\n",lsize,strlen((const char*)buffer),(const char*)buffer);
    dump_hex(buffer,lsize);

    if(lsize>2 && buffer[0]==0xEF && buffer[1]==0xBB && buffer[2]==0xBF){
        printf("utf-8 bom\n");
        lsize=drop_prefix(buffer,lsize,3);
    }else if(lsize>1 && buffer[0]==0xFF && buffer[1]==0xFE){
        printf("litter unicode\n");
        lsize=drop_prefix(buffer,lsize,2);
        /* Swap each byte pair so the data ends up in big-endian order.
         * i+1<lsize leaves a trailing odd byte untouched instead of
         * swapping it with the terminator. */
        for(size_t i=0;i+1<lsize;i+=2){
            unsigned char tmp=buffer[i];
            buffer[i]=buffer[i+1];
            buffer[i+1]=tmp;
        }
    }else if(lsize>1 && buffer[0]==0xFE && buffer[1]==0xFF){
        printf("default big unicode\n");
        lsize=drop_prefix(buffer,lsize,2);
    }else if(lsize==0
            || (buffer[0]&0x80)==0
            || (lsize>1 && (buffer[0]&0xE0)==0xC0 && (buffer[1]&0xC0)==0x80)
            || (lsize>2 && (buffer[0]&0xF0)==0xE0 && (buffer[1]&0xC0)==0x80 && (buffer[2]&0xC0)==0x80)){
        /* First character is ASCII or a well-formed 2- or 3-byte UTF-8
         * sequence. An empty file is reported here too, as before. */
        printf("utf-8 no bom\n");
    }else if(lsize>1 && buffer[0]>=0x81 && buffer[1]>=0x40){
        /* NOTE(review): presumably a double-byte (GBK-style) lead/trail
         * pair -- confirm the intended code page. */
        printf("ansi\n");
    }

    dump_hex(buffer,lsize);
    free(buffer);
    return 0;
}
打开“记事本”程序Notepad.exe,新建一个文本文件,内容为一个“严”字(Unicode码点为U+4E25),依次采用ANSI、Unicode、Unicode big endian和UTF-8编码方式保存。然后,用文本编辑软件UltraEdit中的“十六进制功能”,观察该文件的内部编码方式。
1)ANSI:即GB2312编码,采用大头方式存储。
2)Unicode:编码是四个字节“FF FE 25 4E”,其中“FF FE”表明是小头方式存储,真正的编码是4E25。
3)Unicode big endian:编码是四个字节“FE FF 4E 25”,其中“FE FF”表明是大头方式存储。
4)UTF-8:编码是六个字节“EF BB BF E4 B8 A5”,前三个字节“EF BB BF”表示这是UTF-8编码,后三个字节“E4 B8 A5”是字符本身(U+4E25)的UTF-8编码,它的存储顺序与编码顺序是一致的。