考虑到判断效率,简要写了如下代码做判断即可达到基本效果:
/*
 * Classify the multi-byte sequence that starts at p[0] as UTF-8.
 *
 * p[0] must be a non-ASCII byte (>= 0x80) of a NUL-terminated string.
 *
 * @return the sequence length (2..4) when the bytes form a well-formed
 *         UTF-8 sequence, 0 when they do not, and -1 when the string
 *         ends before the sequence is complete but every byte present
 *         so far is acceptable.
 */
static int utf8_sequence_length(const unsigned char *p)
{
    const unsigned char lead = p[0];
    /* Allowed range for the second byte; some lead bytes narrow it to
     * exclude overlong encodings, surrogates and values above U+10FFFF. */
    unsigned char lo = 0x80;
    unsigned char hi = 0xBF;
    int len;

    if (lead >= 0xC2 && lead <= 0xDF) {
        len = 2;
    } else if (lead >= 0xE0 && lead <= 0xEF) {
        len = 3;
        if (lead == 0xE0) {
            lo = 0xA0;          /* reject overlong 3-byte forms */
        } else if (lead == 0xED) {
            hi = 0x9F;          /* reject UTF-16 surrogates */
        }
    } else if (lead >= 0xF0 && lead <= 0xF4) {
        len = 4;
        if (lead == 0xF0) {
            lo = 0x90;          /* reject overlong 4-byte forms */
        } else if (lead == 0xF4) {
            hi = 0x8F;          /* reject code points above U+10FFFF */
        }
    } else {
        /* 0x80..0xC1 and 0xF5..0xFF can never start a UTF-8 sequence. */
        return 0;
    }

    for (int i = 1; i < len; i++) {
        const unsigned char c = p[i];

        if (c == '\0') {
            return -1;          /* string ended inside the sequence */
        }
        if (c < lo || c > hi) {
            return 0;
        }
        /* Only the second byte has a restricted range. */
        lo = 0x80;
        hi = 0xBF;
    }
    return len;
}

/*
 * Get the character code type of a string (UTF-8 or GB18030).
 *
 * Every non-ASCII sequence in the string is checked, not just the first
 * one: the string is reported as UTF-8 only if all of its multi-byte
 * sequences are well-formed UTF-8 (2-, 3- and 4-byte forms). A final
 * sequence that is merely cut off by the end of the string is tolerated
 * as long as at least one complete UTF-8 sequence precedes it, so a
 * buffer truncated in the middle of a character is still recognised.
 *
 * @param s the NUL-terminated string to examine.
 * @return 1 for UTF-8, 0 for GB18030 (i.e. not valid UTF-8), and -1 when
 *         s is NULL or contains no non-ASCII byte, in which case the
 *         encoding cannot be determined.
 */
int get_character_code_type(const char* s)
{
    if (NULL == s) {
        return -1;
    }

    /* Work on unsigned bytes so comparisons do not depend on whether
     * plain char is signed. */
    const unsigned char *p = (const unsigned char *)s;
    int seen_multibyte = 0;

    while (*p != '\0') {
        if (*p < 0x80) {
            p++;                /* ASCII character. */
            continue;
        }

        const int len = utf8_sequence_length(p);
        if (len == 0) {
            return 0;           /* Not a UTF-8 code. */
        }
        if (len < 0) {
            /* Truncated tail: trust what was seen before it, if anything. */
            return seen_multibyte ? 1 : 0;
        }

        seen_multibyte = 1;
        p += len;
    }

    return seen_multibyte ? 1 : -1;
}
写一个测试例子来测试一下:
#include "char_code.h"
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>

/*
 * Test driver: read up to the first 1023 bytes of the file named on the
 * command line and print the encoding reported by
 * get_character_code_type() for that buffer.
 *
 * @return 0 on success, -1 on a usage error or when opening, reading or
 *         closing the file fails.
 */
int main(int argc, char* argv[])
{
    if (argc < 2) {
        printf("%s [file_path]\n", argv[0]);
        return -1;
    }

    // open file and read buf.
    int f = open(argv[1], O_RDONLY);
    if (-1 == f) {
        fprintf(stderr, "file %s open failed.\n", argv[1]);
        return -1;
    }

    char buf[1024] = {0};
    ssize_t n = read(f, buf, sizeof buf - 1);
    if (n < 0) {
        // A failed read must not be mistaken for an empty file.
        fprintf(stderr, "file %s read failed.\n", argv[1]);
        close(f);
        return -1;
    }
    buf[n] = '\0';

    int ret = get_character_code_type(buf);

    // -1 means the encoding could not be determined, so do not report it
    // as GB18030.
    const char *type_name = "unknown";
    if (ret == 1) {
        type_name = "UTF-8";
    } else if (ret == 0) {
        type_name = "GB18030";
    }
    fprintf(stdout, "char code type = %s\n", type_name);

    // close file.
    if (0 != close(f)) {
        fprintf(stderr, "file %s close failed.\n", argv[1]);
        return -1;
    }

    return 0;
}
编译:
gcc test.c -o test -O0 -g3 -Wall
运行结果:
$ ./test gb18030.txt
char code type = GB18030
$ ./test utf8.txt
char code type = UTF-8
下载:
http://download.csdn.net/detail/firstboy0513/4137551