uft-8编码识别

关于下面的char *参数是必须为unsigned char* 
static const char trailingBytesForUTF8[256] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
static int isLegalUTF8( const unsigned char *source, int length ) {
    unsigned char a;
    const unsigned char *srcptr = source + length;
    switch ( length ) {
    default:
        return 0;
        /* Everything else falls through when "true"... */
    case 4:
        if ( ( a = ( *--srcptr ) ) < 0x80 || a > 0xBF ) return 0;
    case 3:
        if ( ( a = ( *--srcptr ) ) < 0x80 || a > 0xBF ) return 0;
    case 2:
        if ( ( a = ( *--srcptr ) ) > 0xBF ) return 0;
        switch ( *source ) {
            /* no fall-through in this inner switch */
        case 0xE0:
            if ( a < 0xA0 ) return 0;
            break;
        case 0xF0:
            if ( a < 0x90 ) return 0;
            break;
        case 0xF4:
            if ( a > 0x8F ) return 0;
            break;
        default:
            if ( a < 0x80 ) return 0;
        }
    case 1:
        if ( *source >= 0x80 && *source < 0xC2 ) return 0;
        if ( *source > 0xF4 ) return 0;
    }
    return 1;
}

static int bson_validate_string(  const unsigned char *string,
                                 const int length ) {

    int position = 0;
    int sequence_length = 1;
    while ( position < length ) {
     sequence_length = trailingBytesForUTF8[*( string + position )] + 1;
            if ( ( position + sequence_length ) > length ) {
                
                return ERROR;
            }
            if ( !isLegalUTF8( string + position, sequence_length ) ) {
                
                return ERROR;
            }
            position += sequence_length;
           return OK;
}

你可能感兴趣的:(uft-8编码识别)