eSNACC对ASN.1内置字符串的编码和解码

eSNACC运行时库直接支持各种各样的ASN.1字符串定义。这包括PrintableString, BMPString, TeletexString, NumericString, IA5String, UniversalString, UTF8String, VisibleString。其做法也大同小异，所有这些字符串都是在eSNACC的字节串基础上typedef过来的，例如：

typedef AsnOcts PrintableString; /* alias of the octet-string type: [UNIVERSAL 19] IMPLICIT OCTET STRING */

只是根据各自特性的不同而在编码解码时有某些判断，或者加了某些判断函数等。只有UTF8String相对比较复杂一些。让我们一起来分析一下吧。

/******************************PrintableString************************************************/

PrintableString也就是由可打印字符组成的字符串，不过允许的字符只是一个受限的集合：大小写字母、数字、空格以及少数几个标点符号。其判断函数在串中的字符全部属于这个集合时返回0，否则返回-1。

/*
 * Verifies that every octet of a PrintableString belongs to the permitted
 * set: A-Z, a-z, 0-9, space and the punctuation ' ( ) + , - . / : = ?
 *
 * checkBuf  string to validate; its octs / octetLen members are read
 * returns   0 when every octet is permitted (an empty string passes),
 *           -1 when checkBuf is NULL or any octet is outside the set
 */
static int chkPrintableString(PrintableString * checkBuf)
{
    unsigned int pos;

    if (checkBuf == NULL)
        return -1;

    for (pos = 0; pos < checkBuf->octetLen; pos++)
    {
        char ch = checkBuf->octs[pos];

        /* Letters and digits are always acceptable */
        if ((ch >= 'A') && (ch <= 'Z'))
            continue;
        if ((ch >= 'a') && (ch <= 'z'))
            continue;
        if ((ch >= '0') && (ch <= '9'))
            continue;

        /* Anything else must be one of the listed punctuation characters */
        switch (ch)
        {
        case ' ':   /* space */
        case '\'':  /* apostrophe */
        case '(':   /* left parenthesis */
        case ')':   /* right parenthesis */
        case '+':   /* plus sign */
        case ',':   /* comma */
        case '-':   /* hyphen */
        case '.':   /* full stop (period) */
        case '/':   /* solidus */
        case ':':   /* colon */
        case '=':   /* equal sign */
        case '?':   /* question mark */
            break;

        default:
            return -1;
        }
    }

    return 0;
} /* end of chkPrintableString() */

然后在编码和解码时调用本函数检查串的内容是否满足要求，不满足就报错。

/******************************************************************************************/

/************************************BMPString**********************************************/

BMPString也就是UNICODE_STRING，也就是双字节字符。所以要求其内部字节串的长度必须是2的倍数，不然编码和解码时就会报错。

/*
 * Encodes the content octets of a BMPString into buffer b.
 *
 * When octetLen is not a multiple of 2 the write-error flag is raised on b
 * through BufSetWriteError; the octets are still handed to
 * BEncAsnOctsContent afterwards.
 *
 * returns  whatever BEncAsnOctsContent returns
 */
AsnLen BEncBMPStringContent(GenBuf * b, BMPString * octs)
{
    int oddLength = ((octs->octetLen % 2) != 0);

    if (oddLength)
    {
        BufSetWriteError (b, TRUE);
    }

    return BEncAsnOctsContent(b, octs);
} /* end of BEncBMPStringContent() */

/*
 * Decodes the content octets of a BMPString by delegating to
 * BDecAsnOctsContent, then checks that the decoded length is a multiple
 * of 2.
 *
 * On an odd length the error is reported through Asn1Error and control
 * leaves via longjmp(env, -40); this function then does not return normally.
 */
void BDecBMPStringContent(GenBuf * b, AsnTag tagId, AsnLen len,
                          BMPString * result, AsnLen * bytesDecoded,
                          ENV_TYPE env)
{
    BDecAsnOctsContent(b, tagId, len, result, bytesDecoded, env);

    /* An even number of octets is the only acceptable outcome */
    if ((result->octetLen % 2) == 0)
        return;

    Asn1Error ("BDecBMPStringContent: ERROR - Invalid BMPString Format");
    longjmp (env, -40);
}

/*****************************************************************************************/

/******************************TeletexString*************************************************/

TeletexString就是8位字节串，所以与AsnOcts是一样的，只是标签不同，这也就体现在编码和解码时对标签的处理不同。

/******************************************************************************************/

/******************************NumericString*************************************************/

NumericString就是要求由数字组成的串，不过可以有空格。判断函数如下：

/*
 * Verifies that a NumericString holds only the digits 0-9 and spaces.
 *
 * checkBuf  string to validate; its octs / octetLen members are read
 * returns   0 when every octet is a digit or a space (an empty string
 *           passes), -1 when checkBuf is NULL or any other octet is found
 */
static int chkNumericString(NumericString * checkBuf)
{
    unsigned int idx;

    if (checkBuf == NULL)
        return -1;

    for (idx = 0; idx < checkBuf->octetLen; idx++)
    {
        int isSpace = (checkBuf->octs[idx] == ' ');
        int isDigit = (checkBuf->octs[idx] >= '0') &&
                      (checkBuf->octs[idx] <= '9');

        if (!isSpace && !isDigit)
            return -1;
    }

    return 0;
} /* end of chkNumericString() */

/******************************************************************************************/

/******************************IA5String****************************************************/

在微软的网站上，对IA5的解释是：The International Alphabet number 5 (IA5) is generally equivalent to the ASCII alphabet, but different versions can include accents or other characters specific to a regional language. eSNACC给的判断函数是：

/*
 * Verifies that an IA5String holds only 7-bit values (0x00-0x7F).
 *
 * octs     string to validate; its octs / octetLen members are read
 * returns  0 when no octet exceeds 0x7F (an empty string passes),
 *          -1 when octs is NULL or an octet is above 0x7F
 */
static int checkIA5String(IA5String * octs)
{
    unsigned int idx = 0;

    if (octs == NULL)
        return -1;

    while (idx < octs->octetLen)
    {
        unsigned char octet = (unsigned char)octs->octs[idx];

        if (octet > 0x7F)
            return -1;

        idx++;
    }

    return 0;
}

/******************************************************************************************/

/******************************UniversalString************************************************/

UniversalString要求是用4个字节表示字符的串。所以在编码和解码时要求字节串的长度为4的整数倍：

/*
 * Encodes the content octets of a UniversalString into buffer b.
 *
 * When octetLen is not a multiple of 4 the problem is reported through
 * Asn1Error and the write-error flag is raised on b through
 * GenBufSetWriteError; the octets are still handed to BEncAsnOctsContent
 * afterwards.
 *
 * returns  whatever BEncAsnOctsContent returns
 */
AsnLen BEncUniversalStringContent(GenBuf * b, UniversalString * octs)
{
    int badLength = ((octs->octetLen % 4) != 0);

    if (badLength)
    {
        Asn1Error ("BEncUniversalStringContent: ERROR - Invalid UniversalString Format");
        GenBufSetWriteError (b, TRUE);
    }

    return BEncAsnOctsContent(b, octs);
} /* end of BEncUniversalStringContent() */

/*
 * Decodes the content octets of a UniversalString by delegating to
 * BDecAsnOctsContent, then checks that the decoded length is a multiple
 * of 4.
 *
 * On any other length the error is reported through Asn1Error and control
 * leaves via longjmp(env, -40); this function then does not return normally.
 */
void BDecUniversalStringContent(GenBuf * b, AsnTag tagId, AsnLen len,
                                UniversalString * result,
                                AsnLen * bytesDecoded, ENV_TYPE env)
{
    BDecAsnOctsContent (b, tagId, len, result, bytesDecoded, env);

    /* A whole number of 4-octet characters is the only acceptable outcome */
    if ((result->octetLen % 4) == 0)
        return;

    Asn1Error ("BDecUniversalStringContent: ERROR - Invalid UniversalString Format");
    longjmp (env, -40);
} /* end of BDecUniversalStringContent() */

/******************************************************************************************/

/******************************VisibleString**************************************************/

VisibleString（又叫ISO646String）按照X.680的定义只包含可见的ASCII图形字符和空格，也就是0x20到0x7E这个范围。下面直接给出eSNACC对串内容的判断函数：

/*
 * Verifies that a VisibleString holds only visible ASCII characters.
 *
 * VisibleString (ISO646String) is limited to the graphic characters of
 * ISO 646 plus SPACE, i.e. the octets 0x20 through 0x7E.  Control
 * characters, DEL (0x7F) and every octet with the high bit set are rejected.
 *
 * checkBuf  string to validate; its octs / octetLen members are read
 * returns   0 when every octet lies in 0x20..0x7E (an empty string passes),
 *           -1 when checkBuf is NULL or any octet is outside that range
 */
static int chkVisibleString(VisibleString * checkBuf)
{
    unsigned int i;

    if (checkBuf == NULL)
        return -1;

    for (i = 0; i < checkBuf->octetLen; i++)
    {
        /* Go through unsigned char so the comparison gives the same answer
           whether plain char is signed or unsigned on this platform */
        unsigned char ch = (unsigned char)checkBuf->octs[i];

        if ((ch < 0x20) || (ch > 0x7E))
            return -1;
    }

    return 0;
} /* end of chkVisibleString() */

/******************************************************************************************/

/******************************UTF8String***************************************************/

压轴好戏在后头，所以最后让我们来分析一个最有价值含量的！

UTF-8是UNICODE的一种变长字符编码，又称万国码，由Ken Thompson于1992年创建，现在已经标准化为RFC 3629。UTF-8最初的定义（RFC 2279）用1到6个字节编码一个UNICODE字符，RFC 3629把上限收紧到了4个字节；eSNACC沿用的是1到6个字节的方案。eSNACC用字节串来表示UTF8String，但是有一个判断这个字节串是否为有效UTF8String的函数，并且还定义了UTF8String和wchar类型相互转换的函数，或许从这些函数中，我们能学习eSNACC是怎么处理UTF-8编码的。

我们先看一些定义和utf8的判断函数：

/*
 * One row of the UTF-8 lead-octet table.
 *
 * A first octet b starts the encoding form described by a row when
 * (b & mask) == value.
 */
typedef struct
{
    unsigned char mask;          /* bits of the first octet holding the length marker */
    unsigned char value;         /* required value of those marker bits */
    unsigned long maxCharValue;  /* largest character value this form can carry */
} MaskValue;

/* Global Values */
#define MAX_UTF8_OCTS_PER_CHAR        6

/*
 * Lead-octet patterns indexed by the number of continuation octets that
 * follow the first octet (index 0 = one-byte form ... index 5 = six-byte
 * form).  maxCharValue is (2^n - 1), n being the payload bits of the form:
 * the free bits of the first octet plus 6 bits per continuation octet.
 */
const MaskValue gUTF8Masks[MAX_UTF8_OCTS_PER_CHAR] =
{
    { 0x80, 0x00, 0x0000007F }, /* one-byte encoding,   first octet 0XXX XXXX,  7 bits */
    { 0xE0, 0xC0, 0x000007FF }, /* two-byte encoding,   first octet 110X XXXX, 11 bits */
    { 0xF0, 0xE0, 0x0000FFFF }, /* three-byte encoding, first octet 1110 XXXX, 16 bits */
    { 0xF8, 0xF0, 0x001FFFFF }, /* four-byte encoding,  first octet 1111 0XXX, 21 bits */
    { 0xFC, 0xF8, 0x03FFFFFF }, /* five-byte encoding,  first octet 1111 10XX, 26 bits */
    { 0xFE, 0xFC, 0x7FFFFFFF }  /* six-byte encoding,   first octet 1111 110X, 31 bits */
};

/*
 * Checks that a UTF8String is structurally well formed: every character
 * starts with a first octet matching one of the gUTF8Masks patterns and is
 * followed, inside the string, by the matching number of continuation
 * octets of the form 10XX XXXX.
 *
 * octs     string to validate; its octs / octetLen members are read
 * returns  true when the whole string passes (an empty string passes),
 *          false when octs is NULL, a first octet matches no pattern, a
 *          character runs past the end, or a continuation octet is malformed
 */
static bool IsValidUTF8String(UTF8String * octs)
{
    unsigned long pos;
    unsigned int trailing;

    if (octs == NULL)
        return false;

    for (pos = 0; pos < octs->octetLen; )
    {
        /* The index of the matching table row equals the number of
           continuation octets that must follow the first octet */
        trailing = 0;
        while ((trailing < MAX_UTF8_OCTS_PER_CHAR) &&
               ((gUTF8Masks[trailing].mask & octs->octs[pos]) !=
                gUTF8Masks[trailing].value))
        {
            trailing++;
        }

        /* No row matched: invalid first octet */
        if (trailing == MAX_UTF8_OCTS_PER_CHAR)
            return false;

        /* The last continuation octet would lie beyond the string */
        if ((pos + trailing) >= octs->octetLen)
            return false;

        /* Step past the first octet, then examine each continuation octet */
        pos++;
        while (trailing > 0)
        {
            if ((octs->octs[pos] & 0xC0) != 0x80)
                return false;

            pos++;
            trailing--;
        }
    }

    return true;
}

首先通过掩码来判断第一个字节，确定当前这个字符是用几个字节来表示的。可能的取值是1-6，如果哪一种都不匹配就会报错。如果是一个字节，那么当前这个字节就是这个字符了，也就是只要字符值不大于0x7F，就只需要一个字节：这对应了MaskValue中的maxCharValue。其余的情况类似。而函数末尾那个for说明了utf8的格式：如果一个字符用了多于1个字节来表示，那么除了第一个用于表示长度的字节以外，后面表示值的字节必须是10XX XXXX的样式。

那么一个wchar字符是如何用utf8来表示的呢？我们看看wchar -> utf8的函数：

/*
 * Converts a NUL-terminated wide-character string into a newly allocated,
 * NUL-terminated UTF-8 string.
 *
 * inStr    wide string to convert
 * utf8Str  receives the calloc'ed result; the caller must free() it.
 *          Set to NULL on every failure after the parameter check.
 *
 * returns   0  success
 *          -1  inStr or utf8Str is NULL
 *          -2  the result buffer could not be allocated (including a
 *              required size that does not fit in size_t)
 *          -3  a wide character is negative or larger than the biggest
 *              value in gUTF8Masks
 */
int CvtWchar2UTF8(wchar_t * inStr, char ** utf8Str)
{
    size_t wLen;
    size_t i, x;                /* i: input index, x: output index */
    size_t maxOctsPerChar;
    unsigned int j, y;

    /* Check parameters */
    if ((inStr == NULL) || (utf8Str == NULL))
        return -1;

    wLen = wcslen(inStr);

    /* Worst case: a value of at most 16 bits needs up to 3 octets, anything
       wider up to MAX_UTF8_OCTS_PER_CHAR octets */
    maxOctsPerChar = (sizeof(wchar_t) <= 2) ? 3 : MAX_UTF8_OCTS_PER_CHAR;

    /* Refuse lengths whose worst-case buffer size would wrap around */
    if (wLen > (((size_t)-1) - 1) / maxOctsPerChar)
    {
        *utf8Str = NULL;
        return -2;
    }

    /* Allocate and clear memory for a worst case UTF-8 string; the zero
       fill also provides the terminating NUL */
    *utf8Str = (char*)calloc(wLen * maxOctsPerChar + 1, sizeof(char));
    if (*utf8Str == NULL)
        return -2;

    /* Convert each wide character into a UTF-8 char sequence */
    for (i = 0, x = 0; i < wLen; i++)
    {
        wchar_t wc = inStr[i];
        unsigned long cp;

        /* Return an error if the wide character is invalid */
        if (wc < 0)
        {
            free(*utf8Str);
            *utf8Str = NULL;
            return -3;
        }
        cp = (unsigned long)wc;

        /* Find the shortest form able to hold the value; j ends up as the
           number of continuation octets */
        for (j = 0; (j < MAX_UTF8_OCTS_PER_CHAR) &&
            (cp > gUTF8Masks[j].maxCharValue); j++)
            ;

        /* Return an error if the value is too large for every form */
        if (j == MAX_UTF8_OCTS_PER_CHAR)
        {
            free(*utf8Str);
            *utf8Str = NULL;
            return -3;
        }

        /* Fill the continuation octets from right to left, six low-order
           bits of the value at a time */
        for (y = j; y > 0; y--)
        {
            (*utf8Str)[x + y] = (char)(0x80 | (cp & 0x3F));
            cp >>= 6;
        }

        /* First octet: length marker plus the remaining high-order bits */
        (*utf8Str)[x] = (char)(gUTF8Masks[j].value |
            ((unsigned char)~gUTF8Masks[j].mask & cp));

        /* Advance past the first octet and its continuation octets */
        x += (size_t)j + 1;
    }

    return 0;
} /* end of CvtWchar2UTF8() */

本函数首先为存储utf8的串分配足够多的内存：“如果UNICODE字符由2个字节表示，则编码成UTF-8很可能需要3个字节，而如果UNICODE字符由4个字节表示，则编码成UTF-8可能需要6个字节。”也就是说，此处总是按每个字符3或者6个字节的最坏情况来分配。

然后遍历每一个wchar字符，获取存放他所需的字节数，然后反序来设定字节内容。最后设定长度标记字节。对单字节表示的情况，其实这就是原值。

对应的utf8 -> wchar 函数：

/*
 * Converts a NUL-terminated UTF-8 string into a newly allocated,
 * NUL-terminated wide-character string.
 *
 * utf8Str  UTF-8 string to convert
 * outStr   receives the allocated result; the caller must free() it.
 *          Set to NULL on every failure after the parameter check.
 *
 * returns   0  success
 *          -1  utf8Str or outStr is NULL
 *          -2  memory could not be allocated or resized
 *          -3  invalid first octet, character running past the end of the
 *              string, or malformed continuation octet
 *          -4  a character needs more than two continuation octets but
 *              wchar_t is narrower than 4 bytes
 */
int CvtUTF8towchar( char * utf8Str, wchar_t ** outStr)
{
    size_t len, i, x;           /* i: input index, x: output index */
    unsigned int j;
    size_t wchar_size = sizeof(wchar_t);

    if ((utf8Str == NULL) || (outStr == NULL))
        return -1;

    len = strlen(utf8Str);

    /* Allocate and clear the memory for a worst case result wchar_t string
       (one wide character per input octet, plus the terminator) */
    *outStr = (wchar_t*)calloc(len + 1, sizeof(wchar_t));
    if (*outStr == NULL)
        return -2;

    /* Convert the UTF-8 string to a wchar_t string */
    i = 0;
    x = 0;
    while (i < len)
    {
        /* Determine the number of UTF-8 octets that follow the first */
        for (j = 0; (j < MAX_UTF8_OCTS_PER_CHAR) &&
            ((gUTF8Masks[j].mask & utf8Str[i]) != gUTF8Masks[j].value); j++)
            ;

        /* Return an error if the first octet was invalid or if the number of
        subsequent octets exceeds the UTF-8 string length */
        if ((j == MAX_UTF8_OCTS_PER_CHAR) || ((i + j) >= len))
        {
            free(*outStr);
            *outStr = NULL;
            return -3;
        }

        /* Return an error if the size of the wchar_t doesn't support the
        size of this UTF-8 character */
        if ((j > 2) && (wchar_size < 4))
        {
            free(*outStr);
            *outStr = NULL;
            return -4;
        }

        /* Copy the payload bits of the first octet into the wide character */
        (*outStr)[x] = (wchar_t)((unsigned char)~gUTF8Masks[j].mask &
            (unsigned char)utf8Str[i++]);

        /* Add in the bits from each subsequent octet */
        for (; j > 0; j--)
        {
            /* Return an error if a subsequent octet isn't properly formatted */
            if ((utf8Str[i] & 0xC0) != 0x80)
            {
                free(*outStr);
                *outStr = NULL;
                return -3;
            }

            (*outStr)[x] <<= 6;
            (*outStr)[x] |= utf8Str[i++] & 0x3F;
        }
        x++;
    }

    /* Shrink the wchar string memory to its correct size.  The result goes
       through a temporary so the original block is not leaked if realloc
       fails */
    if (x < len)
    {
        wchar_t *shrunk = (wchar_t*)realloc(*outStr, (x + 1) * sizeof(wchar_t));
        if (shrunk == NULL)
        {
            free(*outStr);
            *outStr = NULL;
            return -2;
        }
        *outStr = shrunk;
    }

    return 0;
}

以上这两个函数都是利用掩码值做一些按位操作，我一直想着有没有办法说明其意义，不过很遗憾，我实在没想出能用汉语说明白的办法。看来还是大家从计算机世界的语言去弄明白吧~

/******************************************************************************************/

完。

eSNACC对ASN.1内置字符串的编码和解码

eSNACC对ASN.1内置字符串的编码和解码

你可能感兴趣的:(eSNACC对ASN.1内置字符串的编码和解码)