atoi函数的实现二: 测试各实现的正确性

linux内核的atoi测试

v_JULY_v君的问题非常好(请见文章的评论)! 每次都让我思考. 现将linux内核的atoi测试代码贴出来, 为了区别于C标准库的atoi函数, 我把测试的函数名改为matoi:

#include <ctype.h>
#include <string.h>
#include <stdio.h>

/*http://lxr.free-electrons.com/source/lib/kstrtox.h#L4*/
#define KSTRTOX_OVERFLOW        (1U << 31)      /* flag bit or-ed into _parse_integer()'s return value on overflow */
const char *_parse_integer_fixup_radix(const char *s, unsigned int *base);
unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long *res);

/*http://lxr.free-electrons.com/source/arch/powerpc/boot/types.h#L12*/
typedef int                     s32;
typedef unsigned int            u32;
typedef unsigned long long      u64;

/*http://lxr.free-electrons.com/source/drivers/media/pci/ngene/ngene-dvb.c#L127*/
static u32 overflow;            /* NOTE(review): not referenced in this listing; _parse_integer() declares its own local of the same name */

/*http://lxr.free-electrons.com/source/include/linux/kernel.h#L29*/
#define ULLONG_MAX      (~0ULL)         /* all bits set: the largest unsigned long long value */

#define unlikely(cond) (cond)           /* in this listing it simply expands to the condition itself */

/*http://lxr.free-electrons.com/source/lib/kstrtox.c#L23*/
/* Resolve an auto-detect radix (*base == 0), then step over a hex "0x"/"0X" prefix. */
const char *_parse_integer_fixup_radix(const char *s, unsigned int *base)
{
        /* s[1] is only examined when s[0] is '0', so the terminator is never passed */
        const int hex_prefix = (s[0] == '0' && _tolower(s[1]) == 'x');
        if (*base == 0) {
                if (s[0] != '0')
                        *base = 10;
                else if (hex_prefix && isxdigit(s[2]))
                        *base = 16;
                else
                        *base = 8;
        }
        return (*base == 16 && hex_prefix) ? s + 2 : s;
}

/*http://lxr.free-electrons.com/source/lib/kstrtox.c#L47*/
/*
 * Parse the leading digits of @s in radix @base (at most 16) into *@p; parsing
 * stops at the first character that is not a digit of that radix.
 * Returns the number of characters consumed, or-ed with KSTRTOX_OVERFLOW when
 * the value does not fit in unsigned long long (*@p then holds the wrapped,
 * incorrect result).
 * Don't you dare use this function.
 */
unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long *p)
{
        unsigned long long res;
        unsigned int rv;
        int overflow;

        res = 0;
        rv = 0;
        overflow = 0;
        while (*s) {
                unsigned int val;

                if ('0' <= *s && *s <= '9')
                        val = *s - '0';
                else if ('a' <= _tolower(*s) && _tolower(*s) <= 'f')
                        val = _tolower(*s) - 'a' + 10;
                else
                        break;

                if (val >= base)
                        break;
                /*
                 * res * base + val can only wrap once one of the top four bits
                 * of res is set (base <= 16), so the division is skipped before.
                 */
                if (unlikely(res & (~0ull << 60))) {
                        if (res > (ULLONG_MAX - val) / base)
                                overflow = 1;
                }
                res = res * base + val;
                rv++;
                s++;
        }
        *p = res;
        if (overflow)
                rv |= KSTRTOX_OVERFLOW;
        return rv;
}

/*http://lxr.free-electrons.com/source/lib/vsprintf.c#L44*/
/**
 * simple_strtoull - parse an unsigned long long from the start of a string
 * @cp: first character of the text to parse
 * @endp: when non-NULL, set to the first character that was not consumed
 * @base: radix of the number; 0 lets _parse_integer_fixup_radix() pick one
 *
 * The overflow flag reported by _parse_integer() is masked off and ignored.
 * This function is obsolete. Please use kstrtoull instead.
 */
unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base)
{
        unsigned long long value;
        const char *digits = _parse_integer_fixup_radix(cp, &base);
        unsigned int consumed = _parse_integer(digits, base, &value);

        /* FIXME: KSTRTOX_OVERFLOW is dropped here, so overflow goes unreported */
        consumed &= ~KSTRTOX_OVERFLOW;
        if (endp)
                *endp = (char *)(digits + consumed);

        return value;
}

/*http://lxr.free-electrons.com/source/lib/vsprintf.c#L83*/
/**
 * simple_strtoul - parse an unsigned long from the start of a string
 * @cp: first character of the text to parse
 * @endp: when non-NULL, set to the first character that was not consumed
 * @base: radix of the number, passed through to simple_strtoull()
 *
 * Thin wrapper: the unsigned long long result of simple_strtoull() is
 * converted to unsigned long on return.
 * This function is obsolete. Please use kstrtoul instead.
 */
unsigned long simple_strtoul(const char *cp, char **endp, unsigned int base)
{
        unsigned long long wide = simple_strtoull(cp, endp, base);

        return wide;
}

/*http://lxr.free-electrons.com/source/drivers/staging/tidspbridge/rmgr/dbdcd.c#L950*/
/*
 *  ======== matoi ========
 *  Purpose:
 *      This function converts strings in decimal or hex format to integers.
 *
 *  Leading white space is skipped.  A leading '+' or '-' forces base 10 and
 *  is consumed; a trailing 'h'/'H' forces base 16; otherwise base 0 is passed
 *  on so that simple_strtoul() detects the radix from the prefix.
 *
 *  NOTE: the sign character is only skipped, never applied to the result, so
 *  "-5" and "5" parse to the same value.  This is left exactly as in the code
 *  under test; the results quoted after the listing depend on it.
 *
 *  Returns the unsigned long from simple_strtoul() converted to s32.
 */
static s32 matoi(const char *psz_buf)
{
        /* const-qualified: the input string is only read, never written */
        const char *pch = psz_buf;
        s32 base = 0;

        /* <ctype.h> functions require a value representable as unsigned char */
        while (isspace((unsigned char)*pch))
                pch++;

        if (*pch == '-' || *pch == '+') {
                base = 10;
                pch++;
        } else if (*pch && tolower((unsigned char)pch[strlen(pch) - 1]) == 'h') {
                base = 16;
        }

        return simple_strtoul(pch, NULL, base);
}

/* Print one input string next to the value matoi() produces for it. */
void test(const char* str) {
    int value = matoi(str);

    printf("%s : %d\n", str, value);
}

/* Run matoi() over values on both sides of the 32-bit signed limits. */
int main() {
    static const char *const inputs[] = {
        "2147483647",
        "2147483648",
        "-2147483648",
        "-2147483649",
        "10522545459",
        "-10522545459",
    };
    size_t i;

    for (i = 0; i < sizeof inputs / sizeof inputs[0]; i++)
        test(inputs[i]);

    return 0;
}

修改的地方在第75行, 原来的代码为:

if (res > div_u64(ULLONG_MAX - val, base))

而div_u64调用的div_u64_rem函数中包含汇编代码, 编译不过(原因尚未可知, 有待进一步研究), 所以我把div_u64的调用去掉了, 改用C的除法运算符.

程序的输出结果(很显然, 对于溢出的情况, 程序没有处理):

2147483647 : 2147483647
2147483648 : -2147483648
-2147483648 : -2147483648
-2147483649 : -2147483647
10522545459 : 1932610867
-10522545459 : 1932610867

Nut/OS的atoi测试

以下是测试代码(在Ubuntu 10.04.1, gcc 4.4.3上编译通过, 为了区别于C标准库的函数, 函数名strtol更改为mstrtol, atoi更改为matoi2):

#include <errno.h>
#include <stdio.h>
#include <ctype.h>
#include <limits.h>
#include <string.h>
#define CONST      const

/*
 * Convert the leading part of nptr to a long, in the manner of strtol().
 *
 * nptr   - NUL-terminated text; white space and one '+' or '-' are skipped.
 * endptr - if non-NULL, receives the first unparsed character, or nptr
 *          itself when no digit was consumed.
 * base   - radix; 0 means 16 after "0x"/"0X", 8 after a leading '0', else 10.
 *
 * Returns the value; when overflow is detected, LONG_MAX or LONG_MIN is
 * returned and errno is set to ERANGE.
 */
long mstrtol(CONST char *nptr, char **endptr, int base)
{
    register CONST char *s;
    register long acc, cutoff;
    register int c;
    register int neg, any, cutlim;

    /* Skip white space, take an optional sign, then settle the radix. */
    s = nptr;
    do {
        c = (unsigned char) *s++;
    } while (isspace(c));
    if (c == '-') {
        neg = 1;
        c = (unsigned char) *s++;
    } else {
        neg = 0;
        if (c == '+')
            c = (unsigned char) *s++;
    }
    if ((base == 0 || base == 16) && c == '0' && (*s == 'x' || *s == 'X')) {
        c = (unsigned char) s[1];
        s += 2;
        base = 16;
    }
    if (base == 0)
        base = c == '0' ? 8 : 10;

    /*
     * cutoff is the limit (LONG_MAX, or LONG_MIN when neg) divided by base and
     * cutlim the magnitude of the limit's last digit: for a 32-bit long in
     * base 10, cutoff is +/-214748364 and cutlim is 7 (neg == 0) or 8 (neg == 1).
     * any is set to 1 once a digit is consumed and to -1 on overflow.
     *
     * NOTE: the two range tests in the loop only fire when c > cutlim, so an
     * accumulator already beyond cutoff still takes a small next digit.  They
     * are left as published because the text after this listing analyses them.
     */
    cutoff = neg ? LONG_MIN : LONG_MAX;
    cutlim = cutoff % base;
    cutoff /= base;
    if (neg) {
        if (cutlim > 0) {
            cutlim -= base;
            cutoff += 1;
        }
        cutlim = -cutlim;
    }
    for (acc = 0, any = 0;; c = (unsigned char) *s++) {
        if (isdigit(c))
            c -= '0';
        else if (isalpha(c))
            c -= isupper(c) ? 'A' - 10 : 'a' - 10;
        else
            break;
        if (c >= base)
            break;
        if (any < 0)
            continue;
        if (neg) {
            if ((acc < cutoff || acc == cutoff) && c > cutlim) {
                any = -1;
                acc = LONG_MIN;
                errno = ERANGE;
            } else {
                any = 1;
                acc *= base;
                acc -= c;
            }
        } else {
            if ((acc > cutoff || acc == cutoff) && c > cutlim) {
                any = -1;
                acc = LONG_MAX;
                errno = ERANGE;
            } else {
                any = 1;
                acc *= base;
                acc += c;
            }
        }
    }
    if (endptr != 0)
        *endptr = (char *) (any ? s - 1 : nptr);
    return (acc);
}

/* atoi-style wrapper: parse str in base 10 with mstrtol() and narrow the long result to int. */
int matoi2(CONST char *str)
{
    long parsed = mstrtol(str, NULL, 10);

    return (int) parsed;
}

/*
 * Read one line from stdin into buf, dropping the terminating newline.
 *
 * buf - destination buffer
 * n   - size of buf in bytes; at most n - 1 characters are stored and buf is
 *       NUL-terminated whenever n > 0.  The rest of an over-long line stays
 *       in the stream for the next call.
 *
 * Returns the number of characters stored, or -1 when getchar() reports EOF
 * before any character of the line was read, so that a caller looping on
 * "mgetline(...) >= 0" terminates once the input is exhausted.
 * With n == 0 nothing is read or written and 0 is returned.
 */
int mgetline(char* buf, size_t n) {
  size_t idx = 0;
  int c = 0;

  /* No room even for the terminator; also avoids wrapping an unsigned n - 1. */
  if (n == 0)
    return 0;

  while (idx + 1 < n && (c = getchar()) != EOF && c != '\n') {
    buf[idx++] = (char) c;
  }
  buf[idx] = '\0';

  if (c == EOF && idx == 0)
    return -1;
  return (int) idx;
}

#define MAX_LINE 200

/*
 * Read lines from stdin and print matoi2() of each one, until the line "quit"
 * is entered or the input ends.
 */
int main() {
    char buf[MAX_LINE];

    for (;;) {
        int len = mgetline(buf, MAX_LINE);

        /*
         * Stop once stdin is exhausted: either a negative return or an empty
         * read with the end-of-file indicator set.  Testing only "len >= 0"
         * would spin forever at EOF when an empty read is reported as 0.
         */
        if (len < 0 || (len == 0 && feof(stdin)))
            break;
        if (strcmp(buf, "quit") == 0)
            break;
        printf("matoi2=%d\n", matoi2(buf));
    }
    return 0;
}

程序的测试结果:

10522545459
matoi2=2147483647
-10522545459
matoi2=-2147483648

程序貌似对溢出的处理是正确的, 真的吗? 请注意代码的第79和第89行. 现在我把测试数据换成"10522545454", 与"10522545459"区别在于最后一个字符.

10522545454
matoi2=1932610862
-10522545454
matoi2=-1932610862

bingo! 正中下怀! 对于字串"10522545454", 在读取最后的数字字符'4'时, 整数1052254545已经大于2147483647/10了, 说明已经溢出, 不应该再判断字串的最后一位4是否大于2147483647%10. 第79行(负数分支)存在同样的问题, 应该改为(第89行的正数分支修改方法类似, 只需把"<"换成">"):

            if (acc < cutoff || (acc == cutoff && c > cutlim)) {

修改过后的代码测试正常:

10522545459
matoi2=2147483647
-10522545459
matoi2=-2147483648
10522545454
matoi2=2147483647
-10522545454
matoi2=-2147483648
quit

关于此bug, 我已经邮件通知En-Nut-Discussion.

以下为邮件回复的截图, Uwe Bonnes说: 可以打个补丁到分支. 不过他把单词reasonable给拼错了.

atoi函数的实现二: 测试各实现的正确性_第1张图片

References:

Linux Cross Reference

Nut/OS API

你可能感兴趣的:(c,linux,String,atoi)