v_JULY_v君的问题非常好(请见文章的评论)! 每次都让我思考. 现将Linux内核的atoi测试代码贴出来, 为了区别于C标准库的atoi函数, 我把测试的函数名改为matoi:
#include <ctype.h>
#include <string.h>
#include <stdio.h>

/* http://lxr.free-electrons.com/source/lib/kstrtox.h#L4 */
#define KSTRTOX_OVERFLOW (1U << 31)
const char *_parse_integer_fixup_radix(const char *s, unsigned int *base);
unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long *res);

/* http://lxr.free-electrons.com/source/arch/powerpc/boot/types.h#L12 */
typedef int s32;
typedef unsigned int u32;
typedef unsigned long long u64;

/* http://lxr.free-electrons.com/source/include/linux/kernel.h#L29 */
#ifndef ULLONG_MAX
#define ULLONG_MAX (~0ULL)
#endif
#define unlikely(cond) (cond)

/*
 * Lower-case a single character.
 *
 * Stands in for the kernel's _tolower(). The cast to unsigned char is
 * required because the <ctype.h> functions are undefined for negative
 * arguments other than EOF, and plain char may be signed.
 */
static int lower_char(char c)
{
    return tolower((unsigned char)c);
}

/* http://lxr.free-electrons.com/source/lib/kstrtox.c#L23 */
/*
 * Determine the radix of a number string and skip a hexadecimal prefix.
 *
 * @s:    start of the (non-negative) number string
 * @base: in/out radix. If *base is 0 on entry it is set to 16 for a
 *        "0x"/"0X" prefix followed by a hex digit, to 8 for any other
 *        leading '0', and to 10 otherwise.
 *
 * Returns @s, advanced past a "0x"/"0X" prefix when the radix is 16.
 */
const char *_parse_integer_fixup_radix(const char *s, unsigned int *base)
{
    if (*base == 0) {
        if (s[0] == '0') {
            if (lower_char(s[1]) == 'x' && isxdigit((unsigned char)s[2]))
                *base = 16;
            else
                *base = 8;
        } else {
            *base = 10;
        }
    }
    if (*base == 16 && s[0] == '0' && lower_char(s[1]) == 'x')
        s += 2;
    return s;
}

/* http://lxr.free-electrons.com/source/lib/kstrtox.c#L47 */
/*
 * Convert non-negative integer string representation in explicitly given radix
 * to an integer.
 * Return number of characters consumed maybe or-ed with overflow bit.
 * If overflow occurs, result integer (incorrect) is still returned.
 *
 * Don't you dare use this function.
 */
unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long *p)
{
    unsigned long long res = 0;
    unsigned int rv = 0;
    int overflow = 0;

    while (*s) {
        unsigned int val;
        int lc = lower_char(*s);

        if ('0' <= *s && *s <= '9')
            val = (unsigned int)(*s - '0');
        else if ('a' <= lc && lc <= 'f')
            val = (unsigned int)(lc - 'a' + 10);
        else
            break;

        /* Also stops on every digit when base is 0, so the division below is safe. */
        if (val >= base)
            break;
        /*
         * Check for overflow only if we are within range of
         * it in the max base we support (16)
         */
        if (unlikely(res & (~0ull << 60))) {
            /*
             * This is the line the accompanying text calls "line 75".
             * The kernel spells it div_u64(ULLONG_MAX - val, base); an
             * ordinary 64-bit division does the same job in user space.
             * The parentheses matter: ULLONG_MAX - val / base would
             * reduce to res > ULLONG_MAX, which can never be true.
             */
            if (res > (ULLONG_MAX - val) / base)
                overflow = 1;
        }
        res = res * base + val;
        rv++;
        s++;
    }
    *p = res;
    if (overflow)
        rv |= KSTRTOX_OVERFLOW;
    return rv;
}

/* http://lxr.free-electrons.com/source/lib/vsprintf.c#L44 */
/**
 * simple_strtoull - convert a string to an unsigned long long
 * @cp: The start of the string
 * @endp: A pointer to the end of the parsed string will be placed here
 * @base: The number base to use
 *
 * This function is obsolete. Please use kstrtoull instead.
 *
 * The overflow bit reported by _parse_integer() is masked off and
 * otherwise ignored, so an out-of-range input yields a wrapped value.
 */
unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base)
{
    unsigned long long result;
    unsigned int rv;

    cp = _parse_integer_fixup_radix(cp, &base);
    rv = _parse_integer(cp, base, &result);
    /* FIXME */
    cp += (rv & ~KSTRTOX_OVERFLOW);

    if (endp)
        *endp = (char *)cp;

    return result;
}

/* http://lxr.free-electrons.com/source/lib/vsprintf.c#L83 */
/**
 * simple_strtoul - convert a string to an unsigned long
 * @cp: The start of the string
 * @endp: A pointer to the end of the parsed string will be placed here
 * @base: The number base to use
 *
 * This function is obsolete. Please use kstrtoul instead.
 */
unsigned long simple_strtoul(const char *cp, char **endp, unsigned int base)
{
    return (unsigned long)simple_strtoull(cp, endp, base);
}

/* http://lxr.free-electrons.com/source/drivers/staging/tidspbridge/rmgr/dbdcd.c#L950 */
/*
 *  ======== atoi ========
 *  Purpose:
 *      This function converts strings in decimal or hex format to integers.
 *
 *  Leading white space is skipped. A leading '+' or '-' selects base 10;
 *  otherwise a trailing 'h'/'H' selects base 16; otherwise the radix is
 *  detected from the prefix.
 *
 *  NOTE: the sign character is consumed but never applied, and the
 *  unsigned result is simply narrowed to s32. Out-of-range input is
 *  therefore not detected; this is the behaviour the test below
 *  demonstrates.
 */
static s32 matoi(const char *psz_buf)
{
    const char *pch = psz_buf;
    s32 base = 0;

    while (isspace((unsigned char)*pch))
        pch++;

    if (*pch == '-' || *pch == '+') {
        base = 10;
        pch++;
    } else if (*pch && tolower((unsigned char)pch[strlen(pch) - 1]) == 'h') {
        base = 16;
    }

    return (s32)simple_strtoul(pch, NULL, (unsigned int)base);
}

/* Print one input string next to the value matoi() produces for it. */
void test(const char* str)
{
    printf("%s : %d\n", str, matoi(str));
}

int main(void)
{
    test("2147483647");
    test("2147483648");
    test("-2147483648");
    test("-2147483649");
    test("10522545459");
    test("-10522545459");
    return 0;
}
修改的地方在第75行, 原来的代码为:
if (res > div_u64(ULLONG_MAX - val, base))
而div_u64调用的div_u64_rem函数中包含汇编代码编译不过(原因尚未可知, 有待进一步研究), 所以我把这段程序去掉了.
程序的输出结果(很显然, 对于溢出的情况, 程序没有处理):
2147483647 : 2147483647 2147483648 : -2147483648 10522545459 : 1932610867 -2147483648 : -2147483648 -2147483649 : -2147483647 -10522545459 : 1932610867
以下是测试代码(在Ubuntu 10.04.1, gcc 4.4.3上编译通过, 为了区别于C标准库的函数, 函数名strtol更改为mstrtol, atoi更改为matoi2):
#include <errno.h>
#include <stdio.h>
#include <ctype.h>
#include <limits.h>
#include <string.h> /* strcmp() */

#define CONST const

/*
 * Convert the initial part of a string to a long in the given base.
 *
 * nptr:   string to convert
 * endptr: if not NULL, receives a pointer to the first unconverted
 *         character, or nptr itself when no digits were consumed
 * base:   radix; 0 selects 16 for a "0x"/"0X" prefix, 8 for a leading
 *         '0', and 10 otherwise
 *
 * Returns the converted value. When the range check below fires, the
 * result is LONG_MIN or LONG_MAX and errno is set to ERANGE.
 *
 * NOTE: the two range-check conditions are reproduced exactly as in the
 * listing under discussion. They are the subject of the text that
 * follows this program and are intentionally left as they are.
 */
long mstrtol(CONST char *nptr, char **endptr, int base)
{
    register CONST char *s;
    register long acc, cutoff;
    register int c;
    register int neg, any, cutlim;

    /*
     * Skip white space and pick up leading +/- sign if any.
     * If base is 0, allow 0x for hex and 0 for octal, else
     * assume decimal; if base is already 16, allow 0x.
     *
     * Every character is read through unsigned char so that the
     * <ctype.h> classifiers never see a negative value.
     */
    s = nptr;
    do {
        c = (unsigned char) *s++;
    } while (isspace(c));
    if (c == '-') {
        neg = 1;
        c = (unsigned char) *s++;
    } else {
        neg = 0;
        if (c == '+')
            c = (unsigned char) *s++;
    }
    if ((base == 0 || base == 16) && c == '0' && (*s == 'x' || *s == 'X')) {
        c = (unsigned char) s[1];
        s += 2;
        base = 16;
    }
    if (base == 0)
        base = c == '0' ? 8 : 10;

    /*
     * Compute the cutoff value between legal numbers and illegal
     * numbers.  That is the largest legal value, divided by the
     * base.  An input number that is greater than this value, if
     * followed by a legal input character, is too big.  One that
     * is equal to this value may be valid or not; the limit
     * between valid and invalid numbers is then based on the last
     * digit.  For instance, if the range for longs is
     * [-2147483648..2147483647] and the input base is 10,
     * cutoff will be set to 214748364 and cutlim to either
     * 7 (neg==0) or 8 (neg==1), meaning that if we have accumulated
     * a value > 214748364, or equal but the next digit is > 7 (or 8),
     * the number is too big, and we will return a range error.
     *
     * Set any if any `digits' consumed; make it negative to indicate
     * overflow.
     */
    cutoff = neg ? LONG_MIN : LONG_MAX;
    cutlim = cutoff % base;
    cutoff /= base;
    if (neg) {
        /* Normalise for implementations where % yields a positive remainder. */
        if (cutlim > 0) {
            cutlim -= base;
            cutoff += 1;
        }
        cutlim = -cutlim;
    }
    for (acc = 0, any = 0;; c = (unsigned char) *s++) {
        if (isdigit(c))
            c -= '0';
        else if (isalpha(c))
            c -= isupper(c) ? 'A' - 10 : 'a' - 10;
        else
            break;
        if (c >= base)
            break;
        /* Once out of range, keep consuming digits without accumulating. */
        if (any < 0)
            continue;
        if (neg) {
            /* The text refers to this condition as "line 79"; kept verbatim. */
            if ((acc < cutoff || acc == cutoff) && c > cutlim) {
                any = -1;
                acc = LONG_MIN;
                errno = ERANGE;
            } else {
                any = 1;
                acc *= base;
                acc -= c;
            }
        } else {
            /* The text refers to this condition as "line 89"; kept verbatim. */
            if ((acc > cutoff || acc == cutoff) && c > cutlim) {
                any = -1;
                acc = LONG_MAX;
                errno = ERANGE;
            } else {
                any = 1;
                acc *= base;
                acc += c;
            }
        }
    }
    if (endptr != 0)
        *endptr = (char *) (any ? s - 1 : nptr);
    return (acc);
}

/* Convert a decimal string to int by narrowing the result of mstrtol(). */
int matoi2(CONST char *str)
{
    return ((int) mstrtol(str, (char **) NULL, 10));
}

/*
 * Read one line from stdin into buf, without the trailing newline.
 *
 * At most n - 1 characters are stored and buf is always terminated when
 * n > 0; the remainder of an over-long line is left for the next call.
 *
 * Returns the number of characters stored, or -1 when buf is NULL,
 * n is 0, or end of input is reached before any character was read.
 * The -1 lets a caller's ">= 0" loop terminate at EOF.
 */
int mgetline(char* buf, size_t n)
{
    size_t idx = 0;
    int c = 0;

    /* Guard first: with n == 0 there is not even room for the terminator. */
    if (buf == NULL || n == 0)
        return -1;

    while (idx + 1 < n && (c = getchar()) != EOF && c != '\n') {
        buf[idx++] = (char) c;
    }
    buf[idx] = '\0';

    if (c == EOF && idx == 0)
        return -1;
    return (int) idx;
}

#define MAX_LINE 200

/* Read lines until "quit" or end of input and print matoi2() of each. */
int main(void)
{
    char buf[MAX_LINE];

    while (mgetline(buf, MAX_LINE) >= 0) {
        if (strcmp(buf, "quit") == 0)
            break;
        printf("matoi2=%d\n", matoi2(buf));
    }
    return 0;
}
程序的测试结果:
10522545459 matoi2=2147483647 -10522545459 matoi2=-2147483648
程序貌似对溢出的处理是正确的, 真的吗? 请注意代码的第79和第89行. 现在我把测试数据换成"10522545454", 与"10522545459"区别在于最后一个字符.
10522545454 matoi2=1932610862 -10522545454 matoi2=-1932610862
bingo! 正中下怀! 对于字串"10522545454", 在读取最后的数字字符'4'时, 整数1052254545已经大于2147483647/10了, 说明已经溢出, 不应该再判断字串的最后一位4是否大于2147483647%10, 所以第79行应该改为(89行修改方法类似):
if (acc < cutoff || (acc == cutoff && c > cutlim)) {
修改过后的代码测试正常:
10522545459 matoi2=2147483647 -10522545459 matoi2=-2147483648 10522545454 matoi2=2147483647 -10522545454 matoi2=-2147483648 quit
关于此bug, 我已经邮件通知En-Nut-Discussion.
以下为邮件回复的截图, Uwe Bonnes说: 可以打个补丁到分支. 不过他把单词reasonable给拼错了.
References:
Linux Cross Reference
Nut/OS API