预处理器除了知道什么是数字外,并不尝试去解读它。对于预处理器来说,这样并无不可,而且这样可以使得预处理器更灵活。但是当从预处理器处得到数字序列时,词法分析器需要知道如何解读它。函数cpp_classify_number尝试根据数字字符串设置flags。
/* Classify the spelling of the number in TOKEN without computing its value.
   Returns a flag word: whatever interpret_int_suffix or
   interpret_float_suffix returned for the suffix, ORed with
   CPP_N_INTEGER or CPP_N_FLOATING and with one of CPP_N_DECIMAL,
   CPP_N_HEX or CPP_N_OCTAL.  Returns CPP_N_INVALID for a malformed
   number or an unrecognised suffix.  PFILE supplies the options that
   are consulted and is the reader the diagnostics are issued against.
   NOTE(review): SYNTAX_ERROR / SYNTAX_ERROR2 are defined outside this
   listing; they presumably report the message and jump to the
   syntax_error label at the end -- confirm.  */
143 unsigned int
144 cpp_classify_number (cpp_reader *pfile, const cpp_token *token) in cppexp.c
145 {
146 const uchar *str = token->val.str.text;
147 const uchar *limit;
148 unsigned int max_digit, result, radix;
149 enum {NOT_FLOAT = 0, AFTER_POINT, AFTER_EXPON} float_flag;
150
151 /* If the lexer has done its job, length one can only be a single
152 digit. Fast-path this very common case. */
153 if (token->val.str.len == 1)
154 return CPP_N_INTEGER | CPP_N_SMALL | CPP_N_DECIMAL;
155
156 limit = str + token->val.str.len;
157 float_flag = NOT_FLOAT;
158 max_digit = 0;
159 radix = 10;
160
161 /* First, interpret the radix. */
162 if (*str == '0')
163 {
164 radix = 8;
165 str++;
166
167 /* Require at least one hex digit to classify it as hex. */
168 if ((*str == 'x' || *str == 'X')
169 && (str[1] == '.' || ISXDIGIT (str[1])))
170 {
171 radix = 16;
172 str++;
173 }
174 }
175
176 /* Now scan for a well-formed integer or float. */
/* The largest digit seen is remembered in max_digit so that the
   digit-versus-radix check can be done once, after the loop.  */
177 for (;;)
178 {
179 unsigned int c = *str++;
180
181 if (ISDIGIT (c) || (ISXDIGIT (c) && radix == 16))
182 {
183 c = hex_value (c);
184 if (c > max_digit)
185 max_digit = c;
186 }
187 else if (c == '.')
188 {
189 if (float_flag == NOT_FLOAT)
190 float_flag = AFTER_POINT;
191 else
192 SYNTAX_ERROR ("too many decimal points in number");
193 }
194 else if ((radix <= 10 && (c == 'e' || c == 'E'))
195 || (radix == 16 && (c == 'p' || c == 'P')))
196 {
197 float_flag = AFTER_EXPON;
198 break;
199 }
200 else
201 {
202 /* Start of suffix. */
203 str--;
204 break;
205 }
206 }
207
/* A leading 0 followed by a point or an exponent is a decimal float,
   not an octal number.  */
208 if (float_flag != NOT_FLOAT && radix == 8)
209 radix = 10;
210
211 if (max_digit >= radix)
212 SYNTAX_ERROR2 ("invalid digit \"%c\" in octal constant", '0' + max_digit);
213
214 if (float_flag != NOT_FLOAT)
215 {
216 if (radix == 16 && CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, c99))
217 cpp_error (pfile, CPP_DL_PEDWARN,
218 "use of C99 hexadecimal floating constant");
219
220 if (float_flag == AFTER_EXPON)
221 {
222 if (*str == '+' || *str == '-')
223 str++;
224
225 /* Exponent is decimal, even if string is a hex float. */
226 if (!ISDIGIT (*str))
227 SYNTAX_ERROR ("exponent has no digits");
228
229 do
230 str++;
231 while (ISDIGIT (*str));
232 }
233 else if (radix == 16)
234 SYNTAX_ERROR ("hexadecimal floating constants require an exponent");
235
/* Everything from str up to limit is the suffix.  */
236 result = interpret_float_suffix (str, limit - str);
237 if (result == 0)
238 {
239 cpp_error (pfile, CPP_DL_ERROR,
240 "invalid suffix \"%.*s\" on floating constant",
241 (int) (limit - str), str);
242 return CPP_N_INVALID;
243 }
244
245 /* Traditional C didn't accept any floating suffixes. */
246 if (limit != str
247 && CPP_WTRADITIONAL (pfile)
248 && ! cpp_sys_macro_p (pfile))
249 cpp_error (pfile, CPP_DL_WARNING,
250 "traditional C rejects the \"%.*s\" suffix",
251 (int) (limit - str), str);
252
253 result |= CPP_N_FLOATING;
254 }
255 else
256 {
257 result = interpret_int_suffix (str, limit - str);
258 if (result == 0)
259 {
260 cpp_error (pfile, CPP_DL_ERROR,
261 "invalid suffix \"%.*s\" on integer constant",
262 (int) (limit - str), str);
263 return CPP_N_INVALID;
264 }
265
266 /* Traditional C only accepted the 'L' suffix.
267 Suppress warning about 'LL' with -Wno-long-long. */
268 if (CPP_WTRADITIONAL (pfile) && ! cpp_sys_macro_p (pfile))
269 {
270 int u_or_i = (result & (CPP_N_UNSIGNED|CPP_N_IMAGINARY));
271 int large = (result & CPP_N_WIDTH) == CPP_N_LARGE;
272
273 if (u_or_i || (large && CPP_OPTION (pfile, warn_long_long)))
274 cpp_error (pfile, CPP_DL_WARNING,
275 "traditional C rejects the \"%.*s\" suffix",
276 (int) (limit - str), str);
277 }
278
279 if ((result & CPP_N_WIDTH) == CPP_N_LARGE
280 && ! CPP_OPTION (pfile, c99)
281 && CPP_OPTION (pfile, warn_long_long))
282 cpp_error (pfile, CPP_DL_PEDWARN,
283 "use of C99 long long integer constant");
284
285 result |= CPP_N_INTEGER;
286 }
287
288 if ((result & CPP_N_IMAGINARY) && CPP_PEDANTIC (pfile))
289 cpp_error (pfile, CPP_DL_PEDWARN,
290 "imaginary constants are a GCC extension");
291
292 if (radix == 10)
293 result |= CPP_N_DECIMAL;
294 else if (radix == 16)
295 result |= CPP_N_HEX;
296 else
297 result |= CPP_N_OCTAL;
298
299 return result;
300
301 syntax_error:
302 return CPP_N_INVALID;
303 }
同样一个数值常量,在C++下可以有多种写法,比如:1234、0x4D2、02322、1.234e3、0x4.d2p8都代表十进制值1234,当然后2种写法算作浮点数,一个以10为底(e),另一个以2为底(p)。另外,这些常量后还可跟有后缀,以限定其属性。浮点常量可用的后缀有f/F(float,单精度),l/L(long double),i/I或j/J(虚数常量,GCC扩展)。整数常量可用后缀有l/L(long),ll/LL(long long),u/U(无符号),i/I或j/J(虚数常量,GCC扩展)。在这里,cpp_classify_number会检查这些后缀是否有效,并根据后缀设置CPP_N_SMALL,CPP_N_MEDIUM,CPP_N_LARGE。
整数常量由下面的函数来解析,并构建一个INTEGER_CST节点。
490 static tree
491 interpret_integer (const cpp_token *token, unsigned int flags) in c-lex.c
492 {
493 tree value, type;
494 enum integer_type_kind itk;
495 cpp_num integer;
496 cpp_options *options = cpp_get_options (parse_in);
497
498 integer = cpp_interpret_integer (parse_in, token, flags);
499 integer = cpp_num_sign_extend (integer, options->precision);
500 value = build_int_2_wide (integer.low, integer.high);
如果是整数,首先要验证对于目标机器来说,该数字是有效的。那么结构体cpp_num被用来从cpp_interpret_integer收集这个结果。
/* A double-word integer value together with its signedness and an
   overflow indicator.  The value is split across two cpp_num_part
   words: HIGH holds the upper word and LOW the lower word.  */
604 struct cpp_num in cpplib.h
605 {
606 cpp_num_part high;
607 cpp_num_part low;
608 bool unsignedp; /* True if value should be treated as unsigned. */
609 bool overflow; /* True if the most recent calculation overflowed. */
610 };
在其定义中,域high,low被设计成宿主机器(host machine)上最宽的整数。对于Linux/x86,这个类型是long。
/* One word of a cpp_num: an unsigned HOST_WIDE_INT.  */
602 typedef unsigned HOST_WIDE_INT cpp_num_part; in cpplib.h
函数cpp_interpret_integer的参数type就是cpp_classify_number辛苦分析出来的结果。
311 cpp_num
312 cpp_interpret_integer (cpp_reader *pfile, const cpp_token *token, in cppexp.c
313 unsigned int type)
314 {
315 const uchar *p, *end;
316 cpp_num result;
317
318 result.low = 0;
319 result.high = 0;
320 result.unsignedp = !!(type & CPP_N_UNSIGNED);
321 result.overflow = false;
322
323 p = token->val.str.text;
324 end = p + token->val.str.len;
325
326 /* Common case of a single digit. */
327 if (token->val.str.len == 1)
328 result.low = p[0] - '0';
329 else
330 {
331 cpp_num_part max;
332 size_t precision = CPP_OPTION (pfile, precision);
333 unsigned int base = 10, c = 0;
334 bool overflow = false;
335
336 if ((type & CPP_N_RADIX) == CPP_N_OCTAL)
337 {
338 base = 8;
339 p++;
340 }
341 else if ((type & CPP_N_RADIX) == CPP_N_HEX)
342 {
343 base = 16;
344 p += 2;
345 }
346
347 /* We can add a digit to numbers strictly less than this without
348 needing the precision and slowness of double integers. */
349 max = ~(cpp_num_part) 0;
350 if (precision < PART_PRECISION)
351 max >>= PART_PRECISION - precision;
352 max = (max - base + 1) / base + 1;
353
354 for (; p < end; p++)
355 {
356 c = *p;
357
358 if (ISDIGIT (c) || (base == 16 && ISXDIGIT (c)))
359 c = hex_value (c);
360 else
361 break;
362
363 /* Strict inequality for when max is set to zero. */
364 if (result.low < max)
365 result.low = result.low * base + c;
366 else
367 {
368 result = append_digit (result, c, base, precision);
369 overflow |= result.overflow;
370 max = 0;
371 }
372 }
上面的PART_PRECISION是high和low类型的比特数(即sizeof(long) * CHAR_BIT)。而pfile所保存的precision在cpp_create_reader中初始化,且是long类型的比特大小。从349到352行的代码计算一个阈值max:只要当前结果严格小于max,再追加一位数字(即乘以base后加上该数字)就不会超出precision所允许的范围。
那么当结果仍旧在max以下时,移入当前的数字还是安全的。否则,它需要由append_digit小心处理。
/* Append DIGIT to NUM: compute NUM * BASE + DIGIT across the
   high/low word pair.  The multiplication is a left shift by 4 when
   BASE is 16 and by 3 otherwise; for BASE 10 twice NUM is added on
   top.  The result is then trimmed to PRECISION bits with num_trim.
   The returned value takes unsignedp from NUM, and its overflow flag
   is set if this step overflowed the two-word representation or if
   trimming changed the value.  */
397 static cpp_num
398 append_digit (cpp_num num, int digit, int base, size_t precision) in cppexp.c
399 {
400 cpp_num result;
401 unsigned int shift = 3 + (base == 16);
402 bool overflow;
403 cpp_num_part add_high, add_low;
404
405 /* Multiply by 8 or 16. Catching this overflow here means we don't
406 need to worry about add_high overflowing. */
407 overflow = !!(num.high >> (PART_PRECISION - shift));
408 result.high = num.high << shift;
409 result.low = num.low << shift;
410 result.high |= num.low >> (PART_PRECISION - shift);
411
/* For base 10, x * 10 == x * 8 + x * 2: add_high/add_low receive
   x * 2, including the bit carried from the low into the high word.  */
412 if (base == 10)
413 {
414 add_low = num.low << 1;
415 add_high = (num.high << 1) + (num.low >> (PART_PRECISION - 1));
416 }
417 else
418 add_high = add_low = 0;
419
/* Add the digit to the addend, carrying into add_high on wrap.  */
420 if (add_low + digit < add_low)
421 add_high++;
422 add_low += digit;
423
/* Detect the carry out of the low word and overflow of the high word
   before the sums are actually stored.  */
424 if (result.low + add_low < result.low)
425 add_high++;
426 if (result.high + add_high < result.high)
427 overflow = true;
428
429 result.low += add_low;
430 result.high += add_high;
431
432 /* The above code catches overflow of a cpp_num type. This catches
433 overflow of the (possibly shorter) target precision. */
/* num is reused here to keep the untrimmed value for the comparison.  */
434 num.low = result.low;
435 num.high = result.high;
436 result = num_trim (result, precision);
437 if (!num_eq (result, num))
438 overflow = true;
439
440 result.unsignedp = num.unsignedp;
441 result.overflow = overflow;
442 return result;
443 }
因为result使用域high和low来保存被解释的数值,首先检查这个额外的数字是否将使result溢出。注意仅当base为16时,shift是4,否则是3。然而对于base为10,左移3个比特仅把该数值乘以8而不是10,因此414及415行准备好该数值的2倍作为加数。这个加法可能导致溢出,424至427行为之进行检查。
显然,其结果的数值很可能超出了pfile所指定的精度(参数precision)。函数num_trim根据精度裁剪这个结果。
/* Return NUM masked down to its low PRECISION bits.  When PRECISION
   exceeds PART_PRECISION only the high word is masked (to
   PRECISION - PART_PRECISION bits); otherwise the low word is masked
   and the high word is cleared.  The inner "precision <
   PART_PRECISION" tests skip the mask when no bits of that word need
   removing, which also avoids a shift by PART_PRECISION
   (NOTE(review): presumably the full bit width of cpp_num_part --
   confirm).  */
1004 static cpp_num
1005 num_trim (cpp_num num, size_t precision) in cppexp.c
1006 {
1007 if (precision > PART_PRECISION)
1008 {
1009 precision -= PART_PRECISION;
1010 if (precision < PART_PRECISION)
1011 num.high &= ((cpp_num_part) 1 << precision) - 1;
1012 }
1013 else
1014 {
1015 if (precision < PART_PRECISION)
1016 num.low &= ((cpp_num_part) 1 << precision) - 1;
1017 num.high = 0;
1018 }
1019
1020 return num;
1021 }
无论如何,如果裁剪的结果与裁剪前不一样,毫无疑问,发生了溢出。从append_digit返回后,因为它是给定精度下最后一个可以被安全加入的数字,在cpp_interpret_integer的370行,max被设为0,使得对余下的数字364行的条件永远为false。
cpp_interpret_integer (continue)
374 if (overflow)
375 cpp_error (pfile, CPP_DL_PEDWARN,
376 "integer constant is too large for its type");
377 /* If too big to be signed, consider it unsigned. Only warn for
378 decimal numbers. Traditional numbers were always signed (but
379 we still honor an explicit U suffix); but we only have
380 traditional semantics in directives. */
381 else if (!result.unsignedp
382 && !(CPP_OPTION (pfile, traditional)
383 && pfile->state.in_directive)
384 && !num_positive (result, precision))
385 {
386 if (base == 10)
387 cpp_error (pfile, CPP_DL_WARNING,
388 "integer constant is so large that it is unsigned");
389 result.unsignedp = true;
390 }
391 }
392
393 return result;
394 }
因为树节点INTEGER_CST也是使用HOST_WIDE_INT类型的high及low域,以2进制补码的形式保存数值。对于有符号的数值,这是要求符号扩展的。这项工作由cpp_num_sign_extend来完成。
/* Sign-extend NUM from PRECISION bits to the full width of the
   high/low word pair.  Nothing is done when NUM is unsigned.
   Otherwise, if bit PRECISION - 1 (the sign bit) is set, every bit
   above it is set to one: within the high word when PRECISION
   exceeds PART_PRECISION, or within the low word plus the whole high
   word when it does not.  Returns the adjusted value.  */
1038 cpp_num
1039 cpp_num_sign_extend (cpp_num num, size_t precision) in cppexp.c
1040 {
1041 if (!num.unsignedp)
1042 {
1043 if (precision > PART_PRECISION)
1044 {
/* The sign bit lives in the high word; the low word is left as is.  */
1045 precision -= PART_PRECISION;
1046 if (precision < PART_PRECISION
1047 && (num.high & (cpp_num_part) 1 << (precision - 1)))
1048 num.high |= ~(~(cpp_num_part) 0 >> (PART_PRECISION - precision));
1049 }
1050 else if (num.low & (cpp_num_part) 1 << (precision - 1))
1051 {
/* The sign bit lives in the low word: fill the rest of the low word
   (if any) and the entire high word with ones.  */
1052 if (precision < PART_PRECISION)
1053 num.low |= ~(~(cpp_num_part) 0 >> (PART_PRECISION - precision));
1054 num.high = ~(cpp_num_part) 0;
1055 }
1056 }
1057
1058 return num;
1059 }
然后经过验证的数值被填入由interpret_integer 500行的build_int_2_wide所创建的value节点中。
接下来,要为结果关联上类型。我们已经看到,在编译器的初始化阶段,整数的类型节点都已经被创建。将要选出最适合的类型(包含尽可能少的比特位)。
interpret_integer (continue)
502 /* The type of a constant with a U suffix is straightforward. */
503 if (flags & CPP_N_UNSIGNED)
504 itk = narrowest_unsigned_type (value, flags);
505 else
506 {
507 /* The type of a potentially-signed integer constant varies
508 depending on the base it's in, the standard in use, and the
509 length suffixes. */
510 enum integer_type_kind itk_u = narrowest_unsigned_type (value, flags);
511 enum integer_type_kind itk_s = narrowest_signed_type (value, flags);
512
513 /* In both C89 and C99, octal and hex constants may be signed or
514 unsigned, whichever fits tighter. We do not warn about this
515 choice differing from the traditional choice, as the constant
516 is probably a bit pattern and either way will work. */
517 if ((flags & CPP_N_RADIX) != CPP_N_DECIMAL)
518 itk = MIN (itk_u, itk_s);
519 else
520 {
521 /* In C99, decimal constants are always signed.
522 In C89, decimal constants that don't fit in long have
523 undefined behavior; we try to make them unsigned long.
524 In GCC's extended C89, that last is true of decimal
525 constants that don't fit in long long, too. */
526
527 itk = itk_s;
528 if (itk_s > itk_u && itk_s > itk_long)
529 {
530 if (!flag_isoc99)
531 {
532 if (itk_u < itk_unsigned_long)
533 itk_u = itk_unsigned_long;
534 itk = itk_u;
535 warning ("this decimal constant is unsigned only in ISO C90");
536 }
537 else if (warn_traditional)
538 warning ("this decimal constant would be unsigned in ISO C90");
539 }
540 }
541 }
542
543 if (itk == itk_none)
544 /* cpplib has already issued a warning for overflow. */
545 type = ((flags & CPP_N_UNSIGNED)
546 ? widest_unsigned_literal_type_node
547 : widest_integer_literal_type_node);
548 else
549 type = integer_types[itk];
550
551 if (itk > itk_unsigned_long
552 && (flags & CPP_N_WIDTH) != CPP_N_LARGE
553 && ! in_system_header && ! flag_isoc99)
554 pedwarn ("integer constant is too large for \"%s\" type",
555 (flags & CPP_N_UNSIGNED) ? "unsigned long" : "long");
556
557 TREE_TYPE (value) = type;
558
559 /* Convert imaginary to a complex type. */
560 if (flags & CPP_N_IMAGINARY)
561 value = build_complex (NULL_TREE, convert (type, integer_zero_node), value);
562
563 return value;
564 }
函数narrowest_unsigned_type与narrowest_signed_type非常相似。对于整型常量,如果不带后缀‘l/L’,该值被认为是CPP_N_SMALL;如果带后缀‘l/L’,则被视为CPP_N_MEDIUM;如果带后缀‘ll/LL’,就是CPP_N_LARGE(参考cpp_classify_number)。
/* Return the first integer type kind able to hold VALUE, searching
   upwards from a starting kind chosen by the CPP_N_WIDTH bits of
   FLAGS: unsigned int for CPP_N_SMALL, unsigned long for
   CPP_N_MEDIUM, unsigned long long otherwise.  Candidates are visited
   in steps of two entries of integer_types and tested with
   int_fits_type_p.  Returns itk_none when no candidate fits.
   Side effect: TREE_TYPE (value) is overwritten with
   widest_unsigned_literal_type_node.  */
442 static enum integer_type_kind
443 narrowest_unsigned_type (tree value, unsigned int flags) in c-lex.c
444 {
445 enum integer_type_kind itk;
446
447 if ((flags & CPP_N_WIDTH) == CPP_N_SMALL)
448 itk = itk_unsigned_int;
449 else if ((flags & CPP_N_WIDTH) == CPP_N_MEDIUM)
450 itk = itk_unsigned_long;
451 else
452 itk = itk_unsigned_long_long;
453
454 /* int_fits_type_p must think the type of its first argument is
455 wider than its second argument, or it won't do the proper check. */
456 TREE_TYPE (value) = widest_unsigned_literal_type_node;
457
458 for (; itk < itk_none; itk += 2 /* skip unsigned types */)
459 if (int_fits_type_p (value, integer_types[itk]))
460 return itk;
461
462 return itk_none;
463 }
在458、459行,itk每次加2,这是因为integer_types被设计成有符号、无符号类型间隔出现。而在456行,注意value的类型被故意设置成widest_unsigned_literal_type_node(在32位x86系统上为unsigned long long),这是该系统所能支持的最大整型。这将影响int_fits_type_p的结果。
/* Return nonzero if the integer constant C fits in TYPE, zero if it
   does not.  The test proceeds in stages: a sign-based rejection,
   then comparison against whichever of TYPE's minimum and maximum
   values are INTEGER_CST nodes, then recursion on TREE_TYPE (type)
   for an INTEGER_TYPE that has one, and finally force_fit_type on a
   copy of C retyped to TYPE.  C itself is not modified.  */
4243 int
4244 int_fits_type_p (tree c, tree type) in tree.c
4245 {
4246 tree type_low_bound = TYPE_MIN_VALUE (type);
4247 tree type_high_bound = TYPE_MAX_VALUE (type);
4248 int ok_for_low_bound, ok_for_high_bound;
4249
4250 /* Perform some generic filtering first, which may allow making a decision
4251 even if the bounds are not constant. First, negative integers never fit
4252 in unsigned types, */
4253 if ((TREE_UNSIGNED (type) && tree_int_cst_sgn (c) < 0)
4254 /* Also, unsigned integers with top bit set never fit signed types. */
4255 || (! TREE_UNSIGNED (type)
4256 && TREE_UNSIGNED (TREE_TYPE (c)) && tree_int_cst_msb (c)))
4257 return 0;
4258
4259 /* If at least one bound of the type is a constant integer, we can check
4260 ourselves and maybe make a decision. If no such decision is possible, but
4261 this type is a subtype, try checking against that. Otherwise, use
4262 force_fit_type, which checks against the precision.
4263
4264 Compute the status for each possibly constant bound, and return if we see
4265 one does not match. Use ok_for_xxx_bound for this purpose, assigning -1
4266 for "unknown if constant fits", 0 for "constant known *not* to fit" and 1
4267 for "constant known to fit". */
4268
4269 ok_for_low_bound = -1;
4270 ok_for_high_bound = -1;
4271
4272 /* Check if C >= type_low_bound. */
4273 if (type_low_bound && TREE_CODE (type_low_bound) == INTEGER_CST)
4274 {
4275 ok_for_low_bound = ! tree_int_cst_lt (c, type_low_bound);
4276 if (! ok_for_low_bound)
4277 return 0;
4278 }
4279
4280 /* Check if c <= type_high_bound. */
4281 if (type_high_bound && TREE_CODE (type_high_bound) == INTEGER_CST)
4282 {
4283 ok_for_high_bound = ! tree_int_cst_lt (type_high_bound, c);
4284 if (! ok_for_high_bound)
4285 return 0;
4286 }
4287
4288 /* If the constant fits both bounds, the result is known. */
4289 if (ok_for_low_bound == 1 && ok_for_high_bound == 1)
4290 return 1;
4291
4292 /* If we haven't been able to decide at this point, there nothing more we
4293 can check ourselves here. Look at the base type if we have one. */
4294 else if (TREE_CODE (type) == INTEGER_TYPE && TREE_TYPE (type) != 0)
4295 return int_fits_type_p (c, TREE_TYPE (type));
4296
4297 /* Or to force_fit_type, if nothing else. */
4298 else
4299 {
/* Work on a copy so that the caller's node keeps its own type.  */
4300 c = copy_node (c);
4301 TREE_TYPE (c) = type;
4302 return !force_fit_type (c, 0);
4303 }
4304 }
我们已经在前面看到,类型节点的TYPE_MIN_VALUE和TYPE_MAX_VALUE分别表示该类型所能表示的最小值和最大值。如果超出此范围,该类型是不合适的。另外,对于负数,无符号类型不是合适类型;对于使用了最高位的无符号数,有符号类型也是不合适的。
注意只有类型的TYPE_MIN_VALUE和TYPE_MAX_VALUE没有被设定的情况下(比如,typedef声明的int类型),我们才会进入4292行以下的代码。
如果narrowest_unsigned_type找不出合适的类型,那么只好使用系统所能支持的最大类型(544行的注释指出,这时编译器应该已经给出溢出警告)。而这个被找出的type,在interpret_integer的557行,最终被设置为所解析数值的类型。而代表该整数的节点将返回给cp_token的value域。