The preprocessor knows very little about numbers beyond what a digit is. That is fine for a preprocessor, and it keeps the preprocessor flexible. But when the lexer receives the digit sequence returned by the preprocessor, it has to work out how to interpret it. Routine cpp_classify_number sets flags according to the literal spelling of the number.
/* Classify the spelling of the number held in TOKEN.

   Returns a set of CPP_N_* flags: whatever interpret_int_suffix or
   interpret_float_suffix returned for the suffix, plus CPP_N_INTEGER or
   CPP_N_FLOATING, plus the radix (CPP_N_DECIMAL, CPP_N_HEX or
   CPP_N_OCTAL).  Returns CPP_N_INVALID when the spelling is malformed
   or the suffix is rejected.  Diagnostics are issued through PFILE.

   NOTE(review): SYNTAX_ERROR and SYNTAX_ERROR2 are defined outside this
   listing; presumably they report the message through pfile and jump
   to the syntax_error label at the end of the function.  */
143 unsigned int
144 cpp_classify_number (cpp_reader *pfile, const cpp_token *token) in cppexp.c
145 {
146 const uchar *str = token->val.str.text;
147 const uchar *limit;
148 unsigned int max_digit, result, radix;
149 enum {NOT_FLOAT = 0, AFTER_POINT, AFTER_EXPON} float_flag;
150
151 /* If the lexer has done its job, length one can only be a single
152 digit. Fast-path this very common case. */
153 if (token->val.str.len == 1)
154 return CPP_N_INTEGER | CPP_N_SMALL | CPP_N_DECIMAL;
155
156 limit = str + token->val.str.len;
157 float_flag = NOT_FLOAT;
158 max_digit = 0;
159 radix = 10;
160
161 /* First, interpret the radix. */
162 if (*str == '0')
163 {
164 radix = 8; /* Provisional: revised below for "0x" and for floats. */
165 str++;
166
167 /* Require at least one hex digit to classify it as hex. */
168 if ((*str == 'x' || *str == 'X')
169 && (str[1] == '.' || ISXDIGIT (str[1])))
170 {
171 radix = 16;
172 str++;
173 }
174 }
175
176 /* Now scan for a well-formed integer or float. */
177 for (;;)
178 {
179 unsigned int c = *str++; /* STR now points one past C. */
180
181 if (ISDIGIT (c) || (ISXDIGIT (c) && radix == 16))
182 {
183 c = hex_value (c);
184 if (c > max_digit)
185 max_digit = c; /* Remember the largest digit for the octal check. */
186 }
187 else if (c == '.')
188 {
189 if (float_flag == NOT_FLOAT)
190 float_flag = AFTER_POINT;
191 else
192 SYNTAX_ERROR ("too many decimal points in number");
193 }
194 else if ((radix <= 10 && (c == 'e' || c == 'E'))
195 || (radix == 16 && (c == 'p' || c == 'P')))
196 {
197 float_flag = AFTER_EXPON; /* STR is left just after the exponent letter. */
198 break;
199 }
200 else
201 {
202 /* Start of suffix. */
203 str--;
204 break;
205 }
206 }
207
208 if (float_flag != NOT_FLOAT && radix == 8)
209 radix = 10; /* A leading 0 on a float (e.g. 0.5) does not make it octal. */
210
211 if (max_digit >= radix) /* Only possible for radix 8: digits above 9 are scanned only when radix is 16. */
212 SYNTAX_ERROR2 ("invalid digit \"%c\" in octal constant", '0' + max_digit);
213
214 if (float_flag != NOT_FLOAT)
215 {
216 if (radix == 16 && CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, c99))
217 cpp_error (pfile, CPP_DL_PEDWARN,
218 "use of C99 hexadecimal floating constant");
219
220 if (float_flag == AFTER_EXPON)
221 {
222 if (*str == '+' || *str == '-')
223 str++;
224
225 /* Exponent is decimal, even if string is a hex float. */
226 if (!ISDIGIT (*str))
227 SYNTAX_ERROR ("exponent has no digits");
228
229 do
230 str++;
231 while (ISDIGIT (*str));
232 }
233 else if (radix == 16)
234 SYNTAX_ERROR ("hexadecimal floating constants require an exponent");
235
236 result = interpret_float_suffix (str, limit - str); /* A zero result is treated as a rejected suffix. */
237 if (result == 0)
238 {
239 cpp_error (pfile, CPP_DL_ERROR,
240 "invalid suffix \"%.*s\" on floating constant",
241 (int) (limit - str), str);
242 return CPP_N_INVALID;
243 }
244
245 /* Traditional C didn't accept any floating suffixes. */
246 if (limit != str
247 && CPP_WTRADITIONAL (pfile)
248 && ! cpp_sys_macro_p (pfile))
249 cpp_error (pfile, CPP_DL_WARNING,
250 "traditional C rejects the \"%.*s\" suffix",
251 (int) (limit - str), str);
252
253 result |= CPP_N_FLOATING;
254 }
255 else
256 {
257 result = interpret_int_suffix (str, limit - str); /* A zero result is treated as a rejected suffix. */
258 if (result == 0)
259 {
260 cpp_error (pfile, CPP_DL_ERROR,
261 "invalid suffix \"%.*s\" on integer constant",
262 (int) (limit - str), str);
263 return CPP_N_INVALID;
264 }
265
266 /* Traditional C only accepted the 'L' suffix.
267 Suppress warning about 'LL' with -Wno-long-long. */
268 if (CPP_WTRADITIONAL (pfile) && ! cpp_sys_macro_p (pfile))
269 {
270 int u_or_i = (result & (CPP_N_UNSIGNED|CPP_N_IMAGINARY));
271 int large = (result & CPP_N_WIDTH) == CPP_N_LARGE;
272
273 if (u_or_i || (large && CPP_OPTION (pfile, warn_long_long)))
274 cpp_error (pfile, CPP_DL_WARNING,
275 "traditional C rejects the \"%.*s\" suffix",
276 (int) (limit - str), str);
277 }
278
279 if ((result & CPP_N_WIDTH) == CPP_N_LARGE
280 && ! CPP_OPTION (pfile, c99)
281 && CPP_OPTION (pfile, warn_long_long))
282 cpp_error (pfile, CPP_DL_PEDWARN,
283 "use of C99 long long integer constant");
284
285 result |= CPP_N_INTEGER;
286 }
287
288 if ((result & CPP_N_IMAGINARY) && CPP_PEDANTIC (pfile))
289 cpp_error (pfile, CPP_DL_PEDWARN,
290 "imaginary constants are a GCC extension");
291
292 /* Finally record the radix that was settled on above. */
292 if (radix == 10)
293 result |= CPP_N_DECIMAL;
294 else if (radix == 16)
295 result |= CPP_N_HEX;
296 else
297 result |= CPP_N_OCTAL;
298
299 return result;
300
301 syntax_error:
302 return CPP_N_INVALID;
303 }
In C++ there are several representations for a numeric constant; for instance 1234, 0x4D2, 02322, 1.234e3 and 0x4.d2p8 all stand for the decimal value 1234. Of course, the last two are regarded as floating-point numbers: one has a decimal exponent (e, a power of 10), the other a binary exponent (p, a power of 2). Further, these constants can carry a suffix that refines their type. A floating-point constant can use f/F (float), l/L (long double), and i/I or j/J (imaginary constant, a GCC extension). An integer constant can use l/L (long), ll/LL (long long), u/U (unsigned), and i/I or j/J (imaginary constant, a GCC extension). Here, cpp_classify_number validates the suffix and sets CPP_N_SMALL, CPP_N_MEDIUM, or CPP_N_LARGE accordingly.
An integer constant is handled by the following function, which builds an INTEGER_CST node.
490 static tree
491 interpret_integer (const cpp_token *token, unsigned int flags) in c-lex.c
492 {
493 tree value, type;
494 enum integer_type_kind itk;
495 cpp_num integer;
496 cpp_options *options = cpp_get_options (parse_in);
497
498 integer = cpp_interpret_integer (parse_in, token, flags);
499 integer = cpp_num_sign_extend (integer, options->precision);
500 value = build_int_2_wide (integer.low, integer.high);
If it’s an integer, the number must first be validated for the target machine. Structure cpp_num is used to collect the result from cpp_interpret_integer.
/* A number held in two cpp_num_part halves, together with the
   signedness and overflow state that the arithmetic helpers in
   cppexp.c read and update.  */
604 struct cpp_num in cpplib.h
605 {
606 cpp_num_part high; /* More significant half: append_digit carries the top bits of LOW into it. */
607 cpp_num_part low; /* Less significant half. */
608 bool unsignedp; /* True if value should be treated as unsigned. */
609 bool overflow; /* True if the most recent calculation overflowed. */
610 };
In this definition, slots high and low have the widest integer type of the host machine, in its unsigned form. For Linux/x86, HOST_WIDE_INT is long, so this type is unsigned long.
/* The type of each half (high, low) of a cpp_num: an unsigned HOST_WIDE_INT. */
602 typedef unsigned HOST_WIDE_INT cpp_num_part; in cpplib.h
Argument type of cpp_interpret_integer is the set of flags that cpp_classify_number worked so hard to compute.
311 cpp_num
312 cpp_interpret_integer (cpp_reader *pfile, const cpp_token *token, in cppexp.c
313 unsigned int type)
314 {
315 const uchar *p, *end;
316 cpp_num result;
317
318 result.low = 0;
319 result.high = 0;
320 result.unsignedp = !!(type & CPP_N_UNSIGNED);
321 result.overflow = false;
322
323 p = token->val.str.text;
324 end = p + token->val.str.len;
325
326 /* Common case of a single digit. */
327 if (token->val.str.len == 1)
328 result.low = p[0] - '0';
329 else
330 {
331 cpp_num_part max;
332 size_t precision = CPP_OPTION (pfile, precision);
333 unsigned int base = 10, c = 0;
334 bool overflow = false;
335
336 if ((type & CPP_N_RADIX) == CPP_N_OCTAL)
337 {
338 base = 8;
339 p++;
340 }
341 else if ((type & CPP_N_RADIX) == CPP_N_HEX)
342 {
343 base = 16;
344 p += 2;
345 }
346
347 /* We can add a digit to numbers strictly less than this without
348 needing the precision and slowness of double integers. */
349 max = ~(cpp_num_part) 0;
350 if (precision < PART_PRECISION)
351 max >>= PART_PRECISION - precision;
352 max = (max - base + 1) / base + 1;
353
354 for (; p < end; p++)
355 {
356 c = *p;
357
358 if (ISDIGIT (c) || (base == 16 && ISXDIGIT (c)))
359 c = hex_value (c);
360 else
361 break;
362
363 /* Strict inequality for when max is set to zero. */
364 if (result.low < max)
365 result.low = result.low * base + c;
366 else
367 {
368 result = append_digit (result, c, base, precision);
369 overflow |= result.overflow;
370 max = 0;
371 }
372 }
PART_PRECISION above is the size in bits of the type of high and low (that is, sizeof (long) * CHAR_BIT here). The precision held in pfile is initialized in cpp_create_reader to the bit size of long. The code at lines 349 to 352 computes a threshold, max, from the largest value that precision allows: as long as the accumulated value is strictly below max, multiplying it by base and adding one more digit cannot overflow.
So while the result is still below max, it is safe to shift in the current digit directly. Otherwise, the digit must be handled carefully by append_digit.
/* Return NUM * BASE + DIGIT as a two-part cpp_num.

   The multiplication is done by shifting: by 4 bits when BASE is 16,
   by 3 bits otherwise.  For BASE 10 the missing 2 * NUM is added on
   top of the 8 * NUM produced by the shift.  The shift count is only
   right for bases 8, 10 and 16.

   The returned .overflow is true if bits were lost from the two-part
   value or if the result does not survive trimming to PRECISION bits;
   .unsignedp is copied from NUM.  */
397 static cpp_num
398 append_digit (cpp_num num, int digit, int base, size_t precision) in cppexp.c
399 {
400 cpp_num result;
401 unsigned int shift = 3 + (base == 16); /* 4 for hex, 3 for octal and decimal. */
402 bool overflow;
403 cpp_num_part add_high, add_low;
404
405 /* Multiply by 8 or 16. Catching this overflow here means we don't
406 need to worry about add_high overflowing. */
407 overflow = !!(num.high >> (PART_PRECISION - shift)); /* Nonzero if the shift below would drop set bits of HIGH. */
408 result.high = num.high << shift;
409 result.low = num.low << shift;
410 result.high |= num.low >> (PART_PRECISION - shift); /* Carry the top SHIFT bits of LOW into HIGH. */
411
412 if (base == 10)
413 {
414 add_low = num.low << 1; /* add_high:add_low = 2 * NUM, so 8 * NUM + 2 * NUM = 10 * NUM. */
415 add_high = (num.high << 1) + (num.low >> (PART_PRECISION - 1));
416 }
417 else
418 add_high = add_low = 0;
419
420 if (add_low + digit < add_low) /* Unsigned wraparound: adding DIGIT carries out of the low part. */
421 add_high++;
422 add_low += digit;
423
424 if (result.low + add_low < result.low) /* Carry out of the low-part sum. */
425 add_high++;
426 if (result.high + add_high < result.high) /* The high-part sum wraps: the two-part value overflowed. */
427 overflow = true;
428
429 result.low += add_low;
430 result.high += add_high;
431
432 /* The above code catches overflow of a cpp_num type. This catches
433 overflow of the (possibly shorter) target precision. */
434 num.low = result.low; /* NUM is reused as the untrimmed copy to compare against. */
435 num.high = result.high;
436 result = num_trim (result, precision);
437 if (!num_eq (result, num))
438 overflow = true;
439
440 result.unsignedp = num.unsignedp;
441 result.overflow = overflow;
442 return result;
443 }
As it uses slots high and low to hold the interpreted number, it first checks whether the extra digit will make the result overflow. Notice that shift is 4 only for base 16; otherwise it is 3. For base 10, however, shifting the number by 3 bits only multiplies it by 8 instead of 10, so lines 414 to 415 compute the remaining 2 times the number to be added in. These additions may carry or overflow, which is checked at lines 420 to 427.
Obviously, the resulting number may exceed the precision specified by pfile (parameter precision). Routine num_trim trims the result according to precision.
/* Return NUM with every bit at or above bit position PRECISION cleared.

   If PRECISION is greater than PART_PRECISION, only the high part is
   masked (using PRECISION - PART_PRECISION bits) and the low part is
   kept whole.  Otherwise the low part is masked and the high part is
   zeroed.  A part is left unmasked when the bit count that applies to
   it is not below PART_PRECISION, so the mask shift is always by less
   than the width of cpp_num_part.  */
1004 static cpp_num
1005 num_trim (cpp_num num, size_t precision) in cppexp.c
1006 {
1007 if (precision > PART_PRECISION)
1008 {
1009 precision -= PART_PRECISION; /* Now the number of significant bits in HIGH. */
1010 if (precision < PART_PRECISION)
1011 num.high &= ((cpp_num_part) 1 << precision) - 1;
1012 }
1013 else
1014 {
1015 if (precision < PART_PRECISION)
1016 num.low &= ((cpp_num_part) 1 << precision) - 1;
1017 num.high = 0; /* The value lives entirely in LOW. */
1018 }
1019
1020 return num;
1021 }
Anyway, if the trimmed result is not the same as the result before trimming, overflow has of course occurred. On return from append_digit, since that was the last digit that could be added safely for the specified precision, max is set to 0 at line 370 in cpp_interpret_integer, which forces the condition at line 364 to be false for all following digits.
cpp_interpret_integer (continue)
374 if (overflow)
375 cpp_error (pfile, CPP_DL_PEDWARN,
376 "integer constant is too large for its type");
377 /* If too big to be signed, consider it unsigned. Only warn for
378 decimal numbers. Traditional numbers were always signed (but
379 we still honor an explicit U suffix); but we only have
380 traditional semantics in directives. */
381 else if (!result.unsignedp
382 && !(CPP_OPTION (pfile, traditional)
383 && pfile->state.in_directive)
384 && !num_positive (result, precision))
385 {
386 if (base == 10)
387 cpp_error (pfile, CPP_DL_WARNING,
388 "integer constant is so large that it is unsigned");
389 result.unsignedp = true;
390 }
391 }
392
393 return result;
394 }
The tree node of INTEGER_CST also uses high and low slots of HOST_WIDE_INT, in two's-complement form, to hold the number. So for a signed number, sign extension is required. It is done by cpp_num_sign_extend.
/* Sign-extend NUM from PRECISION bits to the full two-part width.

   Nothing is done if NUM is unsigned (unsignedp set) or if bit
   PRECISION - 1 is clear.  Otherwise every bit above the sign bit is
   set: in the high part only when PRECISION is greater than
   PART_PRECISION, else in the low part with the high part set to all
   ones.  Returns the possibly modified copy of NUM.  */
1038 cpp_num
1039 cpp_num_sign_extend (cpp_num num, size_t precision) in cppexp.c
1040 {
1041 if (!num.unsignedp)
1042 {
1043 if (precision > PART_PRECISION)
1044 {
1045 precision -= PART_PRECISION; /* Now the number of significant bits in HIGH. */
1046 if (precision < PART_PRECISION
1047 && (num.high & (cpp_num_part) 1 << (precision - 1))) /* << binds tighter than &: this tests the sign bit. */
1048 num.high |= ~(~(cpp_num_part) 0 >> (PART_PRECISION - precision)); /* Set all bits of HIGH above the sign bit. */
1049 }
1050 else if (num.low & (cpp_num_part) 1 << (precision - 1))
1051 {
1052 if (precision < PART_PRECISION)
1053 num.low |= ~(~(cpp_num_part) 0 >> (PART_PRECISION - precision)); /* Set all bits of LOW above the sign bit. */
1054 num.high = ~(cpp_num_part) 0; /* Negative value: HIGH becomes all ones. */
1055 }
1056 }
1057
1058 return num;
1059 }
The validated number is then stored in the node value created by build_int_2_wide at line 500 in interpret_integer.
Next, a type must be associated with the result. As we have seen, type nodes for the integer types are created when the compiler starts up. The best-fitting type (the one containing as few bits as possible) is selected.
interpret_integer (continue)
502 /* The type of a constant with a U suffix is straightforward. */
503 if (flags & CPP_N_UNSIGNED)
504 itk = narrowest_unsigned_type (value, flags);
505 else
506 {
507 /* The type of a potentially-signed integer constant varies
508 depending on the base it's in, the standard in use, and the
509 length suffixes. */
510 enum integer_type_kind itk_u = narrowest_unsigned_type (value, flags);
511 enum integer_type_kind itk_s = narrowest_signed_type (value, flags);
512
513 /* In both C89 and C99, octal and hex constants may be signed or
514 unsigned, whichever fits tighter. We do not warn about this
515 choice differing from the traditional choice, as the constant
516 is probably a bit pattern and either way will work. */
517 if ((flags & CPP_N_RADIX) != CPP_N_DECIMAL)
518 itk = MIN (itk_u, itk_s);
519 else
520 {
521 /* In C99, decimal constants are always signed.
522 In C89, decimal constants that don't fit in long have
523 undefined behavior; we try to make them unsigned long.
524 In GCC's extended C89, that last is true of decimal
525 constants that don't fit in long long, too. */
526
527 itk = itk_s;
528 if (itk_s > itk_u && itk_s > itk_long)
529 {
530 if (!flag_isoc99)
531 {
532 if (itk_u < itk_unsigned_long)
533 itk_u = itk_unsigned_long;
534 itk = itk_u;
535 warning ("this decimal constant is unsigned only in ISO C90");
536 }
537 else if (warn_traditional)
538 warning ("this decimal constant would be unsigned in ISO C90");
539 }
540 }
541 }
542
543 if (itk == itk_none)
544 /* cpplib has already issued a warning for overflow. */
545 type = ((flags & CPP_N_UNSIGNED)
546 ? widest_unsigned_literal_type_node
547 : widest_integer_literal_type_node);
548 else
549 type = integer_types[itk];
550
551 if (itk > itk_unsigned_long
552 && (flags & CPP_N_WIDTH) != CPP_N_LARGE
553 && ! in_system_header && ! flag_isoc99)
554 pedwarn ("integer constant is too large for /"%s/" type",
555 (flags & CPP_N_UNSIGNED) ? "unsigned long" : "long");
556
557 TREE_TYPE (value) = type;
558
559 /* Convert imaginary to a complex type. */
560 if (flags & CPP_N_IMAGINARY)
561 value = build_complex (NULL_TREE, convert (type, integer_zero_node), value);
562
563 return value;
564 }
Routines narrowest_unsigned_type and narrowest_signed_type are similar. An integer constant is regarded as CPP_N_SMALL when it has no length suffix, as CPP_N_MEDIUM with suffix ‘l/L’, and otherwise as CPP_N_LARGE (refer to cpp_classify_number).
/* Return the first integer_type_kind whose type VALUE fits, as judged
   by int_fits_type_p, or itk_none if there is none.

   The search starts at the unsigned kind selected by the width bits of
   FLAGS: unsigned int for CPP_N_SMALL, unsigned long for CPP_N_MEDIUM,
   unsigned long long otherwise.

   Side effect: TREE_TYPE (value) is overwritten with
   widest_unsigned_literal_type_node.  */
442 static enum integer_type_kind
443 narrowest_unsigned_type (tree value, unsigned int flags) in c-lex.c
444 {
445 enum integer_type_kind itk;
446
447 if ((flags & CPP_N_WIDTH) == CPP_N_SMALL)
448 itk = itk_unsigned_int;
449 else if ((flags & CPP_N_WIDTH) == CPP_N_MEDIUM)
450 itk = itk_unsigned_long;
451 else
452 itk = itk_unsigned_long_long;
453
454 /* int_fits_type_p must think the type of its first argument is
455 wider than its second argument, or it won't do the proper check. */
456 TREE_TYPE (value) = widest_unsigned_literal_type_node;
457
458 for (; itk < itk_none; itk += 2 /* step over the interleaved signed kinds; assumes integer_type_kind alternates signed and unsigned -- TODO confirm */)
459 if (int_fits_type_p (value, integer_types[itk]))
460 return itk;
461
462 return itk_none;
463 }
At lines 458 to 459, the step of 2 relies on integer_types being laid out so that unsigned and signed types alternate. Then at line 456, see that the type of value is deliberately set to widest_unsigned_literal_type_node (unsigned long long on a 32-bit x86 system), which is the largest unsigned integer type the system can support. This affects the result of int_fits_type_p.
/* Return nonzero if the integer constant C fits in TYPE, 0 if it
   does not.

   The checks run in this order: sign-based rejection, comparison with
   whichever of TYPE's bounds are INTEGER_CST nodes, then, if still
   undecided, recursion on TREE_TYPE (type) for an INTEGER_TYPE that
   has one, and finally force_fit_type on a copy of C.  */
4243 int
4244 int_fits_type_p (tree c, tree type) in tree.c
4245 {
4246 tree type_low_bound = TYPE_MIN_VALUE (type);
4247 tree type_high_bound = TYPE_MAX_VALUE (type);
4248 int ok_for_low_bound, ok_for_high_bound;
4249
4250 /* Perform some generic filtering first, which may allow making a decision
4251 even if the bounds are not constant. First, negative integers never fit
4252 in unsigned types, */
4253 if ((TREE_UNSIGNED (type) && tree_int_cst_sgn (c) < 0)
4254 /* Also, unsigned integers with top bit set never fit signed types. */
4255 || (! TREE_UNSIGNED (type)
4256 && TREE_UNSIGNED (TREE_TYPE (c)) && tree_int_cst_msb (c)))
4257 return 0;
4258
4259 /* If at least one bound of the type is a constant integer, we can check
4260 ourselves and maybe make a decision. If no such decision is possible, but
4261 this type is a subtype, try checking against that. Otherwise, use
4262 force_fit_type, which checks against the precision.
4263
4264 Compute the status for each possibly constant bound, and return if we see
4265 one does not match. Use ok_for_xxx_bound for this purpose, assigning -1
4266 for "unknown if constant fits", 0 for "constant known *not* to fit" and 1
4267 for "constant known to fit". */
4268
4269 ok_for_low_bound = -1;
4270 ok_for_high_bound = -1;
4271
4272 /* Check if C >= type_low_bound. */
4273 if (type_low_bound && TREE_CODE (type_low_bound) == INTEGER_CST)
4274 {
4275 ok_for_low_bound = ! tree_int_cst_lt (c, type_low_bound);
4276 if (! ok_for_low_bound)
4277 return 0;
4278 }
4279
4280 /* Check if c <= type_high_bound. */
4281 if (type_high_bound && TREE_CODE (type_high_bound) == INTEGER_CST)
4282 {
4283 ok_for_high_bound = ! tree_int_cst_lt (type_high_bound, c);
4284 if (! ok_for_high_bound)
4285 return 0;
4286 }
4287
4288 /* If the constant fits both bounds, the result is known. */
4289 if (ok_for_low_bound == 1 && ok_for_high_bound == 1)
4290 return 1;
4291
4292 /* If we haven't been able to decide at this point, there nothing more we
4293 can check ourselves here. Look at the base type if we have one. */
4294 else if (TREE_CODE (type) == INTEGER_TYPE && TREE_TYPE (type) != 0)
4295 return int_fits_type_p (c, TREE_TYPE (type));
4296
4297 /* Or to force_fit_type, if nothing else. */
4298 else
4299 {
4300 c = copy_node (c); /* Work on a copy so the caller's node keeps its type. */
4301 TREE_TYPE (c) = type;
4302 return !force_fit_type (c, 0); /* NOTE(review): presumably force_fit_type returns nonzero on overflow -- confirm. */
4303 }
4304 }
As we have seen before, TYPE_MIN_VALUE and TYPE_MAX_VALUE of a type node represent the minimum and maximum values the type can hold. If the constant is out of this range, the type isn’t suitable. Besides, an unsigned type cannot hold a negative number, and a signed type cannot hold an unsigned number whose most significant bit is set.
Note that only when the type doesn’t have both TYPE_MIN_VALUE and TYPE_MAX_VALUE set to integer constants (e.g., a typedef declaration of int) do we reach the code below line 4292.
If narrowest_unsigned_type can’t find a suitable type, interpret_integer can only use the largest type the system supports (the comment at line 544 points out that cpplib has already given a warning about the overflow). The type found is set as the type of the parsed number at line 557 in interpret_integer, and the node for the number is returned to the value field of cp_token.