At line 205 above, cpp_create_reader creates a cpp_reader object. For languages that require a preprocessor, such as C and C++, a compiler usually provides a separate pass to preprocess the source. In the current version of GCC, however, cpp_reader implements on-the-fly preprocessing (the source is preprocessed as soon as it is read in), so a separate pass is no longer needed.
For this reason, the definition of cpp_reader is quite complex. In later chapters about the lexer and the parser, we will come back to look at its functionality.
602 struct cpp_reader in cpphash.h
603 {
604 /* Top of buffer stack. */
605 cpp_buffer *buffer;
606
607 /* Overlaid buffer (can be different after processing #include). */
608 cpp_buffer *overlaid_buffer;
609
610 /* Lexer state. */
611 struct lexer_state state;
612
613 /* Source line tracking. */
614 struct line_maps line_maps;
615 const struct line_map *map;
616 fileline line;
617
618 /* The line of the '#' of the current directive. */
619 fileline directive_line;
620
621 /* Memory buffers. */
622 _cpp_buff *a_buff; /* Aligned permanent storage. */
623 _cpp_buff *u_buff; /* Unaligned permanent storage. */
624 _cpp_buff *free_buffs; /* Free buffer chain. */
625
626 /* Context stack. */
627 struct cpp_context base_context;
628 struct cpp_context *context;
629
630 /* If in_directive, the directive if known. */
631 const struct directive *directive;
632
633 /* Search paths for include files. */
634 struct cpp_dir *quote_include; /* "" */
635 struct cpp_dir *bracket_include; /* <> */
636 struct cpp_dir no_search_path; /* No path. */
637
638 /* Chain of all hashed _cpp_file instances. */
639 struct _cpp_file *all_files;
640
641 struct _cpp_file *main_file;
642
643 /* File and directory hash table. */
644 struct htab *file_hash;
645 struct htab *dir_hash;
646 struct file_hash_entry *file_hash_entries;
647 unsigned int file_hash_entries_allocated, file_hash_entries_used;
648
649 /* Nonzero means don't look for #include "foo" the source-file
650 directory. */
651 bool quote_ignores_source_dir;
652
653 /* Nonzero if any file has contained #pragma once or #import has
654 been used. */
655 bool seen_once_only;
656
657 /* Multiple include optimization. */
658 const cpp_hashnode *mi_cmacro;
659 const cpp_hashnode *mi_ind_cmacro;
660 bool mi_valid;
661
662 /* Lexing. */
663 cpp_token *cur_token;
664 tokenrun base_run, *cur_run;
665 unsigned int lookaheads;
666
667 /* Nonzero prevents the lexer from re-using the token runs. */
668 unsigned int keep_tokens;
669
670 /* Error counter for exit code. */
671 unsigned int errors;
672
673 /* Buffer to hold macro definition string. */
674 unsigned char *macro_buffer;
675 unsigned int macro_buffer_len;
676
677 /* Descriptor for converting from the source character set to the
678 execution character set. */
679 struct cset_converter narrow_cset_desc;
680
681 /* Descriptor for converting from the source character set to the
682 wide execution character set. */
683 struct cset_converter wide_cset_desc;
684
685 /* Date and time text. Calculated together if either is requested. */
686 const uchar *date;
687 const uchar *time;
688
689 /* EOF token, and a token forcing paste avoidance. */
690 cpp_token avoid_paste;
691 cpp_token eof;
692
693 /* Opaque handle to the dependencies of mkdeps.c. */
694 struct deps *deps;
695
696 /* Obstack holding all macro hash nodes. This never shrinks.
697 See cpphash.c */
698 struct obstack hash_ob;
699
700 /* Obstack holding buffer and conditional structures. This is a
701 real stack. See cpplib.c. */
702 struct obstack buffer_ob;
703
704 /* Pragma table - dynamic, because a library user can add to the
705 list of recognized pragmas. */
706 struct pragma_entry *pragmas;
707
708 /* Call backs to cpplib client. */
709 struct cpp_callbacks cb;
710
711 /* Identifier hash table. */
712 struct ht *hash_table;
713
714 /* Expression parser stack. */
715 struct op *op_stack, *op_limit;
716
717 /* User visible options. */
718 struct cpp_options opts;
719
720 /* Special nodes - identifiers with predefined significance to the
721 preprocessor. */
722 struct spec_nodes spec_nodes;
723
724 /* Whether cpplib owns the hashtable. */
725 bool our_hashtable;
726
727 /* Traditional preprocessing output buffer (a logical line). */
728 struct
729 {
730 uchar *base;
731 uchar *limit;
732 uchar *cur;
733 fileline first_line;
734 } out;
735
736 /* Used to save the original line number during traditional
737 preprocessing. */
738 unsigned int saved_line;
739
740 /* A saved list of the defined macros, for dependency checking
741 of precompiled headers. */
742 struct cpp_savedstate *savedstate;
743 };
The parameter lang of cpp_create_reader (shown below) is of type c_lang, which is defined as follows:
154 enum c_lang {CLK_GNUC89 = 0, CLK_GNUC99, CLK_STDC89, CLK_STDC94,
155 CLK_STDC99, CLK_GNUCXX, CLK_CXX98, CLK_ASM};
We see that, at line 205 in c_common_init_options, CLK_GNUCXX and CLK_GNUC89 are the only values applicable for the C and C++ front-ends. At line 132 in cpp_create_reader below, init_library does nothing after v2.7.
126 cpp_reader *
127 cpp_create_reader (enum c_lang lang, hash_table *table) in cppinit.c
128 {
129 cpp_reader *pfile;
130
131 /* Initialize this instance of the library if it hasn't been already. */
132 init_library ();
133
134 pfile = xcalloc (1, sizeof (cpp_reader));
135
136 cpp_set_lang (pfile, lang);
137
At line 136 above, cpp_set_lang sets information about the language into the cpp_reader object. It has the definition as below.
92 void
93 cpp_set_lang (cpp_reader *pfile, enum c_lang lang) in cppinit.c
94 {
95 const struct lang_flags *l = &lang_defaults[(int) lang];
96
97 CPP_OPTION (pfile, lang) = lang;
98
99 CPP_OPTION (pfile, c99) = l->c99;
100 CPP_OPTION (pfile, cplusplus) = l->cplusplus;
101 CPP_OPTION (pfile, extended_numbers) = l->extended_numbers;
102 CPP_OPTION (pfile, std) = l->std;
103 CPP_OPTION (pfile, trigraphs) = l->std;
104 CPP_OPTION (pfile, cplusplus_comments) = l->cplusplus_comments;
105 CPP_OPTION (pfile, digraphs) = l->digraphs;
106 }
lang_defaults, at line 95 above, is a global array that describes the characteristics of the supported languages. In detail: the c99 field, if nonzero, means the language conforms to the 1999 C standard. The cplusplus field, if nonzero, means the language is C++. The std field, if nonzero, means the language conforms to a specific C/C++ standard. The extended_numbers field, if nonzero, means the language allows hexadecimal floats and LL suffixes for numeric constants. The cplusplus_comments field, if nonzero, means the language allows the “//” comment style of C++. The digraphs field, if nonzero, means the language supports the ISO digraph sequences. Note that at line 103 of cpp_set_lang the trigraphs option is also taken from the std field, so trigraphs are enabled by default only for the standard-conforming languages.
69 struct lang_flags in cppinit.c
70 {
71 char c99;
72 char cplusplus;
73 char extended_numbers;
74 char std;
75 char cplusplus_comments;
76 char digraphs;
77 };
78
79 static const struct lang_flags lang_defaults[] = in cppinit.c
80 { /* c99 c++ xnum std // digr */
81 /* GNUC89 */ { 0, 0, 1, 0, 1, 1 },
82 /* GNUC99 */ { 1, 0, 1, 0, 1, 1 },
83 /* STDC89 */ { 0, 0, 0, 1, 0, 0 },
84 /* STDC94 */ { 0, 0, 0, 1, 0, 1 },
85 /* STDC99 */ { 1, 0, 1, 1, 1, 1 },
86 /* GNUCXX */ { 0, 1, 1, 0, 1, 1 },
87 /* CXX98 */ { 0, 1, 1, 1, 1, 1 },
88 /* ASM */ { 0, 0, 1, 0, 1, 0 }
89 };
At line 97 in cpp_set_lang, the macro CPP_OPTION is a field selector defined as follows:
#define CPP_OPTION(PFILE, OPTION) ((PFILE)->opts.OPTION) in cpphash.h
From the definition, we can see that the macro can be used to initialize the opts field of cpp_reader, which is of type cpp_options. It is the collection of flags to control the behavior of lexer and preprocessor.
211 struct cpp_options in cpplib.h
212 {
213 /* Characters between tab stops. */
214 unsigned int tabstop;
215
216 /* The language we're preprocessing. */
217 enum c_lang lang;
218
219 /* Nonzero means use extra default include directories for C++. */
220 unsigned char cplusplus;
221
222 /* Nonzero means handle cplusplus style comments. */
223 unsigned char cplusplus_comments;
224
225 /* Nonzero means define __OBJC__, treat @ as a special token, and
226 use the OBJC[PLUS]_INCLUDE_PATH environment variable. */
227 unsigned char objc;
228
229 /* Nonzero means don't copy comments into the output file. */
230 unsigned char discard_comments;
231
232 /* Nonzero means don't copy comments into the output file during
233 macro expansion. */
234 unsigned char discard_comments_in_macro_exp;
235
236 /* Nonzero means process the ISO trigraph sequences. */
237 unsigned char trigraphs;
238
239 /* Nonzero means process the ISO digraph sequences. */
240 unsigned char digraphs;
241
242 /* Nonzero means to allow hexadecimal floats and LL suffixes. */
243 unsigned char extended_numbers;
244
245 /* Nonzero means print names of header files (-H). */
246 unsigned char print_include_names;
247
248 /* Nonzero means cpp_pedwarn causes a hard error. */
249 unsigned char pedantic_errors;
250
251 /* Nonzero means don't print warning messages. */
252 unsigned char inhibit_warnings;
253
254 /* Nonzero means complain about deprecated features. */
255 unsigned char warn_deprecated;
256
257 /* Nonzero means don't suppress warnings from system headers. */
258 unsigned char warn_system_headers;
259
260 /* Nonzero means don't print error messages. Has no option to
261 select it, but can be set by a user of cpplib (e.g. fix-header). */
262 unsigned char inhibit_errors;
263
264 /* Nonzero means warn if slash-star appears in a comment. */
265 unsigned char warn_comments;
266
267 /* Nonzero means warn if there are any trigraphs. */
268 unsigned char warn_trigraphs;
269
270 /* Nonzero means warn about multicharacter charconsts. */
271 unsigned char warn_multichar;
272
273 /* Nonzero means warn about various incompatibilities with
274 traditional C. */
275 unsigned char warn_traditional;
276
277 /* Nonzero means warn about long long numeric constants. */
278 unsigned char warn_long_long;
279
280 /* Nonzero means warn about text after an #endif (or #else). */
281 unsigned char warn_endif_labels;
282
283 /* Nonzero means warn about implicit sign changes owing to integer
284 promotions. */
285 unsigned char warn_num_sign_change;
286
287 /* Nonzero means turn warnings into errors. */
288 unsigned char warnings_are_errors;
289
290 /* Nonzero means we should look for header.gcc files that remap file
291 names. */
292 unsigned char remap;
293
294 /* Zero means dollar signs are punctuation. */
295 unsigned char dollars_in_ident;
296
297 /* True if we should warn about dollars in identifiers or numbers
298 for this translation unit. */
299 unsigned char warn_dollars;
300
301 /* Nonzero means warn if undefined identifiers are evaluated in an #if. */
302 unsigned char warn_undef;
303
304 /* Nonzero means warn of unused macros from the main file. */
305 unsigned char warn_unused_macros;
306
307 /* Nonzero for the 1999 C Standard, including corrigenda and amendments. */
308 unsigned char c99;
309
310 /* Nonzero if we are conforming to a specific C or C++ standard. */
311 unsigned char std;
312
313 /* Nonzero means give all the error messages the ANSI standard requires. */
314 unsigned char pedantic;
315
316 /* Nonzero means we're looking at already preprocessed code, so don't
317 bother trying to do macro expansion and whatnot. */
318 unsigned char preprocessed;
319
320 /* Print column number in error messages. */
321 unsigned char show_column;
322
323 /* Nonzero means handle C++ alternate operator names. */
324 unsigned char operator_names;
325
326 /* True for traditional preprocessing. */
327 unsigned char traditional;
328
329 /* Holds the name of the target (execution) character set. */
330 const char *narrow_charset;
331
332 /* Holds the name of the target wide character set. */
333 const char *wide_charset;
334
335 /* Holds the name of the input character set. */
336 const char *input_charset;
337
338 /* True to warn about precompiled header files we couldn't use. */
339 bool warn_invalid_pch;
340
341 /* True if dependencies should be restored from a precompiled header. */
342 bool restore_pch_deps;
343
344 /* Dependency generation. */
345 struct
346 {
347 /* Style of header dependencies to generate. */
348 enum {DEPS_NONE = 0, DEPS_USER, DEPS_SYSTEM } style;
349
350 /* Assume missing files are generated files. */
351 bool missing_files;
352
353 /* Generate phony targets for each dependency apart from the first
354 one. */
355 bool phony_targets;
356
357 /* If true, no dependency is generated on the main file. */
358 bool ignore_main_file;
359 } deps;
360
361 /* Target-specific features set by the front end or client. */
362
363 /* Precision for target CPP arithmetic, target characters, target
364 ints and target wide characters, respectively. */
365 size_t precision, char_precision, int_precision, wchar_precision;
366
367 /* True means chars (wide chars) are unsigned. */
368 bool unsigned_char, unsigned_wchar;
369
370 /* True if the most significant byte in a word has the lowest
371 address in memory. */
372 bool bytes_big_endian;
373
374 /* Nonzero means __STDC__ should have the value 0 in system headers. */
375 unsigned char stdc_0_in_system_headers;
376 };
cpp_create_reader continues by initializing some fields of opts; these are very basic settings that can be determined beforehand. The same applies to the initial search path for header files: the name of the fake directory no_search_path is set to the empty string (“”) at line 169 below.
cpp_create_reader (continue)
137 CPP_OPTION (pfile, warn_multichar) = 1;
138 CPP_OPTION (pfile, discard_comments) = 1;
139 CPP_OPTION (pfile, discard_comments_in_macro_exp) = 1;
140 CPP_OPTION (pfile, show_column) = 1;
141 CPP_OPTION (pfile, tabstop) = 8;
142 CPP_OPTION (pfile, operator_names) = 1;
143 CPP_OPTION (pfile, warn_trigraphs) = 2;
144 CPP_OPTION (pfile, warn_endif_labels) = 1;
145 CPP_OPTION (pfile, warn_deprecated) = 1;
146 CPP_OPTION (pfile, warn_long_long) = !CPP_OPTION (pfile, c99);
147 CPP_OPTION (pfile, dollars_in_ident) = 1;
148 CPP_OPTION (pfile, warn_dollars) = 1;
149
150 /* Default CPP arithmetic to something sensible for the host for the
151 benefit of dumb users like fix-header. */
152 CPP_OPTION (pfile, precision) = CHAR_BIT * sizeof (long);
153 CPP_OPTION (pfile, char_precision) = CHAR_BIT;
154 CPP_OPTION (pfile, wchar_precision) = CHAR_BIT * sizeof (int);
155 CPP_OPTION (pfile, int_precision) = CHAR_BIT * sizeof (int);
156 CPP_OPTION (pfile, unsigned_char) = 0;
157 CPP_OPTION (pfile, unsigned_wchar) = 1;
158 CPP_OPTION (pfile, bytes_big_endian) = 1; /* does not matter */
159
160 /* Default to locale/UTF-8. */
161 CPP_OPTION (pfile, narrow_charset) = _cpp_default_encoding ();
162 CPP_OPTION (pfile, wide_charset) = 0;
163 CPP_OPTION (pfile, input_charset) = _cpp_default_encoding ();
164
165 /* A fake empty "directory" used as the starting point for files
166 looked up without a search path. Name cannot be '/' because we
167 don't want to prepend anything at all to filenames using it. All
168 other entries are correct zero-initialized. */
169 pfile->no_search_path.name = (char *) "";
170
171 /* Initialize the line map. Start at logical line 1, so we can use
172 a line number of zero for special states. */
173 linemap_init (&pfile->line_maps);
174 pfile->line = 1;
Macros and included files are expanded by the preprocessor, and the result is then fed to the lexer. When an error is found, the lexer by itself cannot tell the location in the source file, only the location in the code after preprocessing. The same happens in the parser. A mapping between the source file and the preprocessed output is therefore required, and the structure line_maps is provided for this purpose.
58 struct line_maps in line-map.h
59 {
60 struct line_map *maps;
61 unsigned int allocated;
62 unsigned int used;
63
64 /* The most recently listed include stack, if any, starts with
65 LAST_LISTED as the topmost including file. -1 indicates nothing
66 has been listed yet. */
67 int last_listed;
68
69 /* Depth of the include stack, including the current file. */
70 unsigned int depth;
71
72 /* If true, prints an include trace a la -H. */
73 bool trace_includes;
74 };
39 /* The logical line FROM_LINE maps to physical source file TO_FILE at
40 line TO_LINE, and subsequently one-to-one until the next line_map
41 structure in the set. INCLUDED_FROM is an index into the set that
42 gives the line mapping at whose end the current one was included.
43 File(s) at the bottom of the include stack have this set to -1.
44 REASON is the reason for creation of this line map, SYSP is one for
45 a system header, two for a C system header file that therefore
46 needs to be extern "C" protected in C++, and zero otherwise. */
47 struct line_map in line-map.h
48 {
49 const char *to_file;
50 unsigned int to_line;
51 source_location from_line;
52 int included_from;
53 ENUM_BITFIELD (lc_reason) reason : CHAR_BIT;
54 unsigned char sysp;
55 };
26 /* Reason for adding a line change with add_line_map (). LC_ENTER is
27 when including a new file, e.g. a #include directive in C.
28 LC_LEAVE is when reaching a file's end. LC_RENAME is when a file
29 name or line number changes for neither of the above reasons
30 (e.g. a #line directive in C). */
31 enum lc_reason {LC_ENTER = 0, LC_LEAVE, LC_RENAME};
In a later chapter, we will see that setting up the mapping from logical position to physical position is not simple. Every time a file is included, a new mapping must be set up; and when leaving an included file, a remapping is needed as well.
At line 173 in cpp_create_reader, linemap_init is used to initialize the line_maps.
32 void
33 linemap_init (struct line_maps *set)
34 {
35 set->maps = 0;
36 set->allocated = 0;
37 set->used = 0;
38 set->last_listed = -1;
39 set->trace_includes = false;
40 set->depth = 0;
41 }
cpp_create_reader (continue)
176 /* Initialize lexer state. */
177 pfile->state.save_comments = ! CPP_OPTION (pfile, discard_comments);
178
179 /* Set up static tokens. */
180 pfile->avoid_paste.type = CPP_PADDING;
181 pfile->avoid_paste.val.source = NULL;
182 pfile->eof.type = CPP_EOF;
183 pfile->eof.flags = 0;
The state field of cpp_reader, at line 177 above, records the lexer state. The avoid_paste field is a token forcing paste avoidance; it is of type cpp_token, which we will see below. The eof field stands for the end of the file.
cpp_create_reader (continue)
185 /* Create a token buffer for the lexer. */
186 _cpp_init_tokenrun (&pfile->base_run, 250);
187 pfile->cur_run = &pfile->base_run;
188 pfile->cur_token = pfile->base_run.base;
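base_run is the first block in a doubly linked list of token blocks; it is allocated with room for 250 tokens. At first, only this one block is allocated. It is of type tokenrun, which is defined as below. Note that _cpp_init_tokenrun sets only base, limit and next; it does not touch prev, which for base_run keeps the value it received when the cpp_reader was allocated by xcalloc at line 134.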
158 typedef struct tokenrun tokenrun;
159 struct tokenrun in cpphash.h
160 {
161 tokenrun *next, *prev;
162 cpp_token *base, *limit;
163 };
644 void
645 _cpp_init_tokenrun (tokenrun *run, unsigned int count) in cpplex.c
646 {
647 run->base = xnewvec (cpp_token, count);
648 run->limit = run->base + count;
649 run->next = NULL;
650 }
cpp_create_reader (continue)
190 /* Initialize the base context. */
191 pfile->context = &pfile->base_context;
192 pfile->base_context.macro = 0;
193 pfile->base_context.prev = pfile->base_context.next = 0;
The context field of cpp_reader points to the top of the context stack used for macro expansion; it is a doubly linked list too. It is defined as follows:
172 struct cpp_context in cpphash.h
173 {
174 /* Doubly-linked list. */
175 cpp_context *next, *prev;
176
177 union
178 {
179 /* For ISO macro expansion. Contexts other than the base context
180 are contiguous tokens. e.g. macro expansions, expanded
181 argument tokens. */
182 struct
183 {
184 union utoken first;
185 union utoken last;
186 } iso;
187
188 /* For traditional macro expansion. */
189 struct
190 {
191 const uchar *cur;
192 const uchar *rlimit;
193 } trad;
194 } u;
195
196 /* If non-NULL, a buffer used for storage related to this context.
197 When the context is popped, the buffer is released. */
198 _cpp_buff *buff;
199
200 /* For a macro context, the macro node, otherwise NULL. */
201 cpp_hashnode *macro;
202
203 /* True if utoken element is token, else ptoken. */
204 bool direct_p;
205 };
In lines 177~194, the union is used to handle both traditional and ISO macro expansion. Traditional expansion is used if we ask gcc to preprocess with the -traditional-cpp switch (or -traditional together with -E), to cope with the pre-standard language. The major difference between traditional and ISO macros is that the former expand to text rather than to a token sequence. So we see that in lines 182~186, the struct used for ISO macros contains the union type utoken, which is defined as follows.
151 union utoken in cpphash.h
152 {
153 const cpp_token *token;
154 const cpp_token **ptoken;
155 };
cpp_create_reader (continue)
196 /* Aligned and unaligned storage. */
197 pfile->a_buff = _cpp_get_buff (pfile, 0);
198 pfile->u_buff = _cpp_get_buff (pfile, 0);
a_buff and u_buff are two other buffers used by cpp_reader (aligned and unaligned permanent storage respectively): u_buff is used for handling stringification (the preprocessing operator ‘#’), and a_buff is used for other purposes. To facilitate memory management, cpp_reader uses free_buffs to hold blocks released from a_buff and u_buff. The buffer has the following definition and operations.
130 struct _cpp_buff in cpphash.h
131 {
132 struct _cpp_buff *next;
133 unsigned char *base, *cur, *limit;
134 };
1423 _cpp_buff *
1424 _cpp_get_buff (cpp_reader *pfile, size_t min_size) in cpplex.c
1425 {
1426 _cpp_buff *result, **p;
1427
1428 for (p = &pfile->free_buffs;; p = &(*p)->next)
1429 {
1430 size_t size;
1431
1432 if (*p == NULL)
1433 return new_buff (min_size);
1434 result = *p;
1435 size = result->limit - result->base;
1436 /* Return a buffer that's big enough, but don't waste one that's
1437 way too big. */
1438 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
1439 break;
1440 }
1441
1442 *p = result->next;
1443 result->next = NULL;
1444 result->cur = result->base;
1445 return result;
1446 }
1391 static _cpp_buff *
1392 new_buff (size_t len) in cpplex.c
1393 {
1394 _cpp_buff *result;
1395 unsigned char *base;
1396
1397 if (len < MIN_BUFF_SIZE)
1398 len = MIN_BUFF_SIZE;
1399 len = CPP_ALIGN (len);
1400
1401 base = xmalloc (len + sizeof (_cpp_buff));
1402 result = (_cpp_buff *) (base + len);
1403 result->base = base;
1404 result->cur = base;
1405 result->limit = base + len;
1406 result->next = NULL;
1407 return result;
1408 }