在上面的205行,cpp_create_reader尝试创建一个cpp_reader对象。对于像C、C++这样需要预处理机制的语言,编译器一般需要提供一个预处理遍,对源程序进行预处理操作。而当前版本的GCC借助cpp_reader对象,实现了在线的预处理展开,不再需要单独的预处理遍。
出于这个目的,cpp_reader的定义也变得相当复杂。在以后有关词法分析和解析器的章节中,我们再来详细研究它的作用。
602 struct cpp_reader in cpphash.h
603 {
604 /* Top of buffer stack. */
605 cpp_buffer *buffer;
606
607 /* Overlaid buffer (can be different after processing #include). */
608 cpp_buffer *overlaid_buffer;
609
610 /* Lexer state. */
611 struct lexer_state state;
612
613 /* Source line tracking. */
614 struct line_maps line_maps;
615 const struct line_map *map;
616 fileline line;
617
618 /* The line of the '#' of the current directive. */
619 fileline directive_line;
620
621 /* Memory buffers. */
622 _cpp_buff *a_buff; /* Aligned permanent storage. */
623 _cpp_buff *u_buff; /* Unaligned permanent storage. */
624 _cpp_buff *free_buffs; /* Free buffer chain. */
625
626 /* Context stack. */
627 struct cpp_context base_context;
628 struct cpp_context *context;
629
630 /* If in_directive, the directive if known. */
631 const struct directive *directive;
632
633 /* Search paths for include files. */
634 struct cpp_dir *quote_include; /* "" */
635 struct cpp_dir *bracket_include; /* <> */
636 struct cpp_dir no_search_path; /* No path. */
637
638 /* Chain of all hashed _cpp_file instances. */
639 struct _cpp_file *all_files;
640
641 struct _cpp_file *main_file;
642
643 /* File and directory hash table. */
644 struct htab *file_hash;
645 struct htab *dir_hash;
646 struct file_hash_entry *file_hash_entries;
647 unsigned int file_hash_entries_allocated, file_hash_entries_used;
648
649 /* Nonzero means don't look for #include "foo" the source-file
650 directory. */
651 bool quote_ignores_source_dir;
652
653 /* Nonzero if any file has contained #pragma once or #import has
654 been used. */
655 bool seen_once_only;
656
657 /* Multiple include optimization. */
658 const cpp_hashnode *mi_cmacro;
659 const cpp_hashnode *mi_ind_cmacro;
660 bool mi_valid;
661
662 /* Lexing. */
663 cpp_token *cur_token;
664 tokenrun base_run, *cur_run;
665 unsigned int lookaheads;
666
667 /* Nonzero prevents the lexer from re-using the token runs. */
668 unsigned int keep_tokens;
669
670 /* Error counter for exit code. */
671 unsigned int errors;
672
673 /* Buffer to hold macro definition string. */
674 unsigned char *macro_buffer;
675 unsigned int macro_buffer_len;
676
677 /* Descriptor for converting from the source character set to the
678 execution character set. */
679 struct cset_converter narrow_cset_desc;
680
681 /* Descriptor for converting from the source character set to the
682 wide execution character set. */
683 struct cset_converter wide_cset_desc;
684
685 /* Date and time text. Calculated together if either is requested. */
686 const uchar *date;
687 const uchar *time;
688
689 /* EOF token, and a token forcing paste avoidance. */
690 cpp_token avoid_paste;
691 cpp_token eof;
692
693 /* Opaque handle to the dependencies of mkdeps.c. */
694 struct deps *deps;
695
696 /* Obstack holding all macro hash nodes. This never shrinks.
697 See cpphash.c */
698 struct obstack hash_ob;
699
700 /* Obstack holding buffer and conditional structures. This is a
701 real stack. See cpplib.c. */
702 struct obstack buffer_ob;
703
704 /* Pragma table - dynamic, because a library user can add to the
705 list of recognized pragmas. */
706 struct pragma_entry *pragmas;
707
708 /* Call backs to cpplib client. */
709 struct cpp_callbacks cb;
710
711 /* Identifier hash table. */
712 struct ht *hash_table;
713
714 /* Expression parser stack. */
715 struct op *op_stack, *op_limit;
716
717 /* User visible options. */
718 struct cpp_options opts;
719
720 /* Special nodes - identifiers with predefined significance to the
721 preprocessor. */
722 struct spec_nodes spec_nodes;
723
724 /* Whether cpplib owns the hashtable. */
725 bool our_hashtable;
726
727 /* Traditional preprocessing output buffer (a logical line). */
728 struct
729 {
730 uchar *base;
731 uchar *limit;
732 uchar *cur;
733 fileline first_line;
734 } out;
735
736 /* Used to save the original line number during traditional
737 preprocessing. */
738 unsigned int saved_line;
739
740 /* A saved list of the defined macros, for dependency checking
741 of precompiled headers. */
742 struct cpp_savedstate *savedstate;
743 };
下面cpp_create_reader定义中的参数lang为c_lang类型,这个类型的定义如下:
154 enum c_lang {CLK_GNUC89 = 0, CLK_GNUC99, CLK_STDC89, CLK_STDC94,
155 CLK_STDC99, CLK_GNUCXX, CLK_CXX98, CLK_ASM};
注意到在c_common_init_options的205行,对于C、C++前端,CLK_GNUCXX和CLK_GNUC89是仅有的可用值。而在cpp_create_reader的132行,init_library只负责建立三字符序列(trigraph)的映射表;如果用来编译GCC的编译器支持指定初始化(designated initializer,例如v2.7及以后版本的GCC),该表已被静态初始化,init_library不做任何事情。
126 cpp_reader *
127 cpp_create_reader (enum c_lang lang, hash_table *table) in cppinit.c
128 {
129 cpp_reader *pfile;
130
131 /* Initialize this instance of the library if it hasn't been already. */
132 init_library ();
133
134 pfile = xcalloc (1, sizeof (cpp_reader));
135
136 cpp_set_lang (pfile, lang);
在上面的136行,cpp_set_lang为cpp_reader对象设置了语言的信息。它有如下的定义。
92 void
93 cpp_set_lang (cpp_reader *pfile, enum c_lang lang) in cppinit.c
94 {
95 const struct lang_flags *l = &lang_defaults[(int) lang];
96
97 CPP_OPTION (pfile, lang) = lang;
98
99 CPP_OPTION (pfile, c99) = l->c99;
100 CPP_OPTION (pfile, cplusplus) = l->cplusplus;
101 CPP_OPTION (pfile, extended_numbers) = l->extended_numbers;
102 CPP_OPTION (pfile, std) = l->std;
103 CPP_OPTION (pfile, trigraphs) = l->std;
104 CPP_OPTION (pfile, cplusplus_comments) = l->cplusplus_comments;
105 CPP_OPTION (pfile, digraphs) = l->digraphs;
106 }
在上面95行的lang_defaults,是一个描述所支持语言特性的全局数组。具体来说,域c99如果为非0值,表示语言符合1999 C标准。域cplusplus如果为非0值,表示语言为C++。域extended_numbers如果为非0值,表示语言允许16进制的浮点数及用于数值常量的LL后缀。域std如果为非0值,表示语言符合指定的C/C++标准。域cplusplus_comments如果为非0值,表示语言允许C++的“//”风格的注释。域digraphs如果为非0值,表示语言支持ISO的复合字母序列(digraph sequence)。
69 struct lang_flags in cppinit.c
70 {
71 char c99;
72 char cplusplus;
73 char extended_numbers;
74 char std;
75 char cplusplus_comments;
76 char digraphs;
77 };
78
79 static const struct lang_flags lang_defaults[] = in cppinit.c
80 { /* c99 c++ xnum std // digr */
81 /* GNUC89 */ { 0, 0, 1, 0, 1, 1 },
82 /* GNUC99 */ { 1, 0, 1, 0, 1, 1 },
83 /* STDC89 */ { 0, 0, 0, 1, 0, 0 },
84 /* STDC94 */ { 0, 0, 0, 1, 0, 1 },
85 /* STDC99 */ { 1, 0, 1, 1, 1, 1 },
86 /* GNUCXX */ { 0, 1, 1, 0, 1, 1 },
87 /* CXX98 */ { 0, 1, 1, 1, 1, 1 },
88 /* ASM */ { 0, 0, 1, 0, 1, 0 }
89 };
在cpp_set_lang的97行,宏CPP_OPTION被定义为如下的域选择子。
#define CPP_OPTION(PFILE, OPTION) ((PFILE)->opts.OPTION) in cpphash.h
从这个定义,我们可以知道这个宏可被用于初始化cpp_reader的opts域。这个域具有cpp_options类型。它是一组用于控制词法分析和预处理的旗标。
211 struct cpp_options in cpplib.h
212 {
213 /* Characters between tab stops. */
214 unsigned int tabstop;
215
216 /* The language we're preprocessing. */
217 enum c_lang lang;
218
219 /* Nonzero means use extra default include directories for C++. */
220 unsigned char cplusplus;
221
222 /* Nonzero means handle cplusplus style comments. */
223 unsigned char cplusplus_comments;
224
225 /* Nonzero means define __OBJC__, treat @ as a special token, and
226 use the OBJC[PLUS]_INCLUDE_PATH environment variable. */
227 unsigned char objc;
228
229 /* Nonzero means don't copy comments into the output file. */
230 unsigned char discard_comments;
231
232 /* Nonzero means don't copy comments into the output file during
233 macro expansion. */
234 unsigned char discard_comments_in_macro_exp;
235
236 /* Nonzero means process the ISO trigraph sequences. */
237 unsigned char trigraphs;
238
239 /* Nonzero means process the ISO digraph sequences. */
240 unsigned char digraphs;
241
242 /* Nonzero means to allow hexadecimal floats and LL suffixes. */
243 unsigned char extended_numbers;
244
245 /* Nonzero means print names of header files (-H). */
246 unsigned char print_include_names;
247
248 /* Nonzero means cpp_pedwarn causes a hard error. */
249 unsigned char pedantic_errors;
250
251 /* Nonzero means don't print warning messages. */
252 unsigned char inhibit_warnings;
253
254 /* Nonzero means complain about deprecated features. */
255 unsigned char warn_deprecated;
256
257 /* Nonzero means don't suppress warnings from system headers. */
258 unsigned char warn_system_headers;
259
260 /* Nonzero means don't print error messages. Has no option to
261 select it, but can be set by a user of cpplib (e.g. fix-header). */
262 unsigned char inhibit_errors;
263
264 /* Nonzero means warn if slash-star appears in a comment. */
265 unsigned char warn_comments;
266
267 /* Nonzero means warn if there are any trigraphs. */
268 unsigned char warn_trigraphs;
269
270 /* Nonzero means warn about multicharacter charconsts. */
271 unsigned char warn_multichar;
272
273 /* Nonzero means warn about various incompatibilities with
274 traditional C. */
275 unsigned char warn_traditional;
276
277 /* Nonzero means warn about long long numeric constants. */
278 unsigned char warn_long_long;
279
280 /* Nonzero means warn about text after an #endif (or #else). */
281 unsigned char warn_endif_labels;
282
283 /* Nonzero means warn about implicit sign changes owing to integer
284 promotions. */
285 unsigned char warn_num_sign_change;
286
287 /* Nonzero means turn warnings into errors. */
288 unsigned char warnings_are_errors;
289
290 /* Nonzero means we should look for header.gcc files that remap file
291 names. */
292 unsigned char remap;
293
294 /* Zero means dollar signs are punctuation. */
295 unsigned char dollars_in_ident;
296
297 /* True if we should warn about dollars in identifiers or numbers
298 for this translation unit. */
299 unsigned char warn_dollars;
300
301 /* Nonzero means warn if undefined identifiers are evaluated in an #if. */
302 unsigned char warn_undef;
303
304 /* Nonzero means warn of unused macros from the main file. */
305 unsigned char warn_unused_macros;
306
307 /* Nonzero for the 1999 C Standard, including corrigenda and amendments. */
308 unsigned char c99;
309
310 /* Nonzero if we are conforming to a specific C or C++ standard. */
311 unsigned char std;
312
313 /* Nonzero means give all the error messages the ANSI standard requires. */
314 unsigned char pedantic;
315
316 /* Nonzero means we're looking at already preprocessed code, so don't
317 bother trying to do macro expansion and whatnot. */
318 unsigned char preprocessed;
319
320 /* Print column number in error messages. */
321 unsigned char show_column;
322
323 /* Nonzero means handle C++ alternate operator names. */
324 unsigned char operator_names;
325
326 /* True for traditional preprocessing. */
327 unsigned char traditional;
328
329 /* Holds the name of the target (execution) character set. */
330 const char *narrow_charset;
331
332 /* Holds the name of the target wide character set. */
333 const char *wide_charset;
334
335 /* Holds the name of the input character set. */
336 const char *input_charset;
337
338 /* True to warn about precompiled header files we couldn't use. */
339 bool warn_invalid_pch;
340
341 /* True if dependencies should be restored from a precompiled header. */
342 bool restore_pch_deps;
343
344 /* Dependency generation. */
345 struct
346 {
347 /* Style of header dependencies to generate. */
348 enum {DEPS_NONE = 0, DEPS_USER, DEPS_SYSTEM } style;
349
350 /* Assume missing files are generated files. */
351 bool missing_files;
352
353 /* Generate phony targets for each dependency apart from the first
354 one. */
355 bool phony_targets;
356
357 /* If true, no dependency is generated on the main file. */
358 bool ignore_main_file;
359 } deps;
360
361 /* Target-specific features set by the front end or client. */
362
363 /* Precision for target CPP arithmetic, target characters, target
364 ints and target wide characters, respectively. */
365 size_t precision, char_precision, int_precision, wchar_precision;
366
367 /* True means chars (wide chars) are unsigned. */
368 bool unsigned_char, unsigned_wchar;
369
370 /* True if the most significant byte in a word has the lowest
371 address in memory. */
372 bool bytes_big_endian;
373
374 /* Nonzero means __STDC__ should have the value 0 in system headers. */
375 unsigned char stdc_0_in_system_headers;
376 };
接下来,cpp_create_reader继续初始化opts的某些域,这些都是很基本的、预先确定的设置。此外在169行,把no_search_path的名字设置为空串(“”);它是一个伪造的空“目录”,用作不使用查找路径的文件的查找起点。
cpp_create_reader (continue)
137 CPP_OPTION (pfile, warn_multichar) = 1;
138 CPP_OPTION (pfile, discard_comments) = 1;
139 CPP_OPTION (pfile, discard_comments_in_macro_exp) = 1;
140 CPP_OPTION (pfile, show_column) = 1;
141 CPP_OPTION (pfile, tabstop) = 8;
142 CPP_OPTION (pfile, operator_names) = 1;
143 CPP_OPTION (pfile, warn_trigraphs) = 2;
144 CPP_OPTION (pfile, warn_endif_labels) = 1;
145 CPP_OPTION (pfile, warn_deprecated) = 1;
146 CPP_OPTION (pfile, warn_long_long) = !CPP_OPTION (pfile, c99);
147 CPP_OPTION (pfile, dollars_in_ident) = 1;
148 CPP_OPTION (pfile, warn_dollars) = 1;
149
150 /* Default CPP arithmetic to something sensible for the host for the
151 benefit of dumb users like fix-header. */
152 CPP_OPTION (pfile, precision) = CHAR_BIT * sizeof (long);
153 CPP_OPTION (pfile, char_precision) = CHAR_BIT;
154 CPP_OPTION (pfile, wchar_precision) = CHAR_BIT * sizeof (int);
155 CPP_OPTION (pfile, int_precision) = CHAR_BIT * sizeof (int);
156 CPP_OPTION (pfile, unsigned_char) = 0;
157 CPP_OPTION (pfile, unsigned_wchar) = 1;
158 CPP_OPTION (pfile, bytes_big_endian) = 1; /* does not matter */
159
160 /* Default to locale/UTF-8. */
161 CPP_OPTION (pfile, narrow_charset) = _cpp_default_encoding ();
162 CPP_OPTION (pfile, wide_charset) = 0;
163 CPP_OPTION (pfile, input_charset) = _cpp_default_encoding ();
164
165 /* A fake empty "directory" used as the starting point for files
166 looked up without a search path. Name cannot be '/' because we
167 don't want to prepend anything at all to filenames using it. All
168 other entries are correct zero-initialized. */
169 pfile->no_search_path.name = (char *) "";
170
171 /* Initialize the line map. Start at logical line 1, so we can use
172 a line number of zero for special states. */
173 linemap_init (&pfile->line_maps);
174 pfile->line = 1;
宏和被包含文件会被预处理器展开,然后作为词法分析器的输入。当发现错误时,词法分析器不能知道错误在源文件中的位置,只能知道它在预处理结果中的位置。语法分析器也有同样的问题。因此要给出正确的出错消息,需要知道源文件和预处理结果间的映射关系。结构line_maps为此目的而定义。
58 struct line_maps in line-map.h
59 {
60 struct line_map *maps;
61 unsigned int allocated;
62 unsigned int used;
63
64 /* The most recently listed include stack, if any, starts with
65 LAST_LISTED as the topmost including file. -1 indicates nothing
66 has been listed yet. */
67 int last_listed;
68
69 /* Depth of the include stack, including the current file. */
70 unsigned int depth;
71
72 /* If true, prints an include trace a la -H. */
73 bool trace_includes;
74 };
39 /* The logical line FROM_LINE maps to physical source file TO_FILE at
40 line TO_LINE, and subsequently one-to-one until the next line_map
41 structure in the set. INCLUDED_FROM is an index into the set that
42 gives the line mapping at whose end the current one was included.
43 File(s) at the bottom of the include stack have this set to -1.
44 REASON is the reason for creation of this line map, SYSP is one for
45 a system header, two for a C system header file that therefore
46 needs to be extern "C" protected in C++, and zero otherwise. */
47 struct line_map in line-map.h
48 {
49 const char *to_file;
50 unsigned int to_line;
51 source_location from_line;
52 int included_from;
53 ENUM_BITFIELD (lc_reason) reason : CHAR_BIT;
54 unsigned char sysp;
55 };
26 /* Reason for adding a line change with add_line_map (). LC_ENTER is
27 when including a new file, e.g. a #include directive in C.
28 LC_LEAVE is when reaching a file's end. LC_RENAME is when a file
29 name or line number changes for neither of the above reasons
30 (e.g. a #line directive in C). */
31 enum lc_reason {LC_ENTER = 0, LC_LEAVE, LC_RENAME};
在后面的章节可以看到,逻辑位置与物理位置的映射实际上不简单,每包含一个文件,需要建立一个新的映射关系,而从包含文件返回,也需要重新建立一个映射关系。
在cpp_create_reader的173行,linemap_init被调用初始化line_maps。
32 void
33 linemap_init (struct line_maps *set)
34 {
35 set->maps = 0;
36 set->allocated = 0;
37 set->used = 0;
38 set->last_listed = -1;
39 set->trace_includes = false;
40 set->depth = 0;
41 }
cpp_create_reader (continue)
176 /* Initialize lexer state. */
177 pfile->state.save_comments = ! CPP_OPTION (pfile, discard_comments);
178
179 /* Set up static tokens. */
180 pfile->avoid_paste.type = CPP_PADDING;
181 pfile->avoid_paste.val.source = NULL;
182 pfile->eof.type = CPP_EOF;
183 pfile->eof.flags = 0;
上面177行中,cpp_reader的state域记录了词法分析器的状态。域avoid_paste是强制避免粘贴的符号(token),它具有cpp_token类型,下面我们会看到。域eof代表文件的结尾。
cpp_create_reader (continue)
185 /* Create a token buffer for the lexer. */
186 _cpp_init_tokenrun (&pfile->base_run, 250);
187 pfile->cur_run = &pfile->base_run;
188 pfile->cur_token = pfile->base_run.base;
base_run是块大小为250个符号(token)的双向链表。一开始,只有一个块被分配出来。它具有tokenrun类型,其定义如下。
158 typedef struct tokenrun tokenrun;
159 struct tokenrun in cpphash.h
160 {
161 tokenrun *next, *prev;
162 cpp_token *base, *limit;
163 };
644 void
645 _cpp_init_tokenrun (tokenrun *run, unsigned int count) in cpplex.c
646 {
647 run->base = xnewvec (cpp_token, count);
648 run->limit = run->base + count;
649 run->next = NULL;
650 }
cpp_create_reader (continue)
190 /* Initialize the base context. */
191 pfile->context = &pfile->base_context;
192 pfile->base_context.macro = 0;
193 pfile->base_context.prev = pfile->base_context.next = 0;
cpp_reader中的域context是用于处理宏展开的缓存,它也是双向链表。它的定义如下。
172 struct cpp_context in cpphash.h
173 {
174 /* Doubly-linked list. */
175 cpp_context *next, *prev;
176
177 union
178 {
179 /* For ISO macro expansion. Contexts other than the base context
180 are contiguous tokens. e.g. macro expansions, expanded
181 argument tokens. */
182 struct
183 {
184 union utoken first;
185 union utoken last;
186 } iso;
187
188 /* For traditional macro expansion. */
189 struct
190 {
191 const uchar *cur;
192 const uchar *rlimit;
193 } trad;
194 } u;
195
196 /* If non-NULL, a buffer used for storage related to this context.
197 When the context is popped, the buffer is released. */
198 _cpp_buff *buff;
199
200 /* For a macro context, the macro node, otherwise NULL. */
201 cpp_hashnode *macro;
202
203 /* True if utoken element is token, else ptoken. */
204 bool direct_p;
205 };
177到194行的union用于处理传统(traditional)及ISO的宏展开。如果我们使用-traditional选项进行编译,编译器采用传统方式展开宏(用于处理标准化之前的语言)。传统和ISO宏展开的主要区别是,前者展开为文本(text)而后者展开为符号串。因此,我们看到182 ~ 186行用于ISO宏的结构中,包含了utoken类型的union成员。
151 union utoken in cpphash.h
152 {
153 const cpp_token *token;
154 const cpp_token **ptoken;
155 };
cpp_create_reader (continue)
196 /* Aligned and unaligned storage. */
197 pfile->a_buff = _cpp_get_buff (pfile, 0);
198 pfile->u_buff = _cpp_get_buff (pfile, 0);
a_buff和u_buff是cpp_reader用到的另外两个缓存。其中u_buff用于处理字符串('#'预处理操作符),a_buff用于其它处理。为了管理内存,cpp_reader使用free_buffs来保存从a_buff和u_buff释放的块。缓存具有如下的定义及操作。
130 struct _cpp_buff in cpphash.h
131 {
132 struct _cpp_buff *next;
133 unsigned char *base, *cur, *limit;
134 };
1423 _cpp_buff *
1424 _cpp_get_buff (cpp_reader *pfile, size_t min_size) in cpplex.c
1425 {
1426 _cpp_buff *result, **p;
1427
1428 for (p = &pfile->free_buffs;; p = &(*p)->next)
1429 {
1430 size_t size;
1431
1432 if (*p == NULL)
1433 return new_buff (min_size);
1434 result = *p;
1435 size = result->limit - result->base;
1436 /* Return a buffer that's big enough, but don't waste one that's
1437 way too big. */
1438 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
1439 break;
1440 }
1441
1442 *p = result->next;
1443 result->next = NULL;
1444 result->cur = result->base;
1445 return result;
1446 }
1391 static _cpp_buff *
1392 new_buff (size_t len) in cpplex.c
1393 {
1394 _cpp_buff *result;
1395 unsigned char *base;
1396
1397 if (len < MIN_BUFF_SIZE)
1398 len = MIN_BUFF_SIZE;
1399 len = CPP_ALIGN (len);
1400
1401 base = xmalloc (len + sizeof (_cpp_buff));
1402 result = (_cpp_buff *) (base + len);
1403 result->base = base;
1404 result->cur = base;
1405 result->limit = base + len;
1406 result->next = NULL;
1407 return result;
1408 }