当预处理器碰到字符或字符串常量时,它忠实地记录其内容,但不去解释它,因为预处理器没有掌握其格式、编码的信息。解释这些内容正是下面这些函数的任务。
c_lex_with_flags (continue)
357 case CPP_ATSIGN:
…
388 case CPP_OTHER:
389 {
390 cppchar_t c = tok->val.str.text[0];
391
392 if (c == '"' || c == '\'')
393 error ("missing terminating %c character", (int) c);
394 else if (ISGRAPH (c))
395 error ("stray '%c' in program", (int) c);
396 else
397 error ("stray '\\%o' in program", (int) c);
398 }
399 goto retry;
400
401 case CPP_CHAR:
402 case CPP_WCHAR:
403 *value = lex_charconst (tok);
404 break;
405
406 case CPP_STRING:
407 case CPP_WSTRING:
408 return lex_string (tok, value, false);
409 break;
410
411 /* These tokens should not be visible outside cpplib. */
412 case CPP_HEADER_NAME:
413 case CPP_COMMENT:
414 case CPP_MACRO_ARG:
415 abort ();
416
417 default:
418 *value = NULL_TREE;
419 break;
420 }
对于字符常量,其树节点仍旧是INTEGER_CST,不过其内容必须是从原字符翻译过来的目标机器的字符。函数lex_charconst协助完成这样的转换并创建该节点。
/* Build the tree constant for the character-constant token TOKEN.
   The numeric value is obtained from cpp_interpret_charconst, the node is
   made with build_int_2, and its TREE_TYPE is then chosen from the token
   type, the language dialect and the number of characters seen.  Returns
   that node.

   NOTE(review): chars_seen and unsignedp are uninitialized locals that are
   written only through the pointers handed to cpp_interpret_charconst, yet
   both are read below unconditionally -- confirm the callee stores to them
   on every return path.  */
727 static tree
728 lex_charconst (const cpp_token *token) in c-lex.c
729 {
730 cppchar_t result;
731 tree type, value;
732 unsigned int chars_seen;
733 int unsignedp;
734
735 result = cpp_interpret_charconst (parse_in, token,
736 &chars_seen, &unsignedp);
737
738 /* Cast to cppchar_signed_t to get correct sign-extension of RESULT
739 before possibly widening to HOST_WIDE_INT for build_int_2. */
740 if (unsignedp || (cppchar_signed_t) result >= 0)
741 value = build_int_2 (result, 0);
742 else
/* Negative value: the second argument becomes -1 here (presumably the
   high part of a sign-extended constant -- confirm against build_int_2).  */
743 value = build_int_2 ((cppchar_signed_t) result, -1);
744
745 if (token->type == CPP_WCHAR)
746 type = wchar_type_node;
747 /* In C, a character constant has type 'int'.
748 In C++ 'char', but multi-char charconsts have type 'int'. */
749 else if (!c_dialect_cxx () || chars_seen > 1)
750 type = integer_type_node;
751 else
752 type = char_type_node;
753
754 TREE_TYPE (value) = type;
755 return value;
756 }
在前面我们看到,GCC可以在使用EBCDIC编码的宿主机器(host)上运行,同时GCC可以为目标机器提供宽字符支持(可选的有UTF-8,UTF-16LE,UTF-16BE,UTF-32LE,UTF-32BE)。
/* Interpret the character-constant token TOKEN and return its value as a
   cppchar_t.

   A token whose spelling has length 2 (3 for CPP_WCHAR) is reported as
   "empty character constant" and yields 0; a failed cpp_interpret_string
   also yields 0.  Otherwise the converted bytes are passed to
   wide_str_to_charconst or narrow_str_to_charconst together with
   PCHARS_SEEN and UNSIGNEDP, and the converted buffer is freed unless it
   is the token's own text.

   NOTE(review): the two early "return 0" paths do not store to
   *PCHARS_SEEN or *UNSIGNEDP, so callers must not rely on them there.  */
1328 cppchar_t
1329 cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token, in cppcharset.c
1330 unsigned int *pchars_seen, int *unsignedp)
1331 {
1332 cpp_string str = { 0, 0 };
1333 bool wide = (token->type == CPP_WCHAR);
1334 cppchar_t result;
1335
1336 /* an empty constant will appear as L'' or '' */
1337 if (token->val.str.len == (size_t) (2 + wide))
1338 {
1339 cpp_error (pfile, CPP_DL_ERROR, "empty character constant");
1340 return 0;
1341 }
/* Convert exactly one string (the token's spelling) into STR.  */
1342 else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, wide))
1343 return 0;
1344
1345 if (wide)
1346 result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp);
1347 else
1348 result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);
1349
/* Release the converted buffer only when it is distinct from the token
   text.  */
1350 if (str.text != token->val.str.text)
1351 free ((void *)str.text);
1352
1353 return result;
1354 }
在上面1334行,cppchar_t 具有32比特大小,它足够保存unicode(UTF-32)的字符。为了把字符从预处理器传到词法分析器,使用了结构体cpp_string。注意到在这个结构体中预处理器返回的字符是一个字节数组。
/* A counted string of unsigned bytes: TEXT points at the (read-only) bytes
   and LEN is their count.  */
158 struct cpp_string in cpplib.h
159 {
160 unsigned int len;
161 const unsigned char *text;
162 };
这个字符被词法分析器处理后,它可能占据不止一个字节。需要一个新的结构体用于转换后的字符。
/* A writable output buffer: TEXT is the storage, ASIZE its size field and
   LEN the number of bytes stored so far (presumably ASIZE is the allocated
   capacity -- confirm against the code that grows the buffer).  */
97 struct _cpp_strbuf in cppcharset.c
98 {
99 uchar *text;
100 size_t asize;
101 size_t len;
102 };
GCC定义了UTF编码之间的转换函数,而系统调用iconv提供了UTF和EBCDIC编码之间的转换。在这些例程的协助下,cpp_interpret_string可以把from中的输入转换为具有期望格式的to。
/* Convert the COUNT source strings starting at FROM into a single buffer
   and store it in TO.  WIDE selects PFILE's wide or narrow converter.

   For each input string the optional 'L' prefix and the surrounding
   quotes are skipped.  Runs of characters up to the next backslash are
   passed to APPLY_CONVERSION; each backslash escape is handed to
   convert_escape, which returns where scanning resumes.  After all
   strings a terminating zero is appended with emit_numeric_escape and the
   buffer is shrunk to its used length.

   Returns true on success, with TO->text owning the new buffer.  Returns
   false after reporting an error and freeing the buffer when
   APPLY_CONVERSION fails.  */
1133 bool
1134 cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, in cppcharset.c
1135 size_t count, cpp_string *to, bool wide)
1136 {
1137 struct _cpp_strbuf tbuf;
1138 const uchar *p, *base, *limit;
1139 size_t i;
1140 struct cset_converter cvt
1141 = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
1142
1143 tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
1144 tbuf.text = xmalloc (tbuf.asize);
1145 tbuf.len = 0;
1146
1147 for (i = 0; i < count; i++)
1148 {
1149 p = from[i].text;
1150 if (*p == 'L') p++;
1151 p++; /* Skip leading quote. */
1152 limit = from[i].text + from[i].len - 1; /* Skip trailing quote. */
1153
1154 for (;;)
1155 {
1156 base = p;
/* Stop at the backslash that introduces an escape sequence.  */
1157 while (p < limit && *p != '\\')
1158 p++;
1159 if (p > base)
1160 {
1161 /* We have a run of normal characters; these can be fed
1162 directly to convert_cset. */
1163 if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
1164 goto fail;
1165 }
1166 if (p == limit)
1167 break;
1168
/* P is on the backslash; pass the character after it.  */
1169 p = convert_escape (pfile, p + 1, limit, &tbuf, wide);
1170 }
1171 }
1172 /* NUL-terminate the 'to' buffer and translate it to a cpp_string
1173 structure. */
1174 emit_numeric_escape (pfile, 0, &tbuf, wide);
1175 tbuf.text = xrealloc (tbuf.text, tbuf.len);
1176 to->text = tbuf.text;
1177 to->len = tbuf.len;
1178 return true;
1179
1180 fail:
1181 cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set");
1182 free (tbuf.text);
1183 return false;
1184 }
不过转义序列是这些转换函数中的例外,因为预处理器把字符串中的转义序列也忠实地记录了下来,例如,\a记为“\a”。需要先按宿主字符集重新解释它们,然后再转换至目标字符集。
/* Convert one backslash escape and append the result to TBUF.

   FROM points at the character that follows the backslash and LIMIT is
   passed through to the numeric/UCN helpers.  WIDE selects PFILE's wide or
   narrow converter.

   'u'/'U', 'x' and octal digits are delegated to convert_ucn, convert_hex
   and convert_oct, whose return value is returned unchanged.  Letter
   escapes are mapped through the host-charset table CHARCONSTS (indexed in
   the order a b e f n r t v); backslash, quotes and '?' stand for
   themselves.  An unrecognised escape draws a pedwarn and the character
   itself is converted.  For all of these the function returns FROM + 1.  */
1043 static const uchar *
1044 convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit, in cppcharset.c
1045 struct _cpp_strbuf *tbuf, bool wide)
1046 {
1047 /* Values of \a \b \e \f \n \r \t \v respectively. */
1048 #if HOST_CHARSET == HOST_CHARSET_ASCII
1049 static const uchar charconsts[] = { 7, 8, 27, 12, 10, 13, 9, 11 };
1050 #elif HOST_CHARSET == HOST_CHARSET_EBCDIC
1051 static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13, 5, 11 };
1052 #else
1053 #error "unknown host character set"
1054 #endif
1055
1056 uchar c;
1057 struct cset_converter cvt
1058 = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
1059
1060 c = *from;
1061 switch (c)
1062 {
1063 /* UCNs, hex escapes, and octal escapes are processed separately. */
1064 case 'u': case 'U':
1065 return convert_ucn (pfile, from, limit, tbuf, wide);
1066
1067 case 'x':
1068 return convert_hex (pfile, from, limit, tbuf, wide);
1069 break;
1070
1071 case '0': case '1': case '2': case '3':
1072 case '4': case '5': case '6': case '7':
1073 return convert_oct (pfile, from, limit, tbuf, wide);
1074
1075 /* Various letter escapes. Get the appropriate host-charset
1076 value into C. */
1077 case '\\': case '\'': case '"': case '?': break;
1078
1079 case '(': case '{': case '[': case '%':
1080 /* '\(', etc, can be used at the beginning of a line in a long
1081 string split onto multiple lines with \-newline, to prevent
1082 Emacs or other text editors from getting confused. '\%' can
1083 be used to prevent SCCS from mangling printf format strings. */
1084 if (CPP_PEDANTIC (pfile))
1085 goto unknown;
1086 break;
1087
1088 case 'b': c = charconsts[1]; break;
1089 case 'f': c = charconsts[3]; break;
1090 case 'n': c = charconsts[4]; break;
1091 case 'r': c = charconsts[5]; break;
1092 case 't': c = charconsts[6]; break;
1093 case 'v': c = charconsts[7]; break;
1094
1095 case 'a':
1096 if (CPP_WTRADITIONAL (pfile))
1097 cpp_error (pfile, CPP_DL_WARNING,
1098 "the meaning of '\\a' is different in traditional C");
1099 c = charconsts[0];
1100 break;
1101
1102 case 'e': case 'E':
1103 if (CPP_PEDANTIC (pfile))
1104 cpp_error (pfile, CPP_DL_PEDWARN,
1105 "non-ISO-standard escape sequence, '\\%c'", (int) c);
1106 c = charconsts[2];
1107 break;
1108
1109 default:
1110 unknown:
1111 if (ISGRAPH (c))
1112 cpp_error (pfile, CPP_DL_PEDWARN,
1113 "unknown escape sequence '\\%c'", (int) c);
1114 else
1115 cpp_error (pfile, CPP_DL_PEDWARN,
1116 "unknown escape sequence: '\\%03o'", (int) c);
1117 }
1118
1119 /* Now convert what we have to the execution character set. */
1120 if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
1121 cpp_errno (pfile, CPP_DL_ERROR,
1122 "converting escape sequence to execution character set");
1123
1124 return from + 1;
1125 }
除了类似‘\a’的转义序列外;具有八进制数值NNN(1到3个数字)的序列‘\NNN’由convert_oct处理;具有16进制值NN(1到2个数字)的序列‘\xNN’由convert_hex处理;具有16进制值NNNN(4个数字)的序列‘\uNNNN’,及具有16进制值NNNNNNNN(8个数字)的序列‘\UNNNNNNNN’由convert_ucn处理。
对于那些在cpp_reader中的转换句柄所不能处理的字符,emit_numeric_escape被调用将这些字符记录到缓存。
/* Append the numeric value N to TBUF.

   When WIDE is set, N is split into NBWC = wchar_precision /
   char_precision pieces of CWIDTH bits each, taken from the low end of N,
   and stored in the order selected by PFILE's bytes_big_endian option.
   Otherwise a single byte is stored.  In both cases the buffer is enlarged
   by OUTBUF_BLOCK_SIZE first if the new bytes would not fit.

   NOTE(review): the buffer grows by exactly one block, which assumes NBWC
   does not exceed OUTBUF_BLOCK_SIZE -- confirm.  */
907 static void
908 emit_numeric_escape (cpp_reader *pfile, cppchar_t n, in cppcharset.c
909 struct _cpp_strbuf *tbuf, bool wide)
910 {
911 if (wide)
912 {
913 /* We have to render this into the target byte order, which may not
914 be our byte order. */
915 bool bigend = CPP_OPTION (pfile, bytes_big_endian);
916 size_t width = CPP_OPTION (pfile, wchar_precision);
917 size_t cwidth = CPP_OPTION (pfile, char_precision);
918 size_t cmask = width_to_mask (cwidth);
919 size_t nbwc = width / cwidth;
920 size_t i;
921 size_t off = tbuf->len;
922 cppchar_t c;
923
924 if (tbuf->len + nbwc > tbuf->asize)
925 {
926 tbuf->asize += OUTBUF_BLOCK_SIZE;
927 tbuf->text = xrealloc (tbuf->text, tbuf->asize);
928 }
929
/* Peel off the low CWIDTH bits each round; for big-endian output the
   pieces are written from the last slot backwards.  */
930 for (i = 0; i < nbwc; i++)
931 {
932 c = n & cmask;
933 n >>= cwidth;
934 tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;
935 }
936 tbuf->len += nbwc;
937 }
938 else
939 {
940 if (tbuf->len + 1 > tbuf->asize)
941 {
942 tbuf->asize += OUTBUF_BLOCK_SIZE;
943 tbuf->text = xrealloc (tbuf->text, tbuf->asize);
944 }
945 tbuf->text[tbuf->len++] = n;
946 }
947 }
注意到_cpp_strbuf的asize域告知缓存的大小,而len域则告知字符的数目。
接下来在c_lex_with_flags里,对于字符串常量的情形,lex_string的核心也是调用cpp_interpret_string。这里我们跳过它。
如果我们没有在上面因错误退出,那么到达这里,不会再有PCH文件需要读入了。
c_lex_with_flags (continue)
422 if (! no_more_pch)
423 {
424 no_more_pch = true;
425 c_common_no_more_pch ();
426 }
427
428 if (cpp_flags)
429 *cpp_flags = tok->flags;
430 return tok->type;
431 }
这个函数重置了parse_in的句柄valid_pch,这样如果再读入PCH文件时,将导致错误。
/* If parse_in still has a valid_pch callback installed, clear it and call
   the host hook gt_pch_use_address with a null address and zero size.
   Does nothing when the callback is already null, so repeated calls are
   harmless.  The meaning of the -1 and 0 arguments is not visible here --
   see the hook's declaration.  */
425 void
426 c_common_no_more_pch (void) in c-pch.c
427 {
428 if (cpp_get_callbacks (parse_in)->valid_pch)
429 {
430 cpp_get_callbacks (parse_in)->valid_pch = NULL;
431 host_hooks.gt_pch_use_address (NULL, 0, -1, 0);
432 }
433 }
钩子host_hooks提供了特定于宿主机器的方法来获取为PCH文件分配的内存空间。把0作为gt_pch_use_address的第二个参数size传入,将允许钩子释放在加载期间可能分配的静态内存。但在当前版本中,该功能尚未实现。