When the preprocessor meets a character or string constant, it records the content faithfully but cannot interpret it, because it has no information about the format or encoding; interpreting it is the duty of the functions here.
c_lex_with_flags (continue)
357 case CPP_ATSIGN:
…
388 case CPP_OTHER:
389 {
390 cppchar_t c = tok->val.str.text[0];
391
392 if (c == '"' || c == '/'')
393 error ("missing terminating %c character", (int) c);
394 else if (ISGRAPH (c))
395 error ("stray '%c' in program", (int) c);
396 else
397 error ("stray '//%o' in program", (int) c);
398 }
399 goto retry;
400
401 case CPP_CHAR:
402 case CPP_WCHAR:
403 *value = lex_charconst (tok);
404 break;
405
406 case CPP_STRING:
407 case CPP_WSTRING:
408 return lex_string (tok, value, false);
409 break;
410
411 /* These tokens should not be visible outside cpplib. */
412 case CPP_HEADER_NAME:
413 case CPP_COMMENT:
414 case CPP_MACRO_ARG:
415 abort ();
416
417 default:
418 *value = NULL_TREE;
419 break;
420 }
For a character constant, the tree node is still an INTEGER_CST, and its content must be the value in the target character set, translated from the source character set. Routine lex_charconst performs the conversion and creates the node.
/* Build the tree node for the character constant TOKEN.

   The numeric value is obtained from cpp_interpret_charconst, which also
   fills in CHARS_SEEN and UNSIGNEDP.  The node is created by build_int_2
   and its TREE_TYPE is then set to wchar_type_node, integer_type_node or
   char_type_node, as selected below.  Returns that node.  */
727 static tree
728 lex_charconst (const cpp_token *token) in c-lex.c
729 {
730 cppchar_t result;
731 tree type, value;
732 unsigned int chars_seen;
733 int unsignedp;
734
735 result = cpp_interpret_charconst (parse_in, token,
736 &chars_seen, &unsignedp);
737
738 /* Cast to cppchar_signed_t to get correct sign-extension of RESULT
739 before possibly widening to HOST_WIDE_INT for build_int_2. */
/* Unsigned or non-negative values pass 0 as the second argument; negative
   values pass -1 (presumably the sign-extended high part of the constant
   -- confirm against build_int_2).  */
740 if (unsignedp || (cppchar_signed_t) result >= 0)
741 value = build_int_2 (result, 0);
742 else
743 value = build_int_2 ((cppchar_signed_t) result, -1);
744
745 if (token->type == CPP_WCHAR)
746 type = wchar_type_node;
747 /* In C, a character constant has type 'int'.
748 In C++ 'char', but multi-char charconsts have type 'int'. */
749 else if (!c_dialect_cxx () || chars_seen > 1)
750 type = integer_type_node;
751 else
752 type = char_type_node;
753
754 TREE_TYPE (value) = type;
755 return value;
756 }
As we have seen in the previous section, GCC can generate binaries on a host using EBCDIC coding, and at the same time GCC can support wide characters for the target (the candidates are UTF-8, UTF-16LE, UTF-16BE, UTF32_LE, UTF32_BE).
/* Interpret the character constant TOKEN and return its value.

   An empty constant is reported as an error and yields 0; a failure in
   cpp_interpret_string also yields 0.  Otherwise the converted string is
   handed to wide_str_to_charconst or narrow_str_to_charconst (depending on
   whether TOKEN is a CPP_WCHAR), which receive PCHARS_SEEN and UNSIGNEDP
   to fill in.  */
1328 cppchar_t
1329 cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token, in cppcharset.c
1330 unsigned int *pchars_seen, int *unsignedp)
1331 {
1332 cpp_string str = { 0, 0 };
1333 bool wide = (token->type == CPP_WCHAR);
1334 cppchar_t result;
1335
1336 /* an empty constant will appear as L'' or '' */
/* The token spelling includes the quotes (and the L prefix when wide), so
   a length of 2 + wide means there is nothing between the quotes.  */
1337 if (token->val.str.len == (size_t) (2 + wide))
1338 {
1339 cpp_error (pfile, CPP_DL_ERROR, "empty character constant");
1340 return 0;
1341 }
1342 else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, wide))
1343 return 0;
1344
1345 if (wide)
1346 result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp);
1347 else
1348 result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);
1349
/* Release the converted buffer unless it is the token's own text.  */
1350 if (str.text != token->val.str.text)
1351 free ((void *)str.text);
1352
1353 return result;
1354 }
Above, at line 1334, cppchar_t is 32 bits wide, which is big enough to hold a Unicode character (UTF-32). To pass the characters from the preprocessor to the lexer, structure cpp_string is used. Notice that in this structure the characters returned by the preprocessor are held as a byte array.
/* A counted byte string: TEXT points to the bytes and LEN is their count.  */
158 struct cpp_string in cpplib.h
159 {
160 unsigned int len; /* Number of bytes in TEXT.  */
161 const unsigned char *text; /* The bytes themselves (read-only).  */
162 };
After a character has been handled by the lexer, it may occupy more than one byte. A new structure is needed for the converted characters.
/* Growable output buffer used while converting strings.  */
97 struct _cpp_strbuf in cppcharset.c
98 {
99 uchar *text; /* Start of the allocated buffer.  */
100 size_t asize; /* Allocated size of TEXT, in bytes.  */
101 size_t len; /* Number of bytes of TEXT currently in use.  */
102 };
GCC defines conversion functions for converting between the UTF encodings, and the library function iconv offers conversion between UTF and EBCDIC encodings. With the aid of these routines, cpp_interpret_string can translate the input held in parameter from into the expected format in parameter to.
/* Convert the COUNT string tokens in the array FROM into one string in the
   execution character set, stored in TO.  WIDE selects the wide
   converter (pfile->wide_cset_desc) instead of the narrow one.

   For each input string the optional 'L' prefix and the surrounding quotes
   are skipped.  Runs of ordinary characters are passed to APPLY_CONVERSION;
   each backslash escape is passed to convert_escape.  A terminating NUL is
   appended with emit_numeric_escape and the buffer is shrunk to fit.

   Returns true on success, with TO->text pointing at a buffer obtained from
   xmalloc/xrealloc (the caller is responsible for freeing it).  Returns
   false, after reporting an error and freeing the buffer, if a conversion
   fails.  */
1133 bool
1134 cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, in cppcharset.c
1135 size_t count, cpp_string *to, bool wide)
1136 {
1137 struct _cpp_strbuf tbuf;
1138 const uchar *p, *base, *limit;
1139 size_t i;
1140 struct cset_converter cvt
1141 = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
1142
1143 tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
1144 tbuf.text = xmalloc (tbuf.asize);
1145 tbuf.len = 0;
1146
1147 for (i = 0; i < count; i++)
1148 {
1149 p = from[i].text;
1150 if (*p == 'L') p++;
1151 p++; /* Skip leading quote. */
1152 limit = from[i].text + from[i].len - 1; /* Skip trailing quote. */
1153
1154 for (;;)
1155 {
1156 base = p;
/* Scan up to the next backslash; it introduces an escape sequence that
   must be handled by convert_escape rather than converted verbatim.  */
1157 while (p < limit && *p != '\\')
1158 p++;
1159 if (p > base)
1160 {
1161 /* We have a run of normal characters; these can be fed
1162 directly to convert_cset. */
1163 if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
1164 goto fail;
1165 }
1166 if (p == limit)
1167 break;
1168
/* P is at the backslash; hand convert_escape the character after it.  */
1169 p = convert_escape (pfile, p + 1, limit, &tbuf, wide);
1170 }
1171 }
1172 /* NUL-terminate the 'to' buffer and translate it to a cpp_string
1173 structure. */
1174 emit_numeric_escape (pfile, 0, &tbuf, wide);
1175 tbuf.text = xrealloc (tbuf.text, tbuf.len);
1176 to->text = tbuf.text;
1177 to->len = tbuf.len;
1178 return true;
1179
1180 fail:
1181 cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set");
1182 free (tbuf.text);
1183 return false;
1184 }
However, escape sequences are the exception for the conversion routine, because the preprocessor records an escape sequence in a literal faithfully; for example, \a is recorded as “\a”. It needs to be reinterpreted according to the host character set and then converted to the target set.
/* Convert one escape sequence to the execution character set and append
   the result to TBUF.  FROM points at the character following the
   backslash; LIMIT bounds the input.  WIDE selects the wide converter.

   UCNs, hex escapes and octal escapes are delegated to convert_ucn,
   convert_hex and convert_oct, whose return value is returned unchanged.
   For every other escape the host-charset value is placed in C, converted
   with APPLY_CONVERSION, and FROM + 1 is returned.  Unknown escapes get a
   pedantic warning and the character itself is converted.  */
1043 static const uchar *
1044 convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit, in cppcharset.c
1045 struct _cpp_strbuf *tbuf, bool wide)
1046 {
1047 /* Values of \a \b \e \f \n \r \t \v respectively. */
1048 #if HOST_CHARSET == HOST_CHARSET_ASCII
1049 static const uchar charconsts[] = { 7, 8, 27, 12, 10, 13, 9, 11 };
1050 #elif HOST_CHARSET == HOST_CHARSET_EBCDIC
1051 static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13, 5, 11 };
1052 #else
1053 #error "unknown host character set"
1054 #endif
1055
1056 uchar c;
1057 struct cset_converter cvt
1058 = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
1059
1060 c = *from;
1061 switch (c)
1062 {
1063 /* UCNs, hex escapes, and octal escapes are processed separately. */
1064 case 'u': case 'U':
1065 return convert_ucn (pfile, from, limit, tbuf, wide);
1066
1067 case 'x':
1068 return convert_hex (pfile, from, limit, tbuf, wide);
1069 break;
1070
1071 case '0': case '1': case '2': case '3':
1072 case '4': case '5': case '6': case '7':
1073 return convert_oct (pfile, from, limit, tbuf, wide);
1074
1075 /* Various letter escapes. Get the appropriate host-charset
1076 value into C. */
/* These escapes stand for the character itself, so C is left as is.  */
1077 case '\\': case '\'': case '"': case '?': break;
1078
1079 case '(': case '{': case '[': case '%':
1080 /* '\(', etc, can be used at the beginning of a line in a long
1081 string split onto multiple lines with \-newline, to prevent
1082 Emacs or other text editors from getting confused. '\%' can
1083 be used to prevent SCCS from mangling printf format strings. */
1084 if (CPP_PEDANTIC (pfile))
1085 goto unknown;
1086 break;
1087
1088 case 'b': c = charconsts[1]; break;
1089 case 'f': c = charconsts[3]; break;
1090 case 'n': c = charconsts[4]; break;
1091 case 'r': c = charconsts[5]; break;
1092 case 't': c = charconsts[6]; break;
1093 case 'v': c = charconsts[7]; break;
1094
1095 case 'a':
1096 if (CPP_WTRADITIONAL (pfile))
1097 cpp_error (pfile, CPP_DL_WARNING,
1098 "the meaning of '\\a' is different in traditional C");
1099 c = charconsts[0];
1100 break;
1101
1102 case 'e': case 'E':
1103 if (CPP_PEDANTIC (pfile))
1104 cpp_error (pfile, CPP_DL_PEDWARN,
1105 "non-ISO-standard escape sequence, '\\%c'", (int) c);
1106 c = charconsts[2];
1107 break;
1108
1109 default:
1110 unknown:
/* Printable characters are shown literally, others as three octal digits.
   C keeps its value and is still converted below.  */
1111 if (ISGRAPH (c))
1112 cpp_error (pfile, CPP_DL_PEDWARN,
1113 "unknown escape sequence '\\%c'", (int) c);
1114 else
1115 cpp_error (pfile, CPP_DL_PEDWARN,
1116 "unknown escape sequence: '\\%03o'", (int) c);
1117 }
1118
1119 /* Now convert what we have to the execution character set. */
1120 if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
1121 cpp_errno (pfile, CPP_DL_ERROR,
1122 "converting escape sequence to execution character set");
1123
1124 return from + 1;
1125 }
Besides escape sequences like ‘\a’: sequence ‘\NNN’ denotes the character with octal value NNN (1 to 3 digits) and is handled by convert_oct; sequence ‘\xNN’ is the byte with hexadecimal value NN (1 to 2 digits), handled by convert_hex; sequence ‘\uNNNN’ is the character with hexadecimal value NNNN (4 digits) and ‘\UNNNNNNNN’ is the character with hexadecimal value NNNNNNNN (8 digits), both handled by convert_ucn.
For those characters not handled by the conversion handler in cpp_reader, emit_numeric_escape is invoked to record the character into the buffer.
/* Append the numeric value N to TBUF.

   When WIDE is true, N is split into wchar_precision / char_precision
   units of char_precision bits each and stored in the target byte order
   given by the bytes_big_endian option.  Otherwise N is stored as a single
   byte.  In both cases the buffer is enlarged by OUTBUF_BLOCK_SIZE first
   if the new data would not fit.  */
907 static void
908 emit_numeric_escape (cpp_reader *pfile, cppchar_t n, in cppcharset.c
909 struct _cpp_strbuf *tbuf, bool wide)
910 {
911 if (wide)
912 {
913 /* We have to render this into the target byte order, which may not
914 be our byte order. */
915 bool bigend = CPP_OPTION (pfile, bytes_big_endian);
916 size_t width = CPP_OPTION (pfile, wchar_precision);
917 size_t cwidth = CPP_OPTION (pfile, char_precision);
918 size_t cmask = width_to_mask (cwidth);
919 size_t nbwc = width / cwidth; /* Units written per wide character.  */
920 size_t i;
921 size_t off = tbuf->len;
922 cppchar_t c;
923
924 if (tbuf->len + nbwc > tbuf->asize)
925 {
926 tbuf->asize += OUTBUF_BLOCK_SIZE;
927 tbuf->text = xrealloc (tbuf->text, tbuf->asize);
928 }
929
/* Peel off the low CWIDTH bits on each pass; for a big-endian target the
   lowest unit goes to the highest offset.  */
930 for (i = 0; i < nbwc; i++)
931 {
932 c = n & cmask;
933 n >>= cwidth;
934 tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;
935 }
936 tbuf->len += nbwc;
937 }
938 else
939 {
940 if (tbuf->len + 1 > tbuf->asize)
941 {
942 tbuf->asize += OUTBUF_BLOCK_SIZE;
943 tbuf->text = xrealloc (tbuf->text, tbuf->asize);
944 }
945 tbuf->text[tbuf->len++] = n;
946 }
947 }
Note that the asize field of _cpp_strbuf gives the allocated size of the buffer, while the len field gives the number of bytes currently stored in it (a wide character contributes several bytes).
Further, in c_lex_with_flags, for the string constant case, the core of lex_string is also an invocation of cpp_interpret_string. We don’t go into it here.
If we haven’t exited above with an error, we arrive here; from now on no PCH file should be read in.
c_lex_with_flags (continue)
422 if (! no_more_pch)
423 {
424 no_more_pch = true;
425 c_common_no_more_pch ();
426 }
427
428 if (cpp_flags)
429 *cpp_flags = tok->flags;
430 return tok->type;
431 }
The routine just resets the valid_pch callback of parse_in, which will then trigger an error on any attempt to read in a PCH file again.
/* Disable further use of the valid_pch callback of parse_in.

   If the callback is currently set, it is cleared and
   host_hooks.gt_pch_use_address is called with a null address and a size
   of 0.  If it is already clear, nothing is done.  */
425 void
426 c_common_no_more_pch (void) in c-pch.c
427 {
428 if (cpp_get_callbacks (parse_in)->valid_pch)
429 {
430 cpp_get_callbacks (parse_in)->valid_pch = NULL;
431 host_hooks.gt_pch_use_address (NULL, 0, -1, 0);
432 }
433 }
Hook host_hooks also provides host-specific methods to get and use the memory space allocated for a PCH file. It is said that passing 0 as the second argument (size) to gt_pch_use_address allows the hook to free any static space that it might have allocated at load time. But this facility seems not to be implemented in this version yet.