GCC-3.4.6源代码学习笔记(80)

5.6.1.1.2.3.  字符、字符串常量

当预处理器碰到字符或字符串常量时,它忠实地记录其内容,但不去解释它,因为预处理器没有掌握其格式、编码的信息。这是这里的函数的任务。

 

c_lex_with_flags (continue)

 

357      case CPP_ATSIGN:

           

388      case CPP_OTHER:

389      {

390        cppchar_t c = tok->val.str.text[0];

391 

392        if (c == '"' || c == '\'')

393          error ("missing terminating %c character", (int) c);

394        else if (ISGRAPH (c))

395          error ("stray '%c' in program", (int) c);

396        else

397          error ("stray '\\%o' in program", (int) c);

398      }

399      goto retry;

400 

401      case CPP_CHAR:

402      case CPP_WCHAR:

403        *value = lex_charconst (tok);

404        break;

405 

406      case CPP_STRING:

407      case CPP_WSTRING:

408        return lex_string (tok, value, false);

409        break;

410 

411       /* These tokens should not be visible outside cpplib.  */

412      case CPP_HEADER_NAME:

413      case CPP_COMMENT:

414      case CPP_MACRO_ARG:

415        abort ();

416 

417      default:

418        *value = NULL_TREE;

419        break;

420    }

 

对于字符常量,其树节点仍旧是INTEGER_CST,不过其内容必须是从原字符翻译过来的目标机器的字符。函数lex_charconst协助完成这样的转换并创建该节点。

 

727  static tree

728  lex_charconst (const cpp_token *token)                                                       in c-lex.c

729  {

730    cppchar_t result;

731    tree type, value;

732    unsigned int chars_seen;

733    int unsignedp;

734 

735    result = cpp_interpret_charconst (parse_in, token,

736                               &chars_seen, &unsignedp);

737 

738   /* Cast to cppchar_signed_t to get correct sign-extension of RESULT

739      before possibly widening to HOST_WIDE_INT for build_int_2.  */

740    if (unsignedp || (cppchar_signed_t) result >= 0)

741      value = build_int_2 (result, 0);

742    else

743      value = build_int_2 ((cppchar_signed_t) result, -1);

744 

745    if (token->type == CPP_WCHAR)

746      type = wchar_type_node;

747    /* In C, a character constant has type 'int'.

748      In C++ 'char', but multi-char charconsts have type 'int'.  */

749    else if (!c_dialect_cxx () || chars_seen > 1)

750      type = integer_type_node;

751    else

752      type = char_type_node;

753 

754    TREE_TYPE (value) = type;

755    return value;

756  }

 

在前面我们看到,GCC可以在使用EBCDIC编码的宿主机器(host)上运行,同时GCC可以为目标机器提供宽字符支持(可选的有UTF-8、UTF-16LE、UTF-16BE、UTF32_LE、UTF32_BE)。

 

1328 cppchar_t

1329 cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,       in cppcharset.c

1330               unsigned int *pchars_seen, int *unsignedp)

1331 {

1332   cpp_string str = { 0, 0 };

1333   bool wide = (token->type == CPP_WCHAR);

1334   cppchar_t result;

1335

1336   /* an empty constant will appear as L'' or '' */

1337   if (token->val.str.len == (size_t) (2 + wide))

1338   {

1339     cpp_error (pfile, CPP_DL_ERROR, "empty character constant");

1340     return 0;

1341   }

1342   else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, wide))

1343     return 0;

1344

1345   if (wide)

1346     result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp);

1347   else

1348     result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);

1349

1350   if (str.text != token->val.str.text)

1351     free ((void *)str.text);

1352

1353   return result;

1354 }

 

在上面1334行,cppchar_t具有32比特大小,它足够保存unicode(UTF-32)的字符。为了把字符从预处理器传到词法分析器,使用了结构体cpp_string。注意到在这个结构体中预处理器返回的字符是一个字节数组。

 

158  struct cpp_string                                                                                        in cpplib.h

159  {

160    unsigned int len;

161    const unsigned char *text;

162  };

 

这个字符被词法分析器处理后,它可能占据不止一个字节。需要一个新的结构体用于转换后的字符。

 

97    struct _cpp_strbuf                                                                               in cppcharset.c

98    {

99      uchar *text;

100    size_t asize;

101    size_t len;

102  };

 

GCC定义了UTF编码之间的转换函数,而库函数iconv提供了UTF与EBCDIC编码之间的转换。在这些例程的协助下,cpp_interpret_string可以把from中的输入转换为具有期望格式的to。

 

1133 bool

1134 cpp_interpret_string (cpp_reader *pfile, const cpp_string *from,                    in cppcharset.c

1135              size_t count, cpp_string *to, bool wide)

1136 {

1137   struct _cpp_strbuf tbuf;

1138   const uchar *p, *base, *limit;

1139   size_t i;

1140   struct cset_converter cvt

1141     = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;

1142

1143   tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);

1144   tbuf.text = xmalloc (tbuf.asize);

1145   tbuf.len = 0;

1146

1147   for (i = 0; i < count; i++)

1148   {

1149     p = from[i].text;

1150     if (*p == 'L') p++;

1151     p++; /* Skip leading quote.  */

1152     limit = from[i].text + from[i].len - 1; /* Skip trailing quote.  */

1153

1154     for (;;)

1155     {

1156       base = p;

1157       while (p < limit && *p != '\\')

1158         p++;

1159       if (p > base)

1160       {

1161         /* We have a run of normal characters; these can be fed

1162           directly to convert_cset.  */

1163         if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))

1164           goto fail;

1165       }

1166       if (p == limit)

1167         break;

1168

1169       p = convert_escape (pfile, p + 1, limit, &tbuf, wide);

1170     }

1171   }

1172   /* NUL-terminate the 'to' buffer and translate it to a cpp_string

1173     structure.  */

1174   emit_numeric_escape (pfile, 0, &tbuf, wide);

1175   tbuf.text = xrealloc (tbuf.text, tbuf.len);

1176   to->text = tbuf.text;

1177   to->len = tbuf.len;

1178   return true;

1179

1180 fail:

1181   cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set");

1182   free (tbuf.text);

1183   return false;

1184 }

 

不过转义序列是这些转换函数中的例外,因为预处理器把字符串中的转义序列也忠实地记录了下来,例如,\a记为“\a”(即反斜杠后跟字母a两个字符)。需要先按底层的宿主字符集重新解释它们,然后再转换至目标字符集。

 

1043 static const uchar *

1044 convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit, in cppcharset.c

1045        struct _cpp_strbuf *tbuf, bool wide)

1046 {

1047  /* Values of \a \b \e \f \n \r \t \v respectively.  */

1048 #if HOST_CHARSET == HOST_CHARSET_ASCII

1049   static const uchar charconsts[] = {  7,  8, 27, 12, 10, 13,  9, 11 };

1050 #elif HOST_CHARSET == HOST_CHARSET_EBCDIC

1051   static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13,  5, 11 };

1052 #else

1053 #error "unknown host character set"

1054 #endif

1055

1056   uchar c;

1057   struct cset_converter cvt

1058     = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;

1059

1060   c = *from;

1061   switch (c)

1062   {

1063     /* UCNs, hex escapes, and octal escapes are processed separately.  */

1064     case 'u': case 'U':

1065       return convert_ucn (pfile, from, limit, tbuf, wide);

1066

1067     case 'x':

1068       return convert_hex (pfile, from, limit, tbuf, wide);

1069       break;

1070

1071     case '0':  case '1':  case '2':  case '3':

1072     case '4':  case '5':  case '6':  case '7':

1073       return convert_oct (pfile, from, limit, tbuf, wide);

1074

1075     /* Various letter escapes. Get the appropriate host-charset

1076       value into C.  */

1077     case '\\': case '\'': case '"': case '?': break;

1078

1079     case '(': case '{': case '[': case '%':

1080       /* '\(', etc, can be used at the beginning of a line in a long

1081         string split onto multiple lines with \-newline, to prevent

1082         Emacs or other text editors from getting confused. '\%' can

1083         be used to prevent SCCS from mangling printf format strings.  */

1084       if (CPP_PEDANTIC (pfile))

1085   goto unknown;

1086       break;

1087

1088     case 'b': c = charconsts[1];  break;

1089     case 'f': c = charconsts[3];  break;

1090     case 'n': c = charconsts[4];  break;

1091     case 'r': c = charconsts[5];  break;

1092     case 't': c = charconsts[6];  break;

1093     case 'v': c = charconsts[7];  break;

1094

1095     case 'a':

1096       if (CPP_WTRADITIONAL (pfile))

1097         cpp_error (pfile, CPP_DL_WARNING,

1098                  "the meaning of '\\a' is different in traditional C");

1099       c = charconsts[0];

1100       break;

1101

1102     case 'e': case 'E':

1103       if (CPP_PEDANTIC (pfile))

1104         cpp_error (pfile, CPP_DL_PEDWARN,

1105                  "non-ISO-standard escape sequence, '\\%c'", (int) c);

1106       c = charconsts[2];

1107       break;

1108

1109     default:

1110     unknown:

1111       if (ISGRAPH (c))

1112         cpp_error (pfile, CPP_DL_PEDWARN,

1113                  "unknown escape sequence '\\%c'", (int) c);

1114       else

1115         cpp_error (pfile, CPP_DL_PEDWARN,

1116                 "unknown escape sequence: '\\%03o'", (int) c);

1117   }

1118

1119   /* Now convert what we have to the execution character set.  */

1120   if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))

1121     cpp_errno (pfile, CPP_DL_ERROR,

1122              "converting escape sequence to execution character set");

1123

1124   return from + 1;

1125 }

 

除了类似‘\a’的转义序列外,具有八进制数值NNN(1到3个数字)的序列‘\NNN’由convert_oct处理;具有16进制值NN(通常为1到2个数字,不过C标准并不限制16进制转义序列的数字个数)的序列‘\xNN’由convert_hex处理;具有16进制值NNNN(4个数字)的序列‘\uNNNN’,及具有16进制值NNNNNNNN(8个数字)的序列‘\UNNNNNNNN’由convert_ucn处理。

对于那些在cpp_reader中的转换句柄所不能处理的字符,emit_numeric_escape被调用将这些字符记录到缓存。

 

907  static void

908  emit_numeric_escape (cpp_reader *pfile, cppchar_t n,                           in cppcharset.c

909              struct _cpp_strbuf *tbuf, bool wide)

910  {

911     if (wide)

912    {

913      /* We have to render this into the target byte order, which may not

914        be our byte order.  */

915      bool bigend = CPP_OPTION (pfile, bytes_big_endian);

916      size_t width = CPP_OPTION (pfile, wchar_precision);

917      size_t cwidth = CPP_OPTION (pfile, char_precision);

918      size_t cmask = width_to_mask (cwidth);

919      size_t nbwc = width / cwidth;

920      size_t i;

921      size_t off = tbuf->len;

922      cppchar_t c;

923 

924      if (tbuf->len + nbwc > tbuf->asize)

925      {

926        tbuf->asize += OUTBUF_BLOCK_SIZE;

927        tbuf->text = xrealloc (tbuf->text, tbuf->asize);

928      }

929 

930      for (i = 0; i < nbwc; i++)

931      {

932        c = n & cmask;

933        n >>= cwidth;

934        tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;

935      }

936      tbuf->len += nbwc;

937    }

938    else

939    {

940      if (tbuf->len + 1 > tbuf->asize)

941      {

942        tbuf->asize += OUTBUF_BLOCK_SIZE;

943        tbuf->text = xrealloc (tbuf->text, tbuf->asize);

944      }

945      tbuf->text[tbuf->len++] = n;

946    }

947  }

 

注意到_cpp_strbuf的asize域告知缓存的大小,而len域则告知缓存中已使用的字节数(对于宽字符,每个字符占用nbwc个字节)。

接下来在c_lex_with_flags里,对于字符串常量的情形,lex_string的核心也是调用cpp_interpret_string。这里我们跳过它。

5.6.1.1.3.            完成预处理符号提取

如果我们没有在上面因错误退出,那么到达这里,不会再有PCH文件需要读入了。

 

c_lex_with_flags (continue)

 

422    if (! no_more_pch)

423    {

424      no_more_pch = true;

425      c_common_no_more_pch ();

426    }

427 

428    if (cpp_flags)

429      *cpp_flags = tok->flags;

430    return tok->type;

431  }

 

函数c_common_no_more_pch把parse_in的回调句柄valid_pch重置为NULL,这样如果之后再试图读入PCH文件,将导致错误。

 

425  void

426  c_common_no_more_pch (void)                                                                 in c-pch.c

427  {

428    if (cpp_get_callbacks (parse_in)->valid_pch)

429    {

430      cpp_get_callbacks (parse_in)->valid_pch = NULL;

431      host_hooks.gt_pch_use_address (NULL, 0, -1, 0);

432    }

433  }

 

钩子host_hooks提供了特定于宿主机器的方法来获取为PCH文件分配的内存空间。把0作为gt_pch_use_address的第二个参数size传入,将允许钩子释放在加载期间可能分配的静态内存。但在当前版本中,该功能尚未实现。

 

你可能感兴趣的:(c,String,struct,Integer,token,character)