系列入口:编程实战:类C语法的编译型脚本解释器(系列)-CSDN博客
现在开始解释所有的设计思想和源代码。先从外围入手,最后会进入到一个巨大的解析语法的类。
本文介绍TOKEN和变量。
目录
一、TOKEN
1.1 定义Token类型
1.2 将脚本拆解为Token
1.3 TryGetKeyword识别关键字
1.4 TryGetNumber识别数值
1.5 其余TryGetXXXX略
二、变量
token是编程语言的基本单元,是最小单位,包括分隔符、标识符、操作符、关键字、数字、字符串字面值,不包括空白。在C和类C语法中,空白字符包括换行都会被忽略(但预处理程序并非如此,所以预处理是额外的东西)。
所有编译程序首先都会把源代码分解成一系列token,本代码也是如此。token的相关定义如下:
// Row width of the fixed-size name tables (predefined keywords, operators,
// type names). Other identifiers may be of any length.
enum { TOKEN_BUF_LEN = 128 };

// A lexical token: one element of the script once blanks have been removed.
struct Token
{
    // Token categories. The name table in ToString() must list them in this order.
    enum types { DELIMITER = 0, OPERATOR, IDENTIFIER, NUMBER, KEYWORD, STRING };

    types type;   // category of the token
    string text;  // token text
    size_t pos;   // position of the token in the source code

    Token(types _type, char const* _text, size_t _pos) :type(_type), text(_text), pos(_pos) {}

    // Formats the token as "<pos> <TYPE> <text>": position zero-padded to at
    // least 3 digits, type name left-aligned in a 12-character column.
    string ToString()const
    {
        STATIC_C const char typestr[][TOKEN_BUF_LEN] = { "DELIMITER","OPERATOR","IDENTIFIER","NUMBER","KEYWORD","STRING" };// must match `types`
        size_t const typeCount = sizeof(typestr) / sizeof(typestr[0]);

        // Never index past the table, even if `type` holds an out-of-range value.
        char const* typeName = (static_cast<size_t>(type) < typeCount) ? typestr[type] : "UNKNOWN";

        char buf[TOKEN_BUF_LEN * 2];
        // `pos` is a size_t: convert it explicitly so the argument matches the
        // conversion specifier (passing size_t for %ld is a format mismatch).
        sprintf(buf, "%03lu %-12s ", static_cast<unsigned long>(pos), typeName);

        string ret = buf;
        ret += text.c_str();
        return ret;
    }
};
每个Token包含类型和文本(对应源代码中的表现形式),同时为了调试需要,增加了pos记录在脚本中的位置。
作为编译的第一步,显然是将源代码分解为token。
这一步由一个类来实现:
class CTokens
{
public:
vector m_tokens;//解析出的语法元素
bool ToTokens(string& source)
{
string::size_type pos = 0;
while (GetToken(source, pos));
return true;
}
};
解析出的token保存在m_tokens中,而函数ToTokens()仅仅是循环调用GetToken()解析出一个一个token而已。
GetToken()是关键的主控函数:
// Reads the next token of `source` starting at `pos` and appends it to m_tokens.
//
// Blanks and `//` line comments in front of the token are skipped first. The
// token kinds are then tried in a fixed order: keyword, number, string,
// delimiter, operator, identifier. `pos` is left just behind the token.
//
// Returns true when a token was appended, false when nothing but blanks and
// comments remained. Text that matches no token kind is reported through
// CmyException::Throw.
bool GetToken(string& source, string::size_type& pos)
{
    string::size_type const length = source.size();

    // Skip blanks and line comments.
    while (pos < length)
    {
        if ('/' == source[pos] && pos + 1 < length && '/' == source[pos + 1])
        {
            // Line comment: drop everything up to and including the newline.
            pos += 2;
            while (pos < length && '\n' != source[pos])
            {
                ++pos;
            }
            if (pos < length)
            {
                ++pos;
            }
            continue;
        }
        if (!IsBlank(source[pos]))
        {
            break;
        }
        ++pos;
    }
    if (length == pos)
    {
        return false;
    }

    char const* const text = source.c_str();
    string token;
    Token::types type;
    if (TryGetKeyword(text, pos, token)) type = Token::KEYWORD;
    else if (TryGetNumber(text, pos, token)) type = Token::NUMBER;
    else if (TryGetString(text, pos, token)) type = Token::STRING;
    else if (TryGetDelimiter(text, pos, token)) type = Token::DELIMITER;
    else if (TryGetOperator(text, pos, token)) type = Token::OPERATOR;
    else if (TryGetIdentifier(text, pos, token)) type = Token::IDENTIFIER;
    else
    {
        CmyException::Throw(__FILE__, __LINE__, text, pos, "无法识别的符号");
        return false;
    }

    // `pos` is already behind the token, so step back by its length to get the start.
    m_tokens.push_back(Token(type, token.c_str(), pos - token.size()));
    return true;
}
这个函数的流程不复杂,先跳过空白和注释(仅支持//单行注释),然后依次尝试每种token,每种尝试如果成功会修改当前位置pos(通过引用参数),如果失败则不会修改pos。
TryGetXXXX这一组函数每个都不复杂,不过调用顺序有名堂,关键字是最优先的,这就保证关键字不可能被用作变量名。
TryGetKeyword复杂一些,由几个函数组合而成:
// Finds the longest entry of `headset` that `str` starts with.
//   str     - NUL-terminated text to examine
//   headset - table of fixed-width rows; the last row must be an empty string
//   ret     - receives the longest matching entry, or "" when nothing matches
// Returns true when at least one entry matched.
bool IsStartWith(char const* str, char const (*headset)[TOKEN_BUF_LEN], string& ret)const
{
    ret = "";
    for (char const (*entry)[TOKEN_BUF_LEN] = headset; (*entry)[0] != '\0'; ++entry)
    {
        size_t const entryLen = strlen(*entry);
        if (0 != strncmp(*entry, str, entryLen))
        {
            continue;
        }
        // Keep only a strictly longer match.
        if (ret.size() < entryLen)
        {
            ret = *entry;
        }
    }
    return !ret.empty();
}
bool IsKeyword(char const* str, string& key)const
{
STATIC_C char const buf[][TOKEN_BUF_LEN] = {
"asm","default","float","operator","static_cast","union",
"auto","delete","for","private","struct","unsigned",
"bool","do","friend","protected","switch","using",
"break","double","goto","public","template","virtual",
"case","dynamic_cast","if","register","this","void",
"catch","else","inline","reinterpret_cast","throw","volatile",
"char","enum","int","return","true","wchar_t",
"class","explicit","long","short","try","while",
"const","export","mutable","signed","typedef",
"const_cast","extern","namespace","sizeof","typeid",
"continue","false","new","static","typename","string",
""
};//必须以空串结尾
return IsStartWith(str, buf, key);
}
bool TryGetKeyword(char const* source, string::size_type& pos, string& ret)
{
string key;
string nextkey;
size_t keylen;
if (IsKeyword(source + pos, key))
{
keylen = key.size();
if ('\0' == source[pos + keylen] || IsBlank(source[pos + keylen]) || IsDelimiter(source[pos + keylen]) || IsOperator(source + pos + keylen, nextkey))
{
ret = key;
pos += keylen;
return true;
}
}
return false;
}
规则其实也很简单:以关键字开头并且其后是{空白、分隔符、操作符}则为一个关键字。
数值是以数字(或后面紧跟数字的小数点)开头的一串字符,同时还需要符合一些规则。代码里分两步进行:第一步识别出串,第二步则将串根据各种规则转换为数值:
// Tries to read the raw text of a numeric literal at source[pos].
//
// A literal starts with a digit, or with '.' directly followed by a digit, and
// continues over letters, digits, '.' and '_'. A '+' or '-' is also taken when
// it directly follows 'e'/'E', is followed by a digit and the literal has no
// "0x"/"0X" prefix, so that exponents such as "1e-5" stay in one token.
// No validation is done here; the text only has to look like a number.
//
// On success `ret` receives the text, `pos` moves past it and true is
// returned. On failure `ret` is "" and `pos` is unchanged.
bool TryGetNumber(char const* source, string::size_type& pos, string& ret)
{
    ret = "";
    char c = source[pos];
    bool const startsNumber = (c >= '0' && c <= '9')
        || ('.' == c && source[pos + 1] >= '0' && source[pos + 1] <= '9');
    if (!startsNumber)
    {
        return false;
    }

    // In a hex literal 'e'/'E' is a digit, so a sign behind it is an operator.
    bool const isHex = '0' == c && ('x' == source[pos + 1] || 'X' == source[pos + 1]);

    while ((c = source[pos]) != '\0')
    {
        bool const isBodyChar = (c >= '0' && c <= '9')
            || (c >= 'a' && c <= 'z')
            || (c >= 'A' && c <= 'Z')
            || '.' == c
            || '_' == c;

        bool isExponentSign = false;
        if (!isBodyChar && !isHex && ('+' == c || '-' == c) && !ret.empty())
        {
            char const prev = ret[ret.size() - 1];
            char const next = source[pos + 1];
            isExponentSign = ('e' == prev || 'E' == prev) && next >= '0' && next <= '9';
        }

        if (!isBodyChar && !isExponentSign)
        {
            break;
        }
        ret += c;
        ++pos;
    }
    return ret.size() != 0;
}
// Converts the text of a numeric literal into `var`.
//
// Text containing '.' - or 'e'/'E' when it is not a hex literal - is parsed
// with strtod() and stored as DOUBLE. Everything else is stored as LONG:
// "0x"/"0X" selects base 16, any other leading '0' base 8, otherwise base 10.
// Unparsed trailing text is accepted only if it is one of the suffixes
// f / l (DOUBLE) or u / l / i64 (LONG), compared with stricmp().
//
// Returns true on success. Otherwise the error is reported through
// CException::Throw.
// NOTE(review): another excerpt in this file reports errors through
// CmyException::Throw - confirm which class name is the intended one.
// NOTE(review): IsCharIn(c, s) is assumed to test whether `c` occurs in `s`.
bool NumberToVariable(char const* source, Variable& var)
{
    char* endptr;

    // Detect the hex prefix first: 'e'/'E' are hex digits, so "0x1e" must not
    // be taken for a floating-point literal.
    bool const isHex = '0' == source[0] && ('x' == source[1] || 'X' == source[1]);

    if (IsCharIn('.', source) || (!isHex && (IsCharIn('e', source) || IsCharIn('E', source))))
    {
        var.type = Variable::DOUBLE;
        var.dValue = strtod(source, &endptr);
    }
    else
    {
        var.type = Variable::LONG;
        long prefix = 0;
        int radix = 10;
        if (isHex)
        {
            radix = 16;
            prefix = 2;
        }
        else if ('0' == source[0])
        {
            radix = 8;
            prefix = 1;
        }
        var.lValue = strtol(source + prefix, &endptr, radix);
    }

    if ('\0' != *endptr)
    {
        // Something is left behind the number: only a known suffix is allowed.
        if ((Variable::DOUBLE == var.type && (0 == stricmp(endptr, "f") || 0 == stricmp(endptr, "l")))
            || (Variable::LONG == var.type && (0 == stricmp(endptr, "u") || 0 == stricmp(endptr, "l") || 0 == stricmp(endptr, "i64"))))
        {
            return true;
        }
        string str;
        str = "数值常量格式错误 ";
        str += endptr;
        CException::Throw(__FILE__, __LINE__, source, endptr - source, str.c_str());
        return false;
    }
    return true;
}
我非常痛恨无类型变量,比如JavaScript,所以我在这个脚本里面使用强类型。
变量类型做了简化,分为long、double和string。通过一个类存储所有的变量,代码如下:
// A script value: a long, a double or a string, selected by `type`.
// The three payload fields are stored side by side (no union); the field that
// matches `type` holds the value.
//
// Assignment keeps the current type of the target and converts the incoming
// value to it; only a NULLVARIABLE target adopts the type of the value.
struct Variable
{
    // Value categories. The name table in TypeStr() must list them in this order.
    enum types { NULLVARIABLE = 0, LONG, DOUBLE, STRING };

    types type;      // category of the stored value
    bool isconst;    // marks the value as a constant (not enforced by this struct)
    long lValue;     // payload when type is LONG
    double dValue;   // payload when type is DOUBLE
    string strValue; // payload when type is STRING

    Variable() :type(NULLVARIABLE), isconst(false), lValue(0), dValue(0.) {}

    bool isNull()const { return type == NULLVARIABLE; }
    bool isNumber()const { return type == LONG || type == DOUBLE; }
    bool isString()const { return type == STRING; }

    // Resets the variable to an untyped, non-constant, empty state.
    void clear()
    {
        type = NULLVARIABLE;
        isconst = false;
        lValue = 0;
        dValue = 0;
        strValue = "";
    }

    // Zeroes all payload fields but keeps `type` and `isconst`.
    void initvalue()
    {
        lValue = 0;
        dValue = 0;
        strValue = "";
    }

    // Assigns a long, converted to the current type (LONG if untyped).
    Variable& operator = (long v)
    {
        char buf[256];
        if (NULLVARIABLE == type)type = LONG;
        switch (type)
        {
        case LONG:lValue = v; break;
        case DOUBLE:dValue = v; break;
        case STRING:
            sprintf(buf, "%ld", v);
            strValue = buf;
            break;
        default:break;
        }
        return *this;
    }

    // Assigns a double, converted to the current type (DOUBLE if untyped).
    Variable& operator = (double v)
    {
        char buf[256];
        if (NULLVARIABLE == type)type = DOUBLE;
        switch (type)
        {
        case LONG:lValue = static_cast<long>(v); break;
        case DOUBLE:dValue = v; break;
        case STRING:
            gcvt(v, 200, buf);
            strValue = buf;
            break;
        default:break;
        }
        return *this;
    }

    // Assigns a string, converted to the current type (STRING if untyped).
    Variable& operator = (string const& v)
    {
        if (NULLVARIABLE == type)type = STRING;
        switch (type)
        {
        case LONG:lValue = atol(v.c_str()); break;
        case DOUBLE:dValue = atof(v.c_str()); break;
        case STRING:strValue = v; break;
        default:break;
        }
        return *this;
    }

    // Assigns the value of another variable, converted to the current type
    // (the type of `v` if untyped). `isconst` is not copied.
    Variable& operator = (Variable const& v)
    {
        if (NULLVARIABLE == type)type = v.type;
        switch (type)
        {
        case LONG:lValue = v.GetLong(); break;
        case DOUBLE:dValue = v.GetDouble(); break;
        case STRING:strValue = v.GetString(); break;
        default:break;
        }
        return *this;
    }

    // Unary minus: negates LONG and DOUBLE values, returns other types unchanged.
    Variable operator-()const
    {
        Variable tmp = *this;
        switch (type)
        {
        case LONG:tmp.lValue = -lValue; break;
        case DOUBLE:tmp.dValue = -dValue; break;
        default:break;
        }
        return tmp;
    }

    // Result type of combining operand types `a` and `b`.
    // Equal types stay as they are; LONG mixed with DOUBLE gives DOUBLE. With
    // eva=true the promotion is for an assignment and the left type `a` wins.
    // Every other combination gives NULLVARIABLE.
    static types typeUpgrade(types a, types b, bool eva = false)
    {
        if (NULLVARIABLE == a || NULLVARIABLE == b)return NULLVARIABLE;
        if (LONG == a && LONG == b)return LONG;
        if (STRING == a && STRING == b)return STRING;
        if (DOUBLE == a && DOUBLE == b)return DOUBLE;
        if (!eva)
        {
            if (DOUBLE == a && LONG == b)return DOUBLE;
            if (LONG == a && DOUBLE == b)return DOUBLE;
        }
        else
        {
            if (DOUBLE == a && LONG == b)return DOUBLE;
            if (LONG == a && DOUBLE == b)return LONG;
        }
        return NULLVARIABLE;
    }

    // Value as long: doubles are truncated, strings parsed with atol(), untyped gives 0.
    long GetLong()const
    {
        switch (type)
        {
        case LONG: return lValue;
        case DOUBLE: return static_cast<long>(dValue);
        case STRING: return atol(strValue.c_str());
        default:return 0;
        }
    }

    // Value as double: strings are parsed with atof(), untyped gives 0.
    double GetDouble()const
    {
        switch (type)
        {
        case LONG: return lValue;
        case DOUBLE: return dValue;
        case STRING: return atof(strValue.c_str());
        default:return 0.;
        }
    }

    // Truth value: non-zero numbers are true, everything else (strings included) is false.
    bool GetBool()const
    {
        switch (type)
        {
        case LONG: return 0 != lValue;
        case DOUBLE: return 0 != dValue;
        default:return false;
        }
    }

    // Value as string: numbers are formatted, untyped gives "".
    string GetString()const
    {
        char buf[256];
        switch (type)
        {
        case LONG: sprintf(buf, "%ld", lValue); return buf;
        case DOUBLE: gcvt(dValue, 200, buf); return buf;
        case STRING: return strValue;
        default:return "";
        }
    }

    // Arithmetic operators. The result starts as a copy of the left operand and
    // gets the type computed by typeUpgrade(); when that type is not handled by
    // the operator, the payload fields keep the values copied from the left operand.

    // Addition; for two strings this is concatenation.
    Variable operator+(Variable const& b)const
    {
        Variable tmp = *this;
        tmp.type = typeUpgrade(type, b.type);
        switch (tmp.type)
        {
        case LONG:tmp.lValue = GetLong() + b.GetLong(); break;
        case DOUBLE:tmp.dValue = GetDouble() + b.GetDouble(); break;
        case STRING:tmp.strValue = GetString() + b.GetString(); break;
        default:break;
        }
        return tmp;
    }

    // Subtraction (numbers only).
    Variable operator-(Variable const& b)const
    {
        Variable tmp = *this;
        tmp.type = typeUpgrade(type, b.type);
        switch (tmp.type)
        {
        case LONG:tmp.lValue = GetLong() - b.GetLong(); break;
        case DOUBLE:tmp.dValue = GetDouble() - b.GetDouble(); break;
        default:break;
        }
        return tmp;
    }

    // Multiplication (numbers only).
    Variable operator*(Variable const& b)const
    {
        Variable tmp = *this;
        tmp.type = typeUpgrade(type, b.type);
        switch (tmp.type)
        {
        case LONG:tmp.lValue = GetLong() * b.GetLong(); break;
        case DOUBLE:tmp.dValue = GetDouble() * b.GetDouble(); break;
        default:break;
        }
        return tmp;
    }

    // Division (numbers only). Throws the C string "div zero" when the divisor is zero.
    Variable operator/(Variable const& b)const
    {
        Variable tmp = *this;
        tmp.type = typeUpgrade(type, b.type);
        switch (tmp.type)
        {
        case LONG:
        {
            if (0 == b.GetLong())throw "div zero";
            tmp.lValue = GetLong() / b.GetLong(); break;
        }
        case DOUBLE:
        {
            if (0 == b.GetDouble())throw "div zero";
            tmp.dValue = GetDouble() / b.GetDouble(); break;
        }
        default:break;
        }
        return tmp;
    }

    // Remainder (LONG only). Throws the C string "mod zero" when the divisor is zero.
    Variable operator%(Variable const& b)const
    {
        Variable tmp = *this;
        tmp.type = typeUpgrade(type, b.type);
        switch (tmp.type)
        {
        case LONG:
        {
            if (0 == b.GetLong())throw "mod zero";
            tmp.lValue = GetLong() % b.GetLong(); break;
        }
        default:break;
        }
        return tmp;
    }

    // Comparison and logical operators. The result is always of type LONG and
    // holds 1 or 0 in lValue. The operands are compared in the type computed by
    // typeUpgrade().
    // NOTE(review): when the operand types cannot be combined (or the combined
    // type is not handled), no case runs and lValue keeps the value copied from
    // the left operand - confirm that callers never rely on that result.

    Variable operator>(Variable const& b)const
    {
        Variable tmp = *this;
        tmp.type = LONG;
        switch (typeUpgrade(type, b.type))
        {
        case LONG:tmp.lValue = GetLong() > b.GetLong(); break;
        case DOUBLE:tmp.lValue = GetDouble() > b.GetDouble(); break;
        case STRING:tmp.lValue = GetString() > b.GetString(); break;
        default:break;
        }
        return tmp;
    }

    Variable operator<(Variable const& b)const
    {
        Variable tmp = *this;
        tmp.type = LONG;
        switch (typeUpgrade(type, b.type))
        {
        case LONG:tmp.lValue = GetLong() < b.GetLong(); break;
        case DOUBLE:tmp.lValue = GetDouble() < b.GetDouble(); break;
        case STRING:tmp.lValue = GetString() < b.GetString(); break;
        default:break;
        }
        return tmp;
    }

    Variable operator>=(Variable const& b)const
    {
        Variable tmp = *this;
        tmp.type = LONG;
        switch (typeUpgrade(type, b.type))
        {
        case LONG:tmp.lValue = GetLong() >= b.GetLong(); break;
        case DOUBLE:tmp.lValue = GetDouble() >= b.GetDouble(); break;
        case STRING:tmp.lValue = GetString() >= b.GetString(); break;
        default:break;
        }
        return tmp;
    }

    Variable operator<=(Variable const& b)const
    {
        Variable tmp = *this;
        tmp.type = LONG;
        switch (typeUpgrade(type, b.type))
        {
        case LONG:tmp.lValue = GetLong() <= b.GetLong(); break;
        case DOUBLE:tmp.lValue = GetDouble() <= b.GetDouble(); break;
        case STRING:tmp.lValue = GetString() <= b.GetString(); break;
        default:break;
        }
        return tmp;
    }

    Variable operator==(Variable const& b)const
    {
        Variable tmp = *this;
        tmp.type = LONG;
        switch (typeUpgrade(type, b.type))
        {
        case LONG:tmp.lValue = GetLong() == b.GetLong(); break;
        case DOUBLE:tmp.lValue = GetDouble() == b.GetDouble(); break;
        case STRING:tmp.lValue = GetString() == b.GetString(); break;
        default:break;
        }
        return tmp;
    }

    Variable operator!=(Variable const& b)const
    {
        Variable tmp = *this;
        tmp.type = LONG;
        switch (typeUpgrade(type, b.type))
        {
        case LONG:tmp.lValue = GetLong() != b.GetLong(); break;
        case DOUBLE:tmp.lValue = GetDouble() != b.GetDouble(); break;
        case STRING:tmp.lValue = GetString() != b.GetString(); break;
        default:break;
        }
        return tmp;
    }

    // Logical AND (numbers only). Both operands are already evaluated when
    // this overloaded operator is called.
    Variable operator&&(Variable const& b)const
    {
        Variable tmp = *this;
        tmp.type = LONG;
        switch (typeUpgrade(type, b.type))
        {
        case LONG:tmp.lValue = GetLong() && b.GetLong(); break;
        case DOUBLE:tmp.lValue = GetDouble() && b.GetDouble(); break;
        default:break;
        }
        return tmp;
    }

    // Logical OR (numbers only). Both operands are already evaluated when
    // this overloaded operator is called.
    Variable operator||(Variable const& b)const
    {
        Variable tmp = *this;
        tmp.type = LONG;
        switch (typeUpgrade(type, b.type))
        {
        case LONG:tmp.lValue = GetLong() || b.GetLong(); break;
        case DOUBLE:tmp.lValue = GetDouble() || b.GetDouble(); break;
        default:break;
        }
        return tmp;
    }

    // Name of a value category. For an out-of-range value an error text is
    // returned from a function-local static buffer, which the next such call
    // overwrites.
    static char const* TypeStr(types type)
    {
        STATIC_C const char typestr[][TOKEN_BUF_LEN] = { "NULLVARIABLE","LONG","DOUBLE","STRING" };// must match `types`
        if (type >= 0 && type < 4)return typestr[type];
        else
        {
            static char buf[256];
            sprintf(buf, "错误的类型 %d", static_cast<int>(type));
            //cout << buf << endl; exit(0);
            return buf;
        }
    }

    // Describes the variable on one line: constness, type name and value,
    // indented by `level` * 4 spaces plus one.
    string ToString(long level = 0)const
    {
        char buf[256];// used for numeric formatting only
        string value;
        switch (type)
        {
        case LONG:sprintf(buf, "%ld", lValue); value = buf; break;
        case DOUBLE:gcvt(dValue, 200, buf); value = buf; break;
        // Take the string directly: copying it into the fixed-size buffer
        // would overflow for values of 256 bytes or more.
        case STRING:value = strValue.c_str(); break;
        default:value = "NULL"; break;
        }

        string prefix;
        prefix.assign(level * 4, ' ');

        string ret = prefix + " ";
        ret += (isconst ? "常量" : "变量");
        ret += "类型 ";
        ret += TypeStr(type);
        ret += " : ";
        ret += value;
        return ret;
    }
};
没有使用union,直接用类型和三个变量来存储,空间当然是有浪费的,但是没人知道啊。
重载了各种类型的相互操作,都很简单,只是繁琐。
成员变量:
类型 | 变量名 | 功能 |
enum types | type | 实际存储的类型 |
bool | isconst | 是否是常量,常量不允许修改 |
long | lValue | type为LONG时使用 |
double | dValue | type为DOUBLE时使用 |
string | strValue | type为STRING时使用 |
(待续)
(这里是结束)