[源码来来源NFC](https://swtch.com/~rsc/regexp/nfa.c.txt)
/*
* Regular expression implementation.
* Supports only ( | ) * + ?. No escapes.
* Compiles to NFA and then simulates NFA
* using Thompson's algorithm.
*
* See also http://swtch.com/~rsc/regexp/ and
* Thompson, Ken. Regular Expression Search Algorithm,
* Communications of the ACM 11(6) (June 1968), pp. 419-422.
*
* Copyright (c) 2007 Russ Cox.
* Can be distributed under the MIT license, see bottom of file.
*/
#include
#include
#include
#include
/*
* Convert infix regexp re to postfix notation.
* Insert . as explicit concatenation operator.
* Cheesy parser, return static buffer.
*/
char*
re2post(char *re)
{
int nalt, natom; // nalt: 记录 | 符合个数。natom: 记录表达式中原子符号的个数。
static char buf[8000];
char *dst;
struct {
int nalt;
int natom;
} paren[100], *p; // 记录遇到左括号前的表达式状态信息
p = paren;
dst = buf;
nalt = 0;
natom = 0;
if(strlen(re) >= sizeof buf/2)
return NULL;
for(; *re; re++){
switch(*re){
case '(':
if(natom > 1){
//如果原子符号超过1个,要先加"."符号。
--natom;
*dst++ = '.';
}
if(p >= paren+100)
return NULL;
//记录原来状态
p->nalt = nalt;
p->natom = natom;
p++;
//重置括号中表达式状态
nalt = 0;
natom = 0;
break;
case '|':
if(natom == 0)
return NULL;
while(--natom > 0)
*dst++ = '.';
nalt++;//记录"|"状态
break;
case ')':
if(p == paren)
return NULL;
if(natom == 0)
return NULL;
// 根据记录的括号中表达式的状态,转换成后置式
while(--natom > 0)
*dst++ = '.';
for(; nalt > 0; nalt--)
*dst++ = '|';
// 重新置回遇到括号前的表达式状态
--p;
nalt = p->nalt;
natom = p->natom;
natom++;
break;
case '*':
case '+':
case '?':
if(natom == 0)
return NULL;
*dst++ = *re;
break;
default:
if(natom > 1){
// 原子符号超1个要加"."符号。
--natom;
*dst++ = '.';
}
*dst++ = *re;
natom++;
break;
}
}
if(p != paren)
return NULL;
while(--natom > 0) //加"."符号在后缀表达式
*dst++ = '.';
for(; nalt > 0; nalt--) // 加入"|"符号在后缀表达式
*dst++ = '|';
*dst = 0;
return buf;
}
/*
* Represents an NFA state plus zero or one or two arrows exiting.
* if c == Match, no arrows out; matching state.
* If c == Split, unlabeled arrows to out and out1 (if != NULL).
* If c < 256, labeled arrow with character c to out.
*/
enum
{
Match = 256,
Split = 257
};
typedef struct State State;
struct State
{
int c;
State *out;
State *out1;
int lastlist;
};
State matchstate = { Match }; /* matching state */
int nstate;
/* Allocate and initialize State */
State*
state(int c, State *out, State *out1)
{
State *s;
nstate++;
s = malloc(sizeof *s);
s->lastlist = 0;
s->c = c;
s->out = out;
s->out1 = out1;
return s;
}
/*
* A partially built NFA without the matching state filled in.
* Frag.start points at the start state.
* Frag.out is a list of places that need to be set to the
* next state for this fragment.
*/
typedef struct Frag Frag;
typedef union Ptrlist Ptrlist;
struct Frag
{
State *start;
Ptrlist *out;
};
/* Initialize Frag struct. */
Frag
frag(State *start, Ptrlist *out)
{
Frag n = { start, out };
return n;
}
/*
* Since the out pointers in the list are always
* uninitialized, we use the pointers themselves
* as storage for the Ptrlists.
*/
union Ptrlist
{
Ptrlist *next;
State *s;
};
/* Create singleton list containing just outp. */
Ptrlist*
list1(State **outp)
{
Ptrlist *l;
l = (Ptrlist*)outp;
l->next = NULL;
return l;
}
/* Patch the list of states at out to point to start. */
void
patch(Ptrlist *l, State *s)
{
Ptrlist *next;
for(; l; l=next){
next = l->next;
l->s = s;
}
}
/* Join the two lists l1 and l2, returning the combination. */
Ptrlist*
append(Ptrlist *l1, Ptrlist *l2)
{
Ptrlist *oldl1;
oldl1 = l1;
while(l1->next)
l1 = l1->next;
l1->next = l2;
return oldl1;
}
/*
* Convert postfix regular expression to NFA.
* Return start state.
*/
/* frag example:嵌套式的把状态保存起来。
* e1(start: c=a,out=e2.start,out1=0,lastlist=0.
* out = e2.out.)
* e2(start: c=b,out=0,out1=0,lastlist=0.
* out = 0).
* /
State*
post2nfa(char *postfix)
{
char *p;
Frag stack[1000], *stackp, e1, e2, e;
State *s;
// fprintf(stderr, "postfix: %s\n", postfix);
if(postfix == NULL)
return NULL;
#define push(s) *stackp++ = s
#define pop() *--stackp
stackp = stack;
for(p=postfix; *p; p++){
switch(*p){
default:
s = state(*p, NULL, NULL);
push(frag(s, list1(&s->out)));
break;
case '.': /* catenate */
e2 = pop();
e1 = pop();
patch(e1.out, e2.start); //e1.out 指针指向e2.start
push(frag(e1.start, e2.out)); // 把e1.start和e2.out组成frag结构体对象,压入栈
break;
case '|': /* alternate */
e2 = pop();
e1 = pop();
s = state(Split, e1.start, e2.start); //创建Split类型的state, out指e1.start, out1指e2.start.
push(frag(s, append(e1.out, e2.out)));//append , e1.out的next指向e2.out
break;
case '?': /* zero or one */
e = pop();
s = state(Split, e.start, NULL);
push(frag(s, append(e.out, list1(&s->out1))));
break;
case '*': /* zero or more */
e = pop();
s = state(Split, e.start, NULL);
patch(e.out, s);
push(frag(s, list1(&s->out1)));
break;
case '+': /* one or more */
e = pop();
s = state(Split, e.start, NULL);
patch(e.out, s);
push(frag(e.start, list1(&s->out1)));
break;
}
}
e = pop();
if(stackp != stack)
return NULL;
patch(e.out, &matchstate);
return e.start;
#undef pop
#undef push
}
typedef struct List List;
struct List
{
State **s;
int n;
};
List l1, l2;
static int listid;
void addstate(List*, State*);
void step(List*, int, List*);
/* Compute initial state list */
List*
startlist(State *start, List *l)
{
l->n = 0;
listid++;
addstate(l, start);
return l;
}
/* Check whether state list contains a match. */
int
ismatch(List *l)
{// 判断最后一个状态位是否是matchstate
int i;
for(i=0; in; i++)
if(l->s[i] == &matchstate)
return 1;
return 0;
}
/* Add s to l, following unlabeled arrows. */
void
addstate(List *l, State *s)
{//递归的方法把state存入List
if(s == NULL || s->lastlist == listid)
return;
s->lastlist = listid; //listid是指每个state的下标
if(s->c == Split){
/* follow unlabeled arrows */
addstate(l, s->out);
addstate(l, s->out1);
return;
}
l->s[l->n++] = s;
}
/*
* Step the NFA from the states in clist
* past the character c,
* to create next NFA state set nlist.
*/
void
step(List *clist, int c, List *nlist)
{
int i;
State *s;
listid++;
nlist->n = 0;
for(i=0; in; i++){ // clist->n表示当前正则表达式同一状态位,有n个状态.同时比较同位置的字符c
s = clist->s[i];
if(s->c == c) //字符匹配时,才获取下一个状态
addstate(nlist, s->out);//获取下一个状态放入nlist
}
}
/* Run NFA to determine whether it matches s. */
int
match(State *start, char *s)
{
int i, c;
List *clist, *nlist, *t;
clist = startlist(start, &l1);
nlist = &l2;
for(; *s; s++){ //单个字符比较
c = *s & 0xFF;
step(clist, c, nlist);
t = clist; clist = nlist; nlist = t; /* swap clist, nlist */
}
return ismatch(clist);
}
int
main(int argc, char **argv)
{
int i;
char *post;
State *start;
if(argc < 3){
fprintf(stderr, "usage: nfa regexp string...\n");
return 1;
}
post = re2post(argv[1]);
if(post == NULL){
fprintf(stderr, "bad regexp %s\n", argv[1]);
return 1;
}
start = post2nfa(post);
if(start == NULL){
fprintf(stderr, "error in post2nfa %s\n", post);
return 1;
}
l1.s = malloc(nstate*sizeof l1.s[0]); // l1: clist
l2.s = malloc(nstate*sizeof l2.s[0]);// l2: nlist
for(i=2; i