正则表达式重复和贪婪算法(Pascal)

正则表达式重复和贪婪算法(Pascal)

用 FreePascal 实现的简易正则表达式字符串查找(约 180 行代码),支持通配符 . 以及量词 ? * + {n} {n,} {n,m},并支持对应的非贪婪(懒惰)形式 ?? *? +? {…}?。

program Project1;

// The code below relies on classes, the implicit Result variable, Exit(value),
// AnsiString-to-PChar casts and C-style compound assignment (+=). Make those
// requirements explicit instead of depending on command-line switches.
{$mode objfpc}{$H+}{$COPERATORS ON}

type

  // One element of a parsed pattern: a single pattern character plus its
  // repetition bounds. Nodes are chained through FNext into a singly linked list.
  TNode = class
    FNext       : TNode;    // next node in the chain (nil for the last node)
    FData       : Char;     // pattern character; the matcher treats '.' as "any character"
    FMinRepeats : Integer;  // minimum number of repetitions
    FMaxRepeats : Integer;  // maximum number of repetitions; 0 is used to mean "no upper bound"
    FRepeats    : Integer;  // repetitions consumed so far in the current match attempt
    FGreedy     : Boolean;  // whether this node matches greedily (True by default)
    constructor Create(Data: Char);
    destructor  Destroy; override;
  end;

// Builds a stand-alone node for the pattern character Data.
// A new node is not linked to anything, matches exactly once and is greedy.
constructor TNode.Create(Data: Char);
begin
  // Payload and link
  FData := Data;
  FNext := nil;

  // Default quantifier: exactly one occurrence
  FMinRepeats := 1;
  FMaxRepeats := 1;

  // Matching state and mode
  FGreedy  := True;
  FRepeats := 0;
end;

// Releases this node and, recursively, every node that follows it in the chain.
destructor TNode.Destroy;
begin
  // Free is a no-op on a nil reference, so no explicit nil test is needed.
  FNext.Free;
  inherited;
end;

type

  // Minimal regular-expression searcher. The pattern is parsed once in Create
  // into a chain of TNode objects; Exec/ExecNext then scan an input string.
  TRegExpr = class
  private
    FInput   : String;                // copy of the string being searched
    FCur     : PChar;                 // current scan position inside FInput
    FEnd     : PChar;                 // one past the last character of FInput
    FProgram : TNode;                 // first node of the parsed pattern (nil if Parse returned nil)
    FGreedy  : Boolean;               // global greedy flag passed to Exec
    FGroup   : array [0..1] of PChar; // [0] = start, [1] = end of the latest match

    // Matches Node and its successors starting at FCur; returns the end of the match or nil.
    function Match(Node: TNode): PChar;
    // Parses the pattern text in [PStr, PEnd) and returns the first node.
    function Parse(PStr, PEnd: PChar): TNode;
  public
    constructor Create(Pattern: String);
    destructor  Destroy; override;
    // Starts a search in Input and looks for the first match.
    function Exec(Input: String; Greedy: Boolean): Boolean;
    // Continues the search from the current position.
    function ExecNext: Boolean;
  end;

// Compiles Pattern into a node chain stored in FProgram.
// FProgram is whatever Parse returns, which may be nil.
constructor TRegExpr.Create(Pattern: String);
var
  PFirst: PChar;
begin
  PFirst   := PChar(Pattern);
  FProgram := Parse(PFirst, PFirst + Length(Pattern));
end;

// Releases the parsed pattern (the whole node chain) before destroying the object.
destructor TRegExpr.Destroy;
begin
  if Assigned(FProgram) then
    FProgram.Destroy;
  inherited;
end;

// Reads a run of decimal digits starting at PStr and returns its value.
// PStr is advanced past the digits; if there are none, 0 is returned and
// PStr is left unchanged. No overflow checking is performed.
function StrToInt(var PStr: PChar): Integer;
var
  Value: Integer;
begin
  Value := 0;
  while (PStr^ >= '0') and (PStr^ <= '9') do
  begin
    Value := Value * 10 + (Ord(PStr^) - Ord('0'));
    Inc(PStr);
  end;
  Result := Value;
end;

// Parses the pattern text in [PStr, PEnd) and returns the first node of the
// resulting chain. Returns nil for an empty pattern or a malformed {…}
// quantifier; in the latter case every node built so far is released.
//
// Supported syntax: . ? * + {n} {n,} {n,m} and the lazy forms ?? *? +? {…}?
//
// Notes:
//  - The first pattern character is always taken literally.
//  - The text is assumed to be NUL-terminated at PEnd (as produced by
//    PChar(String)); the digit scanner relies on that to stop.
//  - FMaxRepeats = 0 is the "no upper bound" marker, so {0} and {n,0} cannot
//    be represented and behave as unbounded.
function TRegExpr.Parse(PStr, PEnd: PChar): TNode;
var
  Node       : TNode;
  Quantified : Boolean;  // True once the current node has received a quantifier
begin
  if PStr >= PEnd then Exit(nil);
  Result := TNode.Create(PStr^);

  Node       := Result;
  Quantified := False;
  PStr += 1;
  while PStr < PEnd do begin
    case PStr^ of
      '?': begin
        // After a quantifier '?' selects lazy matching; otherwise it is the
        // "optional" quantifier. Tracking this explicitly (rather than testing
        // for min = max = 1) lets x{1}? be parsed as a lazy {1}.
        if Quantified then
          Node.FGreedy := False
        else begin
          Node.FMinRepeats := 0;
          Node.FMaxRepeats := 1;
          Quantified := True;
        end;
      end;
      '*': begin
        Node.FMinRepeats := 0;
        Node.FMaxRepeats := 0;
        Quantified := True;
      end;
      '+': begin
        Node.FMinRepeats := 1;
        Node.FMaxRepeats := 0;
        Quantified := True;
      end;
      '{': begin
        PStr += 1;
        Node.FMinRepeats := StrToInt(PStr);
        if PStr^ = ',' then begin
          PStr += 1;
          if PStr^ = '}' then
            Node.FMaxRepeats := 0                 // {n,}
          else
            Node.FMaxRepeats := StrToInt(PStr);   // {n,m}
        end else
          Node.FMaxRepeats := Node.FMinRepeats;   // {n}
        if (PStr >= PEnd) or (PStr^ <> '}') then begin
          // Malformed quantifier: release the partial chain instead of leaking it.
          Result.Free;
          Exit(nil);
        end;
        Quantified := True;
      end
      else begin
        // Ordinary character (or '.'): start a new node.
        Node.FNext := TNode.Create(PStr^);
        Node       := Node.FNext;
        Quantified := False;
      end;
    end;
    PStr += 1;
  end;
end;

// Tries to match Node and all following nodes against the input starting at
// FCur. Returns the position just past the match, or nil if there is none.
//
// FCur is used as a scratch cursor: after the call it is left wherever the
// last attempt stopped, so callers must rely on the returned pointer.
function TRegExpr.Match(Node: TNode): PChar;
var
  PLast, PMatched: PChar;
begin
  if Node = nil then Exit(FCur); // end of the chain: the whole pattern matched
  Node.FRepeats := 0;            // reset state left over from a previous attempt

  // Consume the mandatory repetitions; running out of input is a failure.
  while Node.FRepeats < Node.FMinRepeats do begin
    if FCur >= FEnd then Exit(nil);
    if (Node.FData <> '.') and (Node.FData <> FCur^) then Exit(nil);
    FCur += 1;
    Node.FRepeats += 1;
  end;

  // The match only counts if the following nodes match as well.
  Result := nil;

  // The continuation is attempted even when FCur = FEnd so that a match may
  // end exactly at the end of the input.
  while True do begin
    PLast := FCur;  // remember where this node's repetitions currently end
    PMatched := Match(Node.FNext);
    if PMatched <> nil then Result := PMatched;
    // Lazy mode (global or per node): stop at the first success. A failed
    // attempt must NOT stop a lazy node; it has to try one more repetition.
    if (Result <> nil) and ((not FGreedy) or (not Node.FGreedy)) then Exit;
    // Upper bound reached (0 means unbounded); >= also covers {n,m} with m < n.
    if (Node.FMaxRepeats > 0) and (Node.FRepeats >= Node.FMaxRepeats) then Exit;
    // Try one more repetition of this node.
    FCur := PLast;  // undo whatever the continuation consumed
    if FCur >= FEnd then Exit;
    if (Node.FData <> '.') and (Node.FData <> FCur^) then Exit;
    FCur += 1;
    Node.FRepeats += 1;
  end;
end;

// Starts a new search: stores Input and the global Greedy flag, positions the
// cursor at the beginning of the stored copy and looks for the first match.
// Returns True if a match was found.
function TRegExpr.Exec(Input: String; Greedy: Boolean): Boolean;
begin
  FGreedy := Greedy;

  // Keep our own reference to the text so the PChar cursors stay valid.
  FInput := Input;
  FCur   := PChar(FInput);
  FEnd   := FCur + Length(FInput);

  Exit(ExecNext);
end;

// Searches for the next match, starting at the current position FCur.
// On success FGroup[0]/FGroup[1] hold the start/end of the match and the
// cursor is moved past it, so repeated calls enumerate successive matches.
// On failure FGroup[1] is nil and False is returned.
// A nil program (empty or malformed pattern) never matches.
function TRegExpr.ExecNext: Boolean;
var
  PStart, PMatched: PChar;
begin
  PStart   := FCur;
  PMatched := nil;
  if FProgram <> nil then
    while FCur < FEnd do begin
      PMatched := Match(FProgram);
      if PMatched <> nil then Break;
      // No match at PStart: retry one character further.
      PStart += 1;
      FCur := PStart;
    end;
  FGroup[0] := PStart;
  FGroup[1] := PMatched;
  Result := PMatched <> nil;

  if Result then begin
    // Match leaves FCur wherever its last attempt stopped; resume right after
    // the reported match instead so no input is skipped.
    FCur := PMatched;
    // An empty match would be found again at the same place forever; step
    // over one character to guarantee progress (PStart < FEnd holds here).
    if PMatched = PStart then FCur += 1;
  end;
end;

// Test helpers

// Returns a copy of the characters in the half-open range [PStr, PEnd).
function SubStr(PStr, PEnd: PChar): String;
var
  Count: SizeInt;
begin
  Count := PEnd - PStr;
  SetString(Result, PStr, Count);
end;

// Prints a separator line followed by every match of Pattern in InputString,
// one match per line. Greedy selects the global matching mode.
procedure MatchAll(InputString, Pattern: String; Greedy: Boolean);
var
  Reg: TRegExpr;
begin
  WriteLn('------------------------------');
  Reg := TRegExpr.Create(Pattern);
  try
    if Reg.Exec(InputString, Greedy) then
      repeat
        WriteLn(SubStr(Reg.FGroup[0], Reg.FGroup[1]));
      until not Reg.ExecNext;
  finally
    // Release the matcher even if searching or printing raises.
    Reg.Free;
  end;
end;

const
  // Text searched by every demo below
  SampleText = 'Hello, World, Hallo, World!';

// Demo: the same text searched with greedy, globally lazy, bounded and
// per-quantifier lazy patterns.
begin
  MatchAll(SampleText, 'H.*o',     True);
  MatchAll(SampleText, 'H.*o',     False);
  MatchAll(SampleText, '.{2,5}l',  True);
  MatchAll(SampleText, '.{2,5}?l', True);
end.

你可能感兴趣的:(代码,正则表达式)