lazarus/components/jcf2/Parse/BuildTokenList.pas

{(*}
(*------------------------------------------------------------------------------
 Delphi Code formatter source code

The Original Code is BuildTokenList.pas, released April 2000.
The Initial Developer of the Original Code is Anthony Steele.
Portions created by Anthony Steele are Copyright (C) 1999-2008 Anthony Steele.
All Rights Reserved.
Contributor(s): Anthony Steele.

The contents of this file are subject to the Mozilla Public License Version 1.1
(the "License"). you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.mozilla.org/NPL/

Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied.
See the License for the specific language governing rights and limitations
under the License.

Alternatively, the contents of this file may be used under the terms of
the GNU General Public License Version 2 or later (the "GPL")
See http://www.gnu.org/licenses/gpl.html
------------------------------------------------------------------------------*)
{*)}

unit BuildTokenList;

{ AFS 29 Nov 1999
 converts the input string of chars into a list of tokens
 This is the lexical analysis phase of the parsing

 2014.11.02 ~bk Added lexing of binary constants (ex. -> const a=%101001;)
 2017.05.17 ~pktv Added lexing of octal constants (ex. -> const a=&777;)
}

{$I JcfGlobal.inc}

interface

uses
  { local }
  Tokens, SourceToken, SourceTokenList;

type

  { TBuildTokenList }

  TBuildTokenList = class(TObject)
  private
    { property implementation }
    fsSourceCode: String;
    fsFileName: string;

    { woker procs }
    fiCurrentIndex: integer;

    procedure SetSourceCode(const Value: String);

    function Current: Char;
    function CurrentChars(const piCount: integer): String;
    function ForwardChar(const piOffset: integer): Char;
    function ForwardChars(const piOffset, piCount: integer): String;
    procedure Consume(const piCount: integer = 1);
    function EndOfFile: boolean;
    function EndOfFileAfter(const piChars: integer): boolean;

    { implementation of GetNextToken }
    function TryReturn(const pcToken: TSourceToken): boolean;

    function TryCurlyComment(const pcToken: TSourceToken): boolean;
    function TrySlashComment(const pcToken: TSourceToken): boolean;
    function TryBracketStarComment(const pcToken: TSourceToken): boolean;

    function TryWhiteSpace(const pcToken: TSourceToken): boolean;
    function TryLiteralString(const pcToken: TSourceToken;
      const pcDelimiter: Char): boolean;

    function TryNumber(const pcToken: TSourceToken): boolean;
    function TryHexNumber(const pcToken: TSourceToken): boolean;
    function TryBinNumber(const pcToken: TSourceToken): boolean; // ~bk 14.11.01
    function TryOctNumber(const pcToken: TSourceToken): boolean; // ~pktv 17.05.19

    function TryDots(const pcToken: TSourceToken): boolean;

    function TryAssign(const pcToken: TSourceToken): boolean;

    function TrySingleCharToken(const pcToken: TSourceToken): boolean;

    function TryPunctuation(const pcToken: TSourceToken): boolean;
    function TryWord(const pcToken: TSourceToken): boolean;

    function GetNextToken: TSourceToken;

  protected

  public
    constructor Create;
    destructor Destroy; override;

    function BuildTokenList: TSourceTokenList;

    property SourceCode: String read fsSourceCode write SetSourceCode;
    property FileName: string read fsFileName write fsFileName;
  end;


implementation

uses
  Forms, SysUtils,
  { local }
  JcfStringUtils, JcfSystemUtils,
  JcfRegistrySettings;

const
  CurlyLeft =  '{'; //widechar(123);
  CurlyRight = '}'; //widechar(125);

function CheckMultiByte(const pcChar: char): boolean;
begin
  Result := False;
  if GetRegSettings.CheckMultiByteChars then
    Result := IsMultiByte(pcChar);
end;

{ TBuildTokenList }

constructor TBuildTokenList.Create;
begin
  inherited;
  SourceCode := '';
end;

destructor TBuildTokenList.Destroy;
begin
  inherited;
end;

procedure TBuildTokenList.SetSourceCode(const Value: String);
begin
  fsSourceCode := Value;
  // reset the index
  fiCurrentIndex := 1;
end;

function TBuildTokenList.GetNextToken: TSourceToken;
var
  lcNewToken: TSourceToken;

  procedure DoAllTheTries;
  begin
    { first look for return }
    if TryReturn(lcNewToken) then
      exit;
    { comments }
    if TryCurlyComment(lcNewToken) then
      exit;
    if TrySlashComment(lcNewToken) then
      exit;
    if TryBracketStarComment(lcNewToken) then
      exit;
    { the rest }
    if TryWhiteSpace(lcNewToken) then
      exit;
    if TryLiteralString(lcNewToken, NativeSingleQuote) then
      exit;
    if TryLiteralString(lcNewToken, NativeDoubleQuote) then
      exit;

    if TryWord(lcNewToken) then
      exit;
    if TryNumber(lcNewToken) then
      exit;
    if TryHexNumber(lcNewToken) then
      exit;
    if TryBinNumber(lcNewToken) then // ~bk 2014.11.01
      exit;
    if TryOctNumber(lcNewToken) then // ~pktv 2017.05.19
      exit;

    if TryDots(lcNewToken) then
      exit;

    { attempt assign before colon }
    if TryAssign(lcNewToken) then
      exit;

    if TryPunctuation(lcNewToken) then
      exit;

    if TrySingleCharToken(lcNewToken) then
      exit;

    { default }
    lcNewToken.TokenType  := ttUnknown;
    lcNewToken.SourceCode := Current;
    Consume(1);
  end;

begin
  if EndOfFile then
    Result := nil
  else
  begin
    lcNewToken := TSourceToken.Create;
    lcNewToken.FileName := FileName;
    DoAllTheTries;

    lcNewToken.WordType := WordTypeOfToken(lcNewToken.TokenType);
    Result := lcNewToken;
  end;
end;

{-------------------------------------------------------------------------------
  worker fns for GetNextComment }

function TBuildTokenList.TryBracketStarComment(const pcToken: TSourceToken): boolean;
var
  liCommentLength: integer;

  procedure MoveToCommentEnd;
  begin
    { comment is ended by *) or by EOF (bad source) }
    while True do
    begin
      if EndOfFileAfter(liCommentLength) then
        break;

      if CheckMultiByte(ForwardChar(liCommentLength)) then
      begin
        liCommentLength := liCommentLength + 2;
        continue;
      end;

      if ForwardChars(liCommentLength, 2) = '*)' then
        break;

      inc(liCommentLength);
    end;

    // include the comment end
    if not EndOfFileAfter(liCommentLength) and (ForwardChars(liCommentLength, 2) = '*)') then
      inc(liCommentLength, 2);
  end;


begin
  Result := False;
  if not (Current = '(') then
    exit;


  if CurrentChars(2) <> '(*' then
    exit;

  { if the comment starts with (*) that is not the end of the comment }
  liCommentLength := 2;

  MoveToCommentEnd;

  pcToken.TokenType := ttComment;
  pcToken.CommentStyle := eBracketStar;
  pcToken.SourceCode := CurrentChars(liCommentLength);
  Consume(liCommentLength);

  Result := True;
end;

function TBuildTokenList.TryCurlyComment(const pcToken: TSourceToken): boolean;
var
  liCommentLength: integer;
  lNestedDepth: integer;
  procedure MoveToCommentEnd;
  var
    lForwardChar: char;
  begin
    { comment is ended by (close-curly AND lNestedDepth=0) or by EOF (bad source) }
    while True do
    begin
      if EndOfFileAfter(liCommentLength) then
        break;
      lForwardChar:=ForwardChar(liCommentLength);
      if CheckMultiByte(lForwardChar) then
      begin
        liCommentLength := liCommentLength + 2;
        continue;
      end;
      Inc(liCommentLength);
      if lForwardChar = CurlyLeft then
        Inc(lNestedDepth)
      else if lForwardChar = CurlyRight then begin
        Dec(lNestedDepth);
        if (lNestedDepth = 0) then
          break;
      end;
    end;
  end;

begin
  Result := False;
  if Current <> '{' then
    exit;

  pcToken.TokenType  := ttComment;
  lNestedDepth := 1;
  liCommentLength := 1;

  { compiler directive are the comments with a $ just after the open-curly
    this is always the case }
  if ForwardChar(1) = '$' then
    pcToken.CommentStyle := eCompilerDirective
  else
    pcToken.CommentStyle := eCurlyBrace;

  MoveToCommentEnd;

  pcToken.SourceCode := CurrentChars(liCommentLength);
  Consume(liCommentLength);

  Result := True;
end;

function TBuildTokenList.TrySlashComment(const pcToken: TSourceToken): boolean;
var
  liCommentLength: integer;

  procedure MoveToCommentEnd;
  begin
    { comment is ended by return or by EOF (bad source) }
    while True do
    begin
      if EndOfFileAfter(liCommentLength) then
        break;

      if CheckMultiByte(ForwardChar(liCommentLength)) then
      begin
        liCommentLength := liCommentLength + 2;
        continue;
      end;

      if CharIsReturn(ForwardChar(liCommentLength)) then
        break;

      inc(liCommentLength);
    end;
  end;

begin
  Result := False;
  if Current <> '/' then
    exit;

  { until end of line or file }
  if CurrentChars(2) <> '//' then
    exit;

  liCommentLength := 2;

  MoveToCommentEnd;

  pcToken.TokenType := ttComment;
  pcToken.CommentStyle := eDoubleSlash;
  pcToken.SourceCode := CurrentChars(liCommentLength);
  Consume(liCommentLength);

  Result := True;
end;


function TBuildTokenList.TryReturn(const pcToken: TSourceToken): boolean;
var
  chNext: Char;
begin
  Result := False;
  if not CharIsReturn(Current) then
    exit;

  Result := True;
  pcToken.TokenType  := ttReturn;
  pcToken.SourceCode := Current;
  Consume;
  if fiCurrentIndex > Length(fsSourceCode) then
    exit;

  { concat the next return char if it is not the same
    This will recognise <cr><lf> or <lf><cr>, but not <cr><cr> }
  chNext := Current;
  if CharIsReturn(chNext) and (chNext <> pcToken.SourceCode[1]) then
  begin
    pcToken.SourceCode := pcToken.SourceCode + chNext;
    Consume;
  end;
end;

{ complexities like 'Hello'#32'World' and #$12'Foo' are assemlbed in the parser }
function TBuildTokenList.TryLiteralString(const pcToken: TSourceToken;
  const pcDelimiter: Char): boolean;
begin
  Result := False;

  if Current = pcDelimiter then
  begin
    Result := True;
    { read the opening ' }
    pcToken.SourceCode := pcToken.SourceCode + Current;
    Consume;

    { read until the close ' }
    repeat
      if Current = #0 then
        break;
      if CharIsReturn(Current) then
        Raise Exception.Create('Unterminated string: ' + pcToken.SourceCode);

      { two quotes in a row are still part of the string }
      if (Current = pcDelimiter) then
      begin
        { two consecutive quote chars inside string, read them }
        if (ForwardChar(1) = pcDelimiter) then
        begin
          pcToken.SourceCode := pcToken.SourceCode + CurrentChars(2);
          Consume(2);
        end
        else
        begin
          { single quote char ends string }
          pcToken.SourceCode := pcToken.SourceCode + Current;
          Consume;
          break;
        end
      end
      else
      begin
        { normal char, read it }
        pcToken.SourceCode := pcToken.SourceCode + Current;
        Consume;
      end;

    until False;

    pcToken.TokenType := ttQuotedLiteralString;
  end;
end;


function TBuildTokenList.TryWord(const pcToken: TSourceToken): boolean;
begin
  Result := False;

  if not CharIsWordChar(Current) then
    exit;

  pcToken.SourceCode := Current;
  Consume;

  { concat any subsequent word chars }
  while CharIsWordChar(Current) or CharIsDigit(Current) do
  begin
    pcToken.SourceCode := pcToken.SourceCode + Current;
    Consume;
  end;

  { try to recognise the word as built in }
  pcToken.TokenType := TypeOfToken(pcToken.SourceCode);
  if pcToken.TokenType = ttUnknown then
    pcToken.TokenType := ttIdentifier;

  Result := True;
end;

function TBuildTokenList.TryWhiteSpace(const pcToken: TSourceToken): boolean;
begin
  Result := False;
  if not CharIsWhiteSpaceNoReturn(Current) then
    exit;

  pcToken.TokenType  := ttWhiteSpace;
  pcToken.SourceCode := Current;
  Consume;

  { concat any subsequent return chars }
  while CharIsWhiteSpaceNoReturn(Current) do
  begin
    pcToken.SourceCode := pcToken.SourceCode + Current;
    Consume;
  end;

  Result := True;
end;

function TBuildTokenList.TryAssign(const pcToken: TSourceToken): boolean;
var
  TwoChars: String;
begin
  Result := False;

  if not (CharInSet(Char(Current), [':', '+', '-', '*', '/'])) then
    exit;

  TwoChars := CurrentChars(2);

  if TwoChars = ':=' then
    pcToken.TokenType := ttAssign
  else
  if TwoChars = '+=' then
    pcToken.TokenType := ttPlusAssign
  else
  if TwoChars = '-=' then
    pcToken.TokenType := ttMinusAssign
  else
  if TwoChars = '*=' then
    pcToken.TokenType := ttTimesAssign
  else
  if TwoChars = '/=' then
    pcToken.TokenType := ttFloatDivAssign
  else
    exit;

  pcToken.SourceCode := TwoChars;
  Consume(2);

  Result := True;
end;

function TBuildTokenList.TryNumber(const pcToken: TSourceToken): boolean;
var
  lbHasDecimalSep: boolean;
begin
  Result := False;

  { recognise a number -
   they don't start with a '.' but may contain one

   a minus sign in front is considered unary operator not part of the number
   this is bourne out by the compiler considering
    '- 0.3' and -0.3' to be the same value
    and -.3 is not legal at all }

  { first one must be a digit }
  if not CharIsDigit(Current) then
    exit;

  if (Current = '.') or (Current = '-') then
    exit;

  pcToken.TokenType  := ttNumber;
  pcToken.SourceCode := Current;
  Consume;
  lbHasDecimalSep := False;

  { concat any subsequent number chars
    only one decimal seperator allowed

    also NB that two dots can be array range, as in
    var foo = array[1..100] of integer;
    ie one dat = decimal
    two dots = end of number
  }
  while CharIsDigit(Current) or (Current = '.') do
  begin
    // have we got to the dot?
    if (Current = '.') then
    begin
      if CurrentChars(2) = '..' then
        break;

      if lbHasDecimalSep then
        // oops! a second one
        break
      else
        lbHasDecimalSep := True;
    end;

    pcToken.SourceCode := pcToken.SourceCode + Current;
    Consume;
  end;

  { scientific notation suffix, eg 3e2 = 30, 2.1e-3 = 0.0021 }

  { check for a trailing 'e' }
  if CharInSet(Current, ['e', 'E']) then
  begin
    // sci notation mode
    pcToken.SourceCode := pcToken.SourceCode + Current;
    Consume;

    // can be a minus or plus here
    if CharInSet(Current, ['-', '+']) then
    begin
      pcToken.SourceCode := pcToken.SourceCode + Current;
      Consume;
    end;

    { exponent must be integer }
    while CharIsDigit(Current) do
    begin
      pcToken.SourceCode := pcToken.SourceCode + Current;
      Consume;
    end;
  end;

  Result := True;
end;

{ NB: do not localise '.' with DecimalSeperator
  Delphi source code does *not* change for this }
function TBuildTokenList.TryHexNumber(const pcToken: TSourceToken): boolean;
var
  lbHasDecimalSep: boolean;
begin
  Result := False;

  { starts with a $ }
  if Current <> '$' then
    exit;

  pcToken.TokenType  := ttNumber;
  pcToken.SourceCode := Current;
  Consume;
  lbHasDecimalSep := False;

  { concat any subsequent number chars }
  while CharIsHexDigitDot(Current) do
  begin
    // have we got to the dot?
    if (Current = '.') then
    begin
      if CurrentChars(2) = '..' then
        break;

      if lbHasDecimalSep then
        // oops! a second one
        break
      else
        lbHasDecimalSep := True;
    end;

    pcToken.SourceCode := pcToken.SourceCode + Current;
    Consume;
  end;

  Result := True;
end;

{ Bin numbers are prefixed with % }
function TBuildTokenList.TryBinNumber(const pcToken: TSourceToken): boolean;
begin
  Result := False;

  { starts with a % }
  if Current <> '%' then
    exit;

  pcToken.TokenType  := ttNumber;
  pcToken.SourceCode := Current;
  Consume;

  { concat any subsequent binary chars }
  while CharIsBinDigit(Current) do
  begin
    pcToken.SourceCode := pcToken.SourceCode + Current;
    Consume;
  end;

  Result := True;
end;

{ ~pktb 2017.05.19 - Oct numbers are prefixed with & }
function TBuildTokenList.TryOctNumber(const pcToken: TSourceToken): boolean;
function CharIsOctDigit(const c: Char): Boolean;
const
  OctDigits: set of AnsiChar = [
    '0', '1', '2', '3', '4', '5', '6', '7'];
begin
  Result := (c in OctDigits);
end;
begin
  Result := False;

  { starts with a & }
  if Current <> '&' then
    exit;

  pcToken.TokenType  := ttNumber;
  pcToken.SourceCode := Current;
  Consume;

  { concat any subsequent binary chars }
  while CharIsOctDigit(Current) do
  begin
    pcToken.SourceCode := pcToken.SourceCode + Current;
    Consume;
  end;

  Result := True;
end;

{ try the range '..' operator and object access  '.' operator }
function TBuildTokenList.TryDots(const pcToken: TSourceToken): boolean;
begin
  Result := False;

  if Current <> '.' then
    exit;

  pcToken.SourceCode := Current;
  Consume;

  if Current = '.' then
  begin
    pcToken.TokenType  := ttDoubleDot;
    pcToken.SourceCode := pcToken.SourceCode + Current;
    Consume;
  end
  else
  begin
    pcToken.TokenType := ttDot;
  end;

  Result := True;
end;

function TBuildTokenList.TryPunctuation(const pcToken: TSourceToken): boolean;


  function FollowsPunctuation(const chLast, ch: Char): boolean;
  const
    { These have meanings on thier own and should not be recognised as part of the punc.
     e.g '=(' is not a punctation symbol, but 2 of them ( for e.g. in const a=(3);
     simlarly ');' is 2 puncs }
    UnitaryPunctuation: set of AnsiChar = [
      NativeSingleQuote, '"', '(', ')', '[', ']', '{',
      '#', '$', '_', ';', '@', '^', ','];

   { These can't have anything following them:
    for e.g, catch the case if a=-1 then ...
      where '=-1' should be read as '=' '-1' not '=-' '1'
      Nothing legitimately comes after '=' AFAIK
      also a:=a*-1;
      q:=q--1; // q equals q minus minus-one. It sucks but it compiles so it must be parsed
      etc }
    SingleChars: set of AnsiChar = ['=', '+', '-', '/', '\'];

  begin
    Result := False;

    if CharInSet(chLast, UnitaryPunctuation) or CharInSet(ch, UnitaryPunctuation) then
      exit;

    if CharInSet(chLast, SingleChars) then
      exit;

    { '<' or '<' can only be followed by '<', '>' or '='.
     Beware of "if x<-1"
     }
    if CharInSet(chLast, ['<', '>']) and not CharInSet(ch, ['<', '>', '=']) then
      exit;

    // ':' can be followed by '=' only
    if (chLast = ':') and (ch <> '=') then
      exit;

    // * can be followed by another *
    if (chLast = '*') and (ch <> '*') then
      exit;


    // "<<" is the start of two nested generics,
    // likewise '>>' is not an operator, it is two "end-of-generic" signs in sucession
    if (chLast = '<') and (ch = '<') then
      exit;
    if (chLast = '>') and (ch = '>') then
      exit;


    Result := CharIsPuncChar(ch);
  end;

var
  leWordType:  TWordType;
  leTokenType: TTokenType;
  lcLast:      Char;
begin
  Result := False;

  if not CharIsPuncChar(Current) then
    exit;

  pcToken.TokenType := ttPunctuation;
  lcLast := Current;
  pcToken.SourceCode := lcLast;
  Consume;

  { concat any subsequent punc chars }
  while FollowsPunctuation(lcLast, Current) do
  begin
    lcLast := Current;
    pcToken.SourceCode := pcToken.SourceCode + lcLast;
    Consume;
  end;

  { try to recognise the punctuation as an operator }
  TypeOfToken(pcToken.SourceCode, leWordType, leTokenType);
  if leTokenType <> ttUnknown then
  begin
    pcToken.TokenType := leTokenType;
  end;

  Result := True;
end;

function TBuildTokenList.TrySingleCharToken(const pcToken: TSourceToken): boolean;
begin
  Result := False;

  pcToken.TokenType := TypeOfToken(Current);
  if pcToken.TokenType <> ttUnknown then
  begin
    pcToken.SourceCode := Current;
    Consume;
    Result := True;
  end;
end;

function TBuildTokenList.BuildTokenList: TSourceTokenList;
const
  UPDATE_INTERVAL = 4096; // big increments here, this goes faster than parsing
var
  lcList:    TSourceTokenList;
  lcNew:     TSourceToken;
  {$IFNDEF COMMAND_LINE}
  liCounter: integer;
  {$ENDIF}
begin
  Assert(SourceCode <> '');

  {$IFNDEF COMMAND_LINE}
  liCounter := 0;
  {$ENDIF}
  lcList    := TSourceTokenList.Create;

  while not EndOfFile do
  begin
    lcNew := GetNextToken;
    lcList.Add(lcNew);

    {$IFNDEF COMMAND_LINE}
    Inc(liCounter);
    if (liCounter mod UPDATE_INTERVAL) = 0 then
       Application.ProcessMessages;
    {$ENDIF}
  end;

  Result := lcList;
end;

function TBuildTokenList.Current: Char;
begin
  Result := fsSourceCode[fiCurrentIndex];
end;

function TBuildTokenList.CurrentChars(const piCount: integer): String;
begin
  Result := Copy(fsSourceCode, fiCurrentIndex, piCount);
end;

function TBuildTokenList.ForwardChar(const piOffset: integer): Char;
begin
  Result := fsSourceCode[fiCurrentIndex + piOffset];
end;

function TBuildTokenList.ForwardChars(const piOffset, piCount: integer): String;
begin
  Result := Copy(fsSourceCode, fiCurrentIndex + piOffset, piCount);
end;


procedure TBuildTokenList.Consume(const piCount: integer);
begin
  inc(fiCurrentIndex, piCount);
end;

function TBuildTokenList.EndOfFile: boolean;
begin
  Result := fiCurrentIndex > Length(fsSourceCode);
end;

function TBuildTokenList.EndOfFileAfter(const piChars: integer): boolean;
begin
  Result := (fiCurrentIndex + piChars) > Length(fsSourceCode);
end;

end.