lazarus/components/jcf2/Parse/BuildTokenList.pas
2020-03-25 13:49:42 +00:00

894 lines
21 KiB
ObjectPascal

{(*}
(*------------------------------------------------------------------------------
Delphi Code formatter source code
The Original Code is BuildTokenList.pas, released April 2000.
The Initial Developer of the Original Code is Anthony Steele.
Portions created by Anthony Steele are Copyright (C) 1999-2008 Anthony Steele.
All Rights Reserved.
Contributor(s): Anthony Steele.
The contents of this file are subject to the Mozilla Public License Version 1.1
(the "License"). you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.mozilla.org/NPL/
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied.
See the License for the specific language governing rights and limitations
under the License.
Alternatively, the contents of this file may be used under the terms of
the GNU General Public License Version 2 or later (the "GPL")
See http://www.gnu.org/licenses/gpl.html
------------------------------------------------------------------------------*)
{*)}
unit BuildTokenList;
{ AFS 29 Nov 1999
converts the input string of chars into a list of tokens
This is the lexical analysis phase of the parsing
2014.11.02 ~bk Added lexing of binary constants (ex. -> const a=%101001;)
2017.05.17 ~pktv Added lexing of octal constants (ex. -> const a=&777;)
}
{$I JcfGlobal.inc}
interface
uses
{ local }
Tokens, SourceToken, SourceTokenList;
type
{ TBuildTokenList }
TBuildTokenList = class(TObject)
{ Lexer: GetNextToken reads fsSourceCode starting at fiCurrentIndex and
  BuildTokenList collects the tokens it produces into a TSourceTokenList. }
private
{ property implementation }
fsSourceCode: String;
fsFileName: string;
{ worker procs }
{ 1-based index of the next unread character in fsSourceCode;
  SetSourceCode resets it to 1 }
fiCurrentIndex: integer;
procedure SetSourceCode(const Value: String);
{ character access relative to fiCurrentIndex; Consume advances the index }
function Current: Char;
function CurrentChars(const piCount: integer): String;
function ForwardChar(const piOffset: integer): Char;
function ForwardChars(const piOffset, piCount: integer): String;
procedure Consume(const piCount: integer = 1);
function EndOfFile: boolean;
function EndOfFileAfter(const piChars: integer): boolean;
{ implementation of GetNextToken }
{ each Try function returns True when it recognised a token at the current
  position, filled in pcToken and consumed the token's characters }
function TryReturn(const pcToken: TSourceToken): boolean;
function TryCurlyComment(const pcToken: TSourceToken): boolean;
function TrySlashComment(const pcToken: TSourceToken): boolean;
function TryBracketStarComment(const pcToken: TSourceToken): boolean;
function TryWhiteSpace(const pcToken: TSourceToken): boolean;
function TryLiteralString(const pcToken: TSourceToken;
const pcDelimiter: Char): boolean;
function TryNumber(const pcToken: TSourceToken): boolean;
function TryHexNumber(const pcToken: TSourceToken): boolean;
function TryBinNumber(const pcToken: TSourceToken): boolean; // ~bk 14.11.01
function TryOctNumber(const pcToken: TSourceToken): boolean; // ~pktv 17.05.19
function TryDots(const pcToken: TSourceToken): boolean;
function TryAssign(const pcToken: TSourceToken): boolean;
function TrySingleCharToken(const pcToken: TSourceToken): boolean;
function TryPunctuation(const pcToken: TSourceToken): boolean;
function TryWord(const pcToken: TSourceToken): boolean;
{ returns nil at end of input, otherwise a newly created token }
function GetNextToken: TSourceToken;
protected
public
constructor Create;
destructor Destroy; override;
{ tokenises the whole of SourceCode and returns the new list }
function BuildTokenList: TSourceTokenList;
{ assigning SourceCode also rewinds the read position }
property SourceCode: String read fsSourceCode write SetSourceCode;
{ copied into the FileName property of every token created }
property FileName: string read fsFileName write fsFileName;
end;
implementation
uses
Forms, SysUtils,
{ local }
JcfStringUtils, JcfSystemUtils,
JcfRegistrySettings;
const
CurlyLeft = '{'; //widechar(123);
CurlyRight = '}'; //widechar(125);
{ True only when the registry settings have multi-byte checking switched on
  and IsMultiByte accepts pcChar; IsMultiByte is not called otherwise. }
function CheckMultiByte(const pcChar: char): boolean;
begin
  if GetRegSettings.CheckMultiByteChars then
    Result := IsMultiByte(pcChar)
  else
    Result := False;
end;
{ TBuildTokenList }
{ Creates a lexer with empty source text and the read position at the start. }
constructor TBuildTokenList.Create;
begin
  inherited Create;
  // go through the property so that the read position is reset as well
  SourceCode := '';
end;
{ Nothing of its own to release; just chains to the ancestor destructor. }
destructor TBuildTokenList.Destroy;
begin
  inherited Destroy;
end;
{ Property setter: stores the text to tokenise and rewinds the reader. }
procedure TBuildTokenList.SetSourceCode(const Value: String);
begin
  // new text is always read from its first character (strings are 1-based)
  fiCurrentIndex := 1;
  fsSourceCode := Value;
end;
{ Reads one token starting at the current position.
  Returns nil when the input is exhausted, otherwise a newly created
  TSourceToken with FileName, TokenType, SourceCode and WordType filled in.
  If one of the recognisers raises (for example on an unterminated string)
  the half-built token is freed before the exception is passed on. }
function TBuildTokenList.GetNextToken: TSourceToken;
var
  lcNewToken: TSourceToken;

  { runs the recognisers in priority order; the first that succeeds wins }
  procedure DoAllTheTries;
  begin
    { first look for return }
    if TryReturn(lcNewToken) then
      exit;

    { comments }
    if TryCurlyComment(lcNewToken) then
      exit;
    if TrySlashComment(lcNewToken) then
      exit;
    if TryBracketStarComment(lcNewToken) then
      exit;

    { the rest }
    if TryWhiteSpace(lcNewToken) then
      exit;
    if TryLiteralString(lcNewToken, NativeSingleQuote) then
      exit;
    if TryLiteralString(lcNewToken, NativeDoubleQuote) then
      exit;
    if TryWord(lcNewToken) then
      exit;
    if TryNumber(lcNewToken) then
      exit;
    if TryHexNumber(lcNewToken) then
      exit;
    if TryBinNumber(lcNewToken) then // ~bk 2014.11.01
      exit;
    if TryOctNumber(lcNewToken) then // ~pktv 2017.05.19
      exit;
    if TryDots(lcNewToken) then
      exit;

    { attempt assign before colon }
    if TryAssign(lcNewToken) then
      exit;
    if TryPunctuation(lcNewToken) then
      exit;
    if TrySingleCharToken(lcNewToken) then
      exit;

    { default: nothing matched, emit the single char as an unknown token }
    lcNewToken.TokenType := ttUnknown;
    lcNewToken.SourceCode := Current;
    Consume(1);
  end;

begin
  Result := nil;
  if EndOfFile then
    exit;

  lcNewToken := TSourceToken.Create;
  try
    lcNewToken.FileName := FileName;
    DoAllTheTries;
    lcNewToken.WordType := WordTypeOfToken(lcNewToken.TokenType);
  except
    // nobody else holds a reference yet, so release it here before re-raising
    lcNewToken.Free;
    raise;
  end;

  Result := lcNewToken;
end;
{-------------------------------------------------------------------------------
worker fns for GetNextToken }
{ Recognises a bracket-star comment at the current position.
  The comment runs up to and including the closing star-bracket pair,
  or to the end of the input when the source never closes it. }
function TBuildTokenList.TryBracketStarComment(const pcToken: TSourceToken): boolean;
var
  liLen: integer;
  lbClosed: boolean;
begin
  Result := False;
  if CurrentChars(2) <> '(*' then
    exit;

  // scanning starts after the opener, so an opener immediately followed by
  // a close-bracket does not count as the end of the comment
  liLen := 2;
  lbClosed := False;

  while not (lbClosed or EndOfFileAfter(liLen)) do
  begin
    if CheckMultiByte(ForwardChar(liLen)) then
      // step over both halves of a multi-byte character
      Inc(liLen, 2)
    else if ForwardChars(liLen, 2) = '*)' then
      lbClosed := True
    else
      Inc(liLen);
  end;

  // the closing pair belongs to the comment text
  if lbClosed then
    Inc(liLen, 2);

  pcToken.TokenType := ttComment;
  pcToken.CommentStyle := eBracketStar;
  pcToken.SourceCode := CurrentChars(liLen);
  Consume(liLen);

  Result := True;
end;
// Recognises a curly-brace comment or compiler directive at the current
// position. Nested open-curly characters increase a depth counter, and the
// comment ends at the close-curly that brings the depth back to zero, or at
// the end of the input when the source never closes it.
// Returns True and fills pcToken when the current char is an open-curly.
function TBuildTokenList.TryCurlyComment(const pcToken: TSourceToken): boolean;
var
  liCommentLength: integer;
  lNestedDepth: integer;
  lForwardChar: char;
begin
  Result := False;
  if Current <> CurlyLeft then
    exit;

  pcToken.TokenType := ttComment;

  // A compiler directive is a comment with a $ right after the open-curly.
  // CurrentChars is used for the lookahead because it is clipped at the end
  // of the input, so an open-curly as the very last character is safe.
  if CurrentChars(2) = '{$' then
    pcToken.CommentStyle := eCompilerDirective
  else
    pcToken.CommentStyle := eCurlyBrace;

  lNestedDepth := 1;
  liCommentLength := 1;

  while not EndOfFileAfter(liCommentLength) do
  begin
    lForwardChar := ForwardChar(liCommentLength);

    if CheckMultiByte(lForwardChar) then
    begin
      // step over both halves of a multi-byte character
      Inc(liCommentLength, 2);
      continue;
    end;

    Inc(liCommentLength);

    if lForwardChar = CurlyLeft then
      Inc(lNestedDepth)
    else if lForwardChar = CurlyRight then
    begin
      Dec(lNestedDepth);
      if lNestedDepth = 0 then
        break;
    end;
  end;

  pcToken.SourceCode := CurrentChars(liCommentLength);
  Consume(liCommentLength);

  Result := True;
end;
{ Recognises a double-slash comment at the current position.
  The comment runs up to, but not including, the next return character,
  or to the end of the input. }
function TBuildTokenList.TrySlashComment(const pcToken: TSourceToken): boolean;
var
  liLen: integer;
  lcNext: Char;
begin
  Result := False;
  if CurrentChars(2) <> '//' then
    exit;

  liLen := 2;
  while not EndOfFileAfter(liLen) do
  begin
    lcNext := ForwardChar(liLen);

    if CheckMultiByte(lcNext) then
      // step over both halves of a multi-byte character
      Inc(liLen, 2)
    else if CharIsReturn(lcNext) then
      break
    else
      Inc(liLen);
  end;

  pcToken.TokenType := ttComment;
  pcToken.CommentStyle := eDoubleSlash;
  pcToken.SourceCode := CurrentChars(liLen);
  Consume(liLen);

  Result := True;
end;
{ Recognises a line break at the current position.
  A second return character is joined onto the token only when it differs
  from the first, so cr-lf and lf-cr become one token while cr-cr is two. }
function TBuildTokenList.TryReturn(const pcToken: TSourceToken): boolean;
var
  lcSecond: Char;
begin
  Result := CharIsReturn(Current);
  if not Result then
    exit;

  pcToken.TokenType := ttReturn;
  pcToken.SourceCode := Current;
  Consume;

  // the line break may be the last thing in the input
  if EndOfFile then
    exit;

  lcSecond := Current;
  if CharIsReturn(lcSecond) and (lcSecond <> pcToken.SourceCode[1]) then
  begin
    pcToken.SourceCode := pcToken.SourceCode + lcSecond;
    Consume;
  end;
end;
{ complexities like 'Hello'#32'World' and #$12'Foo' are assembled in the parser }
{ Recognises a quoted string that starts with pcDelimiter at the current
  position. A doubled delimiter inside the string is kept as part of it.
  Reading stops at the closing delimiter or at a #0 character.
  Raises Exception when a return character is met before the string closes. }
function TBuildTokenList.TryLiteralString(const pcToken: TSourceToken;
  const pcDelimiter: Char): boolean;
var
  lbClosed: boolean;
begin
  Result := (Current = pcDelimiter);
  if not Result then
    exit;

  // opening delimiter
  pcToken.SourceCode := pcToken.SourceCode + Current;
  Consume;

  lbClosed := False;
  while not lbClosed do
  begin
    if Current = #0 then
      break;

    if CharIsReturn(Current) then
      raise Exception.Create('Unterminated string: ' + pcToken.SourceCode);

    if Current <> pcDelimiter then
    begin
      // ordinary character
      pcToken.SourceCode := pcToken.SourceCode + Current;
      Consume;
    end
    else if ForwardChar(1) = pcDelimiter then
    begin
      // two delimiters in a row stay inside the string
      pcToken.SourceCode := pcToken.SourceCode + CurrentChars(2);
      Consume(2);
    end
    else
    begin
      // a lone delimiter closes the string
      pcToken.SourceCode := pcToken.SourceCode + Current;
      Consume;
      lbClosed := True;
    end;
  end;

  pcToken.TokenType := ttQuotedLiteralString;
end;
{ Recognises a word: one word character followed by any run of word
  characters and digits. The token type comes from TypeOfToken, and a word
  that TypeOfToken does not know is classed as an identifier. }
function TBuildTokenList.TryWord(const pcToken: TSourceToken): boolean;
var
  lsWord: String;
begin
  Result := CharIsWordChar(Current);
  if not Result then
    exit;

  // the first char is already known to be a word char, so take it
  // unconditionally and then keep going while the run continues
  lsWord := '';
  repeat
    lsWord := lsWord + Current;
    Consume;
  until not (CharIsWordChar(Current) or CharIsDigit(Current));

  pcToken.SourceCode := lsWord;

  pcToken.TokenType := TypeOfToken(pcToken.SourceCode);
  if pcToken.TokenType = ttUnknown then
    pcToken.TokenType := ttIdentifier;
end;
{ Recognises a run of white space that contains no return characters
  and stores the whole run as a single token. }
function TBuildTokenList.TryWhiteSpace(const pcToken: TSourceToken): boolean;
var
  lsRun: String;
begin
  Result := CharIsWhiteSpaceNoReturn(Current);
  if not Result then
    exit;

  pcToken.TokenType := ttWhiteSpace;

  // gather the current char and every white space char that follows it
  lsRun := '';
  repeat
    lsRun := lsRun + Current;
    Consume;
  until not CharIsWhiteSpaceNoReturn(Current);

  pcToken.SourceCode := lsRun;
end;
{ Recognises the two-character assignment operators
  := += -= *= and /= at the current position. }
function TBuildTokenList.TryAssign(const pcToken: TSourceToken): boolean;
var
  lsPair: String;
  leType: TTokenType;
begin
  Result := False;

  // every operator handled here is some char followed by an equals sign
  lsPair := CurrentChars(2);
  if (Length(lsPair) < 2) or (lsPair[2] <> '=') then
    exit;

  case lsPair[1] of
    ':': leType := ttAssign;
    '+': leType := ttPlusAssign;
    '-': leType := ttMinusAssign;
    '*': leType := ttTimesAssign;
    '/': leType := ttFloatDivAssign;
  else
    exit;
  end;

  pcToken.TokenType := leType;
  pcToken.SourceCode := lsPair;
  Consume(2);
  Result := True;
end;
{ Recognises a decimal number at the current position: digits, at most one
  decimal point, and an optional exponent introduced by e or E with an
  optional sign. Returns True and fills pcToken when the current char is
  a digit. }
function TBuildTokenList.TryNumber(const pcToken: TSourceToken): boolean;
var
  lbHasDecimalSep: boolean;
begin
  Result := False;

  { recognise a number -
    they don't start with a '.' but may contain one
    a minus sign in front is considered a unary operator, not part of the number
    this is borne out by the compiler considering
    '- 0.3' and '-0.3' to be the same value
    and -.3 is not legal at all

    requiring a leading digit already excludes a leading '.' or '-',
    so no separate test for those is needed }
  if not CharIsDigit(Current) then
    exit;

  pcToken.TokenType := ttNumber;
  pcToken.SourceCode := Current;
  Consume;
  lbHasDecimalSep := False;

  { concat any subsequent number chars
    only one decimal separator allowed
    also NB that two dots can be array range, as in
    var foo = array[1..100] of integer;
    ie one dot = decimal
    two dots = end of number
  }
  while CharIsDigit(Current) or (Current = '.') do
  begin
    // have we got to the dot?
    if (Current = '.') then
    begin
      if CurrentChars(2) = '..' then
        break;

      if lbHasDecimalSep then
        // oops! a second one
        break
      else
        lbHasDecimalSep := True;
    end;

    pcToken.SourceCode := pcToken.SourceCode + Current;
    Consume;
  end;

  { scientific notation suffix, eg 3e2 = 300, 2.1e-3 = 0.0021 }
  { check for a trailing 'e' }
  if CharInSet(Current, ['e', 'E']) then
  begin
    // sci notation mode
    pcToken.SourceCode := pcToken.SourceCode + Current;
    Consume;

    // can be a minus or plus here
    if CharInSet(Current, ['-', '+']) then
    begin
      pcToken.SourceCode := pcToken.SourceCode + Current;
      Consume;
    end;

    { exponent must be integer }
    while CharIsDigit(Current) do
    begin
      pcToken.SourceCode := pcToken.SourceCode + Current;
      Consume;
    end;
  end;

  Result := True;
end;
{ NB: do not localise '.' with DecimalSeparator
Delphi source code does *not* change for this }
{ Recognises a number introduced by a dollar sign. Characters accepted by
  CharIsHexDigitDot are appended; reading stops at a second dot or at the
  two-dot range operator. }
function TBuildTokenList.TryHexNumber(const pcToken: TSourceToken): boolean;
var
  lbDotSeen: boolean;
  lsText: String;
begin
  Result := (Current = '$');
  if not Result then
    exit;

  pcToken.TokenType := ttNumber;
  lsText := Current;
  Consume;

  lbDotSeen := False;
  while CharIsHexDigitDot(Current) do
  begin
    if Current = '.' then
    begin
      // '..' is a range operator, and a second single dot cannot belong here
      if (CurrentChars(2) = '..') or lbDotSeen then
        break;
      lbDotSeen := True;
    end;

    lsText := lsText + Current;
    Consume;
  end;

  pcToken.SourceCode := lsText;
end;
{ Bin numbers are prefixed with % }
{ Recognises a number introduced by a percent sign, followed by any run of
  characters accepted by CharIsBinDigit. }
function TBuildTokenList.TryBinNumber(const pcToken: TSourceToken): boolean;
var
  lsText: String;
begin
  Result := (Current = '%');
  if not Result then
    exit;

  pcToken.TokenType := ttNumber;

  // take the prefix unconditionally, then every binary digit after it
  lsText := '';
  repeat
    lsText := lsText + Current;
    Consume;
  until not CharIsBinDigit(Current);

  pcToken.SourceCode := lsText;
end;
{ ~pktv 2017.05.19 - Oct numbers are prefixed with & }
// Recognises an octal constant: an ampersand followed by one or more of the
// digits 0 to 7. An ampersand that is not followed by an octal digit is left
// for the later recognisers instead of being emitted as a number token,
// since the ampersand is not only a number prefix.
// Returns True and fills pcToken when an octal constant was read.
function TBuildTokenList.TryOctNumber(const pcToken: TSourceToken): boolean;

  function CharIsOctDigit(const c: Char): Boolean;
  begin
    Result := CharInSet(c, ['0', '1', '2', '3', '4', '5', '6', '7']);
  end;

begin
  Result := False;

  { starts with a & }
  if Current <> '&' then
    exit;

  // need at least one digit after the prefix; check the bounds first so
  // the lookahead never reads beyond the end of the input
  if EndOfFileAfter(1) or not CharIsOctDigit(ForwardChar(1)) then
    exit;

  pcToken.TokenType := ttNumber;
  pcToken.SourceCode := Current;
  Consume;

  { concat any subsequent octal chars }
  while CharIsOctDigit(Current) do
  begin
    pcToken.SourceCode := pcToken.SourceCode + Current;
    Consume;
  end;

  Result := True;
end;
{ try the range '..' operator and object access '.' operator }
{ Recognises one dot as ttDot or two adjacent dots as ttDoubleDot. }
function TBuildTokenList.TryDots(const pcToken: TSourceToken): boolean;
begin
  Result := (Current = '.');
  if not Result then
    exit;

  pcToken.SourceCode := Current;
  Consume;

  if Current <> '.' then
  begin
    pcToken.TokenType := ttDot;
    exit;
  end;

  // a second dot straight after the first makes it the range operator
  pcToken.TokenType := ttDoubleDot;
  pcToken.SourceCode := pcToken.SourceCode + Current;
  Consume;
end;
// Recognises a punctuation token: one punctuation character plus any
// following characters that FollowsPunctuation allows to be joined on.
// The token type starts as ttPunctuation and is replaced by whatever
// TypeOfToken reports for the text, unless that is ttUnknown.
function TBuildTokenList.TryPunctuation(const pcToken: TSourceToken): boolean;

  // Decides whether ch may be appended to a punctuation token whose most
  // recently read character is chLast.
  function FollowsPunctuation(const chLast, ch: Char): boolean;
  const
    // These have meanings on their own and are never part of a longer
    // symbol, e.g. '=(' is two symbols, as in const a=(3); likewise ');'
    UnitaryPunctuation: set of AnsiChar = [
      NativeSingleQuote, '"', '(', ')', '[', ']', '{',
      '#', '$', '_', ';', '@', '^', ','];
    // Nothing may follow these: in "if a=-1 then" the text '=-1' must be
    // read as '=' '-1' and not as '=-' '1'. The same goes for a:=a*-1 and
    // for q:=q--1 (q minus minus-one), which compiles and so must be parsed.
    SingleChars: set of AnsiChar = ['=', '+', '-', '/', '\'];
  begin
    Result := False;

    if CharInSet(chLast, UnitaryPunctuation) then
      exit;
    if CharInSet(ch, UnitaryPunctuation) then
      exit;
    if CharInSet(chLast, SingleChars) then
      exit;

    case chLast of
      '<', '>':
      begin
        // only '<', '>' or '=' may follow; beware of "if x<-1"
        if not CharInSet(ch, ['<', '>', '=']) then
          exit;
        // '<<' opens two nested generics and '>>' closes two of them,
        // so a repeated angle bracket is never joined into one symbol
        if ch = chLast then
          exit;
      end;
      ':':
        // ':' can be followed by '=' only
        if ch <> '=' then
          exit;
      '*':
        // '*' can be followed by another '*' only
        if ch <> '*' then
          exit;
    end;

    Result := CharIsPuncChar(ch);
  end;

var
  leWordType: TWordType;
  leTokenType: TTokenType;
  lcPrev: Char;
  lsPunc: String;
begin
  Result := CharIsPuncChar(Current);
  if not Result then
    exit;

  pcToken.TokenType := ttPunctuation;

  lcPrev := Current;
  lsPunc := lcPrev;
  Consume;

  // join on any further punctuation chars that are allowed to follow
  while FollowsPunctuation(lcPrev, Current) do
  begin
    lcPrev := Current;
    lsPunc := lsPunc + lcPrev;
    Consume;
  end;

  pcToken.SourceCode := lsPunc;

  // try to recognise the punctuation as an operator
  TypeOfToken(pcToken.SourceCode, leWordType, leTokenType);
  if leTokenType <> ttUnknown then
    pcToken.TokenType := leTokenType;
end;
{ Recognises a single character that TypeOfToken knows as a token.
  The token type is written to pcToken even when the lookup gives
  ttUnknown; in that case nothing is consumed and the result is False. }
function TBuildTokenList.TrySingleCharToken(const pcToken: TSourceToken): boolean;
begin
  pcToken.TokenType := TypeOfToken(Current);

  Result := (pcToken.TokenType <> ttUnknown);
  if not Result then
    exit;

  pcToken.SourceCode := Current;
  Consume;
end;
{ Tokenises the whole of SourceCode from the current read position and
  returns the tokens in a newly created TSourceTokenList.
  Unless COMMAND_LINE is defined, Application.ProcessMessages is called
  every UPDATE_INTERVAL tokens.
  If tokenising raises, the partially built list is freed before the
  exception is passed on, so it is not leaked. }
function TBuildTokenList.BuildTokenList: TSourceTokenList;
const
  UPDATE_INTERVAL = 4096; // big increments here, this goes faster than parsing
var
  lcList: TSourceTokenList;
  lcNew: TSourceToken;
  {$IFNDEF COMMAND_LINE}
  liCounter: integer;
  {$ENDIF}
begin
  Assert(SourceCode <> '');

  {$IFNDEF COMMAND_LINE}
  liCounter := 0;
  {$ENDIF}

  lcList := TSourceTokenList.Create;
  try
    while not EndOfFile do
    begin
      lcNew := GetNextToken;
      lcList.Add(lcNew);

      {$IFNDEF COMMAND_LINE}
      Inc(liCounter);
      if (liCounter mod UPDATE_INTERVAL) = 0 then
        Application.ProcessMessages;
      {$ENDIF}
    end;
  except
    // the caller never receives the list on failure, so release it here
    lcList.Free;
    raise;
  end;

  Result := lcList;
end;
{ Returns the character at the read position, or #0 when the read position
  lies outside the source text. The scanning loops call this after consuming
  the last character, and TryLiteralString tests for #0 to detect the end of
  the input, so the out-of-range case is made explicit here rather than
  relying on what an unchecked string index happens to return. }
function TBuildTokenList.Current: Char;
begin
  if (fiCurrentIndex >= 1) and (fiCurrentIndex <= Length(fsSourceCode)) then
    Result := fsSourceCode[fiCurrentIndex]
  else
    Result := #0;
end;
{ Returns up to piCount characters starting at the read position. }
function TBuildTokenList.CurrentChars(const piCount: integer): String;
var
  liStart: integer;
begin
  liStart := fiCurrentIndex;
  Result := Copy(fsSourceCode, liStart, piCount);
end;
{ Returns the character piOffset positions after the read position, or #0
  when that position lies outside the source text. Some callers look one
  character ahead without first checking for the end of the input, so the
  out-of-range case is handled here. }
function TBuildTokenList.ForwardChar(const piOffset: integer): Char;
var
  liIndex: integer;
begin
  liIndex := fiCurrentIndex + piOffset;

  if (liIndex >= 1) and (liIndex <= Length(fsSourceCode)) then
    Result := fsSourceCode[liIndex]
  else
    Result := #0;
end;
{ Returns up to piCount characters starting piOffset positions after
  the read position. }
function TBuildTokenList.ForwardChars(const piOffset, piCount: integer): String;
var
  liStart: integer;
begin
  liStart := fiCurrentIndex + piOffset;
  Result := Copy(fsSourceCode, liStart, piCount);
end;
{ Moves the read position forward by piCount characters. }
procedure TBuildTokenList.Consume(const piCount: integer);
begin
  fiCurrentIndex := fiCurrentIndex + piCount;
end;
{ True once the read position has moved past the last character. }
function TBuildTokenList.EndOfFile: boolean;
begin
  Result := Length(fsSourceCode) < fiCurrentIndex;
end;
{ True when the position piChars after the read position is past the
  last character of the source text. }
function TBuildTokenList.EndOfFileAfter(const piChars: integer): boolean;
var
  liTarget: integer;
begin
  liTarget := fiCurrentIndex + piChars;
  Result := Length(fsSourceCode) < liTarget;
end;
end.