lazarus/components/jcf2/Parse/BuildTokenList.pas

805 lines
19 KiB
ObjectPascal

{(*}
(*------------------------------------------------------------------------------
Delphi Code formatter source code
The Original Code is BuildTokenList.pas, released April 2000.
The Initial Developer of the Original Code is Anthony Steele.
Portions created by Anthony Steele are Copyright (C) 1999-2008 Anthony Steele.
All Rights Reserved.
Contributor(s): Anthony Steele.
The contents of this file are subject to the Mozilla Public License Version 1.1
(the "License"). you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.mozilla.org/NPL/
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied.
See the License for the specific language governing rights and limitations
under the License.
Alternatively, the contents of this file may be used under the terms of
the GNU General Public License Version 2 or later (the "GPL")
See http://www.gnu.org/licenses/gpl.html
------------------------------------------------------------------------------*)
{*)}
unit BuildTokenList;
{ AFS 29 Nov 1999
converts the input string of chars into a list of tokens
This is the lexical analysis phase of the parsing
}
{$I JcfGlobal.inc}
interface
uses
{ delphi }
Windows,
{ local }
Tokens, SourceToken, SourceTokenList;
type
TBuildTokenList = class(TObject)
private
{ property implementation }
fsSourceCode: WideString;
fsFileName: string;
{ woker procs }
fiCurrentIndex: integer;
procedure SetSourceCode(const Value: WideString);
function Current: WideChar;
function CurrentChars(const piCount: integer): WideString;
function ForwardChar(const piOffset: integer): WideChar;
function ForwardChars(const piOffset, piCount: integer): WideString;
function Consume(const piCount: integer = 1): WideString;
function EndOfFile: boolean;
function EndOfFileAfter(const piChars: integer): boolean;
{ implementation of GetNextToken }
function TryReturn(const pcToken: TSourceToken): boolean;
function TryCurlyComment(const pcToken: TSourceToken): boolean;
function TrySlashComment(const pcToken: TSourceToken): boolean;
function TryBracketStarComment(const pcToken: TSourceToken): boolean;
function TryWhiteSpace(const pcToken: TSourceToken): boolean;
function TryLiteralString(const pcToken: TSourceToken;
const pcDelimiter: WideChar): boolean;
function TryNumber(const pcToken: TSourceToken): boolean;
function TryHexNumber(const pcToken: TSourceToken): boolean;
function TryDots(const pcToken: TSourceToken): boolean;
function TryAssign(const pcToken: TSourceToken): boolean;
function TrySingleCharToken(const pcToken: TSourceToken): boolean;
function TryPunctuation(const pcToken: TSourceToken): boolean;
function TryWord(const pcToken: TSourceToken): boolean;
function GetNextToken: TSourceToken;
protected
public
constructor Create;
destructor Destroy; override;
function BuildTokenList: TSourceTokenList;
property SourceCode: WideString read fsSourceCode write SetSourceCode;
property FileName: string read fsFileName write fsFileName;
end;
implementation
uses
{ delphi }
Forms, SysUtils,
{ jcl }
JclAnsiStrings, JcfUnicode,
{ local }
JcfRegistrySettings;
const
codepage_Chinese = 950;
function CheckMultiByte(const pcChar: WideChar): Boolean;
begin
Result := False;
if GetRegSettings.CheckMultiByteChars then
Result := IsDBCSLeadByte(Byte(pcChar));
//Result := IsDBCSLeadByteEx(codepage_Chinese, Byte(pcChar));
end;
{ TBuildTokenList }
constructor TBuildTokenList.Create;
begin
inherited;
SourceCode := '';
end;
destructor TBuildTokenList.Destroy;
begin
inherited;
end;
procedure TBuildTokenList.SetSourceCode(const Value: WideString);
begin
fsSourceCode := Value;
// reset the index
fiCurrentIndex := 1;
end;
function TBuildTokenList.GetNextToken: TSourceToken;
var
lcNewToken: TSourceToken;
procedure DoAllTheTries;
begin
{ first look for return }
if TryReturn(lcNewToken) then
exit;
{ comments }
if TryCurlyComment(lcNewToken) then
exit;
if TrySlashComment(lcNewToken) then
exit;
if TryBracketStarComment(lcNewToken) then
exit;
{ the rest }
if TryWhiteSpace(lcNewToken) then
exit;
if TryLiteralString(lcNewToken, AnsiSingleQuote) then
exit;
if TryLiteralString(lcNewToken, AnsiDoubleQuote) then
exit;
if TryWord(lcNewToken) then
exit;
if TryNumber(lcNewToken) then
exit;
if TryHexNumber(lcNewToken) then
exit;
if TryDots(lcNewToken) then
exit;
{ attempt assign before colon }
if TryAssign(lcNewToken) then
exit;
if TryPunctuation(lcNewToken) then
exit;
if TrySingleCharToken(lcNewToken) then
exit;
{ default }
lcNewToken.TokenType := ttUnknown;
lcNewToken.SourceCode := Current;
Consume(1);
end;
begin
if EndOfFile then
Result := nil
else
begin
lcNewToken := TSourceToken.Create;
lcNewToken.FileName := FileName;
DoAllTheTries;
lcNewToken.WordType := WordTypeOfToken(lcNewToken.TokenType);
Result := lcNewToken;
end;
end;
{-------------------------------------------------------------------------------
worker fns for GetNextComment }
function TBuildTokenList.TryBracketStarComment(const pcToken: TSourceToken): boolean;
var
liCommentLength: integer;
procedure MoveToCommentEnd;
begin
{ comment is ended by *) or by EOF (bad source) }
while True do
begin
if EndOfFileAfter(liCommentLength) then
break;
if CheckMultiByte(ForwardChar(liCommentLength)) then
begin
liCommentLength := liCommentLength + 2;
continue;
end;
if ForwardChars(liCommentLength, 2) = '*)' then
break;
inc(liCommentLength);
end;
// include the comment end
if not EndOfFileAfter(liCommentLength) and (ForwardChars(liCommentLength, 2) = '*)') then
inc(liCommentLength, 2);
end;
begin
Result := False;
if not (Current = '(') then
exit;
if CurrentChars(2) <> '(*' then
exit;
{ if the comment starts with (*) that is not the end of the comment }
liCommentLength := 2;
MoveToCommentEnd;
pcToken.TokenType := ttComment;
pcToken.CommentStyle := eBracketStar;
pcToken.SourceCode := CurrentChars(liCommentLength);
Consume(liCommentLength);
Result := True;
end;
function TBuildTokenList.TryCurlyComment(const pcToken: TSourceToken): boolean;
var
liCommentLength: integer;
procedure MoveToCommentEnd;
begin
{ comment is ended by close-curly or by EOF (bad source) }
while True do
begin
if EndOfFileAfter(liCommentLength) then
break;
if CheckMultiByte(ForwardChar(liCommentLength)) then
begin
liCommentLength := liCommentLength + 2;
continue;
end;
if ForwardChar(liCommentLength) = '}' then
break;
inc(liCommentLength);
end;
{ include the closing brace }
if not EndOfFileAfter(liCommentLength) and (ForwardChars(liCommentLength, 1) = '}') then
inc(liCommentLength);
end;
begin
Result := False;
if Current <> '{' then
exit;
pcToken.TokenType := ttComment;
liCommentLength := 1;
{ compiler directive are the comments with a $ just after the open-curly
this is always the case }
if ForwardChar(1) = '$' then
pcToken.CommentStyle := eCompilerDirective
else
pcToken.CommentStyle := eCurlyBrace;
MoveToCommentEnd;
pcToken.SourceCode := CurrentChars(liCommentLength);
Consume(liCommentLength);
Result := True;
end;
function TBuildTokenList.TrySlashComment(const pcToken: TSourceToken): boolean;
var
liCommentLength: integer;
procedure MoveToCommentEnd;
begin
{ comment is ended by return or by EOF (bad source) }
while True do
begin
if EndOfFileAfter(liCommentLength) then
break;
if CheckMultiByte(ForwardChar(liCommentLength)) then
begin
liCommentLength := liCommentLength + 2;
continue;
end;
if WideCharIsReturn(ForwardChar(liCommentLength)) then
break;
inc(liCommentLength);
end;
end;
begin
Result := False;
if Current <> '/' then
exit;
{ until end of line or file }
if CurrentChars(2) <> '//' then
exit;
liCommentLength := 2;
MoveToCommentEnd;
pcToken.TokenType := ttComment;
pcToken.CommentStyle := eDoubleSlash;
pcToken.SourceCode := CurrentChars(liCommentLength);
Consume(liCommentLength);
Result := True;
end;
function TBuildTokenList.TryReturn(const pcToken: TSourceToken): boolean;
var
chNext: WideChar;
begin
Result := False;
if not WideCharIsReturn(Current) then
exit;
pcToken.TokenType := ttReturn;
pcToken.SourceCode := Current;
Consume;
{ concat the next return char if it is not the same
This will recognise <cr><lf> or <lf><cr>, but not <cr><cr> }
chNext := Current;
if WideCharIsReturn(chNext) and (chNext <> pcToken.SourceCode[1]) then
begin
pcToken.SourceCode := pcToken.SourceCode + chNext;
Consume;
end;
Result := True;
end;
{ complexities like 'Hello'#32'World' and #$12'Foo' are assemlbed in the parser }
function TBuildTokenList.TryLiteralString(const pcToken: TSourceToken;
const pcDelimiter: WideChar): boolean;
begin
Result := False;
if Current = pcDelimiter then
begin
Result := True;
{ read the opening ' }
pcToken.SourceCode := pcToken.SourceCode + Current;
Consume;
{ read until the close ' }
repeat
if Current = #0 then
break;
if WideCharIsReturn(Current) then
Raise Exception.Create('Unterminated string: ' + pcToken.SourceCode);
{ two quotes in a row are still part of the string }
if (Current = pcDelimiter) then
begin
{ two consecutive quote chars inside string, read them }
if (ForwardChar(1) = pcDelimiter) then
begin
pcToken.SourceCode := pcToken.SourceCode + CurrentChars(2);
Consume(2);
end
else
begin
{ single quote char ends string }
pcToken.SourceCode := pcToken.SourceCode + Current;
Consume;
break;
end
end
else
begin
{ normal char, read it }
pcToken.SourceCode := pcToken.SourceCode + Current;
Consume;
end;
until False;
pcToken.TokenType := ttQuotedLiteralString;
end;
end;
function TBuildTokenList.TryWord(const pcToken: TSourceToken): boolean;
begin
Result := False;
if not WideCharIsWordChar(Current) then
exit;
pcToken.SourceCode := Current;
Consume;
{ concat any subsequent word chars }
while WideCharIsWordChar(Current) or WideCharIsDigit(Current) do
begin
pcToken.SourceCode := pcToken.SourceCode + Current;
Consume;
end;
{ try to recognise the word as built in }
pcToken.TokenType := TypeOfToken(pcToken.SourceCode);
if pcToken.TokenType = ttUnknown then
pcToken.TokenType := ttIdentifier;
Result := True;
end;
function TBuildTokenList.TryWhiteSpace(const pcToken: TSourceToken): boolean;
begin
Result := False;
if not WideCharIsWhiteSpaceNoReturn(Current) then
exit;
pcToken.TokenType := ttWhiteSpace;
pcToken.SourceCode := Current;
Consume;
{ concat any subsequent return chars }
while WideCharIsWhiteSpaceNoReturn(Current) do
begin
pcToken.SourceCode := pcToken.SourceCode + Current;
Consume;
end;
Result := True;
end;
function TBuildTokenList.TryAssign(const pcToken: TSourceToken): boolean;
begin
Result := False;
if Current <> ':' then
exit;
if CurrentChars(2) <> ':=' then
exit;
pcToken.TokenType := ttAssign;
pcToken.SourceCode := CurrentChars(2);
Consume(2);
Result := True;
end;
function TBuildTokenList.TryNumber(const pcToken: TSourceToken): boolean;
var
lbHasDecimalSep: boolean;
begin
Result := False;
{ recognise a number -
they don't start with a '.' but may contain one
a minus sign in front is considered unary operator not part of the number
this is bourne out by the compiler considering
'- 0.3' and -0.3' to be the same value
and -.3 is not legal at all }
{ first one must be a digit }
if not WideCharIsDigit(Current) then
exit;
if (Current = '.') or (Current = '-') then
exit;
pcToken.TokenType := ttNumber;
pcToken.SourceCode := Current;
Consume;
lbHasDecimalSep := False;
{ concat any subsequent number chars
only one decimal seperator allowed
also NB that two dots can be array range, as in
var foo = array[1..100] of integer;
ie one dat = decimal
two dots = end of number
}
while WideCharIsDigit(Current) or (Current = '.') do
begin
// have we got to the dot?
if (Current = '.') then
begin
if CurrentChars(2) = '..' then
break;
if lbHasDecimalSep then
// oops! a second one
break
else
lbHasDecimalSep := True;
end;
pcToken.SourceCode := pcToken.SourceCode + Current;
Consume;
end;
{ scientific notation suffix, eg 3e2 = 30, 2.1e-3 = 0.0021 }
{ check for a trailing 'e' }
if WideCharInSet(Current, ['e', 'E']) then
begin
// sci notation mode
pcToken.SourceCode := pcToken.SourceCode + Current;
Consume;
// can be a minus or plus here
if WideCharInSet(Current, ['-', '+']) then
begin
pcToken.SourceCode := pcToken.SourceCode + Current;
Consume;
end;
{ exponent must be integer }
while WideCharIsDigit(Current) do
begin
pcToken.SourceCode := pcToken.SourceCode + Current;
Consume;
end;
end;
Result := True;
end;
{ NB: do not localise '.' with DecimalSeperator
Delphi source code does *not* change for this }
function TBuildTokenList.TryHexNumber(const pcToken: TSourceToken): boolean;
var
lbHasDecimalSep: boolean;
begin
Result := False;
{ starts with a $ }
if Current <> '$' then
exit;
pcToken.TokenType := ttNumber;
pcToken.SourceCode := Current;
Consume;
lbHasDecimalSep := False;
{ concat any subsequent number chars }
while WideCharIsHexDigitDot(Current) do
begin
// have we got to the dot?
if (Current = '.') then
begin
if CurrentChars(2) = '..' then
break;
if lbHasDecimalSep then
// oops! a second one
break
else
lbHasDecimalSep := True;
end;
pcToken.SourceCode := pcToken.SourceCode + Current;
Consume;
end;
Result := True;
end;
{ try the range '..' operator and object access '.' operator }
function TBuildTokenList.TryDots(const pcToken: TSourceToken): boolean;
begin
Result := False;
if Current <> '.' then
exit;
pcToken.SourceCode := Current;
Consume;
if Current = '.' then
begin
pcToken.TokenType := ttDoubleDot;
pcToken.SourceCode := pcToken.SourceCode + Current;
Consume;
end
else
begin
pcToken.TokenType := ttDot;
end;
Result := True;
end;
function TBuildTokenList.TryPunctuation(const pcToken: TSourceToken): boolean;
function FollowsPunctuation(const chLast, ch: WideChar): boolean;
const
{ these have meanings on thier own and should not be recognised as part of the punc.
e.g '=(' is not a punctation symbol, but 2 of them ( for e.g. in const a=(3);
simlarly ');' is 2 puncs }
UnitaryPunctuation: set of AnsiChar = [
AnsiSingleQuote, '"', '(', ')', '[', ']', '{',
'#', '$', '_', ';', '@', '^', ','];
{ these can't have anything following them:
for e.g, catch the case if a=-1 then ...
where '=-1' should be read as '=' '-1' not '=-' '1'
Nothing legitimately comes after '=' AFAIK
also a:=a*-1;
q:=q--1; // q equals q minus minus-one. It sucks but it compiles so it must be parsed
etc }
SingleChars: set of AnsiChar = ['=', '+', '-', '*', '/', '\'];
begin
Result := False;
if WideCharInSet(chLast, UnitaryPunctuation) or WideCharInSet(ch, UnitaryPunctuation) then
exit;
if WideCharInSet(chLast, SingleChars) then
exit;
{ '<' or '<' can only be followed by '<', '>' or '='.
Beware of "if x<-1"
}
if WideCharInSet(chLast, ['<', '>']) and not WideCharInSet(ch, ['<', '>', '=']) then
exit;
// ':' can be followed by '=' only
if (chLast = ':') and (ch <> '=') then
exit;
// "<<" is the start of two nested generics,
// likewise '>>' is not an operator, it is two "end-of-generic" signs in sucession
if (chLast = '<') and (ch = '<') then
exit;
if (chLast = '>') and (ch = '>') then
exit;
Result := WideCharIsPuncChar(ch);
end;
var
leWordType: TWordType;
leTokenType: TTokenType;
lcLast: WideChar;
begin
Result := False;
if not WideCharIsPuncChar(Current) then
exit;
pcToken.TokenType := ttPunctuation;
lcLast := Current;
pcToken.SourceCode := lcLast;
Consume;
{ concat any subsequent punc chars }
while FollowsPunctuation(lcLast, Current) do
begin
lcLast := Current;
pcToken.SourceCode := pcToken.SourceCode + lcLast;
Consume;
end;
{ try to recognise the punctuation as an operator }
TypeOfToken(pcToken.SourceCode, leWordType, leTokenType);
if leTokenType <> ttUnknown then
begin
pcToken.TokenType := leTokenType;
end;
Result := True;
end;
function TBuildTokenList.TrySingleCharToken(const pcToken: TSourceToken): boolean;
begin
Result := False;
pcToken.TokenType := TypeOfToken(Current);
if pcToken.TokenType <> ttUnknown then
begin
pcToken.SourceCode := Current;
Consume;
Result := True;
end;
end;
function TBuildTokenList.BuildTokenList: TSourceTokenList;
const
UPDATE_INTERVAL = 4096; // big incre,ents here, this is fatser than parsing
var
lcList: TSourceTokenList;
lcNew: TSourceToken;
liCounter: integer;
begin
Assert(SourceCode <> '');
liCounter := 0;
lcList := TSourceTokenList.Create;
while not EndOfFile do
begin
lcNew := GetNextToken;
lcList.Add(lcNew);
Inc(liCounter);
if (liCounter mod UPDATE_INTERVAL) = 0 then
Application.ProcessMessages;
end;
Result := lcList;
end;
function TBuildTokenList.Current: WideChar;
begin
Result := fsSourceCode[fiCurrentIndex];
end;
function TBuildTokenList.CurrentChars(const piCount: integer): WideString;
begin
Result := Copy(fsSourceCode, fiCurrentIndex, piCount);
end;
function TBuildTokenList.ForwardChar(const piOffset: integer): WideChar;
begin
Result := fsSourceCode[fiCurrentIndex + piOffset];
end;
function TBuildTokenList.ForwardChars(const piOffset, piCount: integer): WideString;
begin
Result := Copy(fsSourceCode, fiCurrentIndex + piOffset, piCount);
end;
function TBuildTokenList.Consume(const piCount: integer): WideString;
begin
inc(fiCurrentIndex, piCount);
end;
function TBuildTokenList.EndOfFile: boolean;
begin
Result := fiCurrentIndex > Length(fsSourceCode);
end;
function TBuildTokenList.EndOfFileAfter(const piChars: integer): boolean;
begin
Result := (fiCurrentIndex + piChars) > Length(fsSourceCode);
end;
end.