fpc/fcl/xml/xmlread.pp
michael 90b916085a * Patch from Sergei Gorelkin:
- Full support for XML 1.1
  - Support for ISO-8859-1 input encoding (started support for any
    encoding, but went a bit wrong way. I should stick to using iconv,
    but for it I need second level of buffering, otherwise it's too
    slow).
  - Input buffering (can now read from non-seekable streams and
    TextFile's)
  - External entity loading, processing of default attributes and other
    stuff needed to comply with "non-validating parser" specification.

git-svn-id: trunk@5018 -
2006-10-24 12:22:00 +00:00

2577 lines
65 KiB
ObjectPascal

{
This file is part of the Free Component Library
XML reading routines.
Copyright (c) 1999-2000 by Sebastian Guenther, sg@freepascal.org
Modified in 2006 by Sergei Gorelkin, sergei_gorelkin@mail.ru
See the file COPYING.FPC, included in this distribution,
for details about the copyright.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
**********************************************************************}
unit XMLRead;
{$ifdef fpc}
{$MODE objfpc}{$H+}
{$endif}
interface
uses
SysUtils, Classes, DOM;
type
EXMLReadError = class(Exception);
procedure ReadXMLFile(out ADoc: TXMLDocument; const AFilename: String); overload;
procedure ReadXMLFile(out ADoc: TXMLDocument; var f: Text); overload;
procedure ReadXMLFile(out ADoc: TXMLDocument; var f: TStream); overload;
procedure ReadXMLFile(out ADoc: TXMLDocument; var f: TStream; const ABaseURI: String); overload;
procedure ReadXMLFragment(AParentNode: TDOMNode; const AFilename: String); overload;
procedure ReadXMLFragment(AParentNode: TDOMNode; var f: Text); overload;
procedure ReadXMLFragment(AParentNode: TDOMNode; var f: TStream); overload;
procedure ReadXMLFragment(AParentNode: TDOMNode; var f: TStream; const ABaseURI: String); overload;
procedure ReadDTDFile(out ADoc: TXMLDocument; const AFilename: String); overload;
procedure ReadDTDFile(out ADoc: TXMLDocument; var f: Text); overload;
procedure ReadDTDFile(out ADoc: TXMLDocument; var f: TStream); overload;
procedure ReadDTDFile(out ADoc: TXMLDocument; var f: TStream; const ABaseURI: String); overload;
// =======================================================
implementation
uses
UriParser;
type
TSetOfChar = set of Char;
const
Letter = ['A'..'Z', 'a'..'z'];
Digit = ['0'..'9'];
PubidChars: TSetOfChar = [' ', #13, #10, 'a'..'z', 'A'..'Z', '0'..'9',
'-', '''', '(', ')', '+', ',', '.', '/', ':', '=', '?', ';', '!', '*',
'#', '@', '$', '_', '%'];
type
TDOMNotationEx = class(TDOMNotation);
TDOMAttrEx = class(TDOMAttr);
TXMLInputSource = class;
TDOMElementDef = class;
TDOMEntityEx = class(TDOMEntity)
protected
FInternal: Boolean;
FResolved: Boolean;
FOnStack: Boolean;
FReplacementText: DOMString;
end;
// TODO: Do I need PEMap in DocType? Maybe move it to Reader itself?
// (memory usage - they are not needed after parsing)
TDOMDocumentTypeEx = class(TDOMDocumentType)
private
FHasPERefs: Boolean;
FPEMap: TDOMNamedNodeMap;
FElementDefs: TDOMNamedNodeMap;
function GetPEMap: TDOMNamedNodeMap;
function GetElementDefs: TDOMNamedNodeMap;
protected
property PEMap: TDOMNamedNodeMap read GetPEMap;
property ElementDefs: TDOMNamedNodeMap read GetElementDefs;
property HasPERefs: Boolean read FHasPERefs write FHasPERefs;
public
destructor Destroy; override;
end;
TXMLReader = class;
TDecoder = class;
TDecoderRef = class of TDecoder;
TXMLInputSource = class
private
FBuf: PChar;
FBufEnd: PChar;
FEof: Boolean;
FSurrogate: WideChar;
FReader: TXMLReader;
FParent: TXMLInputSource;
FEntity: TObject; // weak reference
FCursor: TObject; // weak reference
FLine: Integer;
FColumn: Integer;
FSystemID: WideString;
FPublicID: WideString;
function GetSystemID: WideString;
function GetPublicID: WideString;
protected
procedure FetchData; virtual;
public
constructor Create(const AData: WideString);
function NextChar: WideChar; virtual;
procedure Initialize; virtual;
procedure SetEncoding(const AEncoding: string); virtual;
property SystemID: WideString read GetSystemID write FSystemID;
property PublicID: WideString read GetPublicID write FPublicID;
end;
TXMLDecodingSource = class(TXMLInputSource)
private
FDecoder: TDecoder;
FSeenCR: Boolean;
function InternalNextChar: WideChar;
procedure DecodingError(const Msg: string); overload;
procedure DecodingError(const Msg: string; const Args: array of const); overload;
public
destructor Destroy; override;
function NextChar: WideChar; override;
procedure SetEncoding(const AEncoding: string); override;
procedure Initialize; override;
end;
TXMLStreamInputSource = class(TXMLDecodingSource)
private
FAllocated: PChar;
FStream: TStream;
FBufSize: Integer;
FOwnStream: Boolean;
public
constructor Create(AStream: TStream; AOwnStream: Boolean);
destructor Destroy; override;
procedure FetchData; override;
end;
TXMLFileInputSource = class(TXMLDecodingSource)
private
FFile: ^Text;
FString: string;
public
constructor Create(var AFile: Text);
procedure FetchData; override;
end;
TDecoder = class
private
FSource: TXMLDecodingSource;
public
constructor Create(ASource: TXMLDecodingSource);
function DecodeNext: WideChar; virtual; abstract;
class function Supports(const AEncoding: string): Boolean; virtual; abstract;
end;
TISO8859_1Decoder = class(TDecoder)
public
function DecodeNext: WideChar; override;
class function Supports(const AEncoding: string): Boolean; override;
end;
TUCS2Decoder = class(TDecoder)
private
FSwapEndian: Boolean;
FEncoding: string;
public
function DecodeNext: WideChar; override;
class function Supports(const AEncoding: string): Boolean; override;
end;
TUTF8Decoder = class(TDecoder)
public
function DecodeNext: WideChar; override;
class function Supports(const AEncoding: string): Boolean; override;
end;
PWideCharBuf = ^TWideCharBuf;
TWideCharBuf = record
Buffer: PWideChar;
Length: Integer;
MaxLength: Integer;
end;
TEntityResolveEvent = procedure(const PubID, SysID: WideString; var Source: TXMLInputSource) of object;
TDeclType = (dtNone, dtXml, dtText);
TXMLReader = class
private
FSource: TXMLInputSource;
FCurChar: WideChar;
FWhitespace: Boolean;
FXML11: Boolean;
FValue: TWideCharBuf;
FName: TWideCharBuf;
FCopyBuf: PWideCharBuf;
FIntSubset: Boolean;
FAllowedDecl: TDeclType;
FDtdParsed: Boolean;
FRecognizePE: Boolean;
FStandalone: Boolean; // property of Doc ?
FInvalid: Boolean;
// TODO: This array must be stored globally, not per instance
FNamePages: PByteArray;
FForbiddenAscii: TSetOfChar;
FDocType: TDOMDocumentTypeEx; // a shortcut
FEntityLevel: Integer;
FPreserveWhitespace: Boolean;
FCreateEntityRefs: Boolean;
procedure RaiseExpectedQmark;
procedure GetChar;
procedure GetCharRaw;
procedure Unget(wc: WideChar);
procedure Initialize(ASource: TXMLInputSource);
procedure InitializeRoot(ASource: TXMLInputSource);
procedure DoParseAttValue(Delim: WideChar);
procedure DoParseFragment;
procedure DoParseExtSubset(ASource: TXMLInputSource);
function ContextPush(AEntity: TDOMEntityEx): Boolean;
function ContextPop: Boolean;
procedure XML11_BuildTables;
function XML11_CheckName: Boolean;
protected
FCursor: TDOMNode;
procedure RaiseExc(const descr: String); overload;
procedure RaiseExc(const descr: string; const args: array of const); overload;
procedure RaiseExc(Expected: WideChar); overload;
function SkipWhitespace: Boolean;
procedure ExpectWhitespace;
procedure ExpectString(const s: String);
procedure ExpectChar(wc: WideChar);
function CheckForChar(c: WideChar): Boolean;
procedure SkipString(const ValidChars: TSetOfChar);
function GetString(const ValidChars: TSetOfChar): WideString;
procedure RaiseNameNotFound;
function CheckName: Boolean;
function CheckNmToken: Boolean;
function ExpectName: WideString; // [5]
procedure SkipName;
function SkipQuotedLiteral: Boolean;
procedure ExpectAttValue; // [10]
procedure SkipPubidLiteral; // [12]
procedure SkipSystemLiteral(out Literal: WideString; Required: Boolean);
procedure ParseComment; // [15]
procedure ParsePI; // [16]
procedure ParseCDSect; // [18]
procedure ParseXmlOrTextDecl(TextDecl: Boolean);
function ParseEq: Boolean; // [25]
procedure ExpectEq;
procedure ParseMisc; // [27]
procedure ParseDoctypeDecl; // [28]
procedure ParseMarkupDecl; // [29]
procedure ParseElement; // [39]
procedure ParseContent; // [43]
function ResolvePredefined(const RefName: WideString): WideChar;
procedure IncludeEntity(AEntity: TDOMEntityEx; InAttr: Boolean);
procedure StartPE;
function ParseCharRef: Boolean; // [66]
function ParseReference: TDOMEntityEx; // [67]
function ParsePEReference: Boolean; // [69]
function ParseExternalID(out SysID, PubID: WideString; // [75]
SysIdOptional: Boolean): Boolean;
procedure ProcessTextAndRefs;
procedure AssertPENesting(CurrentLevel: Integer);
procedure ParseEntityDecl;
procedure ParseEntityDeclValue(Delim: WideChar);
procedure ParseAttlistDecl;
procedure ExpectChoiceOrSeq;
procedure ParseMixedOrChildren;
procedure ParseElementDecl;
procedure ParseNotationDecl;
function ResolveEntity(const SystemID, PublicID: WideString; out Source: TXMLInputSource): Boolean;
procedure ProcessDefaultAttributes(Element: TDOMElement);
procedure ValidationError(const Msg: string; const args: array of const);
public
doc: TDOMDocument;
constructor Create;
destructor Destroy; override;
procedure ProcessXML(ASource: TXMLInputSource); // [1]
procedure ProcessFragment(ASource: TXMLInputSource; AOwner: TDOMNode);
procedure ProcessDTD(ASource: TXMLInputSource); // ([29])
end;
// AttDef/ElementDef support
TAttrDataType = (
DT_CDATA,
DT_ID,
DT_IDREF,
DT_IDREFS,
DT_ENTITY,
DT_ENTITIES,
DT_NMTOKEN,
DT_NMTOKENS,
DT_NOTATION
);
TAttrDefault = (
AD_IMPLIED,
AD_DEFAULT,
AD_REQUIRED,
AD_FIXED
);
TDOMAttrDef = class(TDOMAttr)
protected
FDataType: TAttrDataType;
FDefault: TAttrDefault;
// FEnumeration: TWideStringList? array of WideStrings?
end;
TDOMElementDef = class(TDOMElement);
{$i names.inc}
// TODO: List of registered/supported decoders
function FindDecoder(const Encoding: string): TDecoderRef;
begin
if TISO8859_1Decoder.Supports(Encoding) then
Result := TISO8859_1Decoder
else
Result := nil;
end;
procedure BufAllocate(var ABuffer: TWideCharBuf; ALength: Integer);
begin
ABuffer.MaxLength := ALength;
ABuffer.Length := 0;
GetMem(ABuffer.Buffer, ABuffer.MaxLength*SizeOf(WideChar));
end;
procedure BufAppend(var ABuffer: TWideCharBuf; wc: WideChar);
begin
if ABuffer.Length >= ABuffer.MaxLength then
begin
ABuffer.MaxLength := ABuffer.MaxLength * 2;
ReallocMem(ABuffer.Buffer, ABuffer.MaxLength * SizeOf(WideChar));
end;
ABuffer.Buffer[ABuffer.Length] := wc;
Inc(ABuffer.Length);
end;
function IsValidEncName(const s: WideString): Boolean;
var
I: Integer;
begin
Result := False;
if (s = '') or (s[1] > #255) or not (char(s[1]) in ['A'..'Z', 'a'..'z']) then
Exit;
for I := 2 to Length(s) do
if (s[I] > #255) or not (char(s[I]) in ['A'..'Z', 'a'..'z', '0'..'9', '.', '_', '-']) then
Exit;
Result := True;
end;
{ TDOMDocumentTypeEx }
destructor TDOMDocumentTypeEx.Destroy;
begin
FPEMap.Free;
FElementDefs.Free;
inherited Destroy;
end;
function TDOMDocumentTypeEx.GetElementDefs: TDOMNamedNodeMap;
begin
if FElementDefs = nil then
FElementDefs := TDOMNamedNodeMap.Create(Self, ELEMENT_NODE);
Result := FElementDefs;
end;
function TDOMDocumentTypeEx.GetPEMap: TDOMNamedNodeMap;
begin
if FPEMap = nil then
FPEMap := TDOMNamedNodeMap.Create(Self, ENTITY_NODE);
Result := FPEMap;
end;
// TODO: These classes still cannot be considered as the final solution...
{ TXMLInputSource }
constructor TXMLInputSource.Create(const AData: WideString);
begin
inherited Create;
FBuf := PChar(PWideChar(AData));
FBufEnd := FBuf + Length(AData) * sizeof(WideChar);
end;
procedure TXMLInputSource.Initialize;
begin
FLine := 1;
FColumn := 0;
end;
function TXMLInputSource.NextChar: WideChar;
begin
if FSurrogate <> #0 then
begin
Result := FSurrogate;
FSurrogate := #0;
end
else if FBufEnd <= FBuf then
begin
Result := #0;
FEof := True;
end
else
begin
Result := PWideChar(FBuf)^;
Inc(FBuf, sizeof(WideChar));
end;
// TODO: Column counting - surrogate pair is a single char!
if Result = #10 then
begin
Inc(FLine);
FColumn := 0;
end
else
Inc(FColumn);
end;
procedure TXMLDecodingSource.DecodingError(const Msg: string);
begin
FReader.RaiseExc(Msg);
end;
procedure TXMLDecodingSource.DecodingError(const Msg: string;
const Args: array of const);
begin
FReader.RaiseExc(Msg, Args);
end;
procedure TXMLInputSource.FetchData;
begin
FEof := True;
end;
procedure TXMLInputSource.SetEncoding(const AEncoding: string);
begin
// do nothing
end;
function TXMLInputSource.GetPublicID: WideString;
begin
if FPublicID <> '' then
Result := FPublicID
else if Assigned(FParent) then
Result := FParent.PublicID
else
Result := '';
end;
function TXMLInputSource.GetSystemID: WideString;
begin
if FSystemID <> '' then
Result := FSystemID
else if Assigned(FParent) then
Result := FParent.SystemID
else
Result := '';
end;
{ TXMLDecodingSource }
destructor TXMLDecodingSource.Destroy;
begin
FDecoder.Free;
inherited Destroy;
end;
function TXMLDecodingSource.InternalNextChar: WideChar;
begin
// TODO: find a place for it, finally...
if FSurrogate <> #0 then
begin
Result := FSurrogate;
FSurrogate := #0;
Exit;
end;
if FBufEnd <= FBuf then
FetchData;
if not FEof then
Result := FDecoder.DecodeNext
else
Result := #0;
end;
function TXMLDecodingSource.NextChar: WideChar;
begin
Result := InternalNextChar;
if FSeenCR then
begin
if (Result = #10) or ((Result = #$85) and FReader.FXML11) then
Result := InternalNextChar;
FSeenCR := False;
end;
case Result of
#13: begin
FSeenCR := True;
Result := #10;
end;
#$85, #$2028:
if FReader.FXML11 then
Result := #10;
end;
if (Result < #256) and (char(Result) in FReader.FForbiddenAscii) or
((ord(Result) or 1) = $FFFF) then
DecodingError('Invalid character');
// TODO: Column counting - surrogate pair is a single char!
if Result = #10 then
begin
Inc(FLine);
FColumn := 0;
end
else
Inc(FColumn);
end;
procedure TXMLDecodingSource.Initialize;
begin
inherited;
if FBufEnd-FBuf > 1 then
repeat
if (FBuf[0] = #$FE) and (FBuf[1] = #$FF) then // BE
begin
FDecoder := TUCS2Decoder.Create(Self);
TUCS2Decoder(FDecoder).FEncoding := 'UTF-16BE';
{$IFNDEF ENDIAN_BIG}
TUCS2Decoder(FDecoder).FSwapEndian := True;
{$ENDIF}
Exit;
end
else if (FBuf[0] = #$FF) and (FBuf[1] = #$FE) then // LE
begin
FDecoder := TUCS2Decoder.Create(Self);
TUCS2Decoder(FDecoder).FEncoding := 'UTF-16LE';
{$IFDEF ENDIAN_BIG}
TUCS2Decoder(FDecoder).FSwapEndian := True;
{$ENDIF}
Exit;
end
else
Break;
until False;
FDecoder := TUTF8Decoder.Create(Self);
end;
procedure TXMLDecodingSource.SetEncoding(const AEncoding: string);
var
NewDecoder: TDecoderRef;
begin
if FDecoder.Supports(AEncoding) then // no change needed
Exit;
// hardcoded stuff - special case of UCS2
if FDecoder is TUCS2Decoder then
begin
// check for 'UTF-16LE' or 'UTF-16BE'
if SameText(AEncoding, TUCS2Decoder(FDecoder).FEncoding) then
Exit
else
DecodingError('Current encoding cannot be switched to ''%s''', [AEncoding]);
end;
NewDecoder := FindDecoder(AEncoding);
if Assigned(NewDecoder) then
begin
FDecoder.Free;
FDecoder := NewDecoder.Create(Self);
end
else
DecodingError('Encoding ''%s'' is not supported', [AEncoding]);
end;
{ TXMLStreamInputSource }
constructor TXMLStreamInputSource.Create(AStream: TStream; AOwnStream: Boolean);
begin
FStream := AStream;
FBufSize := 4096;
GetMem(FAllocated, FBufSize+8);
FBuf := FAllocated+8;
FBufEnd := FBuf;
FOwnStream := AOwnStream;
FetchData;
end;
destructor TXMLStreamInputSource.Destroy;
begin
FreeMem(FAllocated);
if FOwnStream then
FStream.Free;
inherited Destroy;
end;
procedure TXMLStreamInputSource.FetchData;
var
Remainder, BytesRead: Integer;
OldBuf: PChar;
begin
Assert(FBufEnd - FBuf < 8);
OldBuf := FBuf;
Remainder := FBufEnd - FBuf;
FBuf := FAllocated+8-Remainder;
Move(OldBuf^, FBuf^, Remainder);
BytesRead := FStream.Read(FAllocated[8], FBufSize);
if BytesRead = 0 then
FEof := True;
FBufEnd := FAllocated + 8 + BytesRead;
end;
{ TXMLFileInputSource }
constructor TXMLFileInputSource.Create(var AFile: Text);
begin
FFile := @AFile;
ReadLn(FFile^, FString);
FBuf := PChar(FString);
FBufEnd := FBuf + Length(FString);
end;
procedure TXMLFileInputSource.FetchData;
begin
FEof := Eof(FFile^);
if not FEof then
begin
ReadLn(FFile^, FString);
FString := FString + #10; // bad solution...
FBuf := PChar(FString);
FBufEnd := FBuf + Length(FString);
end;
end;
{ TDecoder }
constructor TDecoder.Create(ASource: TXMLDecodingSource);
begin
inherited Create;
FSource := ASource;
end;
{ TISO8859_1Decoder}
function TISO8859_1Decoder.DecodeNext: WideChar;
begin
with FSource do
begin
Result := WideChar(FBuf[0]);
Inc(FBuf);
end;
end;
class function TISO8859_1Decoder.Supports(const AEncoding: string): Boolean;
begin
Result := SameText(AEncoding, 'ISO-8859-1') or
SameText(AEncoding, 'ISO_8859-1') or
SameText(AEncoding, 'latin1') or
SameText(AEncoding, 'iso-ir-100') or
SameText(AEncoding, 'l1') or
SameText(AEncoding, 'IBM819') or
SameText(AEncoding, 'CP819') or
SameText(AEncoding, 'csISOLatin1') or
// This one is not in character-sets.txt, but used in most FPC documentation...
SameText(AEncoding, 'ISO8859-1');
end;
{ TUCS2Decoder }
function TUCS2Decoder.DecodeNext: WideChar;
begin
with FSource do
begin
Result := PWideChar(FBuf)^;
Inc(FBuf, sizeof(WideChar));
end;
if FSwapEndian then
Result := WideChar(Swap(Word(Result)));
end;
class function TUCS2Decoder.Supports(const AEncoding: string): Boolean;
begin
// generic aliases for both LE and BE
Result := SameText(AEncoding, 'UTF-16') or
SameText(AEncoding, 'unicode');
end;
{ TUTF8Decoder }
function TUTF8Decoder.DecodeNext: WideChar;
const
MaxCode: array[0..3] of Cardinal = ($7F, $7FF, $FFFF, $1FFFFF);
var
Value: Cardinal;
I, bc: Integer;
begin
with FSource do
begin
Result := WideChar(FBuf[0]);
Inc(FBuf);
if Result < #$80 then
Exit;
if Byte(Result) and $40 = 0 then
DecodingError('Invalid UTF8 sequence start byte');
bc := 1;
if Byte(Result) and $20 <> 0 then
begin
Inc(bc);
if Byte(Result) and $10 <> 0 then
begin
Inc(bc);
if Byte(Result) and $8 <> 0 then
DecodingError('UCS4 character out of supported range');
end;
end;
// DONE: (?) check that bc bytes available
if FBufEnd-FBuf < bc then
FetchData;
Value := Byte(Result);
I := bc; // note: I is never zero
while bc > 0 do
begin
if ord(FBuf[0]) and $C0 <> $80 then
DecodingError('Invalid byte in UTF8 sequence');
Value := (Value shl 6) or (Cardinal(FBuf[0]) and $3F);
Inc(FBuf);
Dec(bc);
end;
Value := Value and MaxCode[I];
// RFC2279 check
if Value <= MaxCode[I-1] then
DecodingError('Invalid UTF8 sequence');
case Value of
0..$D7FF, $E000..$FFFF:
begin
Result := WideChar(Value);
Exit;
end;
$10000..$10FFFF:
begin
Result := WideChar($D7C0 + (Value shr 10));
FSurrogate := WideChar($DC00 xor (Value and $3FF));
Exit;
end;
end;
DecodingError('UCS4 character out of supported range');
end;
end;
class function TUTF8Decoder.Supports(const AEncoding: string): Boolean;
begin
Result := SameText(AEncoding, 'UTF-8');
end;
{ TXMLReader }
function TXMLReader.ResolveEntity(const SystemID, PublicID: WideString; out Source: TXMLInputSource): Boolean;
var
AbsSysID: WideString;
Filename: string;
Stream: TStream;
begin
Result := False;
if ResolveRelativeURI(FSource.SystemID, SystemID, AbsSysID) then
begin
Source := nil;
// TODO: alternative resolvers
if URIToFilename(AbsSysID, Filename) then
begin
try
Stream := TFileStream.Create(Filename, fmOpenRead + fmShareDenyWrite);
Source := TXMLStreamInputSource.Create(Stream, True);
Source.SystemID := AbsSysID;
Source.PublicID := PublicID;
Result := True;
except
on E: Exception do
ValidationError('%s', [E.Message]);
end;
end;
end;
end;
procedure TXMLReader.InitializeRoot(ASource: TXMLInputSource);
begin
Initialize(ASource);
GetChar;
// TODO: presence of BOM must prevent UTF-8 encoding from being changed
CheckForChar(#$FEFF); // skip BOM, if one is present
end;
procedure TXMLReader.Initialize(ASource: TXMLInputSource);
begin
FSource := ASource;
FSource.FReader := Self;
FSource.Initialize;
end;
procedure TXMLReader.GetCharRaw;
begin
FCurChar := FSource.NextChar;
FWhitespace := (FCurChar = #32) or (FCurChar = #10) or
(FCurChar = #9) or (FCurChar = #13);
// Used for handling the internal DTD subset
if Assigned(FCopyBuf) and (FSource.FParent = nil) then
BufAppend(FCopyBuf^, FCurChar);
end;
procedure TXMLReader.GetChar;
begin
GetCharRaw;
if not FRecognizePE then
Exit;
if (FCurChar = #0) and ContextPop then
begin
Unget(FCurChar);
FCurChar := #32;
FWhitespace := True;
end
else if FCurChar = '%' then
begin
FCurChar := FSource.NextChar;
if not CheckName then
begin
Unget(FCurChar);
FCurChar := '%';
Exit;
end;
if FCurChar = ';' then // "%pe1;%pe2" - must not recognize pe2 immediately!
GetCharRaw
else
RaiseExc(WideChar(';'));
StartPE;
FCurChar := #32;
FWhitespace := True;
end;
end;
procedure TXMLReader.Unget(wc: WideChar);
begin
FSource.FSurrogate := wc;
end;
procedure TXMLReader.RaiseExpectedQmark;
begin
RaiseExc('Expected single or double quote');
end;
procedure TXMLReader.RaiseExc(Expected: WideChar);
begin
// FIX: don't output what is found - anything may be found, including exploits...
RaiseExc('Expected "%1s"', [string(Expected)]);
end;
procedure TXMLReader.RaiseExc(const descr: String);
begin
raise EXMLReadError.CreateFmt('In ''%s'' (line %d pos %d): %s', [FSource.SystemID, FSource.FLine, FSource.FColumn, descr]);
end;
procedure TXMLReader.RaiseExc(const descr: string; const args: array of const);
begin
RaiseExc(Format(descr, args));
end;
function TXMLReader.SkipWhitespace: Boolean;
begin
Result := False;
while FWhitespace do
begin
GetChar;
Result := True;
end;
end;
procedure TXMLReader.ExpectWhitespace;
begin
if not SkipWhitespace then
RaiseExc('Expected whitespace');
end;
procedure TXMLReader.ExpectChar(wc: WideChar);
begin
if FCurChar = wc then
GetChar
else
RaiseExc(wc);
end;
procedure TXMLReader.ExpectString(const s: String);
var
I: Integer;
begin
for I := 1 to Length(s) do
begin
if FCurChar <> WideChar(s[i]) then
RaiseExc('Expected "%s"', [s]);
GetChar;
end;
end;
function TXMLReader.CheckForChar(c: WideChar): Boolean;
begin
Result := (FCurChar = c);
if Result then
GetChar;
end;
procedure TXMLReader.SkipString(const ValidChars: TSetOfChar);
begin
FValue.Length := 0;
while (ord(FCurChar) < 256) and (char(FCurChar) in ValidChars) do
begin
BufAppend(FValue, FCurChar);
GetChar;
end;
end;
function TXMLReader.GetString(const ValidChars: TSetOfChar): WideString;
begin
SkipString(ValidChars);
SetString(Result, FValue.Buffer, FValue.Length);
end;
constructor TXMLReader.Create;
begin
inherited Create;
// Naming bitmap: Point to static data for XML 1.0,
// and allocate buffer in XML11_BuildTables when necessary.
FNamePages := @NamePages;
BufAllocate(FName, 128);
BufAllocate(FValue, 512);
FForbiddenAscii := [#1..#8, #11..#12, #14..#31];
// TODO: put under user control
FPreserveWhitespace := True;
FCreateEntityRefs := True;
end;
destructor TXMLReader.Destroy;
begin
if FXML11 then
FreeMem(FNamePages);
FreeMem(FName.Buffer);
FreeMem(FValue.Buffer);
while ContextPop do; // clean input stack
FSource.Free;
inherited Destroy;
end;
procedure TXMLReader.XML11_BuildTables;
var
I: Integer;
begin
if not FXML11 then
GetMem(FNamePages, 512);
FXML11 := True;
for I := 0 to 255 do
FNamePages^[I] := ord(Byte(I) in Xml11HighPages);
FNamePages^[0] := 2;
FNamePages^[3] := $2c;
FNamePages^[$20] := $2a;
FNamePages^[$21] := $2b;
FNamePages^[$2f] := $29;
FNamePages^[$30] := $2d;
FNamePages^[$fd] := $28;
Move(FNamePages^, FNamePages^[256], 256);
FNamePages^[$100] := $19;
FNamePages^[$103] := $2E;
FNamePages^[$120] := $2F;
FForbiddenAscii := [#1..#8, #11..#12, #14..#31, #$7F..#$84, #$86..#$9F];
end;
procedure TXMLReader.ProcessXML(ASource: TXMLInputSource);
begin
doc := TXMLDocument.Create;
FCursor := doc;
InitializeRoot(ASource);
FAllowedDecl := dtXml;
ParseMisc;
FDtdParsed := True;
if FDocType = nil then
ValidationError('Missing DTD', []);
if CheckName then
ParseElement
else
RaiseExc('Expected element');
ParseMisc;
if Assigned(FDocType) and (doc.DocumentElement.TagName <> FDocType.Name) then
ValidationError('DTD name does not match root element', []);
if FCurChar <> #0 then
RaiseExc('Text after end of document element found');
end;
procedure TXMLReader.ProcessFragment(ASource: TXMLInputSource; AOwner: TDOMNode);
begin
doc := AOwner.OwnerDocument;
FCursor := AOwner;
InitializeRoot(ASource);
FXML11 := doc.InheritsFrom(TXMLDocument) and (TXMLDocument(doc).XMLVersion = '1.1');
FAllowedDecl := dtText;
DoParseFragment;
end;
// XML 1.1 allowed range $10000..$EFFFF is [D800..DB7F] followed by [DC00..DFFF]
function TXMLReader.XML11_CheckName: Boolean;
begin
if (FCurChar >= #$D800) and (FCurChar <= #$DB7F) then
begin
BufAppend(FName, FCurChar);
GetCharRaw;
Result := (FCurChar >= #$DC00) and (FCurChar <= #$DFFF);
end
else
Result := False;
end;
function TXMLReader.CheckName: Boolean;
begin
FName.Length := 0;
Result := (Byte(FCurChar) in NamingBitmap[FNamePages^[hi(Word(FCurChar))]]) or
(FXML11 and XML11_CheckName);
if Result then
repeat
BufAppend(FName, FCurChar);
GetChar;
until not ((Byte(FCurChar) in NamingBitmap[FNamePages^[$100+hi(Word(FCurChar))]]) or
(FXML11 and XML11_CheckName));
end;
function TXMLReader.CheckNmToken: Boolean;
begin
FName.Length := 0;
Result := False;
while (Byte(FCurChar) in NamingBitmap[FNamePages^[$100+hi(Word(FCurChar))]]) or
(FXML11 and XML11_CheckName) do
begin
BufAppend(FName, FCurChar);
GetChar;
Result := True;
end;
end;
procedure TXMLReader.RaiseNameNotFound;
begin
RaiseExc('Name starts with invalid character');
end;
function TXMLReader.ExpectName: WideString;
begin
if not CheckName then
RaiseNameNotFound;
SetString(Result, FName.Buffer, FName.Length);
end;
procedure TXMLReader.SkipName;
begin
if not CheckName then
RaiseNameNotFound;
end;
function TXMLReader.ResolvePredefined(const RefName: WideString): WideChar;
begin
if RefName = 'amp' then
Result := '&'
else if RefName = 'apos' then
Result := ''''
else if RefName = 'gt' then
Result := '>'
else if RefName = 'lt' then
Result := '<'
else if RefName = 'quot' then
Result := '"'
else
Result := #0;
end;
function TXMLReader.ParseCharRef: Boolean; // [66]
var
Value: Integer;
begin
Result := FCurChar = '#';
if Result then
begin
GetCharRaw;
Value := 0;
if CheckForChar('x') then
repeat
case FCurChar of
'0'..'9': Value := Value * 16 + Ord(FCurChar) - Ord('0');
'a'..'f': Value := Value * 16 + Ord(FCurChar) - (Ord('a') - 10);
'A'..'F': Value := Value * 16 + Ord(FCurChar) - (Ord('A') - 10);
else
Break;
end;
GetCharRaw;
until False
else
repeat
case FCurChar of
'0'..'9': Value := Value * 10 + Ord(FCurChar) - Ord('0');
else
Break;
end;
GetCharRaw;
until False;
ExpectChar(';');
case Value of
$01..$08, $0B..$0C, $0E..$1F:
if FXML11 then
BufAppend(FValue, WideChar(Value))
else
RaiseExc('Invalid character reference');
$09, $0A, $0D, $20..$D7FF, $E000..$FFFD:
BufAppend(FValue, WideChar(Value));
$10000..$10FFFF:
begin
BufAppend(FValue, WideChar($D7C0 + (Value shr 10)));
BufAppend(FValue, WideChar($DC00 xor (Value and $3FF)));
end;
else
RaiseExc('Invalid character reference');
end;
end;
end;
procedure TXMLReader.DoParseAttValue(Delim: WideChar);
var
RefNode: TDOMEntityEx;
begin
FValue.Length := 0;
while (FCurChar <> Delim) and (FCurChar <> #0) do
begin
if FCurChar = '<' then
RaiseExc('Literal "<" in attribute value')
else if FCurChar <> '&' then
begin
if FWhitespace then
FCurChar := #32;
BufAppend(FValue, FCurChar);
GetCharRaw;
end
else
begin
GetCharRaw; // skip '&'
if ParseCharRef then
Continue;
RefNode := ParseReference;
if Assigned(RefNode) then
begin
if FValue.Length > 0 then
begin
FCursor.AppendChild(doc.CreateTextNodeBuf(FValue.Buffer, FValue.Length));
FValue.Length := 0;
end;
if RefNode.SystemID <> '' then
RaiseExc('External entity reference is not allowed in attribute value');
IncludeEntity(RefNode, True);
end;
end;
end; // while
if FValue.Length > 0 then
begin
FCursor.AppendChild(doc.CreateTextNodeBuf(FValue.Buffer, FValue.Length));
FValue.Length := 0;
end;
end;
procedure TXMLReader.DoParseFragment;
begin
ParseContent;
if FCurChar <> #0 then
RaiseExc('Closing tag not allowed here');
end;
function TXMLReader.ContextPush(AEntity: TDOMEntityEx): Boolean;
var
Src: TXMLInputSource;
begin
if AEntity.SystemID <> '' then
begin
Result := ResolveEntity(AEntity.SystemID, AEntity.PublicID, Src);
if not Result then
Exit;
{
TODO: need different handling of TextDecl in external PEs
it cannot be parsed if PE is referenced INSIDE declaration
But - is such case ever met in the wild ?? E.g. MSXML fails such things...
}
FAllowedDecl := dtText;
end
else
Src := TXMLInputSource.Create(AEntity.FReplacementText);
AEntity.FOnStack := True;
Src.FEntity := AEntity;
Src.FParent := FSource;
Src.FCursor := FCursor;
Unget(FCurChar); // remember FCurChar in previous context
Inc(FEntityLevel);
Initialize(Src);
Result := True;
end;
function TXMLReader.ContextPop: Boolean;
var
Src: TXMLInputSource;
begin
Result := Assigned(FSource.FParent);
if Result then
begin
Src := FSource.FParent;
if Assigned(FSource.FEntity) then
TDOMEntityEx(FSource.FEntity).FOnStack := False;
FCursor := TDOMNode(FSource.FCursor);
FSource.Free;
FSource := Src;
Dec(FEntityLevel);
GetChar; // re-classify - case of "%pe1;%pe2;"
end;
end;
procedure TXMLReader.IncludeEntity(AEntity: TDOMEntityEx; InAttr: Boolean);
var
Node, Child: TDOMNode;
begin
if not AEntity.FResolved then
begin
if AEntity.FOnStack then
RaiseExc('Entity ''%s'' recursively references itself', [AEntity.NodeName]);
if ContextPush(AEntity) then
begin
GetCharRaw;
CheckForChar(#$FEFF);
FCursor := AEntity; // build child node tree for the entity
try
if InAttr then
DoParseAttValue(#0)
else
DoParseFragment;
AEntity.FResolved := True;
finally
ContextPop; // FCursor restored
FValue.Length := 0;
end;
end;
end;
Node := FCursor;
if FCreateEntityRefs or (not AEntity.FResolved) then
begin
Node := doc.CreateEntityReference(AEntity.NodeName);
FCursor.AppendChild(Node);
end;
Child := AEntity.FirstChild; // clone the entity node tree
while Assigned(Child) do
begin
Node.AppendChild(Child.CloneNode(True));
Child := Child.NextSibling;
end;
end;
procedure TXMLReader.StartPE;
var
PEName: WideString;
PEnt: TDOMEntityEx;
begin
SetString(PEName, FName.Buffer, FName.Length);
PEnt := FDocType.PEMap.GetNamedItem(PEName) as TDOMEntityEx;
if PEnt = nil then // TODO -cVC: Referencing undefined PE
begin // (These are classified as 'optional errors'...)
// ValidationError('Undefined parameter entity referenced: %s', [PEName]);
Exit;
end;
if PEnt.FOnStack then
RaiseExc('Entity ''%%%s'' recursively references itself', [PEnt.NodeName]);
ContextPush(PEnt);
end;
function TXMLReader.ParseReference: TDOMEntityEx;
var
RefName: WideString;
Predef: WideChar;
begin
Result := nil;
RefName := ExpectName;
ExpectChar(';');
Predef := ResolvePredefined(RefName);
if Predef <> #0 then
BufAppend(FValue, Predef)
else
begin
if Assigned(FDocType) then
Result := FDocType.Entities.GetNamedItem(RefName) as TDOMEntityEx;
if Result = nil then
begin
if FStandalone or (FDocType = nil) or not (FDocType.HasPERefs or (FDocType.SystemID <> '')) then
RaiseExc('Undefined entity ''%s'' referenced', [RefName])
else
ValidationError('Undefined entity ''%s'' referenced', [RefName]);
end
else
begin
if FStandalone and (not Result.FInternal) then
RaiseExc('Standalone constraint violation');
if Result.NotationName <> '' then
RaiseExc('Reference to unparsed entity ''%s''', [RefName]);
end;
end;
end;
procedure TXMLReader.ProcessTextAndRefs;
var
nonWs: Boolean;
last: WideChar;
RefNode: TDOMEntityEx;
begin
FValue.Length := 0;
nonWs := False;
FAllowedDecl := dtNone;
while (FCurChar <> '<') and (FCurChar <> #0) do
begin
if FCurChar <> '&' then
begin
if not FWhitespace then
nonWs := True;
BufAppend(FValue, FCurChar);
if FCurChar = '>' then
with FValue do
if (Length >= 3) and
(Buffer[Length-2] = ']') and (Buffer[Length-3] = ']') then
RaiseExc('Literal '']]>'' is not allowed in text');
GetCharRaw;
end
else
begin
GetCharRaw; // skip '&'
if ParseCharRef then
begin
last := FValue.Buffer[FValue.Length-1];
if (last <> #9) and (last <> #10) and (last <> #13) and (last <> #32) then
nonWs := True;
Continue;
end;
nonWs := True;
RefNode := ParseReference;
if Assigned(RefNode) then
begin
if (nonWs or FPreserveWhitespace) and (FValue.Length > 0) then
begin
FCursor.AppendChild(doc.CreateTextNodeBuf(FValue.Buffer, FValue.Length));
FValue.Length := 0;
nonWs := False;
end;
IncludeEntity(RefNode, False);
end;
end;
end; // while
if (nonWs or FPreserveWhitespace) and (FValue.Length > 0) then
begin
FCursor.AppendChild(doc.CreateTextNodeBuf(FValue.Buffer, FValue.Length));
FValue.Length := 0;
end;
end;
procedure TXMLReader.ExpectAttValue; // [10]
var
Delim: WideChar;
begin
if (FCurChar <> '''') and (FCurChar <> '"') then
RaiseExpectedQmark;
Delim := FCurChar;
GetCharRaw; // skip quote
DoParseAttValue(Delim);
GetChar; // NOTE: not GetCharRaw - when parsing AttDef in DTD,
// immediately following PERef must be recognized
end;
function TXMLReader.SkipQuotedLiteral: Boolean;
var
Delim: WideChar;
begin
Result := (FCurChar = '''') or (FCurChar = '"');
if Result then
begin
Delim := FCurChar;
GetCharRaw; // skip quote
FValue.Length := 0;
while (FCurChar <> Delim) and (FCurChar <> #0) do
begin
BufAppend(FValue, FCurChar);
GetCharRaw;
end;
ExpectChar(Delim); // <-- to check the EOF only
end;
end;
procedure TXMLReader.SkipPubidLiteral; // [12]
var
I: Integer;
begin
if SkipQuotedLiteral then
begin
for I := 0 to FValue.Length-1 do
if (FValue.Buffer[I] > #255) or not (Char(FValue.Buffer[I]) in PubidChars) then
RaiseExc('Illegal Public ID literal')
end
else
RaiseExpectedQMark;
end;
procedure TXMLReader.SkipSystemLiteral(out Literal: WideString; Required: Boolean);
begin
if SkipQuotedLiteral then
SetString(Literal, FValue.Buffer, FValue.Length)
else if Required then
RaiseExpectedQMark;
end;
procedure TXMLReader.ParseComment; // [15]
begin
ExpectString('--');
FValue.Length := 0;
repeat
BufAppend(FValue, FCurChar);
GetCharRaw;
with FValue do
if (Length >= 2) and (Buffer[Length-1] = '-') and
(Buffer[Length-2] = '-') then
begin
Dec(Length, 2);
if Assigned(FCursor) then
FCursor.AppendChild(doc.CreateCommentBuf(Buffer, Length));
ExpectChar('>');
Exit;
end;
until FCurChar = #0;
RaiseExc('Unterminated comment');
end;
procedure TXMLReader.ParsePI; // [16]
var
Name, Value: WideString;
begin
GetCharRaw; // skip '?'
Name := ExpectName;
with FName do
if (Length = 3) and
((Buffer[0] = 'X') or (Buffer[0] = 'x')) and
((Buffer[1] = 'M') or (Buffer[1] = 'm')) and
((Buffer[2] = 'L') or (Buffer[2] = 'l')) then
begin
if Name <> 'xml' then
RaiseExc('''xml'' is a reserved word; it must be lowercase');
if FAllowedDecl <> dtNone then
begin
ParseXmlOrTextDecl(FAllowedDecl = dtText);
FAllowedDecl := dtNone;
Exit;
end
else
RaiseExc('XML declaration not allowed here');
end;
if FCurChar <> '?' then
ExpectWhitespace;
FAllowedDecl := dtNone;
FValue.Length := 0;
repeat
BufAppend(FValue, FCurChar);
GetCharRaw;
with FValue do
if (Length >= 2) and (Buffer[Length-1] = '>') and
(Buffer[Length-2] = '?') then
begin
Dec(Length, 2);
SetString(Value, Buffer, Length);
if Assigned(FCursor) then
FCursor.AppendChild(Doc.CreateProcessingInstruction(Name, Value));
Exit;
end;
until FCurChar = #0;
RaiseExc('Unterminated processing instruction');
end;
// here we come from ParsePI, 'xml' is already consumed
procedure TXMLReader.ParseXmlOrTextDecl(TextDecl: Boolean);
var
TmpStr: WideString;
IsXML11: Boolean;
begin
ExpectWhitespace;
// VersionInfo: optional in TextDecl, required in XmlDecl
if (not TextDecl) or (FCurChar = 'v') then
begin
ExpectString('version'); // [24]
ExpectEq;
SkipSystemLiteral(TmpStr, True);
IsXML11 := False;
if TmpStr = '1.1' then // Checking for bad chars is implied
IsXML11 := True
else if TmpStr <> '1.0' then
RaiseExc('Illegal version number');
if not TextDecl then
begin
if doc.InheritsFrom(TXMLDocument) then
TXMLDocument(doc).XMLVersion := TmpStr;
if IsXML11 then
XML11_BuildTables;
end
else // parsing external entity
if IsXML11 and not FXML11 then
RaiseExc('XML 1.0 document cannot invoke XML 1.1 entities');
if FCurChar <> '?' then
ExpectWhitespace;
end;
// EncodingDecl: required in TextDecl, optional in XmlDecl
if TextDecl or (FCurChar = 'e') then // [80]
begin
ExpectString('encoding');
ExpectEq;
SkipSystemLiteral(TmpStr, True);
if not IsValidEncName(TmpStr) then
RaiseExc('Illegal encoding name');
FSource.SetEncoding(TmpStr); // <-- Wide2Ansi conversion here
// getting here means that specified encoding is supported
// TODO: maybe assign the 'preferred' encoding name?
if not TextDecl and doc.InheritsFrom(TXMLDocument) then
TXMLDocument(doc).Encoding := TmpStr;
if FCurChar <> '?' then
ExpectWhitespace;
end;
// SDDecl: forbidden in TextDecl, optional in XmlDecl
if (not TextDecl) and (FCurChar = 's') then
begin
ExpectString('standalone');
ExpectEq;
SkipSystemLiteral(TmpStr, True);
if TmpStr = 'yes' then
FStandalone := True
else if TmpStr <> 'no' then
RaiseExc('Only "yes" or "no" are permitted as values of "standalone"');
SkipWhitespace;
end;
ExpectString('?>');
end;
procedure TXMLReader.ParseDoctypeDecl; // [28]
var
IntSubset: TWideCharBuf;
Src, OldSrc: TXMLInputSource;
begin
FAllowedDecl := dtNone;
if FDtdParsed then
RaiseExc('Markup declaration not allowed here');
ExpectString('DOCTYPE'); // gives possibly incorrect error message
ExpectWhitespace;
FDocType := TDOMDocumentTypeEx.Create(doc);
FDtdParsed := True;
{ To comply with certain output tests, we must insert PIs coming from internal
subset before DocType node. This looks very synthetic, but let it be...
Moreover, this code actually duplicates such PIs }
try
FDocType.FName := ExpectName;
ExpectWhitespace;
ParseExternalID(FDocType.FSystemID, FDocType.FPublicID, False);
SkipWhitespace;
if FCurChar = '[' then
begin
BufAllocate(IntSubset, 256);
FCopyBuf := @IntSubset;
GetChar; // cause very first char after '[' to be appended
try
FIntSubset := True;
ParseMarkupDecl;
if IntSubset.Length > 0 then // sanity check - must at least contain ']'
SetString(FDocType.FInternalSubset, IntSubset.Buffer, IntSubset.Length-1);
ExpectChar(']');
finally
FIntSubset := False;
FCopyBuf := nil;
FreeMem(IntSubset.Buffer);
end;
SkipWhitespace;
end;
ExpectChar('>');
if FDocType.SystemID <> '' then
begin
if ResolveEntity(FDocType.SystemID, FDocType.PublicID, Src) then
begin
OldSrc := FSource;
Unget(FCurChar);
FCursor := nil;
try
DoParseExtSubset(Src);
finally
while ContextPop do; // Cleanup after possible exceptions
FSource.Free;
FSource := OldSrc;
GetChar;
FCursor := Doc;
end;
end;
end;
finally
doc.AppendChild(FDocType);
end;
end;
procedure TXMLReader.ParseMisc;
begin
repeat
if SkipWhitespace then
FAllowedDecl := dtNone;
if not CheckForChar('<') then
Break;
if CheckForChar('!') then
begin
FAllowedDecl := dtNone;
if FCurChar = '-' then
ParseComment
else
ParseDoctypeDecl;
end
else
if FCurChar = '?' then
ParsePI
else
Break;
until FCurChar = #0;
FAllowedDecl := dtNone;
end;
function TXMLReader.ParseEq: Boolean; // [25]
begin
while FWhitespace do GetCharRaw;
Result := FCurChar = '=';
if Result then
begin
GetCharRaw;
while FWhitespace do GetCharRaw;
end;
end;
procedure TXMLReader.ExpectEq;
begin
if not ParseEq then
RaiseExc('Expected "="');
end;
{ DTD stuff }
procedure TXMLReader.AssertPENesting(CurrentLevel: Integer);
begin
if CurrentLevel <> FEntityLevel then
ValidationError('Parameter entities must be properly nested', []);
end;
// content model
type
TElementContentType = (
ctEmpty,
ctAny,
ctMixed,
ctName,
ctChoice,
ctSeq
);
TElementContentQuant = (
cqNone,
cqOpt,
cqReq,
cqPlus
);
{
TElementContent = record
ContentType: TElementContentType;
ContentQuant: TElementContentQuant;
Name: WideString;
Children: array of TElementContent;
end;
}
procedure TXMLReader.ExpectChoiceOrSeq(); // [49], [50]
var
Delim: WideChar;
PELevel: Integer;
begin
Delim := #0;
repeat
SkipWhitespace;
if FCurChar = '(' then
begin
PELevel := FEntityLevel;
GetChar;
ExpectChoiceOrSeq;
AssertPENesting(PELevel);
GetChar;
end
else
SkipName;
if CheckForChar('?') then
else if CheckForChar('*') then
else if CheckForChar('+') then;
SkipWhitespace;
if FCurChar = ')' then
Break;
if Delim = #0 then
begin
if (FCurChar = '|') or (FCurChar = ',') then
Delim := FCurChar
else
RaiseExc('Expected "|" or ","');
end
else
if FCurChar <> Delim then
RaiseExc(Delim);
GetChar; // skip delimiter
until False;
end;
procedure TXMLReader.ParseMixedOrChildren;
var
PELevel: Integer;
NeedAsterisk: Boolean;
begin
PELevel := FEntityLevel;
GetChar; // starting bracket
SkipWhitespace;
if CheckForChar('#') then // Mixed section [51]
begin
ExpectString('PCDATA');
SkipWhitespace;
NeedAsterisk := False;
while FCurChar <> ')' do
begin
ExpectChar('|');
NeedAsterisk := True;
SkipWhitespace;
SkipName;
SkipWhitespace;
end;
AssertPENesting(PELevel);
GetChar;
if NeedAsterisk then
ExpectChar('*')
else
CheckForChar('*');
end
else // Parse Children section [47]
begin
ExpectChoiceOrSeq;
AssertPENesting(PELevel);
GetChar;
if CheckForChar('?') then
else if CheckForChar('*') then
else if CheckForChar('+') then;
end;
end;
procedure TXMLReader.ParseElementDecl; // [45]
begin
SkipName;
ExpectWhitespace;
// Get contentspec [46]
if FCurChar = 'E' then
ExpectString('EMPTY')
else if FCurChar = 'A' then
ExpectString('ANY')
else if FCurChar = '(' then
ParseMixedOrChildren
else
RaiseExc('Invalid content specification');
end;
procedure TXMLReader.ParseNotationDecl; // [82]
var
Notation: TDOMNotationEx;
begin
Notation := TDOMNotationEx(TDOMNotation.Create(Doc));
try
Notation.FName := ExpectName;
ExpectWhitespace;
if not ParseExternalID(Notation.FSystemID, Notation.FPublicID, True) then
RaiseExc('Expected external or public ID');
except
Notation.Free;
raise;
end;
if FDocType.Notations.GetNamedItem(Notation.FName) = nil then
FDocType.Notations.SetNamedItem(Notation)
else
begin
ValidationError('Duplicate notation declaration: %s', [Notation.FName]);
Notation.Free;
end;
end;
procedure TXMLReader.ParseAttlistDecl; // [52]
var
SaveCurNode: TDOMNode;
ValueRequired: Boolean;
Token: WideString;
ElDef: TDOMElementDef;
AttDef: TDOMAttrDef;
begin
Token := ExpectName;
ElDef := TDOMElementDef(FDocType.ElementDefs.GetNamedItem(Token));
if ElDef = nil then
begin
// TODO -cVC: must distinguish ElementDef created here from one explicitly declared
ElDef := TDOMElementDef.Create(doc);
ElDef.FNodeName := Token;
FDocType.ElementDefs.SetNamedItem(ElDef);
end;
SkipWhitespace;
while FCurChar <> '>' do
begin
SkipWhitespace; { !!! }
AttDef := TDOMAttrDef.Create(doc);
try
AttDef.FName := ExpectName;
ExpectWhitespace;
Token := GetString(['A'..'Z']); // Get AttType [54], [55], [56]
if Token = 'CDATA' then
AttDef.FDataType := DT_CDATA
else if Token = 'ID' then
AttDef.FDataType := DT_ID
else if Token = 'IDREF' then
AttDef.FDataType := DT_IDREF
else if Token = 'IDREFS' then
AttDef.FDataType := DT_IDREFS
else if Token = 'ENTITY' then
AttDef.FDataType := DT_ENTITY
else if Token = 'ENTITIES' then
AttDef.FDataType := DT_ENTITIES
else if Token = 'NMTOKEN' then
AttDef.FDataType := DT_NMTOKEN
else if Token = 'NMTOKENS' then
AttDef.FDataType := DT_NMTOKENS
else if Token = 'NOTATION' then // [57], [58]
begin
AttDef.FDataType := DT_NOTATION;
ExpectWhitespace;
ExpectChar('(');
repeat
SkipWhitespace;
SkipName;
SkipWhitespace;
until not CheckForChar('|');
ExpectChar(')');
end
else
if CheckForChar('(') then // [59]
begin
AttDef.FDataType := DT_NMTOKEN;
repeat
SkipWhitespace;
if not CheckNmToken then
RaiseNameNotFound; // not completely correct error message
SkipWhitespace;
until not CheckForChar('|');
ExpectChar(')');
end else
RaiseExc('Invalid tokenized type');
ExpectWhitespace;
// Get DefaultDecl [60]
ValueRequired := False;
if CheckForChar('#') then
begin
Token := GetString(['A'..'Z']);
if Token = 'REQUIRED' then
AttDef.FDefault := AD_REQUIRED
else if Token = 'IMPLIED' then
AttDef.FDefault := AD_IMPLIED
else if Token = 'FIXED' then
begin
AttDef.FDefault := AD_FIXED;
ExpectWhitespace;
ValueRequired := True;
end
else
RaiseExc('Illegal attribute default');
end
else
begin
AttDef.FDefault := AD_DEFAULT;
ValueRequired := True;
end;
if ValueRequired then
begin
SaveCurNode := FCursor;
FCursor := AttDef;
// tricky moment, no tests for that
{ FRecognizePE := False; } // TODO: shall it really be disabled?
try
ExpectAttValue;
finally
FCursor := SaveCurNode;
{ FRecognizePE := not FIntSubset; }
end;
if AttDef.FDataType = DT_ID then
ValidationError('Attributes of type ID must not have a default value',[]);
end;
// First declaration is binding, subsequent should be ignored
if Assigned(ElDef.GetAttributeNode(AttDef.Name)) then
AttDef.Free
else
ElDef.SetAttributeNode(AttDef);
except
AttDef.Free;
raise;
end;
SkipWhitespace;
end;
end;
procedure TXMLReader.ParseEntityDeclValue(Delim: WideChar); // [9]
var
I: Integer;
Src: TXMLInputSource;
begin
Src := FSource;
// "Included in literal": process until delimiter hit IN SAME context
while not ((FSource = Src) and CheckForChar(Delim)) do
if ParsePEReference then
begin
if FIntSubset and (FSource.FParent = nil) then
RaiseExc('PE references in internal subset not allowed inside declarations');
StartPE;
GetCharRaw;
end
else if FCurChar = '&' then // CharRefs: include, EntityRefs: bypass
begin
GetCharRaw;
if not ParseCharRef then
begin
BufAppend(FValue, '&');
ExpectName;
ExpectChar(';');
for I := 0 to FName.Length-1 do
BufAppend(FValue, FName.Buffer[I]);
BufAppend(FValue, ';');
end;
end
else if FCurChar <> #0 then // Regular character
begin
BufAppend(FValue, FCurChar);
GetCharRaw;
end
else if not ContextPop then // #0
Break;
end;
procedure TXMLReader.ParseEntityDecl; // [70]
var
NDataAllowed: Boolean;
Delim: WideChar;
Entity: TDOMEntityEx;
Map: TDOMNamedNodeMap;
begin
NDataAllowed := True;
Map := FDocType.Entities;
if CheckForChar('%') then // [72]
begin
ExpectWhitespace;
NDataAllowed := False;
Map := FDocType.PEMap;
end;
Entity := TDOMEntityEx.Create(Doc);
try
Entity.FInternal := FIntSubset and (FSource.FParent = nil);
Entity.FName := ExpectName;
ExpectWhitespace;
if (FCurChar = '"') or (FCurChar = '''') then
begin
NDataAllowed := False;
Delim := FCurChar;
FRecognizePE := False; // PERef right after delimiter should not be recognized
GetCharRaw; // at char level - we process it 'manually'
FValue.Length := 0;
ParseEntityDeclValue(Delim);
FRecognizePE := not FIntSubset;
SetString(Entity.FReplacementText, FValue.Buffer, FValue.Length);
end
else
if not ParseExternalID(Entity.FSystemID, Entity.FPublicID, False) then
RaiseExc('Expected entity value or external ID');
if NDataAllowed then // [76]
begin
if FCurChar <> '>' then
ExpectWhitespace;
if FCurChar = 'N' then
begin
ExpectString('NDATA');
ExpectWhitespace;
SkipName;
// TODO -cVC: Notation declared. Here or after all has been read?
SetString(Entity.FNotationName, FName.Buffer, FName.Length);
if FDocType.Notations.GetNamedItem(Entity.NotationName) = nil then
ValidationError('Reference to undeclared notation ''%s''', [Entity.NotationName]);
end;
end;
except
Entity.Free;
raise;
end;
// Repeated declarations of same entity are legal but must be ignored
if Map.GetNamedItem(Entity.NodeName) = nil then
Map.SetNamedItem(Entity)
else
Entity.Free;
end;
procedure TXMLReader.ParseMarkupDecl; // [29]
var
Token: WideString;
IncludeLevel: Integer;
IgnoreLevel: Integer;
PELevel: Integer;
begin
IncludeLevel := 0;
IgnoreLevel := 0;
repeat
if SkipWhitespace then
FAllowedDecl := dtNone;
if ParsePEReference then // PERef between declarations should always be recognized
begin
FAllowedDecl := dtNone;
if Assigned(FDocType) then
FDocType.HasPERefs := True;
StartPE;
GetChar;
Continue;
end;
if (FCurChar = #0) and ContextPop then
Continue;
if (FCurChar = ']') and (IncludeLevel > 0) then
begin
ExpectString(']]>');
Dec(IncludeLevel);
Continue;
end;
if FCurChar <> '<' then
Break;
PELevel := FEntityLevel;
GetCharRaw;
if CheckForChar('!') then
begin
FAllowedDecl := dtNone;
if FCurChar = '-' then
ParseComment
else if FCurChar = '[' then
begin
if FIntSubset and (FSource.FParent = nil) then
RaiseExc('Conditional sections not allowed in internal subset');
FRecognizePE := not FIntSubset;
GetChar; // skip '['
SkipWhitespace;
Token := GetString(['A'..'Z']);
SkipWhitespace;
if Token = 'INCLUDE' then
Inc(IncludeLevel)
else if Token = 'IGNORE' then
IgnoreLevel := 1
else
RaiseExc('Expected "INCLUDE" or "IGNORE"');
AssertPENesting(PELevel);
ExpectChar('[');
if IgnoreLevel > 0 then
repeat
FRecognizePE := False; // PEs not recognized in IGNORE section
if CheckForChar('<') and CheckForChar('!') and CheckForChar('[') then
Inc(IgnoreLevel)
else if CheckForChar(']') and CheckForChar(']') and CheckForChar('>') then
Dec(IgnoreLevel)
else GetChar;
until (IgnoreLevel=0) or (FCurChar = #0);
end
else
begin
FRecognizePE := not FIntSubset;
Token := GetString(['A'..'Z']);
ExpectWhitespace;
if Token = 'ELEMENT' then
ParseElementDecl
else if Token = 'ENTITY' then
ParseEntityDecl
else if Token = 'ATTLIST' then
ParseAttlistDecl
else if Token = 'NOTATION' then
ParseNotationDecl
else
RaiseExc('Illegal markup declaration');
SkipWhitespace;
FRecognizePE := False; // ! Don't auto-pop context on last markup delimiter
ExpectChar('>'); // This enables correct nesting check
end;
{
MarkupDecl starting in PE and ending in root is a WFC [28a]
MarkupDecl starting in root but ending in PE is a VC (erratum 2e-14)
}
if PELevel > FEntityLevel then
RaiseExc('Parameter entities must be properly nested')
else
AssertPENesting(PELevel);
end
else if FCurChar = '?' then
ParsePI;
until False;
FRecognizePE := False;
if (IncludeLevel > 0) or (IgnoreLevel > 0) then
RaiseExc('Conditional section not closed');
end;
procedure TXMLReader.DoParseExtSubset(ASource: TXMLInputSource);
begin
InitializeRoot(ASource);
FAllowedDecl := dtText;
ParseMarkupDecl;
if FCurChar <> #0 then
RaiseExc('Illegal character in DTD');
end;
procedure TXMLReader.ProcessDTD(ASource: TXMLInputSource);
begin
doc := TXMLDocument.Create;
FDocType := TDOMDocumentTypeEx.Create(doc);
// TODO: DTD labeled version 1.1 will be rejected - must set FXML11 flag
// TODO: what shall be FCursor? FDocType cannot - it does not accept child nodes
doc.AppendChild(FDocType);
DoParseExtSubset(ASource);
end;
procedure TXMLReader.ParseCDSect; // [18]
var
name: WideString;
begin
ExpectString('[CDATA[');
FValue.Length := 0;
repeat
BufAppend(FValue, FCurChar);
GetCharRaw;
with FValue do
if (Length >= 3) and (Buffer[Length-1] = '>') and
(Buffer[Length-2] = ']') and (Buffer[Length-3] = ']') then
begin
Dec(Length, 3);
SetString(name, Buffer, Length);
FCursor.AppendChild(doc.CreateCDATASection(name));
Exit;
end;
until FCurChar = #0;
RaiseExc('Unterminated CDATA section');
end;
procedure TXMLReader.ParseContent;
begin
repeat
if FCurChar = '<' then
begin
GetCharRaw;
if CheckName then
ParseElement
else if FCurChar = '!' then
begin
GetCharRaw;
FAllowedDecl := dtNone;
if FCurChar = '[' then
ParseCDSect
else if FCurChar = '-' then
ParseComment
else
ParseDoctypeDecl; // actually will raise error
end
else if FCurChar = '?' then
ParsePI
else
Exit;
end
else
ProcessTextAndRefs;
until FCurChar = #0;
end;
// Element name already in FNameBuffer
procedure TXMLReader.ParseElement; // [39] [40] [44]
var
NewElem: TDOMElement;
IsEmpty: Boolean;
attr, OldAttr: TDOMNode;
begin
NewElem := doc.CreateElementBuf(FName.Buffer, FName.Length);
FCursor.AppendChild(NewElem);
Assert(NewElem.ParentNode = FCursor, 'AppendChild did not set ParentNode');
FCursor := NewElem;
IsEmpty := False;
while FCurChar <> '>' do
begin
if FCurChar = '/' then
begin
GetCharRaw;
IsEmpty := True;
FCursor := FCursor.ParentNode;
Break;
end;
// Get Attribute [41]
ExpectWhitespace;
if not CheckName then // allow stuff like <element >, <element />
Continue;
attr := doc.CreateAttributeBuf(FName.Buffer, FName.Length);
// !!cannot use TDOMElement.SetAttributeNode because it will free old attribute
OldAttr := NewElem.Attributes.SetNamedItem(Attr);
if Assigned(OldAttr) then
begin
OldAttr.Free;
RaiseExc('Duplicate attribute');
end;
ExpectEq;
Assert(TDOMAttr(attr).OwnerElement = NewElem, 'DOMAttr.OwnerElement not set correctly');
FCursor := attr;
ExpectAttValue;
FCursor := NewElem;
end;
ExpectChar('>');
ProcessDefaultAttributes(NewElem);
if not IsEmpty then
begin
if not FPreserveWhitespace then // critical for testsuite compliance
SkipWhitespace;
ParseContent;
if FCurChar = '/' then // Get ETag [42]
begin
GetCharRaw;
if ExpectName <> NewElem.TagName then
RaiseExc('Unmatching element end tag (expected "</%s>")', [NewElem.TagName]);
SkipWhitespace;
ExpectChar('>');
FCursor := FCursor.ParentNode;
end
else if FCurChar <> #0 then
RaiseNameNotFound
else // End of stream in content
RaiseExc('Document element not closed');
end;
end;
procedure TXMLReader.ProcessDefaultAttributes(Element: TDOMElement);
var
I: Integer;
ElDef: TDOMElementDef;
AttDefs: TDOMNamedNodeMap;
AttDef: TDOMAttrDef;
Attr: TDOMAttrEx;
Spec: Boolean;
begin
if Assigned(FDocType) then
begin
ElDef := TDOMElementDef(FDocType.ElementDefs.GetNamedItem(Element.TagName));
if Assigned(ElDef) and ElDef.HasAttributes then
begin
AttDefs := ElDef.Attributes;
for I := 0 to AttDefs.Length-1 do
begin
AttDef := AttDefs[I] as TDOMAttrDef;
Spec := True;
// no validity checking yet; just append default values
Attr := TDOMAttrEx(Element.GetAttributeNode(AttDef.Name));
if (AttDef.FDefault in [AD_DEFAULT, AD_FIXED]) and (Attr = nil) then
begin
Attr := TDOMAttrEx(AttDef.CloneNode(True));
Element.SetAttributeNode(Attr);
Spec := False;
end;
if Assigned(Attr) then
begin
Attr.FSpecified := Spec;
Attr.FNormalize := (AttDef.FDataType <> DT_CDATA);
end;
end;
end;
end;
end;
function TXMLReader.ParsePEReference: Boolean; // [69]
begin
Result := CheckForChar('%');
if Result then
begin
SkipName;
ExpectChar(';');
end;
end;
function TXMLReader.ParseExternalID(out SysID, PubID: WideString; // [75]
SysIdOptional: Boolean): Boolean;
begin
if FCurChar = 'S' then
begin
ExpectString('SYSTEM');
ExpectWhitespace;
SkipSystemLiteral(SysID, True);
Result := True;
end
else if FCurChar = 'P' then
begin
ExpectString('PUBLIC');
ExpectWhitespace;
SkipPubidLiteral;
SetString(PubID, FValue.Buffer, FValue.Length);
if SysIdOptional then
begin
SkipWhitespace;
SkipSystemLiteral(SysID, False);
end
else
begin
ExpectWhitespace;
SkipSystemLiteral(SysID, True);
end;
Result := True;
end else
Result := False;
end;
procedure TXMLReader.ValidationError(const Msg: string;
const args: array of const);
begin
// TODO: just a stub now
FInvalid := True;
end;
procedure ReadXMLFile(out ADoc: TXMLDocument; var f: Text);
var
Reader: TXMLReader;
Src: TXMLInputSource;
begin
ADoc := nil;
Src := TXMLFileInputSource.Create(f);
Src.SystemID := FilenameToURI(TTextRec(f).Name);
Reader := TXMLReader.Create;
try
Reader.ProcessXML(Src);
ADoc := TXMLDocument(Reader.Doc);
finally
Reader.Free;
end;
end;
procedure ReadXMLFile(out ADoc: TXMLDocument; var f: TStream; const ABaseURI: String);
var
Reader: TXMLReader;
Src: TXMLInputSource;
begin
ADoc := nil;
Reader := TXMLReader.Create;
try
Src := TXMLStreamInputSource.Create(f, False);
Src.SystemID := ABaseURI;
Reader.ProcessXML(Src);
finally
ADoc := TXMLDocument(Reader.doc);
Reader.Free;
end;
end;
procedure ReadXMLFile(out ADoc: TXMLDocument; var f: TStream);
begin
ReadXMLFile(ADoc, f, 'stream:');
end;
procedure ReadXMLFile(out ADoc: TXMLDocument; const AFilename: String);
var
FileStream: TStream;
begin
ADoc := nil;
FileStream := TFileStream.Create(AFilename, fmOpenRead+fmShareDenyWrite);
try
ReadXMLFile(ADoc, FileStream, FilenameToURI(AFilename));
finally
FileStream.Free;
end;
end;
procedure ReadXMLFragment(AParentNode: TDOMNode; var f: Text);
var
Reader: TXMLReader;
Src: TXMLInputSource;
begin
Reader := TXMLReader.Create;
try
Src := TXMLFileInputSource.Create(f);
Src.SystemID := FilenameToURI(TTextRec(f).Name);
Reader.ProcessFragment(Src, AParentNode);
finally
Reader.Free;
end;
end;
procedure ReadXMLFragment(AParentNode: TDOMNode; var f: TStream; const ABaseURI: String);
var
Reader: TXMLReader;
Src: TXMLInputSource;
begin
Reader := TXMLReader.Create;
try
Src := TXMLStreamInputSource.Create(f, False);
Src.SystemID := ABaseURI;
Reader.ProcessFragment(Src, AParentNode);
finally
Reader.Free;
end;
end;
procedure ReadXMLFragment(AParentNode: TDOMNode; var f: TStream);
begin
ReadXMLFragment(AParentNode, f, 'stream:');
end;
procedure ReadXMLFragment(AParentNode: TDOMNode; const AFilename: String);
var
Stream: TStream;
begin
Stream := TFileStream.Create(AFilename, fmOpenRead+fmShareDenyWrite);
try
ReadXMLFragment(AParentNode, Stream, FilenameToURI(AFilename));
finally
Stream.Free;
end;
end;
procedure ReadDTDFile(out ADoc: TXMLDocument; var f: Text);
var
Reader: TXMLReader;
Src: TXMLInputSource;
begin
ADoc := nil;
Reader := TXMLReader.Create;
try
Src := TXMLFileInputSource.Create(f);
Src.SystemID := FilenameToURI(TTextRec(f).Name);
Reader.ProcessDTD(Src);
ADoc := TXMLDocument(Reader.doc);
finally
Reader.Free;
end;
end;
procedure ReadDTDFile(out ADoc: TXMLDocument; var f: TStream; const ABaseURI: String);
var
Reader: TXMLReader;
Src: TXMLInputSource;
begin
ADoc := nil;
Reader := TXMLReader.Create;
try
Src := TXMLStreamInputSource.Create(f, False);
Src.SystemID := ABaseURI;
Reader.ProcessDTD(Src);
ADoc := TXMLDocument(Reader.doc);
finally
Reader.Free;
end;
end;
procedure ReadDTDFile(out ADoc: TXMLDocument; var f: TStream);
begin
ReadDTDFile(ADoc, f, 'stream:');
end;
procedure ReadDTDFile(out ADoc: TXMLDocument; const AFilename: String);
var
Stream: TStream;
begin
ADoc := nil;
Stream := TFileStream.Create(AFilename, fmOpenRead+fmShareDenyWrite);
try
ReadDTDFile(ADoc, Stream, FilenameToURI(AFilename));
finally
Stream.Free;
end;
end;
end.