diff --git a/fcl/xml/xmlread.pp b/fcl/xml/xmlread.pp index 538e1e0078..e25ea3dcd9 100644 --- a/fcl/xml/xmlread.pp +++ b/fcl/xml/xmlread.pp @@ -22,36 +22,34 @@ unit XMLRead; interface -{off $DEFINE MEM_CHECK} - uses - {$IFDEF MEM_CHECK}MemCheck,{$ENDIF} SysUtils, Classes, DOM; type EXMLReadError = class(Exception); - procedure ReadXMLFile(out ADoc: TXMLDocument; const AFilename: String); overload; -procedure ReadXMLFile(out ADoc: TXMLDocument; var f: File); overload; +procedure ReadXMLFile(out ADoc: TXMLDocument; var f: Text); overload; procedure ReadXMLFile(out ADoc: TXMLDocument; var f: TStream); overload; -procedure ReadXMLFile(out ADoc: TXMLDocument; var f: TStream; const AFilename: String); overload; +procedure ReadXMLFile(out ADoc: TXMLDocument; var f: TStream; const ABaseURI: String); overload; procedure ReadXMLFragment(AParentNode: TDOMNode; const AFilename: String); overload; -procedure ReadXMLFragment(AParentNode: TDOMNode; var f: File); overload; +procedure ReadXMLFragment(AParentNode: TDOMNode; var f: Text); overload; procedure ReadXMLFragment(AParentNode: TDOMNode; var f: TStream); overload; -procedure ReadXMLFragment(AParentNode: TDOMNode; var f: TStream; const AFilename: String); overload; +procedure ReadXMLFragment(AParentNode: TDOMNode; var f: TStream; const ABaseURI: String); overload; procedure ReadDTDFile(out ADoc: TXMLDocument; const AFilename: String); overload; -procedure ReadDTDFile(out ADoc: TXMLDocument; var f: File); overload; +procedure ReadDTDFile(out ADoc: TXMLDocument; var f: Text); overload; procedure ReadDTDFile(out ADoc: TXMLDocument; var f: TStream); overload; -procedure ReadDTDFile(out ADoc: TXMLDocument; var f: TStream; const AFilename: String); overload; - +procedure ReadDTDFile(out ADoc: TXMLDocument; var f: TStream; const ABaseURI: String); overload; // ======================================================= implementation +uses + UriParser; + type TSetOfChar = set of Char; @@ -62,67 +60,185 @@ const '-', '''', '(', ')', '+', ',', '.', '/', ':', '=', '?', ';', '!', '*', '#', '@', '$', '_', '%']; - NmToken: TSetOfChar = Letter + Digit + ['.', '-', '_', ':']; - type - TXMLReaderDocumentType = class(TDOMDocumentType); + TDOMNotationEx = class(TDOMNotation); + TDOMAttrEx = class(TDOMAttr); + + TXMLInputSource = class; + TDOMElementDef = class; + + TDOMEntityEx = class(TDOMEntity) + protected + FInternal: Boolean; + FResolved: Boolean; + FOnStack: Boolean; + FReplacementText: DOMString; + end; + + // TODO: Do I need PEMap in DocType? Maybe move it to Reader itself? + // (memory usage - they are not needed after parsing) + TDOMDocumentTypeEx = class(TDOMDocumentType) + private + FHasPERefs: Boolean; + FPEMap: TDOMNamedNodeMap; + FElementDefs: TDOMNamedNodeMap; + function GetPEMap: TDOMNamedNodeMap; + function GetElementDefs: TDOMNamedNodeMap; + protected + property PEMap: TDOMNamedNodeMap read GetPEMap; + property ElementDefs: TDOMNamedNodeMap read GetElementDefs; + property HasPERefs: Boolean read FHasPERefs write FHasPERefs; + public + destructor Destroy; override; + end; TXMLReader = class; + TDecoder = class; + TDecoderRef = class of TDecoder; - TCharSource = class + TXMLInputSource = class private - Buf: PChar; + FBuf: PChar; + FBufEnd: PChar; + FEof: Boolean; + FSurrogate: WideChar; FReader: TXMLReader; + FParent: TXMLInputSource; + FEntity: TObject; // weak reference + FCursor: TObject; // weak reference + FLine: Integer; + FColumn: Integer; + FSystemID: WideString; + FPublicID: WideString; + function GetSystemID: WideString; + function GetPublicID: WideString; + protected + procedure FetchData; virtual; public - constructor Create(AReader: TXMLReader; ABuffer: PChar); - function NextChar: WideChar; virtual; abstract; + constructor Create(const AData: WideString); + function NextChar: WideChar; virtual; + procedure Initialize; virtual; + procedure SetEncoding(const AEncoding: string); virtual; + property SystemID: WideString read GetSystemID write FSystemID; + property PublicID: WideString read GetPublicID write FPublicID; end; - TUCS2CharSource = class(TCharSource) + TXMLDecodingSource = class(TXMLInputSource) + private + FDecoder: TDecoder; + FSeenCR: Boolean; + function InternalNextChar: WideChar; + procedure DecodingError(const Msg: string); overload; + procedure DecodingError(const Msg: string; const Args: array of const); overload; + public + destructor Destroy; override; + function NextChar: WideChar; override; + procedure SetEncoding(const AEncoding: string); override; + procedure Initialize; override; + end; + + TXMLStreamInputSource = class(TXMLDecodingSource) + private + FAllocated: PChar; + FStream: TStream; + FBufSize: Integer; + FOwnStream: Boolean; + public + constructor Create(AStream: TStream; AOwnStream: Boolean); + destructor Destroy; override; + procedure FetchData; override; + end; + + TXMLFileInputSource = class(TXMLDecodingSource) + private + FFile: ^Text; + FString: string; + public + constructor Create(var AFile: Text); + procedure FetchData; override; + end; + + TDecoder = class + private + FSource: TXMLDecodingSource; + public + constructor Create(ASource: TXMLDecodingSource); + function DecodeNext: WideChar; virtual; abstract; + class function Supports(const AEncoding: string): Boolean; virtual; abstract; + end; + + TISO8859_1Decoder = class(TDecoder) + public + function DecodeNext: WideChar; override; + class function Supports(const AEncoding: string): Boolean; override; + end; + + TUCS2Decoder = class(TDecoder) private FSwapEndian: Boolean; + FEncoding: string; public - function NextChar: WideChar; override; + function DecodeNext: WideChar; override; + class function Supports(const AEncoding: string): Boolean; override; end; - TUTF8CharSource = class(TCharSource) - private - procedure BadChar; + TUTF8Decoder = class(TDecoder) public - function NextChar: WideChar; override; + function DecodeNext: WideChar; override; + class function Supports(const AEncoding: string): Boolean; override; end; - TISO_8859_1CharSource = class(TCharSource) - public - function NextChar: WideChar; override; - end; + PWideCharBuf = ^TWideCharBuf; + TWideCharBuf = record + Buffer: PWideChar; + Length: Integer; + MaxLength: Integer; + end; + + TEntityResolveEvent = procedure(const PubID, SysID: WideString; var Source: TXMLInputSource) of object; + TDeclType = (dtNone, dtXml, dtText); TXMLReader = class private - FSource: TCharSource; + FSource: TXMLInputSource; FCurChar: WideChar; - FLine: Integer; // <- To Locator - FColumn: Integer; // <- To Locator - FSeenCR: Boolean; FWhitespace: Boolean; - FValue: array of WideChar; - FValueLength: Integer; - FName: array of WideChar; - FNameLength: Integer; - FInternalSubset: Boolean; - FPrologParsed: Boolean; + FXML11: Boolean; + FValue: TWideCharBuf; + FName: TWideCharBuf; + FCopyBuf: PWideCharBuf; + FIntSubset: Boolean; + FAllowedDecl: TDeclType; + FDtdParsed: Boolean; + FRecognizePE: Boolean; + FStandalone: Boolean; // property of Doc ? + FInvalid: Boolean; + // TODO: This array must be stored globally, not per instance + FNamePages: PByteArray; + FForbiddenAscii: TSetOfChar; + FDocType: TDOMDocumentTypeEx; // a shortcut + FEntityLevel: Integer; + FPreserveWhitespace: Boolean; + FCreateEntityRefs: Boolean; procedure RaiseExpectedQmark; procedure GetChar; - procedure AppendValue(wc: WideChar); - procedure AppendName(wc: WideChar); - procedure DetectEncoding; + procedure GetCharRaw; + procedure Unget(wc: WideChar); + procedure Initialize(ASource: TXMLInputSource); + procedure InitializeRoot(ASource: TXMLInputSource); + procedure DoParseAttValue(Delim: WideChar); + procedure DoParseFragment; + procedure DoParseExtSubset(ASource: TXMLInputSource); + function ContextPush(AEntity: TDOMEntityEx): Boolean; + function ContextPop: Boolean; + procedure XML11_BuildTables; + function XML11_CheckName: Boolean; protected - buf: PChar; // <- To InputSource - Filename: String; // <- To InputSource FCursor: TDOMNode; procedure RaiseExc(const descr: String); overload; - procedure RaiseExc(Expected, Found: WideChar); overload; + procedure RaiseExc(const descr: string; const args: array of const); overload; + procedure RaiseExc(Expected: WideChar); overload; function SkipWhitespace: Boolean; procedure ExpectWhitespace; procedure ExpectString(const s: String); @@ -133,170 +249,187 @@ type procedure RaiseNameNotFound; function CheckName: Boolean; + function CheckNmToken: Boolean; function ExpectName: WideString; // [5] procedure SkipName; + function SkipQuotedLiteral: Boolean; procedure ExpectAttValue; // [10] procedure SkipPubidLiteral; // [12] + procedure SkipSystemLiteral(out Literal: WideString; Required: Boolean); procedure ParseComment; // [15] procedure ParsePI; // [16] - procedure ExpectProlog; // [22] - function ParseInternalDtd: Boolean; - procedure ParseProlog; + procedure ParseCDSect; // [18] + procedure ParseXmlOrTextDecl(TextDecl: Boolean); function ParseEq: Boolean; // [25] procedure ExpectEq; procedure ParseMisc; // [27] - function ParseMarkupDecl(InternalSubset: Boolean): Boolean; // [29] - procedure ParseCDSect; // [18] - function ParseElementContent: Boolean; + procedure ParseDoctypeDecl; // [28] + procedure ParseMarkupDecl; // [29] procedure ParseElement; // [39] - procedure ExpectElement; - function ResolvePredefined(const RefName: WideString): Boolean; - function ParseReference: TDOMEntityReference; // [67] + procedure ParseContent; // [43] + function ResolvePredefined(const RefName: WideString): WideChar; + procedure IncludeEntity(AEntity: TDOMEntityEx; InAttr: Boolean); + procedure StartPE; + function ParseCharRef: Boolean; // [66] + function ParseReference: TDOMEntityEx; // [67] function ParsePEReference: Boolean; // [69] - function ParseExternalID(InNotation: Boolean): Boolean; // [75] - procedure ExpectExternalID; - procedure ProcessTextAndRefs(Delim: WideChar; DiscardWS: Boolean); + function ParseExternalID(out SysID, PubID: WideString; // [75] + SysIdOptional: Boolean): Boolean; + procedure ProcessTextAndRefs; + procedure AssertPENesting(CurrentLevel: Integer); procedure ParseEntityDecl; + procedure ParseEntityDeclValue(Delim: WideChar); procedure ParseAttlistDecl; + procedure ExpectChoiceOrSeq; + procedure ParseMixedOrChildren; procedure ParseElementDecl; procedure ParseNotationDecl; - - procedure ResolveEntities(RootNode: TDOMNode); + function ResolveEntity(const SystemID, PublicID: WideString; out Source: TXMLInputSource): Boolean; + procedure ProcessDefaultAttributes(Element: TDOMElement); + procedure ValidationError(const Msg: string; const args: array of const); public doc: TDOMDocument; + constructor Create; destructor Destroy; override; - procedure ProcessXML(ABuf: PChar; const AFilename: String); // [1] - procedure ProcessFragment(AOwner: TDOMNode; ABuf: PChar; const AFilename: String); - procedure ProcessDTD(ABuf: PChar; const AFilename: String); // ([29]) + procedure ProcessXML(ASource: TXMLInputSource); // [1] + procedure ProcessFragment(ASource: TXMLInputSource; AOwner: TDOMNode); + procedure ProcessDTD(ASource: TXMLInputSource); // ([29]) end; + // AttDef/ElementDef support + TAttrDataType = ( + DT_CDATA, + DT_ID, + DT_IDREF, + DT_IDREFS, + DT_ENTITY, + DT_ENTITIES, + DT_NMTOKEN, + DT_NMTOKENS, + DT_NOTATION + ); + + TAttrDefault = ( + AD_IMPLIED, + AD_DEFAULT, + AD_REQUIRED, + AD_FIXED + ); + + TDOMAttrDef = class(TDOMAttr) + protected + FDataType: TAttrDataType; + FDefault: TAttrDefault; + // FEnumeration: TWideStringList? array of WideStrings? + end; + + TDOMElementDef = class(TDOMElement); + + {$i names.inc} -// TODO: These CharSource classes still cannot be considered as the final solution... -{ TCharSource } +// TODO: List of registered/supported decoders +function FindDecoder(const Encoding: string): TDecoderRef; +begin + if TISO8859_1Decoder.Supports(Encoding) then + Result := TISO8859_1Decoder + else + Result := nil; +end; -constructor TCharSource.Create(AReader: TXMLReader; ABuffer: PChar); + +procedure BufAllocate(var ABuffer: TWideCharBuf; ALength: Integer); +begin + ABuffer.MaxLength := ALength; + ABuffer.Length := 0; + GetMem(ABuffer.Buffer, ABuffer.MaxLength*SizeOf(WideChar)); +end; + +procedure BufAppend(var ABuffer: TWideCharBuf; wc: WideChar); +begin + if ABuffer.Length >= ABuffer.MaxLength then + begin + ABuffer.MaxLength := ABuffer.MaxLength * 2; + ReallocMem(ABuffer.Buffer, ABuffer.MaxLength * SizeOf(WideChar)); + end; + ABuffer.Buffer[ABuffer.Length] := wc; + Inc(ABuffer.Length); +end; + +function IsValidEncName(const s: WideString): Boolean; +var + I: Integer; +begin + Result := False; + if (s = '') or (s[1] > #255) or not (char(s[1]) in ['A'..'Z', 'a'..'z']) then + Exit; + for I := 2 to Length(s) do + if (s[I] > #255) or not (char(s[I]) in ['A'..'Z', 'a'..'z', '0'..'9', '.', '_', '-']) then + Exit; + Result := True; +end; + +{ TDOMDocumentTypeEx } + +destructor TDOMDocumentTypeEx.Destroy; +begin + FPEMap.Free; + FElementDefs.Free; + inherited Destroy; +end; + +function TDOMDocumentTypeEx.GetElementDefs: TDOMNamedNodeMap; +begin + if FElementDefs = nil then + FElementDefs := TDOMNamedNodeMap.Create(Self, ELEMENT_NODE); + Result := FElementDefs; +end; + +function TDOMDocumentTypeEx.GetPEMap: TDOMNamedNodeMap; +begin + if FPEMap = nil then + FPEMap := TDOMNamedNodeMap.Create(Self, ENTITY_NODE); + Result := FPEMap; +end; + + +// TODO: These classes still cannot be considered as the final solution... + +{ TXMLInputSource } + +constructor TXMLInputSource.Create(const AData: WideString); begin inherited Create; - FReader := AReader; - Buf := ABuffer; + FBuf := PChar(PWideChar(AData)); + FBufEnd := FBuf + Length(AData) * sizeof(WideChar); end; -{ TUCS2CharSource } - -function TUCS2CharSource.NextChar: WideChar; +procedure TXMLInputSource.Initialize; begin - Result := PWideChar(buf)^; - Inc(buf, sizeof(WideChar)); - if FSwapEndian then - Result := WideChar(Swap(Word(Result))); + FLine := 1; + FColumn := 0; end; -{ TUTF8CharSource } - -procedure TUTF8CharSource.BadChar; +function TXMLInputSource.NextChar: WideChar; begin - FReader.RaiseExc('Invalid character in UTF8 sequence'); -end; - -function TUTF8CharSource.NextChar: WideChar; -var - ch2, ch3: Byte; -begin - Result := WideChar(buf[0]); - Inc(buf); - if Result < #128 then { ASCII } - Exit - else if (Byte(Result) and $E0) = $C0 then { #$0080 - #$07FF } + if FSurrogate <> #0 then begin - ch2 := ord(buf[0]); Inc(Buf); - if (Ch2 and $C0) <> $80 then - BadChar; - Result := WideChar((Byte(Result) and $1F) shl 6 + (Ch2 and $3F)); + Result := FSurrogate; + FSurrogate := #0; end - else if (Byte(Result) and $F0) = $E0 then { #$0800 - #$FFFF } + else if FBufEnd <= FBuf then begin - ch2 := ord(buf[0]); Inc(buf); - if (Ch2 and $C0) <> $80 then - BadChar; - ch3 := ord(buf[0]); Inc(buf); - if (Ch3 and $C0) <> $80 then - BadChar; - Result := WideChar(Word((Byte(Result) and $0F) shl 12) + - (Ch2 and $3F) shl 6 + (Ch3 and $3F)); - end - else { if (Byte(Result) and $F8) = $F0) then } // and $FC = $F8 - // and $FE = $FC - FReader.RaiseExc('Unsupported UTF8 character'); -end; - -{ TISO8859_1CharSource } - -function TISO_8859_1CharSource.NextChar: WideChar; -begin - Result := WideChar(buf[0]); Inc(Buf); -end; - -{ TXMLReader } - -procedure TXMLReader.DetectEncoding; -var - b: Char; -begin - b := buf[0]; - if (b = #$FE) and (buf[1] = #$FF) then - begin - Inc(buf, 2); - FSource := TUCS2CharSource.Create(Self, buf); - {$IFNDEF ENDIAN_BIG} - TUCS2CharSource(FSource).FSwapEndian := True; - {$ENDIF} - end - else if (b = #$FF) and (buf[1] = #$FE) then - begin - Inc(buf, 2); - FSource := TUCS2CharSource.Create(Self, buf); - {$IFDEF ENDIAN_BIG} - TUCS2CharSource(FSource).FSwapEndian := True; - {$ENDIF} + Result := #0; + FEof := True; end else - FSource := TUTF8CharSource.Create(Self, Buf); - - GetChar; - if FCurChar = #$FEFF then // skip BOM, if one is present - GetChar; -end; - -procedure TXMLReader.GetChar; -begin - FCurChar := FSource.NextChar; - if FSeenCR then begin - case FCurChar of - #10, #$85: FCurChar := FSource.NextChar; // #$85 is xml 1.1 specific - end; - FSeenCR := False; + Result := PWideChar(FBuf)^; + Inc(FBuf, sizeof(WideChar)); end; - FWhitespace := False; - case FCurChar of - #9, #10, #32: FWhitespace := True; - #13: begin - FSeenCR := True; - FCurChar := #10; - FWhitespace := True; - end; - #$85, #$2028: // xml 1.1 specific - FCurChar := #10; - - #1..#8, #11, #12, #14..#31, // never allowed... btw, #0 is also forbidden - #$D800..#$DFFF, // surrogates - should be supported some way - #$FFFE..#$FFFF: // never allowed - RaiseExc('Invalid character'); - end; - - if FCurChar = #10 then + // TODO: Column counting - surrogate pair is a single char! + if Result = #10 then begin Inc(FLine); FColumn := 0; @@ -305,54 +438,452 @@ begin Inc(FColumn); end; -procedure TXMLReader.AppendValue(wc: WideChar); -var - Alloc: Integer; +procedure TXMLDecodingSource.DecodingError(const Msg: string); begin - Alloc := Length(FValue); - if FValueLength >= Alloc then - begin - if Alloc = 0 then - Alloc := 512 - else - Alloc := Alloc * 2; - SetLength(FValue, Alloc); - end; - FValue[FValueLength] := wc; - Inc(FValueLength); + FReader.RaiseExc(Msg); end; -procedure TXMLReader.AppendName(wc: WideChar); -var - Alloc: Integer; +procedure TXMLDecodingSource.DecodingError(const Msg: string; + const Args: array of const); begin - Alloc := Length(FName); - if FNameLength >= Alloc then - begin - if Alloc = 0 then - Alloc := 128 - else - Alloc := Alloc * 2; - SetLength(FName, Alloc); - end; - FName[FNameLength] := wc; - Inc(FNameLength); + FReader.RaiseExc(Msg, Args); end; +procedure TXMLInputSource.FetchData; +begin + FEof := True; +end; + +procedure TXMLInputSource.SetEncoding(const AEncoding: string); +begin + // do nothing +end; + +function TXMLInputSource.GetPublicID: WideString; +begin + if FPublicID <> '' then + Result := FPublicID + else if Assigned(FParent) then + Result := FParent.PublicID + else + Result := ''; +end; + +function TXMLInputSource.GetSystemID: WideString; +begin + if FSystemID <> '' then + Result := FSystemID + else if Assigned(FParent) then + Result := FParent.SystemID + else + Result := ''; +end; + +{ TXMLDecodingSource } + +destructor TXMLDecodingSource.Destroy; +begin + FDecoder.Free; + inherited Destroy; +end; + +function TXMLDecodingSource.InternalNextChar: WideChar; +begin + // TODO: find a place for it, finally... + if FSurrogate <> #0 then + begin + Result := FSurrogate; + FSurrogate := #0; + Exit; + end; + if FBufEnd <= FBuf then + FetchData; + if not FEof then + Result := FDecoder.DecodeNext + else + Result := #0; +end; + +function TXMLDecodingSource.NextChar: WideChar; +begin + Result := InternalNextChar; + if FSeenCR then + begin + if (Result = #10) or ((Result = #$85) and FReader.FXML11) then + Result := InternalNextChar; + FSeenCR := False; + end; + case Result of + #13: begin + FSeenCR := True; + Result := #10; + end; + + #$85, #$2028: + if FReader.FXML11 then + Result := #10; + end; + if (Result < #256) and (char(Result) in FReader.FForbiddenAscii) or + ((ord(Result) or 1) = $FFFF) then + DecodingError('Invalid character'); + + // TODO: Column counting - surrogate pair is a single char! + if Result = #10 then + begin + Inc(FLine); + FColumn := 0; + end + else + Inc(FColumn); +end; + +procedure TXMLDecodingSource.Initialize; +begin + inherited; + if FBufEnd-FBuf > 1 then + repeat + if (FBuf[0] = #$FE) and (FBuf[1] = #$FF) then // BE + begin + FDecoder := TUCS2Decoder.Create(Self); + TUCS2Decoder(FDecoder).FEncoding := 'UTF-16BE'; + {$IFNDEF ENDIAN_BIG} + TUCS2Decoder(FDecoder).FSwapEndian := True; + {$ENDIF} + Exit; + end + else if (FBuf[0] = #$FF) and (FBuf[1] = #$FE) then // LE + begin + FDecoder := TUCS2Decoder.Create(Self); + TUCS2Decoder(FDecoder).FEncoding := 'UTF-16LE'; + {$IFDEF ENDIAN_BIG} + TUCS2Decoder(FDecoder).FSwapEndian := True; + {$ENDIF} + Exit; + end + else + Break; + until False; + FDecoder := TUTF8Decoder.Create(Self); +end; + +procedure TXMLDecodingSource.SetEncoding(const AEncoding: string); +var + NewDecoder: TDecoderRef; +begin + if FDecoder.Supports(AEncoding) then // no change needed + Exit; + // hardcoded stuff - special case of UCS2 + if FDecoder is TUCS2Decoder then + begin + // check for 'UTF-16LE' or 'UTF-16BE' + if SameText(AEncoding, TUCS2Decoder(FDecoder).FEncoding) then + Exit + else + DecodingError('Current encoding cannot be switched to ''%s''', [AEncoding]); + end; + NewDecoder := FindDecoder(AEncoding); + if Assigned(NewDecoder) then + begin + FDecoder.Free; + FDecoder := NewDecoder.Create(Self); + end + else + DecodingError('Encoding ''%s'' is not supported', [AEncoding]); +end; + + +{ TXMLStreamInputSource } + +constructor TXMLStreamInputSource.Create(AStream: TStream; AOwnStream: Boolean); +begin + FStream := AStream; + FBufSize := 4096; + GetMem(FAllocated, FBufSize+8); + FBuf := FAllocated+8; + FBufEnd := FBuf; + FOwnStream := AOwnStream; + FetchData; +end; + +destructor TXMLStreamInputSource.Destroy; +begin + FreeMem(FAllocated); + if FOwnStream then + FStream.Free; + inherited Destroy; +end; + +procedure TXMLStreamInputSource.FetchData; +var + Remainder, BytesRead: Integer; + OldBuf: PChar; +begin + Assert(FBufEnd - FBuf < 8); + + OldBuf := FBuf; + Remainder := FBufEnd - FBuf; + FBuf := FAllocated+8-Remainder; + Move(OldBuf^, FBuf^, Remainder); + BytesRead := FStream.Read(FAllocated[8], FBufSize); + if BytesRead = 0 then + FEof := True; + FBufEnd := FAllocated + 8 + BytesRead; +end; + +{ TXMLFileInputSource } + +constructor TXMLFileInputSource.Create(var AFile: Text); +begin + FFile := @AFile; + ReadLn(FFile^, FString); + FBuf := PChar(FString); + FBufEnd := FBuf + Length(FString); +end; + +procedure TXMLFileInputSource.FetchData; +begin + FEof := Eof(FFile^); + if not FEof then + begin + ReadLn(FFile^, FString); + FString := FString + #10; // bad solution... + FBuf := PChar(FString); + FBufEnd := FBuf + Length(FString); + end; +end; + + +{ TDecoder } + +constructor TDecoder.Create(ASource: TXMLDecodingSource); +begin + inherited Create; + FSource := ASource; +end; + +{ TISO8859_1Decoder} + +function TISO8859_1Decoder.DecodeNext: WideChar; +begin + with FSource do + begin + Result := WideChar(FBuf[0]); + Inc(FBuf); + end; +end; + +class function TISO8859_1Decoder.Supports(const AEncoding: string): Boolean; +begin + Result := SameText(AEncoding, 'ISO-8859-1') or + SameText(AEncoding, 'ISO_8859-1') or + SameText(AEncoding, 'latin1') or + SameText(AEncoding, 'iso-ir-100') or + SameText(AEncoding, 'l1') or + SameText(AEncoding, 'IBM819') or + SameText(AEncoding, 'CP819') or + SameText(AEncoding, 'csISOLatin1') or +// This one is not in character-sets.txt, but used in most FPC documentation... + SameText(AEncoding, 'ISO8859-1'); +end; + +{ TUCS2Decoder } + +function TUCS2Decoder.DecodeNext: WideChar; +begin + with FSource do + begin + Result := PWideChar(FBuf)^; + Inc(FBuf, sizeof(WideChar)); + end; + if FSwapEndian then + Result := WideChar(Swap(Word(Result))); +end; + +class function TUCS2Decoder.Supports(const AEncoding: string): Boolean; +begin + // generic aliases for both LE and BE + Result := SameText(AEncoding, 'UTF-16') or + SameText(AEncoding, 'unicode'); +end; + +{ TUTF8Decoder } + +function TUTF8Decoder.DecodeNext: WideChar; +const + MaxCode: array[0..3] of Cardinal = ($7F, $7FF, $FFFF, $1FFFFF); +var + Value: Cardinal; + I, bc: Integer; +begin + with FSource do + begin + Result := WideChar(FBuf[0]); + Inc(FBuf); + if Result < #$80 then + Exit; + if Byte(Result) and $40 = 0 then + DecodingError('Invalid UTF8 sequence start byte'); + bc := 1; + if Byte(Result) and $20 <> 0 then + begin + Inc(bc); + if Byte(Result) and $10 <> 0 then + begin + Inc(bc); + if Byte(Result) and $8 <> 0 then + DecodingError('UCS4 character out of supported range'); + end; + end; + // DONE: (?) check that bc bytes available + if FBufEnd-FBuf < bc then + FetchData; + + Value := Byte(Result); + I := bc; // note: I is never zero + while bc > 0 do + begin + if ord(FBuf[0]) and $C0 <> $80 then + DecodingError('Invalid byte in UTF8 sequence'); + Value := (Value shl 6) or (Cardinal(FBuf[0]) and $3F); + Inc(FBuf); + Dec(bc); + end; + Value := Value and MaxCode[I]; + // RFC2279 check + if Value <= MaxCode[I-1] then + DecodingError('Invalid UTF8 sequence'); + case Value of + 0..$D7FF, $E000..$FFFF: + begin + Result := WideChar(Value); + Exit; + end; + $10000..$10FFFF: + begin + Result := WideChar($D7C0 + (Value shr 10)); + FSurrogate := WideChar($DC00 xor (Value and $3FF)); + Exit; + end; + end; + DecodingError('UCS4 character out of supported range'); + end; +end; + +class function TUTF8Decoder.Supports(const AEncoding: string): Boolean; +begin + Result := SameText(AEncoding, 'UTF-8'); +end; + +{ TXMLReader } + +function TXMLReader.ResolveEntity(const SystemID, PublicID: WideString; out Source: TXMLInputSource): Boolean; +var + AbsSysID: WideString; + Filename: string; + Stream: TStream; +begin + Result := False; + + if ResolveRelativeURI(FSource.SystemID, SystemID, AbsSysID) then + begin + Source := nil; + // TODO: alternative resolvers + if URIToFilename(AbsSysID, Filename) then + begin + try + Stream := TFileStream.Create(Filename, fmOpenRead + fmShareDenyWrite); + Source := TXMLStreamInputSource.Create(Stream, True); + Source.SystemID := AbsSysID; + Source.PublicID := PublicID; + Result := True; + except + on E: Exception do + ValidationError('%s', [E.Message]); + end; + end; + end; +end; + +procedure TXMLReader.InitializeRoot(ASource: TXMLInputSource); +begin + Initialize(ASource); + GetChar; + // TODO: presence of BOM must prevent UTF-8 encoding from being changed + CheckForChar(#$FEFF); // skip BOM, if one is present +end; + + +procedure TXMLReader.Initialize(ASource: TXMLInputSource); +begin + FSource := ASource; + FSource.FReader := Self; + FSource.Initialize; +end; + +procedure TXMLReader.GetCharRaw; +begin + FCurChar := FSource.NextChar; + FWhitespace := (FCurChar = #32) or (FCurChar = #10) or + (FCurChar = #9) or (FCurChar = #13); + // Used for handling the internal DTD subset + if Assigned(FCopyBuf) and (FSource.FParent = nil) then + BufAppend(FCopyBuf^, FCurChar); +end; + +procedure TXMLReader.GetChar; +begin + GetCharRaw; + if not FRecognizePE then + Exit; + if (FCurChar = #0) and ContextPop then + begin + Unget(FCurChar); + FCurChar := #32; + FWhitespace := True; + end + else if FCurChar = '%' then + begin + FCurChar := FSource.NextChar; + if not CheckName then + begin + Unget(FCurChar); + FCurChar := '%'; + Exit; + end; + if FCurChar = ';' then // "%pe1;%pe2" - must not recognize pe2 immediately! + GetCharRaw + else + RaiseExc(WideChar(';')); + StartPE; + FCurChar := #32; + FWhitespace := True; + end; +end; + +procedure TXMLReader.Unget(wc: WideChar); +begin + FSource.FSurrogate := wc; +end; procedure TXMLReader.RaiseExpectedQmark; begin - RaiseExc('Expected single or double quotation mark'); + RaiseExc('Expected single or double quote'); end; -procedure TXMLReader.RaiseExc(Expected, Found: WideChar); +procedure TXMLReader.RaiseExc(Expected: WideChar); begin - RaiseExc('Expected "' + Expected + '", but found "' + Found + '"'); +// FIX: don't output what is found - anything may be found, including exploits... + RaiseExc('Expected "%1s"', [string(Expected)]); end; procedure TXMLReader.RaiseExc(const descr: String); begin - raise EXMLReadError.CreateFmt('In %s (line %d pos %d): %s', [Filename, FLine, FColumn, descr]); + raise EXMLReadError.CreateFmt('In ''%s'' (line %d pos %d): %s', [FSource.SystemID, FSource.FLine, FSource.FColumn, descr]); +end; + +procedure TXMLReader.RaiseExc(const descr: string; const args: array of const); +begin + RaiseExc(Format(descr, args)); end; function TXMLReader.SkipWhitespace: Boolean; @@ -373,24 +904,20 @@ end; procedure TXMLReader.ExpectChar(wc: WideChar); begin - if not CheckForChar(wc) then - RaiseExc(wc, FCurChar); + if FCurChar = wc then + GetChar + else + RaiseExc(wc); end; procedure TXMLReader.ExpectString(const s: String); - - procedure RaiseStringNotFound; - begin - RaiseExc('Expected "' + s + '"'); - end; - var I: Integer; begin for I := 1 to Length(s) do begin if FCurChar <> WideChar(s[i]) then - RaiseStringNotFound; + RaiseExc('Expected "%s"', [s]); GetChar; end; end; @@ -404,10 +931,10 @@ end; procedure TXMLReader.SkipString(const ValidChars: TSetOfChar); begin - FValueLength := 0; + FValue.Length := 0; while (ord(FCurChar) < 256) and (char(FCurChar) in ValidChars) do begin - AppendValue(FCurChar); + BufAppend(FValue, FCurChar); GetChar; end; end; @@ -415,59 +942,128 @@ end; function TXMLReader.GetString(const ValidChars: TSetOfChar): WideString; begin SkipString(ValidChars); - SetString(Result, PWideChar(@FValue[0]), FValueLength); + SetString(Result, FValue.Buffer, FValue.Length); end; +constructor TXMLReader.Create; +begin + inherited Create; + // Naming bitmap: Point to static data for XML 1.0, + // and allocate buffer in XML11_BuildTables when necessary. + FNamePages := @NamePages; + BufAllocate(FName, 128); + BufAllocate(FValue, 512); + FForbiddenAscii := [#1..#8, #11..#12, #14..#31]; + // TODO: put under user control + FPreserveWhitespace := True; + FCreateEntityRefs := True; +end; destructor TXMLReader.Destroy; begin + if FXML11 then + FreeMem(FNamePages); + FreeMem(FName.Buffer); + FreeMem(FValue.Buffer); + while ContextPop do; // clean input stack FSource.Free; inherited Destroy; end; -procedure TXMLReader.ProcessXML(ABuf: PChar; const AFilename: String); // [1] +procedure TXMLReader.XML11_BuildTables; +var + I: Integer; begin - buf := ABuf; - Filename := AFilename; - FLine := 1; - FColumn := 0; + if not FXML11 then + GetMem(FNamePages, 512); + FXML11 := True; + for I := 0 to 255 do + FNamePages^[I] := ord(Byte(I) in Xml11HighPages); + FNamePages^[0] := 2; + FNamePages^[3] := $2c; + FNamePages^[$20] := $2a; + FNamePages^[$21] := $2b; + FNamePages^[$2f] := $29; + FNamePages^[$30] := $2d; + FNamePages^[$fd] := $28; + Move(FNamePages^, FNamePages^[256], 256); + FNamePages^[$100] := $19; + FNamePages^[$103] := $2E; + FNamePages^[$120] := $2F; + FForbiddenAscii := [#1..#8, #11..#12, #14..#31, #$7F..#$84, #$86..#$9F]; +end; + + +procedure TXMLReader.ProcessXML(ASource: TXMLInputSource); +begin doc := TXMLDocument.Create; FCursor := doc; - DetectEncoding; - ExpectProlog; - {$IFDEF MEM_CHECK}CheckHeapWrtMemCnt('TXMLReader.ProcessXML A');{$ENDIF} - ExpectElement; - {$IFDEF MEM_CHECK}CheckHeapWrtMemCnt('TXMLReader.ProcessXML B');{$ENDIF} + InitializeRoot(ASource); + + FAllowedDecl := dtXml; ParseMisc; + FDtdParsed := True; + if FDocType = nil then + ValidationError('Missing DTD', []); + if CheckName then + ParseElement + else + RaiseExc('Expected element'); + ParseMisc; + if Assigned(FDocType) and (doc.DocumentElement.TagName <> FDocType.Name) then + ValidationError('DTD name does not match root element', []); if FCurChar <> #0 then RaiseExc('Text after end of document element found'); end; -procedure TXMLReader.ProcessFragment(AOwner: TDOMNode; ABuf: PChar; const AFilename: String); +procedure TXMLReader.ProcessFragment(ASource: TXMLInputSource; AOwner: TDOMNode); begin - buf := ABuf; - Filename := AFilename; - FLine := 1; - FColumn := 0; + doc := AOwner.OwnerDocument; FCursor := AOwner; - DetectEncoding; - - if not ParseElementContent then - ; + InitializeRoot(ASource); + FXML11 := doc.InheritsFrom(TXMLDocument) and (TXMLDocument(doc).XMLVersion = '1.1'); + FAllowedDecl := dtText; + DoParseFragment; end; -function TXMLReader.CheckName: Boolean; // [5] +// XML 1.1 allowed range $10000..$EFFFF is [D800..DB7F] followed by [DC00..DFFF] +function TXMLReader.XML11_CheckName: Boolean; begin - Result := (Byte(FCurChar) in NamingBitmap[namePages[hi(Word(FCurChar))]]); - if Result then + if (FCurChar >= #$D800) and (FCurChar <= #$DB7F) then begin - FNameLength := 0; - repeat - AppendName(FCurChar); - GetChar; - until not (Byte(FCurChar) in NamingBitmap[namePages[$100 + hi(Word(FCurChar))]]); + BufAppend(FName, FCurChar); + GetCharRaw; + Result := (FCurChar >= #$DC00) and (FCurChar <= #$DFFF); + end + else + Result := False; +end; + +function TXMLReader.CheckName: Boolean; +begin + FName.Length := 0; + Result := (Byte(FCurChar) in NamingBitmap[FNamePages^[hi(Word(FCurChar))]]) or + (FXML11 and XML11_CheckName); + if Result then + repeat + BufAppend(FName, FCurChar); + GetChar; + until not ((Byte(FCurChar) in NamingBitmap[FNamePages^[$100+hi(Word(FCurChar))]]) or + (FXML11 and XML11_CheckName)); +end; + +function TXMLReader.CheckNmToken: Boolean; +begin + FName.Length := 0; + Result := False; + while (Byte(FCurChar) in NamingBitmap[FNamePages^[$100+hi(Word(FCurChar))]]) or + (FXML11 and XML11_CheckName) do + begin + BufAppend(FName, FCurChar); + GetChar; + Result := True; end; end; @@ -476,12 +1072,12 @@ begin RaiseExc('Name starts with invalid character'); end; -function TXMLReader.ExpectName: WideString; // [5] +function TXMLReader.ExpectName: WideString; begin if not CheckName then RaiseNameNotFound; - SetString(Result, PWideChar(@FName[0]), FNameLength); + SetString(Result, FName.Buffer, FName.Length); end; procedure TXMLReader.SkipName; @@ -490,114 +1086,320 @@ begin RaiseNameNotFound; end; -// --------------------- - -function TXMLReader.ResolvePredefined(const RefName: WideString): Boolean; +function TXMLReader.ResolvePredefined(const RefName: WideString): WideChar; begin - Result := True; if RefName = 'amp' then - AppendValue('&') + Result := '&' else if RefName = 'apos' then - AppendValue('''') + Result := '''' else if RefName = 'gt' then - AppendValue('>') + Result := '>' else if RefName = 'lt' then - AppendValue('<') + Result := '<' else if RefName = 'quot' then - AppendValue('"') + Result := '"' else - Result := False; + Result := #0; end; -function TXMLReader.ParseReference: TDOMEntityReference; +function TXMLReader.ParseCharRef: Boolean; // [66] var - RefName: WideString; - Radix, Value: Integer; + Value: Integer; begin - Result := nil; - if CheckForChar('#') then // character reference [66] + Result := FCurChar = '#'; + if Result then begin - if CheckForChar('x') then - Radix := 16 - else - Radix := 10; + GetCharRaw; Value := 0; + if CheckForChar('x') then repeat case FCurChar of - '0'..'9': Value := Value * Radix + Ord(FCurChar) - Ord('0'); - 'a'..'f': if Radix = 16 then Value := Value * 16 + Ord(FCurChar) - Ord('a') + 10 else Break; - 'A'..'F': if Radix = 16 then Value := Value * 16 + Ord(FCurChar) - Ord('A') + 10 else Break; + '0'..'9': Value := Value * 16 + Ord(FCurChar) - Ord('0'); + 'a'..'f': Value := Value * 16 + Ord(FCurChar) - (Ord('a') - 10); + 'A'..'F': Value := Value * 16 + Ord(FCurChar) - (Ord('A') - 10); else Break; end; - GetChar; + GetCharRaw; + until False + else + repeat + case FCurChar of + '0'..'9': Value := Value * 10 + Ord(FCurChar) - Ord('0'); + else + Break; + end; + GetCharRaw; until False; - + + ExpectChar(';'); + case Value of - // TODO: in XML1.1, references to $01..$1F are VALID + $01..$08, $0B..$0C, $0E..$1F: + if FXML11 then + BufAppend(FValue, WideChar(Value)) + else + RaiseExc('Invalid character reference'); $09, $0A, $0D, $20..$D7FF, $E000..$FFFD: - AppendValue(WideChar(Value)); + BufAppend(FValue, WideChar(Value)); $10000..$10FFFF: begin - AppendValue(WideChar($D7C0 + (Value shr 10))); - AppendValue(WideChar($DC00 xor (Value and $3FF))); + BufAppend(FValue, WideChar($D7C0 + (Value shr 10))); + BufAppend(FValue, WideChar($DC00 xor (Value and $3FF))); end; else RaiseExc('Invalid character reference'); end; - end - else - begin - RefName := ExpectName; - if not ResolvePredefined(RefName) then - begin - // TODO: try resolve the entity here - Result := doc.CreateEntityReference(RefName); - end; end; - ExpectChar(';'); // reference terminator end; -procedure TXMLReader.ProcessTextAndRefs(Delim: WideChar; DiscardWS: Boolean); +procedure TXMLReader.DoParseAttValue(Delim: WideChar); var - nonWs: Boolean; - RefNode: TDOMEntityReference; + RefNode: TDOMEntityEx; begin - FValueLength := 0; - nonWs := False; - while (FCurChar <> Delim) and (FCurChar <> #0) and (FCurChar <> '<') do + FValue.Length := 0; + while (FCurChar <> Delim) and (FCurChar <> #0) do begin - if not FWhitespace then - nonWs := True; - if FCurChar <> '&' then + if FCurChar = '<' then + RaiseExc('Literal "<" in attribute value') + else if FCurChar <> '&' then begin - AppendValue(FCurChar); - if (FValueLength >= 3) and (FValue[FValueLength-1] = '>') and - (FValue[FValueLength-2] = ']') and (FValue[FValueLength-3] = ']') then - RaiseExc('Literal '']]>'' is not allowed in text'); - GetChar; + if FWhitespace then + FCurChar := #32; + BufAppend(FValue, FCurChar); + GetCharRaw; end else begin - GetChar; // skip '&' + GetCharRaw; // skip '&' + if ParseCharRef then + Continue; + RefNode := ParseReference; if Assigned(RefNode) then begin - if FValueLength > 0 then + if FValue.Length > 0 then begin - if (not DiscardWs) or nonWs then - FCursor.AppendChild(doc.CreateTextNodeBuf(@FValue[0], FValueLength)); - FValueLength := 0; - nonWs := False; + FCursor.AppendChild(doc.CreateTextNodeBuf(FValue.Buffer, FValue.Length)); + FValue.Length := 0; end; - FCursor.AppendChild(RefNode); + + if RefNode.SystemID <> '' then + RaiseExc('External entity reference is not allowed in attribute value'); + + IncludeEntity(RefNode, True); end; end; end; // while - if ((not DiscardWs) or nonWs) and (FValueLength > 0) then + if FValue.Length > 0 then begin - FCursor.AppendChild(doc.CreateTextNodeBuf(@FValue[0], FValueLength)); - FValueLength := 0; + FCursor.AppendChild(doc.CreateTextNodeBuf(FValue.Buffer, FValue.Length)); + FValue.Length := 0; + end; +end; + +procedure TXMLReader.DoParseFragment; +begin + ParseContent; + if FCurChar <> #0 then + RaiseExc('Closing tag not allowed here'); +end; + + +function TXMLReader.ContextPush(AEntity: TDOMEntityEx): Boolean; +var + Src: TXMLInputSource; +begin + if AEntity.SystemID <> '' then + begin + Result := ResolveEntity(AEntity.SystemID, AEntity.PublicID, Src); + if not Result then + Exit; +{ + TODO: need different handling of TextDecl in external PEs + it cannot be parsed if PE is referenced INSIDE declaration + But - is such case ever met in the wild ?? E.g. MSXML fails such things... +} + FAllowedDecl := dtText; + end + else + Src := TXMLInputSource.Create(AEntity.FReplacementText); + + AEntity.FOnStack := True; + Src.FEntity := AEntity; + + Src.FParent := FSource; + Src.FCursor := FCursor; + Unget(FCurChar); // remember FCurChar in previous context + + Inc(FEntityLevel); + Initialize(Src); + Result := True; +end; + +function TXMLReader.ContextPop: Boolean; +var + Src: TXMLInputSource; +begin + Result := Assigned(FSource.FParent); + if Result then + begin + Src := FSource.FParent; + if Assigned(FSource.FEntity) then + TDOMEntityEx(FSource.FEntity).FOnStack := False; + FCursor := TDOMNode(FSource.FCursor); + FSource.Free; + FSource := Src; + Dec(FEntityLevel); + GetChar; // re-classify - case of "%pe1;%pe2;" + end; +end; + +procedure TXMLReader.IncludeEntity(AEntity: TDOMEntityEx; InAttr: Boolean); +var + Node, Child: TDOMNode; +begin + if not AEntity.FResolved then + begin + if AEntity.FOnStack then + RaiseExc('Entity ''%s'' recursively references itself', [AEntity.NodeName]); + + if ContextPush(AEntity) then + begin + GetCharRaw; + CheckForChar(#$FEFF); + + FCursor := AEntity; // build child node tree for the entity + try + if InAttr then + DoParseAttValue(#0) + else + DoParseFragment; + AEntity.FResolved := True; + finally + ContextPop; // FCursor restored + FValue.Length := 0; + end; + end; + end; + Node := FCursor; + if FCreateEntityRefs or (not AEntity.FResolved) then + begin + Node := doc.CreateEntityReference(AEntity.NodeName); + FCursor.AppendChild(Node); + end; + + Child := AEntity.FirstChild; // clone the entity node tree + while Assigned(Child) do + begin + Node.AppendChild(Child.CloneNode(True)); + Child := Child.NextSibling; + end; +end; + +procedure TXMLReader.StartPE; +var + PEName: WideString; + PEnt: TDOMEntityEx; +begin + SetString(PEName, FName.Buffer, FName.Length); + PEnt := FDocType.PEMap.GetNamedItem(PEName) as TDOMEntityEx; + if PEnt = nil then // TODO -cVC: Referencing undefined PE + begin // (These are classified as 'optional errors'...) +// ValidationError('Undefined parameter entity referenced: %s', [PEName]); + Exit; + end; + + if PEnt.FOnStack then + RaiseExc('Entity ''%%%s'' recursively references itself', [PEnt.NodeName]); + + ContextPush(PEnt); +end; + +function TXMLReader.ParseReference: TDOMEntityEx; +var + RefName: WideString; + Predef: WideChar; +begin + Result := nil; + RefName := ExpectName; + ExpectChar(';'); + Predef := ResolvePredefined(RefName); + if Predef <> #0 then + BufAppend(FValue, Predef) + else + begin + if Assigned(FDocType) then + Result := FDocType.Entities.GetNamedItem(RefName) as TDOMEntityEx; + + if Result = nil then + begin + if FStandalone or (FDocType = nil) or not (FDocType.HasPERefs or (FDocType.SystemID <> '')) then + RaiseExc('Undefined entity ''%s'' referenced', [RefName]) + else + ValidationError('Undefined entity ''%s'' referenced', [RefName]); + end + else + begin + if FStandalone and (not Result.FInternal) then + RaiseExc('Standalone constraint violation'); + if Result.NotationName <> '' then + RaiseExc('Reference to unparsed entity ''%s''', [RefName]); + end; + end; +end; + +procedure TXMLReader.ProcessTextAndRefs; +var + nonWs: Boolean; + last: WideChar; + RefNode: TDOMEntityEx; +begin + FValue.Length := 0; + nonWs := False; + FAllowedDecl := dtNone; + while (FCurChar <> '<') and (FCurChar <> #0) do + begin + if FCurChar <> '&' then + begin + if not FWhitespace then + nonWs := True; + BufAppend(FValue, FCurChar); + if FCurChar = '>' then + with FValue do + if (Length >= 3) and + (Buffer[Length-2] = ']') and (Buffer[Length-3] = ']') then + RaiseExc('Literal '']]>'' is not allowed in text'); + GetCharRaw; + end + else + begin + GetCharRaw; // skip '&' + if ParseCharRef then + begin + last := FValue.Buffer[FValue.Length-1]; + if (last <> #9) and (last <> #10) and (last <> #13) and (last <> #32) then + nonWs := True; + Continue; + end; + nonWs := True; + RefNode := ParseReference; + if Assigned(RefNode) then + begin + if (nonWs or FPreserveWhitespace) and (FValue.Length > 0) then + begin + FCursor.AppendChild(doc.CreateTextNodeBuf(FValue.Buffer, FValue.Length)); + FValue.Length := 0; + nonWs := False; + end; + IncludeEntity(RefNode, False); + end; + end; + end; // while + if (nonWs or FPreserveWhitespace) and (FValue.Length > 0) then + begin + FCursor.AppendChild(doc.CreateTextNodeBuf(FValue.Buffer, FValue.Length)); + FValue.Length := 0; end; end; @@ -608,231 +1410,292 @@ begin if (FCurChar <> '''') and (FCurChar <> '"') then RaiseExpectedQmark; Delim := FCurChar; - GetChar; // skip quote + GetCharRaw; // skip quote + DoParseAttValue(Delim); + GetChar; // NOTE: not GetCharRaw - when parsing AttDef in DTD, + // immediately following PERef must be recognized +end; - ProcessTextAndRefs(Delim, False); - if FCurChar = '<' then - RaiseExc('"<" is not allowed in attribute value'); - - GetChar; // skip trailing quote +function TXMLReader.SkipQuotedLiteral: Boolean; +var + Delim: WideChar; +begin + Result := (FCurChar = '''') or (FCurChar = '"'); + if Result then + begin + Delim := FCurChar; + GetCharRaw; // skip quote + FValue.Length := 0; + while (FCurChar <> Delim) and (FCurChar <> #0) do + begin + BufAppend(FValue, FCurChar); + GetCharRaw; + end; + ExpectChar(Delim); // <-- to check the EOF only + end; end; procedure TXMLReader.SkipPubidLiteral; // [12] var - Delim: WideChar; + I: Integer; begin - if (FCurChar = '''') or (FCurChar = '"') then + if SkipQuotedLiteral then begin - Delim := FCurChar; - GetChar; // skip quote - SkipString(PubidChars - [Char(Delim)]); // <-- PubidChars do not contain `"` - ExpectChar(Delim); + for I := 0 to FValue.Length-1 do + if (FValue.Buffer[I] > #255) or not (Char(FValue.Buffer[I]) in PubidChars) then + RaiseExc('Illegal Public ID literal') end else RaiseExpectedQMark; end; -// starting '= 2) and (FValue[FValueLength-1] = '-') and - (FValue[FValueLength-2] = '-') then + BufAppend(FValue, FCurChar); + GetCharRaw; + with FValue do + if (Length >= 2) and (Buffer[Length-1] = '-') and + (Buffer[Length-2] = '-') then begin - Dec(FValueLength, 2); - Break; + Dec(Length, 2); + if Assigned(FCursor) then + FCursor.AppendChild(doc.CreateCommentBuf(Buffer, Length)); + ExpectChar('>'); + Exit; end; - until FCurChar = #0; // should not happen - - if FCurChar = #0 then - RaiseExc('Unterminated comment'); - ExpectChar('>'); - - FCursor.AppendChild(doc.CreateCommentBuf(@FValue[0], FValueLength)); + until FCurChar = #0; + RaiseExc('Unterminated comment'); end; -// starting ' 'xml' then // FIX: ibm23n04.xml - RaiseExc('"xml" reserved word must be lowercase'); - if not FPrologParsed then + if Name <> 'xml' then + RaiseExc('''xml'' is a reserved word; it must be lowercase'); + if FAllowedDecl <> dtNone then begin - ParseProlog; - FPrologParsed := True; + ParseXmlOrTextDecl(FAllowedDecl = dtText); + FAllowedDecl := dtNone; Exit; end else - RaiseExc('" '?' then ExpectWhitespace; - FValueLength := 0; + FAllowedDecl := dtNone; + FValue.Length := 0; repeat - AppendValue(FCurChar); - GetChar; - if (FValueLength >= 2) and (FValue[FValueLength-1] = '>') and - (FValue[FValueLength-2] = '?') then + BufAppend(FValue, FCurChar); + GetCharRaw; + with FValue do + if (Length >= 2) and (Buffer[Length-1] = '>') and + (Buffer[Length-2] = '?') then begin - Dec(FValueLength, 2); - Break; + Dec(Length, 2); + SetString(Value, Buffer, Length); + if Assigned(FCursor) then + FCursor.AppendChild(Doc.CreateProcessingInstruction(Name, Value)); + Exit; end; - until FCurChar = #0; // should not happen - - if FCurChar = #0 then - RaiseExc('Unterminated processing instruction'); - - SetString(Value, PWideChar(@FValue[0]), FValueLength); - FCursor.AppendChild(Doc.CreateProcessingInstruction(Name, Value)); + until FCurChar = #0; + RaiseExc('Unterminated processing instruction'); end; + // here we come from ParsePI, 'xml' is already consumed -procedure TXMLReader.ParseProlog; +procedure TXMLReader.ParseXmlOrTextDecl(TextDecl: Boolean); var - Delim: WideChar; - svalue: WideString; + TmpStr: WideString; + IsXML11: Boolean; begin - // '' - // VersionInfo: S 'version' Eq (' VersionNum ' | " VersionNum ") - SkipWhitespace; - ExpectString('version'); - ExpectEq; - if (FCurChar = '''') or (FCurChar = '"') then + ExpectWhitespace; + // VersionInfo: optional in TextDecl, required in XmlDecl + if (not TextDecl) or (FCurChar = 'v') then begin - Delim := FCurChar; - GetChar; // skip quote - if doc.InheritsFrom(TXMLDocument) then - TXMLDocument(doc).XMLVersion := GetString(NmToken); - ExpectChar(Delim); + ExpectString('version'); // [24] + ExpectEq; + SkipSystemLiteral(TmpStr, True); + IsXML11 := False; + if TmpStr = '1.1' then // Checking for bad chars is implied + IsXML11 := True + else if TmpStr <> '1.0' then + RaiseExc('Illegal version number'); + + if not TextDecl then + begin + if doc.InheritsFrom(TXMLDocument) then + TXMLDocument(doc).XMLVersion := TmpStr; + if IsXML11 then + XML11_BuildTables; + end + else // parsing external entity + if IsXML11 and not FXML11 then + RaiseExc('XML 1.0 document cannot invoke XML 1.1 entities'); + if FCurChar <> '?' then ExpectWhitespace; - end - else - RaiseExpectedQMark; + end; - if FCurChar = 'e' then // [80] + // EncodingDecl: required in TextDecl, optional in XmlDecl + if TextDecl or (FCurChar = 'e') then // [80] begin ExpectString('encoding'); ExpectEq; - if (FCurChar = '''') or (FCurChar = '"') then - begin - Delim := FCurChar; - GetChar; // skip quote - if not ((ord(FCurChar) < 256) and (char(FCurChar) in ['A'..'Z', 'a'..'z'])) then - RaiseExc('Expected character (A-Z, a-z)'); - SkipString(['A'..'Z', 'a'..'z', '0'..'9', '.', '_', '-']); - // TODO: analyze encoding string, and adjust FSource if needed and possible - ExpectChar(Delim); - if FCurChar <> '?' then - ExpectWhitespace; - end - else - RaiseExpectedQMark; + SkipSystemLiteral(TmpStr, True); + + if not IsValidEncName(TmpStr) then + RaiseExc('Illegal encoding name'); + + FSource.SetEncoding(TmpStr); // <-- Wide2Ansi conversion here + // getting here means that specified encoding is supported + // TODO: maybe assign the 'preferred' encoding name? + if not TextDecl and doc.InheritsFrom(TXMLDocument) then + TXMLDocument(doc).Encoding := TmpStr; + + if FCurChar <> '?' then + ExpectWhitespace; end; - // SDDecl? - if FCurChar = 's' then + // SDDecl: forbidden in TextDecl, optional in XmlDecl + if (not TextDecl) and (FCurChar = 's') then begin ExpectString('standalone'); ExpectEq; - if (FCurChar = '''') or (FCurChar = '"') then - begin - Delim := FCurChar; - GetChar; // skip quote - svalue := ExpectName; - if (svalue <> 'yes') and (svalue <> 'no') then - RaiseExc('Standalone attribute may only have value "yes" or "no"'); - ExpectChar(Delim); - end - else - RaiseExpectedQMark; + SkipSystemLiteral(TmpStr, True); + if TmpStr = 'yes' then + FStandalone := True + else if TmpStr <> 'no' then + RaiseExc('Only "yes" or "no" are permitted as values of "standalone"'); SkipWhitespace; end; ExpectString('?>'); end; -function TXMLReader.ParseInternalDtd: Boolean; +procedure TXMLReader.ParseDoctypeDecl; // [28] var - DocType: TXMLReaderDocumentType; + IntSubset: TWideCharBuf; + Src, OldSrc: TXMLInputSource; begin - // Check for "(doctypedecl Misc*)?" [28] - Result := (FCurChar = 'D'); - if Result then - begin - FPrologParsed := True; - ExpectString('DOCTYPE'); - // create the DTD object - DocType := TXMLReaderDocumentType.Create(doc as TXMLDocument); - if doc.InheritsFrom(TXMLDocument) then - TXMLDocument(doc).AppendChild(DocType); - SkipWhitespace; - DocType.FName := ExpectName; - SkipWhitespace; - ParseExternalID(False); // may be absent, ignore result + FAllowedDecl := dtNone; + + if FDtdParsed then + RaiseExc('Markup declaration not allowed here'); + + ExpectString('DOCTYPE'); // gives possibly incorrect error message + ExpectWhitespace; + + FDocType := TDOMDocumentTypeEx.Create(doc); + FDtdParsed := True; +{ To comply with certain output tests, we must insert PIs coming from internal + subset before DocType node. This looks very synthetic, but let it be... + Moreover, this code actually duplicates such PIs } + try + FDocType.FName := ExpectName; + ExpectWhitespace; + ParseExternalID(FDocType.FSystemID, FDocType.FPublicID, False); SkipWhitespace; - if CheckForChar('[') then + if FCurChar = '[' then begin - repeat - SkipWhitespace; - until not (ParseMarkupDecl(True) or ParsePEReference); - ExpectChar(']'); + BufAllocate(IntSubset, 256); + FCopyBuf := @IntSubset; + GetChar; // cause very first char after '[' to be appended + try + FIntSubset := True; + ParseMarkupDecl; + if IntSubset.Length > 0 then // sanity check - must at least contain ']' + SetString(FDocType.FInternalSubset, IntSubset.Buffer, IntSubset.Length-1); + ExpectChar(']'); + finally + FIntSubset := False; + FCopyBuf := nil; + FreeMem(IntSubset.Buffer); + end; SkipWhitespace; end; ExpectChar('>'); - ParseMisc; - Exit; + + if FDocType.SystemID <> '' then + begin + if ResolveEntity(FDocType.SystemID, FDocType.PublicID, Src) then + begin + OldSrc := FSource; + Unget(FCurChar); + FCursor := nil; + try + DoParseExtSubset(Src); + finally + while ContextPop do; // Cleanup after possible exceptions + FSource.Free; + FSource := OldSrc; + GetChar; + FCursor := Doc; + end; + end; + end; + finally + doc.AppendChild(FDocType); end; end; -procedure TXMLReader.ExpectProlog; // [22] +procedure TXMLReader.ParseMisc; begin - FPrologParsed := False; - // Check for "Misc*". - // ParseMisc() is inlined here and slightly modified - // because we need to distinguish ' Delim then + RaiseExc(Delim); + GetChar; // skip delimiter until False; end; -{ DTD stuff } +procedure TXMLReader.ParseMixedOrChildren; +var + PELevel: Integer; + NeedAsterisk: Boolean; +begin + PELevel := FEntityLevel; + GetChar; // starting bracket + SkipWhitespace; + if CheckForChar('#') then // Mixed section [51] + begin + ExpectString('PCDATA'); + SkipWhitespace; + NeedAsterisk := False; + while FCurChar <> ')' do + begin + ExpectChar('|'); + NeedAsterisk := True; + SkipWhitespace; + SkipName; + SkipWhitespace; + end; + AssertPENesting(PELevel); + GetChar; + if NeedAsterisk then + ExpectChar('*') + else + CheckForChar('*'); + end + else // Parse Children section [47] + begin + ExpectChoiceOrSeq; + AssertPENesting(PELevel); + GetChar; + if CheckForChar('?') then + else if CheckForChar('*') then + else if CheckForChar('+') then; + end; +end; procedure TXMLReader.ParseElementDecl; // [45] - - procedure ExpectChoiceOrSeq; // [49], [50] - - procedure ExpectCP; // [48] - begin - if CheckForChar('(') then - ExpectChoiceOrSeq - else - SkipName; - if CheckForChar('?') then - else if CheckForChar('*') then - else if CheckForChar('+') then; - end; - - var - Delim: WideChar; - begin - SkipWhitespace; - ExpectCP; - Delim := #0; - repeat - SkipWhitespace; - if (FCurChar = #0) or CheckForChar(')') then - Break; - if Delim = #0 then - begin - if (FCurChar = '|') or (FCurChar = ',') then - Delim := FCurChar - else - RaiseExc('Expected "|" or ","'); - end - else - if FCurChar <> Delim then - RaiseExc(Delim, FCurChar); - GetChar; // skip delimiter - SkipWhitespace; - ExpectCP; - until False; - end; - begin SkipName; ExpectWhitespace; @@ -918,60 +1829,35 @@ begin ExpectString('EMPTY') else if FCurChar = 'A' then ExpectString('ANY') - else if CheckForChar('(') then - begin - SkipWhitespace; - if CheckForChar('#') then - begin - // Parse Mixed section [51] - ExpectString('PCDATA'); - SkipWhitespace; - if not CheckForChar(')') then - begin - repeat - ExpectChar('|'); - SkipWhitespace; - SkipName; - SkipWhitespace; - until FCurChar = ')'; - GetChar; - ExpectChar('*'); - end - else // 'PCDATA' followed by ')' - fixes valid/P96/ibm69v01.xml - CheckForChar('*'); - end - else // Parse Children section [47] - begin - ExpectChoiceOrSeq; - - if CheckForChar('?') then - else if CheckForChar('*') then - else if CheckForChar('+') then; - end; - end + else if FCurChar = '(' then + ParseMixedOrChildren else RaiseExc('Invalid content specification'); - - SkipWhitespace; - ExpectChar('>'); end; + procedure TXMLReader.ParseNotationDecl; // [82] +var + Notation: TDOMNotationEx; begin - SkipName; - ExpectWhitespace; - // Unclear rule... - // IE understands 'SYSTEM' followed by literal and 'PUBLIC' followed by 2 literals - // this is what is handled in ParseExternalID(). - if ParseExternalID(True) then -(* else if CheckFor('PUBLIC') then - begin // [83] + Notation := TDOMNotationEx(TDOMNotation.Create(Doc)); + try + Notation.FName := ExpectName; ExpectWhitespace; - SkipPubidLiteral; - end *) else - RaiseExc('Expected external or public ID'); - SkipWhitespace; - ExpectChar('>'); + if not ParseExternalID(Notation.FSystemID, Notation.FPublicID, True) then + RaiseExc('Expected external or public ID'); + except + Notation.Free; + raise; + end; + + if FDocType.Notations.GetNamedItem(Notation.FName) = nil then + FDocType.Notations.SetNamedItem(Notation) + else + begin + ValidationError('Duplicate notation declaration: %s', [Notation.FName]); + Notation.Free; + end; end; procedure TXMLReader.ParseAttlistDecl; // [52] @@ -979,170 +1865,302 @@ var SaveCurNode: TDOMNode; ValueRequired: Boolean; Token: WideString; + ElDef: TDOMElementDef; + AttDef: TDOMAttrDef; begin - SkipName; - SkipWhitespace; - while not CheckForChar('>') do + Token := ExpectName; + ElDef := TDOMElementDef(FDocType.ElementDefs.GetNamedItem(Token)); + if ElDef = nil then begin - SkipName; - ExpectWhitespace; - Token := GetString(['A'..'Z']); // Get AttType [54], [55], [56] - if Token = 'CDATA' then - else if Token = 'ID' then - else if Token = 'IDREF' then - else if Token = 'IDREFS' then - else if Token = 'ENTITY' then - else if Token = 'ENTITIES' then - else if Token = 'NMTOKEN' then - else if Token = 'NMTOKENS' then - else if Token = 'NOTATION' then // [57], [58] - begin + // TODO -cVC: must distinguish ElementDef created here from one explicitly declared + ElDef := TDOMElementDef.Create(doc); + ElDef.FNodeName := Token; + FDocType.ElementDefs.SetNamedItem(ElDef); + end; + SkipWhitespace; + while FCurChar <> '>' do + begin + SkipWhitespace; { !!! } + AttDef := TDOMAttrDef.Create(doc); + try + AttDef.FName := ExpectName; ExpectWhitespace; - ExpectChar('('); - SkipWhitespace; - SkipName; - SkipWhitespace; - while not CheckForChar(')') do + Token := GetString(['A'..'Z']); // Get AttType [54], [55], [56] + if Token = 'CDATA' then + AttDef.FDataType := DT_CDATA + else if Token = 'ID' then + AttDef.FDataType := DT_ID + else if Token = 'IDREF' then + AttDef.FDataType := DT_IDREF + else if Token = 'IDREFS' then + AttDef.FDataType := DT_IDREFS + else if Token = 'ENTITY' then + AttDef.FDataType := DT_ENTITY + else if Token = 'ENTITIES' then + AttDef.FDataType := DT_ENTITIES + else if Token = 'NMTOKEN' then + AttDef.FDataType := DT_NMTOKEN + else if Token = 'NMTOKENS' then + AttDef.FDataType := DT_NMTOKENS + else if Token = 'NOTATION' then // [57], [58] begin - ExpectChar('|'); - SkipWhitespace; - SkipName; - SkipWhitespace; - end; - end - else - if CheckForChar('(') then - begin // [59] - SkipWhitespace; - SkipString(Nmtoken); - if FValueLength = 0 then // Fix ibm59n01.xml - name should be present - RaiseNameNotFound; - SkipWhitespace; - while not CheckForChar(')') do - begin - ExpectChar('|'); - SkipWhitespace; - SkipString(Nmtoken); - SkipWhitespace; - end; - end else - RaiseExc('Invalid tokenized type'); - - ExpectWhitespace; - - // Get DefaultDecl [60] - ValueRequired := False; - if CheckForChar('#') then - begin - Token := GetString(['A'..'Z']); - if Token = 'REQUIRED' then - else if Token = 'IMPLIED' then - else if Token = 'FIXED' then - begin - ExpectWhitespace; // Fix ibm60n05.xml - ValueRequired := True; + AttDef.FDataType := DT_NOTATION; + ExpectWhitespace; + ExpectChar('('); + repeat + SkipWhitespace; + SkipName; + SkipWhitespace; + until not CheckForChar('|'); + ExpectChar(')'); end else - RaiseExc('Illegal attribute definition'); // Fix sun/not-wf/attlist08.xml - end - else - ValueRequired := True; + if CheckForChar('(') then // [59] + begin + AttDef.FDataType := DT_NMTOKEN; + repeat + SkipWhitespace; + if not CheckNmToken then + RaiseNameNotFound; // not completely correct error message + SkipWhitespace; + until not CheckForChar('|'); + ExpectChar(')'); + end else + RaiseExc('Invalid tokenized type'); - if ValueRequired then - begin - SaveCurNode := FCursor; - FCursor := doc.CreateAttribute(''); - ExpectAttValue; - FCursor.Free; // avoid memory leaks - FCursor := SaveCurNode; + ExpectWhitespace; + + // Get DefaultDecl [60] + ValueRequired := False; + if CheckForChar('#') then + begin + Token := GetString(['A'..'Z']); + if Token = 'REQUIRED' then + AttDef.FDefault := AD_REQUIRED + else if Token = 'IMPLIED' then + AttDef.FDefault := AD_IMPLIED + else if Token = 'FIXED' then + begin + AttDef.FDefault := AD_FIXED; + ExpectWhitespace; + ValueRequired := True; + end + else + RaiseExc('Illegal attribute default'); + end + else + begin + AttDef.FDefault := AD_DEFAULT; + ValueRequired := True; + end; + + if ValueRequired then + begin + SaveCurNode := FCursor; + FCursor := AttDef; +// tricky moment, no tests for that +{ FRecognizePE := False; } // TODO: shall it really be disabled? + try + ExpectAttValue; + finally + FCursor := SaveCurNode; +{ FRecognizePE := not FIntSubset; } + end; + if AttDef.FDataType = DT_ID then + ValidationError('Attributes of type ID must not have a default value',[]); + end; + + // First declaration is binding, subsequent should be ignored + if Assigned(ElDef.GetAttributeNode(AttDef.Name)) then + AttDef.Free + else + ElDef.SetAttributeNode(AttDef); + except + AttDef.Free; + raise; end; SkipWhitespace; end; end; -procedure TXMLReader.ParseEntityDecl; // [70] - - function ParseEntityValue: Boolean; // [9] - var - Delim: WideChar; +procedure TXMLReader.ParseEntityDeclValue(Delim: WideChar); // [9] +var + I: Integer; + Src: TXMLInputSource; +begin + Src := FSource; + // "Included in literal": process until delimiter hit IN SAME context + while not ((FSource = Src) and CheckForChar(Delim)) do + if ParsePEReference then begin - if (FCurChar = '''') or (FCurChar = '"') then + if FIntSubset and (FSource.FParent = nil) then + RaiseExc('PE references in internal subset not allowed inside declarations'); + StartPE; + GetCharRaw; + end + else if FCurChar = '&' then // CharRefs: include, EntityRefs: bypass + begin + GetCharRaw; + if not ParseCharRef then begin - Delim := FCurChar; - GetChar; // skip quote - while not ((FCurChar = #0) or CheckForChar(Delim)) do - if ParsePEReference then - begin - if FInternalSubset then - RaiseExc('PE references in internal subset may not occur inside declarations'); - end - else if CheckForChar('&') then - begin - ParseReference().Free; // may look awful... but avoid memory leaks - end - else begin - GetChar; // Normal character - end; - Result := True; - end - else - Result := False; + BufAppend(FValue, '&'); + ExpectName; + ExpectChar(';'); + for I := 0 to FName.Length-1 do + BufAppend(FValue, FName.Buffer[I]); + BufAppend(FValue, ';'); + end; + end + else if FCurChar <> #0 then // Regular character + begin + BufAppend(FValue, FCurChar); + GetCharRaw; + end + else if not ContextPop then // #0 + Break; +end; + +procedure TXMLReader.ParseEntityDecl; // [70] +var + NDataAllowed: Boolean; + Delim: WideChar; + Entity: TDOMEntityEx; + Map: TDOMNamedNodeMap; +begin + NDataAllowed := True; + Map := FDocType.Entities; + if CheckForChar('%') then // [72] + begin + ExpectWhitespace; + NDataAllowed := False; + Map := FDocType.PEMap; end; -begin - if CheckForChar('%') then // [72] - begin + Entity := TDOMEntityEx.Create(Doc); + try + Entity.FInternal := FIntSubset and (FSource.FParent = nil); + Entity.FName := ExpectName; ExpectWhitespace; - ExpectName; - ExpectWhitespace; - // Get PEDef [74] - if ParseEntityValue then - // SYSTEM | PUBLIC - else if ParseExternalID(False) then - else - RaiseExc('Expected entity value or external ID'); - end - else // [71] - begin - ExpectName; - ExpectWhitespace; - // Get EntityDef [73] - if ParseEntityValue then - else + + if (FCurChar = '"') or (FCurChar = '''') then + begin + NDataAllowed := False; + Delim := FCurChar; + FRecognizePE := False; // PERef right after delimiter should not be recognized + GetCharRaw; // at char level - we process it 'manually' + FValue.Length := 0; + ParseEntityDeclValue(Delim); + FRecognizePE := not FIntSubset; + SetString(Entity.FReplacementText, FValue.Buffer, FValue.Length); + end + else + if not ParseExternalID(Entity.FSystemID, Entity.FPublicID, False) then + RaiseExc('Expected entity value or external ID'); + + if NDataAllowed then // [76] begin - ExpectExternalID; - // Get NDataDecl [76] if FCurChar <> '>' then - ExpectWhitespace; // FIX: ibm76n03.xml: whitespace REQUIRED before NDATA + ExpectWhitespace; if FCurChar = 'N' then begin ExpectString('NDATA'); ExpectWhitespace; SkipName; + // TODO -cVC: Notation declared. Here or after all has been read? + SetString(Entity.FNotationName, FName.Buffer, FName.Length); + if FDocType.Notations.GetNamedItem(Entity.NotationName) = nil then + ValidationError('Reference to undeclared notation ''%s''', [Entity.NotationName]); end; end; + except + Entity.Free; + raise; end; - SkipWhitespace; - ExpectChar('>'); + + // Repeated declarations of same entity are legal but must be ignored + if Map.GetNamedItem(Entity.NodeName) = nil then + Map.SetNamedItem(Entity) + else + Entity.Free; end; -function TXMLReader.ParseMarkupDecl(InternalSubset: Boolean): Boolean; // [29] +procedure TXMLReader.ParseMarkupDecl; // [29] var Token: WideString; + IncludeLevel: Integer; + IgnoreLevel: Integer; + PELevel: Integer; begin - Result := False; - FInternalSubset := InternalSubset; + IncludeLevel := 0; + IgnoreLevel := 0; repeat - SkipWhitespace; - if not CheckForChar('<') then // condition is true for #0 - Exit; + if SkipWhitespace then + FAllowedDecl := dtNone; + + if ParsePEReference then // PERef between declarations should always be recognized + begin + FAllowedDecl := dtNone; + if Assigned(FDocType) then + FDocType.HasPERefs := True; + StartPE; + GetChar; + Continue; + end; + + if (FCurChar = #0) and ContextPop then + Continue; + + if (FCurChar = ']') and (IncludeLevel > 0) then + begin + ExpectString(']]>'); + Dec(IncludeLevel); + Continue; + end; + + if FCurChar <> '<' then + Break; + + PELevel := FEntityLevel; + GetCharRaw; + if CheckForChar('!') then begin + FAllowedDecl := dtNone; if FCurChar = '-' then ParseComment + else if FCurChar = '[' then + begin + if FIntSubset and (FSource.FParent = nil) then + RaiseExc('Conditional sections not allowed in internal subset'); + + FRecognizePE := not FIntSubset; + GetChar; // skip '[' + SkipWhitespace; + Token := GetString(['A'..'Z']); + SkipWhitespace; + + if Token = 'INCLUDE' then + Inc(IncludeLevel) + else if Token = 'IGNORE' then + IgnoreLevel := 1 + else + RaiseExc('Expected "INCLUDE" or "IGNORE"'); + AssertPENesting(PELevel); + ExpectChar('['); + if IgnoreLevel > 0 then + repeat + FRecognizePE := False; // PEs not recognized in IGNORE section + if CheckForChar('<') and CheckForChar('!') and CheckForChar('[') then + Inc(IgnoreLevel) + else if CheckForChar(']') and CheckForChar(']') and CheckForChar('>') then + Dec(IgnoreLevel) + else GetChar; + until (IgnoreLevel=0) or (FCurChar = #0); + end else begin + FRecognizePE := not FIntSubset; Token := GetString(['A'..'Z']); ExpectWhitespace; if Token = 'ELEMENT' then @@ -1154,88 +2172,97 @@ begin else if Token = 'NOTATION' then ParseNotationDecl else - RaiseExc('Wrong declaration type'); + RaiseExc('Illegal markup declaration'); + + SkipWhitespace; + FRecognizePE := False; // ! Don't auto-pop context on last markup delimiter + ExpectChar('>'); // This enables correct nesting check end; +{ + MarkupDecl starting in PE and ending in root is a WFC [28a] + MarkupDecl starting in root but ending in PE is a VC (erratum 2e-14) +} + if PELevel > FEntityLevel then + RaiseExc('Parameter entities must be properly nested') + else + AssertPENesting(PELevel); end - else if CheckForChar('?') then - ParsePI + else if FCurChar = '?' then + ParsePI; until False; + FRecognizePE := False; + if (IncludeLevel > 0) or (IgnoreLevel > 0) then + RaiseExc('Conditional section not closed'); end; -procedure TXMLReader.ProcessDTD(ABuf: PChar; const AFilename: String); +procedure TXMLReader.DoParseExtSubset(ASource: TXMLInputSource); begin - buf := ABuf; - Filename := AFilename; - FLine := 1; - FColumn := 0; - DetectEncoding; - doc := TXMLDocument.Create; - repeat - SkipWhitespace; - until not (ParseMarkupDecl(False) or ParsePEReference); + InitializeRoot(ASource); + FAllowedDecl := dtText; + ParseMarkupDecl; + if FCurChar <> #0 then + RaiseExc('Illegal character in DTD'); end; +procedure TXMLReader.ProcessDTD(ASource: TXMLInputSource); +begin + doc := TXMLDocument.Create; + FDocType := TDOMDocumentTypeEx.Create(doc); + // TODO: DTD labeled version 1.1 will be rejected - must set FXML11 flag + // TODO: what shall be FCursor? FDocType cannot - it does not accept child nodes + doc.AppendChild(FDocType); + DoParseExtSubset(ASource); +end; -// starting '= 3) and (FValue[FValueLength-1] = '>') and - (FValue[FValueLength-2] = ']') and (FValue[FValueLength-3] = ']') then + BufAppend(FValue, FCurChar); + GetCharRaw; + with FValue do + if (Length >= 3) and (Buffer[Length-1] = '>') and + (Buffer[Length-2] = ']') and (Buffer[Length-3] = ']') then begin - Dec(FValueLength, 3); - Break; + Dec(Length, 3); + SetString(name, Buffer, Length); + FCursor.AppendChild(doc.CreateCDATASection(name)); + Exit; end; until FCurChar = #0; - - if FCurChar = #0 then - RaiseExc('Unterminated CDATA section'); - - SetString(name, PWideChar(@FValue[0]), FValueLength); - FCursor.AppendChild(doc.CreateCDATASection(name)); + RaiseExc('Unterminated CDATA section'); end; -{ - returns True at end of stream. - this is ok for fragments but error for document - returns False when '<' is followed by ([^![?] | NameStartChar) - this is ok for document (expect ETag then) but error for fragment -} - -function TXMLReader.ParseElementContent: Boolean; +procedure TXMLReader.ParseContent; begin - Result := False; repeat if FCurChar = '<' then begin - GetChar; - if FCurChar = '!' then + GetCharRaw; + if CheckName then + ParseElement + else if FCurChar = '!' then begin - GetChar; + GetCharRaw; + FAllowedDecl := dtNone; if FCurChar = '[' then ParseCDSect else if FCurChar = '-' then ParseComment else - RaiseExc('Document type declarations not allowed here'); + ParseDoctypeDecl; // actually will raise error end - else if CheckName then - ParseElement - else if CheckForChar('?') then + else if FCurChar = '?' then ParsePI else Exit; end else - ProcessTextAndRefs('<', True); + ProcessTextAndRefs; until FCurChar = #0; - Result := True; end; // Element name already in FNameBuffer @@ -1243,11 +2270,9 @@ procedure TXMLReader.ParseElement; // [39] [40] [44] var NewElem: TDOMElement; IsEmpty: Boolean; - attr, OldAttr: TDOMAttr; + attr, OldAttr: TDOMNode; begin - {$IFDEF MEM_CHECK}CheckHeapWrtMemCnt(' ParseElement A');{$ENDIF} - - NewElem := doc.CreateElementBuf(@FName[0], FNameLength); + NewElem := doc.CreateElementBuf(FName.Buffer, FName.Length); FCursor.AppendChild(NewElem); Assert(NewElem.ParentNode = FCursor, 'AppendChild did not set ParentNode'); FCursor := NewElem; @@ -1255,9 +2280,9 @@ begin IsEmpty := False; while FCurChar <> '>' do begin - {$IFDEF MEM_CHECK}CheckHeapWrtMemCnt(' ParseElement E');{$ENDIF} - if CheckForChar('/') then + if FCurChar = '/' then begin + GetCharRaw; IsEmpty := True; FCursor := FCursor.ParentNode; Break; @@ -1268,53 +2293,80 @@ begin if not CheckName then // allow stuff like , Continue; - attr := doc.CreateAttributeBuf(@FName[0], FNameLength); + attr := doc.CreateAttributeBuf(FName.Buffer, FName.Length); - // WFC: Attribute must be unique // !!cannot use TDOMElement.SetAttributeNode because it will free old attribute - OldAttr := TDOMAttr(NewElem.Attributes.SetNamedItem(Attr)); + OldAttr := NewElem.Attributes.SetNamedItem(Attr); if Assigned(OldAttr) then begin OldAttr.Free; RaiseExc('Duplicate attribute'); end; ExpectEq; - Assert(attr.OwnerElement = NewElem, 'DOMAttr.OwnerElement not set correctly'); + Assert(TDOMAttr(attr).OwnerElement = NewElem, 'DOMAttr.OwnerElement not set correctly'); FCursor := attr; ExpectAttValue; FCursor := NewElem; end; ExpectChar('>'); + ProcessDefaultAttributes(NewElem); if not IsEmpty then begin - SkipWhitespace; - if not ParseElementContent then + if not FPreserveWhitespace then // critical for testsuite compliance + SkipWhitespace; + ParseContent; + if FCurChar = '/' then // Get ETag [42] begin - if CheckForChar('/') then // Get ETag [42] - begin - if ExpectName <> NewElem.NodeName then - RaiseExc('Unmatching element end tag (expected "")'); - SkipWhitespace; - ExpectChar('>'); - FCursor := FCursor.ParentNode; - end - else - RaiseNameNotFound; + GetCharRaw; + if ExpectName <> NewElem.TagName then + RaiseExc('Unmatching element end tag (expected "")', [NewElem.TagName]); + SkipWhitespace; + ExpectChar('>'); + FCursor := FCursor.ParentNode; end + else if FCurChar <> #0 then + RaiseNameNotFound else // End of stream in content RaiseExc('Document element not closed'); end; - {$IFDEF MEM_CHECK}CheckHeapWrtMemCnt(' ParseElement END');{$ENDIF} end; - -procedure TXMLReader.ExpectElement; +procedure TXMLReader.ProcessDefaultAttributes(Element: TDOMElement); +var + I: Integer; + ElDef: TDOMElementDef; + AttDefs: TDOMNamedNodeMap; + AttDef: TDOMAttrDef; + Attr: TDOMAttrEx; + Spec: Boolean; begin - if CheckName then - ParseElement - else - RaiseExc('Expected element'); + if Assigned(FDocType) then + begin + ElDef := TDOMElementDef(FDocType.ElementDefs.GetNamedItem(Element.TagName)); + if Assigned(ElDef) and ElDef.HasAttributes then + begin + AttDefs := ElDef.Attributes; + for I := 0 to AttDefs.Length-1 do + begin + AttDef := AttDefs[I] as TDOMAttrDef; + Spec := True; + // no validity checking yet; just append default values + Attr := TDOMAttrEx(Element.GetAttributeNode(AttDef.Name)); + if (AttDef.FDefault in [AD_DEFAULT, AD_FIXED]) and (Attr = nil) then + begin + Attr := TDOMAttrEx(AttDef.CloneNode(True)); + Element.SetAttributeNode(Attr); + Spec := False; + end; + if Assigned(Attr) then + begin + Attr.FSpecified := Spec; + Attr.FNormalize := (AttDef.FDataType <> DT_CDATA); + end; + end; + end; + end; end; function TXMLReader.ParsePEReference: Boolean; // [69] @@ -1327,180 +2379,84 @@ begin end; end; - -function TXMLReader.ParseExternalID(InNotation: Boolean): Boolean; // [75] - - function SkipSystemLiteral: Boolean; - var - Delim: WideChar; - begin - if (FCurChar = '''') or (FCurChar = '"') then - begin - Delim := FCurChar; - GetChar; // skip quote - while (FCurChar <> Delim) and (FCurChar <> #0) do - begin - GetChar; - end; - ExpectChar(Delim); // <-- to check the EOF only - Result := True; - end - else - Result := False; - end; - +function TXMLReader.ParseExternalID(out SysID, PubID: WideString; // [75] + SysIdOptional: Boolean): Boolean; begin if FCurChar = 'S' then begin ExpectString('SYSTEM'); ExpectWhitespace; - if not SkipSystemLiteral then - RaiseExpectedQMark; // FIX ibm75n06.xml: system literal MUST be present + SkipSystemLiteral(SysID, True); Result := True; end - else - if FCurChar = 'P' then + else if FCurChar = 'P' then begin ExpectString('PUBLIC'); ExpectWhitespace; SkipPubidLiteral; - if InNotation then + SetString(PubID, FValue.Buffer, FValue.Length); + if SysIdOptional then begin SkipWhitespace; - SkipSystemLiteral; + SkipSystemLiteral(SysID, False); end else begin ExpectWhitespace; - if not SkipSystemLiteral then - RaiseExpectedQMark; // FIX ibm75n06.xml: system literal MUST be present - end; + SkipSystemLiteral(SysID, True); + end; Result := True; end else Result := False; end; -procedure TXMLReader.ExpectExternalID; +procedure TXMLReader.ValidationError(const Msg: string; + const args: array of const); begin - if not ParseExternalID(False) then - RaiseExc('Expected external ID'); -end; - - -{ Currently, this method will only resolve the entities which are - predefined in XML: } - -procedure TXMLReader.ResolveEntities(RootNode: TDOMNode); -var - Node, NextNode: TDOMNode; - - procedure ReplaceEntityRef(EntityNode: TDOMNode; const Replacement: WideString); - var - PrevSibling, NextSibling: TDOMNode; - begin - PrevSibling := EntityNode.PreviousSibling; - NextSibling := EntityNode.NextSibling; - if Assigned(PrevSibling) and (PrevSibling.NodeType = TEXT_NODE) then - begin - TDOMCharacterData(PrevSibling).AppendData(Replacement); - RootNode.RemoveChild(EntityNode); - if Assigned(NextSibling) and (NextSibling.NodeType = TEXT_NODE) then - begin - // next sibling is to be removed, so we can't use it anymore - NextNode := NextSibling.NextSibling; - TDOMCharacterData(PrevSibling).AppendData( - TDOMCharacterData(NextSibling).Data); - RootNode.RemoveChild(NextSibling); - end - end else - if Assigned(NextSibling) and (NextSibling.NodeType = TEXT_NODE) then - begin - TDOMCharacterData(NextSibling).InsertData(0, Replacement); - RootNode.RemoveChild(EntityNode); - end else - RootNode.ReplaceChild(Doc.CreateTextNode(Replacement), EntityNode); - end; - -begin - Node := RootNode.FirstChild; - while Assigned(Node) do - begin - NextNode := Node.NextSibling; - if Node.NodeType = ENTITY_REFERENCE_NODE then - if Node.NodeName = 'amp' then - ReplaceEntityRef(Node, '&') - else if Node.NodeName = 'apos' then - ReplaceEntityRef(Node, '''') - else if Node.NodeName = 'gt' then - ReplaceEntityRef(Node, '>') - else if Node.NodeName = 'lt' then - ReplaceEntityRef(Node, '<') - else if Node.NodeName = 'quot' then - ReplaceEntityRef(Node, '"'); - Node := NextNode; - end; + // TODO: just a stub now + FInvalid := True; end; -procedure ReadXMLFile(out ADoc: TXMLDocument; var f: File); + +procedure ReadXMLFile(out ADoc: TXMLDocument; var f: Text); var - reader: TXMLReader; - buf: PChar; - BufSize: LongInt; + Reader: TXMLReader; + Src: TXMLInputSource; begin ADoc := nil; - BufSize := FileSize(f) + 2; // need double termination for the case of Unicode - if BufSize <= 2 then - exit; - - GetMem(buf, BufSize); + Src := TXMLFileInputSource.Create(f); + Src.SystemID := FilenameToURI(TTextRec(f).Name); + Reader := TXMLReader.Create; try - BlockRead(f, buf^, BufSize - 2); - buf[BufSize - 1] := #0; - buf[BufSize] := #0; - Reader := TXMLReader.Create; - try - Reader.ProcessXML(buf, TFileRec(f).name); - ADoc := TXMLDocument(Reader.doc); - finally - Reader.Free; - end; + Reader.ProcessXML(Src); + ADoc := TXMLDocument(Reader.Doc); finally - FreeMem(buf); + Reader.Free; end; end; -procedure ReadXMLFile(out ADoc: TXMLDocument; var f: TStream; const AFilename: String); +procedure ReadXMLFile(out ADoc: TXMLDocument; var f: TStream; const ABaseURI: String); var - reader: TXMLReader; - buf: PChar; - StreamSize: Int64; + Reader: TXMLReader; + Src: TXMLInputSource; begin ADoc := nil; - StreamSize := f.Size; // access to Size causes at least two seeks... - if StreamSize = 0 then exit; - - GetMem(buf, StreamSize + 2); + Reader := TXMLReader.Create; try - f.Read(buf^, StreamSize); - buf[StreamSize] := #0; - buf[StreamSize+1] := #0; - Reader := TXMLReader.Create; - try - Reader.ProcessXML(buf, AFilename); - finally - ADoc := TXMLDocument(Reader.doc); - Reader.Free; - end; + Src := TXMLStreamInputSource.Create(f, False); + Src.SystemID := ABaseURI; + Reader.ProcessXML(Src); finally - FreeMem(buf); + ADoc := TXMLDocument(Reader.doc); + Reader.Free; end; end; procedure ReadXMLFile(out ADoc: TXMLDocument; var f: TStream); begin - ReadXMLFile(ADoc, f, ''); + ReadXMLFile(ADoc, f, 'stream:'); end; procedure ReadXMLFile(out ADoc: TXMLDocument; const AFilename: String); @@ -1509,71 +2465,46 @@ var begin ADoc := nil; FileStream := TFileStream.Create(AFilename, fmOpenRead+fmShareDenyWrite); - if FileStream = nil then exit; //? it throws exception if cannot be created... try - ReadXMLFile(ADoc, FileStream, AFilename); + ReadXMLFile(ADoc, FileStream, FilenameToURI(AFilename)); finally FileStream.Free; end; end; -procedure ReadXMLFragment(AParentNode: TDOMNode; var f: File); +procedure ReadXMLFragment(AParentNode: TDOMNode; var f: Text); var Reader: TXMLReader; - buf: PChar; - BufSize: LongInt; + Src: TXMLInputSource; begin - BufSize := FileSize(f) + 2; - if BufSize <= 2 then - exit; - - GetMem(buf, BufSize); + Reader := TXMLReader.Create; try - BlockRead(f, buf^, BufSize - 2); - buf[BufSize - 1] := #0; - buf[BufSize] := #0; - Reader := TXMLReader.Create; - try - Reader.Doc := AParentNode.OwnerDocument; - Reader.ProcessFragment(AParentNode, buf, TFileRec(f).name); - finally - Reader.Free; - end; + Src := TXMLFileInputSource.Create(f); + Src.SystemID := FilenameToURI(TTextRec(f).Name); + Reader.ProcessFragment(Src, AParentNode); finally - FreeMem(buf); + Reader.Free; end; end; -procedure ReadXMLFragment(AParentNode: TDOMNode; var f: TStream; const AFilename: String); +procedure ReadXMLFragment(AParentNode: TDOMNode; var f: TStream; const ABaseURI: String); var Reader: TXMLReader; - buf: PChar; - StreamSize: Int64; + Src: TXMLInputSource; begin - StreamSize := f.Size; - if StreamSize = 0 then - exit; - - GetMem(buf, StreamSize + 2); + Reader := TXMLReader.Create; try - f.Read(buf^, StreamSize); - buf[StreamSize] := #0; - buf[StreamSize+1] := #0; - Reader := TXMLReader.Create; - Reader.Doc := AParentNode.OwnerDocument; - try - Reader.ProcessFragment(AParentNode, buf, AFilename); - finally - Reader.Free; - end; + Src := TXMLStreamInputSource.Create(f, False); + Src.SystemID := ABaseURI; + Reader.ProcessFragment(Src, AParentNode); finally - FreeMem(buf); + Reader.Free; end; end; procedure ReadXMLFragment(AParentNode: TDOMNode; var f: TStream); begin - ReadXMLFragment(AParentNode, f, ''); + ReadXMLFragment(AParentNode, f, 'stream:'); end; procedure ReadXMLFragment(AParentNode: TDOMNode; const AFilename: String); @@ -1582,68 +2513,50 @@ var begin Stream := TFileStream.Create(AFilename, fmOpenRead+fmShareDenyWrite); try - ReadXMLFragment(AParentNode, Stream, AFilename); + ReadXMLFragment(AParentNode, Stream, FilenameToURI(AFilename)); finally Stream.Free; end; end; -procedure ReadDTDFile(out ADoc: TXMLDocument; var f: File); +procedure ReadDTDFile(out ADoc: TXMLDocument; var f: Text); var Reader: TXMLReader; - buf: PChar; - BufSize: LongInt; + Src: TXMLInputSource; begin ADoc := nil; - BufSize := FileSize(f) + 1; - if BufSize <= 1 then - exit; - - GetMem(buf, BufSize); + Reader := TXMLReader.Create; try - BlockRead(f, buf^, BufSize - 1); - buf[BufSize - 1] := #0; - Reader := TXMLReader.Create; - try - Reader.ProcessDTD(buf, TFileRec(f).name); - ADoc := TXMLDocument(Reader.doc); - finally - Reader.Free; - end; + Src := TXMLFileInputSource.Create(f); + Src.SystemID := FilenameToURI(TTextRec(f).Name); + Reader.ProcessDTD(Src); + ADoc := TXMLDocument(Reader.doc); finally - FreeMem(buf); + Reader.Free; end; end; -procedure ReadDTDFile(out ADoc: TXMLDocument; var f: TStream; const AFilename: String); +procedure ReadDTDFile(out ADoc: TXMLDocument; var f: TStream; const ABaseURI: String); var Reader: TXMLReader; - buf: PChar; + Src: TXMLInputSource; begin ADoc := nil; - if f.Size = 0 then - exit; - - GetMem(buf, f.Size + 1); + Reader := TXMLReader.Create; try - f.Read(buf^, f.Size); - buf[f.Size] := #0; - Reader := TXMLReader.Create; - try - Reader.ProcessDTD(buf, AFilename); - ADoc := TXMLDocument(Reader.doc); - finally - Reader.Free; - end; + Src := TXMLStreamInputSource.Create(f, False); + Src.SystemID := ABaseURI; + Reader.ProcessDTD(Src); + ADoc := TXMLDocument(Reader.doc); finally - FreeMem(buf); + Reader.Free; end; end; procedure ReadDTDFile(out ADoc: TXMLDocument; var f: TStream); begin - ReadDTDFile(ADoc, f, ''); + ReadDTDFile(ADoc, f, 'stream:'); end; procedure ReadDTDFile(out ADoc: TXMLDocument; const AFilename: String); @@ -1653,7 +2566,7 @@ begin ADoc := nil; Stream := TFileStream.Create(AFilename, fmOpenRead+fmShareDenyWrite); try - ReadDTDFile(ADoc, Stream, AFilename); + ReadDTDFile(ADoc, Stream, FilenameToURI(AFilename)); finally Stream.Free; end;