* xmlread.pp, doing progress with streaming API:

+ state transitions needed to report start/end element events correctly
  + procedures for maintaining attribute data
  * excluded FCursor from attribute value parsing

git-svn-id: trunk@16161 -
This commit is contained in:
sergei 2010-10-14 19:41:22 +00:00
parent 89df75c731
commit d3bdf2577c

View File

@ -164,6 +164,17 @@ type
LinePos: Integer; LinePos: Integer;
end; end;
TXMLNodeType = (ntNone, ntElement, ntAttribute, ntText,
ntCDATA, ntEntityReference, ntEntity, ntProcessingInstruction,
ntComment, ntDocument, ntDocumentType, ntDocumentFragment,
ntNotation,
ntWhitespace,
ntSignificantWhitespace,
ntEndElement,
ntEndEntity,
ntXmlDeclaration
);
TDOMEntityEx = class(TDOMEntity) TDOMEntityEx = class(TDOMEntity)
protected protected
FExternallyDeclared: Boolean; FExternallyDeclared: Boolean;
@ -287,9 +298,14 @@ type
PNodeData = ^TNodeData; PNodeData = ^TNodeData;
TNodeData = object TNodeData = object
// generic members // generic members
FNext: PNodeData;
FQName: PHashItem; FQName: PHashItem;
FNodeType: TXMLNodeType;
FDOMNode: TObject; // temporary FDOMNode: TObject; // temporary
FValueStr: WideString;
FValueStart: PWideChar;
FValueLength: Integer;
// validation-specific members // validation-specific members
FElement: TDOMElement; FElement: TDOMElement;
@ -300,6 +316,8 @@ type
function Incomplete: Boolean; function Incomplete: Boolean;
end; end;
TNodeDataDynArray = array of TNodeData;
TXMLReadState = (rsProlog, rsDTD, rsRoot, rsEpilog); TXMLReadState = (rsProlog, rsDTD, rsRoot, rsEpilog);
TElementContentType = ( TElementContentType = (
@ -312,9 +330,10 @@ type
TCheckNameFlags = set of (cnOptional, cnToken); TCheckNameFlags = set of (cnOptional, cnToken);
TXMLToken = (xtNone, xtEOF, xtText, xtWhitespace, xtElement, xtEndElement, xtCDSect, xtComment, xtPI, xtDoctype, xtEntity, xtEntityEnd); TXMLToken = (xtNone, xtEOF, xtText, xtWhitespace, xtElement, xtEndElement,
xtCDSect, xtComment, xtPI, xtDoctype, xtEntity, xtEntityEnd, xtPopElement,
xtPopEmptyElement, xtPushElement);
TPrefixedAttr = record TPrefixedAttr = record
Attr: TDOMAttr; Attr: TDOMAttr;
PrefixLen: Integer; // to avoid recalculation PrefixLen: Integer; // to avoid recalculation
@ -388,12 +407,19 @@ type
function SkipUntilSeq(const Delim: TSetOfChar; c1: WideChar; c2: WideChar = #0): Boolean; function SkipUntilSeq(const Delim: TSetOfChar; c1: WideChar; c2: WideChar = #0): Boolean;
procedure CheckMaxChars; procedure CheckMaxChars;
function AllocNodeData(AIndex: Integer): PNodeData; function AllocNodeData(AIndex: Integer): PNodeData;
function AllocAttributeData(AName: PHashItem): PNodeData;
function AllocAttributeValueChunk(APrev: PNodeData): PNodeData;
procedure CleanupAttributeData;
procedure SetNodeInfoWithValue(typ: TXMLNodeType);
protected protected
FCursor: TDOMNode_WithChildren; FCursor: TDOMNode_WithChildren;
FNesting: Integer; FNesting: Integer;
FCurrNode: PNodeData; FCurrNode: PNodeData;
FAttrCount: Integer; FAttrCount: Integer;
FNodeStack: array of TNodeData; FNodeStack: TNodeDataDynArray;
FAttrChunks: TFPList;
FFreeAttrChunk: PNodeData;
FAttrCleanupFlag: Boolean;
procedure DoError(Severity: TErrorSeverity; const descr: string; LineOffs: Integer=0); procedure DoError(Severity: TErrorSeverity; const descr: string; LineOffs: Integer=0);
procedure DoErrorPos(Severity: TErrorSeverity; const descr: string; procedure DoErrorPos(Severity: TErrorSeverity; const descr: string;
@ -414,7 +440,7 @@ type
function ExpectName: WideString; // [5] function ExpectName: WideString; // [5]
function ParseLiteral(var ToFill: TWideCharBuf; aType: TLiteralType; function ParseLiteral(var ToFill: TWideCharBuf; aType: TLiteralType;
Required: Boolean; Normalized: PBoolean = nil): Boolean; Required: Boolean; Normalized: PBoolean = nil): Boolean;
procedure ExpectAttValue; // [10] procedure ExpectAttValue(attr: TDOMAttr); // [10]
procedure ParseComment; // [15] procedure ParseComment; // [15]
procedure ParsePI; // [16] procedure ParsePI; // [16]
procedure ParseXmlOrTextDecl(TextDecl: Boolean); procedure ParseXmlOrTextDecl(TextDecl: Boolean);
@ -429,7 +455,7 @@ type
function Read: Boolean; function Read: Boolean;
function ResolvePredefined: Boolean; function ResolvePredefined: Boolean;
function EntityCheck(NoExternals: Boolean = False): TDOMEntityEx; function EntityCheck(NoExternals: Boolean = False): TDOMEntityEx;
procedure AppendReference(AEntity: TDOMEntityEx); procedure AppendReference(cur: TDOMNode; AEntity: TDOMEntityEx);
function PrefetchEntity(AEntity: TDOMEntityEx): Boolean; function PrefetchEntity(AEntity: TDOMEntityEx): Boolean;
procedure StartPE; procedure StartPE;
function ParseRef(var ToFill: TWideCharBuf): Boolean; // [67] function ParseRef(var ToFill: TWideCharBuf): Boolean; // [67]
@ -453,7 +479,7 @@ type
procedure ValidateDTD; procedure ValidateDTD;
procedure ValidateRoot; procedure ValidateRoot;
procedure ValidationError(const Msg: string; const args: array of const; LineOffs: Integer = -1); procedure ValidationError(const Msg: string; const args: array of const; LineOffs: Integer = -1);
procedure DoAttrText(ch: PWideChar; Count: Integer); procedure DoAttrText(node: TDOMAttr; ch: PWideChar; Count: Integer);
procedure DTDReloadHook; procedure DTDReloadHook;
procedure ConvertSource(SrcIn: TXMLInputSource; out SrcOut: TXMLCharSource); procedure ConvertSource(SrcIn: TXMLInputSource; out SrcOut: TXMLCharSource);
// Some SAX-alike stuff (at a very early stage) // Some SAX-alike stuff (at a very early stage)
@ -1278,6 +1304,7 @@ begin
FNotationRefs := TFPList.Create; FNotationRefs := TFPList.Create;
FNSHelper := TNSSupport.Create; FNSHelper := TNSSupport.Create;
FAttrChunks := TFPList.Create;
FNsAttHash := TDblHashArray.Create; FNsAttHash := TDblHashArray.Create;
SetLength(FWorkAtts, 16); SetLength(FWorkAtts, 16);
@ -1307,7 +1334,11 @@ begin
end; end;
destructor TXMLReader.Destroy; destructor TXMLReader.Destroy;
var
i: Integer;
begin begin
for i := FAttrChunks.Count-1 downto 0 do
Dispose(PNodeData(FAttrChunks.List^[i]));
if Assigned(FEntityValue.Buffer) then if Assigned(FEntityValue.Buffer) then
FreeMem(FEntityValue.Buffer); FreeMem(FEntityValue.Buffer);
FreeMem(FName.Buffer); FreeMem(FName.Buffer);
@ -1325,6 +1356,7 @@ begin
FNotationRefs.Free; FNotationRefs.Free;
FIDRefs.Free; FIDRefs.Free;
FAttrChunks.Free;
inherited Destroy; inherited Destroy;
end; end;
@ -1361,6 +1393,7 @@ begin
FState := rsRoot; FState := rsRoot;
FNesting := 0; FNesting := 0;
FCurrNode := @FNodeStack[0]; FCurrNode := @FNodeStack[0];
FCurrNode^.FDOMNode := FCursor;
FXML11 := doc.InheritsFrom(TXMLDocument) and (TXMLDocument(doc).XMLVersion = '1.1'); FXML11 := doc.InheritsFrom(TXMLDocument) and (TXMLDocument(doc).XMLVersion = '1.1');
Initialize(ASource); Initialize(ASource);
FDocType := TDOMDocumentTypeEx(doc.DocType); FDocType := TDOMDocumentTypeEx(doc.DocType);
@ -1550,11 +1583,16 @@ begin
ExpectChar(';'); ExpectChar(';');
end; end;
procedure TXMLReader.DoAttrText(node: TDOMAttr; ch: PWideChar; Count: Integer);
begin
node.InternalAppend(Doc.CreateTextNodeBuf(ch, Count, False));
end;
const const
AttrDelims: TSetOfChar = [#0, '<', '&', '''', '"', #9, #10, #13]; AttrDelims: TSetOfChar = [#0, '<', '&', '''', '"', #9, #10, #13];
GT_Delim: TSetOfChar = [#0, '>']; GT_Delim: TSetOfChar = [#0, '>'];
procedure TXMLReader.ExpectAttValue; procedure TXMLReader.ExpectAttValue(attr: TDOMAttr);
var var
wc: WideChar; wc: WideChar;
Delim: WideChar; Delim: WideChar;
@ -1578,10 +1616,10 @@ begin
begin begin
if FValue.Length > 0 then if FValue.Length > 0 then
begin begin
DoAttrText(FValue.Buffer, FValue.Length); DoAttrText(attr, FValue.Buffer, FValue.Length);
FValue.Length := 0; FValue.Length := 0;
end; end;
AppendReference(ent); AppendReference(attr, ent);
end end
else else
ContextPush(ent); ContextPush(ent);
@ -1599,7 +1637,7 @@ begin
FatalError('Literal has no closing quote', -1); FatalError('Literal has no closing quote', -1);
until False; until False;
if FValue.Length > 0 then if FValue.Length > 0 then
DoAttrText(FValue.Buffer, FValue.Length); DoAttrText(attr, FValue.Buffer, FValue.Length);
FValue.Length := 0; FValue.Length := 0;
end; end;
@ -1706,12 +1744,14 @@ begin
// To build children of the entity itself, we must parse it "out of context" // To build children of the entity itself, we must parse it "out of context"
InnerReader := TXMLReader.Create(FCtrl); InnerReader := TXMLReader.Create(FCtrl);
try try
InnerReader.FAttrTag := FAttrTag;
EntityToSource(Result, Src); EntityToSource(Result, Src);
Result.SetReadOnly(False); Result.SetReadOnly(False);
if Assigned(Src) then if Assigned(Src) then
InnerReader.ProcessFragment(Src, Result); InnerReader.ProcessFragment(Src, Result);
Result.FResolved := True; Result.FResolved := True;
finally finally
FAttrTag := InnerReader.FAttrTag;
InnerReader.Free; InnerReader.Free;
Result.FOnStack := False; Result.FOnStack := False;
Result.SetReadOnly(True); Result.SetReadOnly(True);
@ -2464,12 +2504,10 @@ begin
if AttDef.DataType = dtId then if AttDef.DataType = dtId then
ValidationError('An attribute of type ID cannot have a default value',[]); ValidationError('An attribute of type ID cannot have a default value',[]);
FCursor := AttDef;
// See comments to valid-sa-094: PE expansion should be disabled in AttDef. // See comments to valid-sa-094: PE expansion should be disabled in AttDef.
// ExpectAttValue() does not recognize PEs anyway, so setting FRecognizePEs isn't needed // ExpectAttValue() does not recognize PEs anyway, so setting FRecognizePEs isn't needed
// Saving/restoring FCursor is also redundant because it is always nil here. // Saving/restoring FCursor is also redundant because it is always nil here.
ExpectAttValue; ExpectAttValue(AttDef);
FCursor := nil;
if not ValidateAttrSyntax(AttDef, AttDef.NodeValue) then if not ValidateAttrSyntax(AttDef, AttDef.NodeValue) then
ValidationError('Default value for attribute ''%s'' has wrong syntax', [AttDef.Name]); ValidationError('Default value for attribute ''%s'' has wrong syntax', [AttDef.Name]);
end; end;
@ -2675,7 +2713,7 @@ begin
ParseMarkupDecl; ParseMarkupDecl;
end; end;
procedure TXMLReader.AppendReference(AEntity: TDOMEntityEx); procedure TXMLReader.AppendReference(cur: TDOMNode; AEntity: TDOMEntityEx);
var var
s: WideString; s: WideString;
begin begin
@ -2683,7 +2721,7 @@ begin
SetString(s, FName.Buffer, FName.Length) SetString(s, FName.Buffer, FName.Length)
else else
s := AEntity.nodeName; s := AEntity.nodeName;
FCursor.AppendChild(doc.CreateEntityReference(s)); cur.AppendChild(doc.CreateEntityReference(s));
end; end;
@ -2735,6 +2773,11 @@ const
[#0, '>'] [#0, '>']
); );
textNodeTypes: array[Boolean] of TXMLNodeType = (
ntText,
ntWhitespace
);
procedure TXMLReader.ParseContent; procedure TXMLReader.ParseContent;
begin begin
FNext := xtText; FNext := xtText;
@ -2745,6 +2788,8 @@ begin
DoText(FValue.Buffer, FValue.Length, FToken = xtWhitespace); DoText(FValue.Buffer, FValue.Length, FToken = xtWhitespace);
xtCDSect: xtCDSect:
DoCDSect(FValue.Buffer, FValue.Length); DoCDSect(FValue.Buffer, FValue.Length);
xtEndElement:
DoEndElement(-1);
end; end;
end; end;
end; end;
@ -2756,6 +2801,32 @@ var
InCDATA: Boolean; InCDATA: Boolean;
tok: TXMLToken; tok: TXMLToken;
begin begin
if FNext = xtPopEmptyElement then
begin
FNext := xtPopElement;
FToken := xtEndElement;
FCurrNode^.FNodeType := ntEndElement;
if FAttrCleanupFlag then
CleanupAttributeData;
FAttrCount := 0;
Result := True;
Exit;
end;
if FNext = xtPushElement then
begin
if FAttrCleanupFlag then
CleanupAttributeData;
FAttrCount := 0;
FNext := xtText;
end
else if FNext = xtPopElement then
begin
if FNamespaces then
FNSHelper.EndElement;
PopVC;
FNext := xtText;
end;
InCDATA := (FNext = xtCDSect); InCDATA := (FNext = xtCDSect);
StoreLocation(FTokenStart); StoreLocation(FTokenStart);
nonWs := False; nonWs := False;
@ -2827,6 +2898,7 @@ begin
InCDATA := False; InCDATA := False;
if FCDSectionsAsText then if FCDSectionsAsText then
Continue; Continue;
SetNodeInfoWithValue(ntCDATA);
FToken := xtCDSect; FToken := xtCDSect;
FNext := xtText; FNext := xtText;
Result := True; Result := True;
@ -2861,6 +2933,7 @@ begin
end; end;
if FValue.Length <> 0 then if FValue.Length <> 0 then
begin begin
SetNodeInfoWithValue(textNodeTypes[nonWs]);
if nonWs then FToken := xtText else FToken := xtWhitespace; if nonWs then FToken := xtText else FToken := xtWhitespace;
FNext := tok; FNext := tok;
Result := True; Result := True;
@ -2871,16 +2944,22 @@ begin
else // not (FNext in [xtText, xtCDSect]) else // not (FNext in [xtText, xtCDSect])
tok := FNext; tok := FNext;
FToken := tok;
FNext := xtText;
case tok of case tok of
xtEntity: AppendReference(FCurrEntity); xtEntity: AppendReference(FCursor, FCurrEntity);
xtElement: ParseStartTag; xtElement: ParseStartTag;
xtEndElement: ParseEndTag; xtEndElement:
begin
ParseEndTag;
FCurrNode^.FNodeType := ntEndElement;
FNext := xtPopElement;
end;
xtPI: ParsePI; xtPI: ParsePI;
xtDoctype: ParseDoctypeDecl; xtDoctype: ParseDoctypeDecl;
xtComment: ParseComment; xtComment: ParseComment;
end; end;
FToken := tok;
FNext := xtText;
Result := tok <> xtEOF; Result := tok <> xtEOF;
end; end;
@ -2920,6 +2999,8 @@ begin
FCursor.AppendChild(NewElem); FCursor.AppendChild(NewElem);
// we're about to process a new set of attributes // we're about to process a new set of attributes
Inc(FAttrTag); Inc(FAttrTag);
// can point to a child text/comment/PI node, so restore it
FCurrNode := @FNodeStack[FNesting];
// Remember the hash entry, we'll need it often // Remember the hash entry, we'll need it often
ElName := NewElem.NSI.QName; ElName := NewElem.NSI.QName;
@ -2937,6 +3018,7 @@ begin
FAttrCount := 0; FAttrCount := 0;
PushVC(NewElem, ElDef); // this increases FNesting PushVC(NewElem, ElDef); // this increases FNesting
FCurrNode^.FQName := ElName; FCurrNode^.FQName := ElName;
FCurrNode^.FNodeType := ntElement;
while (FSource.FBuf^ <> '>') and (FSource.FBuf^ <> '/') do while (FSource.FBuf^ <> '>') and (FSource.FBuf^ <> '/') do
begin begin
@ -2945,6 +3027,8 @@ begin
Break; Break;
ParseAttribute(NewElem, ElDef); ParseAttribute(NewElem, ElDef);
end; end;
// ParseAttribute might have reallocated FNodeStack, so restore FCurrNode once again
FCurrNode := @FNodeStack[FNesting];
if FSource.FBuf^ = '/' then if FSource.FBuf^ = '/' then
begin begin
@ -2964,9 +3048,10 @@ begin
FCursor := NewElem; FCursor := NewElem;
if not FPreserveWhitespace then // critical for testsuite compliance if not FPreserveWhitespace then // critical for testsuite compliance
SkipS; SkipS;
FNext := xtPushElement;
end end
else else
DoEndElement(0); FNext := xtPopEmptyElement;
end; end;
procedure TXMLReader.DoEndElement(ErrOffset: Integer); procedure TXMLReader.DoEndElement(ErrOffset: Integer);
@ -2981,9 +3066,6 @@ begin
if FValidate and FCurrNode^.Incomplete then if FValidate and FCurrNode^.Incomplete then
ValidationError('Element ''%s'' is missing required sub-elements', [NewElem.NSI.QName^.Key], ErrOffset); ValidationError('Element ''%s'' is missing required sub-elements', [NewElem.NSI.QName^.Key], ErrOffset);
if FNamespaces then
FNSHelper.EndElement;
PopVC;
end; end;
procedure TXMLReader.ParseEndTag; // [42] procedure TXMLReader.ParseEndTag; // [42]
@ -2994,24 +3076,20 @@ begin
FatalError('End-tag is not allowed here'); FatalError('End-tag is not allowed here');
Inc(FSource.FBuf); Inc(FSource.FBuf);
ElName := FCurrNode^.FElement.NSI.QName; FCurrNode := @FNodeStack[FNesting]; // move off the possible child
ElName := FCurrNode^.FQName;
CheckName; CheckName;
if not BufEquals(FName, ElName^.Key) then if not BufEquals(FName, ElName^.Key) then
FatalError('Unmatching element end tag (expected "</%s>")', [ElName^.Key], FName.Length); FatalError('Unmatching element end tag (expected "</%s>")', [ElName^.Key], FName.Length);
if FSource.FBuf^ = '>' then // this handles majority of cases if FSource.FBuf^ = '>' then // this handles majority of cases
FSource.NextChar
else
begin begin
FSource.NextChar;
DoEndElement(FName.Length+1);
end
else // but if closing '>' is preceded by whitespace,
begin // skipping it is likely to lose position info.
StoreLocation(FTokenStart);
Dec(FTokenStart.LinePos, FName.Length);
SkipS; SkipS;
ExpectChar('>'); ExpectChar('>');
DoEndElement(-1);
end; end;
Inc(FTokenStart.LinePos, 2); // move over '</' chars
end; end;
procedure TXMLReader.ParseAttribute(Elem: TDOMElement; ElDef: TDOMElementDef); procedure TXMLReader.ParseAttribute(Elem: TDOMElement; ElDef: TDOMElementDef);
@ -3068,9 +3146,9 @@ begin
OldAttr.Free; OldAttr.Free;
FatalError('Duplicate attribute', FName.Length); FatalError('Duplicate attribute', FName.Length);
end; end;
ExpectEq; ExpectEq;
FCursor := attr; ExpectAttValue(attr);
ExpectAttValue;
if Assigned(AttDef) and ((AttDef.DataType <> dtCdata) or (AttDef.Default = adFixed)) then if Assigned(AttDef) and ((AttDef.DataType <> dtCdata) or (AttDef.Default = adFixed)) then
CheckValue; CheckValue;
@ -3384,11 +3462,6 @@ begin
FCursor.AppendChild(TextNode); FCursor.AppendChild(TextNode);
end; end;
procedure TXMLReader.DoAttrText(ch: PWideChar; Count: Integer);
begin
FCursor.AppendChild(Doc.CreateTextNodeBuf(ch, Count, False));
end;
procedure TXMLReader.DoComment(ch: PWideChar; Count: Integer); procedure TXMLReader.DoComment(ch: PWideChar; Count: Integer);
var var
Node: TDOMComment; Node: TDOMComment;
@ -3434,14 +3507,70 @@ begin
ValidationError('Duplicate notation declaration: ''%s''', [aName]); ValidationError('Duplicate notation declaration: ''%s''', [aName]);
end; end;
function TXMLReader.AllocAttributeData(AName: PHashItem): PNodeData;
begin
Result := AllocNodeData(FNesting + FAttrCount + 1);
Result^.FNodeType := ntAttribute;
Result^.FQName := AName;
Inc(FAttrCount);
end;
function TXMLReader.AllocNodeData(AIndex: Integer): PNodeData; function TXMLReader.AllocNodeData(AIndex: Integer): PNodeData;
begin begin
if AIndex >= Length(FNodeStack) then {make sure we have an extra slot to place child text/comment/etc}
SetLength(FNodeStack, AIndex * 2); if AIndex >= Length(FNodeStack)-1 then
SetLength(FNodeStack, AIndex * 2 + 2);
Result := @FNodeStack[AIndex]; Result := @FNodeStack[AIndex];
end; end;
function TXMLReader.AllocAttributeValueChunk(APrev: PNodeData): PNodeData;
begin
result := FFreeAttrChunk;
if Assigned(result) then
begin
FFreeAttrChunk := result^.FNext;
result^.FNext := nil;
end
else { no free chunks, create a new one }
begin
New(result);
FillChar(result^, sizeof(TNodeData), 0);
FAttrChunks.Add(result);
end;
APrev^.FNext := result;
end;
procedure TXMLReader.CleanupAttributeData;
var
i: Integer;
chunk, tmp: PNodeData;
begin
for i := 1 to FAttrCount do
begin
chunk := FNodeStack[FNesting+i].FNext;
while Assigned(chunk) do
begin
tmp := chunk^.FNext;
chunk^.FNext := FFreeAttrChunk;
FFreeAttrChunk := chunk;
chunk := tmp;
end;
FNodeStack[FNesting+i].FNext := nil;
end;
FAttrCleanupFlag := False;
end;
procedure TXMLReader.SetNodeInfoWithValue(typ: TXMLNodeType);
begin
{FNesting+1 is available due to overallocation in AllocNodeData() }
FCurrNode := @FNodeStack[FNesting+1];
FCurrNode^.FNodeType := typ;
FCurrNode^.FQName := nil;
FCurrNode^.FValueStart := FValue.Buffer;
FCurrNode^.FValueLength := FValue.Length;
end;
procedure TXMLReader.PushVC(aElement: TDOMElement; aElDef: TDOMElementDef); procedure TXMLReader.PushVC(aElement: TDOMElement; aElDef: TDOMElementDef);
begin begin
Inc(FNesting); Inc(FNesting);