mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-04-21 18:09:30 +02:00
htmldefs.pp:
+ Define elements which may omit end-tag (except HTML, HEAD and BODY which may also omit the start-tag) + Define which elements may close other elements (modelled after libxml2). * DIV may have #PCDATA content. sax_html.pp: * Improve the parser to report startElement/endElement events properly. Should resolve Mantis #14073 and related element hierarchy issues. git-svn-id: trunk@13357 -
This commit is contained in:
parent
8c7f6fb7ac
commit
fcd96805fa
@ -97,8 +97,9 @@ type
|
||||
efPCDATAContent, // may have PCDATA content
|
||||
efPreserveWhitespace, // preserve all whitespace
|
||||
efDeprecated, // can be dropped in future versions
|
||||
efNoChecks // Checks (attributes,subtags,...) can only be implemented in descendants
|
||||
);
|
||||
efNoChecks, // Checks (attributes,subtags,...) can only be implemented in descendants
|
||||
efEndTagOptional
|
||||
);
|
||||
THTMLElementFlags = set of THTMLElementFlag;
|
||||
|
||||
PHTMLElementProps = ^THTMLElementProps;
|
||||
@ -184,10 +185,10 @@ const
|
||||
(Name: 'col'; Flags: [];
|
||||
Attributes: atsattrs+atscellhalign+[atvalign,atspan,atwidth]),
|
||||
|
||||
(Name: 'colgroup'; Flags: [efSubelementContent];
|
||||
(Name: 'colgroup'; Flags: [efSubelementContent, efEndTagOptional];
|
||||
Attributes: atsattrs+atscellhalign+[atvalign,atspan,atwidth]),
|
||||
|
||||
(Name: 'dd'; Flags: efSubcontent; Attributes: atsattrs),
|
||||
(Name: 'dd'; Flags: efSubcontent+[efEndTagOptional]; Attributes: atsattrs),
|
||||
|
||||
(Name: 'del'; Flags: [efSubelementContent]; Attributes: atsattrs+[atcite,atdatetime]),
|
||||
|
||||
@ -195,11 +196,11 @@ const
|
||||
|
||||
(Name: 'dir'; Flags: [efSubelementContent,efDeprecated]; Attributes: atsattrs),
|
||||
|
||||
(Name: 'div'; Flags: [efSubelementContent]; Attributes: atsattrs),
|
||||
(Name: 'div'; Flags: efSubContent; Attributes: atsattrs),
|
||||
|
||||
(Name: 'dl'; Flags: [efSubelementContent]; Attributes: atsattrs),
|
||||
|
||||
(Name: 'dt'; Flags: [efPCDataContent]; Attributes: atsattrs),
|
||||
(Name: 'dt'; Flags: [efPCDataContent, efEndTagOptional]; Attributes: atsattrs),
|
||||
|
||||
(Name: 'em'; Flags: efSubcontent; Attributes: atsattrs),
|
||||
|
||||
@ -260,7 +261,7 @@ const
|
||||
|
||||
(Name: 'legend'; Flags: efSubcontent; Attributes: atsattrs+[ataccesskey]),
|
||||
|
||||
(Name: 'li'; Flags: efSubcontent; Attributes: atsattrs),
|
||||
(Name: 'li'; Flags: efSubcontent+[efEndTagOptional]; Attributes: atsattrs),
|
||||
|
||||
(Name: 'link'; Flags: [];
|
||||
Attributes: atsattrs+[atcharset,athref,athreflang,attype,atrel,atrev,atmedia]),
|
||||
@ -283,10 +284,10 @@ const
|
||||
|
||||
(Name: 'optgroup'; Flags: efSubcontent; Attributes: atsattrs+[atdisabled,atlabel]),
|
||||
|
||||
(Name: 'option'; Flags: efSubcontent;
|
||||
(Name: 'option'; Flags: efSubcontent+[efEndTagOptional];
|
||||
Attributes: atsattrs+[atselected,atdisabled,atlabel,atvalue]),
|
||||
|
||||
(Name: 'p'; Flags: efSubcontent; Attributes: atsattrs),
|
||||
(Name: 'p'; Flags: efSubcontent+[efEndTagOptional]; Attributes: atsattrs),
|
||||
|
||||
(Name: 'param'; Flags: []; Attributes: [atid,atname,atvalue,atvaluetype,attype]),
|
||||
|
||||
@ -324,23 +325,23 @@ const
|
||||
|
||||
(Name: 'tbody'; Flags: [efSubelementContent]; Attributes: atsattrs+atscellhalign+[atvalign]),
|
||||
|
||||
(Name: 'td'; Flags: efSubcontent;
|
||||
(Name: 'td'; Flags: efSubcontent+[efEndTagOptional];
|
||||
Attributes: atsattrs+atscellhalign+[atvalign,atabbr,ataxis,atheaders,atscope,atrowspan,atcolspan]),
|
||||
|
||||
(Name: 'textarea'; Flags: [efPCDATAContent];
|
||||
Attributes: atsattrs+[atname,atrows,atcols,atdisabled,atreadonly,attabindex,
|
||||
ataccesskey,atonfocus,atonblur,atonselect,atonchange]),
|
||||
|
||||
(Name: 'tfoot'; Flags: [efSubelementContent]; Attributes: atsattrs+atscellhalign+[atvalign]),
|
||||
(Name: 'tfoot'; Flags: [efSubelementContent,efEndTagOptional]; Attributes: atsattrs+atscellhalign+[atvalign]),
|
||||
|
||||
(Name: 'th'; Flags: efSubcontent;
|
||||
(Name: 'th'; Flags: efSubcontent+[efEndTagOptional];
|
||||
Attributes: atsattrs+atscellhalign+[atvalign,atabbr,ataxis,atheaders,atscope,atrowspan,atcolspan]),
|
||||
|
||||
(Name: 'thead'; Flags: [efSubelementContent]; Attributes: atsattrs+atscellhalign+[atvalign]),
|
||||
(Name: 'thead'; Flags: [efSubelementContent, efEndTagOptional]; Attributes: atsattrs+atscellhalign+[atvalign]),
|
||||
|
||||
(Name: 'title'; Flags: efSubcontent; Attributes: atsi18n),
|
||||
|
||||
(Name: 'tr'; Flags: [efSubelementContent];
|
||||
(Name: 'tr'; Flags: [efSubelementContent, efEndTagOptional];
|
||||
Attributes: atsattrs+atscellhalign+[atvalign]),
|
||||
|
||||
(Name: 'tt'; Flags: efSubcontent; Attributes: atsattrs),
|
||||
@ -559,12 +560,81 @@ const
|
||||
function ResolveHTMLEntityReference(const Name: String;
|
||||
var Entity: WideChar): Boolean;
|
||||
|
||||
function IsAutoClose(NewTag, OldTag: THTMLElementTag): Boolean;
|
||||
|
||||
|
||||
implementation
|
||||
|
||||
uses SysUtils;
|
||||
|
||||
{ Define which elements auto-close other elements, modelled after libxml2.
|
||||
This is an array of variable-length lists, each terminated by etUnknown.
|
||||
Indices to first element of each list are provided by AutoCloseIndex array,
|
||||
which *must* be updated after any change. }
|
||||
const
|
||||
AutoCloseTab: array[0..277] of THTMLElementTag = (
|
||||
|
||||
etform, etform, etp, ethr, eth1, eth2, eth3, eth4, eth5, eth6,
|
||||
etdl, etul, etol, etmenu, etdir, etaddress, etpre,
|
||||
ethead, etUnknown,
|
||||
ethead, etp, etUnknown,
|
||||
ettitle, etp, etUnknown,
|
||||
etbody, ethead, etstyle, etlink, ettitle, etp, etUnknown,
|
||||
etframeset, ethead, etstyle, etlink, ettitle, etp, etUnknown,
|
||||
etli, etp, eth1, eth2, eth3, eth4, eth5, eth6, etdl, etaddress,
|
||||
etpre, ethead, etli, etUnknown,
|
||||
ethr, etp, ethead, etUnknown,
|
||||
eth1, etp, ethead, etUnknown,
|
||||
eth2, etp, ethead, etUnknown,
|
||||
eth3, etp, ethead, etUnknown,
|
||||
eth4, etp, ethead, etUnknown,
|
||||
eth5, etp, ethead, etUnknown,
|
||||
eth6, etp, ethead, etUnknown,
|
||||
etdir, etp, ethead, etUnknown,
|
||||
etaddress, etp, ethead, etul, etUnknown,
|
||||
etpre, etp, ethead, etul, etUnknown,
|
||||
etblockquote, etp, ethead, etUnknown,
|
||||
etdl, etp, etdt, etmenu, etdir, etaddress, etpre,
|
||||
ethead, etUnknown,
|
||||
etdt, etp, etmenu, etdir, etaddress, etpre,
|
||||
ethead, etdd, etUnknown,
|
||||
etdd, etp, etmenu, etdir, etaddress, etpre,
|
||||
ethead, etdt, etUnknown,
|
||||
etul, etp, ethead, etol, etmenu, etdir, etaddress, etpre, etUnknown,
|
||||
etol, etp, ethead, etul, etUnknown,
|
||||
etmenu, etp, ethead, etul, etUnknown,
|
||||
etp, etp, ethead, eth1, eth2, eth3, eth4, eth5, eth6, etUnknown,
|
||||
etdiv, etp, ethead, etUnknown,
|
||||
etnoscript, etp, ethead, etUnknown,
|
||||
etcenter, etfont, etb, eti, etp, ethead, etUnknown,
|
||||
eta, eta, etUnknown,
|
||||
etcaption, etp, etUnknown,
|
||||
etcolgroup, etcaption, etcolgroup, etcol, etp, etUnknown,
|
||||
etcol, etcaption, etcol, etp, etUnknown,
|
||||
ettable, etp, ethead, eth1, eth2, eth3, eth4, eth5, eth6, etpre,
|
||||
eta, etUnknown,
|
||||
etth, etth, ettd, etp, etspan, etfont, eta, etb, eti, etu, etUnknown,
|
||||
ettd, etth, ettd, etp, etspan, etfont, eta, etb, eti, etu, etUnknown,
|
||||
ettr, etth, ettd, ettr, etcaption, etcol, etcolgroup, etp, etUnknown,
|
||||
etthead, etcaption, etcol, etcolgroup, etUnknown,
|
||||
ettfoot, etth, ettd, ettr, etcaption, etcol, etcolgroup, etthead,
|
||||
ettbody, etp, etUnknown,
|
||||
ettbody, etth, ettd, ettr, etcaption, etcol, etcolgroup, etthead,
|
||||
ettfoot, ettbody, etp, etUnknown,
|
||||
etoptgroup, etoption, etUnknown,
|
||||
etoption, etoption, etUnknown,
|
||||
etfieldset, etlegend, etp, ethead, eth1, eth2, eth3, eth4, eth5, eth6,
|
||||
etpre, eta, etUnknown,
|
||||
etUnknown);
|
||||
|
||||
AutoCloseIndex: array[0..40] of Integer = (
|
||||
0, 19, 22, 25, 32, 39, 53, 57, 61, 65, 69,
|
||||
73, 77, 81, 85, 90, 95, 99, 108, 117, 126,
|
||||
135, 140, 145, 155, 159, 163, 170, 173, 176,
|
||||
182, 187, 199, 210, 221, 230, 235, 246, 258,
|
||||
261, 264
|
||||
);
|
||||
|
||||
function ResolveHTMLEntityReference(const Name: String;
|
||||
var Entity: WideChar): Boolean;
|
||||
var
|
||||
@ -639,4 +709,26 @@ begin
|
||||
end;
|
||||
end;
|
||||
|
||||
function IsAutoClose(NewTag, OldTag: THTMLElementTag): Boolean;
|
||||
var
|
||||
i, j: Integer;
|
||||
begin
|
||||
Result := False;
|
||||
for i := 0 to high(AutoCloseIndex) do
|
||||
if NewTag = AutoCloseTab[AutoCloseIndex[i]] then
|
||||
begin
|
||||
j := AutoCloseIndex[i]+1;
|
||||
while AutoCloseTab[j] <> etUnknown do
|
||||
begin
|
||||
if AutoCloseTab[j] = OldTag then
|
||||
begin
|
||||
Result := True;
|
||||
Exit;
|
||||
end;
|
||||
Inc(j);
|
||||
end;
|
||||
Exit;
|
||||
end;
|
||||
end;
|
||||
|
||||
end.
|
||||
|
@ -52,6 +52,11 @@ type
|
||||
FTokenText: SAXString;
|
||||
FCurStringValueDelimiter: Char;
|
||||
FAttrNameRead: Boolean;
|
||||
FStack: array of THTMLElementTag;
|
||||
FNesting: Integer;
|
||||
procedure AutoClose(const aName: string);
|
||||
procedure NamePush(const aName: string);
|
||||
procedure NamePop;
|
||||
protected
|
||||
procedure EnterNewScannerContext(NewContext: THTMLScannerContext);
|
||||
public
|
||||
@ -122,6 +127,7 @@ constructor THTMLReader.Create;
|
||||
begin
|
||||
inherited Create;
|
||||
FScannerContext := scUnknown;
|
||||
SetLength(FStack, 16);
|
||||
end;
|
||||
|
||||
destructor THTMLReader.Destroy;
|
||||
@ -265,89 +271,135 @@ begin
|
||||
end;
|
||||
end;
|
||||
|
||||
procedure THTMLReader.EnterNewScannerContext(NewContext: THTMLScannerContext);
|
||||
|
||||
function SplitTagString(const s: String; var Attr: TSAXAttributes): String;
|
||||
var
|
||||
i, j: Integer;
|
||||
AttrName: String;
|
||||
ValueDelimiter: Char;
|
||||
DoIncJ: Boolean;
|
||||
begin
|
||||
Attr := nil;
|
||||
i := Pos(' ', s);
|
||||
if i <= 0 then
|
||||
Result := LowerCase(s)
|
||||
else
|
||||
function LookupTag(const aName: string): THTMLElementTag;
|
||||
var
|
||||
j: THTMLElementTag;
|
||||
begin
|
||||
for j := Low(THTMLElementTag) to High(THTMLElementTag) do
|
||||
if SameText(HTMLElementProps[j].Name, aName) then
|
||||
begin
|
||||
Result := LowerCase(Copy(s, 1, i - 1));
|
||||
Attr := TSAXAttributes.Create;
|
||||
Result := j;
|
||||
Exit;
|
||||
end;
|
||||
Result := etUnknown;
|
||||
end;
|
||||
|
||||
procedure THTMLReader.AutoClose(const aName: string);
|
||||
var
|
||||
newTag: THTMLElementTag;
|
||||
begin
|
||||
newTag := LookupTag(aName);
|
||||
while (FNesting > 0) and IsAutoClose(newTag, FStack[FNesting-1]) do
|
||||
begin
|
||||
DoEndElement('', HTMLElementProps[FStack[FNesting-1]].Name, '');
|
||||
namePop;
|
||||
end;
|
||||
end;
|
||||
|
||||
procedure THTMLReader.NamePush(const aName: string);
|
||||
var
|
||||
tag: THTMLElementTag;
|
||||
begin
|
||||
tag := LookupTag(aName);
|
||||
if FNesting >= Length(FStack) then
|
||||
SetLength(FStack, FNesting * 2);
|
||||
FStack[FNesting] := tag;
|
||||
Inc(FNesting);
|
||||
end;
|
||||
|
||||
procedure THTMLReader.NamePop;
|
||||
begin
|
||||
if FNesting <= 0 then
|
||||
Exit;
|
||||
Dec(FNesting);
|
||||
FStack[FNesting] := etUnknown;
|
||||
end;
|
||||
|
||||
function SplitTagString(const s: String; var Attr: TSAXAttributes): String;
|
||||
var
|
||||
i, j: Integer;
|
||||
AttrName: String;
|
||||
ValueDelimiter: Char;
|
||||
DoIncJ: Boolean;
|
||||
begin
|
||||
Attr := nil;
|
||||
i := Pos(' ', s);
|
||||
if i <= 0 then
|
||||
Result := LowerCase(s)
|
||||
else
|
||||
begin
|
||||
Result := LowerCase(Copy(s, 1, i - 1));
|
||||
Attr := TSAXAttributes.Create;
|
||||
Inc(i);
|
||||
|
||||
while (i <= Length(s)) and (s[i] in WhitespaceChars) do
|
||||
Inc(i);
|
||||
|
||||
while (i <= Length(s)) and (s[i] in WhitespaceChars) do
|
||||
Inc(i);
|
||||
SetLength(AttrName, 0);
|
||||
j := i;
|
||||
|
||||
SetLength(AttrName, 0);
|
||||
j := i;
|
||||
|
||||
while j <= Length(s) do
|
||||
if s[j] = '=' then
|
||||
while j <= Length(s) do
|
||||
if s[j] = '=' then
|
||||
begin
|
||||
AttrName := LowerCase(Copy(s, i, j - i));
|
||||
Inc(j);
|
||||
if (j < Length(s)) and ((s[j] = '''') or (s[j] = '"')) then
|
||||
begin
|
||||
AttrName := LowerCase(Copy(s, i, j - i));
|
||||
ValueDelimiter := s[j];
|
||||
Inc(j);
|
||||
if (j < Length(s)) and ((s[j] = '''') or (s[j] = '"')) then
|
||||
begin
|
||||
ValueDelimiter := s[j];
|
||||
Inc(j);
|
||||
end else
|
||||
ValueDelimiter := #0;
|
||||
i := j;
|
||||
DoIncJ := False;
|
||||
while j <= Length(s) do
|
||||
if ValueDelimiter = #0 then
|
||||
if s[j] in WhitespaceChars then
|
||||
break
|
||||
else
|
||||
Inc(j)
|
||||
else if s[j] = ValueDelimiter then
|
||||
begin
|
||||
DoIncJ := True;
|
||||
break
|
||||
end else
|
||||
Inc(j);
|
||||
|
||||
Attr.AddAttribute('', AttrName, '', '', Copy(s, i, j - i));
|
||||
|
||||
if DoIncJ then
|
||||
Inc(j);
|
||||
|
||||
while (j <= Length(s)) and (s[j] in WhitespaceChars) do
|
||||
Inc(j);
|
||||
i := j;
|
||||
end
|
||||
else if s[j] in WhitespaceChars then
|
||||
begin
|
||||
Attr.AddAttribute('', Copy(s, i, j - i), '', '', '');
|
||||
Inc(j);
|
||||
while (j <= Length(s)) and (s[j] in WhitespaceChars) do
|
||||
Inc(j);
|
||||
i := j;
|
||||
end else
|
||||
Inc(j);
|
||||
end;
|
||||
end;
|
||||
ValueDelimiter := #0;
|
||||
i := j;
|
||||
DoIncJ := False;
|
||||
while j <= Length(s) do
|
||||
if ValueDelimiter = #0 then
|
||||
if s[j] in WhitespaceChars then
|
||||
break
|
||||
else
|
||||
Inc(j)
|
||||
else if s[j] = ValueDelimiter then
|
||||
begin
|
||||
DoIncJ := True;
|
||||
break
|
||||
end else
|
||||
Inc(j);
|
||||
|
||||
Attr.AddAttribute('', AttrName, '', '', Copy(s, i, j - i));
|
||||
|
||||
if DoIncJ then
|
||||
Inc(j);
|
||||
|
||||
while (j <= Length(s)) and (s[j] in WhitespaceChars) do
|
||||
Inc(j);
|
||||
i := j;
|
||||
end
|
||||
else if s[j] in WhitespaceChars then
|
||||
begin
|
||||
Attr.AddAttribute('', Copy(s, i, j - i), '', '', '');
|
||||
Inc(j);
|
||||
while (j <= Length(s)) and (s[j] in WhitespaceChars) do
|
||||
Inc(j);
|
||||
i := j;
|
||||
end else
|
||||
Inc(j);
|
||||
end;
|
||||
end;
|
||||
|
||||
procedure THTMLReader.EnterNewScannerContext(NewContext: THTMLScannerContext);
|
||||
var
|
||||
Attr: TSAXAttributes;
|
||||
TagName: String;
|
||||
Found: Boolean;
|
||||
Ent: SAXChar;
|
||||
i: Integer;
|
||||
elTag: THTMLElementTag;
|
||||
begin
|
||||
case ScannerContext of
|
||||
scWhitespace:
|
||||
DoIgnorableWhitespace(PSAXChar(TokenText), 1, Length(TokenText));
|
||||
if (FNesting > 0) and (efPCDataContent in HTMLElementProps[FStack[FNesting-1]].Flags) then
|
||||
DoCharacters(PSAXChar(TokenText), 0, Length(TokenText))
|
||||
else
|
||||
DoIgnorableWhitespace(PSAXChar(TokenText), 0, Length(TokenText));
|
||||
scText:
|
||||
DoCharacters(PSAXChar(TokenText), 0, Length(TokenText));
|
||||
scEntityReference:
|
||||
@ -382,18 +434,35 @@ begin
|
||||
setlength(fTokenText,length(fTokenText)-1);
|
||||
// Do NOT combine to a single line, as Attr is an output value!
|
||||
TagName := SplitTagString(TokenText, Attr);
|
||||
AutoClose(TagName);
|
||||
DoStartElement('', TagName, '', Attr);
|
||||
DoEndElement('', TagName, '');
|
||||
end
|
||||
else if TokenText[1] = '/' then
|
||||
begin
|
||||
DoEndElement('',
|
||||
SplitTagString(Copy(TokenText, 2, Length(TokenText)), Attr), '');
|
||||
Delete(FTokenText, 1, 1);
|
||||
TagName := SplitTagString(TokenText, Attr);
|
||||
elTag := LookupTag(TagName);
|
||||
i := FNesting-1;
|
||||
while (i >= 0) and (FStack[i] <> elTag) and
|
||||
(efEndTagOptional in HTMLElementProps[FStack[i]].Flags) do
|
||||
Dec(i);
|
||||
if (i>=0) and (FStack[i] = elTag) then
|
||||
while FStack[FNesting-1] <> elTag do
|
||||
begin
|
||||
DoEndElement('', HTMLElementProps[FStack[FNesting-1]].Name, '');
|
||||
namePop;
|
||||
end;
|
||||
|
||||
DoEndElement('', TagName, '');
|
||||
namePop;
|
||||
end
|
||||
else if TokenText[1] <> '!' then
|
||||
begin
|
||||
// Do NOT combine to a single line, as Attr is an output value!
|
||||
TagName := SplitTagString(TokenText, Attr);
|
||||
AutoClose(TagName);
|
||||
namePush(TagName);
|
||||
DoStartElement('', TagName, '', Attr);
|
||||
end;
|
||||
if Assigned(Attr) then
|
||||
@ -427,16 +496,7 @@ end;
|
||||
constructor THTMLToDOMConverter.CreateFragment(AReader: THTMLReader;
|
||||
AFragmentRoot: TDOMNode);
|
||||
begin
|
||||
inherited Create;
|
||||
FReader := AReader;
|
||||
FReader.OnCharacters := @ReaderCharacters;
|
||||
FReader.OnIgnorableWhitespace := @ReaderIgnorableWhitespace;
|
||||
FReader.OnSkippedEntity := @ReaderSkippedEntity;
|
||||
FReader.OnStartElement := @ReaderStartElement;
|
||||
FReader.OnEndElement := @ReaderEndElement;
|
||||
FDocument := AFragmentRoot.OwnerDocument;
|
||||
FElementStack := TList.Create;
|
||||
FNodeBuffer := TList.Create;
|
||||
Create(AReader, AFragmentRoot.OwnerDocument);
|
||||
FragmentRoot := AFragmentRoot;
|
||||
IsFragmentMode := True;
|
||||
end;
|
||||
|
Loading…
Reference in New Issue
Block a user