htmldefs.pp:

+ Define elements which may omit end-tag (except HTML, HEAD and BODY which may also omit the start-tag)
+ Define which elements may close other elements (modelled after libxml2).
* DIV may have #PCDATA content.

sax_html.pp:
* Improve the parser to report startElement/endElement events properly. Should resolve Mantis #14073 and related element hierarchy issues.

git-svn-id: trunk@13357 -
This commit is contained in:
sergei 2009-07-02 23:13:44 +00:00
parent 8c7f6fb7ac
commit fcd96805fa
2 changed files with 243 additions and 91 deletions

View File

@ -97,8 +97,9 @@ type
efPCDATAContent, // may have PCDATA content
efPreserveWhitespace, // preserve all whitespace
efDeprecated, // can be dropped in future versions
efNoChecks // Checks (attributes,subtags,...) can only be implemented in descendants
);
efNoChecks, // Checks (attributes,subtags,...) can only be implemented in descendants
efEndTagOptional
);
THTMLElementFlags = set of THTMLElementFlag;
PHTMLElementProps = ^THTMLElementProps;
@ -184,10 +185,10 @@ const
(Name: 'col'; Flags: [];
Attributes: atsattrs+atscellhalign+[atvalign,atspan,atwidth]),
(Name: 'colgroup'; Flags: [efSubelementContent];
(Name: 'colgroup'; Flags: [efSubelementContent, efEndTagOptional];
Attributes: atsattrs+atscellhalign+[atvalign,atspan,atwidth]),
(Name: 'dd'; Flags: efSubcontent; Attributes: atsattrs),
(Name: 'dd'; Flags: efSubcontent+[efEndTagOptional]; Attributes: atsattrs),
(Name: 'del'; Flags: [efSubelementContent]; Attributes: atsattrs+[atcite,atdatetime]),
@ -195,11 +196,11 @@ const
(Name: 'dir'; Flags: [efSubelementContent,efDeprecated]; Attributes: atsattrs),
(Name: 'div'; Flags: [efSubelementContent]; Attributes: atsattrs),
(Name: 'div'; Flags: efSubContent; Attributes: atsattrs),
(Name: 'dl'; Flags: [efSubelementContent]; Attributes: atsattrs),
(Name: 'dt'; Flags: [efPCDataContent]; Attributes: atsattrs),
(Name: 'dt'; Flags: [efPCDataContent, efEndTagOptional]; Attributes: atsattrs),
(Name: 'em'; Flags: efSubcontent; Attributes: atsattrs),
@ -260,7 +261,7 @@ const
(Name: 'legend'; Flags: efSubcontent; Attributes: atsattrs+[ataccesskey]),
(Name: 'li'; Flags: efSubcontent; Attributes: atsattrs),
(Name: 'li'; Flags: efSubcontent+[efEndTagOptional]; Attributes: atsattrs),
(Name: 'link'; Flags: [];
Attributes: atsattrs+[atcharset,athref,athreflang,attype,atrel,atrev,atmedia]),
@ -283,10 +284,10 @@ const
(Name: 'optgroup'; Flags: efSubcontent; Attributes: atsattrs+[atdisabled,atlabel]),
(Name: 'option'; Flags: efSubcontent;
(Name: 'option'; Flags: efSubcontent+[efEndTagOptional];
Attributes: atsattrs+[atselected,atdisabled,atlabel,atvalue]),
(Name: 'p'; Flags: efSubcontent; Attributes: atsattrs),
(Name: 'p'; Flags: efSubcontent+[efEndTagOptional]; Attributes: atsattrs),
(Name: 'param'; Flags: []; Attributes: [atid,atname,atvalue,atvaluetype,attype]),
@ -324,23 +325,23 @@ const
(Name: 'tbody'; Flags: [efSubelementContent]; Attributes: atsattrs+atscellhalign+[atvalign]),
(Name: 'td'; Flags: efSubcontent;
(Name: 'td'; Flags: efSubcontent+[efEndTagOptional];
Attributes: atsattrs+atscellhalign+[atvalign,atabbr,ataxis,atheaders,atscope,atrowspan,atcolspan]),
(Name: 'textarea'; Flags: [efPCDATAContent];
Attributes: atsattrs+[atname,atrows,atcols,atdisabled,atreadonly,attabindex,
ataccesskey,atonfocus,atonblur,atonselect,atonchange]),
(Name: 'tfoot'; Flags: [efSubelementContent]; Attributes: atsattrs+atscellhalign+[atvalign]),
(Name: 'tfoot'; Flags: [efSubelementContent,efEndTagOptional]; Attributes: atsattrs+atscellhalign+[atvalign]),
(Name: 'th'; Flags: efSubcontent;
(Name: 'th'; Flags: efSubcontent+[efEndTagOptional];
Attributes: atsattrs+atscellhalign+[atvalign,atabbr,ataxis,atheaders,atscope,atrowspan,atcolspan]),
(Name: 'thead'; Flags: [efSubelementContent]; Attributes: atsattrs+atscellhalign+[atvalign]),
(Name: 'thead'; Flags: [efSubelementContent, efEndTagOptional]; Attributes: atsattrs+atscellhalign+[atvalign]),
(Name: 'title'; Flags: efSubcontent; Attributes: atsi18n),
(Name: 'tr'; Flags: [efSubelementContent];
(Name: 'tr'; Flags: [efSubelementContent, efEndTagOptional];
Attributes: atsattrs+atscellhalign+[atvalign]),
(Name: 'tt'; Flags: efSubcontent; Attributes: atsattrs),
@ -559,12 +560,81 @@ const
function ResolveHTMLEntityReference(const Name: String;
var Entity: WideChar): Boolean;
function IsAutoClose(NewTag, OldTag: THTMLElementTag): Boolean;
implementation
uses SysUtils;
{ Define which elements auto-close other elements, modelled after libxml2.
This is an array of variable-length lists, each terminated by etUnknown.
Indices to first element of each list are provided by AutoCloseIndex array,
which *must* be updated after any change. }
const
AutoCloseTab: array[0..277] of THTMLElementTag = (
etform, etform, etp, ethr, eth1, eth2, eth3, eth4, eth5, eth6,
etdl, etul, etol, etmenu, etdir, etaddress, etpre,
ethead, etUnknown,
ethead, etp, etUnknown,
ettitle, etp, etUnknown,
etbody, ethead, etstyle, etlink, ettitle, etp, etUnknown,
etframeset, ethead, etstyle, etlink, ettitle, etp, etUnknown,
etli, etp, eth1, eth2, eth3, eth4, eth5, eth6, etdl, etaddress,
etpre, ethead, etli, etUnknown,
ethr, etp, ethead, etUnknown,
eth1, etp, ethead, etUnknown,
eth2, etp, ethead, etUnknown,
eth3, etp, ethead, etUnknown,
eth4, etp, ethead, etUnknown,
eth5, etp, ethead, etUnknown,
eth6, etp, ethead, etUnknown,
etdir, etp, ethead, etUnknown,
etaddress, etp, ethead, etul, etUnknown,
etpre, etp, ethead, etul, etUnknown,
etblockquote, etp, ethead, etUnknown,
etdl, etp, etdt, etmenu, etdir, etaddress, etpre,
ethead, etUnknown,
etdt, etp, etmenu, etdir, etaddress, etpre,
ethead, etdd, etUnknown,
etdd, etp, etmenu, etdir, etaddress, etpre,
ethead, etdt, etUnknown,
etul, etp, ethead, etol, etmenu, etdir, etaddress, etpre, etUnknown,
etol, etp, ethead, etul, etUnknown,
etmenu, etp, ethead, etul, etUnknown,
etp, etp, ethead, eth1, eth2, eth3, eth4, eth5, eth6, etUnknown,
etdiv, etp, ethead, etUnknown,
etnoscript, etp, ethead, etUnknown,
etcenter, etfont, etb, eti, etp, ethead, etUnknown,
eta, eta, etUnknown,
etcaption, etp, etUnknown,
etcolgroup, etcaption, etcolgroup, etcol, etp, etUnknown,
etcol, etcaption, etcol, etp, etUnknown,
ettable, etp, ethead, eth1, eth2, eth3, eth4, eth5, eth6, etpre,
eta, etUnknown,
etth, etth, ettd, etp, etspan, etfont, eta, etb, eti, etu, etUnknown,
ettd, etth, ettd, etp, etspan, etfont, eta, etb, eti, etu, etUnknown,
ettr, etth, ettd, ettr, etcaption, etcol, etcolgroup, etp, etUnknown,
etthead, etcaption, etcol, etcolgroup, etUnknown,
ettfoot, etth, ettd, ettr, etcaption, etcol, etcolgroup, etthead,
ettbody, etp, etUnknown,
ettbody, etth, ettd, ettr, etcaption, etcol, etcolgroup, etthead,
ettfoot, ettbody, etp, etUnknown,
etoptgroup, etoption, etUnknown,
etoption, etoption, etUnknown,
etfieldset, etlegend, etp, ethead, eth1, eth2, eth3, eth4, eth5, eth6,
etpre, eta, etUnknown,
etUnknown);
AutoCloseIndex: array[0..40] of Integer = (
0, 19, 22, 25, 32, 39, 53, 57, 61, 65, 69,
73, 77, 81, 85, 90, 95, 99, 108, 117, 126,
135, 140, 145, 155, 159, 163, 170, 173, 176,
182, 187, 199, 210, 221, 230, 235, 246, 258,
261, 264
);
function ResolveHTMLEntityReference(const Name: String;
var Entity: WideChar): Boolean;
var
@ -639,4 +709,26 @@ begin
end;
end;
function IsAutoClose(NewTag, OldTag: THTMLElementTag): Boolean;
var
i, j: Integer;
begin
Result := False;
for i := 0 to high(AutoCloseIndex) do
if NewTag = AutoCloseTab[AutoCloseIndex[i]] then
begin
j := AutoCloseIndex[i]+1;
while AutoCloseTab[j] <> etUnknown do
begin
if AutoCloseTab[j] = OldTag then
begin
Result := True;
Exit;
end;
Inc(j);
end;
Exit;
end;
end;
end.

View File

@ -52,6 +52,11 @@ type
FTokenText: SAXString;
FCurStringValueDelimiter: Char;
FAttrNameRead: Boolean;
FStack: array of THTMLElementTag;
FNesting: Integer;
procedure AutoClose(const aName: string);
procedure NamePush(const aName: string);
procedure NamePop;
protected
procedure EnterNewScannerContext(NewContext: THTMLScannerContext);
public
@ -122,6 +127,7 @@ constructor THTMLReader.Create;
begin
inherited Create;
FScannerContext := scUnknown;
SetLength(FStack, 16);
end;
destructor THTMLReader.Destroy;
@ -265,89 +271,135 @@ begin
end;
end;
procedure THTMLReader.EnterNewScannerContext(NewContext: THTMLScannerContext);
function SplitTagString(const s: String; var Attr: TSAXAttributes): String;
var
i, j: Integer;
AttrName: String;
ValueDelimiter: Char;
DoIncJ: Boolean;
begin
Attr := nil;
i := Pos(' ', s);
if i <= 0 then
Result := LowerCase(s)
else
function LookupTag(const aName: string): THTMLElementTag;
var
j: THTMLElementTag;
begin
for j := Low(THTMLElementTag) to High(THTMLElementTag) do
if SameText(HTMLElementProps[j].Name, aName) then
begin
Result := LowerCase(Copy(s, 1, i - 1));
Attr := TSAXAttributes.Create;
Result := j;
Exit;
end;
Result := etUnknown;
end;
procedure THTMLReader.AutoClose(const aName: string);
var
newTag: THTMLElementTag;
begin
newTag := LookupTag(aName);
while (FNesting > 0) and IsAutoClose(newTag, FStack[FNesting-1]) do
begin
DoEndElement('', HTMLElementProps[FStack[FNesting-1]].Name, '');
namePop;
end;
end;
procedure THTMLReader.NamePush(const aName: string);
var
tag: THTMLElementTag;
begin
tag := LookupTag(aName);
if FNesting >= Length(FStack) then
SetLength(FStack, FNesting * 2);
FStack[FNesting] := tag;
Inc(FNesting);
end;
procedure THTMLReader.NamePop;
begin
if FNesting <= 0 then
Exit;
Dec(FNesting);
FStack[FNesting] := etUnknown;
end;
function SplitTagString(const s: String; var Attr: TSAXAttributes): String;
var
i, j: Integer;
AttrName: String;
ValueDelimiter: Char;
DoIncJ: Boolean;
begin
Attr := nil;
i := Pos(' ', s);
if i <= 0 then
Result := LowerCase(s)
else
begin
Result := LowerCase(Copy(s, 1, i - 1));
Attr := TSAXAttributes.Create;
Inc(i);
while (i <= Length(s)) and (s[i] in WhitespaceChars) do
Inc(i);
while (i <= Length(s)) and (s[i] in WhitespaceChars) do
Inc(i);
SetLength(AttrName, 0);
j := i;
SetLength(AttrName, 0);
j := i;
while j <= Length(s) do
if s[j] = '=' then
while j <= Length(s) do
if s[j] = '=' then
begin
AttrName := LowerCase(Copy(s, i, j - i));
Inc(j);
if (j < Length(s)) and ((s[j] = '''') or (s[j] = '"')) then
begin
AttrName := LowerCase(Copy(s, i, j - i));
ValueDelimiter := s[j];
Inc(j);
if (j < Length(s)) and ((s[j] = '''') or (s[j] = '"')) then
begin
ValueDelimiter := s[j];
Inc(j);
end else
ValueDelimiter := #0;
i := j;
DoIncJ := False;
while j <= Length(s) do
if ValueDelimiter = #0 then
if s[j] in WhitespaceChars then
break
else
Inc(j)
else if s[j] = ValueDelimiter then
begin
DoIncJ := True;
break
end else
Inc(j);
Attr.AddAttribute('', AttrName, '', '', Copy(s, i, j - i));
if DoIncJ then
Inc(j);
while (j <= Length(s)) and (s[j] in WhitespaceChars) do
Inc(j);
i := j;
end
else if s[j] in WhitespaceChars then
begin
Attr.AddAttribute('', Copy(s, i, j - i), '', '', '');
Inc(j);
while (j <= Length(s)) and (s[j] in WhitespaceChars) do
Inc(j);
i := j;
end else
Inc(j);
end;
end;
ValueDelimiter := #0;
i := j;
DoIncJ := False;
while j <= Length(s) do
if ValueDelimiter = #0 then
if s[j] in WhitespaceChars then
break
else
Inc(j)
else if s[j] = ValueDelimiter then
begin
DoIncJ := True;
break
end else
Inc(j);
Attr.AddAttribute('', AttrName, '', '', Copy(s, i, j - i));
if DoIncJ then
Inc(j);
while (j <= Length(s)) and (s[j] in WhitespaceChars) do
Inc(j);
i := j;
end
else if s[j] in WhitespaceChars then
begin
Attr.AddAttribute('', Copy(s, i, j - i), '', '', '');
Inc(j);
while (j <= Length(s)) and (s[j] in WhitespaceChars) do
Inc(j);
i := j;
end else
Inc(j);
end;
end;
procedure THTMLReader.EnterNewScannerContext(NewContext: THTMLScannerContext);
var
Attr: TSAXAttributes;
TagName: String;
Found: Boolean;
Ent: SAXChar;
i: Integer;
elTag: THTMLElementTag;
begin
case ScannerContext of
scWhitespace:
DoIgnorableWhitespace(PSAXChar(TokenText), 1, Length(TokenText));
if (FNesting > 0) and (efPCDataContent in HTMLElementProps[FStack[FNesting-1]].Flags) then
DoCharacters(PSAXChar(TokenText), 0, Length(TokenText))
else
DoIgnorableWhitespace(PSAXChar(TokenText), 0, Length(TokenText));
scText:
DoCharacters(PSAXChar(TokenText), 0, Length(TokenText));
scEntityReference:
@ -382,18 +434,35 @@ begin
setlength(fTokenText,length(fTokenText)-1);
// Do NOT combine to a single line, as Attr is an output value!
TagName := SplitTagString(TokenText, Attr);
AutoClose(TagName);
DoStartElement('', TagName, '', Attr);
DoEndElement('', TagName, '');
end
else if TokenText[1] = '/' then
begin
DoEndElement('',
SplitTagString(Copy(TokenText, 2, Length(TokenText)), Attr), '');
Delete(FTokenText, 1, 1);
TagName := SplitTagString(TokenText, Attr);
elTag := LookupTag(TagName);
i := FNesting-1;
while (i >= 0) and (FStack[i] <> elTag) and
(efEndTagOptional in HTMLElementProps[FStack[i]].Flags) do
Dec(i);
if (i>=0) and (FStack[i] = elTag) then
while FStack[FNesting-1] <> elTag do
begin
DoEndElement('', HTMLElementProps[FStack[FNesting-1]].Name, '');
namePop;
end;
DoEndElement('', TagName, '');
namePop;
end
else if TokenText[1] <> '!' then
begin
// Do NOT combine to a single line, as Attr is an output value!
TagName := SplitTagString(TokenText, Attr);
AutoClose(TagName);
namePush(TagName);
DoStartElement('', TagName, '', Attr);
end;
if Assigned(Attr) then
@ -427,16 +496,7 @@ end;
constructor THTMLToDOMConverter.CreateFragment(AReader: THTMLReader;
AFragmentRoot: TDOMNode);
begin
inherited Create;
FReader := AReader;
FReader.OnCharacters := @ReaderCharacters;
FReader.OnIgnorableWhitespace := @ReaderIgnorableWhitespace;
FReader.OnSkippedEntity := @ReaderSkippedEntity;
FReader.OnStartElement := @ReaderStartElement;
FReader.OnEndElement := @ReaderEndElement;
FDocument := AFragmentRoot.OwnerDocument;
FElementStack := TList.Create;
FNodeBuffer := TList.Create;
Create(AReader, AFragmentRoot.OwnerDocument);
FragmentRoot := AFragmentRoot;
IsFragmentMode := True;
end;