* HTML reader, attempt recovery from misplaced '<' characters (another part of Mantis #16916).

git-svn-id: trunk@15574 -
This commit is contained in:
sergei 2010-07-15 04:54:14 +00:00
parent 28041f3e3f
commit b4b83a7edb

View File

@ -136,6 +136,22 @@ begin
inherited Destroy;
end;
{ Reports whether Tag begins with something that looks like an element name.

  An optional leading '/' is skipped first. The candidate name then extends
  up to (but not including) the first '/', whitespace character, or the
  terminating #0, and is handed to IsXMLName for validation.
  Text whose first character is '!' is never accepted: the function
  returns False without calling IsXMLName. }
function CheckForName(const Tag: SAXString): Boolean;
var
  NameStart, NameEnd: PSAXChar;
begin
  Result := False;
  NameStart := PSAXChar(Tag);

  // A leading '!' is rejected outright.
  if NameStart^ = '!' then
    Exit;

  // Step over the slash that may precede the name.
  if NameStart^ = '/' then
    Inc(NameStart);

  // Advance NameEnd to the first character that cannot belong to the name.
  NameEnd := NameStart;
  while not ((NameEnd^ = #0) or (NameEnd^ = '/') or IsXMLWhitespace(NameEnd^)) do
    Inc(NameEnd);

  // Pointer difference gives the candidate name length in characters.
  Result := IsXMLName(NameStart, NameEnd - NameStart);
end;
procedure THTMLReader.Parse(AInput: TSAXInputSource);
const
MaxBufferSize = 1024;
@ -163,6 +179,7 @@ begin
BufferPos := 0;
while (BufferPos < BufferSize) and not FStopFlag do
begin
case ScannerContext of
scUnknown:
case Buffer[BufferPos] of
@ -261,13 +278,30 @@ begin
if FCurStringValueDelimiter = #0 then
EnterNewScannerContext(scUnknown);
end;
else
begin
FTokenText := FTokenText + Buffer[BufferPos];
Inc(BufferPos);
end;
'<': // either an unclosed tag or unescaped '<' in text; attempt recovery
begin
// TODO: this check is hardly complete, probably must also check if
// tag name is followed by legal attributes.
if CheckForName(FTokenText) then
EnterNewScannerContext(scUnknown) // assume unclosed tag
else if (FTokenText <> '') and (FTokenText[1] <> '!') then
begin
Insert('<', FTokenText, 1); // assume plaintext
FScannerContext := scText;
EnterNewScannerContext(scUnknown);
end
else
begin // in comment, ignore
FTokenText := FTokenText + Buffer[BufferPos];
Inc(BufferPos);
end;
end;
else
FTokenText := FTokenText + Buffer[BufferPos];
Inc(BufferPos);
end;
end;
end; // case ScannerContext of
end; // while not endOfBuffer
end;
end;