From b4b83a7edb70aa0be88118f2d486d384e6740cdb Mon Sep 17 00:00:00 2001 From: sergei Date: Thu, 15 Jul 2010 04:54:14 +0000 Subject: [PATCH] * HTML reader, attempt recovery from misplaced '<' characters (another part of Mantis #16916). git-svn-id: trunk@15574 - --- packages/fcl-xml/src/sax_html.pp | 46 +++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 6 deletions(-) diff --git a/packages/fcl-xml/src/sax_html.pp b/packages/fcl-xml/src/sax_html.pp index 4345b289e2..e87a792c66 100644 --- a/packages/fcl-xml/src/sax_html.pp +++ b/packages/fcl-xml/src/sax_html.pp @@ -136,6 +136,22 @@ begin inherited Destroy; end; +function CheckForName(const Tag: SAXString): Boolean; { Heuristic called from the recovery branch added to THTMLReader.Parse below. Returns False when Tag starts with an exclamation mark; otherwise skips one leading slash, takes the characters up to the first #0, slash or XML whitespace, and returns what IsXMLName reports for that span. } +var + p, p1: PSAXChar; +begin + p := PSAXChar(Tag); { NOTE(review): p is dereferenced immediately, so this relies on the cast yielding a readable, #0-terminated buffer even when Tag is empty - confirm } + result := False; + if p^ <> '!' then + begin + if p^ = '/' then Inc(p); { presumably the slash of a closing tag; the name starts after it } + p1 := p; + while (p1^ <> #0) and (p1^ <> '/') and not IsXMLWhitespace(p1^) do + Inc(p1); { p1 ends up just past the candidate name } + result := IsXMLName(p, p1-p); { p1-p is the length of the candidate name } + end; +end; + procedure THTMLReader.Parse(AInput: TSAXInputSource); const MaxBufferSize = 1024; @@ -163,6 +179,7 @@ begin BufferPos := 0; while (BufferPos < BufferSize) and not FStopFlag do + begin case ScannerContext of scUnknown: case Buffer[BufferPos] of @@ -261,13 +278,30 @@ begin if FCurStringValueDelimiter = #0 then EnterNewScannerContext(scUnknown); end; - else - begin - FTokenText := FTokenText + Buffer[BufferPos]; - Inc(BufferPos); - end; + '<': // either an unclosed tag or unescaped '<' in text; attempt recovery + begin + // TODO: this check is incomplete; it should probably also verify that the + // tag name is followed by legal attributes.
+ if CheckForName(FTokenText) then + EnterNewScannerContext(scUnknown) { BufferPos is not advanced in this branch or the next one, so the loop examines the same character again in the new context } // assume unclosed tag + else if (FTokenText <> '') and (FTokenText[1] <> '!') then + begin + Insert('<', FTokenText, 1); // assume plaintext + FScannerContext := scText; { NOTE(review): presumably relabels the pending token as text so that EnterNewScannerContext emits it as character data - confirm } + EnterNewScannerContext(scUnknown); + end + else + begin // in comment, ignore + FTokenText := FTokenText + Buffer[BufferPos]; + Inc(BufferPos); + end; + end; + else + FTokenText := FTokenText + Buffer[BufferPos]; + Inc(BufferPos); end; - end; + end; // case ScannerContext of + end; // while not endOfBuffer end; end;