* The second part of changes suggested by Maris Janis Vasilevskis (Mantis #22434):

* Parse comments and script/style elements using dedicated scanner states.
* Ignore markup in comments.
* Ignore unescaped '&' in script/style elements.
* Additionally, report comments via the OnComment event.
* Reverted r17003, since it is entirely replaced by the above changes.

git-svn-id: trunk@33250 -
2025-08-29 13:21:35 +02:00 · 2016-03-13 23:14:31 +00:00 · 2016-03-13 23:14:31 +00:00 · 5cc8ddfbf1
commit 5cc8ddfbf1
parent f01edc0a57
1 changed files with 111 additions and 60 deletions
--- a/packages/fcl-xml/src/sax_html.pp
+++ b/packages/fcl-xml/src/sax_html.pp
@ -42,7 +42,10 @@ type
    scWhitespace,       // within whitespace
    scText,             // within text
    scEntityReference,  // within entity reference ("&...;")
-    scTag);             // within a start tag or end tag
+    scTag,              // within a start tag or end tag
+    scComment,
+    scScript
+  );

  THTMLReader = class(TSAXReader)
  private
@ -51,6 +54,8 @@ type
    FScannerContext: THTMLScannerContext;
    FTokenText: SAXString;
    FRawTokenText: string;
+    FScriptEndTag: string;
+    FScriptEndMatchPos: Integer;
    FCurStringValueDelimiter: Char;
    FAttrNameRead: Boolean;
    FStack: array of THTMLElementTag;
@ -155,6 +160,8 @@ const
 var
  Buffer: array[0..MaxBufferSize - 1] of Char;
  BufferSize, BufferPos: Integer;
+  len: Integer;
+  ch: Char;
 begin
  if not FStarted then
  begin
@ -295,8 +302,62 @@ begin
              end;
          else
            FRawTokenText := FRawTokenText + Buffer[BufferPos];
+            if FRawTokenText='!--' then
+            begin
+              FScannerContext := scComment;
+              FRawTokenText := '';
+            end;
            Inc(BufferPos);
          end;
+        scComment:
+          begin
+            FRawTokenText := FRawTokenText + Buffer[BufferPos];
+            Inc(BufferPos);
+
+            if (Buffer[BufferPos-1]='>') then
+            begin
+              len:=length(FRawTokenText);
+              if (len>2) and (FRawTokenText[len-1]='-') and (FRawTokenText[len-2]='-') then
+              begin
+                Delete(FRawTokenText, Length(FRawTokenText)-2, MaxInt);
+                EnterNewScannerContext(scUnknown);
+              end;
+            end;
+          end;
+        scScript:
+          begin
+            ch := Buffer[BufferPos];
+            if FScriptEndMatchPos <= Length(FScriptEndTag) then
+            begin
+              if lowercase(ch) = FScriptEndTag[FScriptEndMatchPos] then
+                Inc(FScriptEndMatchPos)
+              else
+                FScriptEndMatchPos := 1;
+              FRawTokenText := FRawTokenText + ch;
+              Inc(BufferPos);
+            end
+            else
+            begin
+              case ch of
+                #9,#10,#13,' ':
+                  begin
+                    FRawTokenText := FRawTokenText + ch;
+                    Inc(BufferPos);
+                    Inc(FScriptEndMatchPos);
+                  end;
+                '>':
+                  begin
+                    Inc(BufferPos);
+                    Delete(FRawTokenText, Length(FRawTokenText)-FScriptEndMatchPos+2, MaxInt);
+                    EnterNewScannerContext(scUnknown);
+                  end;
+              else
+                FRawTokenText := FRawTokenText + ch;
+                Inc(BufferPos);
+                FScriptEndMatchPos := 1;
+              end;
+            end;
+          end;
        end;    // case ScannerContext of
    end;        // while not endOfBuffer
  end;
@ -429,18 +490,6 @@ begin
  until false;
 end;

-function RightTrimmedLength(const s: SAXString): Integer;
-begin
-  result := Length(s);
-  while IsXmlWhitespace(s[result]) do Dec(result);
-end;
-
-function TagPos(elTag: THTMLElementTag; s: SAXString): Integer;
-begin
-  WStrLower(s);
-  Result := Pos(HTMLElementProps[elTag].Name, s);
-end;
-
 procedure THTMLReader.EnterNewScannerContext(NewContext: THTMLScannerContext);
 var
  Attr: TSAXAttributes;
@ -468,60 +517,62 @@ begin
    scTag:
      if Length(TokenText) > 0 then
      begin
-        { ignore possibly unescaped markup in SCRIPT and STYLE }
-        if (FNesting > 0) and (FStack[FNesting-1] in [etScript,etStyle]) and
-          not (
-           (TokenText[1] = '/') and
-           (RightTrimmedLength(TokenText)=Length(HTMLElementProps[FStack[FNesting-1]].Name)+1) and
-           (TagPos(FStack[FNesting-1], TokenText) = 2)
-          )
-          and (TokenText[1] <> '!') then
+        Attr := nil;
+        if TokenText[Length(fTokenText)]='/' then  // handle xml/xhtml style empty tag
        begin
-          FTokenText := '<'+FTokenText+'>';
-          DoCharacters(PSAXChar(TokenText), 0, Length(TokenText));
+          setlength(fTokenText,length(fTokenText)-1);
+          // Do NOT combine to a single line, as Attr is an output value!
+          TagName := SplitTagString(TokenText, Attr);
+          AutoClose(TagName);
+          DoStartElement('', TagName, '', Attr);
+          DoEndElement('', TagName, '');
        end
-        else
+        else if TokenText[1] = '/' then
        begin
-          Attr := nil;
-          if TokenText[Length(fTokenText)]='/' then  // handle xml/xhtml style empty tag
-          begin
-            setlength(fTokenText,length(fTokenText)-1);
-            // Do NOT combine to a single line, as Attr is an output value!
-            TagName := SplitTagString(TokenText, Attr);
-            AutoClose(TagName);
-            DoStartElement('', TagName, '', Attr);
-            DoEndElement('', TagName, '');
-          end
-          else if TokenText[1] = '/' then
-          begin
-            Delete(FTokenText, 1, 1);
-            TagName := SplitTagString(TokenText, Attr);
-            elTag := LookupTag(TagName);
-            i := FNesting-1;
-            while (i >= 0) and (FStack[i] <> elTag) and
-              (efEndTagOptional in HTMLElementProps[FStack[i]].Flags) do
-              Dec(i);
-            if (i>=0) and (FStack[i] = elTag) then
-              while FStack[FNesting-1] <> elTag do
-              begin
-                DoEndElement('', HTMLElementProps[FStack[FNesting-1]].Name, '');
-                namePop;
-              end;
+          Delete(FTokenText, 1, 1);
+          TagName := SplitTagString(TokenText, Attr);
+          elTag := LookupTag(TagName);
+          i := FNesting-1;
+          while (i >= 0) and (FStack[i] <> elTag) and
+            (efEndTagOptional in HTMLElementProps[FStack[i]].Flags) do
+            Dec(i);
+          if (i>=0) and (FStack[i] = elTag) then
+            while FStack[FNesting-1] <> elTag do
+            begin
+              DoEndElement('', HTMLElementProps[FStack[FNesting-1]].Name, '');
+              namePop;
+            end;

-            DoEndElement('', TagName, '');
-            namePop;
-          end
-          else if TokenText[1] <> '!' then
+          DoEndElement('', TagName, '');
+          namePop;
+        end
+        else if TokenText[1] <> '!' then
+        begin
+          // Do NOT combine to a single line, as Attr is an output value!
+          TagName := SplitTagString(TokenText, Attr);
+          AutoClose(TagName);
+          namePush(TagName);
+          DoStartElement('', TagName, '', Attr);
+          if FStack[FNesting-1] in [etScript,etStyle] then
          begin
-            // Do NOT combine to a single line, as Attr is an output value!
-            TagName := SplitTagString(TokenText, Attr);
-            AutoClose(TagName);
-            namePush(TagName);
-            DoStartElement('', TagName, '', Attr);
+            NewContext := scScript;
+            FScriptEndTag := '</' + HTMLElementProps[FStack[FNesting-1]].Name;
+            FScriptEndMatchPos := 1;
          end;
-          if Assigned(Attr) then
-            Attr.Free;
        end;
+        if Assigned(Attr) then
+          Attr.Free;
+      end;
+    scComment:
+      begin
+        DoComment(PSAXChar(TokenText), 0, Length(TokenText));
+      end;
+    scScript:
+      begin
+        DoCharacters(PSAXChar(TokenText), 0, Length(TokenText));
+        DoEndElement('', HTMLElementProps[FStack[FNesting-1]].Name, '');
+        namePop;
+        FScriptEndTag := '';
      end;
  end;
  FScannerContext := NewContext;