* HTML parser: in case of malformed input, do not create attributes with invalid names (Mantis #16916).

* Along the way, eliminated one needless layer of string conversion from wide to ANSI and back.

git-svn-id: trunk@15564 -
This commit is contained in:
sergei 2010-07-14 13:54:09 +00:00
parent 119277166e
commit 7e2f713d09
2 changed files with 40 additions and 32 deletions

View File

@ -31,7 +31,7 @@ unit SAX_HTML;
interface
uses SysUtils, Classes, SAX, DOM, DOM_HTML,htmldefs;
uses SysUtils, Classes, SAX, DOM, DOM_HTML,htmldefs,xmlutils;
type
@ -54,8 +54,8 @@ type
FAttrNameRead: Boolean;
FStack: array of THTMLElementTag;
FNesting: Integer;
procedure AutoClose(const aName: string);
procedure NamePush(const aName: string);
procedure AutoClose(const aName: SAXString);
procedure NamePush(const aName: SAXString);
procedure NamePop;
protected
procedure EnterNewScannerContext(NewContext: THTMLScannerContext);
@ -271,12 +271,14 @@ begin
end;
end;
function LookupTag(const aName: string): THTMLElementTag;
function LookupTag(const aName: SAXString): THTMLElementTag;
var
j: THTMLElementTag;
ansiName: string;
begin
ansiName := aName;
for j := Low(THTMLElementTag) to High(THTMLElementTag) do
if SameText(HTMLElementProps[j].Name, aName) then
if SameText(HTMLElementProps[j].Name, ansiName) then
begin
Result := j;
Exit;
@ -284,7 +286,7 @@ begin
Result := etUnknown;
end;
procedure THTMLReader.AutoClose(const aName: string);
procedure THTMLReader.AutoClose(const aName: SAXString);
var
newTag: THTMLElementTag;
begin
@ -296,7 +298,7 @@ begin
end;
end;
procedure THTMLReader.NamePush(const aName: string);
procedure THTMLReader.NamePush(const aName: SAXString);
var
tag: THTMLElementTag;
begin
@ -315,27 +317,27 @@ begin
FStack[FNesting] := etUnknown;
end;
function SplitTagString(const s: String; var Attr: TSAXAttributes): String;
function SplitTagString(const s: SAXString; var Attr: TSAXAttributes): SAXString;
var
i, j: Integer;
AttrName: String;
ValueDelimiter: Char;
AttrName: SAXString;
ValueDelimiter: WideChar;
DoIncJ: Boolean;
begin
Attr := nil;
i := 1;
while (i <= Length(s)) and not (s[i] in WhitespaceChars) do
while (i <= Length(s)) and not IsXMLWhitespace(s[i]) do
Inc(i);
if i = Length(s) then
Result := LowerCase(s)
Result := s
else
begin
Result := LowerCase(Copy(s, 1, i - 1));
Result := Copy(s, 1, i - 1);
Attr := TSAXAttributes.Create;
Inc(i);
while (i <= Length(s)) and (s[i] in WhitespaceChars) do
while (i <= Length(s)) and IsXMLWhitespace(s[i]) do
Inc(i);
SetLength(AttrName, 0);
@ -344,7 +346,8 @@ begin
while j <= Length(s) do
if s[j] = '=' then
begin
AttrName := LowerCase(Copy(s, i, j - i));
AttrName := Copy(s, i, j - i);
WStrLower(AttrName);
Inc(j);
if (j < Length(s)) and ((s[j] = '''') or (s[j] = '"')) then
begin
@ -356,7 +359,7 @@ begin
DoIncJ := False;
while j <= Length(s) do
if ValueDelimiter = #0 then
if s[j] in WhitespaceChars then
if IsXMLWhitespace(s[j]) then
break
else
Inc(j)
@ -367,31 +370,34 @@ begin
end else
Inc(j);
Attr.AddAttribute('', AttrName, '', '', Copy(s, i, j - i));
if IsXMLName(AttrName) then
Attr.AddAttribute('', AttrName, '', '', Copy(s, i, j - i));
if DoIncJ then
Inc(j);
while (j <= Length(s)) and (s[j] in WhitespaceChars) do
while (j <= Length(s)) and IsXMLWhitespace(s[j]) do
Inc(j);
i := j;
end
else if s[j] in WhitespaceChars then
else if IsXMLWhitespace(s[j]) then
begin
Attr.AddAttribute('', Copy(s, i, j - i), '', '', '');
if IsXMLName(@s[i], j-i) then
Attr.AddAttribute('', Copy(s, i, j - i), '', '', '');
Inc(j);
while (j <= Length(s)) and (s[j] in WhitespaceChars) do
while (j <= Length(s)) and IsXMLWhitespace(s[j]) do
Inc(j);
i := j;
end else
Inc(j);
end;
WStrLower(result);
end;
procedure THTMLReader.EnterNewScannerContext(NewContext: THTMLScannerContext);
var
Attr: TSAXAttributes;
TagName: String;
TagName: SAXString;
Ent: SAXChar;
i: Integer;
elTag: THTMLElementTag;
@ -502,30 +508,22 @@ end;
{ Handler that receives a character buffer (ch, Count) and queues it as a
  text node: it creates a THTMLNodeInfo marked ntText, attaches a DOM text
  node obtained from FDocument, and appends the info to FNodeBuffer.
  Sender and Start are not referenced by the code shown here. }
procedure THTMLToDOMConverter.ReaderCharacters(Sender: TObject;
const ch: PSAXChar; Start, Count: Integer);
var
s: SAXString;
NodeInfo: THTMLNodeInfo;
begin
{ Copy Count characters from ch into the temporary string s. }
SetLength(s, Count);
Move(ch^, s[1], Count * SizeOf(SAXChar));
NodeInfo := THTMLNodeInfo.Create;
NodeInfo.NodeType := ntText;
{ NOTE(review): DOMNode is assigned twice in a row, first from the copy in s
  and then directly from ch/Count. This looks like the removed and added
  lines of a diff shown together - confirm which single assignment (and
  whether s and the copy above) belong in the real source. }
NodeInfo.DOMNode := FDocument.CreateTextNode(s);
NodeInfo.DOMNode := FDocument.CreateTextNodeBuf(ch, Count, False);
FNodeBuffer.Add(NodeInfo);
end;
{ Handler that receives a character buffer (ch, Count) and queues it as a
  whitespace node: it creates a THTMLNodeInfo marked ntWhitespace, attaches
  a DOM text node obtained from FDocument, and appends the info to
  FNodeBuffer. Sender and Start are not referenced by the code shown here. }
procedure THTMLToDOMConverter.ReaderIgnorableWhitespace(Sender: TObject;
const ch: PSAXChar; Start, Count: Integer);
var
s: SAXString;
NodeInfo: THTMLNodeInfo;
begin
{ Copy Count characters from ch into the temporary string s. }
SetLength(s, Count);
Move(ch^, s[1], Count * SizeOf(SAXChar));
NodeInfo := THTMLNodeInfo.Create;
NodeInfo.NodeType := ntWhitespace;
{ NOTE(review): DOMNode is assigned twice in a row, first from the copy in s
  and then directly from ch/Count. This looks like the removed and added
  lines of a diff shown together - confirm which single assignment (and
  whether s and the copy above) belong in the real source. }
NodeInfo.DOMNode := FDocument.CreateTextNode(s);
NodeInfo.DOMNode := FDocument.CreateTextNodeBuf(ch, Count, False);
FNodeBuffer.Add(NodeInfo);
end;

View File

@ -35,6 +35,7 @@ function IsXmlWhiteSpace(c: WideChar): Boolean;
function Hash(InitValue: LongWord; Key: PWideChar; KeyLen: Integer): LongWord;
{ beware, works in ASCII range only }
function WStrLIComp(S1, S2: PWideChar; Len: Integer): Integer;
procedure WStrLower(var S: WideString);
type
TXMLVersion = (xmlVersionUnknown, xmlVersion10, xmlVersion11);
@ -385,6 +386,15 @@ begin
result := c1 - c2;
end;
{ Lower-cases S in place. Only the letters 'A'..'Z' are changed; every other
  character, including letters outside the ASCII range, is left untouched. }
procedure WStrLower(var S: WideString);
const
  { Distance between an upper-case ASCII letter and its lower-case form. }
  CaseOffset = Ord('a') - Ord('A');
var
  idx: Integer;
  ch: WideChar;
begin
  idx := 1;
  while idx <= Length(S) do
  begin
    ch := S[idx];
    if (ch >= 'A') and (ch <= 'Z') then
      S[idx] := WideChar(Ord(ch) + CaseOffset);
    Inc(idx);
  end;
end;
function Hash(InitValue: LongWord; Key: PWideChar; KeyLen: Integer): LongWord;
begin
Result := InitValue;