sax_xml.pp:

* Applied counterpart of sax_html.pp r15564, eliminating redundant wide-to-ansi conversions;
* AStart parameter of IgnorableWhitespace event should be zero, not 1;
* XML is case-sensitive, removed calls to lowercase();
* Accumulate token characters in FRawTokenText, then convert them all at once to SAXString. Previously, handling multi-byte encodings like UTF-8 was impossible, because the token text was converted one byte at a time, which always resulted in errors. Provides a partial fix for Mantis #16732. Also provides a single location to insert a proper decoding procedure.

git-svn-id: trunk@15738 -
This commit is contained in:
sergei 2010-08-08 05:13:45 +00:00
parent f138637678
commit 1c7c97dc93

View File

@ -40,6 +40,7 @@ type
FEndOfStream: Boolean; FEndOfStream: Boolean;
FScannerContext: TXMLScannerContext; FScannerContext: TXMLScannerContext;
FTokenText: SAXString; FTokenText: SAXString;
FRawTokenText: string;
FCurStringValueDelimiter: Char; FCurStringValueDelimiter: Char;
FAttrNameRead: Boolean; FAttrNameRead: Boolean;
protected protected
@ -103,7 +104,9 @@ procedure ReadXMLFragment(AParentNode: TDOMNode; var f: TStream);
implementation implementation
uses htmldefs; // for entities... uses
xmlutils,
htmldefs; // for entities...
const const
WhitespaceChars = [#9, #10, #13, ' ']; WhitespaceChars = [#9, #10, #13, ' '];
@ -154,6 +157,7 @@ begin
BufferPos := 0; BufferPos := 0;
while (BufferPos < BufferSize) and not FStopFlag do while (BufferPos < BufferSize) and not FStopFlag do
begin
case ScannerContext of case ScannerContext of
scUnknown: scUnknown:
case Buffer[BufferPos] of case Buffer[BufferPos] of
@ -176,7 +180,7 @@ begin
case Buffer[BufferPos] of case Buffer[BufferPos] of
#9, #10, #13, ' ': #9, #10, #13, ' ':
begin begin
FTokenText := FTokenText + Buffer[BufferPos]; FRawTokenText := FRawTokenText + Buffer[BufferPos];
Inc(BufferPos); Inc(BufferPos);
end; end;
'&': '&':
@ -190,7 +194,7 @@ begin
EnterNewScannerContext(scTag); EnterNewScannerContext(scTag);
end; end;
else else
FScannerContext := scText FScannerContext := scText;
end; end;
scText: scText:
case Buffer[BufferPos] of case Buffer[BufferPos] of
@ -206,7 +210,7 @@ begin
end; end;
else else
begin begin
FTokenText := FTokenText + Buffer[BufferPos]; FRawTokenText := FRawTokenText + Buffer[BufferPos];
Inc(BufferPos); Inc(BufferPos);
end; end;
end; end;
@ -220,7 +224,7 @@ begin
EnterNewScannerContext(scUnknown) EnterNewScannerContext(scUnknown)
else else
begin begin
FTokenText := FTokenText + Buffer[BufferPos]; FRawTokenText := FRawTokenText + Buffer[BufferPos];
Inc(BufferPos); Inc(BufferPos);
end; end;
scTag: scTag:
@ -237,13 +241,13 @@ begin
FAttrNameRead := False; FAttrNameRead := False;
end; end;
end; end;
FTokenText := FTokenText + Buffer[BufferPos]; FRawTokenText := FRawTokenText + Buffer[BufferPos];
Inc(BufferPos); Inc(BufferPos);
end; end;
'=': '=':
begin begin
FAttrNameRead := True; FAttrNameRead := True;
FTokenText := FTokenText + Buffer[BufferPos]; FRawTokenText := FRawTokenText + Buffer[BufferPos];
Inc(BufferPos); Inc(BufferPos);
end; end;
'>': '>':
@ -254,39 +258,37 @@ begin
end; end;
else else
begin begin
FTokenText := FTokenText + Buffer[BufferPos]; FRawTokenText := FRawTokenText + Buffer[BufferPos];
Inc(BufferPos); Inc(BufferPos);
end; end;
end; end;
end; end; // case ScannerContext of
end; // while not endOfBuffer
end; end;
end; end;
procedure TSAXXMLReader.EnterNewScannerContext(NewContext: TXMLScannerContext); function SplitTagString(const s: SAXString; var Attr: TSAXAttributes): SAXString;
function SplitTagString(const s: String; var Attr: TSAXAttributes): String;
var var
i, j: Integer; i, j: Integer;
AttrName: String; AttrName: SAXString;
ValueDelimiter: Char; ValueDelimiter: WideChar;
DoIncJ: Boolean; DoIncJ: Boolean;
begin begin
Attr := nil; Attr := nil;
i := 0; i := 0;
repeat repeat
Inc(i) Inc(i)
until (i > Length(s)) or (s[i] in WhitespaceChars); until (i > Length(s)) or IsXMLWhitespace(s[i]);
if i > Length(s) then if i > Length(s) then
Result := LowerCase(s) Result := s
else else
begin begin
Result := LowerCase(Copy(s, 1, i - 1)); Result := Copy(s, 1, i - 1);
Attr := TSAXAttributes.Create; Attr := TSAXAttributes.Create;
Inc(i); Inc(i);
while (i <= Length(s)) and (s[i] in WhitespaceChars) do while (i <= Length(s)) and IsXMLWhitespace(s[i]) do
Inc(i); Inc(i);
SetLength(AttrName, 0); SetLength(AttrName, 0);
@ -295,7 +297,7 @@ procedure TSAXXMLReader.EnterNewScannerContext(NewContext: TXMLScannerContext);
while j <= Length(s) do while j <= Length(s) do
if s[j] = '=' then if s[j] = '=' then
begin begin
AttrName := LowerCase(Copy(s, i, j - i)); AttrName := Copy(s, i, j - i);
Inc(j); Inc(j);
if (j < Length(s)) and ((s[j] = '''') or (s[j] = '"')) then if (j < Length(s)) and ((s[j] = '''') or (s[j] = '"')) then
begin begin
@ -307,7 +309,7 @@ procedure TSAXXMLReader.EnterNewScannerContext(NewContext: TXMLScannerContext);
DoIncJ := False; DoIncJ := False;
while j <= Length(s) do while j <= Length(s) do
if ValueDelimiter = #0 then if ValueDelimiter = #0 then
if s[j] in WhitespaceChars then if IsXMLWhitespace(s[j]) then
break break
else else
Inc(j) Inc(j)
@ -318,20 +320,22 @@ procedure TSAXXMLReader.EnterNewScannerContext(NewContext: TXMLScannerContext);
end else end else
Inc(j); Inc(j);
if IsXMLName(AttrName) then
Attr.AddAttribute('', AttrName, '', '', Copy(s, i, j - i)); Attr.AddAttribute('', AttrName, '', '', Copy(s, i, j - i));
if DoIncJ then if DoIncJ then
Inc(j); Inc(j);
while (j <= Length(s)) and (s[j] in WhitespaceChars) do while (j <= Length(s)) and IsXMLWhitespace(s[j]) do
Inc(j); Inc(j);
i := j; i := j;
end end
else if s[j] in WhitespaceChars then else if IsXMLWhitespace(s[j]) then
begin begin
if IsXMLName(@s[i], j-i) then
Attr.AddAttribute('', Copy(s, i, j - i), '', '', ''); Attr.AddAttribute('', Copy(s, i, j - i), '', '', '');
Inc(j); Inc(j);
while (j <= Length(s)) and (s[j] in WhitespaceChars) do while (j <= Length(s)) and IsXMLWhitespace(s[j]) do
Inc(j); Inc(j);
i := j; i := j;
end else end else
@ -339,14 +343,16 @@ procedure TSAXXMLReader.EnterNewScannerContext(NewContext: TXMLScannerContext);
end; end;
end; end;
procedure TSAXXMLReader.EnterNewScannerContext(NewContext: TXMLScannerContext);
var var
Attr: TSAXAttributes; Attr: TSAXAttributes;
TagName: String; TagName: SAXString;
Ent: SAXChar; Ent: SAXChar;
begin begin
FTokenText := FRawTokenText; // this is where conversion takes place
case ScannerContext of case ScannerContext of
scWhitespace: scWhitespace:
DoIgnorableWhitespace(PSAXChar(TokenText), 1, Length(TokenText)); DoIgnorableWhitespace(PSAXChar(TokenText), 0, Length(TokenText));
scText: scText:
DoCharacters(PSAXChar(TokenText), 0, Length(TokenText)); DoCharacters(PSAXChar(TokenText), 0, Length(TokenText));
scEntityReference: scEntityReference:
@ -397,7 +403,8 @@ begin
end; end;
end; end;
FScannerContext := NewContext; FScannerContext := NewContext;
SetLength(FTokenText, 0); FTokenText := '';
FRawTokenText := '';
FCurStringValueDelimiter := #0; FCurStringValueDelimiter := #0;
FAttrNameRead := False; FAttrNameRead := False;
end; end;