sax_xml.pp:

* Applied counterpart of sax_html.pp r15564, eliminating redundant wide-to-ansi conversions;
* AStart parameter of IgnorableWhitespace event should be zero, not 1;
* XML is case-sensitive, removed calls to lowercase();
* Accumulate token characters in FRawTokenText, then convert the whole token to SAXString at once. Previously each byte was converted individually, which always resulted in errors for multi-byte encodings like UTF-8 and made handling them impossible. Provides a partial fix for Mantis #16732. Also provides a single location at which to insert a proper decoding procedure.

git-svn-id: trunk@15738 -
This commit is contained in:
sergei 2010-08-08 05:13:45 +00:00
parent f138637678
commit 1c7c97dc93

View File

@ -40,6 +40,7 @@ type
FEndOfStream: Boolean; FEndOfStream: Boolean;
FScannerContext: TXMLScannerContext; FScannerContext: TXMLScannerContext;
FTokenText: SAXString; FTokenText: SAXString;
FRawTokenText: string;
FCurStringValueDelimiter: Char; FCurStringValueDelimiter: Char;
FAttrNameRead: Boolean; FAttrNameRead: Boolean;
protected protected
@ -103,7 +104,9 @@ procedure ReadXMLFragment(AParentNode: TDOMNode; var f: TStream);
implementation implementation
uses htmldefs; // for entities... uses
xmlutils,
htmldefs; // for entities...
const const
WhitespaceChars = [#9, #10, #13, ' ']; WhitespaceChars = [#9, #10, #13, ' '];
@ -154,6 +157,7 @@ begin
BufferPos := 0; BufferPos := 0;
while (BufferPos < BufferSize) and not FStopFlag do while (BufferPos < BufferSize) and not FStopFlag do
begin
case ScannerContext of case ScannerContext of
scUnknown: scUnknown:
case Buffer[BufferPos] of case Buffer[BufferPos] of
@ -176,7 +180,7 @@ begin
case Buffer[BufferPos] of case Buffer[BufferPos] of
#9, #10, #13, ' ': #9, #10, #13, ' ':
begin begin
FTokenText := FTokenText + Buffer[BufferPos]; FRawTokenText := FRawTokenText + Buffer[BufferPos];
Inc(BufferPos); Inc(BufferPos);
end; end;
'&': '&':
@ -190,7 +194,7 @@ begin
EnterNewScannerContext(scTag); EnterNewScannerContext(scTag);
end; end;
else else
FScannerContext := scText FScannerContext := scText;
end; end;
scText: scText:
case Buffer[BufferPos] of case Buffer[BufferPos] of
@ -206,7 +210,7 @@ begin
end; end;
else else
begin begin
FTokenText := FTokenText + Buffer[BufferPos]; FRawTokenText := FRawTokenText + Buffer[BufferPos];
Inc(BufferPos); Inc(BufferPos);
end; end;
end; end;
@ -220,7 +224,7 @@ begin
EnterNewScannerContext(scUnknown) EnterNewScannerContext(scUnknown)
else else
begin begin
FTokenText := FTokenText + Buffer[BufferPos]; FRawTokenText := FRawTokenText + Buffer[BufferPos];
Inc(BufferPos); Inc(BufferPos);
end; end;
scTag: scTag:
@ -237,13 +241,13 @@ begin
FAttrNameRead := False; FAttrNameRead := False;
end; end;
end; end;
FTokenText := FTokenText + Buffer[BufferPos]; FRawTokenText := FRawTokenText + Buffer[BufferPos];
Inc(BufferPos); Inc(BufferPos);
end; end;
'=': '=':
begin begin
FAttrNameRead := True; FAttrNameRead := True;
FTokenText := FTokenText + Buffer[BufferPos]; FRawTokenText := FRawTokenText + Buffer[BufferPos];
Inc(BufferPos); Inc(BufferPos);
end; end;
'>': '>':
@ -254,99 +258,101 @@ begin
end; end;
else else
begin begin
FTokenText := FTokenText + Buffer[BufferPos]; FRawTokenText := FRawTokenText + Buffer[BufferPos];
Inc(BufferPos); Inc(BufferPos);
end; end;
end; end;
end; end; // case ScannerContext of
end; // while not endOfBuffer
end;
end;
// Splits the raw text of a tag into its name and its attributes.
//
//   s      - tag text to split.
//   Attr   - set to nil when s contains no XML whitespace; otherwise set to a
//            newly created TSAXAttributes that receives the attributes found
//            after the first whitespace.
//            NOTE(review): the object is handed back through Attr; presumably
//            the caller is responsible for freeing it - confirm.
//   Result - s itself when it contains no whitespace, otherwise the part of s
//            that precedes the first whitespace character.
//
// No case conversion is applied to the tag name or to attribute names.
function SplitTagString(const s: SAXString; var Attr: TSAXAttributes): SAXString;
var
i, j: Integer;
AttrName: SAXString;
ValueDelimiter: WideChar;
DoIncJ: Boolean;
begin
Attr := nil;
// Find the first whitespace character; i ends up on it, or at Length(s) + 1
// when there is none.
i := 0;
repeat
Inc(i)
until (i > Length(s)) or IsXMLWhitespace(s[i]);
if i > Length(s) then
// No whitespace at all: the whole string is returned and Attr stays nil.
Result := s
else
begin
Result := Copy(s, 1, i - 1);
Attr := TSAXAttributes.Create;
// Skip the whitespace that separates the tag name from the first attribute.
Inc(i);
while (i <= Length(s)) and IsXMLWhitespace(s[i]) do
Inc(i);
SetLength(AttrName, 0);
// i marks the start of the current token, j is the scan position.
j := i;
while j <= Length(s) do
if s[j] = '=' then
begin
// Everything between i and the '=' is taken as the attribute name.
AttrName := Copy(s, i, j - i);
Inc(j);
// NOTE(review): because the test is "j < Length(s)", a quote that is the
// very last character of s is not treated as a delimiter and becomes part
// of an undelimited value instead - confirm this is intended.
if (j < Length(s)) and ((s[j] = '''') or (s[j] = '"')) then
begin
ValueDelimiter := s[j];
Inc(j);
end else
// #0 marks an undelimited value, which ends at the next whitespace.
ValueDelimiter := #0;
i := j;
DoIncJ := False;
// Advance j to the end of the value: the next whitespace for an undelimited
// value, the matching delimiter otherwise, or the end of s.
while j <= Length(s) do
if ValueDelimiter = #0 then
if IsXMLWhitespace(s[j]) then
break
else
Inc(j)
else if s[j] = ValueDelimiter then
begin
// Remember to step over the closing delimiter once the value is copied.
DoIncJ := True;
break
end else
Inc(j);
// The pair is added only when IsXMLName accepts the name; otherwise it is
// silently dropped.
if IsXMLName(AttrName) then
Attr.AddAttribute('', AttrName, '', '', Copy(s, i, j - i));
if DoIncJ then
Inc(j);
// Skip whitespace up to the start of the next attribute.
while (j <= Length(s)) and IsXMLWhitespace(s[j]) do
Inc(j);
i := j;
end
else if IsXMLWhitespace(s[j]) then
begin
// Whitespace reached before any '=': the token s[i..j-1] is added with an
// empty value, provided IsXMLName accepts it.
if IsXMLName(@s[i], j-i) then
Attr.AddAttribute('', Copy(s, i, j - i), '', '', '');
Inc(j);
while (j <= Length(s)) and IsXMLWhitespace(s[j]) do
Inc(j);
i := j;
end else
// Ordinary character: keep scanning. A token that runs to the end of s
// without a following '=' or whitespace is not added, because the loop
// simply ends here.
Inc(j);
end; end;
end; end;
procedure TSAXXMLReader.EnterNewScannerContext(NewContext: TXMLScannerContext); procedure TSAXXMLReader.EnterNewScannerContext(NewContext: TXMLScannerContext);
// NOTE(review): this appears to be the pre-change, String-based version of
// SplitTagString shown on the old side of the diff - confirm before relying
// on it.
//
// Splits the raw text of a tag into its name and its attributes.
//
//   s      - tag text to split.
//   Attr   - set to nil when s contains no character from WhitespaceChars;
//            otherwise set to a newly created TSAXAttributes that receives
//            the attributes found after the first whitespace.
//   Result - the lower-cased tag name: all of s when it contains no
//            whitespace, otherwise the part before the first whitespace.
//
// The tag name and the names of attributes written as name=value are passed
// through LowerCase; attribute names are not validated before being added.
function SplitTagString(const s: String; var Attr: TSAXAttributes): String;
var
i, j: Integer;
AttrName: String;
ValueDelimiter: Char;
DoIncJ: Boolean;
begin
Attr := nil;
// Find the first whitespace character; i ends up on it, or at Length(s) + 1
// when there is none.
i := 0;
repeat
Inc(i)
until (i > Length(s)) or (s[i] in WhitespaceChars);
if i > Length(s) then
// No whitespace at all: the whole string is the tag name and Attr stays nil.
Result := LowerCase(s)
else
begin
Result := LowerCase(Copy(s, 1, i - 1));
Attr := TSAXAttributes.Create;
// Skip the whitespace that separates the tag name from the first attribute.
Inc(i);
while (i <= Length(s)) and (s[i] in WhitespaceChars) do
Inc(i);
SetLength(AttrName, 0);
// i marks the start of the current token, j is the scan position.
j := i;
while j <= Length(s) do
if s[j] = '=' then
begin
// Everything between i and the '=' is the attribute name, lower-cased.
AttrName := LowerCase(Copy(s, i, j - i));
Inc(j);
// NOTE(review): because the test is "j < Length(s)", a quote that is the
// very last character of s is not treated as a delimiter - confirm.
if (j < Length(s)) and ((s[j] = '''') or (s[j] = '"')) then
begin
ValueDelimiter := s[j];
Inc(j);
end else
// #0 marks an undelimited value, which ends at the next whitespace.
ValueDelimiter := #0;
i := j;
DoIncJ := False;
// Advance j to the end of the value: the next whitespace for an undelimited
// value, the matching delimiter otherwise, or the end of s.
while j <= Length(s) do
if ValueDelimiter = #0 then
if s[j] in WhitespaceChars then
break
else
Inc(j)
else if s[j] = ValueDelimiter then
begin
// Remember to step over the closing delimiter once the value is copied.
DoIncJ := True;
break
end else
Inc(j);
// The pair is added unconditionally, even when AttrName is empty.
Attr.AddAttribute('', AttrName, '', '', Copy(s, i, j - i));
if DoIncJ then
Inc(j);
// Skip whitespace up to the start of the next attribute.
while (j <= Length(s)) and (s[j] in WhitespaceChars) do
Inc(j);
i := j;
end
else if s[j] in WhitespaceChars then
begin
// Whitespace reached before any '=': the token s[i..j-1] is added as an
// attribute with an empty value; its name is not lower-cased here.
Attr.AddAttribute('', Copy(s, i, j - i), '', '', '');
Inc(j);
while (j <= Length(s)) and (s[j] in WhitespaceChars) do
Inc(j);
i := j;
end else
// Ordinary character: keep scanning. A token that runs to the end of s
// without a following '=' or whitespace is not added.
Inc(j);
end;
end;
var var
Attr: TSAXAttributes; Attr: TSAXAttributes;
TagName: String; TagName: SAXString;
Ent: SAXChar; Ent: SAXChar;
begin begin
FTokenText := FRawTokenText; // this is where conversion takes place
case ScannerContext of case ScannerContext of
scWhitespace: scWhitespace:
DoIgnorableWhitespace(PSAXChar(TokenText), 1, Length(TokenText)); DoIgnorableWhitespace(PSAXChar(TokenText), 0, Length(TokenText));
scText: scText:
DoCharacters(PSAXChar(TokenText), 0, Length(TokenText)); DoCharacters(PSAXChar(TokenText), 0, Length(TokenText));
scEntityReference: scEntityReference:
@ -397,7 +403,8 @@ begin
end; end;
end; end;
FScannerContext := NewContext; FScannerContext := NewContext;
SetLength(FTokenText, 0); FTokenText := '';
FRawTokenText := '';
FCurStringValueDelimiter := #0; FCurStringValueDelimiter := #0;
FAttrNameRead := False; FAttrNameRead := False;
end; end;