mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-04-06 17:08:01 +02:00
Simplify SanitiseXMLString.
This commit is contained in:
parent
d87ba06608
commit
98ecfb1e41
@ -1090,7 +1090,7 @@ implementation
|
||||
|
||||
function SanitiseXMLString(const S: ansistring): ansistring;
|
||||
var
|
||||
X, UTF8Len, UTF8Char, CurrentChar: Integer;
|
||||
X, UTF8Len, CurrentChar: Integer;
|
||||
needs_quoting, in_quotes, add_end_quote: Boolean;
|
||||
DoASCII: Boolean;
|
||||
|
||||
@ -1234,96 +1234,17 @@ implementation
|
||||
end;
|
||||
end;
|
||||
|
||||
UTF8Char := CurrentChar and $3F; { The data bits of the continuation byte }
|
||||
UTF8Len := 1; { This variable actually holds 1 less than the length }
|
||||
UTF8Len := 1;
|
||||
repeat
|
||||
inc(UTF8Len);
|
||||
dec(X);
|
||||
until (X = 0) or (UTF8Len >= 4) or (ord(Result[X]) shr 6 <> 2);
|
||||
|
||||
{ By setting DoASCII to true, it marks the string as 'invalid UTF-8'
|
||||
automatically if it reaches the beginning of the string unexpectedly }
|
||||
DoASCII := True;
|
||||
|
||||
Dec(X);
|
||||
while X > 0 do
|
||||
if (X = 0) or (Utf8CodepointLen(@Result[X], UTF8Len, False) <> UTF8Len) then
|
||||
begin
|
||||
CurrentChar := Ord(Result[X]);
|
||||
|
||||
case CurrentChar of
|
||||
{ A standard character here is invalid UTF-8 }
|
||||
$00..$7F:
|
||||
Break;
|
||||
|
||||
{ Another continuation byte }
|
||||
$80..$BF:
|
||||
begin
|
||||
UTF8Char := UTF8Char or ((CurrentChar and $3F) shl (6 * UTF8Len));
|
||||
|
||||
dec(X);
|
||||
Inc(UTF8Len);
|
||||
if UTF8Len >= 4 then
|
||||
{ Sequence too long }
|
||||
Break;
|
||||
end;
|
||||
|
||||
{ Lead byte for 2-byte sequences }
|
||||
$C2..$DF:
|
||||
begin
|
||||
if UTF8Len <> 1 then Break;
|
||||
|
||||
UTF8Char := UTF8Char or ((CurrentChar and $1F) shl 6);
|
||||
|
||||
{ Check to see if the code is in range and not part of an 'overlong' sequence }
|
||||
case UTF8Char of
|
||||
$0080..$07FF:
|
||||
DoASCII := False;
|
||||
else
|
||||
{ Do nothing - DoASCII is already true }
|
||||
end;
|
||||
Break;
|
||||
end;
|
||||
|
||||
{ Lead byte for 3-byte sequences }
|
||||
$E0..$EF:
|
||||
begin
|
||||
if UTF8Len <> 2 then Break;
|
||||
|
||||
UTF8Char := UTF8Char or ((CurrentChar and $0F) shl 12);
|
||||
|
||||
{ Check to see if the code is in range and not part of an 'overlong' sequence }
|
||||
case UTF8Char of
|
||||
$0800..$D7FF, $E000..$FFFF: { $D800..$DFFF is reserved and hence invalid }
|
||||
DoASCII := False;
|
||||
else
|
||||
{ Do nothing - DoASCII is already true }
|
||||
end;
|
||||
Break;
|
||||
end;
|
||||
|
||||
{ Lead byte for 4-byte sequences }
|
||||
$F0..$F4:
|
||||
begin
|
||||
if UTF8Len <> 3 then Break;
|
||||
|
||||
UTF8Char := UTF8Char or ((CurrentChar and $07) shl 18);
|
||||
|
||||
{ Check to see if the code is in range and not part of an 'overlong' sequence }
|
||||
case UTF8Char of
|
||||
$010000..$10FFFF:
|
||||
DoASCII := False;
|
||||
else
|
||||
{ Do nothing - DoASCII is already true }
|
||||
end;
|
||||
Break;
|
||||
end;
|
||||
|
||||
{ Invalid character }
|
||||
else
|
||||
Break;
|
||||
end;
|
||||
DoASCII := True;
|
||||
break;
|
||||
end;
|
||||
|
||||
if DoASCII then
|
||||
Break;
|
||||
|
||||
{ If all is fine, we don't need to encode any more characters }
|
||||
end;
|
||||
|
||||
{ Invalid UTF-8 bytes and lead bytes without continuation bytes }
|
||||
|
Loading…
Reference in New Issue
Block a user