mirror of
https://gitlab.com/freepascal.org/lazarus/lazarus.git
synced 2025-12-11 05:40:45 +01:00
lazutils: FindInvalidUTF8Character: check for wrong mapped codes, changed default to report gaps
git-svn-id: trunk@47172 -
This commit is contained in:
parent
98578ff53a
commit
814cf1a717
@ -206,7 +206,7 @@ var
|
||||
begin
|
||||
Result:=s;
|
||||
if Result='' then exit;
|
||||
i:=FindInvalidUTF8Character(PChar(Result),length(Result),true);
|
||||
i:=FindInvalidUTF8Character(PChar(Result),length(Result));
|
||||
if i<0 then exit;
|
||||
Result:=ISO_8859_1ToUTF8(Result);
|
||||
end;
|
||||
|
||||
@ -95,7 +95,7 @@ function UTF8LowerString(const s: string): string;
|
||||
function UTF8UpperCase(const AInStr: string; ALanguage: string=''): string;
|
||||
function UTF8UpperString(const s: string): string;
|
||||
function FindInvalidUTF8Character(p: PChar; Count: PtrInt;
|
||||
StopOnNonASCII: Boolean = false): PtrInt;
|
||||
StopOnNonASCII: Boolean = true): PtrInt;
|
||||
function ValidUTF8String(const s: String): String;
|
||||
function Utf8StringOfChar(AUtf8Char: String; N: Integer): String;
|
||||
function Utf8AddChar(AUtf8Char: String; const S: String; N: Integer): String;
|
||||
@ -696,7 +696,7 @@ end;
|
||||
procedure UTF8FixBroken(var S: string);
|
||||
begin
|
||||
if S='' then exit;
|
||||
if FindInvalidUTF8Character(PChar(S),length(S),true)<0 then exit;
|
||||
if FindInvalidUTF8Character(PChar(S),length(S))<0 then exit;
|
||||
UniqueString(S);
|
||||
UTF8FixBroken(PChar(S));
|
||||
end;
|
||||
@ -2492,38 +2492,44 @@ begin
|
||||
Result:=0;
|
||||
while Result<Count do begin
|
||||
c:=p^;
|
||||
if ord(c)<128 then begin
|
||||
if ord(c)<%10000000 then begin
|
||||
// regular single byte ASCII character (#0 is a character, this is pascal ;)
|
||||
CharLen:=1;
|
||||
end
|
||||
else if ord(c)<%11000000 then begin
|
||||
// regular single byte character
|
||||
if StopOnNonASCII then
|
||||
end else if ord(c)<=%11000001 then begin
|
||||
// single byte character, between valid UTF-8 encodings
|
||||
// %11000000 and %11000001 map 2 byte to #0..#127, which is invalid and used for XSS attacks
|
||||
if StopOnNonASCII or (ord(c)>=192) then
|
||||
exit;
|
||||
CharLen:=1;
|
||||
end
|
||||
else if ((ord(c) and %11100000) = %11000000) then begin
|
||||
// could be 2 byte character
|
||||
if (Result<Count-1) and ((ord(p[1]) and %11000000) = %10000000) then
|
||||
end else if ord(c)<=%11011111 then begin
|
||||
// could be 2 byte character (%110xxxxx %10xxxxxx)
|
||||
if (Result<Count-1)
|
||||
and ((ord(p[1]) and %11000000) = %10000000) then
|
||||
CharLen:=2
|
||||
else
|
||||
exit; // missing following bytes
|
||||
end
|
||||
else if ((ord(c) and %11110000) = %11100000) then begin
|
||||
// could be 3 byte character
|
||||
if (Result<Count-2) and ((ord(p[1]) and %11000000) = %10000000)
|
||||
and ((ord(p[2]) and %11000000) = %10000000) then
|
||||
CharLen:=3
|
||||
else
|
||||
else if ord(c)<=%11101111 then begin
|
||||
// could be 3 byte character (%1110xxxx %10xxxxxx %10xxxxxx)
|
||||
if (Result<Count-2)
|
||||
and ((ord(p[1]) and %11000000) = %10000000)
|
||||
and ((ord(p[2]) and %11000000) = %10000000) then begin
|
||||
if (ord(c)=%11100000) and (ord(p[1])<=%10011111) then
|
||||
exit; // XSS attack: 3 bytes are mapped to the 1 or 2 byte codes
|
||||
CharLen:=3;
|
||||
end else
|
||||
exit; // missing following bytes
|
||||
end
|
||||
else if ((ord(c) and %11111000) = %11110000) then begin
|
||||
// could be 4 byte character
|
||||
if (Result<Count-3) and ((ord(p[1]) and %11000000) = %10000000)
|
||||
else if ord(c)<=%11110111 then begin
|
||||
// could be 4 byte character (%11110xxx %10xxxxxx %10xxxxxx %10xxxxxx)
|
||||
if (Result<Count-3)
|
||||
and ((ord(p[1]) and %11000000) = %10000000)
|
||||
and ((ord(p[2]) and %11000000) = %10000000)
|
||||
and ((ord(p[3]) and %11000000) = %10000000) then
|
||||
CharLen:=4
|
||||
else
|
||||
and ((ord(p[3]) and %11000000) = %10000000) then begin
|
||||
if (ord(c)=%11110000) and (ord(p[1])<=%10001111) then
|
||||
exit; // XSS attack: 4 bytes are mapped to the 1-3 byte codes
|
||||
CharLen:=4;
|
||||
end else
|
||||
exit; // missing following bytes
|
||||
end
|
||||
else begin
|
||||
|
||||
@ -467,7 +467,7 @@ Returns 0 if not found.
|
||||
</element>
|
||||
<!-- function Visibility: default -->
|
||||
<element name="ValidUTF8String">
|
||||
<short/>
|
||||
<short>Replace invalid UTF8 and replace #0..#31 characters with '#0'..'#31'</short>
|
||||
<descr/>
|
||||
<errors/>
|
||||
<seealso/>
|
||||
|
||||
@ -986,7 +986,7 @@ begin
|
||||
ExceptMsg := AExceptionText;
|
||||
// if AExceptionText is not a valid UTF8 string,
|
||||
// then assume it has the ansi encoding and convert it
|
||||
if FindInvalidUTF8Character(pchar(ExceptMsg),length(ExceptMsg), False) > 0 then
|
||||
if FindInvalidUTF8Character(pchar(ExceptMsg),length(ExceptMsg)) > 0 then
|
||||
ExceptMsg := AnsiToUtf8(ExceptMsg);
|
||||
msg := Format(lisProjectSRaisedExceptionClassSWithMessageSS,
|
||||
[GetTitle, AExceptionClass, LineEnding, ExceptMsg]);
|
||||
|
||||
@ -1544,7 +1544,7 @@ var
|
||||
begin
|
||||
if AppNoExceptionMessages in FFlags then exit;
|
||||
Msg := E.Message;
|
||||
if FindInvalidUTF8Character(PChar(Msg), Length(Msg), False) > 0 then
|
||||
if FindInvalidUTF8Character(PChar(Msg), Length(Msg)) > 0 then
|
||||
Msg := AnsiToUtf8(Msg);
|
||||
if (Msg <> '') and (Msg[length(Msg)] <> '.') then Msg := Msg + '.';
|
||||
if (not Terminated) and (Self <> nil) and (AppInitialized in FFlags) then
|
||||
|
||||
@ -355,7 +355,7 @@ procedure UTF8Insert(const source: String; var s: string; StartCharIndex: PtrInt
|
||||
function UTF8LowerCase(const s: String): String;
|
||||
function UTF8UpperCase(const s: String): String;
|
||||
function FindInvalidUTF8Character(p: PChar; Count: PtrInt;
|
||||
StopOnNonASCII: Boolean = false): PtrInt; inline;
|
||||
StopOnNonASCII: Boolean = true): PtrInt; inline;
|
||||
function ValidUTF8String(const s: String): String; inline;
|
||||
|
||||
procedure AssignUTF8ListToAnsi(UTF8List, AnsiList: TStrings);
|
||||
|
||||
@ -5,6 +5,7 @@
|
||||
Test specific with:
|
||||
./runtests --format=plain --suite=TestUTF8Trim
|
||||
./runtests --format=plain --suite=TestUTF8Pos
|
||||
./runtests --format=plain --suite=TestFindInvalidUTF8
|
||||
}
|
||||
unit TestLazUTF8;
|
||||
|
||||
@ -13,7 +14,7 @@ unit TestLazUTF8;
|
||||
interface
|
||||
|
||||
uses
|
||||
Classes, SysUtils, fpcunit, testglobals, LazUTF8;
|
||||
Classes, SysUtils, fpcunit, testglobals, LazUTF8, LazLoggerBase;
|
||||
|
||||
type
|
||||
|
||||
@ -24,6 +25,7 @@ type
|
||||
published
|
||||
procedure TestUTF8Trim;
|
||||
procedure TestUTF8Pos;
|
||||
procedure TestFindInvalidUTF8;
|
||||
end;
|
||||
|
||||
implementation
|
||||
@ -54,6 +56,53 @@ begin
|
||||
AssertEquals('Check #0',2,UTF8Pos('bc'#0,'abc'#0'abc'));
|
||||
end;
|
||||
|
||||
// Checks FindInvalidUTF8Character against valid 1..4 byte sequences,
// lone bytes in the #128..#191 range, truncated/broken multi byte sequences
// and overlong encodings (a code point stored in more bytes than needed).
// In the cases below an expected value of -1 is used for input that must be
// accepted and 0 for input whose first byte must be reported.
procedure TTestLazUTF8.TestFindInvalidUTF8;
|
||||
|
||||
// Runs FindInvalidUTF8Character on s (StopOnNonASCII left at its default)
// and compares the returned value with Expected.
// Title is prepended to a dump of the bytes of s in the failure message.
procedure t(const s: string; Expected: PtrInt; const Title: string);
|
||||
var
|
||||
Actual: PtrInt;
|
||||
begin
|
||||
Actual:=FindInvalidUTF8Character(PChar(s),length(s));
|
||||
AssertEquals(Title+': '+dbgMemRange(Pointer(s),length(s)),Expected,Actual);
|
||||
end;
|
||||
|
||||
begin
|
||||
t('',-1,'empty');
|
||||
t('a',-1,'');
|
||||
t('a'#0,-1,'a with #0');
|
||||
t(#0'a',-1,'#0 with a');
|
||||
t(#128,0,'gap value 128');
|
||||
// title fixed: the byte under test is #191, not #192
t(#191,0,'gap value 191');
|
||||
// 1 byte UTF-8
|
||||
t(UnicodeToUTF8(0),-1,'unicode(0)');
|
||||
t(UnicodeToUTF8(1),-1,'unicode(1)');
|
||||
t(UnicodeToUTF8(65),-1,'unicode(65)');
|
||||
t(UnicodeToUTF8($7f),-1,'unicode($7f)');
|
||||
// 2 bytes UTF-8
|
||||
t(UnicodeToUTF8($80),-1,'unicode($80)');
|
||||
t(UnicodeToUTF8($7ff),-1,'unicode($7ff)');
|
||||
// 3 bytes UTF-8
|
||||
t(UnicodeToUTF8($800),-1,'unicode($800)');
|
||||
t(UnicodeToUTF8($ffff),-1,'unicode($ffff)');
|
||||
// 4 bytes UTF-8
|
||||
t(UnicodeToUTF8($10000),-1,'unicode($10000)');
|
||||
t(UnicodeToUTF8($10ffff),-1,'unicode($10ffff)');
|
||||
t(#$c0#0,0,'invalid second byte of 2 byte');
|
||||
t(#$e0#0,0,'invalid second byte of 3 byte');
|
||||
t(#$e0#$80#0,0,'invalid third byte of 3 byte');
|
||||
t(#$f0#0,0,'invalid second byte of 4 byte');
|
||||
t(#$f0#$80#0,0,'invalid third byte of 4 byte');
|
||||
// title fixed: removed a stray '// ' prefix from the message text
t(#$f0#$80#$80#0,0,'invalid fourth byte of 4 byte');
|
||||
t(#$c0#$80,0,'invalid: ascii encoded as 2 byte');
|
||||
t(#$c0#$8f,0,'invalid: ascii encoded as 2 byte');
|
||||
t(#$c1#$80,0,'invalid: ascii encoded as 2 byte');
|
||||
t(#$c1#$bf,0,'invalid: ascii encoded as 2 byte');
|
||||
t(#$e0#$80#$80,0,'invalid: 0 encoded as 3 byte');
|
||||
t(#$e0#$9f#$bf,0,'invalid: $7ff encoded as 3 byte');
|
||||
t(#$f0#$80#$80#$80,0,'invalid: 0 encoded as 4 byte');
|
||||
t(#$f0#$8f#$bf#$bf,0,'invalid: $ffff encoded as 4 byte');
|
||||
end;
|
||||
|
||||
initialization
|
||||
AddToLazUtilsTestSuite(TTestLazUTF8);
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user