lazutils: FindInvalidUTF8Character: check for wrong mapped codes, changed default to report gaps

git-svn-id: trunk@47172 -
This commit is contained in:
mattias 2014-12-10 14:26:18 +00:00
parent 98578ff53a
commit 814cf1a717
7 changed files with 84 additions and 29 deletions

View File

@ -206,7 +206,7 @@ var
begin
Result:=s;
if Result='' then exit;
i:=FindInvalidUTF8Character(PChar(Result),length(Result),true);
i:=FindInvalidUTF8Character(PChar(Result),length(Result));
if i<0 then exit;
Result:=ISO_8859_1ToUTF8(Result);
end;

View File

@ -95,7 +95,7 @@ function UTF8LowerString(const s: string): string;
function UTF8UpperCase(const AInStr: string; ALanguage: string=''): string;
function UTF8UpperString(const s: string): string;
function FindInvalidUTF8Character(p: PChar; Count: PtrInt;
StopOnNonASCII: Boolean = false): PtrInt;
StopOnNonASCII: Boolean = true): PtrInt;
function ValidUTF8String(const s: String): String;
function Utf8StringOfChar(AUtf8Char: String; N: Integer): String;
function Utf8AddChar(AUtf8Char: String; const S: String; N: Integer): String;
@ -696,7 +696,7 @@ end;
procedure UTF8FixBroken(var S: string);
begin
if S='' then exit;
if FindInvalidUTF8Character(PChar(S),length(S),true)<0 then exit;
if FindInvalidUTF8Character(PChar(S),length(S))<0 then exit;
UniqueString(S);
UTF8FixBroken(PChar(S));
end;
@ -2492,38 +2492,44 @@ begin
Result:=0;
while Result<Count do begin
c:=p^;
if ord(c)<128 then begin
if ord(c)<%10000000 then begin
// regular single byte ASCII character (#0 is a character, this is pascal ;)
CharLen:=1;
end
else if ord(c)<%11000000 then begin
// regular single byte character
if StopOnNonASCII then
end else if ord(c)<=%11000001 then begin
// single byte character, between valid UTF-8 encodings
// %11000000 and %11000001 would encode #0..#127 as 2 bytes (overlong form), which is invalid and used for XSS attacks
if StopOnNonASCII or (ord(c)>=192) then
exit;
CharLen:=1;
end
else if ((ord(c) and %11100000) = %11000000) then begin
// could be 2 byte character
if (Result<Count-1) and ((ord(p[1]) and %11000000) = %10000000) then
end else if ord(c)<=%11011111 then begin
// could be 2 byte character (%110xxxxx %10xxxxxx)
if (Result<Count-1)
and ((ord(p[1]) and %11000000) = %10000000) then
CharLen:=2
else
exit; // missing following bytes
end
else if ((ord(c) and %11110000) = %11100000) then begin
// could be 3 byte character
if (Result<Count-2) and ((ord(p[1]) and %11000000) = %10000000)
and ((ord(p[2]) and %11000000) = %10000000) then
CharLen:=3
else
else if ord(c)<=%11101111 then begin
// could be 3 byte character (%1110xxxx %10xxxxxx %10xxxxxx)
if (Result<Count-2)
and ((ord(p[1]) and %11000000) = %10000000)
and ((ord(p[2]) and %11000000) = %10000000) then begin
if (ord(c)=%11100000) and (ord(p[1])<=%10011111) then
exit; // XSS attack: 3 bytes are mapped to the 1 or 2 byte codes
CharLen:=3;
end else
exit; // missing following bytes
end
else if ((ord(c) and %11111000) = %11110000) then begin
// could be 4 byte character
if (Result<Count-3) and ((ord(p[1]) and %11000000) = %10000000)
else if ord(c)<=%11110111 then begin
// could be 4 byte character (%11110xxx %10xxxxxx %10xxxxxx %10xxxxxx)
if (Result<Count-3)
and ((ord(p[1]) and %11000000) = %10000000)
and ((ord(p[2]) and %11000000) = %10000000)
and ((ord(p[3]) and %11000000) = %10000000) then
CharLen:=4
else
and ((ord(p[3]) and %11000000) = %10000000) then begin
if (ord(c)=%11110000) and (ord(p[1])<=%10001111) then
exit; // XSS attack: 4 bytes are mapped to the 1-3 byte codes
CharLen:=4;
end else
exit; // missing following bytes
end
else begin

View File

@ -467,7 +467,7 @@ Returns 0 if not found.
</element>
<!-- function Visibility: default -->
<element name="ValidUTF8String">
<short/>
<short>Replace invalid UTF8 and replace #0..#31 characters with '#0'..'#31'</short>
<descr/>
<errors/>
<seealso/>

View File

@ -986,7 +986,7 @@ begin
ExceptMsg := AExceptionText;
// if AExceptionText is not a valid UTF8 string,
// then assume it has the ansi encoding and convert it
if FindInvalidUTF8Character(pchar(ExceptMsg),length(ExceptMsg), False) > 0 then
if FindInvalidUTF8Character(pchar(ExceptMsg),length(ExceptMsg)) > 0 then
ExceptMsg := AnsiToUtf8(ExceptMsg);
msg := Format(lisProjectSRaisedExceptionClassSWithMessageSS,
[GetTitle, AExceptionClass, LineEnding, ExceptMsg]);

View File

@ -1544,7 +1544,7 @@ var
begin
if AppNoExceptionMessages in FFlags then exit;
Msg := E.Message;
if FindInvalidUTF8Character(PChar(Msg), Length(Msg), False) > 0 then
if FindInvalidUTF8Character(PChar(Msg), Length(Msg)) > 0 then
Msg := AnsiToUtf8(Msg);
if (Msg <> '') and (Msg[length(Msg)] <> '.') then Msg := Msg + '.';
if (not Terminated) and (Self <> nil) and (AppInitialized in FFlags) then

View File

@ -355,7 +355,7 @@ procedure UTF8Insert(const source: String; var s: string; StartCharIndex: PtrInt
function UTF8LowerCase(const s: String): String;
function UTF8UpperCase(const s: String): String;
function FindInvalidUTF8Character(p: PChar; Count: PtrInt;
StopOnNonASCII: Boolean = false): PtrInt; inline;
StopOnNonASCII: Boolean = true): PtrInt; inline;
function ValidUTF8String(const s: String): String; inline;
procedure AssignUTF8ListToAnsi(UTF8List, AnsiList: TStrings);

View File

@ -5,6 +5,7 @@
Test specific with:
./runtests --format=plain --suite=TestUTF8Trim
./runtests --format=plain --suite=TestUTF8Pos
./runtests --format=plain --suite=TestFindInvalidUTF8
}
unit TestLazUTF8;
@ -13,7 +14,7 @@ unit TestLazUTF8;
interface
uses
Classes, SysUtils, fpcunit, testglobals, LazUTF8;
Classes, SysUtils, fpcunit, testglobals, LazUTF8, LazLoggerBase;
type
@ -24,6 +25,7 @@ type
published
procedure TestUTF8Trim;
procedure TestUTF8Pos;
procedure TestFindInvalidUTF8;
end;
implementation
@ -54,6 +56,53 @@ begin
AssertEquals('Check #0',2,UTF8Pos('bc'#0,'abc'#0'abc'));
end;
procedure TTestLazUTF8.TestFindInvalidUTF8;
// Checks FindInvalidUTF8Character (called with its default StopOnNonASCII)
// against valid strings, stray high bytes, broken multi byte sequences and
// overlong encodings.

  // Runs FindInvalidUTF8Character on s and compares the result with Expected.
  // Expected is -1 for the strings that must be accepted and the byte index
  // of the offending byte otherwise (0 in all negative cases below).
  // The assertion message contains Title plus a memory dump of s, so a
  // failing case can be identified even when titles repeat.
  procedure t(const s: string; Expected: PtrInt; const Title: string);
  var
    Actual: PtrInt;
  begin
    Actual:=FindInvalidUTF8Character(PChar(s),length(s));
    AssertEquals(Title+': '+dbgMemRange(Pointer(s),length(s)),Expected,Actual);
  end;

begin
  t('',-1,'empty');
  t('a',-1,'');
  t('a'#0,-1,'a with #0');
  t(#0'a',-1,'#0 with a');
  // lone bytes %10xxxxxx are continuation bytes without a start byte
  t(#128,0,'gap value 128');
  t(#191,0,'gap value 191');
  // 1 byte UTF-8
  t(UnicodeToUTF8(0),-1,'unicode(0)');
  t(UnicodeToUTF8(1),-1,'unicode(1)');
  t(UnicodeToUTF8(65),-1,'unicode(65)');
  t(UnicodeToUTF8($7f),-1,'unicode($7f)');
  // 2 bytes UTF-8
  t(UnicodeToUTF8($80),-1,'unicode($80)');
  t(UnicodeToUTF8($7ff),-1,'unicode($7ff)');
  // 3 bytes UTF-8
  t(UnicodeToUTF8($800),-1,'unicode($800)');
  t(UnicodeToUTF8($ffff),-1,'unicode($ffff)');
  // 4 bytes UTF-8
  t(UnicodeToUTF8($10000),-1,'unicode($10000)');
  t(UnicodeToUTF8($10ffff),-1,'unicode($10ffff)');
  // broken multi byte sequences
  t(#$c0#0,0,'invalid second byte of 2 byte');
  // $c0 is already rejected as an overlong start byte, so use a valid start
  // byte as well to really reach the check of the second byte
  t(#$c2#0,0,'valid 2 byte start, invalid second byte');
  t(#$e0#0,0,'invalid second byte of 3 byte');
  t(#$e0#$80#0,0,'invalid third byte of 3 byte');
  t(#$f0#0,0,'invalid second byte of 4 byte');
  t(#$f0#$80#0,0,'invalid third byte of 4 byte');
  t(#$f0#$80#$80#0,0,'invalid fourth byte of 4 byte');
  // overlong encodings: code points stored with more bytes than needed
  t(#$c0#$80,0,'invalid: ascii encoded as 2 byte');
  t(#$c0#$8f,0,'invalid: ascii encoded as 2 byte');
  t(#$c1#$80,0,'invalid: ascii encoded as 2 byte');
  t(#$c1#$bf,0,'invalid: ascii encoded as 2 byte');
  t(#$e0#$80#$80,0,'invalid: 0 encoded as 3 byte');
  t(#$e0#$9f#$bf,0,'invalid: $7ff encoded as 3 byte');
  t(#$f0#$80#$80#$80,0,'invalid: 0 encoded as 4 byte');
  t(#$f0#$8f#$bf#$bf,0,'invalid: $ffff encoded as 4 byte');
end;
initialization
AddToLazUtilsTestSuite(TTestLazUTF8);