From c887c889e3dab95372b6da887fb4b8ada7ffe55d Mon Sep 17 00:00:00 2001 From: mattias Date: Tue, 1 Jun 2021 21:02:23 +0000 Subject: [PATCH] lazutils: FindInvalidUTF8Codepoint: check if bigger U+10FFFF git-svn-id: trunk@65162 - --- components/lazutils/lazutf8.pas | 24 ++++++++++++++---------- test/lazutils/testlazutf8.pas | 3 +++ 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/components/lazutils/lazutf8.pas b/components/lazutils/lazutf8.pas index 1808d3b8dd..1fe44b0703 100644 --- a/components/lazutils/lazutf8.pas +++ b/components/lazutils/lazutf8.pas @@ -2813,22 +2813,22 @@ function FindInvalidUTF8Codepoint(p: PChar; Count: PtrInt; StopOnNonUTF8: Boolea // return -1 if ok var CharLen: Integer; - c: Char; + c: Byte; begin if (p<>nil) then begin Result:=0; while Result=192) then + if StopOnNonUTF8 or (c>=192) then exit; CharLen:=1; - end else if ord(c)<=%11011111 then begin + end else if c<=%11011111 then begin // could be 2 byte character (%110xxxxx %10xxxxxx) if (Result%11110100) then + exit; // out of range U+10FFFF + if (c=%11110100) and (ord(p[1])>%10001111) then + exit; // out of range U+10FFFF CharLen:=4; end else exit; // missing following bytes diff --git a/test/lazutils/testlazutf8.pas b/test/lazutils/testlazutf8.pas index d9cdd0e9cf..c1d485aee4 100644 --- a/test/lazutils/testlazutf8.pas +++ b/test/lazutils/testlazutf8.pas @@ -92,6 +92,8 @@ begin t(UnicodeToUTF8($10000),-1,'unicode($10000)'); t(UnicodeToUTF8($10900),-1,'unicode($10900)'); t(UnicodeToUTF8($10ffff),-1,'unicode($10ffff)'); + t(#$F4#$8F#$BF#$BF,-1,'unicode($10ffff)'); + t(#$F4#$90#$80#$80,0,'unicode($110000)'); t(#$c0#0,0,'invalid second byte of 2 byte'); t(#$e0#0,0,'invalid second byte of 3 byte'); t(#$e0#$80#0,0,'invalid third byte of 3 byte'); @@ -106,6 +108,7 @@ begin t(#$e0#$9f#$bf,0,'invalid: $7ff encoded as 3 byte'); t(#$f0#$80#$80#$80,0,'invalid: 0 encoded as 4 byte'); t(#$f0#$8f#$bf#$bf,0,'invalid: $ffff encoded as 4 byte'); + t(#$F7#$BF#$BF#$BF,0,'invalid 4 byte out of range'); end; procedure TTestLazUTF8.TestFindUnicodeToUTF8;