mirror of
https://gitlab.com/freepascal.org/lazarus/lazarus.git
synced 2025-04-08 07:21:34 +02:00
275 lines
7.7 KiB
ObjectPascal
275 lines
7.7 KiB
ObjectPascal
{
 Test all with:
     ./runtests --format=plain --suite=TTestLazUTF8

 Test specific with:
     ./runtests --format=plain --suite=TestUTF8Trim
     ./runtests --format=plain --suite=TestUTF8Pos
     ./runtests --format=plain --suite=TestUTF8ToUTF16
     ./runtests --format=plain --suite=TestFindInvalidUTF8
     ./runtests --format=plain --suite=TestUnicodeToUTF8
     ./runtests --format=plain --suite=TestUTF8QuotedStr
     ./runtests --format=plain --suite=TestUTF8FixBroken
}
|
|
unit TestLazUTF8;
|
|
|
|
{$mode objfpc}{$H+}
|
|
|
|
interface
|
|
|
|
uses
|
|
Classes, SysUtils, fpcunit, testglobals, LazUTF8, LazLoggerBase;
|
|
|
|
type
|
|
|
|
{ TTestLazUTF8 }
|
|
|
|
{ fpcunit test case that exercises routines of the LazUTF8 unit.
  Every published method is one test; each can be selected on its own via
  the --suite option of the test runner.
  The former empty "public" section declared nothing and has been dropped. }
TTestLazUTF8 = class(TTestCase)
published
  procedure TestUTF8Trim;         // UTF8Trim
  procedure TestUTF8Pos;          // UTF8Pos
  procedure TestUTF8ToUTF16;      // LazUTF8.UTF8ToUTF16
  procedure TestFindInvalidUTF8;  // FindInvalidUTF8Codepoint
  procedure TestUnicodeToUTF8;    // UnicodeToUTF8
  procedure TestUTF8QuotedStr;    // UTF8QuotedStr
  procedure TestUTF8FixBroken;    // UTF8FixBroken
end;
|
|
|
|
implementation
|
|
|
|
{ Debug helper: renders every UTF-16 code unit of S as '$' followed by four
  hexadecimal digits, e.g. 'A' becomes '$0041'.
  Returns an empty string when S is empty. }
function dbgUnicodeStr(S: UnicodeString): string;
var
  UnitIdx: Integer;
begin
  Result := '';
  UnitIdx := 1;
  while UnitIdx <= Length(S) do
  begin
    // one '$XXXX' group per code unit, appended in string order
    Result := Result + '$' + HexStr(Ord(S[UnitIdx]), 4);
    Inc(UnitIdx);
  end;
end;
|
|
|
|
{ TTestLazUTF8 }
|
|
|
|
{ Checks UTF8Trim on empty input, ASCII blanks, tabs, line breaks, #0 and
  the UTF-8 encoded direction marks U+200E (E2 80 8E) and U+200F (E2 80 8F). }
procedure TTestLazUTF8.TestUTF8Trim;

  // Asserts that UTF8Trim(Input) equals Wanted; Title labels the case.
  procedure ExpectTrimmed(const Title, Wanted, Input: string);
  begin
    AssertEquals(Title, Wanted, UTF8Trim(Input));
  end;

begin
  // plain ASCII whitespace
  ExpectTrimmed('Empty string', '', '');
  ExpectTrimmed('Single space string', '', ' ');
  ExpectTrimmed('Single char string', 'a', 'a');
  ExpectTrimmed('Space at start', 'a', ' a');
  ExpectTrimmed('Space at end', 'a', 'a ');
  ExpectTrimmed('Space at start and end', 'a', ' a ');

  // tabs, line breaks and #0
  ExpectTrimmed('Tabs', 'a', #9'a'#9);
  ExpectTrimmed('Line breaks 1', 'a', #10'a'#13#10);
  ExpectTrimmed('Line breaks 2', '', #10#13#13);
  ExpectTrimmed('Control characters 1', 'a', #0'a'#0);

  // multi-byte direction marks at either end
  ExpectTrimmed('left-to-right', 'a', #$E2#$80#$8E'a');
  ExpectTrimmed('right-to-left mark', 'a', 'a'#$E2#$80#$8F);
  ExpectTrimmed('left-to-right, right-to-left mark', 'a',
    #$E2#$80#$8E'a'#$E2#$80#$8F);
end;
|
|
|
|
{ Checks UTF8Pos with a start offset and with search strings that contain
  an embedded #0 character. }
procedure TTestLazUTF8.TestUTF8Pos;
var
  FoundAt: PtrInt;
begin
  // starting the search at position 2 must skip the match at position 1
  FoundAt := UTF8Pos('ab', 'abcabc', 2);
  AssertEquals('Skip first occurrence', 4, FoundAt);

  // the trailing #0 belongs to the search text, so 'abcabc' has no match
  FoundAt := UTF8Pos('abc'#0, 'abcabc');
  AssertEquals('Not found', 0, FoundAt);

  // a #0 inside the searched string is matched like any other character
  FoundAt := UTF8Pos('bc'#0, 'abc'#0'abc');
  AssertEquals('Check #0', 2, FoundAt);
end;
|
|
|
|
{ Checks LazUTF8.UTF8ToUTF16 for 1..4 byte sequences, surrogate pair
  generation and several malformed inputs. }
procedure TTestLazUTF8.TestUTF8ToUTF16;

  { Converts Utf8Bytes and compares the result with Wanted.
    Both sides are compared as '$XXXX' dumps (dbgUnicodeStr), so a failure
    message shows the individual UTF-16 code units. }
  procedure Check(Utf8Bytes: string; Wanted: UnicodeString);
  var
    Converted: UnicodeString;
    CaseLabel: string;
  begin
    Converted := LazUTF8.UTF8ToUTF16(Utf8Bytes);
    CaseLabel := 'UTF8=' + dbgMemRange(PChar(Utf8Bytes), Length(Utf8Bytes));
    AssertEquals(CaseLabel, dbgUnicodeStr(Wanted), dbgUnicodeStr(Converted));
  end;

begin
  // single byte
  Check(#0,#0);
  Check(#1,#1);
  Check(#$20,' ');
  Check(#$7f,#$7f);
  Check(#$80,'');                 // lone continuation byte: empty result expected

  // two bytes
  Check(#$C0#0,'?'#0);            // invalid 2-byte UTF8
  Check(#$C2#$80,#$0080);
  Check(#$DF#$BF,#$07FF);

  // three bytes
  Check(#$E0#$A0#$80,#$0800);
  Check(#$E0#$BF#$BF,#$0FFF);
  Check(#$E1#$BF#$BF,#$1FFF);
  Check(#$E3#$BF#$BF,#$3FFF);
  Check(#$E7#$BF#$BF,#$7FFF);
  Check(#$ED#$9F#$BF,#$D7FF);
  Check(#$ED#$A0#$80,'?');        // encodes surrogate U+D800: '?' expected
  Check(#$ED#$BF#$BF,'?');        // encodes surrogate U+DFFF: '?' expected
  Check(#$EE#$80#$80,#$E000);
  Check(#$EF#$80#$80,#$F000);
  Check(#$EF#$BF#$BF,#$FFFF);

  // four bytes -> surrogate pairs
  Check(#$F0#$9F#$98#$80,#$D83D#$DE00); // U+1F600
  Check(#$F0#$9F#$BF#$BF,#$D83F#$DFFF); // U+1FFFF
  Check(#$F0#$AF#$BF#$BF,#$D87F#$DFFF); // U+2FFFF
  Check(#$F0#$BF#$BF#$BF,#$D8BF#$DFFF); // U+3FFFF
  Check(#$F1#$BF#$BF#$BF,#$D9BF#$DFFF); // U+7FFFF
  Check(#$F2#$BF#$BF#$BF,#$DABF#$DFFF); // U+BFFFF
  Check(#$F3#$BF#$BF#$BF,#$DBBF#$DFFF); // U+FFFFF
  Check(#$F4#$8F#$BF#$BF,#$DBFF#$DFFF); // U+10FFFF
  Check(#$F7#$BF#$BF#$BF,'?');          // invalid 4 byte out of range
end;
|
|
|
|
{ Checks FindInvalidUTF8Codepoint on well-formed and malformed byte
  sequences.
  Every well-formed case below expects -1; every malformed case starts with
  the bad sequence and expects 0.
  Several case titles were corrected so that a failure message describes the
  input that was actually tested; inputs and expected values are unchanged. }
procedure TTestLazUTF8.TestFindInvalidUTF8;

  { Runs FindInvalidUTF8Codepoint over all bytes of s and compares the
    result with Expected. The failure message is Title followed by a dump
    of the tested bytes (dbgMemRange). }
  procedure t(const s: string; Expected: PtrInt; const Title: string);
  var
    Actual: PtrInt;
  begin
    Actual:=FindInvalidUTF8Codepoint(PChar(s),length(s));
    AssertEquals(Title+': '+dbgMemRange(Pointer(s),length(s)),Expected,Actual);
  end;

begin
  t('',-1,'empty');
  t('a',-1,'single ascii a');
  t('a'#0,-1,'a with #0');
  t(#0'a',-1,'#0 with a');
  // lone continuation bytes: first and last value of the range $80..$BF
  t(#128,0,'gap value 128');
  t(#191,0,'gap value 191');
  // 1 byte UTF-8
  t(UnicodeToUTF8(0),-1,'unicode(0)');
  t(UnicodeToUTF8(1),-1,'unicode(1)');
  t(UnicodeToUTF8(65),-1,'unicode(65)');
  t(UnicodeToUTF8($7f),-1,'unicode($7f)');
  // 2 bytes UTF-8
  t(UnicodeToUTF8($80),-1,'unicode($80)');
  t(UnicodeToUTF8($7ff),-1,'unicode($7ff)');
  // 3 bytes UTF-8
  t(UnicodeToUTF8($800),-1,'unicode($800)');
  t(UnicodeToUTF8($ffff),-1,'unicode($ffff)');
  // 4 bytes UTF-8
  t(UnicodeToUTF8($10000),-1,'unicode($10000)');
  t(UnicodeToUTF8($10900),-1,'unicode($10900)');
  t(UnicodeToUTF8($10ffff),-1,'unicode($10ffff)');
  t(#$F4#$8F#$BF#$BF,-1,'literal bytes of unicode($10ffff)');
  t(#$F4#$90#$80#$80,0,'unicode($110000)');
  // lead byte followed by a byte that is not a continuation byte
  t(#$c0#0,0,'invalid second byte of 2 byte');
  t(#$c2#0,0,'invalid second byte of 2 byte, valid lead byte');
  t(#$e0#0,0,'invalid second byte of 3 byte');
  t(#$e0#$80#0,0,'invalid third byte of 3 byte');
  t(#$f0#0,0,'invalid second byte of 4 byte');
  t(#$f0#$80#0,0,'invalid third byte of 4 byte');
  t(#$f0#$80#$80#0,0,'invalid fourth byte of 4 byte');
  // overlong encodings
  t(#$c0#$80,0,'invalid: ascii encoded as 2 byte');
  t(#$c0#$8f,0,'invalid: ascii encoded as 2 byte');
  t(#$c1#$80,0,'invalid: ascii encoded as 2 byte');
  t(#$c1#$bf,0,'invalid: ascii encoded as 2 byte');
  t(#$e0#$80#$80,0,'invalid: 0 encoded as 3 byte');
  t(#$e0#$9f#$bf,0,'invalid: $7ff encoded as 3 byte');
  t(#$f0#$80#$80#$80,0,'invalid: 0 encoded as 4 byte');
  t(#$f0#$8f#$bf#$bf,0,'invalid: $ffff encoded as 4 byte');
  // out of range and surrogates
  t(#$F7#$BF#$BF#$BF,0,'invalid 4 byte out of range');
  t(#$ED#$A0#$80,0,'3 byte encoding for reserved UTF-16 surrogate half');
end;
|
|
|
|
{ Checks UnicodeToUTF8 for code points that need 1, 2, 3 and 4 bytes. }
procedure TTestLazUTF8.TestUnicodeToUTF8;

  { Encodes CodePoint and compares the bytes with Expected.
    Both sides are compared as dbgMemRange dumps; the failure message
    carries the code point as 8 hex digits. }
  procedure Check(CodePoint: cardinal; Expected: string);
  var
    Encoded: String;
    WantedDump, EncodedDump: String;
  begin
    Encoded := UnicodeToUTF8(CodePoint);
    WantedDump := dbgMemRange(PChar(Expected), Length(Expected));
    EncodedDump := dbgMemRange(PChar(Encoded), Length(Encoded));
    AssertEquals('CodePoint=' + HexStr(CodePoint, 8), WantedDump, EncodedDump);
  end;

begin
  // 1 byte
  Check($0,#0);
  Check($1,#1);
  Check($20,' ');
  Check($7f,#$7f);

  // 2 bytes
  Check($80,#$C2#$80);
  Check($7ff,#$DF#$BF);

  // 3 bytes
  Check($800,#$E0#$A0#$80);
  Check($fff,#$E0#$BF#$BF);
  Check($1fff,#$E1#$BF#$BF);
  Check($3fff,#$E3#$BF#$BF);
  Check($7fff,#$E7#$BF#$BF);
  Check($ffff,#$EF#$BF#$BF);

  // 4 bytes
  Check($1f600,#$F0#$9F#$98#$80);
  Check($1ffff,#$F0#$9F#$BF#$BF);
  Check($3ffff,#$F0#$BF#$BF#$BF);
  Check($7ffff,#$F1#$BF#$BF#$BF);
  Check($fffff,#$F3#$BF#$BF#$BF);
end;
|
|
|
|
{ Checks UTF8QuotedStr with single-character and multi-character quotes.
  In every case below the expected text is S wrapped in Quote, with each
  occurrence of Quote inside S written twice. }
procedure TTestLazUTF8.TestUTF8QuotedStr;

  // Quotes S with Quote and compares the result with Expected.
  procedure Check(const S, Quote, Expected: string);
  var
    CaseLabel: String;
    Quoted: String;
  begin
    Quoted := UTF8QuotedStr(S, Quote);
    CaseLabel := 'S="' + S + '" Quote="' + Quote + '"';
    AssertEquals(CaseLabel, Expected, Quoted);
  end;

begin
  // empty text: only the two enclosing quotes remain
  Check('','=','==');
  Check('','AB','ABAB');

  // text that itself contains the quote
  Check('A','A','AAAA');
  Check('bAb','A','AbAAbA');
  Check('cABc','AB','ABcABABcAB');
end;
|
|
|
|
{ Checks UTF8FixBroken: well-formed sequences must come back unchanged,
  malformed ones are expected to come back with spaces in their place.
  NOTE(review): several expected literals are shorter than their input
  (e.g. a 4 byte input with a 3 byte expectation). The runs of spaces in
  these literals may have been collapsed when this copy of the file was
  produced - confirm against the upstream source before relying on them. }
procedure TTestLazUTF8.TestUTF8FixBroken;

  { Copies S, lets UTF8FixBroken repair the copy and compares it with
    Expected. Both sides are compared as dbgMemRange dumps; the failure
    message carries a dump of the original S. }
  procedure Check(const S, Expected: string);
  var
    Fixed: String;
    WantedDump, FixedDump: String;
  begin
    Fixed := S;
    UTF8FixBroken(Fixed);
    WantedDump := dbgMemRange(PChar(Expected), Length(Expected));
    FixedDump := dbgMemRange(PChar(Fixed), Length(Fixed));
    AssertEquals('S: ' + dbgMemRange(PChar(S), Length(S)), WantedDump, FixedDump);
  end;

begin
  // single bytes
  Check(#$0,#$0);
  Check(#$1,#$1);
  Check(#$7F,#$7F);
  Check(#$80,' ');
  Check(#$BF,' ');

  // 2 byte lead bytes
  Check(#$C0#$0,' '#$0);
  Check(#$C0#$7F,' '#$7F);
  Check(#$C0#$80,' ');
  Check(#$C0#$CF,' ');
  Check(#$C1#$80,' ');
  Check(#$C2#$7F,' '#$7F);
  Check(#$C2#$80,#$C2#$80);
  Check(#$DF#$80,#$DF#$80);
  Check(#$DF#$BF,#$DF#$BF);
  Check(#$DF#$C0,' ');
  Check(#$DF#$70,' '#$70);

  // 3 byte lead bytes
  Check(#$E0#$80,' ');
  Check(#$E0#$80#$80,' ');
  Check(#$E0#$9F#$BF,' ');
  Check(#$E0#$A0#$80,#$E0#$A0#$80);
  Check(#$E0#$80#$70,' '#$70);
  Check(#$EF#$BF#$BF,#$EF#$BF#$BF);
  Check(#$EF#$BF#$7F,' '#$7F);
  Check(#$EF#$BF#$C0,' ');
  Check(#$EF#$7F#$80,' '#$7F' ');

  // 4 byte lead bytes
  Check(#$F0#$80#$80#$80,' ');
  Check(#$F0#$8F#$BF#$BF,' ');
  Check(#$F0#$9F#$BF#$BF,#$F0#$9F#$BF#$BF);
  Check(#$F0#$9F#$BF#$CF,' ');
  Check(#$F0#$9F#$CF#$BF,' '#$CF#$BF);
  Check(#$F0#$CF#$BF#$BF,' '#$CF#$BF' ');
  Check(#$F4#$8F#$BF#$BF,#$F4#$8F#$BF#$BF);
  Check(#$F4#$90#$80#$80,' ');
end;
|
|
|
|
initialization
|
|
// Runs once in this unit's initialization section: passes the TTestLazUTF8
// class to AddToLazUtilsTestSuite (not declared in this unit; presumably
// provided by testglobals - confirm).
AddToLazUtilsTestSuite(TTestLazUTF8);
|
|
|
|
end.
|
|
|