mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-04-13 06:29:32 +02:00
* Test from Inoussa to test NormalizeNFD
This commit is contained in:
parent
163d2ab344
commit
1619bef4a0
329
packages/rtl-unicode/tests/testnfd.lpr
Normal file
329
packages/rtl-unicode/tests/testnfd.lpr
Normal file
@ -0,0 +1,329 @@
|
||||
program testnfd;
|
||||
{
|
||||
This program tests the "NormalizeNFD" with the Unicode provided test file.
|
||||
The test file "NormalizationTest.txt" is to find in the Unicode Character
|
||||
Database.
|
||||
}
|
||||
|
||||
{$mode objfpc}{$H+}
|
||||
|
||||
uses
|
||||
SysUtils, Classes, Math, unicodedata;
|
||||
|
||||
type
|
||||
TDataPartLine = record
|
||||
c1, c2, c3, c4, c5 : UCS4String;
|
||||
end;
|
||||
PDataPartLine = ^TDataPartLine;
|
||||
|
||||
TDataPart = record
|
||||
Part : AnsiString;
|
||||
Lines : array of TDataPartLine;
|
||||
ActualLength : Integer;
|
||||
end;
|
||||
PDataPart = ^TDataPart;
|
||||
|
||||
const
|
||||
LINE_LENGTH = 1024;
|
||||
DEFAULT_DATA_LINE_LENGTH = 25000;
|
||||
|
||||
var
|
||||
p : PAnsiChar;
|
||||
bufferLength, bufferPos, lineLength, linePos : Integer;
|
||||
line : ansistring;
|
||||
totalErrorCount : Integer = 0;
|
||||
lineCount, errorCount : Integer;
|
||||
stream : TMemoryStream;
|
||||
part : ansistring;
|
||||
c1, c2, c3, c4, c5 : UCS4String;
|
||||
s1, s2, s3, s4, s5 : UnicodeString;
|
||||
dataList : array of TDataPart;
|
||||
dataListActualLength : Integer;
|
||||
pp, part1 : PDataPart;
|
||||
|
||||
function NextLine() : Boolean;
|
||||
var
|
||||
locOldPos : Integer;
|
||||
locOldPointer : PAnsiChar;
|
||||
begin
|
||||
Result := False;
|
||||
locOldPointer := p;
|
||||
locOldPos := bufferPos;
|
||||
while (bufferPos < bufferLength) and (p^ <> #10) do begin
|
||||
Inc(p);
|
||||
Inc(bufferPos);
|
||||
end;
|
||||
if (locOldPos = bufferPos) and (p^ = #10) then begin
|
||||
lineLength := 0;
|
||||
Inc(p);
|
||||
Inc(bufferPos);
|
||||
linePos := 1;
|
||||
Result := True;
|
||||
end else if (locOldPos < bufferPos) then begin
|
||||
lineLength := (bufferPos - locOldPos) + 1;
|
||||
Move(locOldPointer^,line[1],lineLength);
|
||||
if (p^ = #10) then begin
|
||||
Dec(lineLength);
|
||||
Inc(p);
|
||||
Inc(bufferPos);
|
||||
end;
|
||||
linePos := 1;
|
||||
Result := True;
|
||||
end;
|
||||
if Result then
|
||||
Inc(lineCount);
|
||||
end;
|
||||
|
||||
procedure SkipSpace();
|
||||
begin
|
||||
while (linePos < lineLength) and (line[linePos] in [' ',#9]) do
|
||||
Inc(linePos);
|
||||
end;
|
||||
|
||||
function NextToken() : ansistring;
|
||||
const C_SEPARATORS = [';','#','.','[',']','*','@'];
|
||||
var
|
||||
k : Integer;
|
||||
begin
|
||||
SkipSpace();
|
||||
k := linePos;
|
||||
if (linePos <= lineLength) and (line[linePos] in C_SEPARATORS) then begin
|
||||
Result := line[linePos];
|
||||
Inc(linePos);
|
||||
exit;
|
||||
end;
|
||||
while (linePos <= lineLength) and not(line[linePos] in (C_SEPARATORS+[' '])) do
|
||||
Inc(linePos);
|
||||
if (linePos > k) then begin
|
||||
if (line[Min(linePos,lineLength)] in C_SEPARATORS) then
|
||||
Result := Copy(line,k,(linePos-k))
|
||||
else
|
||||
Result := Copy(line,k,(linePos-k+1));
|
||||
Result := Trim(Result);
|
||||
end else begin
|
||||
Result := '';
|
||||
end;
|
||||
end;
|
||||
|
||||
function ParseLine() : Integer;
|
||||
var
|
||||
t : UCS4String;
|
||||
r : array[0..23] of UCS4String;
|
||||
rc, k : Integer;
|
||||
s : ansistring;
|
||||
begin
|
||||
rc := 0;
|
||||
SetLength(c1,0);
|
||||
SetLength(c2,0);
|
||||
SetLength(c3,0);
|
||||
SetLength(c4,0);
|
||||
SetLength(c5,0);
|
||||
SetLength(t,0);
|
||||
while (rc < Length(r)) do begin
|
||||
s := NextToken();
|
||||
if (s = '#') then
|
||||
break;
|
||||
if (s = '@') then begin
|
||||
part := NextToken();
|
||||
rc := 0;
|
||||
continue;
|
||||
end;
|
||||
if (s = '') or (s[1] = '#') then
|
||||
Break;
|
||||
if (s <> ';') then begin
|
||||
k := Length(t);
|
||||
SetLength(t,(k+1));
|
||||
t[k] := StrToInt('$' + s);
|
||||
end else if (s = ';') then begin
|
||||
k := Length(t);
|
||||
SetLength(t,(k+1));
|
||||
t[k] := 0;
|
||||
r[rc] := Copy(t);
|
||||
SetLength(t,0);
|
||||
Inc(rc);
|
||||
end;
|
||||
end;
|
||||
c1 := r[0]; s1 := UCS4StringToUnicodeString(c1);
|
||||
c2 := r[1]; s2 := UCS4StringToUnicodeString(c2);
|
||||
c3 := r[2]; s3 := UCS4StringToUnicodeString(c3);
|
||||
c4 := r[3]; s4 := UCS4StringToUnicodeString(c4);
|
||||
c5 := r[4]; s5 := UCS4StringToUnicodeString(c5);
|
||||
Result := rc;
|
||||
end;
|
||||
|
||||
procedure AddDataLine();
|
||||
var
|
||||
k : Integer;
|
||||
p : PDataPart;
|
||||
pline : PDataPartLine;
|
||||
begin
|
||||
p := nil;
|
||||
for k := Low(dataList) to High(dataList) do begin
|
||||
if (dataList[k].Part = part) then begin
|
||||
p := @dataList[k];
|
||||
break;
|
||||
end;
|
||||
end;
|
||||
if (p = nil) then begin
|
||||
k := dataListActualLength;
|
||||
if (k >= Length(dataList)) then
|
||||
SetLength(dataList,(k+5));
|
||||
dataListActualLength := k+1;
|
||||
p := @dataList[k];
|
||||
p^.Part := part;
|
||||
end;
|
||||
|
||||
k := p^.ActualLength;
|
||||
if (k >= Length(p^.Lines)) then
|
||||
SetLength(p^.Lines,(k+DEFAULT_DATA_LINE_LENGTH));
|
||||
pline := @p^.Lines[k];
|
||||
pline^.c1 := c1;
|
||||
pline^.c2 := c2;
|
||||
pline^.c3 := c3;
|
||||
pline^.c4 := c4;
|
||||
pline^.c5 := c5;
|
||||
p^.ActualLength := k+1;
|
||||
c1 := nil;
|
||||
c2 := nil;
|
||||
c3 := nil;
|
||||
c4 := nil;
|
||||
c5 := nil;
|
||||
end;
|
||||
|
||||
function IsInPart(ACodePoint : UCS4Char; APart : PDataPart) : boolean;
|
||||
var
|
||||
k : Integer;
|
||||
pline : PDataPartLine;
|
||||
begin
|
||||
pline := @APart^.Lines[0];
|
||||
for k := 0 to APart^.ActualLength-1 do begin
|
||||
if (Length(pline^.c1) = 2) and (pline^.c1[0] = ACodePoint) then
|
||||
exit(True);
|
||||
Inc(pline);
|
||||
end;
|
||||
Result := False;
|
||||
end;
|
||||
|
||||
procedure Prepare();
|
||||
begin
|
||||
bufferLength := stream.Size;
|
||||
bufferPos := 0;
|
||||
p := stream.Memory;
|
||||
lineLength := 0;
|
||||
SetLength(line,LINE_LENGTH);
|
||||
SetLength(dataList,10);
|
||||
dataListActualLength := 0;
|
||||
end;
|
||||
|
||||
procedure TestLines();
|
||||
var
|
||||
lineErrors : Integer;
|
||||
begin
|
||||
while NextLine() do begin
|
||||
if (ParseLine() < 5) then
|
||||
continue;
|
||||
AddDataLine();
|
||||
lineErrors := 0;
|
||||
//c3 == toNFD(c1) == toNFD(c2) == toNFD(c3)
|
||||
if (NormalizeNFD(s1) <> s3) then
|
||||
lineErrors := lineErrors+1;
|
||||
if (NormalizeNFD(s2) <> s3) then
|
||||
lineErrors := lineErrors+1;
|
||||
if (NormalizeNFD(s3) <> s3) then
|
||||
Inc(errorCount);
|
||||
//c5 == toNFD(c4) == toNFD(c5)
|
||||
if (NormalizeNFD(s4) <> s5) then
|
||||
lineErrors := lineErrors+1;
|
||||
if (NormalizeNFD(s5) <> s5) then
|
||||
lineErrors := lineErrors+1;
|
||||
if (lineErrors <> 0) then
|
||||
errorCount := errorCount+lineErrors;
|
||||
end;
|
||||
end;
|
||||
|
||||
{$IFDEF ALL_CODE_POINTS}
|
||||
procedure TestBmpCodePoints();
|
||||
var
|
||||
cp : Word;
|
||||
s : UnicodeString;
|
||||
pu : PUC_Prop;
|
||||
begin
|
||||
SetLength(s,1);
|
||||
for cp := Low(Word) to High(Word) do begin
|
||||
pu := GetProps(cp);
|
||||
if (pu^.Category <> UGC_Unassigned) and (pu^.Category <> UGC_Surrogate) and
|
||||
not(IsInPart(cp,part1))
|
||||
then begin
|
||||
//X == toNFC(X) == toNFD(X) == toNFKC(X) == toNFKD(X)
|
||||
PWord(@s[1])^ := cp;
|
||||
if (NormalizeNFD(s) <> s) then
|
||||
errorCount := errorCount+1;
|
||||
end;
|
||||
end;
|
||||
end;
|
||||
|
||||
procedure TestOBmpCodePoints();
|
||||
var
|
||||
cp : UCS4Char;
|
||||
s : UnicodeString;
|
||||
pu : PUC_Prop;
|
||||
begin
|
||||
SetLength(s,2);
|
||||
s[1] := 'a'; s[2] := 'a';
|
||||
for cp := High(Word)+1 to MAX_LEGAL_UTF32 do begin
|
||||
pu := GetProps(cp);
|
||||
if (pu^.Category <> UGC_Unassigned) and (pu^.Category <> UGC_Surrogate) and
|
||||
not(IsInPart(cp,part1))
|
||||
then begin
|
||||
//X == toNFC(X) == toNFD(X) == toNFKC(X) == toNFKD(X)
|
||||
FromUCS4(cp,s[1],s[2]);
|
||||
if (NormalizeNFD(s) <> s) then
|
||||
errorCount := errorCount+1;
|
||||
end;
|
||||
end;
|
||||
end;
|
||||
{$ENDIF ALL_CODE_POINTS}
|
||||
|
||||
var
|
||||
i, c : Integer;
|
||||
begin
|
||||
errorCount := 0;
|
||||
lineCount := 0;
|
||||
stream := TMemoryStream.Create();
|
||||
try
|
||||
stream.LoadFromFile('NormalizationTest.txt');
|
||||
Prepare();
|
||||
// Direct tests specified in NormalizationTest.txt
|
||||
TestLines();
|
||||
part1 := nil;
|
||||
c := 0;
|
||||
for i := 0 to dataListActualLength-1 do begin
|
||||
pp := @dataList[i];
|
||||
if (Length(pp^.Lines) <> pp^.ActualLength) then
|
||||
SetLength(pp^.Lines,pp^.ActualLength);
|
||||
c := c+pp^.ActualLength;
|
||||
if SameText(pp^.Part,'Part1') then
|
||||
part1 := pp;
|
||||
end;
|
||||
if (part1 = nil) then
|
||||
raise Exception.Create('"Part1" not found !');
|
||||
{ $DEFINE ALL_CODE_POINTS}
|
||||
{$IFDEF ALL_CODE_POINTS}
|
||||
// Tests for BMP Codepoints not is PART1
|
||||
TestBmpCodePoints();
|
||||
// Tests for BMP Codepoints not is PART1
|
||||
TestOBmpCodePoints();
|
||||
{$ENDIF ALL_CODE_POINTS}
|
||||
WriteLn('Line Count = ',lineCount);
|
||||
WriteLn('Actual Test Line Count = ',c);
|
||||
WriteLn('Error Count = ',errorCount);
|
||||
Inc(totalErrorCount,errorCount);
|
||||
finally
|
||||
stream.Free();
|
||||
end;
|
||||
if (totalErrorCount > 0) then begin
|
||||
WriteLn('Failed.');
|
||||
Halt(1);
|
||||
end;
|
||||
end.
|
||||
|
Loading…
Reference in New Issue
Block a user