* Test from Inoussa to test NormalizeNFD

This commit is contained in:
Michaël Van Canneyt 2022-03-26 22:44:54 +01:00
parent 163d2ab344
commit 1619bef4a0

View File

@ -0,0 +1,329 @@
program testnfd;
{
This program tests the "NormalizeNFD" with the Unicode provided test file.
The test file "NormalizationTest.txt" is to find in the Unicode Character
Database.
}
{$mode objfpc}{$H+}
uses
SysUtils, Classes, Math, unicodedata;
type
TDataPartLine = record
c1, c2, c3, c4, c5 : UCS4String;
end;
PDataPartLine = ^TDataPartLine;
TDataPart = record
Part : AnsiString;
Lines : array of TDataPartLine;
ActualLength : Integer;
end;
PDataPart = ^TDataPart;
const
LINE_LENGTH = 1024;
DEFAULT_DATA_LINE_LENGTH = 25000;
var
p : PAnsiChar;
bufferLength, bufferPos, lineLength, linePos : Integer;
line : ansistring;
totalErrorCount : Integer = 0;
lineCount, errorCount : Integer;
stream : TMemoryStream;
part : ansistring;
c1, c2, c3, c4, c5 : UCS4String;
s1, s2, s3, s4, s5 : UnicodeString;
dataList : array of TDataPart;
dataListActualLength : Integer;
pp, part1 : PDataPart;
function NextLine() : Boolean;
var
locOldPos : Integer;
locOldPointer : PAnsiChar;
begin
Result := False;
locOldPointer := p;
locOldPos := bufferPos;
while (bufferPos < bufferLength) and (p^ <> #10) do begin
Inc(p);
Inc(bufferPos);
end;
if (locOldPos = bufferPos) and (p^ = #10) then begin
lineLength := 0;
Inc(p);
Inc(bufferPos);
linePos := 1;
Result := True;
end else if (locOldPos < bufferPos) then begin
lineLength := (bufferPos - locOldPos) + 1;
Move(locOldPointer^,line[1],lineLength);
if (p^ = #10) then begin
Dec(lineLength);
Inc(p);
Inc(bufferPos);
end;
linePos := 1;
Result := True;
end;
if Result then
Inc(lineCount);
end;
procedure SkipSpace();
begin
while (linePos < lineLength) and (line[linePos] in [' ',#9]) do
Inc(linePos);
end;
function NextToken() : ansistring;
const C_SEPARATORS = [';','#','.','[',']','*','@'];
var
k : Integer;
begin
SkipSpace();
k := linePos;
if (linePos <= lineLength) and (line[linePos] in C_SEPARATORS) then begin
Result := line[linePos];
Inc(linePos);
exit;
end;
while (linePos <= lineLength) and not(line[linePos] in (C_SEPARATORS+[' '])) do
Inc(linePos);
if (linePos > k) then begin
if (line[Min(linePos,lineLength)] in C_SEPARATORS) then
Result := Copy(line,k,(linePos-k))
else
Result := Copy(line,k,(linePos-k+1));
Result := Trim(Result);
end else begin
Result := '';
end;
end;
function ParseLine() : Integer;
var
t : UCS4String;
r : array[0..23] of UCS4String;
rc, k : Integer;
s : ansistring;
begin
rc := 0;
SetLength(c1,0);
SetLength(c2,0);
SetLength(c3,0);
SetLength(c4,0);
SetLength(c5,0);
SetLength(t,0);
while (rc < Length(r)) do begin
s := NextToken();
if (s = '#') then
break;
if (s = '@') then begin
part := NextToken();
rc := 0;
continue;
end;
if (s = '') or (s[1] = '#') then
Break;
if (s <> ';') then begin
k := Length(t);
SetLength(t,(k+1));
t[k] := StrToInt('$' + s);
end else if (s = ';') then begin
k := Length(t);
SetLength(t,(k+1));
t[k] := 0;
r[rc] := Copy(t);
SetLength(t,0);
Inc(rc);
end;
end;
c1 := r[0]; s1 := UCS4StringToUnicodeString(c1);
c2 := r[1]; s2 := UCS4StringToUnicodeString(c2);
c3 := r[2]; s3 := UCS4StringToUnicodeString(c3);
c4 := r[3]; s4 := UCS4StringToUnicodeString(c4);
c5 := r[4]; s5 := UCS4StringToUnicodeString(c5);
Result := rc;
end;
procedure AddDataLine();
var
k : Integer;
p : PDataPart;
pline : PDataPartLine;
begin
p := nil;
for k := Low(dataList) to High(dataList) do begin
if (dataList[k].Part = part) then begin
p := @dataList[k];
break;
end;
end;
if (p = nil) then begin
k := dataListActualLength;
if (k >= Length(dataList)) then
SetLength(dataList,(k+5));
dataListActualLength := k+1;
p := @dataList[k];
p^.Part := part;
end;
k := p^.ActualLength;
if (k >= Length(p^.Lines)) then
SetLength(p^.Lines,(k+DEFAULT_DATA_LINE_LENGTH));
pline := @p^.Lines[k];
pline^.c1 := c1;
pline^.c2 := c2;
pline^.c3 := c3;
pline^.c4 := c4;
pline^.c5 := c5;
p^.ActualLength := k+1;
c1 := nil;
c2 := nil;
c3 := nil;
c4 := nil;
c5 := nil;
end;
function IsInPart(ACodePoint : UCS4Char; APart : PDataPart) : boolean;
var
k : Integer;
pline : PDataPartLine;
begin
pline := @APart^.Lines[0];
for k := 0 to APart^.ActualLength-1 do begin
if (Length(pline^.c1) = 2) and (pline^.c1[0] = ACodePoint) then
exit(True);
Inc(pline);
end;
Result := False;
end;
procedure Prepare();
begin
bufferLength := stream.Size;
bufferPos := 0;
p := stream.Memory;
lineLength := 0;
SetLength(line,LINE_LENGTH);
SetLength(dataList,10);
dataListActualLength := 0;
end;
procedure TestLines();
var
lineErrors : Integer;
begin
while NextLine() do begin
if (ParseLine() < 5) then
continue;
AddDataLine();
lineErrors := 0;
//c3 == toNFD(c1) == toNFD(c2) == toNFD(c3)
if (NormalizeNFD(s1) <> s3) then
lineErrors := lineErrors+1;
if (NormalizeNFD(s2) <> s3) then
lineErrors := lineErrors+1;
if (NormalizeNFD(s3) <> s3) then
Inc(errorCount);
//c5 == toNFD(c4) == toNFD(c5)
if (NormalizeNFD(s4) <> s5) then
lineErrors := lineErrors+1;
if (NormalizeNFD(s5) <> s5) then
lineErrors := lineErrors+1;
if (lineErrors <> 0) then
errorCount := errorCount+lineErrors;
end;
end;
{$IFDEF ALL_CODE_POINTS}
procedure TestBmpCodePoints();
var
cp : Word;
s : UnicodeString;
pu : PUC_Prop;
begin
SetLength(s,1);
for cp := Low(Word) to High(Word) do begin
pu := GetProps(cp);
if (pu^.Category <> UGC_Unassigned) and (pu^.Category <> UGC_Surrogate) and
not(IsInPart(cp,part1))
then begin
//X == toNFC(X) == toNFD(X) == toNFKC(X) == toNFKD(X)
PWord(@s[1])^ := cp;
if (NormalizeNFD(s) <> s) then
errorCount := errorCount+1;
end;
end;
end;
procedure TestOBmpCodePoints();
var
cp : UCS4Char;
s : UnicodeString;
pu : PUC_Prop;
begin
SetLength(s,2);
s[1] := 'a'; s[2] := 'a';
for cp := High(Word)+1 to MAX_LEGAL_UTF32 do begin
pu := GetProps(cp);
if (pu^.Category <> UGC_Unassigned) and (pu^.Category <> UGC_Surrogate) and
not(IsInPart(cp,part1))
then begin
//X == toNFC(X) == toNFD(X) == toNFKC(X) == toNFKD(X)
FromUCS4(cp,s[1],s[2]);
if (NormalizeNFD(s) <> s) then
errorCount := errorCount+1;
end;
end;
end;
{$ENDIF ALL_CODE_POINTS}
var
i, c : Integer;
begin
errorCount := 0;
lineCount := 0;
stream := TMemoryStream.Create();
try
stream.LoadFromFile('NormalizationTest.txt');
Prepare();
// Direct tests specified in NormalizationTest.txt
TestLines();
part1 := nil;
c := 0;
for i := 0 to dataListActualLength-1 do begin
pp := @dataList[i];
if (Length(pp^.Lines) <> pp^.ActualLength) then
SetLength(pp^.Lines,pp^.ActualLength);
c := c+pp^.ActualLength;
if SameText(pp^.Part,'Part1') then
part1 := pp;
end;
if (part1 = nil) then
raise Exception.Create('"Part1" not found !');
{ $DEFINE ALL_CODE_POINTS}
{$IFDEF ALL_CODE_POINTS}
// Tests for BMP Codepoints not is PART1
TestBmpCodePoints();
// Tests for BMP Codepoints not is PART1
TestOBmpCodePoints();
{$ENDIF ALL_CODE_POINTS}
WriteLn('Line Count = ',lineCount);
WriteLn('Actual Test Line Count = ',c);
WriteLn('Error Count = ',errorCount);
Inc(totalErrorCount,errorCount);
finally
stream.Free();
end;
if (totalErrorCount > 0) then begin
WriteLn('Failed.');
Halt(1);
end;
end.