fpc/utils/unicode/unihelper.lpr

{ Unicode tables parser.
Copyright (c) 2012 by Inoussa OUEDRAOGO
The source code is distributed under the Library GNU
General Public License with the following modification:
- object files and libraries linked into an application may be
distributed without source code.
If you didn't receive a copy of the file COPYING, contact:
Free Software Foundation
675 Mass Ave
Cambridge, MA 02139
USA
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. }
{ This program generates tables as include files for use
  with the Unicode-related sources. It expects the following
  files from unicode.org to be present in the same folder :
    * HangulSyllableType.txt
    * PropList.txt
    * UnicodeData.txt
    * allkeys.txt
}
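{ Example invocation (hypothetical paths; both arguments are optional) :
    unihelper /path/to/unicode-data /path/to/output
  See SUsage below for the exact meaning of each argument. }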
{$DEFINE UCA_TEST}
program unihelper;
{$mode objfpc}{$H+}
{$typedaddress on}
uses
SysUtils, Classes,
helper, uca_test;
const
SUsage =
'This program generates tables as include files for use ' + sLineBreak +
' with the Unicode-related sources. It expects the following ' + sLineBreak +
' files from unicode.org to be present in the same folder : ' + sLineBreak +
' * HangulSyllableType.txt ' + sLineBreak +
' * PropList.txt ' + sLineBreak +
' * UnicodeData.txt ' + sLineBreak +
' * allkeys.txt : Note that this file is the one provided for the CLDR root.' + sLineBreak +
'' + sLineBreak +
'Usage : unihelper [<dataDir> <outputDir>] ' + sLineBreak +
' where ' + sLineBreak +
' dataDir : the directory where the Unicode files are stored. The default' + sLineBreak +
' value is the program''s directory.' + sLineBreak +
' outputDir : the directory where the generated files will be stored. The' + sLineBreak +
' default value is the program''s directory.'+sLineBreak;
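{ Renders a code point record as hexadecimal text : either a single value
  (e.g. "0020") when LineType = 0, or a "XXXX..YYYY" range when the record
  describes a range of code points. Used by the White_Space diagnostic
  output below. }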
function DumpCodePoint(ACodePoint : TCodePointRec) : string;
begin
Result := '';
if (ACodePoint.LineType = 0) then
WriteStr(Result,IntToHex(ACodePoint.CodePoint,4))
else
WriteStr(Result,IntToHex(ACodePoint.StartCodePoint,4),'..',IntToHex(ACodePoint.EndCodePoint,4));
end;
var
dataPath, outputPath : string;
stream, binStreamNE, binStreamOE, tmpStream : TMemoryStream;
binaryStreamNE, binaryStreamOE : TMemoryStream;
hangulSyllables : TCodePointRecArray;
ucaBook : TUCA_DataBook;
ucaPropBook : PUCA_PropBook;
propList : TPropListLineRecArray;
whiteSpaceCodePoints : TCodePointRecArray;
unifiedIdeographCodePoints : TCodePointRecArray;
props : TPropRecArray;
numericTable : TNumericValueArray;
decomposition : TDecompositionArray;
decompositionBook : TDecompositionBook;
data : TDataLineRecArray;
//----------------
lvl3table1 : T3lvlBmp1Table;
lvl3table2 : T3lvlBmp2Table;
lvl3table3 : T3lvlBmp3Table;
//----------------
s : ansistring;
i, k, h : Integer;
p : PDataLineRec;
r : TDataLineRecArray;
olvl3table1 : T3lvlOBmp1Table;
olvl3table2 : T3lvlOBmp2Table;
olvl3table3 : T3lvlOBmp3Table;
//----------------
hs, ls : Word;
ucaFirstTable : TucaBmpFirstTable;
ucaSecondTable : TucaBmpSecondTable;
ucaoFirstTable : TucaoBmpFirstTable;
ucaoSecondTable : TucaOBmpSecondTable;
WL : Integer;
serializedHeader : TSerializedCollationHeader;
begin
WriteLn(SUsage+sLineBreak);
if (ParamCount > 0) then
dataPath := IncludeTrailingPathDelimiter(ParamStr(1))
else
dataPath := ExtractFilePath(ParamStr(0));
if (ParamCount > 1) then
outputPath := IncludeTrailingPathDelimiter(ParamStr(2))
else
outputPath := dataPath;
if not DirectoryExists(outputPath) then begin
WriteLn('Directory not found : ',outputPath);
if ForceDirectories(outputPath) then begin
WriteLn(' directory created successfully');
end else begin
WriteLn(' failed to create directory.');
Halt(1);
end;
end;
if not(
FileExists(dataPath + 'HangulSyllableType.txt') and
FileExists(dataPath + 'PropList.txt') and
FileExists(dataPath + 'UnicodeData.txt') and
FileExists(dataPath + 'allkeys.txt')
)
then begin
WriteLn('File(s) not found : HangulSyllableType.txt, PropList.txt, UnicodeData.txt or allkeys.txt.');
Halt(1);
end;
binaryStreamNE := nil;
binaryStreamOE := nil;
binStreamOE := nil;
binStreamNE := nil;
tmpStream := nil;
stream := TMemoryStream.Create();
try
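{ Processing outline :
  1. parse HangulSyllableType.txt, PropList.txt and UnicodeData.txt ;
  2. parse allkeys.txt and build the UCA property book and the BMP/OBMP
     lookup tables ;
  3. emit the textual include files (ucadata.inc, unicodedata.inc and their
     endian specific companions) and the binary DUCET collation files
     (collation_ducet_<endian>.bco) ;
  4. emit the diff-obmp.txt, diff.txt and diff2.txt verification reports. }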
binStreamNE := TMemoryStream.Create();
binStreamOE := TMemoryStream.Create();
tmpStream := TMemoryStream.Create();
WriteLn('Load file HangulSyllableType.txt ...', DateTimeToStr(Now));
stream.LoadFromFile(dataPath + 'HangulSyllableType.txt');
stream.Position := 0;
hangulSyllables := nil;
ParseHangulSyllableTypes(stream,hangulSyllables);
stream.Clear();
WriteLn('Load file PropList.txt ...', DateTimeToStr(Now));
stream.LoadFromFile(dataPath + 'PropList.txt');
stream.Position := 0;
propList := nil;
ParseProps(stream,propList);
stream.Clear();
whiteSpaceCodePoints := FindCodePointsByProperty('White_Space',propList);
writeln(' PropList Length = ',Length(propList));
writeln(' White_Space Length = ',Length(whiteSpaceCodePoints));
for i := Low(whiteSpaceCodePoints) to High(whiteSpaceCodePoints) do
WriteLn(' ',DumpCodePoint(whiteSpaceCodePoints[i]):12,' , IsWhiteSpace = ',IsWhiteSpace(whiteSpaceCodePoints[i].CodePoint,whiteSpaceCodePoints));
unifiedIdeographCodePoints := FindCodePointsByProperty('Unified_Ideograph',propList);
writeln(' Unified_Ideograph Length = ',Length(unifiedIdeographCodePoints));
WriteLn('Load file UnicodeData.txt ...', DateTimeToStr(Now));
stream.LoadFromFile(dataPath + 'UnicodeData.txt');
stream.Position := 0;
WriteLn('Parse file ...', DateTimeToStr(Now));
data := nil;
props := nil;
Parse_UnicodeData(
stream,props,numericTable,data,decomposition,hangulSyllables,
whiteSpaceCodePoints,unifiedIdeographCodePoints
);
WriteLn('Decomposition building ...');
MakeDecomposition(decomposition,decompositionBook);
WriteLn('Load file UCA allkeys.txt ...', DateTimeToStr(Now));
stream.LoadFromFile(dataPath + 'allkeys.txt');
stream.Position := 0;
ParseUCAFile(stream,ucaBook);
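// Statistics over allkeys.txt : entries whose first code point has a
// decomposition recorded in UnicodeData.txt are flagged as not stored
// (Stored := False) and counted in k; for the remaining entries the maximum
// weight sequence length is tracked in WL.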
{ $IFDEF UCA_TEST}
k := 0; WL := 0;
for i := 0 to Length(ucaBook.Lines) - 1 do begin
h := GetPropID(ucaBook.Lines[i].CodePoints[0],data);
if (h <> -1) and
({props[h].HangulSyllable or} (props[h].DecompositionID <> -1))
then begin
Inc(k);
ucaBook.Lines[i].Stored := False;
end else begin
ucaBook.Lines[i].Stored := True;
if Length(ucaBook.Lines[i].Weights) > WL then
WL := Length(ucaBook.Lines[i].Weights);
end;
end;
WriteLn(
'UCA, Version = ',ucaBook.Version,
'; entries count = ',Length(ucaBook.Lines),
'; characters (Decomposition) count = ',k,
'; Max Weights Length = ',WL
);
{ $ENDIF UCA_TEST}
WriteLn('Construct UCA Property Book ...');
ucaPropBook := nil;
MakeUCA_Props(@ucaBook,ucaPropBook);
{$IFDEF UCA_TEST}
uca_CheckProp_1(ucaBook,ucaPropBook);
uca_CheckProp_x(ucaBook,ucaPropBook);
{$ENDIF UCA_TEST}
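// Two-stage UCA lookup tables : a first-level table selects a block in the
// second-level table, first for BMP code points, then (below) for code
// points encoded as surrogate pairs (OBMP).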
WriteLn('Construct UCA BMP tables ...');
MakeUCA_BmpTables(ucaFirstTable,ucaSecondTable,ucaPropBook);
WriteLn(' UCA BMP Second Table Length = ',Length(ucaSecondTable));
{$IFDEF UCA_TEST}
uca_CheckProp_1y(ucaBook,ucaPropBook,@ucaFirstTable,@ucaSecondTable);
{$ENDIF UCA_TEST}
WriteLn('Construct UCA OBMP tables ...');
MakeUCA_OBmpTables(ucaoFirstTable,ucaoSecondTable,ucaPropBook);
WriteLn(' UCA OBMP Second Table Length = ',Length(ucaoSecondTable));
{$IFDEF UCA_TEST}
uca_CheckProp_2y(ucaBook,ucaPropBook,@ucaoFirstTable,@ucaoSecondTable);
{$ENDIF UCA_TEST}
binaryStreamNE := TMemoryStream.Create();
binaryStreamOE := TMemoryStream.Create();
WriteLn('Generate UCA Props tables ...');
binStreamNE.Clear();
binStreamOE.Clear();
GenerateLicenceText(binStreamNE);
GenerateLicenceText(binStreamOE);
GenerateUCA_PropTable(binStreamNE,ucaPropBook,ENDIAN_NATIVE);
GenerateUCA_PropTable(binStreamOE,ucaPropBook,ENDIAN_NON_NATIVE);
WriteLn('Generate UCA BMP tables ...');
stream.Clear();
GenerateLicenceText(stream);
GenerateUCA_Head(stream,@ucaBook,ucaPropBook);
GenerateUCA_BmpTables(stream,binStreamNE,binStreamOE,ucaFirstTable,ucaSecondTable);
WriteLn('Generate UCA OBMP tables ...');
GenerateUCA_OBmpTables(stream,binStreamNE,binStreamOE,ucaoFirstTable,ucaoSecondTable);
stream.SaveToFile(outputPath + 'ucadata.inc');
s := outputPath + 'ucadata.inc';
binStreamNE.SaveToFile(GenerateEndianIncludeFileName(s,ENDIAN_NATIVE));
binStreamOE.SaveToFile(GenerateEndianIncludeFileName(s,ENDIAN_NON_NATIVE));
binStreamNE.Clear();
binStreamOE.Clear();
// Binary DUCET
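// A TSerializedCollationHeader (version, collation name, variable-weight
// option, backwards flags and table lengths) is written first : in native
// byte order to binaryStreamNE, then with its bytes reversed to
// binaryStreamOE. The BMP, OBMP and property tables follow.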
FillChar(serializedHeader,SizeOf(TSerializedCollationHeader),0);
StringToByteArray(ucaBook.Version,serializedHeader.Version);
StringToByteArray('DUCET',serializedHeader.CollationName); //'Default Unicode Collation Element Table (DUCET)';
serializedHeader.VariableWeight := Ord(ucaBook.VariableWeight);
SetBit(serializedHeader.Backwards,0,ucaBook.Backwards[0]);
SetBit(serializedHeader.Backwards,1,ucaBook.Backwards[1]);
SetBit(serializedHeader.Backwards,2,ucaBook.Backwards[2]);
SetBit(serializedHeader.Backwards,3,ucaBook.Backwards[3]);
serializedHeader.BMP_Table1Length := Length(ucaFirstTable);
serializedHeader.BMP_Table2Length := Length(TucaBmpSecondTableItem) *
(Length(ucaSecondTable) * SizeOf(UInt24));
serializedHeader.OBMP_Table1Length := Length(ucaoFirstTable) * SizeOf(Word);
serializedHeader.OBMP_Table2Length := Length(TucaOBmpSecondTableItem) *
(Length(ucaoSecondTable) * SizeOf(UInt24));
serializedHeader.PropCount := ucaPropBook^.ItemSize;
serializedHeader.VariableLowLimit := ucaPropBook^.VariableLowLimit;
serializedHeader.VariableHighLimit := ucaPropBook^.VariableHighLimit;
binaryStreamNE.Write(serializedHeader,SizeOf(serializedHeader));
ReverseRecordBytes(serializedHeader);
binaryStreamOE.Write(serializedHeader,SizeOf(serializedHeader));
GenerateBinaryUCA_BmpTables(binaryStreamNE,binaryStreamOE,ucaFirstTable,ucaSecondTable);
GenerateBinaryUCA_OBmpTables(binaryStreamNE,binaryStreamOE,ucaoFirstTable,ucaoSecondTable);
GenerateBinaryUCA_PropTable(binaryStreamNE,binaryStreamOE,ucaPropBook);
binaryStreamNE.SaveToFile(
outputPath + Format('collation_ducet_%s.bco',[ENDIAN_SUFFIX[ENDIAN_NATIVE]])
);
binaryStreamOE.SaveToFile(
outputPath + Format('collation_ducet_%s.bco',[ENDIAN_SUFFIX[ENDIAN_NON_NATIVE]])
);
// Binary DUCET - END
stream.Clear();
GenerateLicenceText(stream);
WriteLn('File parsed ...', DateTimeToStr(Now));
WriteLn(' Props Len = ',Length(props));
WriteLn(' Data Len = ',Length(data));
{WriteLn('BMP Tables building ...', DateTimeToStr(Now));
MakeBmpTables(firstTable,secondTable,props,data);
WriteLn(' First Table length = ',Length(firstTable));
WriteLn(' Second Table length = ',Length(secondTable));}
WriteLn('BMP Tables building ...', DateTimeToStr(Now));
MakeBmpTables3Levels(lvl3table1,lvl3table2,lvl3table3,data);
WriteLn(' 3 Levels Tables :');
WriteLn(' Len 1 = ',Length(lvl3table1));
WriteLn(' Len 2 = ',Length(lvl3table2));
WriteLn(' Len 3 = ',Length(lvl3table3));
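// Sanity check : resolve every BMP code point (256*i + 16*k + h) through the
// three-level tables and compare the result with a direct GetPropID() lookup.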
for i := 0 to 255 do begin
for k := 0 to 15 do begin
for h := 0 to 15 do begin
if lvl3table3[lvl3table2[lvl3table1[i]][k]][h] <>
GetPropID(256*i + 16*k +h,data)
then begin
writeln('3 levels errors, i=',i,'; k=',k,'; h=',h);
end;
end;
end;
end;
binStreamNE.Clear();
binStreamOE.Clear();
WriteLn('Source generation ...', DateTimeToStr(Now));
GenerateNumericTable(stream,numericTable,False);
WriteLn('BMP Tables sources ...', DateTimeToStr(Now));
Generate3lvlBmpTables(stream,lvl3table1,lvl3table2,lvl3table3);
WriteLn('Properties Table sources ...', DateTimeToStr(Now));
{tmpStream.Clear();
GenerateNumericTable(tmpStream,numericTable,True);
tmpStream.SaveToFile(outputPath + 'unicodenumtable.pas');
tmpStream.Clear();}
GeneratePropTable(binStreamNE,props,ENDIAN_NATIVE);
GeneratePropTable(binStreamOE,props,ENDIAN_NON_NATIVE);
//-------------------------------------------
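// Compress() is expected to produce an equivalent but more compact set of
// data lines (r); the diff.txt and diff2.txt reports generated further down
// verify that data and r resolve to the same PropID everywhere.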
r := Compress(data);
//-------------------
WriteLn('OBMP Tables building ...', DateTimeToStr(Now));
MakeOBmpTables3Levels(olvl3table1,olvl3table2,olvl3table3,r);
WriteLn(' 3 Levels Tables :');
WriteLn(' Len 1 = ',Length(olvl3table1));
WriteLn(' Len 2 = ',Length(olvl3table2));
WriteLn(' Len 3 = ',Length(olvl3table3));
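// Sanity check : for every surrogate pair (HIGH_SURROGATE_BEGIN + i,
// LOW_SURROGATE_BEGIN + k*32 + h), compare the three-level OBMP lookup with
// a direct GetPropID() lookup on the ToUCS4() value.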
for i := 0 to 1023 do begin
for k := 0 to 31 do begin
for h := 0 to 31 do begin
if olvl3table3[olvl3table2[olvl3table1[i]][k]][h] <>
GetPropID(ToUCS4(HIGH_SURROGATE_BEGIN + i,LOW_SURROGATE_BEGIN + (k*32) + h),data)
then begin
writeln('3 levels OBMP errors, i=',i,'; k=',k,'; h=',h);
end;
end;
end;
end;
WriteLn('OBMP Tables sources ...', DateTimeToStr(Now));
Generate3lvlOBmpTables(stream,olvl3table1,olvl3table2,olvl3table3);
//---------------------
WriteLn('Decomposition Table sources ...', DateTimeToStr(Now));
GenerateDecompositionBookTable(binStreamNE,decompositionBook,ENDIAN_NATIVE);
GenerateDecompositionBookTable(binStreamOE,decompositionBook,ENDIAN_NON_NATIVE);
stream.SaveToFile(outputPath + 'unicodedata.inc');
binStreamNE.SaveToFile(outputPath + 'unicodedata_'+ENDIAN_SUFFIX[ENDIAN_NATIVE]+'.inc');
binStreamOE.SaveToFile(outputPath + 'unicodedata_'+ENDIAN_SUFFIX[ENDIAN_NON_NATIVE]+'.inc');
binStreamNE.Clear();
binStreamOE.Clear();
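// diff-obmp.txt : for every data line above the BMP, resolve the code point
// again through the surrogate-based OBMP tables and log any PropID mismatch.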
h := -1;
for i := Low(data) to High(data) do
if (data[i].CodePoint > $FFFF) then begin
h := i;
Break;
end;
stream.Clear();
for i := h to High(data) do begin
p := @data[i];
if (p^.LineType = 0) then begin
FromUCS4(p^.CodePoint,hs,ls);
//k := GetProp(hs,ls,props,ofirstTable,osecondTable)^.PropID;
k := GetProp(
(hs-HIGH_SURROGATE_BEGIN),(ls-LOW_SURROGATE_BEGIN),
props,olvl3table1,olvl3table2,olvl3table3
)^.PropID;
if (p^.PropID <> k) then begin
s := Format('#%d-%d #%d',[p^.CodePoint,p^.PropID,k]) + sLineBreak;
stream.Write(s[1],Length(s));
end;
end else begin
for h := p^.StartCodePoint to p^.EndCodePoint do begin
FromUCS4(h,hs,ls);
//k := GetProp(hs,ls,props,ofirstTable,osecondTable)^.PropID;
k := GetProp(
(hs-HIGH_SURROGATE_BEGIN),(ls-LOW_SURROGATE_BEGIN),
props,olvl3table1,olvl3table2,olvl3table3
)^.PropID;
if (p^.PropID <> k) then begin
s := Format('##%d;%d-%d #%d',[p^.StartCodePoint,p^.EndCodePoint,p^.PropID,k]) + sLineBreak;
stream.Write(s[1],Length(s));
Break
end;
end;
end;
end;
stream.SaveToFile(outputPath + 'diff-obmp.txt');
stream.Clear();
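// diff.txt : compare the original data lines against the compressed array r.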
for i := Low(data) to High(data) do begin
p := @data[i];
if (p^.LineType = 0) then begin
k := GetPropID(p^.CodePoint,r);
if (p^.PropID <> k) then begin
s := Format('#%d-%d #%d',[p^.CodePoint,p^.PropID,k]) + sLineBreak;
stream.Write(s[1],Length(s));
end;
end else begin
for h := p^.StartCodePoint to p^.EndCodePoint do begin
k := GetPropID(h,r);
if (p^.PropID <> k) then begin
s := Format('##%d;%d-%d #%d',[p^.StartCodePoint,p^.EndCodePoint,p^.PropID,k]) + sLineBreak;
stream.Write(s[1],Length(s));
Break
end;
end;
end;
end;
stream.SaveToFile(outputPath + 'diff.txt');
stream.Clear();
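// diff2.txt : the reverse check, compare the compressed array r against the
// original data lines.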
for i := Low(r) to High(r) do begin
p := @r[i];
if (p^.LineType = 0) then begin
k := GetPropID(p^.CodePoint,data);
if (p^.PropID <> k) then begin
s := Format('#%d-%d #%d',[p^.CodePoint,p^.PropID,k]) + sLineBreak;
stream.Write(s[1],Length(s));
end;
end else begin
for h := p^.StartCodePoint to p^.EndCodePoint do begin
k := GetPropID(h,data);
if (p^.PropID <> k) then begin
s := Format('##%d;%d-%d #%d',[p^.StartCodePoint,p^.EndCodePoint,p^.PropID,k]) + sLineBreak;
stream.Write(s[1],Length(s));
Break
end;
end;
end;
end;
stream.SaveToFile(outputPath + 'diff2.txt');
finally
binaryStreamOE.Free();
binaryStreamNE.Free();
tmpStream.Free();
binStreamOE.Free();
binStreamNE.Free();
stream.Free();
end;
end.