mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-04-05 11:18:18 +02:00
444 lines
17 KiB
ObjectPascal
444 lines
17 KiB
ObjectPascal
{ Unicode tables parser.
|
|
|
|
Copyright (c) 2012 by Inoussa OUEDRAOGO
|
|
|
|
The source code is distributed under the Library GNU
|
|
General Public License with the following modification:
|
|
|
|
- object files and libraries linked into an application may be
|
|
distributed without source code.
|
|
|
|
If you didn't receive a copy of the file COPYING, contact:
|
|
Free Software Foundation
|
|
675 Mass Ave
|
|
Cambridge, MA 02139
|
|
USA
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. }
|
|
|
|
{ This program generates tables as include-files for use
|
|
with the unicode related sources. It expects the following
|
|
unicode.org's files to be present in the same folder :
|
|
* HangulSyllableType.txt
|
|
* PropList.txt
|
|
* UnicodeData.txt
|
|
* allkeys.txt
|
|
}
|
|
|
|
{$DEFINE UCA_TEST}
|
|
program unihelper;
|
|
|
|
{$mode objfpc}{$H+}
|
|
{$typedaddress on}
|
|
|
|
uses
|
|
SysUtils, Classes,
|
|
helper, uca_test;
|
|
|
|
const
|
|
SUsage =
|
|
'This program generates tables as include-files for use ' + sLineBreak +
|
|
' with the unicode related sources. It expects the following ' + sLineBreak +
|
|
' unicode.org''s files to be present in the same folder : ' + sLineBreak +
|
|
' * HangulSyllableType.txt ' + sLineBreak +
|
|
' * PropList.txt ' + sLineBreak +
|
|
' * UnicodeData.txt ' + sLineBreak +
|
|
' * allkeys.txt : Note that this file is the one provided for the CLDR root.' + sLineBreak +
|
|
'' + sLineBreak +
|
|
'Usage : unihelper [<dataDir> <outputDir>] ' + sLineBreak +
|
|
' where ' + sLineBreak +
|
|
' dataDir : the directory where are stored the unicode files. The default' + sLineBreak +
|
|
' value is the program''s directory.' + sLineBreak +
|
|
' outputDir : The directory where the generated files will be stored. The' + sLineBreak +
|
|
' default value is the program''s directory.'+sLineBreak;
|
|
|
|
function DumpCodePoint(ACodePoint : TCodePointRec) : string;
|
|
begin
|
|
Result := '';
|
|
if (ACodePoint.LineType = 0) then
|
|
WriteStr(Result,IntToHex(ACodePoint.CodePoint,4))
|
|
else
|
|
WriteStr(Result,IntToHex(ACodePoint.StartCodePoint,4),'..',IntToHex(ACodePoint.EndCodePoint,4));
|
|
end;
|
|
|
|
var
|
|
dataPath, outputPath : string;
|
|
stream, binStreamNE, binStreamOE, tmpStream : TMemoryStream;
|
|
binaryStreamNE, binaryStreamOE : TMemoryStream;
|
|
hangulSyllables : TCodePointRecArray;
|
|
ucaBook : TUCA_DataBook;
|
|
ucaPropBook : PUCA_PropBook;
|
|
propList : TPropListLineRecArray;
|
|
whiteSpaceCodePoints : TCodePointRecArray;
|
|
unifiedIdeographCodePoints : TCodePointRecArray;
|
|
props : TPropRecArray;
|
|
numericTable : TNumericValueArray;
|
|
decomposition : TDecompositionArray;
|
|
decompositionBook : TDecompositionBook;
|
|
data : TDataLineRecArray;
|
|
//----------------
|
|
lvl3table1 : T3lvlBmp1Table;
|
|
lvl3table2 : T3lvlBmp2Table;
|
|
lvl3table3 : T3lvlBmp3Table;
|
|
//----------------
|
|
s : ansistring;
|
|
i, k, h : Integer;
|
|
p : PDataLineRec;
|
|
r : TDataLineRecArray;
|
|
olvl3table1 : T3lvlOBmp1Table;
|
|
olvl3table2 : T3lvlOBmp2Table;
|
|
olvl3table3 : T3lvlOBmp3Table;
|
|
//----------------
|
|
hs, ls : Word;
|
|
ucaFirstTable : TucaBmpFirstTable;
|
|
ucaSecondTable : TucaBmpSecondTable;
|
|
ucaoFirstTable : TucaoBmpFirstTable;
|
|
ucaoSecondTable : TucaOBmpSecondTable;
|
|
WL : Integer;
|
|
serializedHeader : TSerializedCollationHeader;
|
|
begin
|
|
WriteLn(SUsage+sLineBreak);
|
|
if (ParamCount > 0) then
|
|
dataPath := IncludeTrailingPathDelimiter(ParamStr(1))
|
|
else
|
|
dataPath := ExtractFilePath(ParamStr(0));
|
|
if (ParamCount > 1) then
|
|
outputPath := IncludeTrailingPathDelimiter(ParamStr(2))
|
|
else
|
|
outputPath := dataPath;
|
|
if not DirectoryExists(outputPath) then begin
|
|
WriteLn('Directory not found : ',outputPath);
|
|
if ForceDirectories(outputPath) then begin
|
|
WriteLn(' directory created successfully');
|
|
end else begin
|
|
WriteLn(' fail to create directory.');
|
|
Halt(1);
|
|
end;
|
|
end;
|
|
if not(
|
|
FileExists(dataPath + 'HangulSyllableType.txt') and
|
|
FileExists(dataPath + 'PropList.txt') and
|
|
FileExists(dataPath + 'UnicodeData.txt') and
|
|
FileExists(dataPath + 'allkeys.txt')
|
|
)
|
|
then begin
|
|
WriteLn('File(s) not found : HangulSyllableType.txt or PropList.txt or UnicodeData.txt or allkeys.txt .');
|
|
Halt(1);
|
|
end;
|
|
|
|
binaryStreamNE := nil;
|
|
binaryStreamOE := nil;
|
|
binStreamOE := nil;
|
|
binStreamNE := nil;
|
|
tmpStream := nil;
|
|
stream := TMemoryStream.Create();
|
|
try
|
|
binStreamNE := TMemoryStream.Create();
|
|
binStreamOE := TMemoryStream.Create();
|
|
tmpStream := TMemoryStream.Create();
|
|
WriteLn('Load file HangulSyllableType.txt ...', DateTimeToStr(Now));
|
|
stream.LoadFromFile(dataPath + 'HangulSyllableType.txt');
|
|
stream.Position := 0;
|
|
hangulSyllables := nil;
|
|
ParseHangulSyllableTypes(stream,hangulSyllables);
|
|
stream.Clear();
|
|
|
|
WriteLn('Load file PropList.txt ...', DateTimeToStr(Now));
|
|
stream.LoadFromFile(dataPath + 'PropList.txt');
|
|
stream.Position := 0;
|
|
propList := nil;
|
|
ParseProps(stream,propList);
|
|
stream.Clear();
|
|
whiteSpaceCodePoints := FindCodePointsByProperty('White_Space',propList);
|
|
writeln(' PropList Length = ',Length(propList));
|
|
writeln(' White_Space Length = ',Length(whiteSpaceCodePoints));
|
|
for i := Low(whiteSpaceCodePoints) to High(whiteSpaceCodePoints) do
|
|
WriteLn(' ',DumpCodePoint(whiteSpaceCodePoints[i]):12,' , IsWhiteSpace = ',IsWhiteSpace(whiteSpaceCodePoints[i].CodePoint,whiteSpaceCodePoints));
|
|
unifiedIdeographCodePoints := FindCodePointsByProperty('Unified_Ideograph',propList);
|
|
writeln(' Unified_Ideograph Length = ',Length(unifiedIdeographCodePoints));
|
|
|
|
WriteLn('Load file UnicodeData.txt ...', DateTimeToStr(Now));
|
|
stream.LoadFromFile(dataPath + 'UnicodeData.txt');
|
|
stream.Position := 0;
|
|
WriteLn('Parse file ...', DateTimeToStr(Now));
|
|
data := nil;
|
|
props := nil;
|
|
Parse_UnicodeData(
|
|
stream,props,numericTable,data,decomposition,hangulSyllables,
|
|
whiteSpaceCodePoints,unifiedIdeographCodePoints
|
|
);
|
|
WriteLn('Decomposition building ...');
|
|
MakeDecomposition(decomposition,decompositionBook);
|
|
|
|
WriteLn('Load file UCA allkeys.txt ...', DateTimeToStr(Now));
|
|
stream.LoadFromFile(dataPath + 'allkeys.txt');
|
|
stream.Position := 0;
|
|
ParseUCAFile(stream,ucaBook);
|
|
{ $IFDEF UCA_TEST}
|
|
k := 0; WL := 0; ;
|
|
for i := 0 to Length(ucaBook.Lines) - 1 do begin
|
|
h := GetPropID(ucaBook.Lines[i].CodePoints[0],data);
|
|
if (h <> -1) and
|
|
({props[h].HangulSyllable or} (props[h].DecompositionID <> -1))
|
|
then begin
|
|
Inc(k);
|
|
ucaBook.Lines[i].Stored := False;
|
|
end else begin
|
|
ucaBook.Lines[i].Stored := True;
|
|
if Length(ucaBook.Lines[i].Weights) > WL then
|
|
WL := Length(ucaBook.Lines[i].Weights);
|
|
end;
|
|
end;
|
|
WriteLn(
|
|
'UCA, Version = ',ucaBook.Version,
|
|
'; entries count = ',Length(ucaBook.Lines),
|
|
'; characters (Decomposition) count = ',k,
|
|
'; Max Weights Length = ',WL
|
|
);
|
|
{ $ENDIF UCA_TEST}
|
|
WriteLn('Construct UCA Property Book ...');
|
|
ucaPropBook := nil;
|
|
MakeUCA_Props(@ucaBook,ucaPropBook);
|
|
{$IFDEF UCA_TEST}
|
|
uca_CheckProp_1(ucaBook,ucaPropBook);
|
|
uca_CheckProp_x(ucaBook,ucaPropBook);
|
|
{$ENDIF UCA_TEST}
|
|
WriteLn('Construct UCA BMP tables ...');
|
|
MakeUCA_BmpTables(ucaFirstTable,ucaSecondTable,ucaPropBook);
|
|
WriteLn(' UCA BMP Second Table Length = ',Length(ucaSecondTable));
|
|
{$IFDEF UCA_TEST}
|
|
uca_CheckProp_1y(ucaBook,ucaPropBook,@ucaFirstTable,@ucaSecondTable);
|
|
{$ENDIF UCA_TEST}
|
|
|
|
WriteLn('Construct UCA OBMP tables ...');
|
|
MakeUCA_OBmpTables(ucaoFirstTable,ucaoSecondTable,ucaPropBook);
|
|
WriteLn(' UCA OBMP Second Table Length = ',Length(ucaoSecondTable));
|
|
{$IFDEF UCA_TEST}
|
|
uca_CheckProp_2y(ucaBook,ucaPropBook,@ucaoFirstTable,@ucaoSecondTable);
|
|
{$ENDIF UCA_TEST}
|
|
binaryStreamNE := TMemoryStream.Create();
|
|
binaryStreamOE := TMemoryStream.Create();
|
|
WriteLn('Generate UCA Props tables ...');
|
|
binStreamNE.Clear();
|
|
binStreamOE.Clear();
|
|
GenerateLicenceText(binStreamNE);
|
|
GenerateLicenceText(binStreamOE);
|
|
GenerateUCA_PropTable(binStreamNE,ucaPropBook,ENDIAN_NATIVE);
|
|
GenerateUCA_PropTable(binStreamOE,ucaPropBook,ENDIAN_NON_NATIVE);
|
|
WriteLn('Generate UCA BMP tables ...');
|
|
stream.Clear();
|
|
GenerateLicenceText(stream);
|
|
GenerateUCA_Head(stream,@ucaBook,ucaPropBook);
|
|
GenerateUCA_BmpTables(stream,binStreamNE,binStreamOE,ucaFirstTable,ucaSecondTable);
|
|
WriteLn('Generate UCA OBMP tables ...');
|
|
GenerateUCA_OBmpTables(stream,binStreamNE,binStreamOE,ucaoFirstTable,ucaoSecondTable);
|
|
stream.SaveToFile(outputPath + 'ucadata.inc');
|
|
s := outputPath + 'ucadata.inc';
|
|
binStreamNE.SaveToFile(GenerateEndianIncludeFileName(s,ENDIAN_NATIVE));
|
|
binStreamOE.SaveToFile(GenerateEndianIncludeFileName(s,ENDIAN_NON_NATIVE));
|
|
binStreamNE.Clear();
|
|
binStreamOE.Clear();
|
|
// Binary DUCET
|
|
FillChar(serializedHeader,SizeOf(TSerializedCollationHeader),0);
|
|
StringToByteArray(ucaBook.Version,serializedHeader.Version);
|
|
StringToByteArray('DUCET',serializedHeader.CollationName); //'Default Unicode Collation Element Table (DUCET)';
|
|
serializedHeader.VariableWeight := Ord(ucaBook.VariableWeight);
|
|
SetBit(serializedHeader.Backwards,0,ucaBook.Backwards[0]);
|
|
SetBit(serializedHeader.Backwards,1,ucaBook.Backwards[1]);
|
|
SetBit(serializedHeader.Backwards,2,ucaBook.Backwards[2]);
|
|
SetBit(serializedHeader.Backwards,3,ucaBook.Backwards[3]);
|
|
serializedHeader.BMP_Table1Length := Length(ucaFirstTable);
|
|
serializedHeader.BMP_Table2Length := Length(TucaBmpSecondTableItem) *
|
|
(Length(ucaSecondTable) * SizeOf(UInt24));
|
|
serializedHeader.OBMP_Table1Length := Length(ucaoFirstTable) * SizeOf(Word);
|
|
serializedHeader.OBMP_Table2Length := Length(TucaOBmpSecondTableItem) *
|
|
(Length(ucaoSecondTable) * SizeOf(UInt24));
|
|
serializedHeader.PropCount := ucaPropBook^.ItemSize;
|
|
serializedHeader.VariableLowLimit := ucaPropBook^.VariableLowLimit;
|
|
serializedHeader.VariableHighLimit := ucaPropBook^.VariableHighLimit;
|
|
binaryStreamNE.Write(serializedHeader,SizeOf(serializedHeader));
|
|
ReverseRecordBytes(serializedHeader);
|
|
binaryStreamOE.Write(serializedHeader,SizeOf(serializedHeader));
|
|
GenerateBinaryUCA_BmpTables(binaryStreamNE,binaryStreamOE,ucaFirstTable,ucaSecondTable);
|
|
GenerateBinaryUCA_OBmpTables(binaryStreamNE,binaryStreamOE,ucaoFirstTable,ucaoSecondTable);
|
|
GenerateBinaryUCA_PropTable(binaryStreamNE,binaryStreamOE,ucaPropBook);
|
|
binaryStreamNE.SaveToFile(
|
|
outputPath + Format('collation_ducet_%s.bco',[ENDIAN_SUFFIX[ENDIAN_NATIVE]])
|
|
);
|
|
binaryStreamOE.SaveToFile(
|
|
outputPath + Format('collation_ducet_%s.bco',[ENDIAN_SUFFIX[ENDIAN_NON_NATIVE]])
|
|
);
|
|
// Binary DUCET - END
|
|
|
|
|
|
stream.Clear();
|
|
GenerateLicenceText(stream);
|
|
WriteLn('File parsed ...', DateTimeToStr(Now));
|
|
WriteLn(' Props Len = ',Length(props));
|
|
WriteLn(' Data Len = ',Length(data));
|
|
|
|
{WriteLn('BMP Tables building ...', DateTimeToStr(Now));
|
|
MakeBmpTables(firstTable,secondTable,props,data);
|
|
WriteLn(' First Table length = ',Length(firstTable));
|
|
WriteLn(' Second Table length = ',Length(secondTable));}
|
|
|
|
WriteLn('BMP Tables building ...', DateTimeToStr(Now));
|
|
MakeBmpTables3Levels(lvl3table1,lvl3table2,lvl3table3,data);
|
|
WriteLn(' 3 Levels Tables :');
|
|
WriteLn(' Len 1 = ',Length(lvl3table1));
|
|
WriteLn(' Len 2 = ',Length(lvl3table2));
|
|
WriteLn(' Len 3 = ',Length(lvl3table3));
|
|
for i := 0 to 255 do begin
|
|
for k := 0 to 15 do begin
|
|
for h := 0 to 15 do begin
|
|
if lvl3table3[lvl3table2[lvl3table1[i]][k]][h] <>
|
|
GetPropID(256*i + 16*k +h,data)
|
|
then begin
|
|
writeln('3 levels errors, i=',i,'; k=',k,'; h=',h);
|
|
end;
|
|
end;
|
|
end;
|
|
end;
|
|
|
|
binStreamNE.Clear();
|
|
binStreamOE.Clear();
|
|
WriteLn('Source generation ...', DateTimeToStr(Now));
|
|
GenerateNumericTable(stream,numericTable,False);
|
|
WriteLn('BMP Tables sources ...', DateTimeToStr(Now));
|
|
Generate3lvlBmpTables(stream,lvl3table1,lvl3table2,lvl3table3);
|
|
WriteLn('Properties Table sources ...', DateTimeToStr(Now));
|
|
{tmpStream.Clear();
|
|
GenerateNumericTable(tmpStream,numericTable,True);
|
|
tmpStream.SaveToFile(outputPath + 'unicodenumtable.pas');
|
|
tmpStream.Clear();}
|
|
GeneratePropTable(binStreamNE,props,ENDIAN_NATIVE);
|
|
GeneratePropTable(binStreamOE,props,ENDIAN_NON_NATIVE);
|
|
//-------------------------------------------
|
|
|
|
r := Compress(data);
|
|
|
|
//-------------------
|
|
WriteLn('OBMP Tables building ...', DateTimeToStr(Now));
|
|
MakeOBmpTables3Levels(olvl3table1,olvl3table2,olvl3table3,r);
|
|
WriteLn(' 3 Levels Tables :');
|
|
WriteLn(' Len 1 = ',Length(olvl3table1));
|
|
WriteLn(' Len 2 = ',Length(olvl3table2));
|
|
WriteLn(' Len 3 = ',Length(olvl3table3));
|
|
for i := 0 to 1023 do begin
|
|
for k := 0 to 31 do begin
|
|
for h := 0 to 31 do begin
|
|
if olvl3table3[olvl3table2[olvl3table1[i]][k]][h] <>
|
|
GetPropID(ToUCS4(HIGH_SURROGATE_BEGIN + i,LOW_SURROGATE_BEGIN + (k*32) + h),data)
|
|
then begin
|
|
writeln('3, OBMP levels errors, i=',i,'; k=',k,'; h=',h);
|
|
end;
|
|
end;
|
|
end;
|
|
end;
|
|
WriteLn('OBMP Tables sources ...', DateTimeToStr(Now));
|
|
Generate3lvlOBmpTables(stream,olvl3table1,olvl3table2,olvl3table3);
|
|
|
|
//---------------------
|
|
WriteLn('Decomposition Table sources ...', DateTimeToStr(Now));
|
|
GenerateDecompositionBookTable(binStreamNE,decompositionBook,ENDIAN_NATIVE);
|
|
GenerateDecompositionBookTable(binStreamOE,decompositionBook,ENDIAN_NON_NATIVE);
|
|
stream.SaveToFile(outputPath + 'unicodedata.inc');
|
|
binStreamNE.SaveToFile(outputPath + 'unicodedata_'+ENDIAN_SUFFIX[ENDIAN_NATIVE]+'.inc');
|
|
binStreamOE.SaveToFile(outputPath + 'unicodedata_'+ENDIAN_SUFFIX[ENDIAN_NON_NATIVE]+'.inc');
|
|
binStreamNE.Clear();
|
|
binStreamOE.Clear();
|
|
|
|
|
|
h := -1;
|
|
for i := Low(data) to High(data) do
|
|
if (data[i].CodePoint > $FFFF) then begin
|
|
h := i;
|
|
Break;
|
|
end;
|
|
stream.Clear();
|
|
for i := h to High(data) do begin
|
|
p := @data[i];
|
|
if (p^.LineType = 0) then begin
|
|
FromUCS4(p^.CodePoint,hs,ls);
|
|
//k := GetProp(hs,ls,props,ofirstTable,osecondTable)^.PropID;
|
|
k := GetProp(
|
|
(hs-HIGH_SURROGATE_BEGIN),(ls-LOW_SURROGATE_BEGIN),
|
|
props,olvl3table1,olvl3table2,olvl3table3
|
|
)^.PropID;
|
|
if (p^.PropID <> k) then begin
|
|
s := Format('#%d-%d #%d',[p^.CodePoint,p^.PropID,k]) + sLineBreak;
|
|
stream.Write(s[1],Length(s));
|
|
end;
|
|
end else begin
|
|
for h := p^.StartCodePoint to p^.EndCodePoint do begin
|
|
FromUCS4(h,hs,ls);
|
|
//k := GetProp(hs,ls,props,ofirstTable,osecondTable)^.PropID;
|
|
k := GetProp(
|
|
(hs-HIGH_SURROGATE_BEGIN),(ls-LOW_SURROGATE_BEGIN),
|
|
props,olvl3table1,olvl3table2,olvl3table3
|
|
)^.PropID;
|
|
if (p^.PropID <> k) then begin
|
|
s := Format('##%d;%d-%d #%d',[p^.StartCodePoint,p^.EndCodePoint,p^.PropID,k]) + sLineBreak;
|
|
stream.Write(s[1],Length(s));
|
|
Break
|
|
end;
|
|
end;
|
|
end;
|
|
end;
|
|
stream.SaveToFile(outputPath + 'diff-obmp.txt');
|
|
|
|
stream.Clear();
|
|
for i := Low(data) to High(data) do begin
|
|
p := @data[i];
|
|
if (p^.LineType = 0) then begin
|
|
k := GetPropID(p^.CodePoint,r);
|
|
if (p^.PropID <> k) then begin
|
|
s := Format('#%d-%d #%d',[p^.CodePoint,p^.PropID,k]) + sLineBreak;
|
|
stream.Write(s[1],Length(s));
|
|
end;
|
|
end else begin
|
|
for h := p^.StartCodePoint to p^.EndCodePoint do begin
|
|
k := GetPropID(h,r);
|
|
if (p^.PropID <> k) then begin
|
|
s := Format('##%d;%d-%d #%d',[p^.StartCodePoint,p^.EndCodePoint,p^.PropID,k]) + sLineBreak;
|
|
stream.Write(s[1],Length(s));
|
|
Break
|
|
end;
|
|
end;
|
|
end;
|
|
end;
|
|
stream.SaveToFile(outputPath + 'diff.txt');
|
|
stream.Clear();
|
|
for i := Low(r) to High(r) do begin
|
|
p := @r[i];
|
|
if (p^.LineType = 0) then begin
|
|
k := GetPropID(p^.CodePoint,data);
|
|
if (p^.PropID <> k) then begin
|
|
s := Format('#%d-%d #%d',[p^.CodePoint,p^.PropID,k]) + sLineBreak;
|
|
stream.Write(s[1],Length(s));
|
|
end;
|
|
end else begin
|
|
for h := p^.StartCodePoint to p^.EndCodePoint do begin
|
|
k := GetPropID(h,r);
|
|
if (p^.PropID <> k) then begin
|
|
s := Format('##%d;%d-%d #%d',[p^.StartCodePoint,p^.EndCodePoint,p^.PropID,k]) + sLineBreak;
|
|
stream.Write(s[1],Length(s));
|
|
Break
|
|
end;
|
|
end;
|
|
end;
|
|
end;
|
|
stream.SaveToFile(outputPath + 'diff2.txt');
|
|
finally
|
|
binaryStreamOE.Free();
|
|
binaryStreamNE.Free();
|
|
tmpStream.Free();
|
|
binStreamOE.Free();
|
|
binStreamNE.Free();
|
|
stream.Free();
|
|
end;
|
|
end.
|
|
|