utils: add unicode utils from Inoussa (part of mantis #0022909)

git-svn-id: trunk@23748 -
This commit is contained in:
paul 2013-03-09 15:53:44 +00:00
parent 364a874623
commit ab8c293a36
20 changed files with 12879 additions and 3 deletions

17
.gitattributes vendored
View File

@ -14520,3 +14520,20 @@ utils/tply/yaccsem.pas svneol=native#text/plain
utils/tply/yacctabl.pas svneol=native#text/plain
utils/tply/yylex.cod svneol=native#text/plain
utils/tply/yyparse.cod svneol=native#text/plain
utils/unicode/Makefile svneol=native#text/plain
utils/unicode/Makefile.fpc svneol=native#text/plain
utils/unicode/cldrhelper.pas svneol=native#text/pascal
utils/unicode/cldrparser.lpi svneol=native#text/plain
utils/unicode/cldrparser.lpr svneol=native#text/pascal
utils/unicode/cldrtest.pas svneol=native#text/pascal
utils/unicode/cldrxml.pas svneol=native#text/pascal
utils/unicode/data/readme.txt svneol=native#text/plain
utils/unicode/grbtree.pas svneol=native#text/pascal
utils/unicode/helper.pas svneol=native#text/pascal
utils/unicode/parse-collations.bat svneol=native#text/plain
utils/unicode/trie.pas svneol=native#text/pascal
utils/unicode/uca_test.pas svneol=native#text/pascal
utils/unicode/unicodeset.pas svneol=native#text/pascal
utils/unicode/unihelper.lpi svneol=native#text/plain
utils/unicode/unihelper.lpr svneol=native#text/pascal
utils/unicode/weight_derivation.inc svneol=native#text/pascal

View File

@ -1,5 +1,5 @@
#
# Don't edit, this file is generated by FPCMake Version 2.0.0 [2013/01/16]
# Don't edit, this file is generated by FPCMake Version 2.0.0 [2013/03/08]
#
default: all
MAKEFILETARGETS=i386-linux i386-go32v2 i386-win32 i386-os2 i386-freebsd i386-beos i386-haiku i386-netbsd i386-solaris i386-qnx i386-netware i386-openbsd i386-wdosx i386-darwin i386-emx i386-watcom i386-netwlibc i386-wince i386-embedded i386-symbian i386-nativent i386-iphonesim i386-android m68k-linux m68k-freebsd m68k-netbsd m68k-amiga m68k-atari m68k-openbsd m68k-palmos m68k-embedded powerpc-linux powerpc-netbsd powerpc-amiga powerpc-macos powerpc-darwin powerpc-morphos powerpc-embedded powerpc-wii powerpc-aix sparc-linux sparc-netbsd sparc-solaris sparc-embedded x86_64-linux x86_64-freebsd x86_64-netbsd x86_64-solaris x86_64-openbsd x86_64-darwin x86_64-win64 x86_64-embedded arm-linux arm-palmos arm-darwin arm-wince arm-gba arm-nds arm-embedded arm-symbian arm-android powerpc64-linux powerpc64-darwin powerpc64-embedded powerpc64-aix avr-embedded armeb-linux armeb-embedded mips-linux mipsel-linux jvm-java jvm-android
@ -268,6 +268,19 @@ ifdef CROSSCOMPILE
ifndef DARWIN2DARWIN
ifneq ($(CPU_TARGET),jvm)
BINUTILSPREFIX=$(CPU_TARGET)-$(OS_TARGET)-
ifeq ($(OS_TARGET),android)
ifeq ($(CPU_TARGET),arm)
BINUTILSPREFIX=arm-linux-androideabi-
else
ifeq ($(CPU_TARGET),i386)
BINUTILSPREFIX=i686-linux-android-
else
ifeq ($(CPU_TARGET),mips)
BINUTILSPREFIX=mipsel-linux-android-
endif
endif
endif
endif
endif
endif
endif
@ -311,7 +324,7 @@ ifeq ($(FULL_TARGET),i386-go32v2)
override TARGET_DIRS+=fppkg fpcm tply h2pas fprcp dxegen fpdoc fpcmkcfg pas2ut pas2fpm rmwait
endif
ifeq ($(FULL_TARGET),i386-win32)
override TARGET_DIRS+=fppkg fpcm tply h2pas fprcp dxegen fpdoc fpcmkcfg pas2ut pas2fpm fpmc fpcres rmwait instantfpc importtl
override TARGET_DIRS+=fppkg fpcm tply h2pas fprcp dxegen fpdoc fpcmkcfg pas2ut pas2fpm fpmc fpcres rmwait instantfpc importtl unicode
endif
ifeq ($(FULL_TARGET),i386-os2)
override TARGET_DIRS+=fppkg fpcm tply h2pas fprcp dxegen fpdoc fpcmkcfg pas2ut pas2fpm fpmc fpcres rmwait
@ -3379,6 +3392,7 @@ TARGET_DIRS_FPCRES=1
TARGET_DIRS_RMWAIT=1
TARGET_DIRS_INSTANTFPC=1
TARGET_DIRS_IMPORTTL=1
TARGET_DIRS_UNICODE=1
endif
ifeq ($(FULL_TARGET),i386-os2)
TARGET_DIRS_FPPKG=1
@ -4970,6 +4984,51 @@ importtl:
$(MAKE) -C importtl all
.PHONY: importtl_all importtl_debug importtl_smart importtl_release importtl_units importtl_examples importtl_shared importtl_install importtl_sourceinstall importtl_exampleinstall importtl_distinstall importtl_zipinstall importtl_zipsourceinstall importtl_zipexampleinstall importtl_zipdistinstall importtl_clean importtl_distclean importtl_cleanall importtl_info importtl_makefiles importtl
endif
ifdef TARGET_DIRS_UNICODE
unicode_all:
$(MAKE) -C unicode all
unicode_debug:
$(MAKE) -C unicode debug
unicode_smart:
$(MAKE) -C unicode smart
unicode_release:
$(MAKE) -C unicode release
unicode_units:
$(MAKE) -C unicode units
unicode_examples:
$(MAKE) -C unicode examples
unicode_shared:
$(MAKE) -C unicode shared
unicode_install:
$(MAKE) -C unicode install
unicode_sourceinstall:
$(MAKE) -C unicode sourceinstall
unicode_exampleinstall:
$(MAKE) -C unicode exampleinstall
unicode_distinstall:
$(MAKE) -C unicode distinstall
unicode_zipinstall:
$(MAKE) -C unicode zipinstall
unicode_zipsourceinstall:
$(MAKE) -C unicode zipsourceinstall
unicode_zipexampleinstall:
$(MAKE) -C unicode zipexampleinstall
unicode_zipdistinstall:
$(MAKE) -C unicode zipdistinstall
unicode_clean:
$(MAKE) -C unicode clean
unicode_distclean:
$(MAKE) -C unicode distclean
unicode_cleanall:
$(MAKE) -C unicode cleanall
unicode_info:
$(MAKE) -C unicode info
unicode_makefiles:
$(MAKE) -C unicode makefiles
unicode:
$(MAKE) -C unicode all
.PHONY: unicode_all unicode_debug unicode_smart unicode_release unicode_units unicode_examples unicode_shared unicode_install unicode_sourceinstall unicode_exampleinstall unicode_distinstall unicode_zipinstall unicode_zipsourceinstall unicode_zipexampleinstall unicode_zipdistinstall unicode_clean unicode_distclean unicode_cleanall unicode_info unicode_makefiles unicode
endif
ifdef TARGET_DIRS_FPCRESLIPO
fpcreslipo_all:
$(MAKE) -C fpcreslipo all

View File

@ -10,7 +10,7 @@ version=2.7.1
dirs=fppkg fpcm tply h2pas fprcp dxegen fpdoc fpcmkcfg pas2ut pas2fpm
programs=ppdep ptop rstconv data2inc delp bin2obj postw32 rmcvsdir
programs_linux=grab_vcsa
dirs_win32=fpmc fpcres rmwait instantfpc importtl
dirs_win32=fpmc fpcres rmwait instantfpc importtl unicode
dirs_win64=fpmc fpcres rmwait instantfpc importtl
dirs_wince=fpcres rmwait instantfpc
dirs_haiku=fpcres instantfpc

2154
utils/unicode/Makefile Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,23 @@
#
# Makefile.fpc for Unicode Utils
#
[target]
programs=cldrparser unihelper
[clean]
units=cldrhelper cldrtest cldrxml grbtree helper trie uca_test unicodeset
[require]
packages=rtl
[install]
fpcpackage=y
[default]
fpcdir=../..
[rules]
.NOTPARALLEL:
cldrparser$(EXEEXT): cldrparser.lpr cldrhelper.pas helper.pas cldrtest.pas cldrxml.pas unicodeset.pas
unihelper$(EXEEXT): unihelper.lpr helper.pas uca_test.pas

1634
utils/unicode/cldrhelper.pas Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,105 @@
<?xml version="1.0"?>
<CONFIG>
<ProjectOptions>
<Version Value="9"/>
<PathDelim Value="\"/>
<General>
<Flags>
<MainUnitHasCreateFormStatements Value="False"/>
<MainUnitHasTitleStatement Value="False"/>
</Flags>
<SessionStorage Value="InProjectDir"/>
<MainUnit Value="0"/>
<Title Value="cldrparser"/>
<UseAppBundle Value="False"/>
<ResourceType Value="res"/>
</General>
<i18n>
<EnableI18N LFM="False"/>
</i18n>
<VersionInfo>
<StringTable ProductVersion=""/>
</VersionInfo>
<BuildModes Count="1">
<Item1 Name="Default" Default="True"/>
</BuildModes>
<PublishOptions>
<Version Value="2"/>
<IncludeFileFilter Value="*.(pas|pp|inc|lfm|lpr|lrs|lpi|lpk|sh|xml)"/>
<ExcludeFileFilter Value="*.(bak|ppu|o|so);*~;backup"/>
</PublishOptions>
<RunParams>
<local>
<FormatVersion Value="1"/>
<CommandLineParams Value="de.xml search"/>
</local>
</RunParams>
<Units Count="7">
<Unit0>
<Filename Value="cldrparser.lpr"/>
<IsPartOfProject Value="True"/>
<UnitName Value="cldrparser"/>
</Unit0>
<Unit1>
<Filename Value="cldrhelper.pas"/>
<IsPartOfProject Value="True"/>
<UnitName Value="cldrhelper"/>
</Unit1>
<Unit2>
<Filename Value="..\helper.pas"/>
<IsPartOfProject Value="True"/>
<UnitName Value="helper"/>
</Unit2>
<Unit3>
<Filename Value="cldrtest.pas"/>
<IsPartOfProject Value="True"/>
<UnitName Value="cldrtest"/>
</Unit3>
<Unit4>
<Filename Value="cldrxml.pas"/>
<IsPartOfProject Value="True"/>
<UnitName Value="cldrxml"/>
</Unit4>
<Unit5>
<Filename Value="..\tests\weight_derivation.inc"/>
<IsPartOfProject Value="True"/>
</Unit5>
<Unit6>
<Filename Value="unicodeset.pas"/>
<IsPartOfProject Value="True"/>
<UnitName Value="unicodeset"/>
</Unit6>
</Units>
</ProjectOptions>
<CompilerOptions>
<Version Value="11"/>
<PathDelim Value="\"/>
<Target>
<Filename Value="cldrparser"/>
</Target>
<SearchPaths>
<IncludeFiles Value="$(ProjOutDir);..\src"/>
<OtherUnitFiles Value="..\src;..\rbtree\src;..\trie"/>
<UnitOutputDirectory Value="lib\$(TargetCPU)-$(TargetOS)"/>
</SearchPaths>
<Other>
<CompilerMessages>
<MsgFileName Value=""/>
</CompilerMessages>
<CompilerPath Value="$(CompPath)"/>
</Other>
</CompilerOptions>
<Debugging>
<Exceptions Count="3">
<Item1>
<Name Value="EAbort"/>
</Item1>
<Item2>
<Name Value="ECodetoolError"/>
</Item2>
<Item3>
<Name Value="EFOpenError"/>
</Item3>
</Exceptions>
</Debugging>
</CONFIG>

View File

@ -0,0 +1,205 @@
{ Unicode CLDR's collation parser.
Copyright (c) 2013 by Inoussa OUEDRAOGO
It creates units from CLDR's collation files.
The source code is distributed under the Library GNU
General Public License with the following modification:
- object files and libraries linked into an application may be
distributed without source code.
If you didn't receive a copy of the file COPYING, contact:
Free Software Foundation
675 Mass Ave
Cambridge, MA 02139
USA
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
}
program cldrparser;
{$mode objfpc}{$H+}
{ $define test_suite} // Define this to execute the parser test suite.
{$define actual_parsing}
uses
SysUtils, classes, getopts,
cldrhelper, helper, cldrtest, cldrxml, unicodeset;
const
SUsageText =
'This program creates pascal units from CLDR''s collation files for usage ' + sLineBreak +
'with the FreePascal Native Unicode Manager.' + sLineBreak + sLineBreak +
'Usage : cldrparser <collationFileName> [<typeName>] [-d<dataDir>] [-o<outputDir>]' + sLineBreak + sLineBreak +
' where :' + sLineBreak +
' ' + sLineBreak +
' - collationFileName : specify the target file.' + sLineBreak +
' - typeName : optional, specify the collation'' type-name to be parse;' + sLineBreak +
' If this argument is not supplied, a default type-name' + sLineBreak +
' is chosen from the type-name list in the following order : ' + sLineBreak +
' * the "default" specified type-name indicated by the collation' + sLineBreak +
' * the type named "standard" ' + sLineBreak +
' * the type named "search" ' + sLineBreak +
' * the first type.' + sLineBreak +
' - dataDir : specify the directory that contains the collation files.' + sLineBreak +
' The default value is the program''s directory.' + sLineBreak +
' - outputDir : specify the directory where the generated files will be stored.' + sLineBreak +
' The default value is the program''s directory.' + sLineBreak +
' ' + sLineBreak +
' The program expects some files to be present in the <dataDir> folder : ' + sLineBreak +
' - UCA_Rules_SHORT.xml found in the CollationAuxiliary.zip available on unicode.org' + sLineBreak +
' - allkeys.txt this is the file allkeys_CLDR.txt contained in CollationAuxiliary.zip renamed to allkeys.txt' + sLineBreak +
' The CollationAuxiliary.zip archive is provided by unicode in the "unicode collation algorithm data files" section.';
function ParseOptions(
var ADataDir, AOuputDir, ACollationFileName, ACollationTypeName : string
) : Boolean;
var
c : Char;
idx, k : Integer;
s : string;
begin
if (ParamCount() = 0) then
exit(False);
Result := True;
repeat
c := GetOpt('d:o:h');
case c of
'd' : ADataDir := ExpandFileName(Trim(OptArg));
'o' : AOuputDir := ExpandFileName(Trim(OptArg));
'h', '?' :
begin
WriteLn(SUsageText);
Result := False;
end;
end;
until (c = EndOfOptions);
idx := 0;
for k := 1 to ParamCount() do begin
s := Trim(ParamStr(k));
if (s <> '') and (s[1] <> '-') then begin
if (idx = 0) then
ACollationFileName := s
else if (idx = 1) then
ACollationTypeName := s;
Inc(idx);
if (idx >= 2) then
Break;
end;
end;
end;
var
orderedChars : TOrderedCharacters;
ucaBook : TUCA_DataBook;
stream, endianStream : TMemoryStream;
s, collationFileName, collationTypeName : string;
i , c: Integer;
collation : TCldrCollation;
dataPath, outputPath : string;
begin
{$ifdef test_suite}
exec_tests();
{$endif test_suite}
{$ifdef actual_parsing}
dataPath := '';
outputPath := '';
collationFileName := '';
collationTypeName := '';
if not ParseOptions(dataPath,outputPath,collationFileName,collationTypeName) then
Halt(1);
if (dataPath <> '') and not(DirectoryExists(dataPath)) then begin
WriteLn('This directory does not exist : ',dataPath);
Halt(1);
end;
if (dataPath = '') then
dataPath := ExtractFilePath(ParamStr(0))
else
dataPath := IncludeTrailingPathDelimiter(dataPath);
if (outputPath = '') then
outputPath := dataPath
else
outputPath := IncludeTrailingPathDelimiter(outputPath);
if (ParamCount() = 0) then begin
WriteLn(SUsageText);
Halt(1);
end;
if not(
FileExists(dataPath+'UCA_Rules_SHORT.xml') and
FileExists(dataPath+'allkeys.txt')
)
then begin
WriteLn(Format('File not found : %s or %s.',[dataPath+'UCA_Rules_SHORT.xml',dataPath+'allkeys.txt']));
Halt(1);
end;
collationFileName := dataPath + collationFileName;
if not FileExists(collationFileName) then begin
WriteLn('File not found : "',collationFileName,'"');
Halt(1);
end;
WriteLn(sLineBreak,'Collation Parsing ',QuotedStr(collationFileName),' ...');
stream := nil;
endianStream := nil;
collation := TCldrCollation.Create();
try
ParseCollationDocument(collationFileName,collation);
WriteLn(Format(' Collation Count = %d',[collation.ItemCount]));
if (collation.ItemCount = 0) then begin
WriteLn('No collation in this file.');
end else begin
for i := 0 to collation.ItemCount - 1 do
WriteLn(Format(' Item[%d] = %d "resets"; Type = %s',[i, Length(collation.Items[i].Rules),collation.Items[i].TypeName]));
if (collation.Find(collationTypeName) = nil) then
collationTypeName := FindCollationDefaultItemName(collation);
WriteLn('Collation Item Name : ',collationTypeName);
s := dataPath + 'UCA_Rules_SHORT.xml';
WriteLn;
WriteLn('Parsing ',QuotedStr(s),' ...');
FillByte(orderedChars,SizeOf(orderedChars),0);
orderedChars.Clear();
ParseInitialDocument(@orderedChars,s);
WriteLn('File parsed, ',orderedChars.ActualLength,' characters.');
WriteLn('Loading CLDR root''s key table ...');
stream := TMemoryStream.Create();
s := dataPath + 'allkeys.txt';
stream.LoadFromFile(s);
ParseUCAFile(stream,ucaBook);
c := FillInitialPositions(@orderedChars.Data[0],orderedChars.ActualLength,ucaBook.Lines);
if (c > 0) then
WriteLn(' Missed Initial Positions = ',c);
WriteLn(' Loaded.');
WriteLn('Start generation ...');
stream.Clear();
endianStream := TMemoryStream.Create();
s := COLLATION_FILE_PREFIX + ChangeFileExt(LowerCase(ExtractFileName(collationFileName)),'.pas');
GenerateCdlrCollation(
collation,collationTypeName,s,stream,endianStream,
orderedChars,ucaBook.Lines
);
stream.SaveToFile(ExtractFilePath(collationFileName)+s);
if (endianStream.Size > 0) then
endianStream.SaveToFile(ExtractFilePath(collationFileName)+GenerateEndianIncludeFileName(s));
end;
finally
endianStream.Free();
stream.Free();
collation.Free();
end;
{$endif actual_parsing}
WriteLn(sLineBreak,'Finished.');
end.

1920
utils/unicode/cldrtest.pas Normal file

File diff suppressed because it is too large Load Diff

634
utils/unicode/cldrxml.pas Normal file
View File

@ -0,0 +1,634 @@
{ Parser of the CLDR collation xml files.
Copyright (c) 2013 by Inoussa OUEDRAOGO
The source code is distributed under the Library GNU
General Public License with the following modification:
- object files and libraries linked into an application may be
distributed without source code.
If you didn't receive a copy of the file COPYING, contact:
Free Software Foundation
675 Mass Ave
Cambridge, MA 02139
USA
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
}
unit cldrxml;
{$mode objfpc}{$H+}
{$TypedAddress on}
interface
uses
Classes, SysUtils, DOM,
cldrhelper;
procedure ParseInitialDocument(ASequence : POrderedCharacters; ADoc : TDOMDocument);overload;
procedure ParseInitialDocument(ASequence : POrderedCharacters; AFileName : string);overload;
procedure ParseCollationDocument(ADoc : TDOMDocument; ACollation : TCldrCollation);
procedure ParseCollationDocument(const AFileName : string; ACollation : TCldrCollation);
resourcestring
sCaseNothandled = 'This case is not handled : "%s", Position = %d.';
sCodePointExpected = 'Code Point node expected as child at this position "%d".';
sCollationsNodeNotFound = '"collations" node not found.';
sHexAttributeExpected = '"hex" attribute expected at this position "%d".';
sInvalidResetClause = 'Invalid "Reset" clause.';
sNodeNameAssertMessage = 'Expected NodeName "%s", got "%s".';
sRulesNodeNotFound = '"rules" node not found.';
sTextNodeChildExpected = '(Child) text node expected at this position "%d", but got "%s".';
sUniqueChildNodeExpected = 'Unique child node expected at this position "%d".';
sUnknownResetLogicalPosition = 'Unknown reset logical position : "%s".';
implementation
uses
typinfo, XMLRead, XPath, Helper, unicodeset;
const
s_AT = 'at';
s_BEFORE = 'before';
s_CODEPOINT = 'codepoint';
s_COLLATION = 'collation';
s_COLLATIONS = 'collations';
s_CONTEXT = 'context';
s_DEFAULT = 'default';
s_EXTEND = 'extend';
s_HEX = 'hex';
s_POSITION = 'position';
s_RESET = 'reset';
s_RULES = 'rules';
s_STANDART = 'standard';
s_TYPE = 'type';
procedure CheckNodeName(ANode : TDOMNode; const AExpectedName : DOMString);
begin
if (ANode.NodeName <> AExpectedName) then
raise Exception.CreateFmt(sNodeNameAssertMessage,[AExpectedName,ANode.NodeName]);
end;
function CharToReorderWeigthKind(const AChar : Char) : TReorderWeigthKind;inline;
begin
case AChar of
'p' : Result := TReorderWeigthKind.PriMary;
's' : Result := TReorderWeigthKind.Secondary;
't' : Result := TReorderWeigthKind.Tertiary;
'i' : Result := TReorderWeigthKind.Identity;
else
Result := TReorderWeigthKind.Identity;
end;
end;
function DomString2UnicodeCodePointArray(const AValue : DOMString): TUnicodeCodePointArray;
var
u4str : UCS4String;
k : Integer;
begin
if (Length(AValue) = 0) then
exit(nil);
if (Length(AValue) = 1) then begin
SetLength(Result,1);
Result[0] := Ord(AValue[1])
end else begin
u4str := WideStringToUCS4String(AValue);
k := Length(u4str) - 1; // remove the last #0
SetLength(Result,k);
for k := 0 to k - 1 do
Result[k] := u4str[k];
end;
end;
function TryStrToLogicalReorder(
const AValue : string;
out AResult : TReorderLogicalReset
) : Boolean;
var
s : string;
i : Integer;
begin
s := StringReplace(AValue,' ','',[rfReplaceAll]);
s := StringReplace(s,'_','',[rfReplaceAll]);
i := GetEnumValue(TypeInfo(TReorderLogicalReset),s);
Result := (i > -1);
if Result then
AResult := TReorderLogicalReset(i);
end;
function ParseStatement(
ARules : TDOMElement;
AStartPosition : Integer;
AStatement : PReorderSequence;
var ANextPos : Integer
) : Boolean;
var
startPosition : Integer;
statement : PReorderSequence;
elementActualCount : Integer;
list : TDOMNodeList;
inBlock : Boolean;
procedure SkipComments();
begin
while (startPosition < list.Count) do begin
if (list[startPosition].NodeType <> COMMENT_NODE) then
Break;
Inc(startPosition);
end;
end;
function parse_reset() : Integer;
var
n, t : TDOMNode;
s : string;
logicalPos : TReorderLogicalReset;
begin
SkipComments();
n := list[startPosition];
CheckNodeName(n,s_RESET);
if n.HasChildNodes() then begin
n := n.FirstChild;
if (n.NodeType = TEXT_NODE) then begin
statement^.Reset := DomString2UnicodeCodePointArray(Trim(TDOMText(n).Data));
Result := startPosition+1;
end else begin
if not TryStrToLogicalReorder(n.NodeName,logicalPos) then
raise Exception.CreateFmt(sUnknownResetLogicalPosition,[n.NodeName]);
statement^.LogicalPosition := logicalPos;
Result := startPosition+1;
end;
end else if not n.HasChildNodes() then begin
if (list[startPosition+1].NodeName = s_POSITION) then begin
s := list[startPosition+1].Attributes.GetNamedItem(s_AT).NodeValue;
if not TryStrToLogicalReorder(s,logicalPos) then
raise Exception.CreateFmt(sUnknownResetLogicalPosition,[s]);
statement^.LogicalPosition := logicalPos;
Result := startPosition+2;
end else begin
t := list[startPosition+1];
{if (t.NodeType <> TEXT_NODE) then
raise Exception.CreateFmt(sTextNodeChildExpected,[(startPosition+1),(t.NodeName+'('+t.ClassName+')')]);}
if (t.NodeType = TEXT_NODE) then
statement^.Reset := DomString2UnicodeCodePointArray(Trim(TDOMText(t).Data))
else
statement^.Reset := DomString2UnicodeCodePointArray(' ');
Result := startPosition+2;
end;
end;
if (statement^.LogicalPosition = TReorderLogicalReset.None) and
(Length(statement^.Reset) = 0)
then
raise Exception.Create(sInvalidResetClause);
end;
procedure EnsureElementLength(const ALength : Integer);
var
k, d : Integer;
begin
k := Length(statement^.Elements);
if (k < ALength) then begin
k := ALength;
if (k = 0) then begin
k := 50;
end else begin
if (k < 10) then
d := 10
else
d := 2;
k := k * d;
end;
SetLength(statement^.Elements,k);
end;
end;
{procedure AddElement(AText : DOMString; AWeigthKind : TReorderWeigthKind);overload;
var
u4str : UCS4String;
k : Integer;
kp : PReorderUnit;
begin
u4str := WideStringToUCS4String(AText);
EnsureElementLength(elementActualCount+1);
kp := @statement^.Elements[elementActualCount];
k := Length(u4str) - 1{null terminated};
SetLength(kp^.Characters,k);
for k := 0 to k - 1 do
kp^.Characters[k] := u4str[k];
kp^.WeigthKind:= AWeigthKind;
elementActualCount := elementActualCount + 1;
end;}
procedure AddElement(
const AChars : array of UCS4Char;
const AWeigthKind : TReorderWeigthKind;
const AContext : DOMString
);overload;
var
kp : PReorderUnit;
k : Integer;
begin
EnsureElementLength(elementActualCount+1);
kp := @statement^.Elements[elementActualCount];
SetLength(kp^.Characters,Length(AChars));
for k := 0 to Length(AChars) - 1 do
kp^.Characters[k] := AChars[k];
kp^.WeigthKind := AWeigthKind;
elementActualCount := elementActualCount + 1;
if (AContext <> '') then
kp^.Context := DomString2UnicodeCodePointArray(AContext);
end;
procedure ReadChars(
ANode : TDOMNode;
APos : Integer;
var AChars : UCS4String
);
var
t : TDOMNode;
u4str : UCS4String;
s : DOMString;
begin
if not ANode.HasChildNodes() then begin
SetLength(AChars,1);
AChars[0] := Ord(UnicodeChar(' '));
exit;
//raise Exception.CreateFmt(sCodePointExpected + ANode.ClassName,[APos]);
end;
t := ANode.FindNode(s_CODEPOINT);
if (t = nil) then begin
if (ANode.ChildNodes.Count <> 1) then
raise Exception.CreateFmt(sUniqueChildNodeExpected,[APos]);
t := ANode.ChildNodes[0];
if not t.InheritsFrom(TDOMText) then
raise Exception.CreateFmt(sTextNodeChildExpected,[APos,(t.NodeName+'('+t.ClassName+')')]);
s := TDOMText(t).Data;
if (Length(s) = 1) then begin
SetLength(AChars,1);
AChars[0] := Ord(s[1]);
end else begin
u4str := WideStringToUCS4String(s);
AChars := u4str;
SetLength(AChars,Length(AChars)-1);
end;
end else begin
t := t.Attributes.GetNamedItem(s_HEX);
if (t = nil) then
raise Exception.CreateFmt(sHexAttributeExpected,[APos]);
SetLength(AChars,1);
AChars[0] := StrToInt('$'+t.NodeValue);
end
end;
procedure AddPrefixChars(const APrefix : array of UCS4Char; var ADest : TUnicodeCodePointArray);
var
k : Integer;
begin
k := Length(ADest);
SetLength(ADest,(k+Length(APrefix)));
Move(ADest[0],ADest[k+1],(SizeOf(k*ADest[0])));
for k := 0 to k - 1 do
ADest[k] := APrefix[k];
end;
function ReadNextItem(const APos : Integer) : Integer;
var
n, t : TDOMNode;
s, contextStr : DOMString;
w : TReorderWeigthKind;
isSimpleCharTag : Boolean;
simpleCharTag : AnsiChar;
last : PReorderUnit;
u4str : UCS4String;
k : Integer;
begin
contextStr := '';
Result := APos;
n := list[APos];
isSimpleCharTag := (Length(n.NodeName) = 1) and (Ord(n.NodeName[1])<=127);
if isSimpleCharTag then begin
simpleCharTag := AnsiChar(n.NodeName[1]);
if (simpleCharTag = 'x') then begin
inBlock := True;
n := n.FirstChild;
if (n.NodeName = s_CONTEXT) then begin
if n.HasChildNodes() then begin
t := n.FirstChild;
if (t.NodeType = TEXT_NODE) then
contextStr := TDOMText(t).Data;
end;
n := n.NextSibling;
end;
isSimpleCharTag := (Length(n.NodeName) = 1) and (Ord(n.NodeName[1])<=127);
if isSimpleCharTag then
simpleCharTag := AnsiChar(n.NodeName[1]);
end;
end;
if isSimpleCharTag and (simpleCharTag in ['p','s','t','i']) then begin
w := CharToReorderWeigthKind(AnsiChar(n.NodeName[1]));
ReadChars(n,APos,u4str);
AddElement(u4str,w,contextStr);
Result := Result + 1;
if not inBlock then
exit;
last := @statement^.Elements[elementActualCount-1];
n := n.NextSibling;
if (n <> nil) and (n.NodeName = s_EXTEND) then begin
ReadChars(n,APos,u4str);
SetLength(last^.ExpansionChars,Length(u4str));
for k := 0 to Length(u4str) - 1 do
last^.ExpansionChars[k] := u4str[k];
end;
exit;
end;
if (Length(n.NodeName) = 2) and (n.NodeName[2] = 'c') and
(Ord(n.NodeName[1])<=127) and (AnsiChar(n.NodeName[1]) in ['p','s','t','i'])
then begin
w := CharToReorderWeigthKind(AnsiChar(n.NodeName[1]));
ReadChars(n,APos,u4str);
for k := Low(u4str) to High(u4str) do
AddElement(u4str[k],w,contextStr);
Result := Result + 1;
exit;
end;
raise Exception.CreateFmt(sCaseNothandled,[n.NodeName,APos]);
end;
var
i, c : Integer;
n : TDOMNode;
begin
Result := False;
inBlock := False;
elementActualCount := 0;
if (AStartPosition <= 0) then
startPosition := 0
else
startPosition := AStartPosition;
i := startPosition;
list := ARules.ChildNodes;
c := list.Count;
if (c <= i) then
exit;
statement := AStatement;
statement^.Clear();
n := list[i];
i := parse_reset();
while (i < c) do begin
n := list[i];
if (n.NodeName = s_RESET) then
Break;
i := ReadNextItem(i);
end;
SetLength(statement^.Elements,elementActualCount);
Result := (i > startPosition);
if Result then
ANextPos := i;
end;
procedure ParseInitialDocument(ASequence : POrderedCharacters; ADoc : TDOMDocument);
var
n : TDOMNode;
rulesElement : TDOMElement;
i, c, nextPost : Integer;
statement : TReorderSequence;
p : PReorderUnit;
begin
n := ADoc.DocumentElement.FindNode(s_RULES);
if (n = nil) then
raise Exception.Create(sRulesNodeNotFound);
rulesElement := n as TDOMElement;
c := rulesElement.ChildNodes.Count;
ASequence^.Clear();
SetLength(ASequence^.Data,c+100);
nextPost := 0;
i := 0;
while (i < c) do begin
statement.Clear();
if not ParseStatement(rulesElement,i,@statement,nextPost) then
Break;
i := nextPost;
try
ASequence^.ApplyStatement(@statement);
except
on e : Exception do begin
e.Message := Format('%s Position = %d',[e.Message,i]);
raise;
end;
end;
end;
if (ASequence^.ActualLength > 0) then begin
p := @ASequence^.Data[0];
for i := 0 to ASequence^.ActualLength - 1 do begin
p^.Changed := False;
Inc(p);
end;
end;
end;
procedure ParseInitialDocument(ASequence : POrderedCharacters; AFileName : string);
var
doc : TXMLDocument;
begin
ReadXMLFile(doc,AFileName);
try
ParseInitialDocument(ASequence,doc);
finally
doc.Free();
end;
end;
function EvaluateXPathStr(const AExpression : string; AContextNode : TDOMNode): DOMString;
var
xv : TXPathVariable;
begin
xv := EvaluateXPathExpression(AExpression,AContextNode);
try
if (xv <> nil) then
Result := xv.AsText
else
Result := '';
finally
xv.Free();
end;
end;
function ParseDeletion(
const APattern : DOMString;
ASequence : PReorderSequence
) : Integer;
var
r : array of TReorderUnit;
c, i : Integer;
uset : TUnicodeSet;
it : TUnicodeSet.TIterator;
p : PReorderUnit;
begin
if (APattern = '') then
exit(0);
it := nil;
uset := TUnicodeSet.Create();
try
uset.AddPattern(APattern);
it := uset.CreateIterator();
c := 0;
it.Reset();
while it.MoveNext() do begin
Inc(c);
end;
SetLength(r,c);
p := @r[0];
i := 0;
it.Reset();
while it.MoveNext() do begin
p^.Clear();
p^.WeigthKind := TReorderWeigthKind.Deletion;
p^.Characters := Copy(it.GetCurrent());
Inc(p);
Inc(i);
end;
ASequence^.Clear();
ASequence^.Elements := r;
finally
it.Free();
uset.Free();
end;
SetLength(r,0);
end;
procedure ParseCollationItem(ACollationNode : TDOMElement; AItem : TCldrCollationItem);
var
n : TDOMNode;
rulesElement : TDOMElement;
i, c, nextPos : Integer;
statementList : TReorderSequenceArray;
sal : Integer;//statement actual length
statement : PReorderSequence;
s : DOMString;
begin
AItem.TypeName := ACollationNode.GetAttribute(s_TYPE);
AItem.Base := EvaluateXPathStr('base',ACollationNode);
AItem.Backwards := (EvaluateXPathStr('settings/@backwards',ACollationNode) = 'on');
if AItem.Backwards then
AItem.ChangedFields := AItem.ChangedFields + [TCollationField.BackWard];
SetLength(statementList,15);
sal := 0;
statement := @statementList[0];
s := EvaluateXPathStr('suppress_contractions',ACollationNode);
if (s <> '') then begin
if (ParseDeletion(s,statement) > 0) then begin
Inc(sal);
Inc(statement);
end else begin
statement^.Clear();
end;
end;
n := ACollationNode.FindNode(s_RULES);
if (n <> nil) then begin
rulesElement := n as TDOMElement;
c := rulesElement.ChildNodes.Count;
nextPos := 0;
i := 0;
while (i < c) do begin
statement^.Clear();
if not ParseStatement(rulesElement,i,statement,nextPos) then
Break;
i := nextPos;
Inc(statement);
Inc(sal);
if (sal >= Length(statementList)) then begin
SetLength(statementList,(sal*2));
statement := @statementList[(sal-1)];
end;
end;
end;
SetLength(statementList,sal);
AItem.Rules := statementList;
end;
procedure ParseCollationDocument(ADoc : TDOMDocument; ACollation : TCldrCollation);
var
rulesNodes, n : TDOMNode;
collationsElement, rulesElement : TDOMElement;
i, c : Integer;
item : TCldrCollationItem;
nl : TDOMNodeList;
begin
n := ADoc.DocumentElement.FindNode(s_COLLATIONS);
if (n = nil) then
raise Exception.Create(sCollationsNodeNotFound);
collationsElement := n as TDOMElement;
ACollation.Clear();
ACollation.Language := EvaluateXPathStr('identity/language/@type',ADoc.DocumentElement);
ACollation.Version := EvaluateXPathStr('identity/version/@number',ADoc.DocumentElement);
ACollation.DefaultType := EvaluateXPathStr('collations/default/@type',ADoc.DocumentElement);
if collationsElement.HasChildNodes() then begin
nl := collationsElement.ChildNodes;
c := nl.Count;
item := nil;
try
for i := 0 to c - 1 do begin
n := nl[i];
if (n.NodeName = s_COLLATION) then begin
item := TCldrCollationItem.Create();
ParseCollationItem((n as TDOMElement),item);
ACollation.Add(item);
item := nil;
end
end;
except
FreeAndNil(item);
raise;
end;
end;
end;
function ReadXMLFile(f: TStream) : TXMLDocument;
var
src : TXMLInputSource;
parser: TDOMParser;
begin
src := TXMLInputSource.Create(f);
Result := TXMLDocument.Create;
parser := TDOMParser.Create();
try
parser.Options.IgnoreComments := True;
parser.Parse(src, Result);
finally
src.Free();
parser.Free;
end;
end;
function ReadXMLFile(const AFilename: String) : TXMLDocument;
var
FileStream: TStream;
begin
Result := nil;
FileStream := TFileStream.Create(AFilename, fmOpenRead+fmShareDenyWrite);
try
Result := ReadXMLFile(FileStream);
finally
FileStream.Free;
end;
end;
procedure ParseCollationDocument(const AFileName : string; ACollation : TCldrCollation);
var
doc : TXMLDocument;
begin
doc := ReadXMLFile(AFileName);
try
ParseCollationDocument(doc,ACollation);
ACollation.LocalID := ExtractFileName(ChangeFileExt(AFileName,''));
finally
doc.Free();
end;
end;
end.

View File

@ -0,0 +1,13 @@
This folder requires the next files to be present:
Extracted from http://www.unicode.org/Public/6.2.0/ucd/UCD.zip:
* UnicodeData.txt
* HangulSyllableType.txt
* PropList.txt
Extracted from http://www.unicode.org/Public/UCA/6.2.0/CollationAuxiliary.zip:
* allkeys.txt : this file is actually the allkeys_CLDR.txt file renamed. It is the CLDR's root collation.
* UCA_Rules_SHORT.xml
Extracted from http://www.unicode.org/Public/cldr/22/core.zip (see the "common\collation" folder):
* all the language specific xml files (de.xml, es.xml, ...)

667
utils/unicode/grbtree.pas Normal file
View File

@ -0,0 +1,667 @@
{ Red Black Tree implementation.
Copyright (c) 2013 by Inoussa OUEDRAOGO
Inspired by ideas of Julienne Walker
see http://www.eternallyconfuzzled.com/tuts/datastructures/jsw_tut_bst1.aspx
The source code is distributed under the Library GNU
General Public License with the following modification:
- object files and libraries linked into an application may be
distributed without source code.
If you didn't receive a copy of the file COPYING, contact:
Free Software Foundation
675 Mass Ave
Cambridge, MA 02139
USA
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. }
unit grbtree;
{$ifdef FPC}
{$mode delphi}
{$H+}
{$endif FPC}
{$TYPEDADDRESS ON}
{$define RB_DEBUG}
interface
const
HEIGHT_LIMIT = 64;
type
{KCOMP = class
public
// Return
// * if A>B then 1
// * if A=B then 0
// * if A<B then -1
class function Compare(const A, B : TRBTreeNodeData) : Integer;
end; }
TRBTree<T, KCOMP> = class
public type
TRBTreeNodeData = T;
PRBTreeNode = ^TRBTreeNode;
PRBTreeAllocator = ^TRBTreeAllocator;
TRBTreeNode = record
Links : array[Boolean] of PRBTreeNode;
Data : TRBTreeNodeData;
Red : Boolean;
end;
TRBTreeNodeCreator = function(AContext : Pointer) : PRBTreeNode;
TRBTreeNodeDestructor = procedure(ANode : PRBTreeNode; AContext : Pointer);
TRBTreeAllocator = record
CreateNode : TRBTreeNodeCreator;
FreeNode : TRBTreeNodeDestructor;
end;
TRBTreeNodeComparator = KCOMP;
ThisType = TRBTree<T,KCOMP>;
private type
TBaseIterator = record
Tree : ThisType;
StartingNode : PRBTreeNode;
StartingDir : Boolean;
Current : PRBTreeNode;
Top : NativeInt;
Path : array[0..(HEIGHT_LIMIT-1)] of PRBTreeNode;
end;
PBaseIterator = ^TBaseIterator;
public type
TIterator = class
private
FHandle : PBaseIterator;
FResetState : Boolean;
private
public
constructor Create(AHandle : PBaseIterator);
destructor Destroy;override;
procedure Reset();
function MoveNext() : Boolean;inline;
function MovePrevious() : Boolean;inline;
function GetCurrent : TRBTreeNodeData;inline;
function GetCurrentNode : PRBTreeNode;inline;
end;
public var
Root : PRBTreeNode;
//FSize : Integer;
Allocator : TRBTreeAllocator;
Comparator : TRBTreeNodeComparator;
private
class function TreeCreateIterator() : PBaseIterator;static;inline;
class procedure TreeFreeIterator(AItem : PBaseIterator);static;inline;
class procedure TreeInitIterator(
AIterator : PBaseIterator;
const ATree : ThisType;
const AStartingNode : PRBTreeNode;
const ADirection : Boolean
);static;
class function TreeIteratorMove(
AIterator : PBaseIterator;
ADirection : Boolean
) : PRBTreeNode;static;
class function TreeIteratorMoveNext(AIterator : PBaseIterator) : PRBTreeNode;static;inline;
class function TreeIteratorMovePrevious(AIterator : PBaseIterator) : PRBTreeNode;static;inline;
function CreateIterator(
const ANode : PRBTreeNode;
const ADirection : Boolean
) : TIterator;inline;
private
class function DefaultCreateNode(AContext : Pointer) : PRBTreeNode;static;
class procedure DefaultFreeNode(ANode : PRBTreeNode; AContext : Pointer);static;
function InitNode(ANode : PRBTreeNode; AData : TRBTreeNodeData) : PRBTreeNode;inline;
function IsRed(ANode : PRBTreeNode): Boolean;inline;
function RotateDouble(ARoot : PRBTreeNode; const ADir : Boolean) : PRBTreeNode;inline;
function RotateSingle(ARoot : PRBTreeNode; const ADir : Boolean) : PRBTreeNode;
public
constructor Create(const AAllocator : PRBTreeAllocator);overload;
constructor Create();overload;
destructor Destroy;override;
procedure Clear();
function FindNode(const AData : TRBTreeNodeData) : PRBTreeNode;
function Insert(const AData : TRBTreeNodeData) : PRBTreeNode;
function Remove(const AData : TRBTreeNodeData) : Boolean;
function CreateForwardIterator(const ANode : PRBTreeNode) : TIterator;overload;inline;
function CreateForwardIterator() : TIterator;overload;inline;
function CreateBackwardIterator(const ANode : PRBTreeNode) : TIterator;overload;inline;
function CreateBackwardIterator() : TIterator;overload;inline;
{$ifdef RB_DEBUG}
function SelfAssert(ARoot : PRBTreeNode; var AErrorMessage : string) : Boolean;overload;
function SelfAssert(var AErrorMessage : string) : Boolean;overload;
{$endif RB_DEBUG}
end;
TOrdinalComparator<T> = class
public type
TOrdinalType = T;
public
// Return
// * if A>B then 1
// * if A=B then 0
// * if A<B then -1
class function Compare(const A, B : TOrdinalType) : Integer;static;inline;
end;
implementation
{ TRBTree<T> }
function TRBTree<T,KCOMP>.IsRed(ANode : PRBTreeNode): Boolean;inline;
begin
Result := (ANode <> nil) and ANode^.Red;
end;
function TRBTree<T,KCOMP>.InitNode(ANode: PRBTreeNode; AData: TRBTreeNodeData): PRBTreeNode;inline;
begin
Result := ANode;
Result^.Data := AData;
Result^.Red := True;
Result^.Links[False] := nil;
Result^.Links[True] := nil;
end;
function TRBTree<T,KCOMP>.RotateDouble(ARoot: PRBTreeNode; const ADir: Boolean): PRBTreeNode;inline;
begin
ARoot^.Links[not ADir] := RotateSingle(ARoot^.Links[not ADir], not ADir );
Result := RotateSingle(ARoot,ADir);
end;
function TRBTree<T,KCOMP>.RotateSingle(ARoot: PRBTreeNode; const ADir: Boolean): PRBTreeNode;
var
t : PRBTreeNode;
begin
t := ARoot^.Links[not ADir];
ARoot^.Links[not ADir] := t^.Links[ADir];
t^.Links[ADir] := ARoot;
ARoot^.Red := True;
t^.Red := False;
Result := t;
end;
class function TRBTree<T,KCOMP>.TreeCreateIterator() : PBaseIterator;static;
begin
Result := AllocMem(SizeOf(Result^));
end;
class procedure TRBTree<T,KCOMP>.TreeFreeIterator(AItem : PBaseIterator);static;
begin
if (AItem <> nil) then
FreeMem(AItem,SizeOf(AItem^));
end;
class procedure TRBTree<T,KCOMP>.TreeInitIterator(
AIterator : PBaseIterator;
const ATree : ThisType;
const AStartingNode : PRBTreeNode;
const ADirection : Boolean
);static;
begin
AIterator^.Tree := ATree;
AIterator^.StartingNode := AStartingNode;
AIterator^.StartingDir := ADirection;
if (AStartingNode = nil) then
AIterator^.Current := AIterator^.Tree.Root
else
AIterator^.Current := AStartingNode;
AIterator^.Top := 0;
// Save the path for later traversal
if (AIterator^.Current <> nil) then begin
while (AIterator^.Current^.Links[ADirection] <> nil) do begin
AIterator^.Path[AIterator^.Top] := AIterator^.Current;
Inc(AIterator^.Top);
AIterator^.Current := AIterator^.Current^.Links[ADirection];
end;
end;
end;
class function TRBTree<T,KCOMP>.TreeIteratorMove(
AIterator : PBaseIterator;
ADirection : Boolean
) : PRBTreeNode;static;
var
last : PRBTreeNode;
begin
Result := nil;
if (AIterator^.Current = nil) then
exit;
if (AIterator^.Current^.Links[ADirection] <> nil) then begin
// Continue down this branch
AIterator^.Path[AIterator^.Top] := AIterator^.Current;
Inc(AIterator^.Top);
AIterator^.Current := AIterator^.Current^.Links[ADirection];
while ( AIterator^.Current^.Links[not ADirection] <> nil) do begin
AIterator^.Path[AIterator^.Top] := AIterator^.Current;
Inc(AIterator^.Top);
AIterator^.Current := AIterator^.Current^.Links[not ADirection];
end;
end else begin
// Move to the next branch
repeat
if (AIterator^.Top = 0) then begin
AIterator^.Current := nil;
break;
end;
last := AIterator^.Current;
Dec(AIterator^.Top);
AIterator^.Current := AIterator^.Path[AIterator^.Top];
until (last <> AIterator^.Current^.Links[ADirection]);
end;
Result := AIterator^.Current;
end;
class function TRBTree<T,KCOMP>.TreeIteratorMoveNext(
AIterator : PBaseIterator
) : PRBTreeNode;static;
begin
Result := TreeIteratorMove(AIterator,True);
end;
class function TRBTree<T,KCOMP>.TreeIteratorMovePrevious(
AIterator : PBaseIterator
) : PRBTreeNode;static;
begin
Result := TreeIteratorMove(AIterator,False);
end;
function TRBTree<T,KCOMP>.CreateIterator(
const ANode : PRBTreeNode;
const ADirection : Boolean
) : TIterator;
var
h : PBaseIterator;
begin
h := TreeCreateIterator();
TreeInitIterator(h,Self,ANode,ADirection);
Result := TIterator.Create(h);
end;
class function TRBTree<T,KCOMP>.DefaultCreateNode(AContext: Pointer): PRBTreeNode;
begin
New(Result);
end;
class procedure TRBTree<T,KCOMP>.DefaultFreeNode(ANode: PRBTreeNode; AContext: Pointer);
begin
Dispose(ANode);
end;
constructor TRBTree<T,KCOMP>.Create(const AAllocator : PRBTreeAllocator);
begin
Root := nil;
Allocator := AAllocator^;
//Comparator := TRBTreeNodeComparator.Create();
end;
constructor TRBTree < T, KCOMP > .Create();
var
a : TRBTreeAllocator;
begin
a.CreateNode := TRBTreeNodeCreator(DefaultCreateNode);
a.FreeNode := TRBTreeNodeDestructor(DefaultFreeNode);
Create(@a);
end;
destructor TRBTree<T,KCOMP>.Destroy;
begin
Clear();
//Comparator.Free();
inherited;
end;
procedure TRBTree<T,KCOMP>.Clear();
var
it, save : PRBTreeNode;
begin
it := Root;
while (it <> nil) do begin
if (it^.Links[False] <> nil) then begin
// Right rotation
save := it^.Links[False];
it^.Links[False] := save^.Links[True];
save^.Links[True] := it;
end else begin
save := it^.Links[True];
Allocator.FreeNode(it,Self);
end;
it := save;
end;
end;
function TRBTree<T,KCOMP>.FindNode(const AData: TRBTreeNodeData): PRBTreeNode;
var
it : PRBTreeNode;
cp : TRBTreeNodeComparator;
dir : Boolean;
begin
Result := nil;
it := Root;
if (it = nil) then
exit;
cp := Comparator;
while (it <> nil) do begin
if (cp.Compare(it^.Data,AData) = 0) then begin
Result := it;
Break;
end;
dir := (cp.Compare(it^.Data,AData) < 0);
it := it^.Links[dir];
end;
end;
function TRBTree<T,KCOMP>.Insert(const AData: TRBTreeNodeData): PRBTreeNode;
var
head : TRBTreeNode;
g, t : PRBTreeNode; // Grandparent & parent
p, q : PRBTreeNode; // Iterator & parent
dir, last, dir2 : Boolean;
cp : TRBTreeNodeComparator;
begin
if (Root = nil) then begin
// Empty tree case
Root := InitNode(Allocator.CreateNode(Self),AData);
Result := Root;
end else begin
FillChar(head,SizeOf(head),0); // False tree root
dir := False;
last := False;
// Set up helpers
t := @head;
g := nil;
p := nil;
t^.Links[True] := Root;
q := t^.Links[True];
// Search down the tree
cp := Comparator;
while True do begin
if (q = nil) then begin
// Insert new node at the bottom
q := InitNode(Allocator.CreateNode(Self),AData);
p^.Links[dir] := q;
end else if IsRed(q^.Links[False]) and IsRed(q^.Links[True]) then begin
// Color flip
q^.Red := True;
q^.Links[False]^.Red := False;
q^.Links[True]^.Red := False;
end;
// Fix red violation
if IsRed(q) and IsRed(p) then begin
dir2 := (t^.Links[True] = g);
if (q = p^.Links[last]) then
t^.Links[dir2] := RotateSingle(g, not last)
else
t^.Links[dir2] := RotateDouble(g, not last );
end;
// Stop if found
if (cp.Compare(q^.Data,AData) = 0) then
break;
last := dir;
dir := (cp.Compare(q^.Data,AData) < 0);
// Update helpers
if (g <> nil) then
t := g;
g := p;
p := q;
q := q^.Links[dir];
end;
// Update root
Root := head.Links[True];
end;
// Make root black
Root^.Red := False;
end;
function TRBTree<T,KCOMP>.Remove(const AData: TRBTreeNodeData): Boolean;
var
head : TRBTreeNode;
q, p, g, f, s : PRBTreeNode;
dir, last, dir2 : Boolean;
cp : TRBTreeNodeComparator;
begin
Result := False;
if (Root = nil) then
exit;
FillChar(head,SizeOf(head),0); // False tree root
f := nil;
dir := True;
// Set up helpers
q := @head;
p := nil;
g := nil;
q^.Links[True] := Root;
// Search and push a red down
cp := Comparator;
while (q^.Links[dir] <> nil) do begin
last := dir;
// Update helpers
g := p;
p := q;
q := q^.Links[dir];
dir := (cp.Compare(q^.Data,AData) < 0);
// Save found node
if (cp.Compare(q^.Data,AData) = 0) then
f := q;
// Push the red node down
if not(IsRed(q)) and not(IsRed(q^.Links[dir])) then begin
if IsRed(q^.Links[not dir]) then begin
p^.Links[last] := RotateSingle(q,dir);
p := p^.Links[last];
end else if not IsRed(q^.Links[not dir]) then begin
s := p^.Links[not last];
if (s <> nil) then begin
if not(IsRed(s^.Links[not last])) and not(IsRed(s^.Links[last])) then begin
// Color flip
p^.Red := False;
s^.Red := True;
q^.Red := True;
end else begin
dir2 := (g^.Links[True] = p);
if IsRed(s^.Links[last]) then
g^.Links[dir2] := RotateDouble(p,last)
else if IsRed(s^.Links[not last]) then
g^.Links[dir2] := RotateSingle(p,last);
// Ensure correct coloring
g^.Links[dir2]^.Red := True;
q^.Red := g^.Links[dir2]^.Red;
g^.Links[dir2]^.Links[False]^.Red := False;
g^.Links[dir2]^.Links[True]^.Red := False;
end;
end;
end;
end;
end;
// Replace and remove if found
if (f <> nil) then begin
f^.Data := q^.Data;
p^.Links[(p^.Links[True] = q)] :=
q^.Links[(q^.Links[False] = nil)];
Allocator.FreeNode(q,Self);
Result := True;
end;
// Update root and make it black
Root := head.Links[True];
if (Root <> nil) then
Root^.Red := False;
end;
function TRBTree<T,KCOMP>.CreateForwardIterator(const ANode : PRBTreeNode) : TIterator;
begin
Result := CreateIterator(ANode,False);
end;
function TRBTree<T,KCOMP>.CreateForwardIterator() : TIterator;
begin
Result := CreateForwardIterator(Root);
end;
function TRBTree<T,KCOMP>.CreateBackwardIterator(const ANode : PRBTreeNode) : TIterator;
begin
Result := CreateIterator(ANode,True);
end;
function TRBTree<T,KCOMP>.CreateBackwardIterator() : TIterator;
begin
Result := CreateBackwardIterator(Root);
end;
{$ifdef RB_DEBUG}
function TRBTree<T,KCOMP>.SelfAssert(ARoot : PRBTreeNode; var AErrorMessage: string): Boolean;
var
lh, rh : Boolean;
ln, rn : PRBTreeNode;
e : string;
begin
AErrorMessage := '';
if (ARoot = nil) then begin
Result := True;
exit;
end;
e := '';
ln := ARoot^.Links[False];
rn := ARoot^.Links[True];
// Consecutive red links
if IsRed(ARoot) then begin
if IsRed(ln) or IsRed(rn) then begin
AErrorMessage := 'Red violation';
Result := False;
exit;
end;
end;
lh := SelfAssert(ln,e);
AErrorMessage := AErrorMessage + ' ' + e;
rh := SelfAssert(rn,e);
AErrorMessage := AErrorMessage + ' ' + e;
// Invalid binary search tree
if ( ( (ln <> nil) and (Comparator.Compare(ln^.Data,ARoot^.Data) >= 0) ) or
( (rn <> nil) and (Comparator.Compare(rn^.Data,ARoot^.Data) <= 0) ) )
then begin
AErrorMessage := AErrorMessage + ' ' + 'Binary tree violation';
Result := False;
Exit;
end;
// Black height mismatch
if ( lh and rh and (lh <> rh) ) then begin
AErrorMessage := AErrorMessage + ' ' + 'Black violation';
Result := False;
Exit;
end;
Result := lh and rh;
end;
function TRBTree<T,KCOMP>.SelfAssert(var AErrorMessage: string): Boolean;
begin
Result := Self.SelfAssert(Root, AErrorMessage);
end;
{$endif RB_DEBUG}
constructor TRBTree<T,KCOMP>.TIterator.Create(AHandle : PBaseIterator);
begin
inherited Create();
FHandle := AHandle;
FResetState := True;
end;
destructor TRBTree<T,KCOMP>.TIterator.Destroy();
begin
TreeFreeIterator(FHandle);
inherited Destroy;
end;
function TRBTree<T,KCOMP>.TIterator.MoveNext : Boolean;
begin
if FResetState then begin
FResetState := False;
Result := (FHandle^.Current <> nil);
exit;
end;
Result := (TreeIteratorMoveNext(FHandle) <> nil);
end;
function TRBTree<T,KCOMP>.TIterator.MovePrevious : Boolean;
begin
if FResetState then begin
FResetState := False;
Result := (FHandle^.Current <> nil);
exit;
end;
Result := (TreeIteratorMovePrevious(FHandle) <> nil);
end;
function TRBTree<T,KCOMP>.TIterator.GetCurrent : TRBTreeNodeData;
begin
Result := GetCurrentNode()^.Data;
end;
function TRBTree<T,KCOMP>.TIterator.GetCurrentNode : PRBTreeNode;
begin
Result := FHandle^.Current;
end;
procedure TRBTree<T,KCOMP>.TIterator.Reset();
begin
FResetState := True;
TreeInitIterator(FHandle,FHandle^.Tree,FHandle^.StartingNode,FHandle^.StartingDir)
end;
{ TOrdinalComparator<T> }
class function TOrdinalComparator<T>.Compare(const A, B: TOrdinalType): Integer;
begin
if (A = B) then
exit(0);
if (A > B) then
exit(1);
exit(-1);
end;
end.

4036
utils/unicode/helper.pas Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,17 @@
cldrparser.exe de.xml -d.\data -o.\data
echo
cldrparser es.xml -d.\data -o.\data
echo
cldrparser fr_CA.xml -d.\data -o.\data
echo
cldrparser ja.xml -d.\data -o.\data
echo
cldrparser ko.xml -d.\data -o.\data
echo
cldrparser ru.xml -d.\data -o.\data
echo
cldrparser sv.xml -d.\data -o.\data
echo
cldrparser zh.xml -d.\data -o.\data
pause

158
utils/unicode/trie.pas Normal file
View File

@ -0,0 +1,158 @@
{ Simple TRIE implementation.
Copyright (c) 2012 by Inoussa OUEDRAOGO
The source code is distributed under the Library GNU
General Public License with the following modification:
- object files and libraries linked into an application may be
distributed without source code.
If you didn't receive a copy of the file COPYING, contact:
Free Software Foundation
675 Mass Ave
Cambridge, MA 02139
USA
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. }
unit trie;
{$mode objfpc}{$H+}
interface
uses
SysUtils;
const
MAX_CHILD_COUNT = 256;
type
TKeyType = Cardinal;
TDataType = Integer;
PTrieNode = ^TTrieNode;
TTrieNode = packed record
Key : TKeyType;
DataNode : Boolean;
Data : TDataType;
ChildCount : Byte;
Children : array[0..(MAX_CHILD_COUNT-1)] of PTrieNode;
end;
function CreateNode(
const AKey : TKeyType;
const AData : TDataType
) : PTrieNode; overload;
function CreateNode(const AKey : TKeyType) : PTrieNode;overload;
procedure FreeNode(ANode : PTrieNode);
function InsertWord(
const ARoot : PTrieNode;
const AWord : array of TKeyType;
const AValue : TDataType
) : Boolean;overload;
function InsertWord(
const ARoot : PTrieNode;
const AWord : TKeyType;
const AValue : TDataType
) : Boolean;overload;
implementation
function CreateNode(
const AKey : TKeyType;
const AData : TDataType
) : PTrieNode;
begin
New(Result);
Result^.Key := AKey;
Result^.DataNode := True;
Result^.Data := AData;
Result^.ChildCount := 0;
end;
function CreateNode(const AKey : TKeyType) : PTrieNode;
begin
New(Result);
Result^.Key := AKey;
Result^.DataNode := False;
Result^.ChildCount := 0;
end;
procedure FreeNode(ANode : PTrieNode);
var
p : PTrieNode;
i : Integer;
begin
if (ANode = nil) then
exit;
p := ANode;
for i := 0 to p^.ChildCount - 1 do
FreeNode(p^.Children[i]);
Dispose(p);
end;
function InsertWord(
const ARoot : PTrieNode;
const AWord : TKeyType;
const AValue : TDataType
) : Boolean;
begin
Result := InsertWord(ARoot,[AWord],AValue);
end;
function InsertWord(
const ARoot : PTrieNode;
const AWord : array of TKeyType;
const AValue : TDataType
) : Boolean;
var
p, q : PTrieNode;
i, k, c : Integer;
searching : TKeyType;
found : Boolean;
begin
Result := False;
if (ARoot^.Key <> AWord[0]) then
exit;
p := ARoot;
q := p;
i := 1;
c := Length(AWord);
while (i < c) do begin
searching := AWord[i];
found := False;
for k := 0 to p^.ChildCount - 1 do begin
if (p^.Children[k]^.Key = searching) then begin
q := p;
p := p^.Children[k];
found := True;
Break;
end;
end;
if not found then
Break;
Inc(i);
end;
if (i < c) then begin
if (i = c) then
i := i - 1;
for i := i to c - 2 do begin
k := p^.ChildCount;
p^.Children[k] := CreateNode(AWord[i]);
p^.ChildCount := k + 1;
p := p^.Children[k];
end;
i := c - 1;
k := p^.ChildCount;
p^.Children[k] := CreateNode(AWord[i],AValue);
p^.ChildCount := k + 1;
p := p^.Children[k];
Result := True;
end;
end;
end.

282
utils/unicode/uca_test.pas Normal file
View File

@ -0,0 +1,282 @@
{ Unicode Collation Algorithm test routines for generated data.
Copyright (c) 2012 by Inoussa OUEDRAOGO
The source code is distributed under the Library GNU
General Public License with the following modification:
- object files and libraries linked into an application may be
distributed without source code.
If you didn't receive a copy of the file COPYING, contact:
Free Software Foundation
675 Mass Ave
Cambridge, MA 02139
USA
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
}
unit uca_test;
{$mode objfpc}{$H+}
interface
uses
SysUtils,
helper;
procedure uca_CheckProp_1(
ABook : TUCA_DataBook;
APropBook : PUCA_PropBook
);
procedure uca_CheckProp_x(
ABook : TUCA_DataBook;
APropBook : PUCA_PropBook
);
procedure uca_CheckProp_1y(
const ABook : TUCA_DataBook;
const APropBook : PUCA_PropBook;
const AFirstTable : PucaBmpFirstTable;
const ASecondTable : PucaBmpSecondTable
);
procedure uca_CheckProp_2y(
const ABook : TUCA_DataBook;
const APropBook : PUCA_PropBook;
const AFirstTable : PucaOBmpFirstTable;
const ASecondTable : PucaOBmpSecondTable
);
implementation
function IndexOf(const ACodePoint : Cardinal; APropBook : PUCA_PropBook): Integer;
var
i : Integer;
begin
for i := 0 to Length(APropBook^.Index) - 1 do begin
if (ACodePoint = APropBook^.Index[i].CodePoint) then
exit(i);
end;
Result := -1;
end;
function CompareWeigth(AExpect : PUCA_LineRec; AActual : PUCA_PropItemRec) : Boolean;
var
i, k : Integer;
p : PUCA_PropWeights;
pw : array of TUCA_PropWeights;
begin
Result := False;
if (Length(AExpect^.Weights) <> AActual^.WeightLength) then
exit;
//p := PUCA_PropWeights(PtrUInt(AActual) + SizeOf(TUCA_PropItemRec));
SetLength(pw,AActual^.WeightLength);
p := @pw[0];
AActual^.GetWeightArray(p);
for i := 0 to Length(AExpect^.Weights) - 1 do begin
//if (BoolToByte(AExpect^.Weights[i].Variable) <> p^.Variable) then
//exit;
for k := 0 to 3 - 1 do begin
if (AExpect^.Weights[i].Weights[k] <> p^.Weights[k]) then
exit;
end;
Inc(p);
end;
Result := True;
end;
procedure uca_CheckProp_1(
ABook : TUCA_DataBook;
APropBook : PUCA_PropBook
);
var
i, c, k : Integer;
line : PUCA_LineRec;
uc : Cardinal;
p : PUCA_PropItemRec;
begin
WriteLn('uca_CheckProp_1 Start ... ');
line := @ABook.Lines[0];
c := Length(ABook.Lines);
for i := 0 to c - 1 do begin
if line^.Stored and (Length(line^.CodePoints) = 1) then begin
uc := line^.CodePoints[0];
k := IndexOf(uc,APropBook);
if (k = -1) then begin
WriteLn('Property not found for Code Point : ' + Format('%x',[uc]));
end else begin
p := PUCA_PropItemRec(PtrUInt(APropBook^.Items)+APropBook^.Index[k].Position);
if not CompareWeigth(line,p) then
WriteLn('CompareWeigth fail for Code Point : ' + Format('%x',[uc]));
end;
end;
Inc(line);
end;
WriteLn('uca_CheckProp_1 End');
end;
function FindWord(
const AWord : array of Cardinal;
const APropBook : PUCA_PropItemRec
) : PUCA_PropItemRec;
var
cc : Cardinal;
p : PUCA_PropItemRec;
i, k, kc : Integer;
ok : Boolean;
begin
Result := nil;
p := APropBook;
for i := 1 to Length(AWord) - 1 do begin
ok := False;
kc := p^.ChildCount - 1;
p := PUCA_PropItemRec(PtrUInt(p) + p^.GetSelfOnlySize());
if (i > 1) then
p := PUCA_PropItemRec(PtrUInt(p) + SizeOf(UInt24));
for k := 0 to kc do begin
if (AWord[i] = p^.CodePoint) then begin
ok := True;
Break;
end;
p := PUCA_PropItemRec(PtrUInt(p) + p^.Size);
end;
if not ok then
exit;
end;
Result := p;
end;
function DumpCodePoints(const AValues : array of Cardinal) : string;
var
i : Integer;
begin
Result := '';
for i := 0 to Length(AValues) - 1 do
Result := Format('%s %x',[Result,AValues[i]]);
Result := Trim(Result);
end;
procedure uca_CheckProp_x(
ABook : TUCA_DataBook;
APropBook : PUCA_PropBook
);
var
i, c, k : Integer;
line : PUCA_LineRec;
uc : Cardinal;
p, q : PUCA_PropItemRec;
begin
WriteLn('uca_CheckProp_x Start ... ');
line := @ABook.Lines[0];
c := Length(ABook.Lines);
for i := 0 to c - 1 do begin
if line^.Stored and (Length(line^.CodePoints) > 1) then begin
//WriteLn(' Code Point sequence : ' + DumpCodePoints(line^.CodePoints));
uc := line^.CodePoints[0];
k := IndexOf(uc,APropBook);
if (k = -1) then begin
WriteLn(' Property not found for Code Point : ' + Format('%x',[uc]));
end else begin
q := PUCA_PropItemRec(PtrUInt(APropBook^.Items)+APropBook^.Index[k].Position);
p := FindWord(line^.CodePoints,q);
if (p = nil) then
WriteLn(' Data not found for Code Point sequence : ' + DumpCodePoints(line^.CodePoints))
else if not CompareWeigth(line,p) then
WriteLn(' CompareWeigth fail for Code Point sequence : ' + DumpCodePoints(line^.CodePoints));
end;
end;
Inc(line);
end;
WriteLn('uca_CheckProp_x End');
end;
function GetPropPosition(
const ABMPCodePoint : Word;
const AFirstTable : PucaBmpFirstTable;
const ASecondTable : PucaBmpSecondTable
) : Integer; inline;overload;
begin
Result:=
ASecondTable^[AFirstTable^[WordRec(ABMPCodePoint).Hi]][WordRec(ABMPCodePoint).Lo] - 1
end;
procedure uca_CheckProp_1y(
const ABook : TUCA_DataBook;
const APropBook : PUCA_PropBook;
const AFirstTable : PucaBmpFirstTable;
const ASecondTable : PucaBmpSecondTable
);
var
i, c, k : Integer;
line : PUCA_LineRec;
uc : Cardinal;
p : PUCA_PropItemRec;
ucw : Word;
begin
WriteLn('uca_CheckProp_1y Start (BMP) ... ');
line := @ABook.Lines[0];
c := Length(ABook.Lines);
for i := 0 to c - 1 do begin
if line^.Stored and (Length(line^.CodePoints) = 1) then begin
uc := line^.CodePoints[0];
if (uc <= High(Word)) then begin
ucw := uc;
k := GetPropPosition(ucw,AFirstTable,ASecondTable);
if (k = -1) then begin
WriteLn('Property not found for Code Point : ' + Format('%x',[uc]));
end else begin
p := PUCA_PropItemRec(PtrUInt(APropBook^.Items)+k);
if not CompareWeigth(line,p) then
WriteLn('CompareWeigth fail for Code Point : ' + Format('%x',[uc]));
end;
end;
end;
Inc(line);
end;
WriteLn('uca_CheckProp_1y End');
end;
procedure uca_CheckProp_2y(
const ABook : TUCA_DataBook;
const APropBook : PUCA_PropBook;
const AFirstTable : PucaOBmpFirstTable;
const ASecondTable : PucaOBmpSecondTable
);
var
i, c, k : Integer;
line : PUCA_LineRec;
uc : Cardinal;
p : PUCA_PropItemRec;
uchs, ucls : Word;
begin
WriteLn('uca_CheckProp_2y Start (>BMP) ... ');
line := @ABook.Lines[0];
c := Length(ABook.Lines);
for i := 0 to c - 1 do begin
if line^.Stored and (Length(line^.CodePoints) = 1) then begin
uc := line^.CodePoints[0];
if (uc > High(Word)) then begin
FromUCS4(uc,uchs,ucls);
k := GetPropPosition(uchs,ucls,AFirstTable,ASecondTable);
if (k = -1) then begin
WriteLn('Property not found for Code Point : ' + Format('%x',[uc]));
end else begin
p := PUCA_PropItemRec(PtrUInt(APropBook^.Items)+k);
if not CompareWeigth(line,p) then
WriteLn('CompareWeigth fail for Code Point : ' + Format('%x',[uc]));
end;
end;
end;
Inc(line);
end;
WriteLn('uca_CheckProp_2y End');
end;
end.

View File

@ -0,0 +1,426 @@
{ UnicodeSet implementation.
Copyright (c) 2013 by Inoussa OUEDRAOGO
The source code is distributed under the Library GNU
General Public License with the following modification:
- object files and libraries linked into an application may be
distributed without source code.
If you didn't receive a copy of the file COPYING, contact:
Free Software Foundation
675 Mass Ave
Cambridge, MA 02139
USA
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
}
unit unicodeset;
{$mode delphi}{$H+}
{$scopedenums on}
interface
uses
SysUtils,
grbtree, helper;
type
EUnicodeSetException = class(Exception)
end;
TUnicodeSet = class;
TPatternParser = class
private
FBufferStr : UnicodeString;
FBuffer : PUnicodeChar;
FBufferLength : Integer;
FSet : TUnicodeSet;
FPosition : Integer;
private
procedure Error(const AMsg : string; const AArgs : array of const);overload;inline;
procedure Error(const AMsg : string);overload;inline;
procedure SetBuffer(const APattern : PUnicodeChar; const ALength : Integer);
procedure CheckEOF();inline;overload;
procedure CheckEOF(ALength : Integer);overload;inline;
procedure UnexpectedEOF();inline;
function IsThis(AItem : UnicodeString; const APosition : Integer) : Boolean;overload;
function IsThis(AItem : UnicodeString) : Boolean;overload;inline;
procedure Expect(AItem : UnicodeString; const APosition : Integer);overload;inline;
procedure Expect(AItem : UnicodeString);overload;inline;
procedure SkipSpaces();inline;
function NextChar() : TUnicodeCodePoint;
procedure ParseItem();
procedure DoParse();
public
procedure Parse(const APattern : PUnicodeChar; const ALength : Integer);overload;
procedure Parse(const APattern : UnicodeString);overload;inline;
property CurrentSet : TUnicodeSet read FSet write FSet;
end;
TUnicodeCodePointArrayComparator = class
public
// Return
// * if A>B then 1
// * if A=B then 0
// * if A<B then -1
class function Compare(const A, B : TUnicodeCodePointArray) : Integer;static;inline;
end;
TUnicodeSet = class
private type
TItem = TUnicodeCodePointArray;
TTree = TRBTree<TItem,TUnicodeCodePointArrayComparator>;
public type
TIterator = TTree.TIterator;
private
FTree : TTree;
FParser : TPatternParser;
private
procedure CreateParser();inline;
public
constructor Create();
destructor Destroy;override;
procedure Add(AChar : TUnicodeCodePoint);inline;overload;
procedure Add(AString : TUnicodeCodePointArray);inline;overload;
procedure AddRange(const AStart, AEnd : TUnicodeCodePoint);inline;
procedure AddPattern(const APattern : UnicodeString);inline;
function CreateIterator() : TIterator;
function Contains(const AString : array of TUnicodeCodePoint) : Boolean;overload;
function Contains(const AChar : TUnicodeCodePoint) : Boolean;inline;overload;
function Contains(const AChar : UnicodeChar) : Boolean;inline;overload;
function Contains(const AChar : AnsiChar) : Boolean;inline;overload;
end;
resourcestring
SInvalidLength = 'Invalid length value : "%d".';
SInvalidPosition = 'Invalid position : "%d".';
SInvalidRangeLimits = 'Invalid range limits : ["%x" , "%x"].';
SExpectedBut = 'Expects "%s" but got "%s..." .';
SUnexpectedEOF = 'Unexpected end of file.';
implementation
uses
unicodedata;
function ToArray(const AItem : TUnicodeCodePoint) : TUnicodeCodePointArray;inline;
begin
SetLength(Result,1);
Result[Low(Result)] := AItem;
end;
function CompareItem(const Item1, Item2 : TUnicodeCodePointArray): Integer;
var
a, b : ^TUnicodeCodePoint;
i, ha, hb : Integer;
begin
if (Pointer(Item1) = Pointer(Item2)) then
exit(0);
if (Item1 = nil) then
exit(-1);
if (Item2 = nil) then
exit(1);
a := @Item1[0];
b := @Item2[0];
Result := 1;
ha := Length(Item1) - 1;
hb := Length(Item2) - 1;
for i := 0 to ha do begin
if (i > hb) then
exit;
if (a^ < b^) then
exit(-1);
if (a^ > b^) then
exit(1);
Inc(a);
Inc(b);
end;
if (ha = hb) then
exit(0);
exit(-1);
end;
{ TUnicodeCodePointArrayComparator }
class function TUnicodeCodePointArrayComparator.Compare(const A, B : TUnicodeCodePointArray): Integer;
begin
Result := CompareItem(A,B);
end;
{ TPatternParser }
procedure TPatternParser.Error(const AMsg: string; const AArgs: array of const);
begin
raise EUnicodeSetException.CreateFmt(AMsg,AArgs);
end;
procedure TPatternParser.Error(const AMsg: string);
begin
raise EUnicodeSetException.Create(AMsg);
end;
procedure TPatternParser.SetBuffer(
const APattern : PUnicodeChar;
const ALength : Integer
);
begin
FPosition := 0;
if (ALength <= 1) then begin
FBufferStr := '';
FBuffer := nil;
FBufferLength := 0;
exit;
end;
FBufferLength := ALength;
SetLength(FBufferStr,FBufferLength);
FBuffer := @FBufferStr[1];
Move(APattern^,FBuffer^,(FBufferLength*SizeOf(FBuffer^)));
end;
procedure TPatternParser.CheckEOF();
begin
CheckEOF(0);
end;
procedure TPatternParser.CheckEOF(ALength : Integer);
begin
if (ALength < 0) then
Error(SInvalidLength,[ALength]);
if ((FPosition+ALength) >= FBufferLength) then
UnexpectedEOF();
end;
procedure TPatternParser.UnexpectedEOF();
begin
Error(SUnexpectedEOF);
end;
function TPatternParser.IsThis(AItem: UnicodeString; const APosition: Integer): Boolean;
var
i, k, c : Integer;
begin
if (APosition < 0) then
Error(SInvalidPosition,[APosition]);
Result := False;
c := Length(AItem);
if (c = 0) then
exit;
i := APosition;
k := i + c;
if (k >= FBufferLength) then
exit;
if CompareMem(@AItem[1], @FBuffer[APosition],c) then
Result := True;
end;
function TPatternParser.IsThis(AItem : UnicodeString) : Boolean;
begin
Result := IsThis(AItem,FPosition);
end;
procedure TPatternParser.Expect(AItem: UnicodeString; const APosition: Integer);
begin
if not IsThis(AItem,APosition) then
Error(SExpectedBut,[AItem,Copy(FBuffer,APosition,Length(AItem))]);
end;
procedure TPatternParser.Expect(AItem: UnicodeString);
begin
Expect(AItem,FPosition);
end;
procedure TPatternParser.SkipSpaces();
begin
while (FPosition < FBufferLength) do begin
if (FBuffer[FPosition] <> ' ') then
Break;
Inc(FPosition);
end;
end;
function TPatternParser.NextChar(): TUnicodeCodePoint;
var
i : Integer;
c : UnicodeChar;
cp : TUnicodeCodePoint;
s : UnicodeString;
begin
SkipSpaces();
CheckEOF();
c := FBuffer[FPosition];
cp := Ord(c);
Inc(FPosition);
cp := Ord(c);
if (c = '\') and (FPosition < FBufferLength) then begin
if IsThis('\') then begin
Inc(FPosition);
CheckEOF();
cp := Ord(FBuffer[FPosition]);
Inc(FPosition);
end else if IsThis('u') then begin
Inc(FPosition);
CheckEOF(4);
s := Copy(FBufferStr,(FPosition+1),4);
Inc(FPosition,4);
if not TryStrToInt('$'+s,i) then
Error(SExpectedBut,['\uXXXX',s]);
cp := i;
end;
end;
if (cp <= MAX_WORD) and UnicodeIsLowSurrogate(UnicodeChar(Word(cp))) then begin
SkipSpaces();
CheckEOF();
c := UnicodeChar(Word(cp));
if UnicodeIsSurrogatePair(c,FBuffer[FPosition]) then begin
cp := ToUCS4(c,FBuffer[FPosition]);
Inc(FPosition);
end;
end;
Result := cp;
end;
function CompareTo(const A : TUnicodeCodePoint; const B : UnicodeChar) : Boolean;inline;
begin
Result := (A = Ord(B));
end;
procedure TPatternParser.ParseItem();
var
cp, lastCp : TUnicodeCodePoint;
charCount, i : Integer;
begin
SkipSpaces();
Expect('[');
charCount := 0;
Inc(FPosition);
while (FPosition < FBufferLength) do begin
lastCp := cp;
cp := NextChar();
if CompareTo(cp,']') then
Break;
if CompareTo(cp,'-') then begin
if (charCount = 0) then
Error(SExpectedBut,['<char>','-']);
cp := NextChar();
FSet.AddRange(lastCp,cp);
end else begin
FSet.Add(cp);
end;
Inc(charCount);
end;
end;
procedure TPatternParser.DoParse();
begin
SkipSpaces();
while (FPosition < FBufferLength) do begin
ParseItem();
SkipSpaces();
end;
end;
procedure TPatternParser.Parse(const APattern: PUnicodeChar; const ALength: Integer);
begin
if (ALength < 2) then
exit;
SetBuffer(APattern,ALength);
DoParse();
end;
procedure TPatternParser.Parse(const APattern : UnicodeString);
begin
Parse(@APattern[1],Length(APattern));
end;
{ TUnicodeSet }
procedure TUnicodeSet.CreateParser();
begin
if (FParser = nil) then begin
FParser := TPatternParser.Create();
FParser.CurrentSet := Self;
end;
end;
constructor TUnicodeSet.Create;
begin
FTree := TTree.Create();
end;
destructor TUnicodeSet.Destroy;
begin
FParser.Free();
FTree.Free();
inherited Destroy;
end;
procedure TUnicodeSet.Add(AChar: TUnicodeCodePoint);
begin
FTree.Insert(ToArray(AChar));
end;
procedure TUnicodeSet.Add(AString: TUnicodeCodePointArray);
begin
if (AString <> nil) then
FTree.Insert(AString);
end;
procedure TUnicodeSet.AddRange(const AStart, AEnd : TUnicodeCodePoint);
var
i : Integer;
begin
if (AStart > AEnd) then
raise EUnicodeSetException.CreateFmt(SInvalidRangeLimits,[AStart,AEnd]);
for i := AStart to AEnd do
Add(i);
end;
procedure TUnicodeSet.AddPattern(const APattern : UnicodeString);
begin
CreateParser();
FParser.Parse(APattern);
end;
function TUnicodeSet.CreateIterator() : TIterator;
begin
Result := FTree.CreateForwardIterator();
end;
function TUnicodeSet.Contains(const AString : array of TUnicodeCodePoint) : Boolean;
var
c : Integer;
x : TUnicodeCodePointArray;
begin
Result := False;
c := Length(AString);
if (c = 0) then
exit;
SetLength(x,c);
Move(AString[Low(AString)],x[Low(x)],(c*SizeOf(x[0])));
if (FTree.FindNode(x) <> nil) then
Result := True;
end;
function TUnicodeSet.Contains(const AChar : TUnicodeCodePoint) : Boolean;
begin
Result := Contains([AChar]);
end;
function TUnicodeSet.Contains(const AChar : UnicodeChar) : Boolean;
begin
Result := Contains(TUnicodeCodePoint(Ord(AChar)));
end;
function TUnicodeSet.Contains(const AChar : AnsiChar) : Boolean;
begin
Result := Contains(TUnicodeCodePoint(Ord(AChar)));
end;
end.

View File

@ -0,0 +1,74 @@
<?xml version="1.0"?>
<CONFIG>
<ProjectOptions>
<Version Value="9"/>
<PathDelim Value="\"/>
<General>
<Flags>
<MainUnitHasCreateFormStatements Value="False"/>
<MainUnitHasTitleStatement Value="False"/>
</Flags>
<SessionStorage Value="InProjectDir"/>
<MainUnit Value="0"/>
<Title Value="unihelper"/>
<UseAppBundle Value="False"/>
<ResourceType Value="res"/>
</General>
<i18n>
<EnableI18N LFM="False"/>
</i18n>
<VersionInfo>
<StringTable ProductVersion=""/>
</VersionInfo>
<BuildModes Count="1">
<Item1 Name="Default" Default="True"/>
</BuildModes>
<PublishOptions>
<Version Value="2"/>
<IncludeFileFilter Value="*.(pas|pp|inc|lfm|lpr|lrs|lpi|lpk|sh|xml)"/>
<ExcludeFileFilter Value="*.(bak|ppu|o|so);*~;backup"/>
</PublishOptions>
<RunParams>
<local>
<FormatVersion Value="1"/>
</local>
</RunParams>
<Units Count="1">
<Unit0>
<Filename Value="unihelper.lpr"/>
<IsPartOfProject Value="True"/>
</Unit0>
</Units>
</ProjectOptions>
<CompilerOptions>
<Version Value="11"/>
<PathDelim Value="\"/>
<Target>
<Filename Value="unihelper"/>
</Target>
<SearchPaths>
<IncludeFiles Value="$(ProjOutDir)"/>
<OtherUnitFiles Value="..\trie"/>
<UnitOutputDirectory Value="lib\$(TargetCPU)-$(TargetOS)"/>
</SearchPaths>
<Other>
<CompilerMessages>
<MsgFileName Value=""/>
</CompilerMessages>
<CompilerPath Value="$(CompPath)"/>
</Other>
</CompilerOptions>
<Debugging>
<Exceptions Count="3">
<Item1>
<Name Value="EAbort"/>
</Item1>
<Item2>
<Name Value="ECodetoolError"/>
</Item2>
<Item3>
<Name Value="EFOpenError"/>
</Item3>
</Exceptions>
</Debugging>
</CONFIG>

386
utils/unicode/unihelper.lpr Normal file
View File

@ -0,0 +1,386 @@
{ Unicode tables parser.
Copyright (c) 2012 by Inoussa OUEDRAOGO
The source code is distributed under the Library GNU
General Public License with the following modification:
- object files and libraries linked into an application may be
distributed without source code.
If you didn't receive a copy of the file COPYING, contact:
Free Software Foundation
675 Mass Ave
Cambridge, MA 02139
USA
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. }
{ This program generates tables as include-files for use
with the unicode related sources. It expects the following
unicode.org's files to be present in the same folder :
* HangulSyllableType.txt
* PropList.txt
* UnicodeData.txt
* allkeys.txt
}
{$DEFINE UCA_TEST}
program unihelper;
{$mode objfpc}{$H+}
uses
SysUtils, Classes,
helper, uca_test;
const
SUsage =
'This program generates tables as include-files for use ' + sLineBreak +
' with the unicode related sources. It expects the following ' + sLineBreak +
' unicode.org''s files to be present in the same folder : ' + sLineBreak +
' * HangulSyllableType.txt ' + sLineBreak +
' * PropList.txt ' + sLineBreak +
' * UnicodeData.txt ' + sLineBreak +
' * allkeys.txt : Note that this file is the one provided for the CLDR root.' + sLineBreak +
'' + sLineBreak +
'Usage : unihelper [<dataDir> <outputDir>] ' + sLineBreak +
' where ' + sLineBreak +
' dataDir : the directory where are stored the unicode files. The default' + sLineBreak +
' value is the program''s directory.' + sLineBreak +
' outputDir : The directory where the generated files will be stored. The' + sLineBreak +
' default value is the program''s directory.'+sLineBreak;
function DumpCodePoint(ACodePoint : TCodePointRec) : string;
begin
Result := '';
if (ACodePoint.LineType = 0) then
WriteStr(Result,IntToHex(ACodePoint.CodePoint,4))
else
WriteStr(Result,IntToHex(ACodePoint.StartCodePoint,4),'..',IntToHex(ACodePoint.EndCodePoint,4));
end;
var
dataPath, outputPath : string;
stream, binStream, binStream2 : TMemoryStream;
hangulSyllables : TCodePointRecArray;
ucaBook : TUCA_DataBook;
ucaPropBook : PUCA_PropBook;
propList : TPropListLineRecArray;
whiteSpaceCodePoints : TCodePointRecArray;
props : TPropRecArray;
numericTable : TNumericValueArray;
decomposition : TDecompositionArray;
decompositionBook : TDecompositionBook;
data : TDataLineRecArray;
//----------------
lvl3table1 : T3lvlBmp1Table;
lvl3table2 : T3lvlBmp2Table;
lvl3table3 : T3lvlBmp3Table;
//----------------
s : ansistring;
i, k, h : Integer;
p : PDataLineRec;
r : TDataLineRecArray;
olvl3table1 : T3lvlOBmp1Table;
olvl3table2 : T3lvlOBmp2Table;
olvl3table3 : T3lvlOBmp3Table;
//----------------
hs, ls : Word;
ucaFirstTable : TucaBmpFirstTable;
ucaSecondTable : TucaBmpSecondTable;
ucaoFirstTable : TucaoBmpFirstTable;
ucaoSecondTable : TucaOBmpSecondTable;
WL : Integer;
begin
WriteLn(SUsage+sLineBreak);
if (ParamCount > 0) then
dataPath := IncludeTrailingPathDelimiter(ParamStr(1))
else
dataPath := ExtractFilePath(ParamStr(0));
if (ParamCount > 1) then
outputPath := IncludeTrailingPathDelimiter(ParamStr(2))
else
outputPath := dataPath;
if not DirectoryExists(outputPath) then begin
WriteLn('Directory not found : ',outputPath);
if ForceDirectories(outputPath) then begin
WriteLn(' directory created successfully');
end else begin
WriteLn(' fail to create directory.');
Halt(1);
end;
end;
if not(
FileExists(dataPath + 'HangulSyllableType.txt') and
FileExists(dataPath + 'PropList.txt') and
FileExists(dataPath + 'UnicodeData.txt') and
FileExists(dataPath + 'allkeys.txt')
)
then begin
WriteLn('File(s) not found : HangulSyllableType.txt or PropList.txt or UnicodeData.txt or allkeys.txt .');
Halt(1);
end;
binStream2 := nil;
binStream := nil;
stream := TMemoryStream.Create();
try
binStream := TMemoryStream.Create();
binStream2 := TMemoryStream.Create();
WriteLn('Load file HangulSyllableType.txt ...', DateTimeToStr(Now));
stream.LoadFromFile(dataPath + 'HangulSyllableType.txt');
stream.Position := 0;
hangulSyllables := nil;
ParseHangulSyllableTypes(stream,hangulSyllables);
stream.Clear();
WriteLn('Load file PropList.txt ...', DateTimeToStr(Now));
stream.LoadFromFile(dataPath + 'PropList.txt');
stream.Position := 0;
propList := nil;
ParseProps(stream,propList);
stream.Clear();
whiteSpaceCodePoints := FindCodePointsByProperty('White_Space',propList);
writeln(' PropList Length = ',Length(propList));
writeln(' White_Space Length = ',Length(whiteSpaceCodePoints));
for i := Low(whiteSpaceCodePoints) to High(whiteSpaceCodePoints) do
WriteLn(' ',DumpCodePoint(whiteSpaceCodePoints[i]):12,' , IsWhiteSpace = ',IsWhiteSpace(whiteSpaceCodePoints[i].CodePoint,whiteSpaceCodePoints));
WriteLn('Load file UnicodeData.txt ...', DateTimeToStr(Now));
stream.LoadFromFile(dataPath + 'UnicodeData.txt');
stream.Position := 0;
WriteLn('Parse file ...', DateTimeToStr(Now));
data := nil;
props := nil;
Parse_UnicodeData(stream,props,numericTable,data,decomposition,hangulSyllables,whiteSpaceCodePoints);
WriteLn('Decomposition building ...');
MakeDecomposition(decomposition,decompositionBook);
WriteLn('Load file UCA allkeys.txt ...', DateTimeToStr(Now));
stream.LoadFromFile(dataPath + 'allkeys.txt');
stream.Position := 0;
ParseUCAFile(stream,ucaBook);
{ $IFDEF UCA_TEST}
k := 0; WL := 0; ;
for i := 0 to Length(ucaBook.Lines) - 1 do begin
h := GetPropID(ucaBook.Lines[i].CodePoints[0],data);
if (h <> -1) and
({props[h].HangulSyllable or} (props[h].DecompositionID <> -1))
then begin
Inc(k);
ucaBook.Lines[i].Stored := False;
end else begin
ucaBook.Lines[i].Stored := True;
if Length(ucaBook.Lines[i].Weights) > WL then
WL := Length(ucaBook.Lines[i].Weights);
end;
end;
WriteLn(
'UCA, Version = ',ucaBook.Version,'; entries count = ',Length(ucaBook.Lines),' ; Hangul # = ',k,
'Max Weights Length = ',WL
);
{ $ENDIF UCA_TEST}
WriteLn('Construct UCA Property Book ...');
ucaPropBook := nil;
MakeUCA_Props(@ucaBook,ucaPropBook);
{$IFDEF UCA_TEST}
uca_CheckProp_1(ucaBook,ucaPropBook);
uca_CheckProp_x(ucaBook,ucaPropBook);
{$ENDIF UCA_TEST}
WriteLn('Construct UCA BMP tables ...');
MakeUCA_BmpTables(ucaFirstTable,ucaSecondTable,ucaPropBook);
WriteLn(' UCA BMP Second Table Length = ',Length(ucaSecondTable));
{$IFDEF UCA_TEST}
uca_CheckProp_1y(ucaBook,ucaPropBook,@ucaFirstTable,@ucaSecondTable);
{$ENDIF UCA_TEST}
WriteLn('Construct UCA OBMP tables ...');
MakeUCA_OBmpTables(ucaoFirstTable,ucaoSecondTable,ucaPropBook);
WriteLn(' UCA OBMP Second Table Length = ',Length(ucaoSecondTable));
{$IFDEF UCA_TEST}
uca_CheckProp_2y(ucaBook,ucaPropBook,@ucaoFirstTable,@ucaoSecondTable);
{$ENDIF UCA_TEST}
WriteLn('Generate UCA Props tables ...');
binStream.Clear();
GenerateLicenceText(binStream);
GenerateUCA_PropTable(binStream,ucaPropBook);
WriteLn('Generate UCA BMP tables ...');
stream.Clear();
GenerateLicenceText(stream);
GenerateUCA_Head(stream,@ucaBook,ucaPropBook);
GenerateUCA_BmpTables(stream,binStream,ucaFirstTable,ucaSecondTable,THIS_ENDIAN);
WriteLn('Generate UCA OBMP tables ...');
GenerateUCA_OBmpTables(stream,binStream,ucaoFirstTable,ucaoSecondTable,THIS_ENDIAN);
stream.SaveToFile(outputPath + 'ucadata.inc');
s := outputPath + 'ucadata.inc';
s := GenerateEndianIncludeFileName(s);
binStream.SaveToFile(s);
binStream.Clear();
stream.Clear();
GenerateLicenceText(stream);
WriteLn('File parsed ...', DateTimeToStr(Now));
WriteLn(' Props Len = ',Length(props));
WriteLn(' Data Len = ',Length(data));
{WriteLn('BMP Tables building ...', DateTimeToStr(Now));
MakeBmpTables(firstTable,secondTable,props,data);
WriteLn(' First Table length = ',Length(firstTable));
WriteLn(' Second Table length = ',Length(secondTable));}
WriteLn('BMP Tables building ...', DateTimeToStr(Now));
MakeBmpTables3Levels(lvl3table1,lvl3table2,lvl3table3,data);
WriteLn(' 3 Levels Tables :');
WriteLn(' Len 1 = ',Length(lvl3table1));
WriteLn(' Len 2 = ',Length(lvl3table2));
WriteLn(' Len 3 = ',Length(lvl3table3));
for i := 0 to 255 do begin
for k := 0 to 15 do begin
for h := 0 to 15 do begin
if lvl3table3[lvl3table2[lvl3table1[i]][k]][h] <>
GetPropID(256*i + 16*k +h,data)
then begin
writeln('3 levels errors, i=',i,'; k=',k,'; h=',h);
end;
end;
end;
end;
binStream2.Clear();
WriteLn('Source generation ...', DateTimeToStr(Now));
WriteLn('BMP Tables sources ...', DateTimeToStr(Now));
Generate3lvlBmpTables(stream,lvl3table1,lvl3table2,lvl3table3);
WriteLn('Properties Table sources ...', DateTimeToStr(Now));
binStream.Clear();
GenerateNumericTable(binStream,numericTable,True);
binStream.SaveToFile(outputPath + 'unicodenumtable.pas');
binStream.Clear();
GeneratePropTable(binStream,props,ekLittle);
GeneratePropTable(binStream2,props,ekBig);
//-------------------------------------------
r := Compress(data);
//-------------------
WriteLn('OBMP Tables building ...', DateTimeToStr(Now));
MakeOBmpTables3Levels(olvl3table1,olvl3table2,olvl3table3,r);
WriteLn(' 3 Levels Tables :');
WriteLn(' Len 1 = ',Length(olvl3table1));
WriteLn(' Len 2 = ',Length(olvl3table2));
WriteLn(' Len 3 = ',Length(olvl3table3));
for i := 0 to 1023 do begin
for k := 0 to 31 do begin
for h := 0 to 31 do begin
if olvl3table3[olvl3table2[olvl3table1[i]][k]][h] <>
GetPropID(ToUCS4(HIGH_SURROGATE_BEGIN + i,LOW_SURROGATE_BEGIN + (k*32) + h),data)
then begin
writeln('3, OBMP levels errors, i=',i,'; k=',k,'; h=',h);
end;
end;
end;
end;
WriteLn('OBMP Tables sources ...', DateTimeToStr(Now));
Generate3lvlOBmpTables(stream,olvl3table1,olvl3table2,olvl3table3);
//---------------------
WriteLn('Decomposition Table sources ...', DateTimeToStr(Now));
GenerateDecompositionBookTable(binStream,decompositionBook,ekLittle);
GenerateDecompositionBookTable(binStream2,decompositionBook,ekBig);
stream.SaveToFile(outputPath + 'unicodedata.inc');
binStream.SaveToFile(outputPath + 'unicodedata_le.inc');
binStream2.SaveToFile(outputPath + 'unicodedata_be.inc');
binStream.Clear();
binStream2.Clear();
h := -1;
for i := Low(data) to High(data) do
if (data[i].CodePoint > $FFFF) then begin
h := i;
Break;
end;
stream.Clear();
for i := h to High(data) do begin
p := @data[i];
if (p^.LineType = 0) then begin
FromUCS4(p^.CodePoint,hs,ls);
//k := GetProp(hs,ls,props,ofirstTable,osecondTable)^.PropID;
k := GetProp(
(hs-HIGH_SURROGATE_BEGIN),(ls-LOW_SURROGATE_BEGIN),
props,olvl3table1,olvl3table2,olvl3table3
)^.PropID;
if (p^.PropID <> k) then begin
s := Format('#%d-%d #%d',[p^.CodePoint,p^.PropID,k]) + sLineBreak;
stream.Write(s[1],Length(s));
end;
end else begin
for h := p^.StartCodePoint to p^.EndCodePoint do begin
FromUCS4(h,hs,ls);
//k := GetProp(hs,ls,props,ofirstTable,osecondTable)^.PropID;
k := GetProp(
(hs-HIGH_SURROGATE_BEGIN),(ls-LOW_SURROGATE_BEGIN),
props,olvl3table1,olvl3table2,olvl3table3
)^.PropID;
if (p^.PropID <> k) then begin
s := Format('##%d;%d-%d #%d',[p^.StartCodePoint,p^.EndCodePoint,p^.PropID,k]) + sLineBreak;
stream.Write(s[1],Length(s));
Break
end;
end;
end;
end;
stream.SaveToFile(outputPath + 'diff-obmp.txt');
stream.Clear();
for i := Low(data) to High(data) do begin
p := @data[i];
if (p^.LineType = 0) then begin
k := GetPropID(p^.CodePoint,r);
if (p^.PropID <> k) then begin
s := Format('#%d-%d #%d',[p^.CodePoint,p^.PropID,k]) + sLineBreak;
stream.Write(s[1],Length(s));
end;
end else begin
for h := p^.StartCodePoint to p^.EndCodePoint do begin
k := GetPropID(h,r);
if (p^.PropID <> k) then begin
s := Format('##%d;%d-%d #%d',[p^.StartCodePoint,p^.EndCodePoint,p^.PropID,k]) + sLineBreak;
stream.Write(s[1],Length(s));
Break
end;
end;
end;
end;
stream.SaveToFile(outputPath + 'diff.txt');
stream.Clear();
for i := Low(r) to High(r) do begin
p := @r[i];
if (p^.LineType = 0) then begin
k := GetPropID(p^.CodePoint,data);
if (p^.PropID <> k) then begin
s := Format('#%d-%d #%d',[p^.CodePoint,p^.PropID,k]) + sLineBreak;
stream.Write(s[1],Length(s));
end;
end else begin
for h := p^.StartCodePoint to p^.EndCodePoint do begin
k := GetPropID(h,r);
if (p^.PropID <> k) then begin
s := Format('##%d;%d-%d #%d',[p^.StartCodePoint,p^.EndCodePoint,p^.PropID,k]) + sLineBreak;
stream.Write(s[1],Length(s));
Break
end;
end;
end;
end;
stream.SaveToFile(outputPath + 'diff2.txt');
finally
binStream2.Free();
binStream.Free();
stream.Free();
end;
end.

View File

@ -0,0 +1,66 @@
function IsCJK_Unified_Ideographs(ACodePoint : Cardinal) : Boolean;inline;
begin
Result := (ACodePoint >= $4E00) and (ACodePoint <= $9FCC); // $9FFF
end;
function IsCJK_Compatibility_Ideographs(ACodePoint : Cardinal) : Boolean;inline;
begin
Result := (ACodePoint >= $F900) and (ACodePoint <= $FAFF);
end;
function IsCJK_Unified_Ideographs_Extension_A(ACodePoint : Cardinal) : Boolean;inline;
begin
Result := (ACodePoint >= $3400) and (ACodePoint <= $4DB5); // $4DBF
end;
function IsCJK_Unified_Ideographs_Extension_B(ACodePoint : Cardinal) : Boolean;inline;
begin
Result := (ACodePoint >= $20000) and (ACodePoint <= $2A6D6); // $2A6DF
end;
function IsCJK_Unified_Ideographs_Extension_C(ACodePoint : Cardinal) : Boolean;inline;
begin
Result := (ACodePoint >= $2A700) and (ACodePoint <= $2B734); // $2B73F
end;
function IsCJK_Unified_Ideographs_Extension_D(ACodePoint : Cardinal) : Boolean;inline;
begin
Result := (ACodePoint >= $2B740) and (ACodePoint <= $2B81D); // $2B81F
end;
function IsCJK_Compatibility_Ideographs_Supplement(ACodePoint : Cardinal) : Boolean;inline;
begin
Result := (ACodePoint >= $2F800) and (ACodePoint <= $2FA1F);
end;
procedure DeriveWeight(const ACodePoint : Cardinal; AResult : PUCA_PropWeights);
const
BASE_1 = Word($FB40);
BASE_2 = Word($FB80);
BASE_3 = Word($FBC0);
var
base : Word;
begin
if IsCJK_Unified_Ideographs(ACodePoint) or IsCJK_Compatibility_Ideographs(ACodePoint) then
base := BASE_1
else if IsCJK_Unified_Ideographs_Extension_A(ACodePoint) or
IsCJK_Unified_Ideographs_Extension_B(ACodePoint) or
IsCJK_Unified_Ideographs_Extension_C(ACodePoint) or
IsCJK_Unified_Ideographs_Extension_D(ACodePoint) or
IsCJK_Compatibility_Ideographs_Supplement(ACodePoint)
then begin
base := BASE_2;
end else begin
base := BASE_3;
end;
AResult[0].Weights[0] := base + (ACodePoint shr 15);
AResult[0].Weights[1] := $20;
AResult[0].Weights[2] := $2;
AResult[1].Weights[0] := (ACodePoint and $7FFF) or $8000;
AResult[1].Weights[1] := 0;
AResult[1].Weights[2] := 0;
end;