utils: patch to unicode utils from Inoussa:

The CLDR parser now actually parses the selected
  collation type only. This should significantly reduce
  the memory consumption.

git-svn-id: trunk@23883 -
This commit is contained in:
paul 2013-03-17 05:00:54 +00:00
parent be1fcc8e3d
commit f12a0b7ece
3 changed files with 132 additions and 43 deletions

View File

@ -144,6 +144,8 @@ type
TCldrCollation = class;
{ TCldrCollationItem }
TCldrCollationItem = class
private
FBackwards: Boolean;
@ -153,6 +155,7 @@ type
FRules: TReorderSequenceArray;
FTypeName: string;
public
procedure Clear();
property Parent : TCldrCollation read FParent;
property TypeName : string read FTypeName write FTypeName;
property Base : string read FBase write FBase;
@ -187,6 +190,8 @@ type
property Items[Index : Integer] : TCldrCollationItem read GetItem;
end;
// Selects how much of a collation element the CLDR parser processes.
// With FullParsing, ParseCollationItem also builds the item's rules; with any
// other value (HeaderParsing) the item's Rules are left empty (nil).
TCldrParserMode = (HeaderParsing, FullParsing);
function ComputeWeigths(
const AData : PReorderUnit;
const ADataLen : Integer;
@ -1104,6 +1109,17 @@ begin
Result := locNotFound;
end;
{ TCldrCollationItem }
{ Resets the item's own data to the empty state: no type name, no base,
  no rules, no changed fields and Backwards switched off.
  FParent is deliberately not touched here. }
procedure TCldrCollationItem.Clear();
begin
  FTypeName := '';
  FBase := '';
  FRules := nil;          // releases the dynamic array, same as length 0
  FChangedFields := [];
  FBackwards := False;
end;
{ TCldrCollation }
function TCldrCollation.GetItem(Index : Integer): TCldrCollationItem;

View File

@ -102,6 +102,7 @@ var
i , c: Integer;
collation : TCldrCollation;
dataPath, outputPath : string;
collationItem : TCldrCollationItem;
begin
{$ifdef test_suite}
exec_tests();
@ -150,17 +151,20 @@ begin
endianStream := nil;
collation := TCldrCollation.Create();
try
ParseCollationDocument(collationFileName,collation);
ParseCollationDocument(collationFileName,collation,TCldrParserMode.HeaderParsing);
WriteLn(Format(' Collation Count = %d',[collation.ItemCount]));
if (collation.ItemCount = 0) then begin
WriteLn('No collation in this file.');
end else begin
for i := 0 to collation.ItemCount - 1 do
WriteLn(Format(' Item[%d] = %d "resets"; Type = %s',[i, Length(collation.Items[i].Rules),collation.Items[i].TypeName]));
if (collation.Find(collationTypeName) = nil) then
WriteLn(Format(' Item[%d] = (Type = %s)',[i, collation.Items[i].TypeName]));
collationItem := collation.Find(collationTypeName);
if (collationItem = nil) then begin
collationTypeName := FindCollationDefaultItemName(collation);
WriteLn('Collation Item Name : ',collationTypeName);
collationItem := collation.Find(collationTypeName);
end;
WriteLn(Format('Parsing Collation Item "%s" ...',[collationTypeName]));
ParseCollationDocument(collationFileName,collationItem,collationTypeName);
s := dataPath + 'UCA_Rules_SHORT.xml';
WriteLn;

View File

@ -32,13 +32,33 @@ uses
procedure ParseInitialDocument(ASequence : POrderedCharacters; ADoc : TDOMDocument);overload;
procedure ParseInitialDocument(ASequence : POrderedCharacters; AFileName : string);overload;
procedure ParseCollationDocument(ADoc : TDOMDocument; ACollation : TCldrCollation);
procedure ParseCollationDocument(const AFileName : string; ACollation : TCldrCollation);
// Parses the collation nodes of ADoc: a new TCldrCollationItem is created and
// added to ACollation for each collation node, and AMode is forwarded to the
// per-item parser (it decides whether the item's rules are built).
procedure ParseCollationDocument(
ADoc : TDOMDocument;
ACollation : TCldrCollation;
AMode : TCldrParserMode
);overload;
// Same as above, but the document is first loaded from AFileName.
// ACollation.LocalID is then set to the file name without its extension.
procedure ParseCollationDocument(
const AFileName : string;
ACollation : TCldrCollation;
AMode : TCldrParserMode
);overload;
// Loads the document from AFileName and parses only the collation whose
// "type" attribute equals AType into the single item ACollation.
procedure ParseCollationDocument(
const AFileName : string;
ACollation : TCldrCollationItem;
AType : string
);overload;
// Clears ACollation and fills it, in FullParsing mode, from the first
// collation node of ADoc whose "type" attribute equals AType.
// Raises Exception (message sCollationTypeNotFound) when no node matches.
procedure ParseCollationDocument(
ADoc : TDOMDocument;
ACollation : TCldrCollationItem;
AType : string
);overload;
resourcestring
sCaseNothandled = 'This case is not handled : "%s", Position = %d.';
sCodePointExpected = 'Code Point node expected as child at this position "%d".';
sCollationsNodeNotFound = '"collations" node not found.';
sCollationTypeNotFound = 'collation "Type" not found : "%s".';
sHexAttributeExpected = '"hex" attribute expected at this position "%d".';
sInvalidResetClause = 'Invalid "Reset" clause.';
sNodeNameAssertMessage = 'Expected NodeName "%s", got "%s".';
@ -500,7 +520,11 @@ begin
SetLength(r,0);
end;
procedure ParseCollationItem(ACollationNode : TDOMElement; AItem : TCldrCollationItem);
// Fills AItem from one collation element (ACollationNode).
// The visible part sets the Backwards flag and, only when AMode is
// FullParsing, builds the rule statements; otherwise AItem.Rules stays nil.
// NOTE(review): part of this routine (remaining locals, "begin" and the first
// statements) is hidden behind the diff hunk header below.
procedure ParseCollationItem(
ACollationNode : TDOMElement;
AItem : TCldrCollationItem;
AMode : TCldrParserMode
);
var
n : TDOMNode;
rulesElement : TDOMElement;
@ -515,43 +539,49 @@ begin
// Backwards is on only when the settings/@backwards attribute reads 'on';
// in that case the field is also recorded as changed.
AItem.Backwards := (EvaluateXPathStr('settings/@backwards',ACollationNode) = 'on');
if AItem.Backwards then
AItem.ChangedFields := AItem.ChangedFields + [TCollationField.BackWard];
// NOTE(review): the statements from here down to "AItem.Rules := nil;" are
// the pre-change body as rendered by the diff; the copy guarded by the
// FullParsing test further down is the same logic in its new position.
SetLength(statementList,15);
sal := 0;
statement := @statementList[0];
s := EvaluateXPathStr('suppress_contractions',ACollationNode);
if (s <> '') then begin
if (ParseDeletion(s,statement) > 0) then begin
Inc(sal);
Inc(statement);
end else begin
statement^.Clear();
end;
end;
n := ACollationNode.FindNode(s_RULES);
if (n <> nil) then begin
rulesElement := n as TDOMElement;
c := rulesElement.ChildNodes.Count;
nextPos := 0;
i := 0;
while (i < c) do begin
statement^.Clear();
if not ParseStatement(rulesElement,i,statement,nextPos) then
Break;
i := nextPos;
Inc(statement);
Inc(sal);
if (sal >= Length(statementList)) then begin
SetLength(statementList,(sal*2));
statement := @statementList[(sal-1)];
// Start from an empty rule set; it is only replaced when full parsing is
// requested, so header-only parsing never builds the statement array.
AItem.Rules := nil;
if (AMode = TCldrParserMode.FullParsing) then begin
// statementList is a growable buffer (initial capacity 15); sal counts the
// used entries and statement points at the entry currently being filled.
SetLength(statementList,15);
sal := 0;
statement := @statementList[0];
// An optional suppress_contractions value is parsed first; its entry is kept
// only when ParseDeletion reports a positive result, otherwise it is cleared
// and reused.
s := EvaluateXPathStr('suppress_contractions',ACollationNode);
if (s <> '') then begin
if (ParseDeletion(s,statement) > 0) then begin
Inc(sal);
Inc(statement);
end else begin
statement^.Clear();
end;
end;
n := ACollationNode.FindNode(s_RULES);
if (n <> nil) then begin
rulesElement := n as TDOMElement;
c := rulesElement.ChildNodes.Count;
nextPos := 0;
i := 0;
// ParseStatement is called starting at child index i and hands back the
// index to continue from in nextPos; the loop stops at the first failure.
while (i < c) do begin
statement^.Clear();
if not ParseStatement(rulesElement,i,statement,nextPos) then
Break;
i := nextPos;
Inc(statement);
Inc(sal);
// Buffer full: double it. SetLength may move the array, so the pointer has
// to be recomputed from the array afterwards.
// NOTE(review): before this branch statement tracks statementList[sal]; the
// re-pointing uses index (sal-1), which looks like it would make the next
// iteration clear and overwrite the statement just parsed - confirm.
if (sal >= Length(statementList)) then begin
SetLength(statementList,(sal*2));
statement := @statementList[(sal-1)];
end;
end;
end;
// Trim the buffer to the number of counted entries and publish it.
SetLength(statementList,sal);
AItem.Rules := statementList;
end;
// NOTE(review): the next two statements are the pre-change (unguarded)
// versions of the two lines just above, as rendered by the diff.
SetLength(statementList,sal);
AItem.Rules := statementList;
end;
procedure ParseCollationDocument(ADoc : TDOMDocument; ACollation : TCldrCollation);
// Document-level parser: for every node named s_COLLATION a new
// TCldrCollationItem is created, parsed with AMode and added to ACollation.
// NOTE(review): most of the body (remaining locals, lookup of the node list
// "nl", the loop header and the cleanup code) is hidden behind the diff hunk
// headers below.
procedure ParseCollationDocument(
ADoc : TDOMDocument;
ACollation : TCldrCollation;
AMode : TCldrParserMode
);
var
rulesNodes, n : TDOMNode;
collationsElement, rulesElement : TDOMElement;
@ -576,7 +606,7 @@ begin
n := nl[i];
if (n.NodeName = s_COLLATION) then begin
item := TCldrCollationItem.Create();
// NOTE(review): the two-argument call on the next line is the pre-change
// form as rendered by the diff; the call that passes AMode replaces it.
ParseCollationItem((n as TDOMElement),item);
ParseCollationItem((n as TDOMElement),item,AMode);
ACollation.Add(item);
// presumably the item now belongs to ACollation and is set to nil so that
// cleanup code outside this view does not free it - TODO confirm
item := nil;
end
@ -588,6 +618,25 @@ begin
end;
end;
{ Parses a single collation of ADoc into ACollation.

  The collation is located with the XPath expression
  collations/collation[@type=<AType>] evaluated on the document element.
  When nothing matches, an Exception carrying sCollationTypeNotFound is
  raised and ACollation is left as it was. Otherwise ACollation is cleared
  and filled from the first matching node in FullParsing mode.
  The XPath result is always released. }
procedure ParseCollationDocument(
  ADoc       : TDOMDocument;
  ACollation : TCldrCollationItem;
  AType      : string
);
var
  xpathQuery  : string;
  queryResult : TXPathVariable;
  matchedNode : TDOMNode;
begin
  xpathQuery := Format('collations/collation[@type=%s]',[QuotedStr(AType)]);
  queryResult := EvaluateXPathExpression(xpathQuery,ADoc.DocumentElement);
  try
    if (queryResult.AsNodeSet.Count = 0) then
      raise Exception.CreateFmt(sCollationTypeNotFound,[AType]);
    // Only wipe the target once we know there is something to parse.
    ACollation.Clear();
    matchedNode := TDOMNode(queryResult.AsNodeSet[0]);
    ParseCollationItem(
      (matchedNode as TDOMElement),
      ACollation,
      TCldrParserMode.FullParsing
    );
  finally
    queryResult.Free();
  end;
end;
function ReadXMLFile(f: TStream) : TXMLDocument;
var
src : TXMLInputSource;
@ -618,17 +667,37 @@ begin
end;
end;
procedure ParseCollationDocument(const AFileName : string; ACollation : TCldrCollation);
// File-based overload: loads AFileName with ReadXMLFile, parses the document
// into ACollation using AMode, then sets ACollation.LocalID to the file name
// without its extension. The document is freed even when parsing raises.
procedure ParseCollationDocument(
const AFileName : string;
ACollation : TCldrCollation;
AMode : TCldrParserMode
);
var
doc : TXMLDocument;
begin
doc := ReadXMLFile(AFileName);
try
// NOTE(review): the two-argument call on the next line is the pre-change
// form as rendered by the diff; the call that passes AMode replaces it.
ParseCollationDocument(doc,ACollation);
ParseCollationDocument(doc,ACollation,AMode);
// ChangeFileExt(...,'') drops the extension, ExtractFileName drops the path.
ACollation.LocalID := ExtractFileName(ChangeFileExt(AFileName,''));
finally
doc.Free();
end;
end;
{ File-based convenience overload: loads the XML document stored in
  AFileName and parses the collation whose type is AType into ACollation
  by delegating to the document-based overload.
  The loaded document is released whether or not parsing succeeds. }
procedure ParseCollationDocument(
  const AFileName  : string;
        ACollation : TCldrCollationItem;
        AType      : string
);
var
  xmlDoc : TXMLDocument;
begin
  xmlDoc := ReadXMLFile(AFileName);
  try
    ParseCollationDocument(xmlDoc,ACollation,AType);
  finally
    xmlDoc.Free();
  end;
end;
end.