* Translate attribute extraction tool

This commit is contained in:
Michaël Van Canneyt 2022-10-18 09:45:36 +02:00
parent 48aa6209c7
commit 64c9e3c5b3
3 changed files with 694 additions and 0 deletions

View File

@ -0,0 +1,56 @@
<?xml version="1.0" encoding="UTF-8"?>
<CONFIG>
<ProjectOptions>
<Version Value="12"/>
<General>
<Flags>
<MainUnitHasCreateFormStatements Value="False"/>
<MainUnitHasTitleStatement Value="False"/>
<MainUnitHasScaledStatement Value="False"/>
<UseDefaultCompilerOptions Value="True"/>
</Flags>
<SessionStorage Value="InProjectDir"/>
<Title Value="extractlang"/>
<UseAppBundle Value="False"/>
<ResourceType Value="res"/>
</General>
<BuildModes>
<Item Name="Default" Default="True"/>
</BuildModes>
<PublishOptions>
<Version Value="2"/>
<UseFileFilters Value="True"/>
</PublishOptions>
<RunParams>
<FormatVersion Value="2"/>
</RunParams>
<Units>
<Unit>
<Filename Value="extractlang.lpr"/>
<IsPartOfProject Value="True"/>
</Unit>
</Units>
</ProjectOptions>
<CompilerOptions>
<Version Value="11"/>
<Target>
<Filename Value="extractlang"/>
</Target>
<SearchPaths>
<UnitOutputDirectory Value="lib/$(TargetCPU)-$(TargetOS)"/>
</SearchPaths>
</CompilerOptions>
<Debugging>
<Exceptions>
<Item>
<Name Value="EAbort"/>
</Item>
<Item>
<Name Value="ECodetoolError"/>
</Item>
<Item>
<Name Value="EFOpenError"/>
</Item>
</Exceptions>
</Debugging>
</CONFIG>

View File

@ -0,0 +1,131 @@
{
This file is part of the Pas2JS run time library.
Copyright (c) 2019 by Michael Van Canneyt
Program to extract data-translate tags from a HTML file.
See the file COPYING.FPC, included in this distribution,
for details about the copyright.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
**********************************************************************}
program extractlang;
{$mode objfpc}{$H+}
uses
{$IFDEF UNIX}
cwstring,
{$ENDIF}
Classes, SysUtils, CustApp, jsonparser, langextractor;
type
{ TExtractLangApplication }
TExtractLangApplication = class(TCustomApplication)
private
procedure Logger({%H-}Sender: TObject; const Msg: String);
protected
FExtractor : THTMLLangExtractor;
procedure DoRun; override;
public
constructor Create(TheOwner: TComponent); override;
destructor Destroy; override;
procedure Usage(Const Msg : String); virtual;
end;
{ TExtractLangApplication }
procedure TExtractLangApplication.Logger(Sender: TObject; const Msg: String);
begin
Writeln(Msg);
end;
procedure TExtractLangApplication.DoRun;
var
ErrorMsg: String;
begin
Terminate;
ErrorMsg:=CheckOptions('cd:f:hl:mn:o:ts:r', ['clear','file-mode','help','html-dir','languages','minify','name','output','recurse','single-scope','trash-values']);
if (ErrorMsg<>'') or HasOption('h','help') then
begin
Usage(ErrorMsg);
exit;
end;
With FExtractor do
begin
OnLog:=@Logger;
HTMLDir:=GetOptionValue('d','html-dir');
OutputFileName:=GetOptionValue('o','output');
Languages:=GetOptionValue('l','languages');
Minified:=HasOption('m','minify');
TrashNewValues:=HasOption('t','trash-values');
SingleScope:=GetOptionValue('s','single-scope');
CleanOutput:=HasOption('c','clear');
Recurse:=HasOption('r','recurse');
TagName:=GetOptionValue('n','name');
if (HTMLDir='') or (OutputFileName='') then
Usage('Need input dir and output filename');
if HasOption('f','file-mode') then
Case LowerCase(GetOptionValue('f','file-mode')) of
'single':
OutputFileMode:=fmSingle;
'multiple',
'multi':
OutputFileMode:=fmMultiple;
else
OutputFileMode:=fmSingle;
end;
TrashNewValues:=HasOption('t','trash-values');
Execute;
end;
end;
constructor TExtractLangApplication.Create(TheOwner: TComponent);
begin
inherited Create(TheOwner);
StopOnException:=True;
FExtractor:=THTMLLangExtractor.Create(Self);
end;
destructor TExtractLangApplication.Destroy;
begin
FreeAndNil(FExtractor);
inherited Destroy;
end;
procedure TExtractLangApplication.Usage(const Msg: String);
begin
if Msg<>'' then
Writeln('Error : ',Msg);
Writeln('Usage: ', ExeName, ' [options]');
Writeln('Where options is one or more of:');
Writeln('-h --help This help text');
Writeln('-c --clear Clear output JSON file (Default is to update existing output file).');
Writeln('-d --html-dir=DIR Directory with HTML files to scan (recursively)');
Writeln('-f --file-mode=MODE Set file mode: one of single or multiple');
Writeln('-o --output=FILE File to write JSON translations (may get suffix depending on file mode)');
Writeln('-l --languages=LIST Comma-separated list of languages to create');
Writeln('-m --minify Minify output');
Writeln('-n --name=NAME Set name of data-tag to NAME (data-NAME)');
Writeln('-r --recurse Recurse into subdirectories of the HTML directory');
Writeln('-s --single-scope=SCOPE Put all translation names in a single scope');
Writeln('-t --trash-values Trash values for other languages');
ExitCode:=Ord(Msg<>'');
Halt;
end;
var
Application: TExtractLangApplication;
begin
Application:=TExtractLangApplication.Create(nil);
Application.Title:='Extract data-translate tag application';
Application.Run;
Application.Free;
end.

View File

@ -0,0 +1,507 @@
{
This file is part of the Pas2JS run time library.
Copyright (c) 2019 by Michael Van Canneyt
Unit to extract data-translate tags from a HTML file and create a JSON file from it.
See the file COPYING.FPC, included in this distribution,
for details about the copyright.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
**********************************************************************}
Unit langextractor;
{$mode objfpc}{$H+}
interface
uses
Classes, Contnrs, SysUtils, StrUtils, sax, sax_html, fpjson;
Type
TFileMode = (fmSingle,fmMultiple);
TLogEvent = Procedure(Sender : TObject; Const Msg : String) of object;
ETranslate = Class(Exception);
TTranslations = Class(TObject)
Strings : Array of string;
Used : Boolean;
end;
{ THTMLLangExtractor }
THTMLLangExtractor = Class(TComponent)
private
// Used in CollectFileNamesAndTexts...
FCurrent,
// texts in language used in HTML
FLangObjects : TJSONObject;
FFileMode: TFileMode;
FOutputFileName: String;
FCleanOutput: Boolean;
FMiniFied: Boolean;
FRecurse: Boolean;
FSingleScope: String;
FTagName: String;
// Map of language - JSON object
FTranslations : TFPObjectList;
FHTMLDir: String;
FCurrentName:String;
FCurrentCount: Integer;
FOnLog: TLogEvent;
FLanguages: String;
FTrash: Boolean;
procedure DoEndElement({%H-}Sender: TObject; const {%H-}NamespaceURI, {%H-}LocalName, {%H-}QName: SAXString);
procedure DoStartElement(Sender: TObject; const {%H-}NamespaceURI, {%H-}LocalName, {%H-}QName: SAXString; Atts: TSAXAttributes);
procedure DoTextElement({%H-}Sender: TObject; const ch: PSAXChar; {%H-}AStart, ALength: Integer);
function GetLanguageFile(aLang: String): String;
function GetTagName: String;
procedure LoadExistingFiles;
procedure CreateLanguageNodes;
function LoadFile(const aFileName: string): TJSONObject;
Protected
procedure AddString(const aName, aValue: String);
procedure CollectHTMLFileNamesAndTexts(const aFileName: String);
procedure CopyMissingWords;
procedure CopyWords(SrcScope, DestScope: TJSONObject; aList: TStrings);
Procedure Log(Const Msg : String); overload;
Procedure Log(Const Fmt : String; Const Args : Array of const); overload;
Procedure CollectHTMLNamesAndTexts(Const aDir : string);
Procedure CreateLanguageFiles;
Public
Constructor Create(aOwner : TComponent); override;
Destructor Destroy; override;
Procedure Execute;
// ClearOutput
Property CleanOutput : Boolean Read FCleanOutput Write FCleanOutput;
// HTML Files that need translation
Property HTMLDir : String Read FHTMLDir Write FHTMLDir;
// File for JSON file(s) with translations
Property OutputFileName : String Read FOutputFileName Write FOutputFileName;
// Emit Log messages
Property OnLog : TLogEvent Read FOnLog Write FOnlog;
// Minified language constants
Property Minified : Boolean Read FMiniFied Write FMinified;
// TagName (data-tag)
Property TagName : String Read GetTagName Write FTagName;
// Trash new values in translations.
Property TrashNewValues : Boolean Read FTrash Write FTrash;
// Single/Multiple files
Property OutputFileMode : TFileMode Read FFileMode Write FFileMode;
// Languages: comma-separated list. First is the input language (en)
Property Languages: String Read FLanguages Write FLanguages;
// Recurse : Boolean;
Property Recurse: Boolean Read FRecurse Write FRecurse;
// SingleScope : If this is set, all identifiers are set in a single scope.
Property SingleScope : String Read FSingleScope Write FSingleScope;
end;
implementation
{ THTMLLangExtractor }
procedure THTMLLangExtractor.Log(const Msg: String);
begin
if Assigned(FOnLog) then
FOnLog(Self,Msg);
end;
procedure THTMLLangExtractor.Log(const Fmt: String; const Args: array of const);
begin
Log(Format(Fmt,Args));
end;
procedure THTMLLangExtractor.DoStartElement(Sender: TObject; const {%H-}NamespaceURI, LocalName, {%H-}QName: SAXString; Atts: TSAXAttributes);
Var
aID,aTerm,aAttr : String;
I,P,aCount : Integer;
begin
if Not Assigned(atts) then exit;
aID:=UTF8Encode(Atts.GetValue('','data-'+Utf8Decode(tagname)));
if (aID='') then
exit;
aCount:=WordCount(aID,[';']);
FcurrentName:='';
for I:=1 to aCount do
begin
aTerm:=ExtractWord(I,aID,[';']);
P:=Pos('-',aTerm);
if (P=0) then
begin
if FCurrentName='' then
FCurrentName:=aID
else
Log('Translate element "%s" contains 2 IDs: "%s" "%s". Ignoring 2nd ',[aID,FCurrentName,aTerm]);
end
else
begin
aAttr:=Copy(aTerm,P+1);
AddString(aTerm,UTF8Encode(Atts.GetValue('',UTF8Decode(aAttr))));
end;
end;
end;
procedure THTMLLangExtractor.DoTextElement(Sender: TObject; const ch: PSAXChar; AStart, ALength: Integer);
Var
S : String;
W : UnicodeString;
begin
if FCurrentName='' then exit;
W:='';
SetLength(W,aLength);
Move(ch^,W[1],aLength*SizeOf(WideChar));
S:=Trim(UTF8Encode(W));
AddString(FCurrentName,S);
end;
procedure THTMLLangExtractor.AddString(const aName, aValue: String);
Var
Idx : Integer;
Old : String;
begin
Idx:=FCurrent.IndexOfName(aName,True);
If Idx<>-1 then
begin
Old:=FCurrent.Items[idx].AsString;
if (Old<>aValue) then
Log('Ignoring duplicate name %s. Old text = "%s", new = "%s"',[aName, Old, aValue]);
end
else
begin
FCurrent.Strings[aName]:=aValue;
FCurrentName:='';
Inc(FCurrentCount);
end;
end;
procedure THTMLLangExtractor.CollectHTMLFileNamesAndTexts(const aFileName : String);
Var
MyReader : THTMLReader;
F : TFileStream;
aScope : string;
begin
if SingleScope<>'' then
aScope:=SingleScope
else
aScope:=LowerCase(ChangeFileExt(ExtractFileName(aFileName),''));
Log('Searching %s for translatable terms, adding to scope : %s',[aFileName,aScope]);
if (FLangObjects.Items[0] as TJSONObject).IndexOfName(aScope)<>-1 then
FCurrent:=(FLangObjects.Items[0] as TJSONObject).Objects[aScope]
else
begin
FCurrent:=TJSONObject.Create;
// Add scope to default language
(FLangObjects.Items[0] as TJSONObject).Add(aScope,FCurrent);
end;
FCurrentCount:=0;
MyReader:=nil;
F:=TFileStream.Create(aFileName,fmOpenRead or fmShareDenyNone);
Try
MyReader:=THTMLReader.Create;
MyReader.OnStartElement:=@DoStartElement;
MyReader.OnCharacters:=@DoTextElement;
MyReader.OnEndElement:=@DoEndElement;
MyReader.ParseStream(F);
Log('Found %d translatable terms',[FCurrentCount]);
finally
FreeAndNil(MyReader);
FreeAndNil(F);
end;
end;
procedure THTMLLangExtractor.DoEndElement(Sender: TObject; const NamespaceURI, LocalName, QName: SAXString);
begin
FCurrentName:='';
end;
procedure THTMLLangExtractor.CollectHTMLNamesAndTexts(const aDir: string);
Var
Info : TSearchRec;
begin
// HTML files
If FindFirst(aDir+'*.html',0,Info)=0 then
try
Repeat
CollectHTMLFileNamesAndTexts(aDir+Info.Name);
Until FindNext(Info)<>0;
finally
FindClose(Info);
end;
// Subdirs
if Recurse then
If FindFirst(aDir+'*',faDirectory,Info)=0 then
try
Repeat
With Info do
if ((Attr and faDirectory)<>0) and (Name<>'.') and (Name<>'..') then
CollectHTMLNamesAndTexts(IncludeTrailingPathDelimiter(aDir+Name));
Until FindNext(Info)<>0;
finally
FindClose(Info);
end;
end;
function THTMLLangExtractor.GetLanguageFile(aLang: String): String;
Var
Ext : String;
begin
Ext:=ExtractFileExt(OutputFileName);
Result:=ChangeFileExt(OutputFileName,'-'+aLang+Ext);
end;
function THTMLLangExtractor.GetTagName: String;
begin
Result:=FTagName;
if Result='' then
Result:='translate';
end;
procedure THTMLLangExtractor.CreateLanguageFiles;
Function GetAsJSON(aObject : TJSONObject) : string;
begin
if FMinified then
Result:=aObject.AsJSON
else
Result:=aObject.FormatJSON
end;
Var
I : Integer;
S : TStringStream;
begin
if FFileMode=fmSingle then
begin
S:=TstringStream.Create(GetAsJSON(FLangObjects),TEncoding.UTF8);
try
S.SaveToFile(OutputFileName);
finally
S.Free;
end;
end
else
begin
For I:=0 to FLangObjects.Count-1 do
begin
S:=TstringStream.Create(GetAsJSON(FLangObjects.Items[i] as TJSONObject),TEncoding.UTF8);
try
S.SaveToFile(GetLanguageFile(FLangObjects.Names[i]));
finally
S.Free;
end;
end;
end;
end;
constructor THTMLLangExtractor.Create(aOwner: TComponent);
begin
inherited Create(aOwner);
FLangObjects:=TJSONObject.Create;
FTranslations:=TFPObjectList.Create(True);
end;
destructor THTMLLangExtractor.Destroy;
begin
FreeAndNil(FTranslations);
FreeAndNil(FLangObjects);
inherited Destroy;
end;
procedure THTMLLangExtractor.CopyWords(SrcScope,DestScope : TJSONObject; aList : TStrings);
Var
I : Integer;
aName,aValue : String;
begin
For I:=0 to SrcScope.Count-1 do
begin
aName:=SrcScope.Names[I];
if DestScope.IndexOfName(aName)=-1 then
begin
if TrashNewValues then
aValue:='生词'+IntToStr(i)
else
aValue:=SrcScope.Items[I].AsString;
DestScope.Add(aName,aValue);
if Assigned(aList) then
aList.Add(aName);
end;
end;
end;
procedure THTMLLangExtractor.CopyMissingWords;
Var
I,J,aSectionWordCount,aSectionCount : Integer;
NewWords : TStringList;
Src,Dest,SrcScope,DestScope : TJSONObject;
NewSection : Boolean;
aScope : String;
begin
aSectionCount:=0;
aSectionWordCount:=0;
NewWords:=TstringList.Create;
Try
NewWords.Sorted:=True;
NewWords.Duplicates:=dupIgnore;
Src:=FLangObjects.Items[0] as TJSONObject;
// Copy all scopes
For I:=0 to Src.Count-1 do
begin
aScope:=Src.Names[I];
SrcScope:=Src.Items[i] as TJSONObject;
NewSection:=False;
For J:=1 to FLangObjects.Count-1 do
begin
Dest:=FLangObjects.Items[J] as TJSONObject;
If (Dest.IndexOfName(aScope)=-1) then
begin
NewSection:=true;
if TrashNewValues then
begin
DestScope:=TJSONObject.Create;
Dest.Add(aScope,DestScope);
CopyWords(SrcScope,DestScope,Nil);
end
else
Dest.Add(aScope,Src.Items[I].Clone);
end
else
begin
DestScope:=Dest.Objects[aScope] as TJSONObject;
CopyWords(SrcScope,DestScope,NewWords);
end;
end;
If NewSection then
begin
Inc(aSectionCount);
Inc(aSectionWordCount,SrcScope.Count);
end;
end;
Log('Copied %d new scopes with %d words, added %d new words in existing scopes.',[aSectionCount,aSectionWordCount,NewWords.Count])
finally
NewWords.Free;
end;
end;
function THTMLLangExtractor.LoadFile(const aFileName: string): TJSONObject;
Var
F : TFileStream;
D : TJSONData;
begin
Log('Loading existing file "%s"',[aFileName]);
F:=TFileStream.Create(aFileName,fmOpenRead or fmShareDenyWrite);
try
D:=GetJSON(F);
if D is TJSONObject then
begin
Result:=D as TJSONObject;
D:=Nil;
end
else
begin
Log('File "%s" does not contain valid JSON',[aFileName]);
Result:=TJSONObject.Create;
end;
finally
D.Free;
F.Free;
end;
end;
procedure THTMLLangExtractor.LoadExistingFiles;
Var
I : Integer;
Obj : TJSONObject;
aLang : String;
begin
// Load global file, if any
if (OutputFileMode=fmSingle) and FileExists(OutputFileName) then
begin
Obj:=LoadFile(OutputFileName);
FreeAndNil(FLangObjects);
FLangObjects:=Obj;
end;
// Add all languages
for I:=1 to WordCount(Languages,[',']) do
begin
aLang:=ExtractWord(I,Languages,[',']);
if (OutputFileMode=fmMultiple) and FileExists(GetLanguageFile(aLang)) then
FLangObjects.Add(aLang,LoadFile(GetLanguageFile(aLang)))
else if FLangObjects.IndexOfName(aLang)=-1 then
FLangObjects.Add(aLang,TJSONObject.Create)
end;
end;
Procedure THTMLLangExtractor.CreateLanguageNodes;
var
I : Integer;
aLang : String;
begin
FreeAndNil(FLangObjects);
FLangObjects:=TJSONObject.Create;
// Add all languages
for I:=1 to WordCount(Languages,[',']) do
begin
aLang:=ExtractWord(I,Languages,[',']);
if FLangObjects.IndexOfName(aLang)=-1 then
FLangObjects.Add(aLang,TJSONObject.Create)
end;
end;
procedure THTMLLangExtractor.Execute;
Var
aCount : Integer;
begin
if Languages='' then
Languages:='en';
if not CleanOutput then
LoadExistingFiles
else
CreateLanguageNodes;
if (HTMLDir<>'') then
CollectHTMLNamesAndTexts(IncludeTrailingPathDelimiter(HTMLDir));
aCount:=FLangObjects.Items[0].Count;
Log('Collected %d message scopes',[aCount]);
CopyMissingWords;
CreateLanguageFiles;
end;
end.