pas2js/tools/extractlang/langextractor.pp
2022-10-18 10:01:18 +02:00

508 lines
13 KiB
ObjectPascal

{
This file is part of the Pas2JS run time library.
Copyright (c) 2019 by Michael Van Canneyt
Unit to extract data-translate tags from a HTML file and create a JSON file from it.
See the file COPYING.FPC, included in this distribution,
for details about the copyright.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
**********************************************************************}
Unit langextractor;
{$mode objfpc}{$H+}
interface
uses
Classes, Contnrs, SysUtils, StrUtils, sax, sax_html, fpjson;
Type
TFileMode = (fmSingle,fmMultiple);
TLogEvent = Procedure(Sender : TObject; Const Msg : String) of object;
ETranslate = Class(Exception);
TTranslations = Class(TObject)
Strings : Array of string;
Used : Boolean;
end;
{ THTMLLangExtractor }
THTMLLangExtractor = Class(TComponent)
private
// Used in CollectFileNamesAndTexts...
FCurrent,
// texts in language used in HTML
FLangObjects : TJSONObject;
FFileMode: TFileMode;
FOutputFileName: String;
FCleanOutput: Boolean;
FMiniFied: Boolean;
FRecurse: Boolean;
FSingleScope: String;
FTagName: String;
// Map of language - JSON object
FTranslations : TFPObjectList;
FHTMLDir: String;
FCurrentName:String;
FCurrentCount: Integer;
FOnLog: TLogEvent;
FLanguages: String;
FTrash: Boolean;
procedure DoEndElement({%H-}Sender: TObject; const {%H-}NamespaceURI, {%H-}LocalName, {%H-}QName: SAXString);
procedure DoStartElement(Sender: TObject; const {%H-}NamespaceURI, {%H-}LocalName, {%H-}QName: SAXString; Atts: TSAXAttributes);
procedure DoTextElement({%H-}Sender: TObject; const ch: PSAXChar; {%H-}AStart, ALength: Integer);
function GetLanguageFile(aLang: String): String;
function GetTagName: String;
procedure LoadExistingFiles;
procedure CreateLanguageNodes;
function LoadFile(const aFileName: string): TJSONObject;
Protected
procedure AddString(const aName, aValue: String);
procedure CollectHTMLFileNamesAndTexts(const aFileName: String);
procedure CopyMissingWords;
procedure CopyWords(SrcScope, DestScope: TJSONObject; aList: TStrings);
Procedure Log(Const Msg : String); overload;
Procedure Log(Const Fmt : String; Const Args : Array of const); overload;
Procedure CollectHTMLNamesAndTexts(Const aDir : string);
Procedure CreateLanguageFiles;
Public
Constructor Create(aOwner : TComponent); override;
Destructor Destroy; override;
Procedure Execute;
// ClearOutput
Property CleanOutput : Boolean Read FCleanOutput Write FCleanOutput;
// HTML Files that need translation
Property HTMLDir : String Read FHTMLDir Write FHTMLDir;
// File for JSON file(s) with translations
Property OutputFileName : String Read FOutputFileName Write FOutputFileName;
// Emit Log messages
Property OnLog : TLogEvent Read FOnLog Write FOnlog;
// Minified language constants
Property Minified : Boolean Read FMiniFied Write FMinified;
// TagName (data-tag)
Property TagName : String Read GetTagName Write FTagName;
// Trash new values in translations.
Property TrashNewValues : Boolean Read FTrash Write FTrash;
// Single/Multiple files
Property OutputFileMode : TFileMode Read FFileMode Write FFileMode;
// Languages: comma-separated list. First is the input language (en)
Property Languages: String Read FLanguages Write FLanguages;
// Recurse : Boolean;
Property Recurse: Boolean Read FRecurse Write FRecurse;
// SingleScope : If this is set, all identifiers are set in a single scope.
Property SingleScope : String Read FSingleScope Write FSingleScope;
end;
implementation
{ THTMLLangExtractor }
procedure THTMLLangExtractor.Log(const Msg: String);
begin
if Assigned(FOnLog) then
FOnLog(Self,Msg);
end;
procedure THTMLLangExtractor.Log(const Fmt: String; const Args: array of const);
begin
Log(Format(Fmt,Args));
end;
procedure THTMLLangExtractor.DoStartElement(Sender: TObject; const {%H-}NamespaceURI, LocalName, {%H-}QName: SAXString; Atts: TSAXAttributes);
Var
aID,aTerm,aAttr : String;
I,P,aCount : Integer;
begin
if Not Assigned(atts) then exit;
aID:=UTF8Encode(Atts.GetValue('','data-'+Utf8Decode(tagname)));
if (aID='') then
exit;
aCount:=WordCount(aID,[';']);
FcurrentName:='';
for I:=1 to aCount do
begin
aTerm:=ExtractWord(I,aID,[';']);
P:=Pos('-',aTerm);
if (P=0) then
begin
if FCurrentName='' then
FCurrentName:=aID
else
Log('Translate element "%s" contains 2 IDs: "%s" "%s". Ignoring 2nd ',[aID,FCurrentName,aTerm]);
end
else
begin
aAttr:=Copy(aTerm,P+1);
AddString(aTerm,UTF8Encode(Atts.GetValue('',UTF8Decode(aAttr))));
end;
end;
end;
procedure THTMLLangExtractor.DoTextElement(Sender: TObject; const ch: PSAXChar; AStart, ALength: Integer);
Var
S : String;
W : UnicodeString;
begin
if FCurrentName='' then exit;
W:='';
SetLength(W,aLength);
Move(ch^,W[1],aLength*SizeOf(WideChar));
S:=Trim(UTF8Encode(W));
AddString(FCurrentName,S);
end;
procedure THTMLLangExtractor.AddString(const aName, aValue: String);
Var
Idx : Integer;
Old : String;
begin
Idx:=FCurrent.IndexOfName(aName,True);
If Idx<>-1 then
begin
Old:=FCurrent.Items[idx].AsString;
if (Old<>aValue) then
Log('Ignoring duplicate name %s. Old text = "%s", new = "%s"',[aName, Old, aValue]);
end
else
begin
FCurrent.Strings[aName]:=aValue;
FCurrentName:='';
Inc(FCurrentCount);
end;
end;
procedure THTMLLangExtractor.CollectHTMLFileNamesAndTexts(const aFileName : String);
Var
MyReader : THTMLReader;
F : TFileStream;
aScope : string;
begin
if SingleScope<>'' then
aScope:=SingleScope
else
aScope:=LowerCase(ChangeFileExt(ExtractFileName(aFileName),''));
Log('Searching %s for translatable terms, adding to scope : %s',[aFileName,aScope]);
if (FLangObjects.Items[0] as TJSONObject).IndexOfName(aScope)<>-1 then
FCurrent:=(FLangObjects.Items[0] as TJSONObject).Objects[aScope]
else
begin
FCurrent:=TJSONObject.Create;
// Add scope to default language
(FLangObjects.Items[0] as TJSONObject).Add(aScope,FCurrent);
end;
FCurrentCount:=0;
MyReader:=nil;
F:=TFileStream.Create(aFileName,fmOpenRead or fmShareDenyNone);
Try
MyReader:=THTMLReader.Create;
MyReader.OnStartElement:=@DoStartElement;
MyReader.OnCharacters:=@DoTextElement;
MyReader.OnEndElement:=@DoEndElement;
MyReader.ParseStream(F);
Log('Found %d translatable terms',[FCurrentCount]);
finally
FreeAndNil(MyReader);
FreeAndNil(F);
end;
end;
procedure THTMLLangExtractor.DoEndElement(Sender: TObject; const NamespaceURI, LocalName, QName: SAXString);
begin
FCurrentName:='';
end;
procedure THTMLLangExtractor.CollectHTMLNamesAndTexts(const aDir: string);
Var
Info : TSearchRec;
begin
// HTML files
If FindFirst(aDir+'*.html',0,Info)=0 then
try
Repeat
CollectHTMLFileNamesAndTexts(aDir+Info.Name);
Until FindNext(Info)<>0;
finally
FindClose(Info);
end;
// Subdirs
if Recurse then
If FindFirst(aDir+'*',faDirectory,Info)=0 then
try
Repeat
With Info do
if ((Attr and faDirectory)<>0) and (Name<>'.') and (Name<>'..') then
CollectHTMLNamesAndTexts(IncludeTrailingPathDelimiter(aDir+Name));
Until FindNext(Info)<>0;
finally
FindClose(Info);
end;
end;
function THTMLLangExtractor.GetLanguageFile(aLang: String): String;
Var
Ext : String;
begin
Ext:=ExtractFileExt(OutputFileName);
Result:=ChangeFileExt(OutputFileName,'-'+aLang+Ext);
end;
function THTMLLangExtractor.GetTagName: String;
begin
Result:=FTagName;
if Result='' then
Result:='translate';
end;
procedure THTMLLangExtractor.CreateLanguageFiles;
Function GetAsJSON(aObject : TJSONObject) : string;
begin
if FMinified then
Result:=aObject.AsJSON
else
Result:=aObject.FormatJSON
end;
Var
I : Integer;
S : TStringStream;
begin
if FFileMode=fmSingle then
begin
S:=TstringStream.Create(GetAsJSON(FLangObjects),TEncoding.UTF8);
try
S.SaveToFile(OutputFileName);
finally
S.Free;
end;
end
else
begin
For I:=0 to FLangObjects.Count-1 do
begin
S:=TstringStream.Create(GetAsJSON(FLangObjects.Items[i] as TJSONObject),TEncoding.UTF8);
try
S.SaveToFile(GetLanguageFile(FLangObjects.Names[i]));
finally
S.Free;
end;
end;
end;
end;
constructor THTMLLangExtractor.Create(aOwner: TComponent);
begin
inherited Create(aOwner);
FLangObjects:=TJSONObject.Create;
FTranslations:=TFPObjectList.Create(True);
end;
destructor THTMLLangExtractor.Destroy;
begin
FreeAndNil(FTranslations);
FreeAndNil(FLangObjects);
inherited Destroy;
end;
procedure THTMLLangExtractor.CopyWords(SrcScope,DestScope : TJSONObject; aList : TStrings);
Var
I : Integer;
aName,aValue : String;
begin
For I:=0 to SrcScope.Count-1 do
begin
aName:=SrcScope.Names[I];
if DestScope.IndexOfName(aName)=-1 then
begin
if TrashNewValues then
aValue:='生词'+IntToStr(i)
else
aValue:=SrcScope.Items[I].AsString;
DestScope.Add(aName,aValue);
if Assigned(aList) then
aList.Add(aName);
end;
end;
end;
procedure THTMLLangExtractor.CopyMissingWords;
Var
I,J,aSectionWordCount,aSectionCount : Integer;
NewWords : TStringList;
Src,Dest,SrcScope,DestScope : TJSONObject;
NewSection : Boolean;
aScope : String;
begin
aSectionCount:=0;
aSectionWordCount:=0;
NewWords:=TstringList.Create;
Try
NewWords.Sorted:=True;
NewWords.Duplicates:=dupIgnore;
Src:=FLangObjects.Items[0] as TJSONObject;
// Copy all scopes
For I:=0 to Src.Count-1 do
begin
aScope:=Src.Names[I];
SrcScope:=Src.Items[i] as TJSONObject;
NewSection:=False;
For J:=1 to FLangObjects.Count-1 do
begin
Dest:=FLangObjects.Items[J] as TJSONObject;
If (Dest.IndexOfName(aScope)=-1) then
begin
NewSection:=true;
if TrashNewValues then
begin
DestScope:=TJSONObject.Create;
Dest.Add(aScope,DestScope);
CopyWords(SrcScope,DestScope,Nil);
end
else
Dest.Add(aScope,Src.Items[I].Clone);
end
else
begin
DestScope:=Dest.Objects[aScope] as TJSONObject;
CopyWords(SrcScope,DestScope,NewWords);
end;
end;
If NewSection then
begin
Inc(aSectionCount);
Inc(aSectionWordCount,SrcScope.Count);
end;
end;
Log('Copied %d new scopes with %d words, added %d new words in existing scopes.',[aSectionCount,aSectionWordCount,NewWords.Count])
finally
NewWords.Free;
end;
end;
function THTMLLangExtractor.LoadFile(const aFileName: string): TJSONObject;
Var
F : TFileStream;
D : TJSONData;
begin
Log('Loading existing file "%s"',[aFileName]);
F:=TFileStream.Create(aFileName,fmOpenRead or fmShareDenyWrite);
try
D:=GetJSON(F);
if D is TJSONObject then
begin
Result:=D as TJSONObject;
D:=Nil;
end
else
begin
Log('File "%s" does not contain valid JSON',[aFileName]);
Result:=TJSONObject.Create;
end;
finally
D.Free;
F.Free;
end;
end;
procedure THTMLLangExtractor.LoadExistingFiles;
Var
I : Integer;
Obj : TJSONObject;
aLang : String;
begin
// Load global file, if any
if (OutputFileMode=fmSingle) and FileExists(OutputFileName) then
begin
Obj:=LoadFile(OutputFileName);
FreeAndNil(FLangObjects);
FLangObjects:=Obj;
end;
// Add all languages
for I:=1 to WordCount(Languages,[',']) do
begin
aLang:=ExtractWord(I,Languages,[',']);
if (OutputFileMode=fmMultiple) and FileExists(GetLanguageFile(aLang)) then
FLangObjects.Add(aLang,LoadFile(GetLanguageFile(aLang)))
else if FLangObjects.IndexOfName(aLang)=-1 then
FLangObjects.Add(aLang,TJSONObject.Create)
end;
end;
Procedure THTMLLangExtractor.CreateLanguageNodes;
var
I : Integer;
aLang : String;
begin
FreeAndNil(FLangObjects);
FLangObjects:=TJSONObject.Create;
// Add all languages
for I:=1 to WordCount(Languages,[',']) do
begin
aLang:=ExtractWord(I,Languages,[',']);
if FLangObjects.IndexOfName(aLang)=-1 then
FLangObjects.Add(aLang,TJSONObject.Create)
end;
end;
procedure THTMLLangExtractor.Execute;
Var
aCount : Integer;
begin
if Languages='' then
Languages:='en';
if not CleanOutput then
LoadExistingFiles
else
CreateLanguageNodes;
if (HTMLDir<>'') then
CollectHTMLNamesAndTexts(IncludeTrailingPathDelimiter(HTMLDir));
aCount:=FLangObjects.Items[0].Count;
Log('Collected %d message scopes',[aCount]);
CopyMissingWords;
CreateLanguageFiles;
end;
end.