Explorar el Código

* Translate attribute extraction tool

Michaël Van Canneyt hace 2 años
padre
commit
64c9e3c5b3

+ 56 - 0
tools/extractlang/extractlang.lpi

@@ -0,0 +1,56 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<CONFIG>
+  <ProjectOptions>
+    <Version Value="12"/>
+    <General>
+      <Flags>
+        <MainUnitHasCreateFormStatements Value="False"/>
+        <MainUnitHasTitleStatement Value="False"/>
+        <MainUnitHasScaledStatement Value="False"/>
+        <UseDefaultCompilerOptions Value="True"/>
+      </Flags>
+      <SessionStorage Value="InProjectDir"/>
+      <Title Value="extractlang"/>
+      <UseAppBundle Value="False"/>
+      <ResourceType Value="res"/>
+    </General>
+    <BuildModes>
+      <Item Name="Default" Default="True"/>
+    </BuildModes>
+    <PublishOptions>
+      <Version Value="2"/>
+      <UseFileFilters Value="True"/>
+    </PublishOptions>
+    <RunParams>
+      <FormatVersion Value="2"/>
+    </RunParams>
+    <Units>
+      <Unit>
+        <Filename Value="extractlang.lpr"/>
+        <IsPartOfProject Value="True"/>
+      </Unit>
+    </Units>
+  </ProjectOptions>
+  <CompilerOptions>
+    <Version Value="11"/>
+    <Target>
+      <Filename Value="extractlang"/>
+    </Target>
+    <SearchPaths>
+      <UnitOutputDirectory Value="lib/$(TargetCPU)-$(TargetOS)"/>
+    </SearchPaths>
+  </CompilerOptions>
+  <Debugging>
+    <Exceptions>
+      <Item>
+        <Name Value="EAbort"/>
+      </Item>
+      <Item>
+        <Name Value="ECodetoolError"/>
+      </Item>
+      <Item>
+        <Name Value="EFOpenError"/>
+      </Item>
+    </Exceptions>
+  </Debugging>
+</CONFIG>

+ 131 - 0
tools/extractlang/extractlang.lpr

@@ -0,0 +1,131 @@
+{
+    This file is part of the Pas2JS run time library.
+    Copyright (c) 2019 by Michael Van Canneyt
+    
+    Program to extract data-translate tags from a HTML file.
+
+    See the file COPYING.FPC, included in this distribution,
+    for details about the copyright.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+ **********************************************************************}
+program extractlang;
+
+{$mode objfpc}{$H+}
+
+uses
+  {$IFDEF UNIX}
+  cwstring,
+  {$ENDIF}
+  Classes, SysUtils, CustApp, jsonparser, langextractor;
+
+type
+
+  { TExtractLangApplication }
+
+  TExtractLangApplication = class(TCustomApplication)
+  private
+    procedure Logger({%H-}Sender: TObject; const Msg: String);
+  protected
+    FExtractor : THTMLLangExtractor;
+    procedure DoRun; override;
+  public
+    constructor Create(TheOwner: TComponent); override;
+    destructor Destroy; override;
+    procedure Usage(Const Msg : String); virtual;
+  end;
+
+{ TExtractLangApplication }
+
+procedure TExtractLangApplication.Logger(Sender: TObject; const Msg: String);
+begin
+  Writeln(Msg);
+end;
+
+procedure TExtractLangApplication.DoRun;
+var
+  ErrorMsg: String;
+begin
+  Terminate;
+  ErrorMsg:=CheckOptions('cd:f:hl:mn:o:ts:r', ['clear','file-mode','help','html-dir','languages','minify','name','output','recurse','single-scope','trash-values']);
+  if (ErrorMsg<>'') or HasOption('h','help') then
+    begin
+    Usage(ErrorMsg);
+    exit;
+    end;
+  With FExtractor do
+    begin
+    OnLog:=@Logger;
+    HTMLDir:=GetOptionValue('d','html-dir');
+    OutputFileName:=GetOptionValue('o','output');
+    Languages:=GetOptionValue('l','languages');
+    Minified:=HasOption('m','minify');
+    TrashNewValues:=HasOption('t','trash-values');
+    SingleScope:=GetOptionValue('s','single-scope');
+    CleanOutput:=HasOption('c','clear');
+    Recurse:=HasOption('r','recurse');
+    TagName:=GetOptionValue('n','name');
+    if (HTMLDir='') or (OutputFileName='') then
+      Usage('Need input dir and output filename');
+    if HasOption('f','file-mode') then
+      Case LowerCase(GetOptionValue('f','file-mode')) of
+       'single':
+          OutputFileMode:=fmSingle;
+       'multiple',
+       'multi':
+          OutputFileMode:=fmMultiple;
+      else
+        OutputFileMode:=fmSingle;
+      end;
+    TrashNewValues:=HasOption('t','trash-values');
+    Execute;
+    end;
+end;
+
+constructor TExtractLangApplication.Create(TheOwner: TComponent);
+begin
+  inherited Create(TheOwner);
+  StopOnException:=True;
+  FExtractor:=THTMLLangExtractor.Create(Self);
+end;
+
+destructor TExtractLangApplication.Destroy;
+begin
+  FreeAndNil(FExtractor);
+  inherited Destroy;
+end;
+
+procedure TExtractLangApplication.Usage(const Msg: String);
+begin
+  if Msg<>'' then
+     Writeln('Error : ',Msg);
+  Writeln('Usage: ', ExeName, ' [options]');
+  Writeln('Where options is one or more of:');
+  Writeln('-h --help                   This help text');
+  Writeln('-c --clear                  Clear output JSON file (Default is to update existing output file).');
+  Writeln('-d --html-dir=DIR           Directory with HTML files to scan (recursively)');
+  Writeln('-f --file-mode=MODE         Set file mode: one of single or multiple');
+  Writeln('-o --output=FILE            File to write JSON translations (may get suffix depending on file mode)');
+  Writeln('-l --languages=LIST         Comma-separated list of languages to create');
+  Writeln('-m --minify                 Minify output');
+  Writeln('-n --name=NAME              Set name of data-tag to NAME (data-NAME)');
+  Writeln('-r --recurse                Recurse into subdirectories of the HTML directory');
+  Writeln('-s --single-scope=SCOPE     Put all translation names in a single scope');
+  Writeln('-t --trash-values           Trash values for other languages');
+  ExitCode:=Ord(Msg<>'');
+  Halt;
+end;
+
+var
+  Application: TExtractLangApplication;
+
+begin
+  Application:=TExtractLangApplication.Create(nil);
+  Application.Title:='Extract data-translate tag application';
+  Application.Run;
+  Application.Free;
+end.
+

+ 507 - 0
tools/extractlang/langextractor.pp

@@ -0,0 +1,507 @@
+{
+    This file is part of the Pas2JS run time library.
+    Copyright (c) 2019 by Michael Van Canneyt
+    
+    Unit to extract data-translate tags from a HTML file and create a JSON file from it.
+
+    See the file COPYING.FPC, included in this distribution,
+    for details about the copyright.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+ **********************************************************************}
+Unit langextractor;
+
+{$mode objfpc}{$H+}
+interface
+
+uses
+  Classes, Contnrs, SysUtils, StrUtils, sax, sax_html, fpjson;
+
+Type
+  TFileMode = (fmSingle,fmMultiple);
+
+  TLogEvent = Procedure(Sender : TObject; Const Msg : String) of object;
+  ETranslate = Class(Exception);
+
+  TTranslations = Class(TObject)
+    Strings : Array of string;
+    Used : Boolean;
+  end;
+
+  { THTMLLangExtractor }
+
+  THTMLLangExtractor = Class(TComponent)
+  private
+    // Used in CollectFileNamesAndTexts...
+    FCurrent,
+    // texts in language used in HTML
+    FLangObjects : TJSONObject;
+    FFileMode: TFileMode;
+    FOutputFileName: String;
+    FCleanOutput: Boolean;
+    FMiniFied: Boolean;
+    FRecurse: Boolean;
+    FSingleScope: String;
+    FTagName: String;
+    // Map of language - JSON object
+    FTranslations : TFPObjectList;
+    FHTMLDir: String;
+    FCurrentName:String;
+    FCurrentCount: Integer;
+    FOnLog: TLogEvent;
+    FLanguages: String;
+    FTrash: Boolean;
+    procedure DoEndElement({%H-}Sender: TObject; const {%H-}NamespaceURI, {%H-}LocalName, {%H-}QName: SAXString);
+    procedure DoStartElement(Sender: TObject; const {%H-}NamespaceURI, {%H-}LocalName, {%H-}QName: SAXString; Atts: TSAXAttributes);
+    procedure DoTextElement({%H-}Sender: TObject; const ch: PSAXChar; {%H-}AStart, ALength: Integer);
+    function GetLanguageFile(aLang: String): String;
+    function GetTagName: String;
+    procedure LoadExistingFiles;
+    procedure CreateLanguageNodes;
+    function LoadFile(const aFileName: string): TJSONObject;
+  Protected
+    
+    procedure AddString(const aName, aValue: String);
+    procedure CollectHTMLFileNamesAndTexts(const aFileName: String);
+    procedure CopyMissingWords;
+    procedure CopyWords(SrcScope, DestScope: TJSONObject; aList: TStrings);
+    Procedure Log(Const Msg : String); overload;
+    Procedure Log(Const Fmt : String; Const Args : Array of const); overload;
+    Procedure CollectHTMLNamesAndTexts(Const aDir : string);
+    Procedure CreateLanguageFiles;
+  Public
+    Constructor Create(aOwner : TComponent); override;
+    Destructor Destroy; override;
+    Procedure Execute;
+    // ClearOutput 
+    Property CleanOutput : Boolean Read FCleanOutput Write FCleanOutput;
+    // HTML Files that need translation
+    Property HTMLDir : String Read FHTMLDir Write FHTMLDir;
+    // File for JSON file(s) with translations
+    Property OutputFileName : String Read FOutputFileName Write FOutputFileName;
+    // Emit Log messages
+    Property OnLog : TLogEvent Read FOnLog Write FOnlog;
+    // Minified language constants
+    Property Minified : Boolean Read FMiniFied Write FMinified;
+    // TagName (data-tag)
+    Property TagName : String Read GetTagName Write FTagName;
+    // Trash new values in translations.
+    Property TrashNewValues : Boolean Read FTrash Write FTrash;
+    // Single/Multiple files
+    Property OutputFileMode : TFileMode Read FFileMode Write FFileMode;
+    // Languages: comma-separated list. First is the input language (en)
+    Property Languages: String Read FLanguages Write FLanguages;
+    // Recurse : Boolean;
+    Property Recurse: Boolean Read FRecurse Write FRecurse;
+    // SingleScope : If this is set, all identifiers are set in a single scope.
+    Property SingleScope : String Read FSingleScope Write FSingleScope;
+  end;
+
+implementation
+
+{ THTMLLangExtractor }
+
+procedure THTMLLangExtractor.Log(const Msg: String);
+begin
+  if Assigned(FOnLog) then
+    FOnLog(Self,Msg);
+end;
+
+procedure THTMLLangExtractor.Log(const Fmt: String; const Args: array of const);
+begin
+  Log(Format(Fmt,Args));
+end;
+
+
+
+
+procedure THTMLLangExtractor.DoStartElement(Sender: TObject; const {%H-}NamespaceURI, LocalName, {%H-}QName: SAXString; Atts: TSAXAttributes);
+
+Var
+  aID,aTerm,aAttr : String;
+  I,P,aCount : Integer;
+
+begin
+  if Not Assigned(atts) then exit;
+  aID:=UTF8Encode(Atts.GetValue('','data-'+Utf8Decode(tagname)));
+  if (aID='') then
+    exit;
+  aCount:=WordCount(aID,[';']);
+  FcurrentName:='';
+  for I:=1 to aCount do
+    begin
+    aTerm:=ExtractWord(I,aID,[';']);
+    P:=Pos('-',aTerm);
+    if (P=0) then
+      begin
+      if FCurrentName='' then
+        FCurrentName:=aID
+      else
+        Log('Translate element "%s" contains 2 IDs: "%s" "%s". Ignoring 2nd ',[aID,FCurrentName,aTerm]);
+      end
+    else
+      begin
+      aAttr:=Copy(aTerm,P+1);
+      AddString(aTerm,UTF8Encode(Atts.GetValue('',UTF8Decode(aAttr))));
+      end;
+    end;
+end;
+
+procedure THTMLLangExtractor.DoTextElement(Sender: TObject; const ch: PSAXChar; AStart, ALength: Integer);
+
+Var
+  S : String;
+  W : UnicodeString;
+
+begin
+  if FCurrentName='' then exit;
+  W:='';
+  SetLength(W,aLength);
+  Move(ch^,W[1],aLength*SizeOf(WideChar));
+  S:=Trim(UTF8Encode(W));
+  AddString(FCurrentName,S);
+end;
+
+procedure THTMLLangExtractor.AddString(const aName, aValue: String);
+
+Var
+  Idx : Integer;
+  Old : String;
+
+begin
+  Idx:=FCurrent.IndexOfName(aName,True);
+  If Idx<>-1 then
+    begin
+    Old:=FCurrent.Items[idx].AsString;
+    if (Old<>aValue) then
+      Log('Ignoring duplicate name %s. Old text = "%s", new = "%s"',[aName, Old, aValue]);
+    end
+  else
+    begin
+    FCurrent.Strings[aName]:=aValue;
+    FCurrentName:='';
+    Inc(FCurrentCount);
+    end;
+end;
+
+procedure THTMLLangExtractor.CollectHTMLFileNamesAndTexts(const aFileName : String);
+
+Var
+  MyReader : THTMLReader;
+  F : TFileStream;
+  aScope : string;
+
+
+begin
+  if SingleScope<>'' then
+    aScope:=SingleScope
+  else
+    aScope:=LowerCase(ChangeFileExt(ExtractFileName(aFileName),''));
+  Log('Searching %s for translatable terms, adding to scope : %s',[aFileName,aScope]);
+  if (FLangObjects.Items[0] as TJSONObject).IndexOfName(aScope)<>-1 then
+    FCurrent:=(FLangObjects.Items[0] as TJSONObject).Objects[aScope]
+  else
+    begin
+    FCurrent:=TJSONObject.Create;
+    // Add scope to default language
+    (FLangObjects.Items[0] as TJSONObject).Add(aScope,FCurrent);
+    end;
+  FCurrentCount:=0;
+  MyReader:=nil;
+  F:=TFileStream.Create(aFileName,fmOpenRead or fmShareDenyNone);
+  Try
+    MyReader:=THTMLReader.Create;
+    MyReader.OnStartElement:=@DoStartElement;
+    MyReader.OnCharacters:=@DoTextElement;
+    MyReader.OnEndElement:=@DoEndElement;
+    MyReader.ParseStream(F);
+    Log('Found %d translatable terms',[FCurrentCount]);
+  finally
+    FreeAndNil(MyReader);
+    FreeAndNil(F);
+  end;
+end;
+
+procedure THTMLLangExtractor.DoEndElement(Sender: TObject; const NamespaceURI, LocalName, QName: SAXString);
+begin
+  FCurrentName:='';
+end;
+
+
+procedure THTMLLangExtractor.CollectHTMLNamesAndTexts(const aDir: string);
+
+Var
+  Info : TSearchRec;
+
+begin
+  // HTML files
+  If FindFirst(aDir+'*.html',0,Info)=0 then
+    try
+      Repeat
+        CollectHTMLFileNamesAndTexts(aDir+Info.Name);
+      Until FindNext(Info)<>0;
+    finally
+      FindClose(Info);
+    end;
+  // Subdirs
+  if Recurse then
+    If FindFirst(aDir+'*',faDirectory,Info)=0 then
+      try
+        Repeat
+          With Info do
+            if ((Attr and faDirectory)<>0) and (Name<>'.') and (Name<>'..') then
+              CollectHTMLNamesAndTexts(IncludeTrailingPathDelimiter(aDir+Name));
+        Until FindNext(Info)<>0;
+      finally
+        FindClose(Info);
+      end;
+end;
+
+
+function THTMLLangExtractor.GetLanguageFile(aLang: String): String;
+
+Var
+  Ext : String;
+
+begin
+  Ext:=ExtractFileExt(OutputFileName);
+  Result:=ChangeFileExt(OutputFileName,'-'+aLang+Ext);
+end;
+
+function THTMLLangExtractor.GetTagName: String;
+begin
+  Result:=FTagName;
+  if Result='' then
+    Result:='translate';
+end;
+
+procedure THTMLLangExtractor.CreateLanguageFiles;
+
+  Function GetAsJSON(aObject : TJSONObject) : string;
+
+  begin
+    if FMinified then
+      Result:=aObject.AsJSON
+    else
+      Result:=aObject.FormatJSON
+  end;
+
+Var
+  I : Integer;
+  S : TStringStream;
+
+begin
+  if FFileMode=fmSingle then
+    begin
+    S:=TstringStream.Create(GetAsJSON(FLangObjects),TEncoding.UTF8);
+    try
+      S.SaveToFile(OutputFileName);
+    finally
+      S.Free;
+    end;
+    end
+  else
+    begin
+    For I:=0 to FLangObjects.Count-1 do
+      begin
+      S:=TstringStream.Create(GetAsJSON(FLangObjects.Items[i] as TJSONObject),TEncoding.UTF8);
+      try
+        S.SaveToFile(GetLanguageFile(FLangObjects.Names[i]));
+      finally
+        S.Free;
+      end;
+      end;
+    end;
+end;
+
+
+constructor THTMLLangExtractor.Create(aOwner: TComponent);
+begin
+  inherited Create(aOwner);
+  FLangObjects:=TJSONObject.Create;
+  FTranslations:=TFPObjectList.Create(True);
+end;
+
+destructor THTMLLangExtractor.Destroy;
+begin
+  FreeAndNil(FTranslations);
+  FreeAndNil(FLangObjects);
+  inherited Destroy;
+end;
+
+procedure THTMLLangExtractor.CopyWords(SrcScope,DestScope : TJSONObject; aList : TStrings);
+
+Var
+  I : Integer;
+  aName,aValue : String;
+
+begin
+  For I:=0 to SrcScope.Count-1 do
+    begin
+    aName:=SrcScope.Names[I];
+    if DestScope.IndexOfName(aName)=-1 then
+      begin
+      if TrashNewValues then
+        aValue:='生词'+IntToStr(i)
+      else
+        aValue:=SrcScope.Items[I].AsString;
+      DestScope.Add(aName,aValue);
+      if Assigned(aList) then
+        aList.Add(aName);
+      end;
+    end;
+end;
+
+procedure THTMLLangExtractor.CopyMissingWords;
+
+Var
+  I,J,aSectionWordCount,aSectionCount : Integer;
+  NewWords : TStringList;
+  Src,Dest,SrcScope,DestScope : TJSONObject;
+  NewSection : Boolean;
+  aScope : String;
+
+begin
+  aSectionCount:=0;
+  aSectionWordCount:=0;
+  NewWords:=TstringList.Create;
+  Try
+    NewWords.Sorted:=True;
+    NewWords.Duplicates:=dupIgnore;
+    Src:=FLangObjects.Items[0] as TJSONObject;
+    // Copy all scopes
+    For I:=0 to Src.Count-1 do
+      begin
+      aScope:=Src.Names[I];
+      SrcScope:=Src.Items[i] as TJSONObject;
+      NewSection:=False;
+      For J:=1 to FLangObjects.Count-1 do
+        begin
+        Dest:=FLangObjects.Items[J] as TJSONObject;
+        If (Dest.IndexOfName(aScope)=-1) then
+          begin
+          NewSection:=true;
+          if TrashNewValues then
+            begin
+            DestScope:=TJSONObject.Create;
+            Dest.Add(aScope,DestScope);
+            CopyWords(SrcScope,DestScope,Nil);
+            end
+          else
+            Dest.Add(aScope,Src.Items[I].Clone);
+          end
+        else
+          begin
+          DestScope:=Dest.Objects[aScope] as TJSONObject;
+          CopyWords(SrcScope,DestScope,NewWords);
+          end;
+        end;
+      If NewSection then
+        begin
+        Inc(aSectionCount);
+        Inc(aSectionWordCount,SrcScope.Count);
+        end;
+      end;
+    Log('Copied %d new scopes with %d words, added %d new words in existing scopes.',[aSectionCount,aSectionWordCount,NewWords.Count])
+  finally
+    NewWords.Free;
+  end;
+end;
+
+function THTMLLangExtractor.LoadFile(const aFileName: string): TJSONObject;
+
+Var
+  F : TFileStream;
+  D : TJSONData;
+
+begin
+  Log('Loading existing file "%s"',[aFileName]);
+  F:=TFileStream.Create(aFileName,fmOpenRead or fmShareDenyWrite);
+  try
+    D:=GetJSON(F);
+    if D is TJSONObject then
+      begin
+      Result:=D as TJSONObject;
+      D:=Nil;
+      end
+    else
+      begin
+      Log('File "%s" does not contain valid JSON',[aFileName]);
+      Result:=TJSONObject.Create;
+      end;
+  finally
+    D.Free;
+    F.Free;
+  end;
+end;
+
+
+procedure THTMLLangExtractor.LoadExistingFiles;
+
+Var
+  I : Integer;
+  Obj : TJSONObject;
+  aLang : String;
+
+begin
+  // Load global file, if any
+  if (OutputFileMode=fmSingle) and FileExists(OutputFileName) then
+    begin
+    Obj:=LoadFile(OutputFileName);
+    FreeAndNil(FLangObjects);
+    FLangObjects:=Obj;
+    end;
+  // Add all languages
+  for I:=1 to WordCount(Languages,[',']) do
+    begin
+    aLang:=ExtractWord(I,Languages,[',']);
+    if (OutputFileMode=fmMultiple) and FileExists(GetLanguageFile(aLang)) then
+      FLangObjects.Add(aLang,LoadFile(GetLanguageFile(aLang)))
+    else if FLangObjects.IndexOfName(aLang)=-1 then
+      FLangObjects.Add(aLang,TJSONObject.Create)
+    end;
+end;
+
+Procedure THTMLLangExtractor.CreateLanguageNodes;
+
+var
+  I : Integer;
+  aLang : String;
+  
+begin
+  FreeAndNil(FLangObjects);
+  FLangObjects:=TJSONObject.Create;
+  // Add all languages
+  for I:=1 to WordCount(Languages,[',']) do
+    begin
+    aLang:=ExtractWord(I,Languages,[',']);
+    if FLangObjects.IndexOfName(aLang)=-1 then
+      FLangObjects.Add(aLang,TJSONObject.Create)
+    end;
+end;
+
+procedure THTMLLangExtractor.Execute;
+
+Var
+  aCount : Integer;
+
+begin
+  if Languages='' then
+    Languages:='en';
+  if not CleanOutput then
+    LoadExistingFiles
+  else
+    CreateLanguageNodes;  
+  if (HTMLDir<>'') then
+    CollectHTMLNamesAndTexts(IncludeTrailingPathDelimiter(HTMLDir));
+  aCount:=FLangObjects.Items[0].Count;
+  Log('Collected %d message scopes',[aCount]);
+  CopyMissingWords;
+  CreateLanguageFiles;
+end;
+
+end.
+