langextractor.pp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507
  1. {
  2. This file is part of the Pas2JS run time library.
  3. Copyright (c) 2019 by Michael Van Canneyt
  4. Unit to extract data-translate tags from a HTML file and create a JSON file from it.
  5. See the file COPYING.FPC, included in this distribution,
  6. for details about the copyright.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  10. **********************************************************************}
  11. Unit langextractor;
  12. {$mode objfpc}{$H+}
  13. interface
  14. uses
  15. Classes, Contnrs, SysUtils, StrUtils, sax, sax_html, fpjson;
  16. Type
  17. TFileMode = (fmSingle,fmMultiple);
  18. TLogEvent = Procedure(Sender : TObject; Const Msg : String) of object;
  19. ETranslate = Class(Exception);
  20. TTranslations = Class(TObject)
  21. Strings : Array of string;
  22. Used : Boolean;
  23. end;
  24. { THTMLLangExtractor }
  25. THTMLLangExtractor = Class(TComponent)
  26. private
  27. // Used in CollectFileNamesAndTexts...
  28. FCurrent,
  29. // texts in language used in HTML
  30. FLangObjects : TJSONObject;
  31. FFileMode: TFileMode;
  32. FOutputFileName: String;
  33. FCleanOutput: Boolean;
  34. FMiniFied: Boolean;
  35. FRecurse: Boolean;
  36. FSingleScope: String;
  37. FTagName: String;
  38. // Map of language - JSON object
  39. FTranslations : TFPObjectList;
  40. FHTMLDir: String;
  41. FCurrentName:String;
  42. FCurrentCount: Integer;
  43. FOnLog: TLogEvent;
  44. FLanguages: String;
  45. FTrash: Boolean;
  46. procedure DoEndElement({%H-}Sender: TObject; const {%H-}NamespaceURI, {%H-}LocalName, {%H-}QName: SAXString);
  47. procedure DoStartElement(Sender: TObject; const {%H-}NamespaceURI, {%H-}LocalName, {%H-}QName: SAXString; Atts: TSAXAttributes);
  48. procedure DoTextElement({%H-}Sender: TObject; const ch: PSAXChar; {%H-}AStart, ALength: Integer);
  49. function GetLanguageFile(aLang: String): String;
  50. function GetTagName: String;
  51. procedure LoadExistingFiles;
  52. procedure CreateLanguageNodes;
  53. function LoadFile(const aFileName: string): TJSONObject;
  54. Protected
  55. procedure AddString(const aName, aValue: String);
  56. procedure CollectHTMLFileNamesAndTexts(const aFileName: String);
  57. procedure CopyMissingWords;
  58. procedure CopyWords(SrcScope, DestScope: TJSONObject; aList: TStrings);
  59. Procedure Log(Const Msg : String); overload;
  60. Procedure Log(Const Fmt : String; Const Args : Array of const); overload;
  61. Procedure CollectHTMLNamesAndTexts(Const aDir : string);
  62. Procedure CreateLanguageFiles;
  63. Public
  64. Constructor Create(aOwner : TComponent); override;
  65. Destructor Destroy; override;
  66. Procedure Execute;
  67. // ClearOutput
  68. Property CleanOutput : Boolean Read FCleanOutput Write FCleanOutput;
  69. // HTML Files that need translation
  70. Property HTMLDir : String Read FHTMLDir Write FHTMLDir;
  71. // File for JSON file(s) with translations
  72. Property OutputFileName : String Read FOutputFileName Write FOutputFileName;
  73. // Emit Log messages
  74. Property OnLog : TLogEvent Read FOnLog Write FOnlog;
  75. // Minified language constants
  76. Property Minified : Boolean Read FMiniFied Write FMinified;
  77. // TagName (data-tag)
  78. Property TagName : String Read GetTagName Write FTagName;
  79. // Trash new values in translations.
  80. Property TrashNewValues : Boolean Read FTrash Write FTrash;
  81. // Single/Multiple files
  82. Property OutputFileMode : TFileMode Read FFileMode Write FFileMode;
  83. // Languages: comma-separated list. First is the input language (en)
  84. Property Languages: String Read FLanguages Write FLanguages;
  85. // Recurse : Boolean;
  86. Property Recurse: Boolean Read FRecurse Write FRecurse;
  87. // SingleScope : If this is set, all identifiers are set in a single scope.
  88. Property SingleScope : String Read FSingleScope Write FSingleScope;
  89. end;
  90. implementation
  91. { THTMLLangExtractor }
  92. procedure THTMLLangExtractor.Log(const Msg: String);
  93. begin
  94. if Assigned(FOnLog) then
  95. FOnLog(Self,Msg);
  96. end;
  97. procedure THTMLLangExtractor.Log(const Fmt: String; const Args: array of const);
  98. begin
  99. Log(Format(Fmt,Args));
  100. end;
  101. procedure THTMLLangExtractor.DoStartElement(Sender: TObject; const {%H-}NamespaceURI, LocalName, {%H-}QName: SAXString; Atts: TSAXAttributes);
  102. Var
  103. aID,aTerm,aAttr : String;
  104. I,P,aCount : Integer;
  105. begin
  106. if Not Assigned(atts) then exit;
  107. aID:=UTF8Encode(Atts.GetValue('','data-'+Utf8Decode(tagname)));
  108. if (aID='') then
  109. exit;
  110. aCount:=WordCount(aID,[';']);
  111. FcurrentName:='';
  112. for I:=1 to aCount do
  113. begin
  114. aTerm:=ExtractWord(I,aID,[';']);
  115. P:=Pos('-',aTerm);
  116. if (P=0) then
  117. begin
  118. if FCurrentName='' then
  119. FCurrentName:=aID
  120. else
  121. Log('Translate element "%s" contains 2 IDs: "%s" "%s". Ignoring 2nd ',[aID,FCurrentName,aTerm]);
  122. end
  123. else
  124. begin
  125. aAttr:=Copy(aTerm,P+1);
  126. AddString(aTerm,UTF8Encode(Atts.GetValue('',UTF8Decode(aAttr))));
  127. end;
  128. end;
  129. end;
  130. procedure THTMLLangExtractor.DoTextElement(Sender: TObject; const ch: PSAXChar; AStart, ALength: Integer);
  131. Var
  132. S : String;
  133. W : UnicodeString;
  134. begin
  135. if FCurrentName='' then exit;
  136. W:='';
  137. SetLength(W,aLength);
  138. Move(ch^,W[1],aLength*SizeOf(WideChar));
  139. S:=Trim(UTF8Encode(W));
  140. AddString(FCurrentName,S);
  141. end;
  142. procedure THTMLLangExtractor.AddString(const aName, aValue: String);
  143. Var
  144. Idx : Integer;
  145. Old : String;
  146. begin
  147. Idx:=FCurrent.IndexOfName(aName,True);
  148. If Idx<>-1 then
  149. begin
  150. Old:=FCurrent.Items[idx].AsString;
  151. if (Old<>aValue) then
  152. Log('Ignoring duplicate name %s. Old text = "%s", new = "%s"',[aName, Old, aValue]);
  153. end
  154. else
  155. begin
  156. FCurrent.Strings[aName]:=aValue;
  157. FCurrentName:='';
  158. Inc(FCurrentCount);
  159. end;
  160. end;
  161. procedure THTMLLangExtractor.CollectHTMLFileNamesAndTexts(const aFileName : String);
  162. Var
  163. MyReader : THTMLReader;
  164. F : TFileStream;
  165. aScope : string;
  166. begin
  167. if SingleScope<>'' then
  168. aScope:=SingleScope
  169. else
  170. aScope:=LowerCase(ChangeFileExt(ExtractFileName(aFileName),''));
  171. Log('Searching %s for translatable terms, adding to scope : %s',[aFileName,aScope]);
  172. if (FLangObjects.Items[0] as TJSONObject).IndexOfName(aScope)<>-1 then
  173. FCurrent:=(FLangObjects.Items[0] as TJSONObject).Objects[aScope]
  174. else
  175. begin
  176. FCurrent:=TJSONObject.Create;
  177. // Add scope to default language
  178. (FLangObjects.Items[0] as TJSONObject).Add(aScope,FCurrent);
  179. end;
  180. FCurrentCount:=0;
  181. MyReader:=nil;
  182. F:=TFileStream.Create(aFileName,fmOpenRead or fmShareDenyNone);
  183. Try
  184. MyReader:=THTMLReader.Create;
  185. MyReader.OnStartElement:=@DoStartElement;
  186. MyReader.OnCharacters:=@DoTextElement;
  187. MyReader.OnEndElement:=@DoEndElement;
  188. MyReader.ParseStream(F);
  189. Log('Found %d translatable terms',[FCurrentCount]);
  190. finally
  191. FreeAndNil(MyReader);
  192. FreeAndNil(F);
  193. end;
  194. end;
  195. procedure THTMLLangExtractor.DoEndElement(Sender: TObject; const NamespaceURI, LocalName, QName: SAXString);
  196. begin
  197. FCurrentName:='';
  198. end;
  199. procedure THTMLLangExtractor.CollectHTMLNamesAndTexts(const aDir: string);
  200. Var
  201. Info : TSearchRec;
  202. begin
  203. // HTML files
  204. If FindFirst(aDir+'*.html',0,Info)=0 then
  205. try
  206. Repeat
  207. CollectHTMLFileNamesAndTexts(aDir+Info.Name);
  208. Until FindNext(Info)<>0;
  209. finally
  210. FindClose(Info);
  211. end;
  212. // Subdirs
  213. if Recurse then
  214. If FindFirst(aDir+'*',faDirectory,Info)=0 then
  215. try
  216. Repeat
  217. With Info do
  218. if ((Attr and faDirectory)<>0) and (Name<>'.') and (Name<>'..') then
  219. CollectHTMLNamesAndTexts(IncludeTrailingPathDelimiter(aDir+Name));
  220. Until FindNext(Info)<>0;
  221. finally
  222. FindClose(Info);
  223. end;
  224. end;
  225. function THTMLLangExtractor.GetLanguageFile(aLang: String): String;
  226. Var
  227. Ext : String;
  228. begin
  229. Ext:=ExtractFileExt(OutputFileName);
  230. Result:=ChangeFileExt(OutputFileName,'-'+aLang+Ext);
  231. end;
  232. function THTMLLangExtractor.GetTagName: String;
  233. begin
  234. Result:=FTagName;
  235. if Result='' then
  236. Result:='translate';
  237. end;
  238. procedure THTMLLangExtractor.CreateLanguageFiles;
  239. Function GetAsJSON(aObject : TJSONObject) : string;
  240. begin
  241. if FMinified then
  242. Result:=aObject.AsJSON
  243. else
  244. Result:=aObject.FormatJSON
  245. end;
  246. Var
  247. I : Integer;
  248. S : TStringStream;
  249. begin
  250. if FFileMode=fmSingle then
  251. begin
  252. S:=TstringStream.Create(GetAsJSON(FLangObjects),TEncoding.UTF8);
  253. try
  254. S.SaveToFile(OutputFileName);
  255. finally
  256. S.Free;
  257. end;
  258. end
  259. else
  260. begin
  261. For I:=0 to FLangObjects.Count-1 do
  262. begin
  263. S:=TstringStream.Create(GetAsJSON(FLangObjects.Items[i] as TJSONObject),TEncoding.UTF8);
  264. try
  265. S.SaveToFile(GetLanguageFile(FLangObjects.Names[i]));
  266. finally
  267. S.Free;
  268. end;
  269. end;
  270. end;
  271. end;
  272. constructor THTMLLangExtractor.Create(aOwner: TComponent);
  273. begin
  274. inherited Create(aOwner);
  275. FLangObjects:=TJSONObject.Create;
  276. FTranslations:=TFPObjectList.Create(True);
  277. end;
  278. destructor THTMLLangExtractor.Destroy;
  279. begin
  280. FreeAndNil(FTranslations);
  281. FreeAndNil(FLangObjects);
  282. inherited Destroy;
  283. end;
  284. procedure THTMLLangExtractor.CopyWords(SrcScope,DestScope : TJSONObject; aList : TStrings);
  285. Var
  286. I : Integer;
  287. aName,aValue : String;
  288. begin
  289. For I:=0 to SrcScope.Count-1 do
  290. begin
  291. aName:=SrcScope.Names[I];
  292. if DestScope.IndexOfName(aName)=-1 then
  293. begin
  294. if TrashNewValues then
  295. aValue:='生词'+IntToStr(i)
  296. else
  297. aValue:=SrcScope.Items[I].AsString;
  298. DestScope.Add(aName,aValue);
  299. if Assigned(aList) then
  300. aList.Add(aName);
  301. end;
  302. end;
  303. end;
  304. procedure THTMLLangExtractor.CopyMissingWords;
  305. Var
  306. I,J,aSectionWordCount,aSectionCount : Integer;
  307. NewWords : TStringList;
  308. Src,Dest,SrcScope,DestScope : TJSONObject;
  309. NewSection : Boolean;
  310. aScope : String;
  311. begin
  312. aSectionCount:=0;
  313. aSectionWordCount:=0;
  314. NewWords:=TstringList.Create;
  315. Try
  316. NewWords.Sorted:=True;
  317. NewWords.Duplicates:=dupIgnore;
  318. Src:=FLangObjects.Items[0] as TJSONObject;
  319. // Copy all scopes
  320. For I:=0 to Src.Count-1 do
  321. begin
  322. aScope:=Src.Names[I];
  323. SrcScope:=Src.Items[i] as TJSONObject;
  324. NewSection:=False;
  325. For J:=1 to FLangObjects.Count-1 do
  326. begin
  327. Dest:=FLangObjects.Items[J] as TJSONObject;
  328. If (Dest.IndexOfName(aScope)=-1) then
  329. begin
  330. NewSection:=true;
  331. if TrashNewValues then
  332. begin
  333. DestScope:=TJSONObject.Create;
  334. Dest.Add(aScope,DestScope);
  335. CopyWords(SrcScope,DestScope,Nil);
  336. end
  337. else
  338. Dest.Add(aScope,Src.Items[I].Clone);
  339. end
  340. else
  341. begin
  342. DestScope:=Dest.Objects[aScope] as TJSONObject;
  343. CopyWords(SrcScope,DestScope,NewWords);
  344. end;
  345. end;
  346. If NewSection then
  347. begin
  348. Inc(aSectionCount);
  349. Inc(aSectionWordCount,SrcScope.Count);
  350. end;
  351. end;
  352. Log('Copied %d new scopes with %d words, added %d new words in existing scopes.',[aSectionCount,aSectionWordCount,NewWords.Count])
  353. finally
  354. NewWords.Free;
  355. end;
  356. end;
  357. function THTMLLangExtractor.LoadFile(const aFileName: string): TJSONObject;
  358. Var
  359. F : TFileStream;
  360. D : TJSONData;
  361. begin
  362. Log('Loading existing file "%s"',[aFileName]);
  363. F:=TFileStream.Create(aFileName,fmOpenRead or fmShareDenyWrite);
  364. try
  365. D:=GetJSON(F);
  366. if D is TJSONObject then
  367. begin
  368. Result:=D as TJSONObject;
  369. D:=Nil;
  370. end
  371. else
  372. begin
  373. Log('File "%s" does not contain valid JSON',[aFileName]);
  374. Result:=TJSONObject.Create;
  375. end;
  376. finally
  377. D.Free;
  378. F.Free;
  379. end;
  380. end;
  381. procedure THTMLLangExtractor.LoadExistingFiles;
  382. Var
  383. I : Integer;
  384. Obj : TJSONObject;
  385. aLang : String;
  386. begin
  387. // Load global file, if any
  388. if (OutputFileMode=fmSingle) and FileExists(OutputFileName) then
  389. begin
  390. Obj:=LoadFile(OutputFileName);
  391. FreeAndNil(FLangObjects);
  392. FLangObjects:=Obj;
  393. end;
  394. // Add all languages
  395. for I:=1 to WordCount(Languages,[',']) do
  396. begin
  397. aLang:=ExtractWord(I,Languages,[',']);
  398. if (OutputFileMode=fmMultiple) and FileExists(GetLanguageFile(aLang)) then
  399. FLangObjects.Add(aLang,LoadFile(GetLanguageFile(aLang)))
  400. else if FLangObjects.IndexOfName(aLang)=-1 then
  401. FLangObjects.Add(aLang,TJSONObject.Create)
  402. end;
  403. end;
  404. Procedure THTMLLangExtractor.CreateLanguageNodes;
  405. var
  406. I : Integer;
  407. aLang : String;
  408. begin
  409. FreeAndNil(FLangObjects);
  410. FLangObjects:=TJSONObject.Create;
  411. // Add all languages
  412. for I:=1 to WordCount(Languages,[',']) do
  413. begin
  414. aLang:=ExtractWord(I,Languages,[',']);
  415. if FLangObjects.IndexOfName(aLang)=-1 then
  416. FLangObjects.Add(aLang,TJSONObject.Create)
  417. end;
  418. end;
  419. procedure THTMLLangExtractor.Execute;
  420. Var
  421. aCount : Integer;
  422. begin
  423. if Languages='' then
  424. Languages:='en';
  425. if not CleanOutput then
  426. LoadExistingFiles
  427. else
  428. CreateLanguageNodes;
  429. if (HTMLDir<>'') then
  430. CollectHTMLNamesAndTexts(IncludeTrailingPathDelimiter(HTMLDir));
  431. aCount:=FLangObjects.Items[0].Count;
  432. Log('Collected %d message scopes',[aCount]);
  433. CopyMissingWords;
  434. CreateLanguageFiles;
  435. end;
  436. end.