pdfdump.pp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418
  1. { **********************************************************************
  2. This file is part of the Free Component Library
  3. PDF file dumper
  4. Copyright (c) 2022 by Michael Van Canneyt [email protected]
  5. See the file COPYING.FPC, included in this distribution,
  6. for details about the copyright.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  10. **********************************************************************}
  11. program pdfdump;
  12. {$mode objfpc}
  13. {$h+}
  14. uses
  15. cwString, sysutils, classes, contnrs, fppdfobjects, fppdfparser, fppdfpredict,
  16. custapp, fppdfconsts;
  type
    { Parts of a PDF document that this tool can print. }
    TInfoSection = (
      isInfo,
      isCatalog,
      isTrailer,
      isObjects,
      isFonts,
      isPages,
      isPageContents,
      isPageText,
      isDictionaries
    );
    TInfoSections = Set of TInfoSection;

    { TPDFDumpApplication }

    { Console application that parses PDF files and prints the selected sections. }
    TPDFDumpApplication = class(TCustomApplication)
    Private
      // File names collected from the non-option command-line arguments.
      FFiles : TStrings;
      // Sections selected for output.
      FSections : TInfoSections;
      // Value of the -n/--pageno option.
      FPageNo : Integer;
      // When set, parser progress and log callbacks are attached in DumpFile.
      FVerbose : Boolean;
    Public
      constructor Create(aOwner: TComponent); override;
      destructor destroy; override;
    Protected
      // Per-page output helpers
      procedure DisplayPageText(Doc: TPDFDocument; aIndex: Integer; aPage: TPDFPageObject);
      // Parser callbacks
      procedure DoLog(sender: TObject; aKind: TLogkind; const aMessage: string);
      Procedure DoProgress(Sender : TObject;aKind : TProgressKind; aCurrent,aCount : Integer);
      // Per-section output helpers
      procedure DisplayCatalog(Doc: TPDFDocument);
      procedure DisplayInfo(Doc: TPDFDocument);
      procedure DisplayObjects(Doc: TPDFDocument);
      procedure DisplayFonts(Doc: TPDFDocument);
      procedure DisplayPageContents(Doc: TPDFDocument; aIndex: Integer; aPage: TPDFPageObject);
      procedure DisplayPages(Doc: TPDFDocument);
      procedure DisplayTrailer(Doc: TPDFDocument);
    Public
      // Parses the command line; False means "do not continue".
      function ProcessOptions : Boolean;
      // Prints the help text and halts the program.
      procedure Usage(Msg : String);
      // Parses one file and prints every selected section.
      procedure DumpFile(FN: String);
      procedure DoRun; override;
    end;
  47. { TPDFDumpApplication }
  { Creates the application and the list that will receive the input file names. }
  constructor TPDFDumpApplication.Create(aOwner: TComponent);

  begin
    inherited;
    // Filled by ProcessOptions with the non-option command-line arguments.
    FFiles:=TStringList.Create;
  end;
  { Releases the file name list before the inherited teardown runs. }
  destructor TPDFDumpApplication.destroy;

  begin
    FreeAndNil(FFiles);
    inherited;
  end;
  { Main work routine: parses the command line, dumps every requested file and
    reports a summary on StdErr when at least one file failed.

    A failure while dumping one file does not stop processing of the remaining
    files; it sets ExitCode to 1 and is counted in the final summary. }
  procedure TPDFDumpApplication.DoRun;

  var
    FN : String;
    Count,Errors : Integer;

  begin
    StopOnException:=True;
    // This is a one-shot tool: flag termination up front so the application
    // does not call DoRun again once this pass has finished.
    Terminate;
    if not ProcessOptions then
      exit;
    Errors:=0;
    Count:=0;
    For FN in FFiles do
      try
        Inc(Count);
        DumpFile(FN);
      except
        On E: Exception do
          begin
          ExitCode:=1;
          Writeln(Stderr,Format('Error %s examining file "%s" : %s',[E.ClassName,FN,E.Message]));
          // Count the failure (the file itself was already counted above).
          Inc(Errors);
          end;
      end;
    Flush(output);
    if Errors>0 then
      begin
      // Both arguments are integers, so both placeholders must be %d.
      Writeln(StdErr,Format('Processed %d files, encountered an error in %d files.',[Count,Errors]));
      Flush(StdErr);
      end;
  end;
  { Parses the command line into FFiles, FSections, FVerbose and FPageNo.

    Returns True when processing should continue. On an option error, a help
    request, a missing file name or an invalid page number, Usage is called
    (which halts the program); False is the fallback result in those cases.

    When no section option is given, every section is selected. }
  function TPDFDumpApplication.ProcessOptions: Boolean;

    // Adds aSection to FSections when its short or long option is present.
    Procedure CheckSection(aShort : Char; aLong : String; aSection : TInfoSection);

    begin
      if HasOption(aShort,aLong) then
        Include(FSections,aSection);
    end;

  Const
    ShortOpts = 'hopcdiln:vtf';
    LongOpts : Array of string = ('help','objects','pages','pagecontent','dictionaries','info','catalog','pageno:','verbose','text','fonts');

  Var
    Err : String;
    S : TInfoSection;

  begin
    // Explicit default so that every early exit yields a defined result.
    Result:=False;
    Err:=Checkoptions(ShortOpts,LongOpts);
    GetNonOptions(ShortOpts,LongOpts,FFiles);
    if (Err<>'') or HasOption('h','help') then
      begin
      Usage(Err);
      exit(False);
      end;
    if FFiles.Count=0 then
      begin
      Usage('No filenames specified');
      Exit(False);
      end;
    CheckSection('o','objects',isObjects);
    CheckSection('p','pages',isPages);
    CheckSection('c','pagecontent',isPageContents);
    CheckSection('d','dictionaries',isDictionaries);
    CheckSection('i','info',isInfo);
    CheckSection('f','fonts',isFonts);
    // -l selects the document catalog, not the info section.
    CheckSection('l','catalog',isCatalog);
    CheckSection('t','text',isPageText);
    fVerbose:=HasOption('v','verbose');
    if HasOption('n','pageno') then
      // Report a malformed number as a usage error instead of letting a
      // conversion exception escape.
      if not TryStrToInt(GetOptionValue('n','pageno'),FPageNo) then
        begin
        Usage('Invalid page number: '+GetOptionValue('n','pageno'));
        Exit(False);
        end;
    if (FSections=[]) then
      for S in TInfoSection do
        Include(FSections,S);
    Result:=True;
  end;
  { Prints the command-line help and halts the program.

    Msg: error text to report. When non-empty it is written to StdErr ahead of
    the help text and the process exit code is 1; when empty the exit code is 0.

    This procedure does not return: it always ends with Halt. }
  procedure TPDFDumpApplication.Usage(Msg: String);

  begin
    // Tell the user why the help is being shown.
    if Msg<>'' then
      Writeln(StdErr,'Error: ',Msg);
    Writeln('Usage ',ExtractFileName(ParamStr(0)),' [options] FILE1 FILE2 ...');
    Writeln('Where options is one or more of:');
    Writeln('-h --help This help text');
    Writeln('-c --pagecontent Show page content stream (commands). Needs -p');
    Writeln('-d --dictionaries Show object dictionaries. Needs -o');
    // The short option for fonts is -f (see ShortOpts in ProcessOptions); -p is pages.
    Writeln('-f --fonts Show font info');
    Writeln('-i --info Show document info');
    Writeln('-l --catalog Show document catalog');
    Writeln('-n --pageno=N Show only page N');
    Writeln('-o --objects Show indirect objects');
    Writeln('-p --pages Show pages');
    Writeln('-t --text Show page text. Needs -p');
    Writeln('-v --verbose Show warnings/extra info when parsing');
    // Exit code 1 when an error message was supplied, 0 otherwise.
    Halt(Ord(Msg<>''));
  end;
  { Prints the description of the trailer dictionary, if the document has one. }
  procedure TPDFDumpApplication.DisplayTrailer(Doc : TPDFDocument);

  begin
    // Nothing to show without a trailer dictionary.
    if not Assigned(Doc.TrailerDict) then
      Exit;
    Writeln('Trailer dictionary:');
    Writeln(Doc.TrailerDict.GetDescription);
  end;
  { Prints the object count followed by one line per object in the document.
    For TPDFIndirect objects with an object dictionary, that dictionary is
    printed as well when the isDictionaries section is selected. }
  procedure TPDFDumpApplication.DisplayObjects(Doc : TPDFDocument);

  Var
    Current : TPDFObject;
    WantDicts : Boolean;

  begin
    WantDicts:=isDictionaries in FSections;
    Writeln('Indirect object count : ',Doc.Count);
    For Current in Doc do
      begin
      Writeln('Object (',Current.ClassName,') : ',Current.GetDescription);
      // Only indirect objects carry an object dictionary.
      if not (Current is TPDFIndirect) then
        Continue;
      if Assigned(TPDFIndirect(Current).ObjectDict) and WantDicts then
        begin
        Writeln('object dictionary : ',TPDFIndirect(Current).ObjectDict.GetDescription);
        Writeln;
        end;
      end;
  end;
  { Prints the description of every TPDFFontObject found in the document,
    each followed by two blank lines. }
  procedure TPDFDumpApplication.DisplayFonts(Doc: TPDFDocument);

  Var
    Candidate : TPDFObject;

  begin
    Writeln('Font definitions:');
    Writeln;
    For Candidate in Doc do
      begin
      // Skip everything that is not a font object.
      if not (Candidate is TPDFFontObject) then
        Continue;
      Writeln(Candidate.GetDescription);
      Writeln;
      Writeln;
      end;
  end;
  { Progress callback for the parser (attached in DumpFile when verbose):
    prints which kind of item is being loaded and the current/total counters. }
  procedure TPDFDumpApplication.DoProgress(Sender: TObject; aKind: TProgressKind;
    aCurrent, aCount: Integer);

  Const
    // Display name for each progress kind, indexed by the enumeration value.
    KindNames : Array [TProgressKind] of String = ('XRef','Indirect','ContentStream');

  Var
    KindName : String;

  begin
    KindName:=KindNames[aKind];
    Writeln('Loading ', KindName,': ',aCurrent,'/',aCount);
  end;
  { Log callback for the parser (attached in DumpFile when verbose):
    prints the message prefixed with its log kind in square brackets. }
  procedure TPDFDumpApplication.DoLog(sender: TObject; aKind: TLogkind;
    const aMessage: string);

  begin
    Write('[',aKind,'] : ');
    Writeln(aMessage);
  end;
  { Prints the description of the catalog's object dictionary, if a catalog is found. }
  procedure TPDFDumpApplication.DisplayCatalog(Doc : TPDFDocument);

  begin
    // No catalog, no output.
    if not Assigned(Doc.FindCatalog) then
      Exit;
    Writeln('Document catalog:');
    Writeln(Doc.FindCatalog.ObjectDict.GetDescription);
  end;
  { Prints the document information fields (title, author, dates, ...).
    Does nothing when the document has no info object. The TPDFDocumentInfo
    instance obtained from the document is freed here after printing. }
  procedure TPDFDumpApplication.DisplayInfo(Doc : TPDFDocument);

  Var
    Info : TPDFDocumentInfo;

  begin
    if Not Assigned(Doc.FindDocumentInfoObject) then
      exit;
    Info:=Doc.FindDocumentInfo;
    Try
      Writeln('Document info:');
      Writeln('Title : ',Info.Title);
      Writeln('Author : ',Info.Author);
      Writeln('Subject : ',Info.Subject);
      Writeln('Keywords : ',Info.Keywords);
      Writeln('Creator : ',Info.Creator);
      Writeln('Producer : ',Info.Producer);
      Writeln('Creation Date : ',DateTimeToStr(Info.CreationDate));
      Writeln('Modification Date : ',DateTimeToStr(Info.ModDate));
      Writeln('Trapped : ',Info.Trapped);
    Finally
      Info.Free;
    end;
  end;
  { Prints every command of the page's command list on its own line:
    index, command, class name, then the token data of each token.
    Doc and aIndex are not used by this routine. }
  procedure TPDFDumpApplication.DisplayPageContents(Doc : TPDFDocument; aIndex: Integer; aPage : TPDFPageObject);

  Var
    CmdIdx,TokIdx : Integer;
    Current : TPDFCommand;

  begin
    For CmdIdx:=0 to Pred(aPage.CommandList.Count) do
      begin
      Current:=aPage.CommandList[CmdIdx];
      Write('Command ',CmdIdx,' : ',Current.Command,' (',Current.ClassName,'):');
      // Tokens follow on the same line, each preceded by a single space.
      For TokIdx:=0 to Pred(Length(Current.Tokens)) do
        Write(' ',Current.Tokens[TokIdx].TokenData);
      Writeln;
      end;
  end;
  { Prints the text of a page by walking its command list.

    A TPDFTfCommand selects the font: its name (without a leading '/') is looked
    up on the page and in the document, and the font's unicode CMap becomes the
    active map (nil when the font is not found). Each TPDFTextCommand is then
    printed using the map that is active at that point.
    aIndex is not used by this routine. }
  procedure TPDFDumpApplication.DisplayPageText(Doc : TPDFDocument; aIndex: Integer; aPage : TPDFPageObject);

  Var
    Idx : Integer;
    Current : TPDFCommand;
    FontKey,Line : RawByteString;
    FontRef : TPDFRefData;
    ActiveMap : TPDFCMap;
    Font : TPDFFontObject;

  begin
    ActiveMap:=Nil;
    For Idx:=0 to Pred(aPage.CommandList.Count) do
      begin
      Current:=aPage.CommandList[Idx];
      if Current is TPDFTfCommand then
        begin
        // Font selection: strip the leading slash from the name before lookup.
        FontKey:=TPDFTfCommand(Current).FontName;
        if (FontKey<>'') and (FontKey[1]='/') then
          Delete(FontKey,1,1);
        FontRef:=aPage.FindFontRef(FontKey);
        Font:=Doc.FindFont(FontRef);
        // Unknown font: fall back to having no map.
        ActiveMap:=nil;
        if Assigned(Font) then
          ActiveMap:=Font.UnicodeCMap;
        end
      else If Current is TPDFTextCommand then
        begin
        Line:=TPDFTextCommand(Current).GetFullText(ActiveMap);
        SetCodePage(Line,CP_UTF8);
        Writeln(Line);
        end;
      end;
  end;
  { Prints the page count and, for every page, its object type and class.
    Depending on the selected sections it also prints the page dictionary,
    the page content commands and the page text. }
  procedure TPDFDumpApplication.DisplayPages(Doc : TPDFDocument);

  Var
    Page : TPDFPageObject;
    PageIdx : Integer;

  begin
    Writeln('Page count : ',Doc.PageCount);
    For PageIdx:=0 to Pred(Doc.PageCount) do
      begin
      Page:=Doc.Pages[PageIdx];
      Write('Page object ',PageIdx,': ');
      // A missing page object is reported and skipped.
      if not Assigned(Page) then
        begin
        Writeln('Not found');
        Continue;
        end;
      Writeln('Object type: ',Page.ObjectType,' (',Page.ClassName,')');
      if isDictionaries in FSections then
        begin
        Writeln('Page dictionary : ',Page.ObjectDict.GetDescription);
        Writeln;
        end;
      if isPageContents in FSections then
        DisplayPageContents(Doc,PageIdx,Page);
      if isPageText in FSections then
        begin
        Writeln('Page text : ');
        Writeln;
        DisplayPageText(Doc,PageIdx,Page);
        end;
      end;
  end;
  { Opens file FN, parses it into a TPDFDocument and prints every selected section.

    Progress and log callbacks are attached to the parser only in verbose mode.
    ToUnicode CMaps are resolved only when page text was requested.
    Two blank lines are written after each selected section, including the
    sections that have no display routine of their own here.
    The document, the parser and the file stream are freed in all cases. }
  procedure TPDFDumpApplication.DumpFile(FN : String);

  Var
    Input : TFileStream;
    Parser : TPDFParser;
    Doc : TPDFDocument;
    Section : TInfoSection;

  begin
    // Nil first so the finally block is safe if a constructor raises.
    Parser:=Nil;
    Doc:=Nil;
    Writeln('Contents of ',FN,' : ');
    Input:=TFileStream.Create(FN,fmOpenRead or fmShareDenyWrite);
    try
      Doc:=TPDFDocument.Create();
      Parser:=TPDFParser.Create(Input);
      if FVerbose then
        begin
        Parser.OnProgress:=@DoProgress;
        Parser.OnLog:=@DoLog;
        end;
      Parser.ParseDocument(Doc);
      if isPageText in FSections then
        Parser.ResolveToUnicodeCMaps(Doc);
      For Section in FSections do
        begin
        Case Section of
          isInfo : DisplayInfo(Doc);
          isCatalog : DisplayCatalog(Doc);
          isTrailer : DisplayTrailer(Doc);
          isObjects : DisplayObjects(Doc);
          isFonts : DisplayFonts(Doc);
          isPages : DisplayPages(Doc);
        else
          // Remaining sections are handled inside the routines above.
        end;
        Writeln;
        Writeln;
        end;
    finally
      Doc.Free;
      Parser.Free;
      Input.Free;
    end;
    Flush(Output);
  end;
  var
    DumpApp : TPDFDumpApplication;

  { Program entry point: create the application, run it once, always free it. }
  begin
    DumpApp:=TPDFDumpApplication.Create(Nil);
    try
      DumpApp.Initialize;
      DumpApp.Run;
    finally
      DumpApp.Free;
    end;
  end.