{ pdfdump.pp (11 KB) }
  1. { **********************************************************************
  2. This file is part of the Free Component Library
  3. PDF file dumper
  4. Copyright (c) 2022 by Michael Van Canneyt [email protected]
  5. See the file COPYING.FPC, included in this distribution,
  6. for details about the copyright.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  10. **********************************************************************}
  11. program pdfdump;
  12. {$mode objfpc}
  13. {$h+}
  14. uses
  15. {$ifdef unix}
  16. cwString,
  17. {$endif}
  18. sysutils, classes, contnrs, fppdfobjects, fppdfparser, fppdfpredict,
  19. custapp, fppdfconsts, fppdfcommands;
  type
    { Parts of a PDF document that can be selected for display.
      The declaration order is the order in which a set of sections is enumerated. }
    TInfoSection = (
      isInfo,
      isCatalog,
      isTrailer,
      isObjects,
      isFonts,
      isPages,
      isPageContents,
      isPageText,
      isDictionaries
    );
    TInfoSections = set of TInfoSection;

    { TPDFDumpApplication }

    { Command-line application that parses PDF files and writes the selected
      parts of each document to standard output. }
    TPDFDumpApplication = class(TCustomApplication)
    private
      // Non-option command-line arguments: the files to dump.
      FFiles : TStrings;
      // Sections selected on the command line.
      FSections : TInfoSections;
      // Value of the -n/--pageno option.
      FPageNo : Integer;
      // True when -v/--verbose was given.
      FVerbose : Boolean;
    protected
      // Parser callbacks, hooked up by DumpFile in verbose mode.
      procedure DoLog(sender: TObject; aKind: TPDFLogkind; const aMessage: string); reintroduce;
      procedure DoProgress(Sender: TObject; aKind: TPDFProgressKind; aCurrent, aCount : Integer);
      // One display routine per section.
      procedure DisplayInfo(Doc: TPDFDocument);
      procedure DisplayCatalog(Doc: TPDFDocument);
      procedure DisplayTrailer(Doc: TPDFDocument);
      procedure DisplayObjects(Doc: TPDFDocument);
      procedure DisplayFonts(Doc: TPDFDocument);
      procedure DisplayPages(Doc: TPDFDocument);
      procedure DisplayPageContents(Doc: TPDFDocument; aIndex: Integer; aPage: TPDFPageObject);
      procedure DisplayPageText(Doc: TPDFDocument; aIndex: Integer; aPage: TPDFPageObject);
    public
      constructor Create(aOwner: TComponent); override;
      destructor Destroy; override;
      // Parses the command line; False means the program should not continue.
      function ProcessOptions : Boolean;
      // Prints the help text and halts the program.
      procedure Usage(Msg: String);
      // Parses one PDF file and prints the selected sections.
      procedure DumpFile(FN: String);
      procedure DoRun; override;
    end;
  51. { TPDFDumpApplication }
  { Creates the application object and the list that will receive the
    file names given on the command line. }
  constructor TPDFDumpApplication.Create(aOwner: TComponent);
  begin
    inherited;
    FFiles := TStringList.Create;
  end;
  { Releases the file name list before the inherited cleanup runs. }
  destructor TPDFDumpApplication.Destroy;
  begin
    FreeAndNil(FFiles);
    inherited;
  end;
  { Main routine: parses the command line, then dumps every file in FFiles.

    A failure while dumping one file is reported on standard error and does
    not stop the remaining files from being processed. ExitCode is set to 1
    as soon as any file fails, and a summary line is written to standard
    error when at least one file failed. }
  procedure TPDFDumpApplication.DoRun;
  var
    FN : String;
    Count, Errors : Integer;
  begin
    StopOnException := True;
    // All work is done in this single call, so flag termination up front.
    Terminate;
    if not ProcessOptions then
      exit;
    Errors := 0;
    Count := 0;
    for FN in FFiles do
      begin
      // Every file counts as processed, whether or not dumping succeeds.
      Inc(Count);
      try
        DumpFile(FN);
      except
        on E: Exception do
          begin
          ExitCode := 1;
          Writeln(StdErr, Format('Error %s examining file "%s" : %s', [E.ClassName, FN, E.Message]));
          // Count the failure (the file itself was already counted above).
          Inc(Errors);
          end;
      end;
      end;
    Flush(Output);
    if Errors > 0 then
      begin
      // Both arguments are integers, so both placeholders must be %d.
      Writeln(StdErr, Format('Processed %d files, encountered an error in %d files.', [Count, Errors]));
      Flush(StdErr);
      end;
  end;
  { Parses the command line.

    Fills FFiles with the non-option arguments, FSections with the sections
    selected by the section switches, and sets FVerbose and FPageNo.
    When no section switch is given, every section is selected.

    Returns True when the program can continue. On an option error, a help
    request, a missing file name or an invalid page number, Usage is called. }
  function TPDFDumpApplication.ProcessOptions: Boolean;

    // Adds aSection to FSections when its short or long switch is present.
    procedure CheckSection(aShort : Char; aLong : String; aSection : TInfoSection);
    begin
      if HasOption(aShort, aLong) then
        Include(FSections, aSection);
    end;

  const
    ShortOpts = 'hopcdiln:vtf';
    LongOpts : Array of string = ('help','objects','pages','pagecontent','dictionaries','info','catalog','pageno:','verbose','text','fonts');
  var
    Err : String;
    PageArg : String;
    PageNo : Integer;
    S : TInfoSection;
  begin
    Err := CheckOptions(ShortOpts, LongOpts);
    GetNonOptions(ShortOpts, LongOpts, FFiles);
    if (Err <> '') or HasOption('h', 'help') then
      begin
      Usage(Err);
      exit(False);
      end;
    if FFiles.Count = 0 then
      begin
      Usage('No filenames specified');
      exit(False);
      end;
    CheckSection('o', 'objects', isObjects);
    CheckSection('p', 'pages', isPages);
    CheckSection('c', 'pagecontent', isPageContents);
    CheckSection('d', 'dictionaries', isDictionaries);
    CheckSection('i', 'info', isInfo);
    CheckSection('f', 'fonts', isFonts);
    // -l selects the catalog section (it was previously mapped to isInfo).
    CheckSection('l', 'catalog', isCatalog);
    CheckSection('t', 'text', isPageText);
    FVerbose := HasOption('v', 'verbose');
    if HasOption('n', 'pageno') then
      begin
      PageArg := GetOptionValue('n', 'pageno');
      // Report a non-numeric value as a usage error instead of raising.
      if not TryStrToInt(PageArg, PageNo) then
        begin
        Usage(Format('Invalid page number: "%s"', [PageArg]));
        exit(False);
        end;
      FPageNo := PageNo;
      end;
    // No explicit selection means: show everything.
    if FSections = [] then
      for S in TInfoSection do
        Include(FSections, S);
    Result := True;
  end;
  { Prints the command-line help and halts the program.

    Msg: optional error message. When it is not empty it is written to
    standard error before the help text and the program halts with exit
    code 1; when it is empty the program halts with exit code 0. }
  procedure TPDFDumpApplication.Usage(Msg: String);
  begin
    // Tell the user why the help is being shown.
    if Msg <> '' then
      Writeln(StdErr, 'Error: ', Msg);
    Writeln('Usage ', ExtractFileName(ParamStr(0)), ' [options] FILE1 FILE2 ...');
    Writeln('Where options is one or more of:');
    Writeln('-h --help This help text');
    Writeln('-c --pagecontent Show page content stream (commands). Needs -p');
    Writeln('-d --dictionaries Show object dictionaries. Needs -o');
    Writeln('-f --fonts Show font info');
    Writeln('-i --info Show document info');
    Writeln('-l --catalog Show document catalog');
    Writeln('-n --pageno=N Show only page N');
    Writeln('-o --objects Show indirect objects');
    Writeln('-p --pages Show pages');
    Writeln('-t --text Show page text. Needs -p');
    Writeln('-v --verbose Show warnings/extra info when parsing');
    // Ord(False)=0, Ord(True)=1: a non-empty message yields exit code 1.
    Halt(Ord(Msg <> ''));
  end;
  { Prints the description of the document trailer dictionary.
    Nothing is printed when the document has no trailer dictionary. }
  procedure TPDFDumpApplication.DisplayTrailer(Doc : TPDFDocument);
  begin
    if not Assigned(Doc.TrailerDict) then
      exit;
    Writeln('Trailer dictionary:');
    Writeln(Doc.TrailerDict.GetDescription);
  end;
  { Prints the number of objects in the document followed by one line per
    object. For indirect objects that have a dictionary, the dictionary is
    printed as well when the dictionaries section is selected. }
  procedure TPDFDumpApplication.DisplayObjects(Doc : TPDFDocument);
  var
    Item : TPDFObject;
    WantDicts : Boolean;
  begin
    WantDicts := isDictionaries in FSections;
    Writeln('Indirect object count : ', Doc.Count);
    for Item in Doc do
      begin
      Writeln('Object (', Item.ClassName, ') : ', Item.GetDescription);
      if not (Item is TPDFIndirect) then
        continue;
      if Assigned(TPDFIndirect(Item).ObjectDict) and WantDicts then
        begin
        Writeln('object dictionary : ', TPDFIndirect(Item).ObjectDict.GetDescription);
        Writeln;
        end;
      end;
  end;
  { Prints the description of every font object and font descriptor found
    in the document, each followed by two blank lines. }
  procedure TPDFDumpApplication.DisplayFonts(Doc: TPDFDocument);
  var
    Candidate : TPDFObject;
    IsFontRelated : Boolean;
  begin
    Writeln('Font definitions:');
    Writeln;
    for Candidate in Doc do
      begin
      IsFontRelated := (Candidate is TPDFFontObject) or (Candidate is TPDFFontDescriptor);
      if not IsFontRelated then
        continue;
      Writeln(Candidate.GetDescription);
      Writeln;
      Writeln;
      end;
  end;
  { Parser progress callback: prints what is being loaded and how far along
    the parser is, in the form "Loading <kind>: <current>/<count>". }
  procedure TPDFDumpApplication.DoProgress(Sender: TObject; aKind: TPDFProgressKind;
    aCurrent, aCount: Integer);
  const
    // Display name for each progress kind, indexed by the enumeration value.
    KindNames : Array [TPDFProgressKind] of String = ('XRef','Indirect','ContentStream');
  begin
    Writeln(Format('Loading %s: %d/%d', [KindNames[aKind], aCurrent, aCount]));
  end;
  { Parser log callback: prints the message prefixed with its kind in
    square brackets. }
  procedure TPDFDumpApplication.DoLog(sender: TObject; aKind: TPDFLogkind;
    const aMessage: string);
  begin
    Write('[', aKind, ']');
    Writeln(' : ', aMessage);
  end;
  { Prints the description of the document catalog dictionary.
    Nothing is printed when the document has no catalog. }
  procedure TPDFDumpApplication.DisplayCatalog(Doc : TPDFDocument);
  begin
    if not Assigned(Doc.FindCatalog) then
      exit;
    Writeln('Document catalog:');
    Writeln(Doc.FindCatalog.ObjectDict.GetDescription);
  end;
  { Prints the document information fields (title, author, dates, ...).
    Nothing is printed when the document has no document info object.
    The info instance obtained from the document is freed here. }
  procedure TPDFDumpApplication.DisplayInfo(Doc : TPDFDocument);
  var
    DocInfo : TPDFDocumentInfo;
  begin
    if Assigned(Doc.FindDocumentInfoObject) then
      begin
      DocInfo := Doc.FindDocumentInfo;
      try
        Writeln('Document info:');
        Writeln('Title : ', DocInfo.Title);
        Writeln('Author : ', DocInfo.Author);
        Writeln('Subject : ', DocInfo.Subject);
        Writeln('Keywords : ', DocInfo.Keywords);
        Writeln('Creator : ', DocInfo.Creator);
        Writeln('Producer : ', DocInfo.Producer);
        Writeln('Creation Date : ', DateTimeToStr(DocInfo.CreationDate));
        Writeln('Modification Date : ', DateTimeToStr(DocInfo.ModDate));
        Writeln('Trapped : ', DocInfo.Trapped);
      finally
        DocInfo.Free;
      end;
      end;
  end;
  { Prints every command of the page's command list on its own line:
    index, command name, command class, then the data of each token.
    Doc and aIndex are not used by this routine. }
  procedure TPDFDumpApplication.DisplayPageContents(Doc : TPDFDocument; aIndex: Integer; aPage : TPDFPageObject);
  var
    CmdIdx, TokIdx : Integer;
    Current : TPDFCommand;
  begin
    CmdIdx := 0;
    while CmdIdx < aPage.CommandList.Count do
      begin
      Current := aPage.CommandList[CmdIdx];
      Write('Command ', CmdIdx, ' : ', Current.Command, ' (', Current.ClassName, '):');
      for TokIdx := 0 to High(Current.Tokens) do
        Write(' ', Current.Tokens[TokIdx].TokenData);
      Writeln;
      Inc(CmdIdx);
      end;
  end;
  { Prints the text of a page by walking its command list.

    A font selection command updates the current Unicode map: the font name
    (without a leading slash) is looked up on the page, then in the document.
    A text command is converted with the current map (nil when no font was
    found) and printed as UTF-8, one line per command.
    aIndex is not used by this routine. }
  procedure TPDFDumpApplication.DisplayPageText(Doc : TPDFDocument; aIndex: Integer; aPage : TPDFPageObject);
  var
    Idx : Integer;
    Current : TPDFCommand;
    FontName, Line : RawByteString;
    FontRef : TPDFRefData;
    CurrentMap : TPDFCMap;
    Font : TPDFFontObject;
  begin
    CurrentMap := Nil;
    for Idx := 0 to aPage.CommandList.Count-1 do
      begin
      Current := aPage.CommandList[Idx];
      if Current is TPDFTf_Command then
        begin
        FontName := TPDFTf_Command(Current).FontName;
        // Strip the leading slash of the PDF name, if present.
        if (Length(FontName) > 0) and (FontName[1] = '/') then
          Delete(FontName, 1, 1);
        FontRef := aPage.FindFontRef(FontName);
        Font := Doc.FindFont(FontRef);
        // Unknown font: fall back to no map at all.
        CurrentMap := Nil;
        if Assigned(Font) then
          CurrentMap := Font.UnicodeCMap;
        end
      else if Current is TPDFTextCommand then
        begin
        Line := TPDFTextCommand(Current).GetFullText(CurrentMap);
        SetCodePage(Line, CP_UTF8);
        Writeln(Line);
        end;
      end;
  end;
  { Prints the page count followed by information about the pages.

    For each displayed page: object type and class, the page dictionary when
    the dictionaries section is selected, the content stream commands when
    the page contents section is selected, and the page text when the page
    text section is selected.

    When FPageNo (option -n/--pageno) is greater than zero, only that page is
    displayed. The page number is treated as 1-based: zero is the value of the
    field when the option was not given and therefore means "all pages".
    NOTE(review): 1-based numbering is an assumption - confirm it is the
    intended meaning of -n. The "Page object" label keeps its 0-based index. }
  procedure TPDFDumpApplication.DisplayPages(Doc : TPDFDocument);
  var
    aPage : TPDFPageObject;
    I : Integer;
  begin
    Writeln('Page count : ', Doc.PageCount);
    for I := 0 to Doc.PageCount-1 do
      begin
      // Honour -n/--pageno: skip every page except the requested one.
      if (FPageNo > 0) and (I <> FPageNo-1) then
        continue;
      aPage := Doc.Page[I];
      Write('Page object ', I, ': ');
      if not Assigned(aPage) then
        Writeln('Not found')
      else
        begin
        Writeln('Object type: ', aPage.ObjectType, ' (', aPage.ClassName, ')');
        // Same nil guard on the dictionary as DisplayObjects uses.
        if (isDictionaries in FSections) and Assigned(aPage.ObjectDict) then
          begin
          Writeln('Page dictionary : ', aPage.ObjectDict.GetDescription);
          Writeln;
          end;
        if isPageContents in FSections then
          DisplayPageContents(Doc, I, aPage);
        if isPageText in FSections then
          begin
          Writeln('Page text : ');
          Writeln;
          DisplayPageText(Doc, I, aPage);
          end;
        end;
      end;
  end;
  { Parses the PDF file FN and prints every section selected in FSections.

    In verbose mode the parser's progress and log events are printed too.
    Unicode maps are resolved only when page text was requested. Two blank
    lines are written after each selected section, including the sections
    that have no display routine of their own. The stream, parser and
    document are released even when parsing or printing raises. }
  procedure TPDFDumpApplication.DumpFile(FN : String);
  var
    Source : TFileStream;
    Parser : TPDFParser;
    PDFDoc : TPDFDocument;
    Section : TInfoSection;
  begin
    Parser := Nil;
    PDFDoc := Nil;
    Writeln('Contents of ', FN, ' : ');
    Source := TFileStream.Create(FN, fmOpenRead or fmShareDenyWrite);
    try
      PDFDoc := TPDFDocument.Create();
      Parser := TPDFParser.Create(Source);
      if FVerbose then
        begin
        Parser.OnProgress := @DoProgress;
        Parser.OnLog := @DoLog;
        end;
      Parser.ParseDocument(PDFDoc);
      if isPageText in FSections then
        Parser.DoResolveToUnicodeCMaps(PDFDoc);
      for Section in FSections do
        begin
        case Section of
          isInfo    : DisplayInfo(PDFDoc);
          isCatalog : DisplayCatalog(PDFDoc);
          isTrailer : DisplayTrailer(PDFDoc);
          isObjects : DisplayObjects(PDFDoc);
          isFonts   : DisplayFonts(PDFDoc);
          isPages   : DisplayPages(PDFDoc);
        else
          // The remaining sections only modify the output of the ones above.
        end;
        Writeln;
        Writeln;
        end;
    finally
      PDFDoc.Free;
      Parser.Free;
      Source.Free;
    end;
    Flush(Output);
  end;
  var
    DumpApp : TPDFDumpApplication;

  { Program entry point: create the application, run it once and make sure
    it is released afterwards. }
  begin
    DumpApp := TPDFDumpApplication.Create(Nil);
    try
      DumpApp.Initialize;
      DumpApp.Run;
    finally
      DumpApp.Free;
    end;
  end.