pdfdump.pp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421
  1. { **********************************************************************
  2. This file is part of the Free Component Library
  3. PDF file dumper
  4. Copyright (c) 2022 by Michael Van Canneyt michael@freepascal.org
  5. See the file COPYING.FPC, included in this distribution,
  6. for details about the copyright.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  10. **********************************************************************}
  11. program pdfdump;
  12. {$mode objfpc}
  13. {$h+}
  14. uses
  15. cwString, sysutils, classes, contnrs, fppdfobjects, fppdfparser, fppdfpredict,
  16. custapp, fppdfconsts;
  type
    { Parts of a PDF document that can be selected for display.
      The declaration order is the order in which a set of sections is enumerated. }
    TInfoSection = (
      isInfo,
      isCatalog,
      isTrailer,
      isObjects,
      isFonts,
      isPages,
      isPageContents,
      isPageText,
      isDictionaries
    );
    TInfoSections = set of TInfoSection;

    { TPDFDumpApplication }

    { Console application that parses PDF files named on the command line
      and writes the selected sections of each file to standard output. }
    TPDFDumpApplication = class(TCustomApplication)
    private
      // File names collected from the non-option command-line arguments.
      FFiles: TStrings;
      // Sections selected for display.
      FSections: TInfoSections;
      // Value given with -n / --pageno.
      FPageNo: Integer;
      // True when -v / --verbose was given.
      FVerbose: Boolean;
    protected
      // Parser event handlers, attached in verbose mode.
      procedure DoLog(sender: TObject; aKind: TPDFLogkind; const aMessage: string); reintroduce;
      procedure DoProgress(Sender: TObject; aKind: TPDFProgressKind; aCurrent, aCount: Integer);
      // Document-level display routines.
      procedure DisplayInfo(Doc: TPDFDocument);
      procedure DisplayCatalog(Doc: TPDFDocument);
      procedure DisplayTrailer(Doc: TPDFDocument);
      procedure DisplayObjects(Doc: TPDFDocument);
      procedure DisplayFonts(Doc: TPDFDocument);
      procedure DisplayPages(Doc: TPDFDocument);
      // Per-page display routines.
      procedure DisplayPageContents(Doc: TPDFDocument; aIndex: Integer; aPage: TPDFPageObject);
      procedure DisplayPageText(Doc: TPDFDocument; aIndex: Integer; aPage: TPDFPageObject);
    public
      constructor Create(aOwner: TComponent); override;
      destructor Destroy; override;
      // Parses the command line; returns False when the run must not continue.
      function ProcessOptions: Boolean;
      // Prints the usage text and halts the program.
      procedure Usage(Msg: String);
      // Parses one PDF file and prints the selected sections.
      procedure DumpFile(FN: String);
      procedure DoRun; override;
    end;
  48. { TPDFDumpApplication }
  { Creates the application object and the string list that will receive
    the input file names. }
  constructor TPDFDumpApplication.Create(aOwner: TComponent);
  begin
    inherited Create(aOwner);
    FFiles := TStringList.Create;
  end;
  { Releases the file-name list before the inherited cleanup runs. }
  destructor TPDFDumpApplication.Destroy;
  begin
    FreeAndNil(FFiles);
    inherited;
  end;
  { Application body: parses the command line and dumps every file in FFiles.

    A failure while dumping one file is reported on StdErr, sets ExitCode to 1
    and does not prevent the remaining files from being processed. When at
    least one file failed, a summary line is written to StdErr at the end. }
  procedure TPDFDumpApplication.DoRun;
  var
    FN: String;
    Count, Errors: Integer;
  begin
    StopOnException := True;
    // Request termination up front; this body is meant to execute only once.
    Terminate;
    if not ProcessOptions then
      Exit;
    Count := 0;
    Errors := 0;
    for FN in FFiles do
      begin
      // Every file is counted exactly once, whether or not dumping it succeeds.
      Inc(Count);
      try
        DumpFile(FN);
      except
        on E: Exception do
          begin
          ExitCode := 1;
          // Track failures separately so the summary below can report them.
          Inc(Errors);
          Writeln(StdErr, Format('Error %s examining file "%s" : %s', [E.ClassName, FN, E.Message]));
          end;
      end;
      end;
    Flush(Output);
    if Errors > 0 then
      begin
      // Both arguments are integers, so both placeholders must be %d.
      Writeln(StdErr, Format('Processed %d files, encountered an error in %d files.', [Count, Errors]));
      Flush(StdErr);
      end;
  end;
  { Parses the command line.

    Fills FFiles with the non-option arguments, FSections with the requested
    sections (all sections when none was requested), and sets FVerbose and
    FPageNo from their options.

    Returns True when processing can continue. On an option error, on
    -h/--help, when no file name was given, or when the --pageno value is not
    an integer, Usage is called and False is returned. }
  function TPDFDumpApplication.ProcessOptions: Boolean;

    // Adds aSection to FSections when the short or long option is present.
    procedure CheckSection(aShort: Char; aLong: String; aSection: TInfoSection);
    begin
      if HasOption(aShort, aLong) then
        Include(FSections, aSection);
    end;

  const
    ShortOpts = 'hopcdiln:vtf';
    LongOpts: array of string = ('help','objects','pages','pagecontent','dictionaries','info','catalog','pageno:','verbose','text','fonts');
  var
    Err, PageArg: String;
    PageNo: Longint;
    S: TInfoSection;
  begin
    Result := False;
    Err := CheckOptions(ShortOpts, LongOpts);
    GetNonOptions(ShortOpts, LongOpts, FFiles);
    if (Err <> '') or HasOption('h', 'help') then
      begin
      Usage(Err);
      Exit;
      end;
    if FFiles.Count = 0 then
      begin
      Usage('No filenames specified');
      Exit;
      end;
    CheckSection('o', 'objects', isObjects);
    CheckSection('p', 'pages', isPages);
    CheckSection('c', 'pagecontent', isPageContents);
    CheckSection('d', 'dictionaries', isDictionaries);
    CheckSection('i', 'info', isInfo);
    CheckSection('f', 'fonts', isFonts);
    // -l selects the catalog section (previously mapped to isInfo by mistake).
    CheckSection('l', 'catalog', isCatalog);
    CheckSection('t', 'text', isPageText);
    FVerbose := HasOption('v', 'verbose');
    if HasOption('n', 'pageno') then
      begin
      PageArg := GetOptionValue('n', 'pageno');
      // Report a malformed number through Usage instead of letting a
      // conversion exception escape; FPageNo is only touched on success.
      if not TryStrToInt(PageArg, PageNo) then
        begin
        Usage(Format('Invalid page number: "%s"', [PageArg]));
        Exit;
        end;
      FPageNo := PageNo;
      end;
    // No explicit selection means: show everything.
    if FSections = [] then
      for S in TInfoSection do
        Include(FSections, S);
    Result := True;
  end;
  { Prints the command-line help and halts the program.

    Msg: error message; when non-empty it is written to StdErr before the
    help text and the program halts with exit code 1, otherwise with 0. }
  procedure TPDFDumpApplication.Usage(Msg: String);
  begin
    // Show the reason for the failure; previously Msg only affected the exit code.
    if Msg <> '' then
      Writeln(StdErr, 'Error: ', Msg);
    Writeln('Usage ',ExtractFileName(ParamStr(0)),' [options] FILE1 FILE2 ...');
    Writeln('Where options is one or more of:');
    Writeln('-h --help This help text');
    Writeln('-c --pagecontent Show page content stream (commands). Needs -p');
    Writeln('-d --dictionaries Show object dictionaries. Needs -o');
    // The short option for fonts is -f; -p is the pages option.
    Writeln('-f --fonts Show font info');
    Writeln('-i --info Show document info');
    Writeln('-l --catalog Show document catalog');
    Writeln('-n --pageno=N Show only page N');
    Writeln('-o --objects Show indirect objects');
    Writeln('-p --pages Show pages');
    Writeln('-t --text Show page text. Needs -p');
    Writeln('-v --verbose Show warnings/extra info when parsing');
    Halt(Ord(Msg<>''));
  end;
  { Writes the description of the document's trailer dictionary.
    Nothing is written when the document has no trailer dictionary. }
  procedure TPDFDumpApplication.DisplayTrailer(Doc: TPDFDocument);
  begin
    if not Assigned(Doc.TrailerDict) then
      Exit;
    Writeln('Trailer dictionary:');
    Writeln(Doc.TrailerDict.GetDescription);
  end;
  { Lists every object the document enumerates, one line per object.
    For indirect objects that have an object dictionary, the dictionary is
    printed as well when the dictionaries section is selected. }
  procedure TPDFDumpApplication.DisplayObjects(Doc: TPDFDocument);
  var
    Obj: TPDFObject;
    Indirect: TPDFIndirect;
  begin
    Writeln('Indirect object count : ', Doc.Count);
    for Obj in Doc do
      begin
      Writeln('Object (', Obj.ClassName, ') : ', Obj.GetDescription);
      if not (Obj is TPDFIndirect) then
        Continue;
      Indirect := TPDFIndirect(Obj);
      if Assigned(Indirect.ObjectDict) and (isDictionaries in FSections) then
        begin
        Writeln('object dictionary : ', Indirect.ObjectDict.GetDescription);
        Writeln;
        end;
      end;
  end;
  { Writes the description of every font object found while enumerating
    the document, each followed by two blank lines. }
  procedure TPDFDumpApplication.DisplayFonts(Doc: TPDFDocument);
  var
    Candidate: TPDFObject;
  begin
    Writeln('Font definitions:');
    Writeln;
    for Candidate in Doc do
      begin
      // Only font objects are of interest here.
      if not (Candidate is TPDFFontObject) then
        Continue;
      Writeln(Candidate.GetDescription);
      Writeln;
      Writeln;
      end;
  end;
  { Progress handler for the parser: writes one line with the kind of item
    being loaded and the current/total counters. }
  procedure TPDFDumpApplication.DoProgress(Sender: TObject; aKind: TPDFProgressKind;
    aCurrent, aCount: Integer);
  const
    // Display name for each progress kind, indexed by the enumeration value.
    KindNames: array [TPDFProgressKind] of String = ('XRef','Indirect','ContentStream');
  begin
    Write('Loading ', KindNames[aKind], ': ');
    Writeln(aCurrent, '/', aCount);
  end;
  { Log handler for the parser: writes the message prefixed with its kind
    in square brackets. }
  procedure TPDFDumpApplication.DoLog(sender: TObject; aKind: TPDFLogkind;
    const aMessage: string);
  begin
    Write('[', aKind, '] : ');
    Writeln(aMessage);
  end;
  { Writes the description of the document catalog's dictionary.
    Nothing is written when the document has no catalog. }
  procedure TPDFDumpApplication.DisplayCatalog(Doc: TPDFDocument);
  begin
    if not Assigned(Doc.FindCatalog) then
      Exit;
    Writeln('Document catalog:');
    Writeln(Doc.FindCatalog.ObjectDict.GetDescription);
  end;
  { Writes the document information fields (title, author, dates, ...).
    Nothing is written when the document has no info object. The info
    instance obtained from the document is freed by this routine. }
  procedure TPDFDumpApplication.DisplayInfo(Doc: TPDFDocument);
  var
    DocInfo: TPDFDocumentInfo;
  begin
    if not Assigned(Doc.FindDocumentInfoObject) then
      Exit;
    DocInfo := Doc.FindDocumentInfo;
    try
      Writeln('Document info:');
      Writeln('Title : ', DocInfo.Title);
      Writeln('Author : ', DocInfo.Author);
      Writeln('Subject : ', DocInfo.Subject);
      Writeln('Keywords : ', DocInfo.Keywords);
      Writeln('Creator : ', DocInfo.Creator);
      Writeln('Producer : ', DocInfo.Producer);
      Writeln('Creation Date : ', DateTimeToStr(DocInfo.CreationDate));
      Writeln('Modification Date : ', DateTimeToStr(DocInfo.ModDate));
      Writeln('Trapped : ', DocInfo.Trapped);
    finally
      DocInfo.Free;
    end;
  end;
  { Writes one line per command in the page's command list: the command
    index, the command itself, its class name and the data of each token.
    Doc and aIndex are not used by this routine. }
  procedure TPDFDumpApplication.DisplayPageContents(Doc: TPDFDocument; aIndex: Integer; aPage: TPDFPageObject);
  var
    CmdIdx, TokIdx: Integer;
    Current: TPDFCommand;
  begin
    for CmdIdx := 0 to aPage.CommandList.Count - 1 do
      begin
      Current := aPage.CommandList[CmdIdx];
      Write('Command ', CmdIdx, ' : ', Current.Command, ' (', Current.ClassName, '):');
      // Tokens follow on the same line, each preceded by a space.
      for TokIdx := 0 to Length(Current.Tokens) - 1 do
        Write(' ', Current.Tokens[TokIdx].TokenData);
      Writeln;
      end;
  end;
  { Writes the text of a page, one line per text command.

    The command list is walked in order. A font-selection (Tf) command looks
    up the named font and remembers its unicode CMap (nil when the font is
    not found); each text command is then rendered with the CMap remembered
    at that point. aIndex is not used by this routine. }
  procedure TPDFDumpApplication.DisplayPageText(Doc: TPDFDocument; aIndex: Integer; aPage: TPDFPageObject);
  var
    Idx: Integer;
    Current: TPDFCommand;
    FontName, Line: RawByteString;
    FontRef: TPDFRefData;
    ActiveMap: TPDFCMap;
    FontObj: TPDFFontObject;
  begin
    ActiveMap := nil;
    for Idx := 0 to aPage.CommandList.Count - 1 do
      begin
      Current := aPage.CommandList[Idx];
      if Current is TPDFTfCommand then
        begin
        FontName := TPDFTfCommand(Current).FontName;
        // Strip the leading slash of the name before the lookup.
        if (FontName <> '') and (FontName[1] = '/') then
          Delete(FontName, 1, 1);
        FontRef := aPage.FindFontRef(FontName);
        FontObj := Doc.FindFont(FontRef);
        ActiveMap := nil;
        if Assigned(FontObj) then
          ActiveMap := FontObj.UnicodeCMap;
        Continue;
        end;
      if Current is TPDFTextCommand then
        begin
        Line := TPDFTextCommand(Current).GetFullText(ActiveMap);
        SetCodePage(Line, CP_UTF8);
        Writeln(Line);
        end;
      end;
  end;
  { Writes the page count and, for each selected page, its object type and
    class, plus (depending on FSections) its dictionary, its content stream
    commands and its text.

    Page selection: FPageNo is interpreted as a 1-based page number. A value
    of 0 or less (0 being the initial value of the field) selects all pages.
    When FPageNo exceeds the page count, a "not found" line is written and
    no page is shown. The index printed after "Page object" stays 0-based. }
  procedure TPDFDumpApplication.DisplayPages(Doc: TPDFDocument);
  var
    aPage: TPDFPageObject;
    I, FirstIdx, LastIdx: Integer;
  begin
    Writeln('Page count : ', Doc.PageCount);
    FirstIdx := 0;
    LastIdx := Doc.PageCount - 1;
    // Honour -n/--pageno, which used to be parsed but never applied.
    if FPageNo > 0 then
      begin
      if FPageNo > Doc.PageCount then
        begin
        Writeln('Page ', FPageNo, ' not found');
        Exit;
        end;
      FirstIdx := FPageNo - 1;
      LastIdx := FirstIdx;
      end;
    for I := FirstIdx to LastIdx do
      begin
      aPage := Doc.Page[I];
      Write('Page object ', I, ': ');
      if not Assigned(aPage) then
        begin
        Writeln('Not found');
        Continue;
        end;
      Writeln('Object type: ', aPage.ObjectType, ' (', aPage.ClassName, ')');
      // Same nil guard on the dictionary as DisplayObjects applies.
      if (isDictionaries in FSections) and Assigned(aPage.ObjectDict) then
        begin
        Writeln('Page dictionary : ', aPage.ObjectDict.GetDescription);
        Writeln;
        end;
      if isPageContents in FSections then
        DisplayPageContents(Doc, I, aPage);
      if isPageText in FSections then
        begin
        Writeln('Page text : ');
        Writeln;
        DisplayPageText(Doc, I, aPage);
        end;
      end;
  end;
  { Parses the PDF file FN and writes every section selected in FSections.

    The file is opened read-only. In verbose mode the parser's progress and
    log events are routed to DoProgress and DoLog. When page text is
    requested, the parser resolves the ToUnicode CMaps after parsing. Two
    blank lines are written after each selected section, including sections
    that have no display routine of their own here. The document, parser
    and stream are released even when parsing or display raises. }
  procedure TPDFDumpApplication.DumpFile(FN: String);
  var
    Stream: TFileStream;
    Parser: TPDFParser;
    Document: TPDFDocument;
    Section: TInfoSection;
  begin
    // Pre-set to nil so the finally block can free them unconditionally.
    Document := nil;
    Parser := nil;
    Writeln('Contents of ', FN, ' : ');
    Stream := TFileStream.Create(FN, fmOpenRead or fmShareDenyWrite);
    try
      Document := TPDFDocument.Create();
      Parser := TPDFParser.Create(Stream);
      if FVerbose then
        begin
        Parser.OnProgress := @DoProgress;
        Parser.OnLog := @DoLog;
        end;
      Parser.ParseDocument(Document);
      if isPageText in FSections then
        Parser.DoResolveToUnicodeCMaps(Document);
      for Section in FSections do
        begin
        case Section of
          isInfo: DisplayInfo(Document);
          isCatalog: DisplayCatalog(Document);
          isTrailer: DisplayTrailer(Document);
          isObjects: DisplayObjects(Document);
          isFonts: DisplayFonts(Document);
          isPages: DisplayPages(Document);
        else
          // Remaining sections are handled inside the routines above.
        end;
        Writeln;
        Writeln();
        end;
    finally
      Document.Free;
      Parser.Free;
      Stream.Free;
    end;
    Flush(Output);
  end;
  var
    // The single application instance; created, run once and destroyed below.
    App: TPDFDumpApplication;

  begin
    App := TPDFDumpApplication.Create(nil);
    try
      App.Initialize;
      App.Run;
    finally
      App.Free;
    end;
  end.