xmlread.pp 65 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580
  1. {
  2. This file is part of the Free Component Library
  3. XML reading routines.
  4. Copyright (c) 1999-2000 by Sebastian Guenther, [email protected]
  5. Modified in 2006 by Sergei Gorelkin, [email protected]
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. unit XMLRead;
  13. {$ifdef fpc}
  14. {$MODE objfpc}{$H+}
  15. {$endif}
  16. interface
  17. uses
  18. SysUtils, Classes, DOM;
  19. type
  20. EXMLReadError = class(Exception);
  21. procedure ReadXMLFile(out ADoc: TXMLDocument; const AFilename: String); overload;
  22. procedure ReadXMLFile(out ADoc: TXMLDocument; var f: Text); overload;
  23. procedure ReadXMLFile(out ADoc: TXMLDocument; var f: TStream); overload;
  24. procedure ReadXMLFile(out ADoc: TXMLDocument; var f: TStream; const ABaseURI: String); overload;
  25. procedure ReadXMLFragment(AParentNode: TDOMNode; const AFilename: String); overload;
  26. procedure ReadXMLFragment(AParentNode: TDOMNode; var f: Text); overload;
  27. procedure ReadXMLFragment(AParentNode: TDOMNode; var f: TStream); overload;
  28. procedure ReadXMLFragment(AParentNode: TDOMNode; var f: TStream; const ABaseURI: String); overload;
  29. procedure ReadDTDFile(out ADoc: TXMLDocument; const AFilename: String); overload;
  30. procedure ReadDTDFile(out ADoc: TXMLDocument; var f: Text); overload;
  31. procedure ReadDTDFile(out ADoc: TXMLDocument; var f: TStream); overload;
  32. procedure ReadDTDFile(out ADoc: TXMLDocument; var f: TStream; const ABaseURI: String); overload;
  33. // =======================================================
  34. implementation
  35. uses
  36. UriParser;
  37. type
  38. TSetOfChar = set of Char;
  39. const
  40. Letter = ['A'..'Z', 'a'..'z'];
  41. Digit = ['0'..'9'];
  42. PubidChars: TSetOfChar = [' ', #13, #10, 'a'..'z', 'A'..'Z', '0'..'9',
  43. '-', '''', '(', ')', '+', ',', '.', '/', ':', '=', '?', ';', '!', '*',
  44. '#', '@', '$', '_', '%'];
  45. type
  46. TDOMNotationEx = class(TDOMNotation);
  47. TDOMAttrEx = class(TDOMAttr);
  48. TXMLInputSource = class;
  49. TDOMElementDef = class;
  50. TDOMEntityEx = class(TDOMEntity)
  51. protected
  52. FInternal: Boolean;
  53. FResolved: Boolean;
  54. FOnStack: Boolean;
  55. FReplacementText: DOMString;
  56. end;
  57. // TODO: Do I need PEMap in DocType? Maybe move it to Reader itself?
  58. // (memory usage - they are not needed after parsing)
  59. TDOMDocumentTypeEx = class(TDOMDocumentType)
  60. private
  61. FHasPERefs: Boolean;
  62. FPEMap: TDOMNamedNodeMap;
  63. FElementDefs: TDOMNamedNodeMap;
  64. function GetPEMap: TDOMNamedNodeMap;
  65. function GetElementDefs: TDOMNamedNodeMap;
  66. protected
  67. property PEMap: TDOMNamedNodeMap read GetPEMap;
  68. property ElementDefs: TDOMNamedNodeMap read GetElementDefs;
  69. property HasPERefs: Boolean read FHasPERefs write FHasPERefs;
  70. public
  71. destructor Destroy; override;
  72. end;
  73. TXMLReader = class;
  74. TDecoder = class;
  75. TDecoderRef = class of TDecoder;
  76. TXMLInputSource = class
  77. private
  78. FBuf: PChar;
  79. FBufEnd: PChar;
  80. FEof: Boolean;
  81. FSurrogate: WideChar;
  82. FReader: TXMLReader;
  83. FParent: TXMLInputSource;
  84. FEntity: TObject; // weak reference
  85. FCursor: TObject; // weak reference
  86. FLine: Integer;
  87. FColumn: Integer;
  88. FSystemID: WideString;
  89. FPublicID: WideString;
  90. function GetSystemID: WideString;
  91. function GetPublicID: WideString;
  92. protected
  93. procedure FetchData; virtual;
  94. public
  95. constructor Create(const AData: WideString);
  96. function NextChar: WideChar; virtual;
  97. procedure Initialize; virtual;
  98. procedure SetEncoding(const AEncoding: string); virtual;
  99. property SystemID: WideString read GetSystemID write FSystemID;
  100. property PublicID: WideString read GetPublicID write FPublicID;
  101. end;
  102. TXMLDecodingSource = class(TXMLInputSource)
  103. private
  104. FDecoder: TDecoder;
  105. FSeenCR: Boolean;
  106. function InternalNextChar: WideChar;
  107. procedure DecodingError(const Msg: string); overload;
  108. procedure DecodingError(const Msg: string; const Args: array of const); overload;
  109. public
  110. destructor Destroy; override;
  111. function NextChar: WideChar; override;
  112. procedure SetEncoding(const AEncoding: string); override;
  113. procedure Initialize; override;
  114. end;
  115. TXMLStreamInputSource = class(TXMLDecodingSource)
  116. private
  117. FAllocated: PChar;
  118. FStream: TStream;
  119. FBufSize: Integer;
  120. FOwnStream: Boolean;
  121. public
  122. constructor Create(AStream: TStream; AOwnStream: Boolean);
  123. destructor Destroy; override;
  124. procedure FetchData; override;
  125. end;
  126. TXMLFileInputSource = class(TXMLDecodingSource)
  127. private
  128. FFile: ^Text;
  129. FString: string;
  130. public
  131. constructor Create(var AFile: Text);
  132. procedure FetchData; override;
  133. end;
  134. TDecoder = class
  135. private
  136. FSource: TXMLDecodingSource;
  137. public
  138. constructor Create(ASource: TXMLDecodingSource);
  139. function DecodeNext: WideChar; virtual; abstract;
  140. class function Supports(const AEncoding: string): Boolean; virtual; abstract;
  141. end;
  142. TISO8859_1Decoder = class(TDecoder)
  143. public
  144. function DecodeNext: WideChar; override;
  145. class function Supports(const AEncoding: string): Boolean; override;
  146. end;
  147. TUCS2Decoder = class(TDecoder)
  148. private
  149. FSwapEndian: Boolean;
  150. FEncoding: string;
  151. public
  152. function DecodeNext: WideChar; override;
  153. class function Supports(const AEncoding: string): Boolean; override;
  154. end;
  155. TUTF8Decoder = class(TDecoder)
  156. public
  157. function DecodeNext: WideChar; override;
  158. class function Supports(const AEncoding: string): Boolean; override;
  159. end;
  160. PWideCharBuf = ^TWideCharBuf;
  161. TWideCharBuf = record
  162. Buffer: PWideChar;
  163. Length: Integer;
  164. MaxLength: Integer;
  165. end;
  166. TEntityResolveEvent = procedure(const PubID, SysID: WideString; var Source: TXMLInputSource) of object;
  167. TDeclType = (dtNone, dtXml, dtText);
  168. TXMLReader = class
  169. private
  170. FSource: TXMLInputSource;
  171. FCurChar: WideChar;
  172. FWhitespace: Boolean;
  173. FXML11: Boolean;
  174. FValue: TWideCharBuf;
  175. FName: TWideCharBuf;
  176. FCopyBuf: PWideCharBuf;
  177. FIntSubset: Boolean;
  178. FAllowedDecl: TDeclType;
  179. FDtdParsed: Boolean;
  180. FRecognizePE: Boolean;
  181. FStandalone: Boolean; // property of Doc ?
  182. FInvalid: Boolean;
  183. // TODO: This array must be stored globally, not per instance
  184. FNamePages: PByteArray;
  185. FForbiddenAscii: TSetOfChar;
  186. FDocType: TDOMDocumentTypeEx; // a shortcut
  187. FEntityLevel: Integer;
  188. FPreserveWhitespace: Boolean;
  189. FCreateEntityRefs: Boolean;
  190. procedure RaiseExpectedQmark;
  191. procedure GetChar;
  192. procedure GetCharRaw;
  193. procedure Unget(wc: WideChar);
  194. procedure Initialize(ASource: TXMLInputSource);
  195. procedure InitializeRoot(ASource: TXMLInputSource);
  196. procedure DoParseAttValue(Delim: WideChar);
  197. procedure DoParseFragment;
  198. procedure DoParseExtSubset(ASource: TXMLInputSource);
  199. function ContextPush(AEntity: TDOMEntityEx): Boolean;
  200. function ContextPop: Boolean;
  201. procedure XML11_BuildTables;
  202. function XML11_CheckName: Boolean;
  203. protected
  204. FCursor: TDOMNode;
  205. procedure RaiseExc(const descr: String); overload;
  206. procedure RaiseExc(const descr: string; const args: array of const); overload;
  207. procedure RaiseExc(Expected: WideChar); overload;
  208. function SkipWhitespace: Boolean;
  209. procedure ExpectWhitespace;
  210. procedure ExpectString(const s: String);
  211. procedure ExpectChar(wc: WideChar);
  212. function CheckForChar(c: WideChar): Boolean;
  213. procedure SkipString(const ValidChars: TSetOfChar);
  214. function GetString(const ValidChars: TSetOfChar): WideString;
  215. procedure RaiseNameNotFound;
  216. function CheckName: Boolean;
  217. function CheckNmToken: Boolean;
  218. function ExpectName: WideString; // [5]
  219. procedure SkipName;
  220. function SkipQuotedLiteral: Boolean;
  221. procedure ExpectAttValue; // [10]
  222. procedure SkipPubidLiteral; // [12]
  223. procedure SkipSystemLiteral(out Literal: WideString; Required: Boolean);
  224. procedure ParseComment; // [15]
  225. procedure ParsePI; // [16]
  226. procedure ParseCDSect; // [18]
  227. procedure ParseXmlOrTextDecl(TextDecl: Boolean);
  228. function ParseEq: Boolean; // [25]
  229. procedure ExpectEq;
  230. procedure ParseMisc; // [27]
  231. procedure ParseDoctypeDecl; // [28]
  232. procedure ParseMarkupDecl; // [29]
  233. procedure ParseElement; // [39]
  234. procedure ParseContent; // [43]
  235. function ResolvePredefined(const RefName: WideString): WideChar;
  236. procedure IncludeEntity(AEntity: TDOMEntityEx; InAttr: Boolean);
  237. procedure StartPE;
  238. function ParseCharRef: Boolean; // [66]
  239. function ParseReference: TDOMEntityEx; // [67]
  240. function ParsePEReference: Boolean; // [69]
  241. function ParseExternalID(out SysID, PubID: WideString; // [75]
  242. SysIdOptional: Boolean): Boolean;
  243. procedure ProcessTextAndRefs;
  244. procedure AssertPENesting(CurrentLevel: Integer);
  245. procedure ParseEntityDecl;
  246. procedure ParseEntityDeclValue(Delim: WideChar);
  247. procedure ParseAttlistDecl;
  248. procedure ExpectChoiceOrSeq;
  249. procedure ParseMixedOrChildren;
  250. procedure ParseElementDecl;
  251. procedure ParseNotationDecl;
  252. function ResolveEntity(const SystemID, PublicID: WideString; out Source: TXMLInputSource): Boolean;
  253. procedure ProcessDefaultAttributes(Element: TDOMElement);
  254. procedure ValidationError(const Msg: string; const args: array of const);
  255. public
  256. doc: TDOMDocument;
  257. constructor Create;
  258. destructor Destroy; override;
  259. procedure ProcessXML(ASource: TXMLInputSource); // [1]
  260. procedure ProcessFragment(ASource: TXMLInputSource; AOwner: TDOMNode);
  261. procedure ProcessDTD(ASource: TXMLInputSource); // ([29])
  262. end;
  263. // AttDef/ElementDef support
  264. TAttrDataType = (
  265. DT_CDATA,
  266. DT_ID,
  267. DT_IDREF,
  268. DT_IDREFS,
  269. DT_ENTITY,
  270. DT_ENTITIES,
  271. DT_NMTOKEN,
  272. DT_NMTOKENS,
  273. DT_NOTATION
  274. );
  275. TAttrDefault = (
  276. AD_IMPLIED,
  277. AD_DEFAULT,
  278. AD_REQUIRED,
  279. AD_FIXED
  280. );
  281. TDOMAttrDef = class(TDOMAttr)
  282. protected
  283. FDataType: TAttrDataType;
  284. FDefault: TAttrDefault;
  285. // FEnumeration: TWideStringList? array of WideStrings?
  286. end;
  287. TDOMElementDef = class(TDOMElement);
  288. {$i names.inc}
  289. // TODO: List of registered/supported decoders
  290. function FindDecoder(const Encoding: string): TDecoderRef;
  291. begin
  292. if TISO8859_1Decoder.Supports(Encoding) then
  293. Result := TISO8859_1Decoder
  294. else
  295. Result := nil;
  296. end;
  297. procedure BufAllocate(var ABuffer: TWideCharBuf; ALength: Integer);
  298. begin
  299. ABuffer.MaxLength := ALength;
  300. ABuffer.Length := 0;
  301. ABuffer.Buffer:=AllocMem(ABuffer.MaxLength*SizeOf(WideChar));
  302. end;
  303. procedure BufAppend(var ABuffer: TWideCharBuf; wc: WideChar);
  304. var
  305. OldLength : integer;
  306. begin
  307. if ABuffer.Length >= ABuffer.MaxLength then
  308. begin
  309. OldLength := ABuffer.MaxLength;
  310. ABuffer.MaxLength := ABuffer.MaxLength * 2;
  311. ReallocMem(ABuffer.Buffer, ABuffer.MaxLength * SizeOf(WideChar));
  312. FillChar(ABuffer.Buffer[OldLength],(ABuffer.MaxLength-OldLength) * SizeOf(WideChar),0);
  313. end;
  314. ABuffer.Buffer[ABuffer.Length] := wc;
  315. Inc(ABuffer.Length);
  316. end;
  317. function IsValidEncName(const s: WideString): Boolean;
  318. var
  319. I: Integer;
  320. begin
  321. Result := False;
  322. if (s = '') or (s[1] > #255) or not (char(s[1]) in ['A'..'Z', 'a'..'z']) then
  323. Exit;
  324. for I := 2 to Length(s) do
  325. if (s[I] > #255) or not (char(s[I]) in ['A'..'Z', 'a'..'z', '0'..'9', '.', '_', '-']) then
  326. Exit;
  327. Result := True;
  328. end;
  329. { TDOMDocumentTypeEx }
  330. destructor TDOMDocumentTypeEx.Destroy;
  331. begin
  332. FPEMap.Free;
  333. FElementDefs.Free;
  334. inherited Destroy;
  335. end;
  336. function TDOMDocumentTypeEx.GetElementDefs: TDOMNamedNodeMap;
  337. begin
  338. if FElementDefs = nil then
  339. FElementDefs := TDOMNamedNodeMap.Create(Self, ELEMENT_NODE);
  340. Result := FElementDefs;
  341. end;
  342. function TDOMDocumentTypeEx.GetPEMap: TDOMNamedNodeMap;
  343. begin
  344. if FPEMap = nil then
  345. FPEMap := TDOMNamedNodeMap.Create(Self, ENTITY_NODE);
  346. Result := FPEMap;
  347. end;
  348. // TODO: These classes still cannot be considered as the final solution...
  349. { TXMLInputSource }
  350. constructor TXMLInputSource.Create(const AData: WideString);
  351. begin
  352. inherited Create;
  353. FBuf := PChar(PWideChar(AData));
  354. FBufEnd := FBuf + Length(AData) * sizeof(WideChar);
  355. end;
  356. procedure TXMLInputSource.Initialize;
  357. begin
  358. FLine := 1;
  359. FColumn := 0;
  360. end;
  361. function TXMLInputSource.NextChar: WideChar;
  362. begin
  363. if FSurrogate <> #0 then
  364. begin
  365. Result := FSurrogate;
  366. FSurrogate := #0;
  367. end
  368. else if FBufEnd <= FBuf then
  369. begin
  370. Result := #0;
  371. FEof := True;
  372. end
  373. else
  374. begin
  375. Result := PWideChar(FBuf)^;
  376. Inc(FBuf, sizeof(WideChar));
  377. end;
  378. // TODO: Column counting - surrogate pair is a single char!
  379. if Result = #10 then
  380. begin
  381. Inc(FLine);
  382. FColumn := 0;
  383. end
  384. else
  385. Inc(FColumn);
  386. end;
  387. procedure TXMLDecodingSource.DecodingError(const Msg: string);
  388. begin
  389. FReader.RaiseExc(Msg);
  390. end;
  391. procedure TXMLDecodingSource.DecodingError(const Msg: string;
  392. const Args: array of const);
  393. begin
  394. FReader.RaiseExc(Msg, Args);
  395. end;
  396. procedure TXMLInputSource.FetchData;
  397. begin
  398. FEof := True;
  399. end;
  400. procedure TXMLInputSource.SetEncoding(const AEncoding: string);
  401. begin
  402. // do nothing
  403. end;
  404. function TXMLInputSource.GetPublicID: WideString;
  405. begin
  406. if FPublicID <> '' then
  407. Result := FPublicID
  408. else if Assigned(FParent) then
  409. Result := FParent.PublicID
  410. else
  411. Result := '';
  412. end;
  413. function TXMLInputSource.GetSystemID: WideString;
  414. begin
  415. if FSystemID <> '' then
  416. Result := FSystemID
  417. else if Assigned(FParent) then
  418. Result := FParent.SystemID
  419. else
  420. Result := '';
  421. end;
  422. { TXMLDecodingSource }
  423. destructor TXMLDecodingSource.Destroy;
  424. begin
  425. FDecoder.Free;
  426. inherited Destroy;
  427. end;
  428. function TXMLDecodingSource.InternalNextChar: WideChar;
  429. begin
  430. // TODO: find a place for it, finally...
  431. if FSurrogate <> #0 then
  432. begin
  433. Result := FSurrogate;
  434. FSurrogate := #0;
  435. Exit;
  436. end;
  437. if FBufEnd <= FBuf then
  438. FetchData;
  439. if not FEof then
  440. Result := FDecoder.DecodeNext
  441. else
  442. Result := #0;
  443. end;
  444. function TXMLDecodingSource.NextChar: WideChar;
  445. begin
  446. Result := InternalNextChar;
  447. if FSeenCR then
  448. begin
  449. if (Result = #10) or ((Result = #$85) and FReader.FXML11) then
  450. Result := InternalNextChar;
  451. FSeenCR := False;
  452. end;
  453. case Result of
  454. #13: begin
  455. FSeenCR := True;
  456. Result := #10;
  457. end;
  458. #$85, #$2028:
  459. if FReader.FXML11 then
  460. Result := #10;
  461. end;
  462. if (Result < #256) and (char(Result) in FReader.FForbiddenAscii) or
  463. ((ord(Result) or 1) = $FFFF) then
  464. DecodingError('Invalid character');
  465. // TODO: Column counting - surrogate pair is a single char!
  466. if Result = #10 then
  467. begin
  468. Inc(FLine);
  469. FColumn := 0;
  470. end
  471. else
  472. Inc(FColumn);
  473. end;
  474. procedure TXMLDecodingSource.Initialize;
  475. begin
  476. inherited;
  477. if FBufEnd-FBuf > 1 then
  478. repeat
  479. if (FBuf[0] = #$FE) and (FBuf[1] = #$FF) then // BE
  480. begin
  481. FDecoder := TUCS2Decoder.Create(Self);
  482. TUCS2Decoder(FDecoder).FEncoding := 'UTF-16BE';
  483. {$IFNDEF ENDIAN_BIG}
  484. TUCS2Decoder(FDecoder).FSwapEndian := True;
  485. {$ENDIF}
  486. Exit;
  487. end
  488. else if (FBuf[0] = #$FF) and (FBuf[1] = #$FE) then // LE
  489. begin
  490. FDecoder := TUCS2Decoder.Create(Self);
  491. TUCS2Decoder(FDecoder).FEncoding := 'UTF-16LE';
  492. {$IFDEF ENDIAN_BIG}
  493. TUCS2Decoder(FDecoder).FSwapEndian := True;
  494. {$ENDIF}
  495. Exit;
  496. end
  497. else
  498. Break;
  499. until False;
  500. FDecoder := TUTF8Decoder.Create(Self);
  501. end;
  502. procedure TXMLDecodingSource.SetEncoding(const AEncoding: string);
  503. var
  504. NewDecoder: TDecoderRef;
  505. begin
  506. if FDecoder.Supports(AEncoding) then // no change needed
  507. Exit;
  508. // hardcoded stuff - special case of UCS2
  509. if FDecoder is TUCS2Decoder then
  510. begin
  511. // check for 'UTF-16LE' or 'UTF-16BE'
  512. if SameText(AEncoding, TUCS2Decoder(FDecoder).FEncoding) then
  513. Exit
  514. else
  515. DecodingError('Current encoding cannot be switched to ''%s''', [AEncoding]);
  516. end;
  517. NewDecoder := FindDecoder(AEncoding);
  518. if Assigned(NewDecoder) then
  519. begin
  520. FDecoder.Free;
  521. FDecoder := NewDecoder.Create(Self);
  522. end
  523. else
  524. DecodingError('Encoding ''%s'' is not supported', [AEncoding]);
  525. end;
  526. { TXMLStreamInputSource }
  527. constructor TXMLStreamInputSource.Create(AStream: TStream; AOwnStream: Boolean);
  528. begin
  529. FStream := AStream;
  530. FBufSize := 4096;
  531. GetMem(FAllocated, FBufSize+8);
  532. FBuf := FAllocated+8;
  533. FBufEnd := FBuf;
  534. FOwnStream := AOwnStream;
  535. FetchData;
  536. end;
  537. destructor TXMLStreamInputSource.Destroy;
  538. begin
  539. FreeMem(FAllocated);
  540. if FOwnStream then
  541. FStream.Free;
  542. inherited Destroy;
  543. end;
  544. procedure TXMLStreamInputSource.FetchData;
  545. var
  546. Remainder, BytesRead: Integer;
  547. OldBuf: PChar;
  548. begin
  549. Assert(FBufEnd - FBuf < 8);
  550. OldBuf := FBuf;
  551. Remainder := FBufEnd - FBuf;
  552. FBuf := FAllocated+8-Remainder;
  553. Move(OldBuf^, FBuf^, Remainder);
  554. BytesRead := FStream.Read(FAllocated[8], FBufSize);
  555. if BytesRead = 0 then
  556. FEof := True;
  557. FBufEnd := FAllocated + 8 + BytesRead;
  558. end;
  559. { TXMLFileInputSource }
  560. constructor TXMLFileInputSource.Create(var AFile: Text);
  561. begin
  562. FFile := @AFile;
  563. ReadLn(FFile^, FString);
  564. FBuf := PChar(FString);
  565. FBufEnd := FBuf + Length(FString);
  566. end;
  567. procedure TXMLFileInputSource.FetchData;
  568. begin
  569. FEof := Eof(FFile^);
  570. if not FEof then
  571. begin
  572. ReadLn(FFile^, FString);
  573. FString := FString + #10; // bad solution...
  574. FBuf := PChar(FString);
  575. FBufEnd := FBuf + Length(FString);
  576. end;
  577. end;
  578. { TDecoder }
  579. constructor TDecoder.Create(ASource: TXMLDecodingSource);
  580. begin
  581. inherited Create;
  582. FSource := ASource;
  583. end;
  584. { TISO8859_1Decoder}
  585. function TISO8859_1Decoder.DecodeNext: WideChar;
  586. begin
  587. with FSource do
  588. begin
  589. Result := WideChar(FBuf[0]);
  590. Inc(FBuf);
  591. end;
  592. end;
  593. class function TISO8859_1Decoder.Supports(const AEncoding: string): Boolean;
  594. begin
  595. Result := SameText(AEncoding, 'ISO-8859-1') or
  596. SameText(AEncoding, 'ISO_8859-1') or
  597. SameText(AEncoding, 'latin1') or
  598. SameText(AEncoding, 'iso-ir-100') or
  599. SameText(AEncoding, 'l1') or
  600. SameText(AEncoding, 'IBM819') or
  601. SameText(AEncoding, 'CP819') or
  602. SameText(AEncoding, 'csISOLatin1') or
  603. // This one is not in character-sets.txt, but used in most FPC documentation...
  604. SameText(AEncoding, 'ISO8859-1');
  605. end;
  606. { TUCS2Decoder }
  607. function TUCS2Decoder.DecodeNext: WideChar;
  608. begin
  609. with FSource do
  610. begin
  611. Result := PWideChar(FBuf)^;
  612. Inc(FBuf, sizeof(WideChar));
  613. end;
  614. if FSwapEndian then
  615. Result := WideChar(Swap(Word(Result)));
  616. end;
  617. class function TUCS2Decoder.Supports(const AEncoding: string): Boolean;
  618. begin
  619. // generic aliases for both LE and BE
  620. Result := SameText(AEncoding, 'UTF-16') or
  621. SameText(AEncoding, 'unicode');
  622. end;
  623. { TUTF8Decoder }
  624. function TUTF8Decoder.DecodeNext: WideChar;
  625. const
  626. MaxCode: array[0..3] of Cardinal = ($7F, $7FF, $FFFF, $1FFFFF);
  627. var
  628. Value: Cardinal;
  629. I, bc: Integer;
  630. begin
  631. with FSource do
  632. begin
  633. Result := WideChar(FBuf[0]);
  634. Inc(FBuf);
  635. if Result < #$80 then
  636. Exit;
  637. if Byte(Result) and $40 = 0 then
  638. DecodingError('Invalid UTF8 sequence start byte');
  639. bc := 1;
  640. if Byte(Result) and $20 <> 0 then
  641. begin
  642. Inc(bc);
  643. if Byte(Result) and $10 <> 0 then
  644. begin
  645. Inc(bc);
  646. if Byte(Result) and $8 <> 0 then
  647. DecodingError('UCS4 character out of supported range');
  648. end;
  649. end;
  650. // DONE: (?) check that bc bytes available
  651. if FBufEnd-FBuf < bc then
  652. FetchData;
  653. Value := Byte(Result);
  654. I := bc; // note: I is never zero
  655. while bc > 0 do
  656. begin
  657. if ord(FBuf[0]) and $C0 <> $80 then
  658. DecodingError('Invalid byte in UTF8 sequence');
  659. Value := (Value shl 6) or (Cardinal(FBuf[0]) and $3F);
  660. Inc(FBuf);
  661. Dec(bc);
  662. end;
  663. Value := Value and MaxCode[I];
  664. // RFC2279 check
  665. if Value <= MaxCode[I-1] then
  666. DecodingError('Invalid UTF8 sequence');
  667. case Value of
  668. 0..$D7FF, $E000..$FFFF:
  669. begin
  670. Result := WideChar(Value);
  671. Exit;
  672. end;
  673. $10000..$10FFFF:
  674. begin
  675. Result := WideChar($D7C0 + (Value shr 10));
  676. FSurrogate := WideChar($DC00 xor (Value and $3FF));
  677. Exit;
  678. end;
  679. end;
  680. DecodingError('UCS4 character out of supported range');
  681. end;
  682. end;
  683. class function TUTF8Decoder.Supports(const AEncoding: string): Boolean;
  684. begin
  685. Result := SameText(AEncoding, 'UTF-8');
  686. end;
  687. { TXMLReader }
  688. function TXMLReader.ResolveEntity(const SystemID, PublicID: WideString; out Source: TXMLInputSource): Boolean;
  689. var
  690. AbsSysID: WideString;
  691. Filename: string;
  692. Stream: TStream;
  693. begin
  694. Result := False;
  695. if ResolveRelativeURI(FSource.SystemID, SystemID, AbsSysID) then
  696. begin
  697. Source := nil;
  698. // TODO: alternative resolvers
  699. if URIToFilename(AbsSysID, Filename) then
  700. begin
  701. try
  702. Stream := TFileStream.Create(Filename, fmOpenRead + fmShareDenyWrite);
  703. Source := TXMLStreamInputSource.Create(Stream, True);
  704. Source.SystemID := AbsSysID;
  705. Source.PublicID := PublicID;
  706. Result := True;
  707. except
  708. on E: Exception do
  709. ValidationError('%s', [E.Message]);
  710. end;
  711. end;
  712. end;
  713. end;
  714. procedure TXMLReader.InitializeRoot(ASource: TXMLInputSource);
  715. begin
  716. Initialize(ASource);
  717. GetChar;
  718. // TODO: presence of BOM must prevent UTF-8 encoding from being changed
  719. CheckForChar(#$FEFF); // skip BOM, if one is present
  720. end;
  721. procedure TXMLReader.Initialize(ASource: TXMLInputSource);
  722. begin
  723. FSource := ASource;
  724. FSource.FReader := Self;
  725. FSource.Initialize;
  726. end;
  727. procedure TXMLReader.GetCharRaw;
  728. begin
  729. FCurChar := FSource.NextChar;
  730. FWhitespace := (FCurChar = #32) or (FCurChar = #10) or
  731. (FCurChar = #9) or (FCurChar = #13);
  732. // Used for handling the internal DTD subset
  733. if Assigned(FCopyBuf) and (FSource.FParent = nil) then
  734. BufAppend(FCopyBuf^, FCurChar);
  735. end;
  736. procedure TXMLReader.GetChar;
  737. begin
  738. GetCharRaw;
  739. if not FRecognizePE then
  740. Exit;
  741. if (FCurChar = #0) and ContextPop then
  742. begin
  743. Unget(FCurChar);
  744. FCurChar := #32;
  745. FWhitespace := True;
  746. end
  747. else if FCurChar = '%' then
  748. begin
  749. FCurChar := FSource.NextChar;
  750. if not CheckName then
  751. begin
  752. Unget(FCurChar);
  753. FCurChar := '%';
  754. Exit;
  755. end;
  756. if FCurChar = ';' then // "%pe1;%pe2" - must not recognize pe2 immediately!
  757. GetCharRaw
  758. else
  759. RaiseExc(WideChar(';'));
  760. StartPE;
  761. FCurChar := #32;
  762. FWhitespace := True;
  763. end;
  764. end;
  765. procedure TXMLReader.Unget(wc: WideChar);
  766. begin
  767. FSource.FSurrogate := wc;
  768. end;
  769. procedure TXMLReader.RaiseExpectedQmark;
  770. begin
  771. RaiseExc('Expected single or double quote');
  772. end;
  773. procedure TXMLReader.RaiseExc(Expected: WideChar);
  774. begin
  775. // FIX: don't output what is found - anything may be found, including exploits...
  776. RaiseExc('Expected "%1s"', [string(Expected)]);
  777. end;
  778. procedure TXMLReader.RaiseExc(const descr: String);
  779. begin
  780. raise EXMLReadError.CreateFmt('In ''%s'' (line %d pos %d): %s', [FSource.SystemID, FSource.FLine, FSource.FColumn, descr]);
  781. end;
  782. procedure TXMLReader.RaiseExc(const descr: string; const args: array of const);
  783. begin
  784. RaiseExc(Format(descr, args));
  785. end;
  786. function TXMLReader.SkipWhitespace: Boolean;
  787. begin
  788. Result := False;
  789. while FWhitespace do
  790. begin
  791. GetChar;
  792. Result := True;
  793. end;
  794. end;
  795. procedure TXMLReader.ExpectWhitespace;
  796. begin
  797. if not SkipWhitespace then
  798. RaiseExc('Expected whitespace');
  799. end;
  800. procedure TXMLReader.ExpectChar(wc: WideChar);
  801. begin
  802. if FCurChar = wc then
  803. GetChar
  804. else
  805. RaiseExc(wc);
  806. end;
  807. procedure TXMLReader.ExpectString(const s: String);
  808. var
  809. I: Integer;
  810. begin
  811. for I := 1 to Length(s) do
  812. begin
  813. if FCurChar <> WideChar(s[i]) then
  814. RaiseExc('Expected "%s"', [s]);
  815. GetChar;
  816. end;
  817. end;
  818. function TXMLReader.CheckForChar(c: WideChar): Boolean;
  819. begin
  820. Result := (FCurChar = c);
  821. if Result then
  822. GetChar;
  823. end;
  824. procedure TXMLReader.SkipString(const ValidChars: TSetOfChar);
  825. begin
  826. FValue.Length := 0;
  827. while (ord(FCurChar) < 256) and (char(FCurChar) in ValidChars) do
  828. begin
  829. BufAppend(FValue, FCurChar);
  830. GetChar;
  831. end;
  832. end;
  833. function TXMLReader.GetString(const ValidChars: TSetOfChar): WideString;
  834. begin
  835. SkipString(ValidChars);
  836. SetString(Result, FValue.Buffer, FValue.Length);
  837. end;
  838. constructor TXMLReader.Create;
  839. begin
  840. inherited Create;
  841. // Naming bitmap: Point to static data for XML 1.0,
  842. // and allocate buffer in XML11_BuildTables when necessary.
  843. FNamePages := @NamePages;
  844. BufAllocate(FName, 128);
  845. BufAllocate(FValue, 512);
  846. FForbiddenAscii := [#1..#8, #11..#12, #14..#31];
  847. // TODO: put under user control
  848. FPreserveWhitespace := True;
  849. FCreateEntityRefs := True;
  850. end;
  851. destructor TXMLReader.Destroy;
  852. begin
  853. if FXML11 then
  854. FreeMem(FNamePages);
  855. FreeMem(FName.Buffer);
  856. FreeMem(FValue.Buffer);
  857. while ContextPop do; // clean input stack
  858. FSource.Free;
  859. inherited Destroy;
  860. end;
  861. procedure TXMLReader.XML11_BuildTables;
  862. var
  863. I: Integer;
  864. begin
  865. if not FXML11 then
  866. GetMem(FNamePages, 512);
  867. FXML11 := True;
  868. for I := 0 to 255 do
  869. FNamePages^[I] := ord(Byte(I) in Xml11HighPages);
  870. FNamePages^[0] := 2;
  871. FNamePages^[3] := $2c;
  872. FNamePages^[$20] := $2a;
  873. FNamePages^[$21] := $2b;
  874. FNamePages^[$2f] := $29;
  875. FNamePages^[$30] := $2d;
  876. FNamePages^[$fd] := $28;
  877. Move(FNamePages^, FNamePages^[256], 256);
  878. FNamePages^[$100] := $19;
  879. FNamePages^[$103] := $2E;
  880. FNamePages^[$120] := $2F;
  881. FForbiddenAscii := [#1..#8, #11..#12, #14..#31, #$7F..#$84, #$86..#$9F];
  882. end;
  883. procedure TXMLReader.ProcessXML(ASource: TXMLInputSource);
  884. begin
  885. doc := TXMLDocument.Create;
  886. FCursor := doc;
  887. InitializeRoot(ASource);
  888. FAllowedDecl := dtXml;
  889. ParseMisc;
  890. FDtdParsed := True;
  891. if FDocType = nil then
  892. ValidationError('Missing DTD', []);
  893. if CheckName then
  894. ParseElement
  895. else
  896. RaiseExc('Expected element');
  897. ParseMisc;
  898. if Assigned(FDocType) and (doc.DocumentElement.TagName <> FDocType.Name) then
  899. ValidationError('DTD name does not match root element', []);
  900. if FCurChar <> #0 then
  901. RaiseExc('Text after end of document element found');
  902. end;
  903. procedure TXMLReader.ProcessFragment(ASource: TXMLInputSource; AOwner: TDOMNode);
  904. begin
  905. doc := AOwner.OwnerDocument;
  906. FCursor := AOwner;
  907. InitializeRoot(ASource);
  908. FXML11 := doc.InheritsFrom(TXMLDocument) and (TXMLDocument(doc).XMLVersion = '1.1');
  909. FAllowedDecl := dtText;
  910. DoParseFragment;
  911. end;
  912. // XML 1.1 allowed range $10000..$EFFFF is [D800..DB7F] followed by [DC00..DFFF]
  913. function TXMLReader.XML11_CheckName: Boolean;
  914. begin
  915. if (FCurChar >= #$D800) and (FCurChar <= #$DB7F) then
  916. begin
  917. BufAppend(FName, FCurChar);
  918. GetCharRaw;
  919. Result := (FCurChar >= #$DC00) and (FCurChar <= #$DFFF);
  920. end
  921. else
  922. Result := False;
  923. end;
  924. function TXMLReader.CheckName: Boolean;
  925. begin
  926. FName.Length := 0;
  927. Result := (Byte(FCurChar) in NamingBitmap[FNamePages^[hi(Word(FCurChar))]]) or
  928. (FXML11 and XML11_CheckName);
  929. if Result then
  930. repeat
  931. BufAppend(FName, FCurChar);
  932. GetChar;
  933. until not ((Byte(FCurChar) in NamingBitmap[FNamePages^[$100+hi(Word(FCurChar))]]) or
  934. (FXML11 and XML11_CheckName));
  935. end;
  936. function TXMLReader.CheckNmToken: Boolean;
  937. begin
  938. FName.Length := 0;
  939. Result := False;
  940. while (Byte(FCurChar) in NamingBitmap[FNamePages^[$100+hi(Word(FCurChar))]]) or
  941. (FXML11 and XML11_CheckName) do
  942. begin
  943. BufAppend(FName, FCurChar);
  944. GetChar;
  945. Result := True;
  946. end;
  947. end;
  948. procedure TXMLReader.RaiseNameNotFound;
  949. begin
  950. RaiseExc('Name starts with invalid character');
  951. end;
  952. function TXMLReader.ExpectName: WideString;
  953. begin
  954. if not CheckName then
  955. RaiseNameNotFound;
  956. SetString(Result, FName.Buffer, FName.Length);
  957. end;
  958. procedure TXMLReader.SkipName;
  959. begin
  960. if not CheckName then
  961. RaiseNameNotFound;
  962. end;
  963. function TXMLReader.ResolvePredefined(const RefName: WideString): WideChar;
  964. begin
  965. if RefName = 'amp' then
  966. Result := '&'
  967. else if RefName = 'apos' then
  968. Result := ''''
  969. else if RefName = 'gt' then
  970. Result := '>'
  971. else if RefName = 'lt' then
  972. Result := '<'
  973. else if RefName = 'quot' then
  974. Result := '"'
  975. else
  976. Result := #0;
  977. end;
  978. function TXMLReader.ParseCharRef: Boolean; // [66]
  979. var
  980. Value: Integer;
  981. begin
  982. Result := FCurChar = '#';
  983. if Result then
  984. begin
  985. GetCharRaw;
  986. Value := 0;
  987. if CheckForChar('x') then
  988. repeat
  989. case FCurChar of
  990. '0'..'9': Value := Value * 16 + Ord(FCurChar) - Ord('0');
  991. 'a'..'f': Value := Value * 16 + Ord(FCurChar) - (Ord('a') - 10);
  992. 'A'..'F': Value := Value * 16 + Ord(FCurChar) - (Ord('A') - 10);
  993. else
  994. Break;
  995. end;
  996. GetCharRaw;
  997. until False
  998. else
  999. repeat
  1000. case FCurChar of
  1001. '0'..'9': Value := Value * 10 + Ord(FCurChar) - Ord('0');
  1002. else
  1003. Break;
  1004. end;
  1005. GetCharRaw;
  1006. until False;
  1007. ExpectChar(';');
  1008. case Value of
  1009. $01..$08, $0B..$0C, $0E..$1F:
  1010. if FXML11 then
  1011. BufAppend(FValue, WideChar(Value))
  1012. else
  1013. RaiseExc('Invalid character reference');
  1014. $09, $0A, $0D, $20..$D7FF, $E000..$FFFD:
  1015. BufAppend(FValue, WideChar(Value));
  1016. $10000..$10FFFF:
  1017. begin
  1018. BufAppend(FValue, WideChar($D7C0 + (Value shr 10)));
  1019. BufAppend(FValue, WideChar($DC00 xor (Value and $3FF)));
  1020. end;
  1021. else
  1022. RaiseExc('Invalid character reference');
  1023. end;
  1024. end;
  1025. end;
  1026. procedure TXMLReader.DoParseAttValue(Delim: WideChar);
  1027. var
  1028. RefNode: TDOMEntityEx;
  1029. begin
  1030. FValue.Length := 0;
  1031. while (FCurChar <> Delim) and (FCurChar <> #0) do
  1032. begin
  1033. if FCurChar = '<' then
  1034. RaiseExc('Literal "<" in attribute value')
  1035. else if FCurChar <> '&' then
  1036. begin
  1037. if FWhitespace then
  1038. FCurChar := #32;
  1039. BufAppend(FValue, FCurChar);
  1040. GetCharRaw;
  1041. end
  1042. else
  1043. begin
  1044. GetCharRaw; // skip '&'
  1045. if ParseCharRef then
  1046. Continue;
  1047. RefNode := ParseReference;
  1048. if Assigned(RefNode) then
  1049. begin
  1050. if FValue.Length > 0 then
  1051. begin
  1052. FCursor.AppendChild(doc.CreateTextNodeBuf(FValue.Buffer, FValue.Length));
  1053. FValue.Length := 0;
  1054. end;
  1055. if RefNode.SystemID <> '' then
  1056. RaiseExc('External entity reference is not allowed in attribute value');
  1057. IncludeEntity(RefNode, True);
  1058. end;
  1059. end;
  1060. end; // while
  1061. if FValue.Length > 0 then
  1062. begin
  1063. FCursor.AppendChild(doc.CreateTextNodeBuf(FValue.Buffer, FValue.Length));
  1064. FValue.Length := 0;
  1065. end;
  1066. end;
  1067. procedure TXMLReader.DoParseFragment;
  1068. begin
  1069. ParseContent;
  1070. if FCurChar <> #0 then
  1071. RaiseExc('Closing tag not allowed here');
  1072. end;
  1073. function TXMLReader.ContextPush(AEntity: TDOMEntityEx): Boolean;
  1074. var
  1075. Src: TXMLInputSource;
  1076. begin
  1077. if AEntity.SystemID <> '' then
  1078. begin
  1079. Result := ResolveEntity(AEntity.SystemID, AEntity.PublicID, Src);
  1080. if not Result then
  1081. Exit;
  1082. {
  1083. TODO: need different handling of TextDecl in external PEs
  1084. it cannot be parsed if PE is referenced INSIDE declaration
  1085. But - is such case ever met in the wild ?? E.g. MSXML fails such things...
  1086. }
  1087. FAllowedDecl := dtText;
  1088. end
  1089. else
  1090. Src := TXMLInputSource.Create(AEntity.FReplacementText);
  1091. AEntity.FOnStack := True;
  1092. Src.FEntity := AEntity;
  1093. Src.FParent := FSource;
  1094. Src.FCursor := FCursor;
  1095. Unget(FCurChar); // remember FCurChar in previous context
  1096. Inc(FEntityLevel);
  1097. Initialize(Src);
  1098. Result := True;
  1099. end;
  1100. function TXMLReader.ContextPop: Boolean;
  1101. var
  1102. Src: TXMLInputSource;
  1103. begin
  1104. Result := Assigned(FSource.FParent);
  1105. if Result then
  1106. begin
  1107. Src := FSource.FParent;
  1108. if Assigned(FSource.FEntity) then
  1109. TDOMEntityEx(FSource.FEntity).FOnStack := False;
  1110. FCursor := TDOMNode(FSource.FCursor);
  1111. FSource.Free;
  1112. FSource := Src;
  1113. Dec(FEntityLevel);
  1114. GetChar; // re-classify - case of "%pe1;%pe2;"
  1115. end;
  1116. end;
  1117. procedure TXMLReader.IncludeEntity(AEntity: TDOMEntityEx; InAttr: Boolean);
  1118. var
  1119. Node, Child: TDOMNode;
  1120. begin
  1121. if not AEntity.FResolved then
  1122. begin
  1123. if AEntity.FOnStack then
  1124. RaiseExc('Entity ''%s'' recursively references itself', [AEntity.NodeName]);
  1125. if ContextPush(AEntity) then
  1126. begin
  1127. GetCharRaw;
  1128. CheckForChar(#$FEFF);
  1129. FCursor := AEntity; // build child node tree for the entity
  1130. try
  1131. if InAttr then
  1132. DoParseAttValue(#0)
  1133. else
  1134. DoParseFragment;
  1135. AEntity.FResolved := True;
  1136. finally
  1137. ContextPop; // FCursor restored
  1138. FValue.Length := 0;
  1139. end;
  1140. end;
  1141. end;
  1142. Node := FCursor;
  1143. if FCreateEntityRefs or (not AEntity.FResolved) then
  1144. begin
  1145. Node := doc.CreateEntityReference(AEntity.NodeName);
  1146. FCursor.AppendChild(Node);
  1147. end;
  1148. Child := AEntity.FirstChild; // clone the entity node tree
  1149. while Assigned(Child) do
  1150. begin
  1151. Node.AppendChild(Child.CloneNode(True));
  1152. Child := Child.NextSibling;
  1153. end;
  1154. end;
  1155. procedure TXMLReader.StartPE;
  1156. var
  1157. PEName: WideString;
  1158. PEnt: TDOMEntityEx;
  1159. begin
  1160. SetString(PEName, FName.Buffer, FName.Length);
  1161. PEnt := FDocType.PEMap.GetNamedItem(PEName) as TDOMEntityEx;
  1162. if PEnt = nil then // TODO -cVC: Referencing undefined PE
  1163. begin // (These are classified as 'optional errors'...)
  1164. // ValidationError('Undefined parameter entity referenced: %s', [PEName]);
  1165. Exit;
  1166. end;
  1167. if PEnt.FOnStack then
  1168. RaiseExc('Entity ''%%%s'' recursively references itself', [PEnt.NodeName]);
  1169. ContextPush(PEnt);
  1170. end;
  1171. function TXMLReader.ParseReference: TDOMEntityEx;
  1172. var
  1173. RefName: WideString;
  1174. Predef: WideChar;
  1175. begin
  1176. Result := nil;
  1177. RefName := ExpectName;
  1178. ExpectChar(';');
  1179. Predef := ResolvePredefined(RefName);
  1180. if Predef <> #0 then
  1181. BufAppend(FValue, Predef)
  1182. else
  1183. begin
  1184. if Assigned(FDocType) then
  1185. Result := FDocType.Entities.GetNamedItem(RefName) as TDOMEntityEx;
  1186. if Result = nil then
  1187. begin
  1188. if FStandalone or (FDocType = nil) or not (FDocType.HasPERefs or (FDocType.SystemID <> '')) then
  1189. RaiseExc('Undefined entity ''%s'' referenced', [RefName])
  1190. else
  1191. ValidationError('Undefined entity ''%s'' referenced', [RefName]);
  1192. end
  1193. else
  1194. begin
  1195. if FStandalone and (not Result.FInternal) then
  1196. RaiseExc('Standalone constraint violation');
  1197. if Result.NotationName <> '' then
  1198. RaiseExc('Reference to unparsed entity ''%s''', [RefName]);
  1199. end;
  1200. end;
  1201. end;
  1202. procedure TXMLReader.ProcessTextAndRefs;
  1203. var
  1204. nonWs: Boolean;
  1205. last: WideChar;
  1206. RefNode: TDOMEntityEx;
  1207. begin
  1208. FValue.Length := 0;
  1209. nonWs := False;
  1210. FAllowedDecl := dtNone;
  1211. while (FCurChar <> '<') and (FCurChar <> #0) do
  1212. begin
  1213. if FCurChar <> '&' then
  1214. begin
  1215. if not FWhitespace then
  1216. nonWs := True;
  1217. BufAppend(FValue, FCurChar);
  1218. if FCurChar = '>' then
  1219. with FValue do
  1220. if (Length >= 3) and
  1221. (Buffer[Length-2] = ']') and (Buffer[Length-3] = ']') then
  1222. RaiseExc('Literal '']]>'' is not allowed in text');
  1223. GetCharRaw;
  1224. end
  1225. else
  1226. begin
  1227. GetCharRaw; // skip '&'
  1228. if ParseCharRef then
  1229. begin
  1230. last := FValue.Buffer[FValue.Length-1];
  1231. if (last <> #9) and (last <> #10) and (last <> #13) and (last <> #32) then
  1232. nonWs := True;
  1233. Continue;
  1234. end;
  1235. nonWs := True;
  1236. RefNode := ParseReference;
  1237. if Assigned(RefNode) then
  1238. begin
  1239. if (nonWs or FPreserveWhitespace) and (FValue.Length > 0) then
  1240. begin
  1241. FCursor.AppendChild(doc.CreateTextNodeBuf(FValue.Buffer, FValue.Length));
  1242. FValue.Length := 0;
  1243. nonWs := False;
  1244. end;
  1245. IncludeEntity(RefNode, False);
  1246. end;
  1247. end;
  1248. end; // while
  1249. if (nonWs or FPreserveWhitespace) and (FValue.Length > 0) then
  1250. begin
  1251. FCursor.AppendChild(doc.CreateTextNodeBuf(FValue.Buffer, FValue.Length));
  1252. FValue.Length := 0;
  1253. end;
  1254. end;
  1255. procedure TXMLReader.ExpectAttValue; // [10]
  1256. var
  1257. Delim: WideChar;
  1258. begin
  1259. if (FCurChar <> '''') and (FCurChar <> '"') then
  1260. RaiseExpectedQmark;
  1261. Delim := FCurChar;
  1262. GetCharRaw; // skip quote
  1263. DoParseAttValue(Delim);
  1264. GetChar; // NOTE: not GetCharRaw - when parsing AttDef in DTD,
  1265. // immediately following PERef must be recognized
  1266. end;
  1267. function TXMLReader.SkipQuotedLiteral: Boolean;
  1268. var
  1269. Delim: WideChar;
  1270. begin
  1271. Result := (FCurChar = '''') or (FCurChar = '"');
  1272. if Result then
  1273. begin
  1274. Delim := FCurChar;
  1275. GetCharRaw; // skip quote
  1276. FValue.Length := 0;
  1277. while (FCurChar <> Delim) and (FCurChar <> #0) do
  1278. begin
  1279. BufAppend(FValue, FCurChar);
  1280. GetCharRaw;
  1281. end;
  1282. ExpectChar(Delim); // <-- to check the EOF only
  1283. end;
  1284. end;
  1285. procedure TXMLReader.SkipPubidLiteral; // [12]
  1286. var
  1287. I: Integer;
  1288. begin
  1289. if SkipQuotedLiteral then
  1290. begin
  1291. for I := 0 to FValue.Length-1 do
  1292. if (FValue.Buffer[I] > #255) or not (Char(FValue.Buffer[I]) in PubidChars) then
  1293. RaiseExc('Illegal Public ID literal')
  1294. end
  1295. else
  1296. RaiseExpectedQMark;
  1297. end;
  1298. procedure TXMLReader.SkipSystemLiteral(out Literal: WideString; Required: Boolean);
  1299. begin
  1300. if SkipQuotedLiteral then
  1301. SetString(Literal, FValue.Buffer, FValue.Length)
  1302. else if Required then
  1303. RaiseExpectedQMark;
  1304. end;
  1305. procedure TXMLReader.ParseComment; // [15]
  1306. begin
  1307. ExpectString('--');
  1308. FValue.Length := 0;
  1309. repeat
  1310. BufAppend(FValue, FCurChar);
  1311. GetCharRaw;
  1312. with FValue do
  1313. if (Length >= 2) and (Buffer[Length-1] = '-') and
  1314. (Buffer[Length-2] = '-') then
  1315. begin
  1316. Dec(Length, 2);
  1317. if Assigned(FCursor) then
  1318. FCursor.AppendChild(doc.CreateCommentBuf(Buffer, Length));
  1319. ExpectChar('>');
  1320. Exit;
  1321. end;
  1322. until FCurChar = #0;
  1323. RaiseExc('Unterminated comment');
  1324. end;
  1325. procedure TXMLReader.ParsePI; // [16]
  1326. var
  1327. Name, Value: WideString;
  1328. begin
  1329. GetCharRaw; // skip '?'
  1330. Name := ExpectName;
  1331. with FName do
  1332. if (Length = 3) and
  1333. ((Buffer[0] = 'X') or (Buffer[0] = 'x')) and
  1334. ((Buffer[1] = 'M') or (Buffer[1] = 'm')) and
  1335. ((Buffer[2] = 'L') or (Buffer[2] = 'l')) then
  1336. begin
  1337. if Name <> 'xml' then
  1338. RaiseExc('''xml'' is a reserved word; it must be lowercase');
  1339. if FAllowedDecl <> dtNone then
  1340. begin
  1341. ParseXmlOrTextDecl(FAllowedDecl = dtText);
  1342. FAllowedDecl := dtNone;
  1343. Exit;
  1344. end
  1345. else
  1346. RaiseExc('XML declaration not allowed here');
  1347. end;
  1348. if FCurChar <> '?' then
  1349. ExpectWhitespace;
  1350. FAllowedDecl := dtNone;
  1351. FValue.Length := 0;
  1352. repeat
  1353. BufAppend(FValue, FCurChar);
  1354. GetCharRaw;
  1355. with FValue do
  1356. if (Length >= 2) and (Buffer[Length-1] = '>') and
  1357. (Buffer[Length-2] = '?') then
  1358. begin
  1359. Dec(Length, 2);
  1360. SetString(Value, Buffer, Length);
  1361. if Assigned(FCursor) then
  1362. FCursor.AppendChild(Doc.CreateProcessingInstruction(Name, Value));
  1363. Exit;
  1364. end;
  1365. until FCurChar = #0;
  1366. RaiseExc('Unterminated processing instruction');
  1367. end;
  1368. // here we come from ParsePI, 'xml' is already consumed
  1369. procedure TXMLReader.ParseXmlOrTextDecl(TextDecl: Boolean);
  1370. var
  1371. TmpStr: WideString;
  1372. IsXML11: Boolean;
  1373. begin
  1374. ExpectWhitespace;
  1375. // VersionInfo: optional in TextDecl, required in XmlDecl
  1376. if (not TextDecl) or (FCurChar = 'v') then
  1377. begin
  1378. ExpectString('version'); // [24]
  1379. ExpectEq;
  1380. SkipSystemLiteral(TmpStr, True);
  1381. IsXML11 := False;
  1382. if TmpStr = '1.1' then // Checking for bad chars is implied
  1383. IsXML11 := True
  1384. else if TmpStr <> '1.0' then
  1385. RaiseExc('Illegal version number');
  1386. if not TextDecl then
  1387. begin
  1388. if doc.InheritsFrom(TXMLDocument) then
  1389. TXMLDocument(doc).XMLVersion := TmpStr;
  1390. if IsXML11 then
  1391. XML11_BuildTables;
  1392. end
  1393. else // parsing external entity
  1394. if IsXML11 and not FXML11 then
  1395. RaiseExc('XML 1.0 document cannot invoke XML 1.1 entities');
  1396. if FCurChar <> '?' then
  1397. ExpectWhitespace;
  1398. end;
  1399. // EncodingDecl: required in TextDecl, optional in XmlDecl
  1400. if TextDecl or (FCurChar = 'e') then // [80]
  1401. begin
  1402. ExpectString('encoding');
  1403. ExpectEq;
  1404. SkipSystemLiteral(TmpStr, True);
  1405. if not IsValidEncName(TmpStr) then
  1406. RaiseExc('Illegal encoding name');
  1407. FSource.SetEncoding(TmpStr); // <-- Wide2Ansi conversion here
  1408. // getting here means that specified encoding is supported
  1409. // TODO: maybe assign the 'preferred' encoding name?
  1410. if not TextDecl and doc.InheritsFrom(TXMLDocument) then
  1411. TXMLDocument(doc).Encoding := TmpStr;
  1412. if FCurChar <> '?' then
  1413. ExpectWhitespace;
  1414. end;
  1415. // SDDecl: forbidden in TextDecl, optional in XmlDecl
  1416. if (not TextDecl) and (FCurChar = 's') then
  1417. begin
  1418. ExpectString('standalone');
  1419. ExpectEq;
  1420. SkipSystemLiteral(TmpStr, True);
  1421. if TmpStr = 'yes' then
  1422. FStandalone := True
  1423. else if TmpStr <> 'no' then
  1424. RaiseExc('Only "yes" or "no" are permitted as values of "standalone"');
  1425. SkipWhitespace;
  1426. end;
  1427. ExpectString('?>');
  1428. end;
  1429. procedure TXMLReader.ParseDoctypeDecl; // [28]
  1430. var
  1431. IntSubset: TWideCharBuf;
  1432. Src, OldSrc: TXMLInputSource;
  1433. begin
  1434. FAllowedDecl := dtNone;
  1435. if FDtdParsed then
  1436. RaiseExc('Markup declaration not allowed here');
  1437. ExpectString('DOCTYPE'); // gives possibly incorrect error message
  1438. ExpectWhitespace;
  1439. FDocType := TDOMDocumentTypeEx.Create(doc);
  1440. FDtdParsed := True;
  1441. { To comply with certain output tests, we must insert PIs coming from internal
  1442. subset before DocType node. This looks very synthetic, but let it be...
  1443. Moreover, this code actually duplicates such PIs }
  1444. try
  1445. FDocType.FName := ExpectName;
  1446. ExpectWhitespace;
  1447. ParseExternalID(FDocType.FSystemID, FDocType.FPublicID, False);
  1448. SkipWhitespace;
  1449. if FCurChar = '[' then
  1450. begin
  1451. BufAllocate(IntSubset, 256);
  1452. FCopyBuf := @IntSubset;
  1453. GetChar; // cause very first char after '[' to be appended
  1454. try
  1455. FIntSubset := True;
  1456. ParseMarkupDecl;
  1457. if IntSubset.Length > 0 then // sanity check - must at least contain ']'
  1458. SetString(FDocType.FInternalSubset, IntSubset.Buffer, IntSubset.Length-1);
  1459. ExpectChar(']');
  1460. finally
  1461. FIntSubset := False;
  1462. FCopyBuf := nil;
  1463. FreeMem(IntSubset.Buffer);
  1464. end;
  1465. SkipWhitespace;
  1466. end;
  1467. ExpectChar('>');
  1468. if FDocType.SystemID <> '' then
  1469. begin
  1470. if ResolveEntity(FDocType.SystemID, FDocType.PublicID, Src) then
  1471. begin
  1472. OldSrc := FSource;
  1473. Unget(FCurChar);
  1474. FCursor := nil;
  1475. try
  1476. DoParseExtSubset(Src);
  1477. finally
  1478. while ContextPop do; // Cleanup after possible exceptions
  1479. FSource.Free;
  1480. FSource := OldSrc;
  1481. GetChar;
  1482. FCursor := Doc;
  1483. end;
  1484. end;
  1485. end;
  1486. finally
  1487. doc.AppendChild(FDocType);
  1488. end;
  1489. end;
  1490. procedure TXMLReader.ParseMisc;
  1491. begin
  1492. repeat
  1493. if SkipWhitespace then
  1494. FAllowedDecl := dtNone;
  1495. if not CheckForChar('<') then
  1496. Break;
  1497. if CheckForChar('!') then
  1498. begin
  1499. FAllowedDecl := dtNone;
  1500. if FCurChar = '-' then
  1501. ParseComment
  1502. else
  1503. ParseDoctypeDecl;
  1504. end
  1505. else
  1506. if FCurChar = '?' then
  1507. ParsePI
  1508. else
  1509. Break;
  1510. until FCurChar = #0;
  1511. FAllowedDecl := dtNone;
  1512. end;
  1513. function TXMLReader.ParseEq: Boolean; // [25]
  1514. begin
  1515. while FWhitespace do GetCharRaw;
  1516. Result := FCurChar = '=';
  1517. if Result then
  1518. begin
  1519. GetCharRaw;
  1520. while FWhitespace do GetCharRaw;
  1521. end;
  1522. end;
  1523. procedure TXMLReader.ExpectEq;
  1524. begin
  1525. if not ParseEq then
  1526. RaiseExc('Expected "="');
  1527. end;
  1528. { DTD stuff }
  1529. procedure TXMLReader.AssertPENesting(CurrentLevel: Integer);
  1530. begin
  1531. if CurrentLevel <> FEntityLevel then
  1532. ValidationError('Parameter entities must be properly nested', []);
  1533. end;
  1534. // content model
  1535. type
  1536. TElementContentType = (
  1537. ctEmpty,
  1538. ctAny,
  1539. ctMixed,
  1540. ctName,
  1541. ctChoice,
  1542. ctSeq
  1543. );
  1544. TElementContentQuant = (
  1545. cqNone,
  1546. cqOpt,
  1547. cqReq,
  1548. cqPlus
  1549. );
  1550. {
  1551. TElementContent = record
  1552. ContentType: TElementContentType;
  1553. ContentQuant: TElementContentQuant;
  1554. Name: WideString;
  1555. Children: array of TElementContent;
  1556. end;
  1557. }
  1558. procedure TXMLReader.ExpectChoiceOrSeq(); // [49], [50]
  1559. var
  1560. Delim: WideChar;
  1561. PELevel: Integer;
  1562. begin
  1563. Delim := #0;
  1564. repeat
  1565. SkipWhitespace;
  1566. if FCurChar = '(' then
  1567. begin
  1568. PELevel := FEntityLevel;
  1569. GetChar;
  1570. ExpectChoiceOrSeq;
  1571. AssertPENesting(PELevel);
  1572. GetChar;
  1573. end
  1574. else
  1575. SkipName;
  1576. if CheckForChar('?') then
  1577. else if CheckForChar('*') then
  1578. else if CheckForChar('+') then;
  1579. SkipWhitespace;
  1580. if FCurChar = ')' then
  1581. Break;
  1582. if Delim = #0 then
  1583. begin
  1584. if (FCurChar = '|') or (FCurChar = ',') then
  1585. Delim := FCurChar
  1586. else
  1587. RaiseExc('Expected "|" or ","');
  1588. end
  1589. else
  1590. if FCurChar <> Delim then
  1591. RaiseExc(Delim);
  1592. GetChar; // skip delimiter
  1593. until False;
  1594. end;
  1595. procedure TXMLReader.ParseMixedOrChildren;
  1596. var
  1597. PELevel: Integer;
  1598. NeedAsterisk: Boolean;
  1599. begin
  1600. PELevel := FEntityLevel;
  1601. GetChar; // starting bracket
  1602. SkipWhitespace;
  1603. if CheckForChar('#') then // Mixed section [51]
  1604. begin
  1605. ExpectString('PCDATA');
  1606. SkipWhitespace;
  1607. NeedAsterisk := False;
  1608. while FCurChar <> ')' do
  1609. begin
  1610. ExpectChar('|');
  1611. NeedAsterisk := True;
  1612. SkipWhitespace;
  1613. SkipName;
  1614. SkipWhitespace;
  1615. end;
  1616. AssertPENesting(PELevel);
  1617. GetChar;
  1618. if NeedAsterisk then
  1619. ExpectChar('*')
  1620. else
  1621. CheckForChar('*');
  1622. end
  1623. else // Parse Children section [47]
  1624. begin
  1625. ExpectChoiceOrSeq;
  1626. AssertPENesting(PELevel);
  1627. GetChar;
  1628. if CheckForChar('?') then
  1629. else if CheckForChar('*') then
  1630. else if CheckForChar('+') then;
  1631. end;
  1632. end;
  1633. procedure TXMLReader.ParseElementDecl; // [45]
  1634. begin
  1635. SkipName;
  1636. ExpectWhitespace;
  1637. // Get contentspec [46]
  1638. if FCurChar = 'E' then
  1639. ExpectString('EMPTY')
  1640. else if FCurChar = 'A' then
  1641. ExpectString('ANY')
  1642. else if FCurChar = '(' then
  1643. ParseMixedOrChildren
  1644. else
  1645. RaiseExc('Invalid content specification');
  1646. end;
  1647. procedure TXMLReader.ParseNotationDecl; // [82]
  1648. var
  1649. Notation: TDOMNotationEx;
  1650. begin
  1651. Notation := TDOMNotationEx(TDOMNotation.Create(Doc));
  1652. try
  1653. Notation.FName := ExpectName;
  1654. ExpectWhitespace;
  1655. if not ParseExternalID(Notation.FSystemID, Notation.FPublicID, True) then
  1656. RaiseExc('Expected external or public ID');
  1657. except
  1658. Notation.Free;
  1659. raise;
  1660. end;
  1661. if FDocType.Notations.GetNamedItem(Notation.FName) = nil then
  1662. FDocType.Notations.SetNamedItem(Notation)
  1663. else
  1664. begin
  1665. ValidationError('Duplicate notation declaration: %s', [Notation.FName]);
  1666. Notation.Free;
  1667. end;
  1668. end;
  1669. procedure TXMLReader.ParseAttlistDecl; // [52]
  1670. var
  1671. SaveCurNode: TDOMNode;
  1672. ValueRequired: Boolean;
  1673. Token: WideString;
  1674. ElDef: TDOMElementDef;
  1675. AttDef: TDOMAttrDef;
  1676. begin
  1677. Token := ExpectName;
  1678. ElDef := TDOMElementDef(FDocType.ElementDefs.GetNamedItem(Token));
  1679. if ElDef = nil then
  1680. begin
  1681. // TODO -cVC: must distinguish ElementDef created here from one explicitly declared
  1682. ElDef := TDOMElementDef.Create(doc);
  1683. ElDef.FNodeName := Token;
  1684. FDocType.ElementDefs.SetNamedItem(ElDef);
  1685. end;
  1686. SkipWhitespace;
  1687. while FCurChar <> '>' do
  1688. begin
  1689. SkipWhitespace; { !!! }
  1690. AttDef := TDOMAttrDef.Create(doc);
  1691. try
  1692. AttDef.FName := ExpectName;
  1693. ExpectWhitespace;
  1694. Token := GetString(['A'..'Z']); // Get AttType [54], [55], [56]
  1695. if Token = 'CDATA' then
  1696. AttDef.FDataType := DT_CDATA
  1697. else if Token = 'ID' then
  1698. AttDef.FDataType := DT_ID
  1699. else if Token = 'IDREF' then
  1700. AttDef.FDataType := DT_IDREF
  1701. else if Token = 'IDREFS' then
  1702. AttDef.FDataType := DT_IDREFS
  1703. else if Token = 'ENTITY' then
  1704. AttDef.FDataType := DT_ENTITY
  1705. else if Token = 'ENTITIES' then
  1706. AttDef.FDataType := DT_ENTITIES
  1707. else if Token = 'NMTOKEN' then
  1708. AttDef.FDataType := DT_NMTOKEN
  1709. else if Token = 'NMTOKENS' then
  1710. AttDef.FDataType := DT_NMTOKENS
  1711. else if Token = 'NOTATION' then // [57], [58]
  1712. begin
  1713. AttDef.FDataType := DT_NOTATION;
  1714. ExpectWhitespace;
  1715. ExpectChar('(');
  1716. repeat
  1717. SkipWhitespace;
  1718. SkipName;
  1719. SkipWhitespace;
  1720. until not CheckForChar('|');
  1721. ExpectChar(')');
  1722. end
  1723. else
  1724. if CheckForChar('(') then // [59]
  1725. begin
  1726. AttDef.FDataType := DT_NMTOKEN;
  1727. repeat
  1728. SkipWhitespace;
  1729. if not CheckNmToken then
  1730. RaiseNameNotFound; // not completely correct error message
  1731. SkipWhitespace;
  1732. until not CheckForChar('|');
  1733. ExpectChar(')');
  1734. end else
  1735. RaiseExc('Invalid tokenized type');
  1736. ExpectWhitespace;
  1737. // Get DefaultDecl [60]
  1738. ValueRequired := False;
  1739. if CheckForChar('#') then
  1740. begin
  1741. Token := GetString(['A'..'Z']);
  1742. if Token = 'REQUIRED' then
  1743. AttDef.FDefault := AD_REQUIRED
  1744. else if Token = 'IMPLIED' then
  1745. AttDef.FDefault := AD_IMPLIED
  1746. else if Token = 'FIXED' then
  1747. begin
  1748. AttDef.FDefault := AD_FIXED;
  1749. ExpectWhitespace;
  1750. ValueRequired := True;
  1751. end
  1752. else
  1753. RaiseExc('Illegal attribute default');
  1754. end
  1755. else
  1756. begin
  1757. AttDef.FDefault := AD_DEFAULT;
  1758. ValueRequired := True;
  1759. end;
  1760. if ValueRequired then
  1761. begin
  1762. SaveCurNode := FCursor;
  1763. FCursor := AttDef;
  1764. // tricky moment, no tests for that
  1765. { FRecognizePE := False; } // TODO: shall it really be disabled?
  1766. try
  1767. ExpectAttValue;
  1768. finally
  1769. FCursor := SaveCurNode;
  1770. { FRecognizePE := not FIntSubset; }
  1771. end;
  1772. if AttDef.FDataType = DT_ID then
  1773. ValidationError('Attributes of type ID must not have a default value',[]);
  1774. end;
  1775. // First declaration is binding, subsequent should be ignored
  1776. if Assigned(ElDef.GetAttributeNode(AttDef.Name)) then
  1777. AttDef.Free
  1778. else
  1779. ElDef.SetAttributeNode(AttDef);
  1780. except
  1781. AttDef.Free;
  1782. raise;
  1783. end;
  1784. SkipWhitespace;
  1785. end;
  1786. end;
  1787. procedure TXMLReader.ParseEntityDeclValue(Delim: WideChar); // [9]
  1788. var
  1789. I: Integer;
  1790. Src: TXMLInputSource;
  1791. begin
  1792. Src := FSource;
  1793. // "Included in literal": process until delimiter hit IN SAME context
  1794. while not ((FSource = Src) and CheckForChar(Delim)) do
  1795. if ParsePEReference then
  1796. begin
  1797. if FIntSubset and (FSource.FParent = nil) then
  1798. RaiseExc('PE references in internal subset not allowed inside declarations');
  1799. StartPE;
  1800. GetCharRaw;
  1801. end
  1802. else if FCurChar = '&' then // CharRefs: include, EntityRefs: bypass
  1803. begin
  1804. GetCharRaw;
  1805. if not ParseCharRef then
  1806. begin
  1807. BufAppend(FValue, '&');
  1808. ExpectName;
  1809. ExpectChar(';');
  1810. for I := 0 to FName.Length-1 do
  1811. BufAppend(FValue, FName.Buffer[I]);
  1812. BufAppend(FValue, ';');
  1813. end;
  1814. end
  1815. else if FCurChar <> #0 then // Regular character
  1816. begin
  1817. BufAppend(FValue, FCurChar);
  1818. GetCharRaw;
  1819. end
  1820. else if not ContextPop then // #0
  1821. Break;
  1822. end;
  1823. procedure TXMLReader.ParseEntityDecl; // [70]
  1824. var
  1825. NDataAllowed: Boolean;
  1826. Delim: WideChar;
  1827. Entity: TDOMEntityEx;
  1828. Map: TDOMNamedNodeMap;
  1829. begin
  1830. NDataAllowed := True;
  1831. Map := FDocType.Entities;
  1832. if CheckForChar('%') then // [72]
  1833. begin
  1834. ExpectWhitespace;
  1835. NDataAllowed := False;
  1836. Map := FDocType.PEMap;
  1837. end;
  1838. Entity := TDOMEntityEx.Create(Doc);
  1839. try
  1840. Entity.FInternal := FIntSubset and (FSource.FParent = nil);
  1841. Entity.FName := ExpectName;
  1842. ExpectWhitespace;
  1843. if (FCurChar = '"') or (FCurChar = '''') then
  1844. begin
  1845. NDataAllowed := False;
  1846. Delim := FCurChar;
  1847. FRecognizePE := False; // PERef right after delimiter should not be recognized
  1848. GetCharRaw; // at char level - we process it 'manually'
  1849. FValue.Length := 0;
  1850. ParseEntityDeclValue(Delim);
  1851. FRecognizePE := not FIntSubset;
  1852. SetString(Entity.FReplacementText, FValue.Buffer, FValue.Length);
  1853. end
  1854. else
  1855. if not ParseExternalID(Entity.FSystemID, Entity.FPublicID, False) then
  1856. RaiseExc('Expected entity value or external ID');
  1857. if NDataAllowed then // [76]
  1858. begin
  1859. if FCurChar <> '>' then
  1860. ExpectWhitespace;
  1861. if FCurChar = 'N' then
  1862. begin
  1863. ExpectString('NDATA');
  1864. ExpectWhitespace;
  1865. SkipName;
  1866. // TODO -cVC: Notation declared. Here or after all has been read?
  1867. SetString(Entity.FNotationName, FName.Buffer, FName.Length);
  1868. if FDocType.Notations.GetNamedItem(Entity.NotationName) = nil then
  1869. ValidationError('Reference to undeclared notation ''%s''', [Entity.NotationName]);
  1870. end;
  1871. end;
  1872. except
  1873. Entity.Free;
  1874. raise;
  1875. end;
  1876. // Repeated declarations of same entity are legal but must be ignored
  1877. if Map.GetNamedItem(Entity.NodeName) = nil then
  1878. Map.SetNamedItem(Entity)
  1879. else
  1880. Entity.Free;
  1881. end;
  1882. procedure TXMLReader.ParseMarkupDecl; // [29]
  1883. var
  1884. Token: WideString;
  1885. IncludeLevel: Integer;
  1886. IgnoreLevel: Integer;
  1887. PELevel: Integer;
  1888. begin
  1889. IncludeLevel := 0;
  1890. IgnoreLevel := 0;
  1891. repeat
  1892. if SkipWhitespace then
  1893. FAllowedDecl := dtNone;
  1894. if ParsePEReference then // PERef between declarations should always be recognized
  1895. begin
  1896. FAllowedDecl := dtNone;
  1897. if Assigned(FDocType) then
  1898. FDocType.HasPERefs := True;
  1899. StartPE;
  1900. GetChar;
  1901. Continue;
  1902. end;
  1903. if (FCurChar = #0) and ContextPop then
  1904. Continue;
  1905. if (FCurChar = ']') and (IncludeLevel > 0) then
  1906. begin
  1907. ExpectString(']]>');
  1908. Dec(IncludeLevel);
  1909. Continue;
  1910. end;
  1911. if FCurChar <> '<' then
  1912. Break;
  1913. PELevel := FEntityLevel;
  1914. GetCharRaw;
  1915. if CheckForChar('!') then
  1916. begin
  1917. FAllowedDecl := dtNone;
  1918. if FCurChar = '-' then
  1919. ParseComment
  1920. else if FCurChar = '[' then
  1921. begin
  1922. if FIntSubset and (FSource.FParent = nil) then
  1923. RaiseExc('Conditional sections not allowed in internal subset');
  1924. FRecognizePE := not FIntSubset;
  1925. GetChar; // skip '['
  1926. SkipWhitespace;
  1927. Token := GetString(['A'..'Z']);
  1928. SkipWhitespace;
  1929. if Token = 'INCLUDE' then
  1930. Inc(IncludeLevel)
  1931. else if Token = 'IGNORE' then
  1932. IgnoreLevel := 1
  1933. else
  1934. RaiseExc('Expected "INCLUDE" or "IGNORE"');
  1935. AssertPENesting(PELevel);
  1936. ExpectChar('[');
  1937. if IgnoreLevel > 0 then
  1938. repeat
  1939. FRecognizePE := False; // PEs not recognized in IGNORE section
  1940. if CheckForChar('<') and CheckForChar('!') and CheckForChar('[') then
  1941. Inc(IgnoreLevel)
  1942. else if CheckForChar(']') and CheckForChar(']') and CheckForChar('>') then
  1943. Dec(IgnoreLevel)
  1944. else GetChar;
  1945. until (IgnoreLevel=0) or (FCurChar = #0);
  1946. end
  1947. else
  1948. begin
  1949. FRecognizePE := not FIntSubset;
  1950. Token := GetString(['A'..'Z']);
  1951. ExpectWhitespace;
  1952. if Token = 'ELEMENT' then
  1953. ParseElementDecl
  1954. else if Token = 'ENTITY' then
  1955. ParseEntityDecl
  1956. else if Token = 'ATTLIST' then
  1957. ParseAttlistDecl
  1958. else if Token = 'NOTATION' then
  1959. ParseNotationDecl
  1960. else
  1961. RaiseExc('Illegal markup declaration');
  1962. SkipWhitespace;
  1963. FRecognizePE := False; // ! Don't auto-pop context on last markup delimiter
  1964. ExpectChar('>'); // This enables correct nesting check
  1965. end;
  1966. {
  1967. MarkupDecl starting in PE and ending in root is a WFC [28a]
  1968. MarkupDecl starting in root but ending in PE is a VC (erratum 2e-14)
  1969. }
  1970. if PELevel > FEntityLevel then
  1971. RaiseExc('Parameter entities must be properly nested')
  1972. else
  1973. AssertPENesting(PELevel);
  1974. end
  1975. else if FCurChar = '?' then
  1976. ParsePI;
  1977. until False;
  1978. FRecognizePE := False;
  1979. if (IncludeLevel > 0) or (IgnoreLevel > 0) then
  1980. RaiseExc('Conditional section not closed');
  1981. end;
  1982. procedure TXMLReader.DoParseExtSubset(ASource: TXMLInputSource);
  1983. begin
  1984. InitializeRoot(ASource);
  1985. FAllowedDecl := dtText;
  1986. ParseMarkupDecl;
  1987. if FCurChar <> #0 then
  1988. RaiseExc('Illegal character in DTD');
  1989. end;
  1990. procedure TXMLReader.ProcessDTD(ASource: TXMLInputSource);
  1991. begin
  1992. doc := TXMLDocument.Create;
  1993. FDocType := TDOMDocumentTypeEx.Create(doc);
  1994. // TODO: DTD labeled version 1.1 will be rejected - must set FXML11 flag
  1995. // TODO: what shall be FCursor? FDocType cannot - it does not accept child nodes
  1996. doc.AppendChild(FDocType);
  1997. DoParseExtSubset(ASource);
  1998. end;
  1999. procedure TXMLReader.ParseCDSect; // [18]
  2000. var
  2001. name: WideString;
  2002. begin
  2003. ExpectString('[CDATA[');
  2004. FValue.Length := 0;
  2005. repeat
  2006. BufAppend(FValue, FCurChar);
  2007. GetCharRaw;
  2008. with FValue do
  2009. if (Length >= 3) and (Buffer[Length-1] = '>') and
  2010. (Buffer[Length-2] = ']') and (Buffer[Length-3] = ']') then
  2011. begin
  2012. Dec(Length, 3);
  2013. SetString(name, Buffer, Length);
  2014. FCursor.AppendChild(doc.CreateCDATASection(name));
  2015. Exit;
  2016. end;
  2017. until FCurChar = #0;
  2018. RaiseExc('Unterminated CDATA section');
  2019. end;
  2020. procedure TXMLReader.ParseContent;
  2021. begin
  2022. repeat
  2023. if FCurChar = '<' then
  2024. begin
  2025. GetCharRaw;
  2026. if CheckName then
  2027. ParseElement
  2028. else if FCurChar = '!' then
  2029. begin
  2030. GetCharRaw;
  2031. FAllowedDecl := dtNone;
  2032. if FCurChar = '[' then
  2033. ParseCDSect
  2034. else if FCurChar = '-' then
  2035. ParseComment
  2036. else
  2037. ParseDoctypeDecl; // actually will raise error
  2038. end
  2039. else if FCurChar = '?' then
  2040. ParsePI
  2041. else
  2042. Exit;
  2043. end
  2044. else
  2045. ProcessTextAndRefs;
  2046. until FCurChar = #0;
  2047. end;
  2048. // Element name already in FNameBuffer
  2049. procedure TXMLReader.ParseElement; // [39] [40] [44]
  2050. var
  2051. NewElem: TDOMElement;
  2052. IsEmpty: Boolean;
  2053. attr, OldAttr: TDOMNode;
  2054. begin
  2055. NewElem := doc.CreateElementBuf(FName.Buffer, FName.Length);
  2056. FCursor.AppendChild(NewElem);
  2057. Assert(NewElem.ParentNode = FCursor, 'AppendChild did not set ParentNode');
  2058. FCursor := NewElem;
  2059. IsEmpty := False;
  2060. while FCurChar <> '>' do
  2061. begin
  2062. if FCurChar = '/' then
  2063. begin
  2064. GetCharRaw;
  2065. IsEmpty := True;
  2066. FCursor := FCursor.ParentNode;
  2067. Break;
  2068. end;
  2069. // Get Attribute [41]
  2070. ExpectWhitespace;
  2071. if not CheckName then // allow stuff like <element >, <element />
  2072. Continue;
  2073. attr := doc.CreateAttributeBuf(FName.Buffer, FName.Length);
  2074. // !!cannot use TDOMElement.SetAttributeNode because it will free old attribute
  2075. OldAttr := NewElem.Attributes.SetNamedItem(Attr);
  2076. if Assigned(OldAttr) then
  2077. begin
  2078. OldAttr.Free;
  2079. RaiseExc('Duplicate attribute');
  2080. end;
  2081. ExpectEq;
  2082. Assert(TDOMAttr(attr).OwnerElement = NewElem, 'DOMAttr.OwnerElement not set correctly');
  2083. FCursor := attr;
  2084. ExpectAttValue;
  2085. FCursor := NewElem;
  2086. end;
  2087. ExpectChar('>');
  2088. ProcessDefaultAttributes(NewElem);
  2089. if not IsEmpty then
  2090. begin
  2091. if not FPreserveWhitespace then // critical for testsuite compliance
  2092. SkipWhitespace;
  2093. ParseContent;
  2094. if FCurChar = '/' then // Get ETag [42]
  2095. begin
  2096. GetCharRaw;
  2097. if ExpectName <> NewElem.TagName then
  2098. RaiseExc('Unmatching element end tag (expected "</%s>")', [NewElem.TagName]);
  2099. SkipWhitespace;
  2100. ExpectChar('>');
  2101. FCursor := FCursor.ParentNode;
  2102. end
  2103. else if FCurChar <> #0 then
  2104. RaiseNameNotFound
  2105. else // End of stream in content
  2106. RaiseExc('Document element not closed');
  2107. end;
  2108. end;
  2109. procedure TXMLReader.ProcessDefaultAttributes(Element: TDOMElement);
  2110. var
  2111. I: Integer;
  2112. ElDef: TDOMElementDef;
  2113. AttDefs: TDOMNamedNodeMap;
  2114. AttDef: TDOMAttrDef;
  2115. Attr: TDOMAttrEx;
  2116. Spec: Boolean;
  2117. begin
  2118. if Assigned(FDocType) then
  2119. begin
  2120. ElDef := TDOMElementDef(FDocType.ElementDefs.GetNamedItem(Element.TagName));
  2121. if Assigned(ElDef) and ElDef.HasAttributes then
  2122. begin
  2123. AttDefs := ElDef.Attributes;
  2124. for I := 0 to AttDefs.Length-1 do
  2125. begin
  2126. AttDef := AttDefs[I] as TDOMAttrDef;
  2127. Spec := True;
  2128. // no validity checking yet; just append default values
  2129. Attr := TDOMAttrEx(Element.GetAttributeNode(AttDef.Name));
  2130. if (AttDef.FDefault in [AD_DEFAULT, AD_FIXED]) and (Attr = nil) then
  2131. begin
  2132. Attr := TDOMAttrEx(AttDef.CloneNode(True));
  2133. Element.SetAttributeNode(Attr);
  2134. Spec := False;
  2135. end;
  2136. if Assigned(Attr) then
  2137. begin
  2138. Attr.FSpecified := Spec;
  2139. Attr.FNormalize := (AttDef.FDataType <> DT_CDATA);
  2140. end;
  2141. end;
  2142. end;
  2143. end;
  2144. end;
  2145. function TXMLReader.ParsePEReference: Boolean; // [69]
  2146. begin
  2147. Result := CheckForChar('%');
  2148. if Result then
  2149. begin
  2150. SkipName;
  2151. ExpectChar(';');
  2152. end;
  2153. end;
  2154. function TXMLReader.ParseExternalID(out SysID, PubID: WideString; // [75]
  2155. SysIdOptional: Boolean): Boolean;
  2156. begin
  2157. if FCurChar = 'S' then
  2158. begin
  2159. ExpectString('SYSTEM');
  2160. ExpectWhitespace;
  2161. SkipSystemLiteral(SysID, True);
  2162. Result := True;
  2163. end
  2164. else if FCurChar = 'P' then
  2165. begin
  2166. ExpectString('PUBLIC');
  2167. ExpectWhitespace;
  2168. SkipPubidLiteral;
  2169. SetString(PubID, FValue.Buffer, FValue.Length);
  2170. if SysIdOptional then
  2171. begin
  2172. SkipWhitespace;
  2173. SkipSystemLiteral(SysID, False);
  2174. end
  2175. else
  2176. begin
  2177. ExpectWhitespace;
  2178. SkipSystemLiteral(SysID, True);
  2179. end;
  2180. Result := True;
  2181. end else
  2182. Result := False;
  2183. end;
  2184. procedure TXMLReader.ValidationError(const Msg: string;
  2185. const args: array of const);
  2186. begin
  2187. // TODO: just a stub now
  2188. FInvalid := True;
  2189. end;
  2190. procedure ReadXMLFile(out ADoc: TXMLDocument; var f: Text);
  2191. var
  2192. Reader: TXMLReader;
  2193. Src: TXMLInputSource;
  2194. begin
  2195. ADoc := nil;
  2196. Src := TXMLFileInputSource.Create(f);
  2197. Src.SystemID := FilenameToURI(TTextRec(f).Name);
  2198. Reader := TXMLReader.Create;
  2199. try
  2200. Reader.ProcessXML(Src);
  2201. ADoc := TXMLDocument(Reader.Doc);
  2202. finally
  2203. Reader.Free;
  2204. end;
  2205. end;
  2206. procedure ReadXMLFile(out ADoc: TXMLDocument; var f: TStream; const ABaseURI: String);
  2207. var
  2208. Reader: TXMLReader;
  2209. Src: TXMLInputSource;
  2210. begin
  2211. ADoc := nil;
  2212. Reader := TXMLReader.Create;
  2213. try
  2214. Src := TXMLStreamInputSource.Create(f, False);
  2215. Src.SystemID := ABaseURI;
  2216. Reader.ProcessXML(Src);
  2217. finally
  2218. ADoc := TXMLDocument(Reader.doc);
  2219. Reader.Free;
  2220. end;
  2221. end;
  2222. procedure ReadXMLFile(out ADoc: TXMLDocument; var f: TStream);
  2223. begin
  2224. ReadXMLFile(ADoc, f, 'stream:');
  2225. end;
  2226. procedure ReadXMLFile(out ADoc: TXMLDocument; const AFilename: String);
  2227. var
  2228. FileStream: TStream;
  2229. begin
  2230. ADoc := nil;
  2231. FileStream := TFileStream.Create(AFilename, fmOpenRead+fmShareDenyWrite);
  2232. try
  2233. ReadXMLFile(ADoc, FileStream, FilenameToURI(AFilename));
  2234. finally
  2235. FileStream.Free;
  2236. end;
  2237. end;
  2238. procedure ReadXMLFragment(AParentNode: TDOMNode; var f: Text);
  2239. var
  2240. Reader: TXMLReader;
  2241. Src: TXMLInputSource;
  2242. begin
  2243. Reader := TXMLReader.Create;
  2244. try
  2245. Src := TXMLFileInputSource.Create(f);
  2246. Src.SystemID := FilenameToURI(TTextRec(f).Name);
  2247. Reader.ProcessFragment(Src, AParentNode);
  2248. finally
  2249. Reader.Free;
  2250. end;
  2251. end;
  2252. procedure ReadXMLFragment(AParentNode: TDOMNode; var f: TStream; const ABaseURI: String);
  2253. var
  2254. Reader: TXMLReader;
  2255. Src: TXMLInputSource;
  2256. begin
  2257. Reader := TXMLReader.Create;
  2258. try
  2259. Src := TXMLStreamInputSource.Create(f, False);
  2260. Src.SystemID := ABaseURI;
  2261. Reader.ProcessFragment(Src, AParentNode);
  2262. finally
  2263. Reader.Free;
  2264. end;
  2265. end;
  2266. procedure ReadXMLFragment(AParentNode: TDOMNode; var f: TStream);
  2267. begin
  2268. ReadXMLFragment(AParentNode, f, 'stream:');
  2269. end;
  2270. procedure ReadXMLFragment(AParentNode: TDOMNode; const AFilename: String);
  2271. var
  2272. Stream: TStream;
  2273. begin
  2274. Stream := TFileStream.Create(AFilename, fmOpenRead+fmShareDenyWrite);
  2275. try
  2276. ReadXMLFragment(AParentNode, Stream, FilenameToURI(AFilename));
  2277. finally
  2278. Stream.Free;
  2279. end;
  2280. end;
  2281. procedure ReadDTDFile(out ADoc: TXMLDocument; var f: Text);
  2282. var
  2283. Reader: TXMLReader;
  2284. Src: TXMLInputSource;
  2285. begin
  2286. ADoc := nil;
  2287. Reader := TXMLReader.Create;
  2288. try
  2289. Src := TXMLFileInputSource.Create(f);
  2290. Src.SystemID := FilenameToURI(TTextRec(f).Name);
  2291. Reader.ProcessDTD(Src);
  2292. ADoc := TXMLDocument(Reader.doc);
  2293. finally
  2294. Reader.Free;
  2295. end;
  2296. end;
  2297. procedure ReadDTDFile(out ADoc: TXMLDocument; var f: TStream; const ABaseURI: String);
  2298. var
  2299. Reader: TXMLReader;
  2300. Src: TXMLInputSource;
  2301. begin
  2302. ADoc := nil;
  2303. Reader := TXMLReader.Create;
  2304. try
  2305. Src := TXMLStreamInputSource.Create(f, False);
  2306. Src.SystemID := ABaseURI;
  2307. Reader.ProcessDTD(Src);
  2308. ADoc := TXMLDocument(Reader.doc);
  2309. finally
  2310. Reader.Free;
  2311. end;
  2312. end;
  2313. procedure ReadDTDFile(out ADoc: TXMLDocument; var f: TStream);
  2314. begin
  2315. ReadDTDFile(ADoc, f, 'stream:');
  2316. end;
  2317. procedure ReadDTDFile(out ADoc: TXMLDocument; const AFilename: String);
  2318. var
  2319. Stream: TStream;
  2320. begin
  2321. ADoc := nil;
  2322. Stream := TFileStream.Create(AFilename, fmOpenRead+fmShareDenyWrite);
  2323. try
  2324. ReadDTDFile(ADoc, Stream, FilenameToURI(AFilename));
  2325. finally
  2326. Stream.Free;
  2327. end;
  2328. end;
  2329. end.