jsonscanner.pp 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632
  1. {
  2. This file is part of the Free Component Library
  3. JSON source lexical scanner
  4. Copyright (c) 2007 by Michael Van Canneyt [email protected]
  5. See the file COPYING.FPC, included in this distribution,
  6. for details about the copyright.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  10. **********************************************************************}
  11. {$mode objfpc}
  12. {$h+}
  13. { $INLINE ON}
  14. unit jsonscanner;
  15. interface
  16. uses SysUtils, Classes;
  17. resourcestring
  18. SErrInvalidCharacter = 'Invalid character at line %d, pos %d: ''%s''';
  19. SUnterminatedComment = 'Unterminated comment at line %d, pos %d: ''%s''';
  20. SErrOpenString = 'string exceeds end of line %d';
  21. type
  22. TJSONToken = (
  23. tkEOF,
  24. tkWhitespace,
  25. tkString,
  26. tkNumber,
  27. tkTrue,
  28. tkFalse,
  29. tkNull,
  30. // Simple (one-character) tokens
  31. tkComma, // ','
  32. tkColon, // ':'
  33. tkCurlyBraceOpen, // '{'
  34. tkCurlyBraceClose, // '}'
  35. tkSquaredBraceOpen, // '['
  36. tkSquaredBraceClose, // ']'
  37. tkIdentifier, // Any Javascript identifier
  38. tkComment,
  39. tkUnknown
  40. );
  41. EScannerError = class(EParserError);
  42. TJSONOption = (joUTF8,joStrict,joComments,joIgnoreTrailingComma,joIgnoreDuplicates,joBOMCheck);
  43. TJSONOptions = set of TJSONOption;
  44. Const
  45. DefaultOptions = [joUTF8];
  46. Type
  47. { TJSONScanner }
  48. TJSONScanner = class
  49. private
  50. FSource: RawByteString;
  51. FCurPos : PAnsiChar; // Position inside total string
  52. FCurRow: Integer;
  53. FCurToken: TJSONToken;
  54. FCurTokenString: string;
  55. FCurLine: PChar;
  56. FTokenStr: PAnsiChar; // position inside FCurLine
  57. FEOL : PAnsiChar; // EOL
  58. FOptions : TJSONOptions;
  59. function GetCurColumn: Integer; inline;
  60. function GetCurLine: string;
  61. function GetO(AIndex: TJSONOption): Boolean;
  62. procedure SetO(AIndex: TJSONOption; AValue: Boolean);
  63. protected
  64. procedure Error(const Msg: string);overload;
  65. procedure Error(const Msg: string; Const Args: array of const);overload;
  66. // function DoFetchToken: TJSONToken; inline;
  67. public
  68. constructor Create(Source : TStream; AUseUTF8 : Boolean = True); overload; deprecated 'use options form instead';
  69. constructor Create(Source: TStream; AOptions: TJSONOptions); overload;
  70. constructor Create(const aSource : RawByteString; AUseUTF8 : Boolean = True); overload; deprecated 'use options form instead';
  71. constructor Create(const aSource: RawByteString; AOptions: TJSONOptions); overload;
  72. function FetchToken: TJSONToken;
  73. property CurLine: string read GetCurLine;
  74. property CurRow: Integer read FCurRow;
  75. property CurColumn: Integer read GetCurColumn;
  76. property CurToken: TJSONToken read FCurToken;
  77. property CurTokenString: string read FCurTokenString;
  78. // Use strict JSON: " for strings, object members are strings, not identifiers
  79. Property Strict : Boolean Index joStrict Read GetO Write SetO ; deprecated 'use options instead';
  80. // if set to TRUE, then strings will be converted to UTF8 ansistrings, not system codepage ansistrings.
  81. Property UseUTF8 : Boolean index joUTF8 Read GetO Write SetO; deprecated 'Use options instead';
  82. // Parsing options
  83. Property Options : TJSONOptions Read FOptions Write FOptions;
  84. end;
  85. const
  86. TokenInfos: array[TJSONToken] of string = (
  87. 'EOF',
  88. 'Whitespace',
  89. 'String',
  90. 'Number',
  91. 'True',
  92. 'False',
  93. 'Null',
  94. ',',
  95. ':',
  96. '{',
  97. '}',
  98. '[',
  99. ']',
  100. 'identifier',
  101. 'comment',
  102. ''
  103. );
  104. implementation
  105. constructor TJSONScanner.Create(Source : TStream; AUseUTF8 : Boolean = True);
  106. Var
  107. O : TJSONOptions;
  108. begin
  109. O:=DefaultOptions;
  110. if AUseUTF8 then
  111. Include(O,joUTF8)
  112. else
  113. Exclude(O,joUTF8);
  114. Create(Source,O);
  115. end;
  116. constructor TJSONScanner.Create(Source: TStream; AOptions: TJSONOptions);
  117. procedure SkipStreamBOM;
  118. Var
  119. OldPos : integer;
  120. Header : array[0..3] of byte;
  121. begin
  122. OldPos := Source.Position;
  123. FillChar(Header{%H-}, SizeOf(Header), 0);
  124. if Source.Read(Header, 3) = 3 then
  125. if (Header[0]=$EF) and (Header[1]=$BB) and (Header[2]=$BF) then
  126. exit;
  127. Source.Position := OldPos;
  128. end;
  129. Var
  130. S : RawByteString;
  131. begin
  132. if (joBOMCheck in aOptions) then
  133. SkipStreamBom;
  134. S:='';
  135. SetLength(S,Source.Size-Source.Position);
  136. if Length(S)>0 then
  137. Source.ReadBuffer(S[1],Length(S));
  138. Create(S,AOptions)
  139. end;
  140. constructor TJSONScanner.Create(const aSource : RawByteString; AUseUTF8 : Boolean = True);
  141. Var
  142. O : TJSONOptions;
  143. begin
  144. O:=DefaultOptions;
  145. if AUseUTF8 then
  146. Include(O,joUTF8)
  147. else
  148. Exclude(O,joUTF8);
  149. Create(aSource,O);
  150. end;
  151. constructor TJSONScanner.Create(const aSource: RawByteString; AOptions: TJSONOptions);
  152. begin
  153. FSource:=aSource;
  154. FCurPos:=PAnsiChar(FSource);
  155. if FCurPos<>Nil then
  156. FCurRow:=1;
  157. FOptions:=AOptions;
  158. end;
  159. function TJSONScanner.GetCurColumn: Integer;
  160. begin
  161. Result := FTokenStr - FCurLine;
  162. end;
  163. procedure TJSONScanner.Error(const Msg: string);
  164. begin
  165. raise EScannerError.Create(Msg);
  166. end;
  167. procedure TJSONScanner.Error(const Msg: string; const Args: array of const);
  168. begin
  169. raise EScannerError.CreateFmt(Msg, Args);
  170. end;
  171. function TJSONScanner.FetchToken: TJSONToken;
  172. (*
  173. procedure dumpcurrent;
  174. begin
  175. Writeln('Start of line : ',FCurLine);
  176. Writeln('Cur pos : ',FCurPos);
  177. Writeln('Start of token : ',FTokenstr);
  178. Writeln('End of line : ',FTokenstr);
  179. end;
  180. *)
  181. function FetchLine: Boolean;
  182. begin
  183. Result:=(FCurPos<>Nil) and (FCurPos^<>#0);
  184. if Result then
  185. begin
  186. FCurLine:=FCurPos;
  187. FTokenStr:=FCurPos;
  188. While Not (FCurPos^ in [#0,#10,#13]) do
  189. Inc(FCurPos);
  190. FEOL:=FCurPos;
  191. If (FCurPos^<>#0) then
  192. // While (FCurPos^<>#0) and (FCurPos^ in [#10,#13]) do
  193. begin
  194. if (FCurPos^=#13) and (FCurPos[1]=#10) then
  195. Inc(FCurPos); // Skip CR-LF
  196. Inc(FCurPos); // To start of next line
  197. Inc(FCurRow); // Increase line index
  198. end;
  199. // Len:=FEOL-FTokenStr;
  200. // FTokenStr:=FCurPos;
  201. end
  202. else
  203. begin
  204. FCurLine:=Nil;
  205. FTokenStr:=nil;
  206. end;
  207. end;
  208. var
  209. TokenStart: PChar;
  210. it : TJSONToken;
  211. I : Integer;
  212. OldLength, SectionLength, tstart,tcol, u1,u2: Integer;
  213. C , c2: char;
  214. S : String[8];
  215. Line : String;
  216. IsStar,EOC: Boolean;
  217. Procedure MaybeAppendUnicode;
  218. Var
  219. u : UTF8String;
  220. begin
  221. // if there is a leftover \u, append
  222. if (u1<>0) then
  223. begin
  224. if (joUTF8 in Options) or (DefaultSystemCodePage=CP_UTF8) then
  225. U:=Utf8Encode(WideString(WideChar(u1))) // ToDo: use faster function
  226. else
  227. U:=String(WideChar(u1)); // WideChar converts the encoding. Should it warn on loss?
  228. FCurTokenString:=FCurTokenString+U;
  229. OldLength:=Length(FCurTokenString);
  230. u1:=0;
  231. u2:=0;
  232. end;
  233. end;
  234. begin
  235. if (FTokenStr = nil) or (FTokenStr=FEOL) then
  236. begin
  237. if not FetchLine then
  238. begin
  239. Result := tkEOF;
  240. FCurToken := Result;
  241. exit;
  242. end;
  243. end;
  244. FCurTokenString := '';
  245. case FTokenStr^ of
  246. #0: // Empty line
  247. begin
  248. FetchLine;
  249. Result := tkWhitespace;
  250. end;
  251. #9, ' ', #10, #13:
  252. begin
  253. Result := tkWhitespace;
  254. repeat
  255. if FTokenStr = FEOL then
  256. begin
  257. if not FetchLine then
  258. begin
  259. FCurToken := Result;
  260. exit;
  261. end
  262. end
  263. else
  264. Inc(FTokenStr);
  265. until not (FTokenStr[0] in [#9, ' ']);
  266. end;
  267. '"','''':
  268. begin
  269. C:=FTokenStr^;
  270. If (C='''') and (joStrict in Options) then
  271. Error(SErrInvalidCharacter, [CurRow,CurColumn,FTokenStr[0]]);
  272. Inc(FTokenStr);
  273. TokenStart := FTokenStr;
  274. OldLength := 0;
  275. FCurTokenString := '';
  276. u1:=0;
  277. while not (FTokenStr^ in [#0,C]) do
  278. begin
  279. if (FTokenStr^='\') then
  280. begin
  281. // Save length
  282. SectionLength := FTokenStr - TokenStart;
  283. Inc(FTokenStr);
  284. // Read escaped token
  285. Case FTokenStr^ of
  286. '"' : S:='"';
  287. '''' : S:='''';
  288. 't' : S:=#9;
  289. 'b' : S:=#8;
  290. 'n' : S:=#10;
  291. 'r' : S:=#13;
  292. 'f' : S:=#12;
  293. '\' : S:='\';
  294. '/' : S:='/';
  295. 'u' : begin
  296. u2:=0;
  297. For I:=1 to 4 do
  298. begin
  299. Inc(FTokenStr);
  300. c2:=FTokenStr^;
  301. Case c2 of
  302. '0'..'9': u2:=u2*16+ord(c2)-ord('0');
  303. 'A'..'F': u2:=u2*16+ord(c2)-ord('A')+10;
  304. 'a'..'f': u2:=u2*16+ord(c2)-ord('a')+10;
  305. else
  306. Error(SErrInvalidCharacter, [CurRow,CurColumn,FTokenStr[0]]);
  307. end;
  308. end;
  309. if u1<>0 then
  310. begin
  311. // 4bytes, compose.
  312. if not ((u2>=$DC00) and (u2<=$DFFF)) then
  313. Error(SErrInvalidCharacter, [CurRow,CurColumn,IntToStr(u2)]);
  314. if (joUTF8 in Options) or (DefaultSystemCodePage=CP_UTF8) then
  315. S:=Utf8Encode(WideString(WideChar(u1)+WideChar(u2))) // ToDo: use faster function
  316. else
  317. S:=String(WideChar(u1)+WideChar(u2)); // WideChar converts the encoding. Should it warn on loss?
  318. u1:=0;
  319. end
  320. else
  321. begin
  322. // Surrogate start
  323. if (u2>=$D800) and (U2<=$DBFF) then
  324. begin
  325. u1:=u2;
  326. S:='';
  327. end
  328. else
  329. begin
  330. if (joUTF8 in Options) or (DefaultSystemCodePage=CP_UTF8) then
  331. S:=Utf8Encode(WideString(WideChar(u2))) // ToDo: use faster function
  332. else
  333. S:=String(WideChar(u2)); // WideChar converts the encoding. Should it warn on loss?
  334. U1:=0;
  335. U2:=0;
  336. end;
  337. end;
  338. end;
  339. #0 : Error(SErrOpenString,[FCurRow]);
  340. else
  341. Error(SErrInvalidCharacter, [CurRow,CurColumn,FTokenStr[0]]);
  342. end;
  343. I:=Length(S);
  344. if (SectionLength+I>0) then
  345. begin
  346. // If length=1, we know it was not \uXX, but u1 can be nonzero, and we must first append it.
  347. // example: \u00f8\"
  348. if (I=1) and (u1<>0) then
  349. MaybeAppendUnicode;
  350. SetLength(FCurTokenString, OldLength + SectionLength+i);
  351. if SectionLength > 0 then
  352. Move(TokenStart^, FCurTokenString[OldLength + 1], SectionLength);
  353. if I>0 then
  354. Move(S[1],FCurTokenString[OldLength + SectionLength+1],i);
  355. Inc(OldLength, SectionLength+I);
  356. end;
  357. // Next char
  358. TokenStart := FTokenStr+1;
  359. end
  360. else if u1<>0 then
  361. MaybeAppendUnicode;
  362. if FTokenStr^ < #$20 then
  363. if FTokenStr^ = #0 then Error(SErrOpenString,[FCurRow])
  364. else if joStrict in Options then Error(SErrInvalidCharacter, [CurRow,CurColumn,FTokenStr[0]]);
  365. Inc(FTokenStr);
  366. end;
  367. if FTokenStr^ = #0 then
  368. Error(SErrOpenString,[FCurRow]);
  369. if u1<>0 then
  370. MaybeAppendUnicode;
  371. SectionLength := FTokenStr - TokenStart;
  372. SetLength(FCurTokenString, OldLength + SectionLength);
  373. if SectionLength > 0 then
  374. Move(TokenStart^, FCurTokenString[OldLength + 1], SectionLength);
  375. Inc(FTokenStr);
  376. Result := tkString;
  377. end;
  378. ',':
  379. begin
  380. Inc(FTokenStr);
  381. Result := tkComma;
  382. end;
  383. '0'..'9','.','-':
  384. begin
  385. TokenStart := FTokenStr;
  386. if FTokenStr^ = '-' then inc(FTokenStr);
  387. case FTokenStr^ of
  388. '1'..'9': Inc(FTokenStr);
  389. '0': begin
  390. Inc(FTokenStr);
  391. if (joStrict in Options) and (FTokenStr^ in ['0'..'9']) then
  392. Error(SErrInvalidCharacter, [CurRow,CurColumn,FTokenStr[0]]);
  393. end;
  394. '.': if joStrict in Options then
  395. Error(SErrInvalidCharacter, [CurRow,CurColumn,FTokenStr[0]]);
  396. else
  397. Error(SErrInvalidCharacter, [CurRow,CurColumn,FTokenStr[0]]);
  398. end;
  399. while true do
  400. begin
  401. case FTokenStr^ of
  402. '0'..'9': inc(FTokenStr);
  403. '.':
  404. begin
  405. case FTokenStr[1] of
  406. '0'..'9': Inc(FTokenStr, 2);
  407. 'e', 'E': begin
  408. if joStrict in Options then
  409. Error(SErrInvalidCharacter, [CurRow,CurColumn,FTokenStr[0]]);
  410. Inc(FTokenStr);
  411. end;
  412. else Error(SErrInvalidCharacter, [CurRow,CurColumn,FTokenStr[0]]);
  413. end;
  414. while FTokenStr^ in ['0'..'9'] do
  415. inc(FTokenStr);
  416. break;
  417. end;
  418. else
  419. break;
  420. end;
  421. end;
  422. if FTokenStr^ in ['e', 'E'] then begin
  423. Inc(FTokenStr);
  424. if FTokenStr^ in ['-','+'] then
  425. Inc(FTokenStr);
  426. if not (FTokenStr^ in ['0'..'9']) then
  427. Error(SErrInvalidCharacter, [CurRow,CurColumn,FTokenStr[0]]);
  428. repeat
  429. Inc(FTokenStr);
  430. until not (FTokenStr^ in ['0'..'9']);
  431. end;
  432. if {(FTokenStr<>FEOL) and }not (FTokenStr^ in [#13,#10,#0,'}',']',',',#9,' ']) then
  433. Error(SErrInvalidCharacter, [CurRow,CurColumn,FTokenStr[0]]);
  434. SectionLength := FTokenStr - TokenStart;
  435. FCurTokenString:='';
  436. SetString(FCurTokenString, TokenStart, SectionLength);
  437. If (FCurTokenString[1]='.') then
  438. FCurTokenString:='0'+FCurTokenString;
  439. Result := tkNumber;
  440. end;
  441. ':':
  442. begin
  443. Inc(FTokenStr);
  444. Result := tkColon;
  445. end;
  446. '{':
  447. begin
  448. Inc(FTokenStr);
  449. Result := tkCurlyBraceOpen;
  450. end;
  451. '}':
  452. begin
  453. Inc(FTokenStr);
  454. Result := tkCurlyBraceClose;
  455. end;
  456. '[':
  457. begin
  458. Inc(FTokenStr);
  459. Result := tkSquaredBraceOpen;
  460. end;
  461. ']':
  462. begin
  463. Inc(FTokenStr);
  464. Result := tkSquaredBraceClose;
  465. end;
  466. '/' :
  467. begin
  468. if Not (joComments in Options) then
  469. Error(SErrInvalidCharacter, [CurRow,CurCOlumn,FTokenStr[0]]);
  470. TokenStart:=FTokenStr;
  471. Inc(FTokenStr);
  472. Case FTokenStr^ of
  473. '/' : begin
  474. FCurTokenString:='';
  475. Inc(FTokenStr);
  476. TokenStart:=FTokenStr;
  477. SectionLength := PChar(FEOL)-TokenStart;
  478. SetString(FCurTokenString, TokenStart, SectionLength);
  479. FetchLine;
  480. end;
  481. '*' :
  482. begin
  483. IsStar:=False;
  484. Inc(FTokenStr);
  485. TokenStart:=FTokenStr;
  486. Repeat
  487. While (FTokenStr=FEOL) do
  488. begin
  489. SectionLength := (FTokenStr - TokenStart);
  490. Line:='';
  491. SetString(Line, TokenStart, SectionLength);
  492. FCurtokenString:=FCurtokenString+Line+sLineBreak;
  493. if not fetchLine then
  494. Error(SUnterminatedComment, [CurRow,CurCOlumn,FTokenStr[0]]);
  495. TokenStart:=FTokenStr;
  496. end;
  497. IsStar:=FTokenStr^='*';
  498. Inc(FTokenStr);
  499. EOC:=(isStar and (FTokenStr^='/'));
  500. Until EOC;
  501. if EOC then
  502. begin
  503. SectionLength := (FTokenStr - TokenStart-1);
  504. Line:='';
  505. SetString(Line, TokenStart, SectionLength);
  506. FCurtokenString:=FCurtokenString+Line;
  507. Inc(FTokenStr);
  508. end;
  509. end;
  510. else
  511. Error(SErrInvalidCharacter, [CurRow,CurCOlumn,FTokenStr[0]]);
  512. end;
  513. Result:=tkComment;
  514. end;
  515. 'a'..'z','A'..'Z','_':
  516. begin
  517. tstart:=CurRow;
  518. Tcol:=CurColumn;
  519. TokenStart := FTokenStr;
  520. Result:=tkIdentifier;
  521. case TokenStart^ of
  522. 't': if (TokenStart[1] = 'r') and (TokenStart[2] = 'u') and (TokenStart[3] = 'e') then
  523. Result:=tkTrue;
  524. 'f': if (TokenStart[1] = 'a') and (TokenStart[2] = 'l') and (TokenStart[3] = 's') and (TokenStart[4] = 'e') then
  525. Result:=tkFalse;
  526. 'n': if (TokenStart[1] = 'u') and (TokenStart[2] = 'l') and (TokenStart[3] = 'l') then
  527. Result:=tkNull;
  528. end;
  529. if result <> tkIdentifier then inc(FTokenStr, length(TokenInfos[result]) - 1);
  530. repeat
  531. Inc(FTokenStr);
  532. until not (FTokenStr^ in ['A'..'Z', 'a'..'z', '0'..'9', '_']);
  533. SectionLength := FTokenStr - TokenStart;
  534. FCurTokenString:='';
  535. SetString(FCurTokenString, TokenStart, SectionLength);
  536. if (result = tkIdentifier) or (SectionLength <> length(TokenInfos[result])) then begin
  537. if (joStrict in Options) then
  538. Error(SErrInvalidCharacter, [tStart,tcol,TokenStart[0]]);
  539. for it := tkTrue to tkNull do
  540. if CompareText(CurTokenString, TokenInfos[it]) = 0 then
  541. begin
  542. Result := it;
  543. FCurToken := Result;
  544. exit;
  545. end;
  546. end;
  547. end;
  548. else
  549. Error(SErrInvalidCharacter, [CurRow,CurColumn,FTokenStr[0]]);
  550. end;
  551. FCurToken := Result;
  552. end;
  553. {function TJSONScanner.FetchToken: TJSONToken;
  554. begin
  555. Result:=DoFetchToken;
  556. end;}
  557. function TJSONScanner.GetCurLine: string;
  558. begin
  559. Result:='';
  560. if FCurLine<>Nil then
  561. begin
  562. SetLength(Result,FEOL-FCurLine);
  563. if Length(Result)>0 then
  564. Move(FCurLine^,Result[1],Length(Result));
  565. end;
  566. end;
  567. function TJSONScanner.GetO(AIndex: TJSONOption): Boolean;
  568. begin
  569. Result:=AIndex in FOptions;
  570. end;
  571. procedure TJSONScanner.SetO(AIndex: TJSONOption; AValue: Boolean);
  572. begin
  573. If AValue then
  574. Include(Foptions,AIndex)
  575. else
  576. Exclude(Foptions,AIndex)
  577. end;
  578. end.