jsonscanner.pp 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617
  1. {
  2. This file is part of the Free Component Library
  3. JSON source lexical scanner
  4. Copyright (c) 2007 by Michael Van Canneyt [email protected]
  5. See the file COPYING.FPC, included in this distribution,
  6. for details about the copyright.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  10. **********************************************************************}
  11. {$mode objfpc}
  12. {$h+}
  13. { $INLINE ON}
  14. unit jsonscanner;
  15. interface
  16. uses SysUtils, Classes;
  17. resourcestring
  18. SErrInvalidCharacter = 'Invalid character at line %d, pos %d: ''%s''';
  19. SUnterminatedComment = 'Unterminated comment at line %d, pos %d: ''%s''';
  20. SErrOpenString = 'string exceeds end of line %d';
  21. type
  22. TJSONToken = (
  23. tkEOF,
  24. tkWhitespace,
  25. tkString,
  26. tkNumber,
  27. tkTrue,
  28. tkFalse,
  29. tkNull,
  30. // Simple (one-character) tokens
  31. tkComma, // ','
  32. tkColon, // ':'
  33. tkCurlyBraceOpen, // '{'
  34. tkCurlyBraceClose, // '}'
  35. tkSquaredBraceOpen, // '['
  36. tkSquaredBraceClose, // ']'
  37. tkIdentifier, // Any Javascript identifier
  38. tkComment,
  39. tkUnknown
  40. );
  41. EScannerError = class(EParserError);
  42. TJSONOption = (joUTF8,joStrict,joComments,joIgnoreTrailingComma,joIgnoreDuplicates,joBOMCheck);
  43. TJSONOptions = set of TJSONOption;
  44. Const
  45. DefaultOptions = [joUTF8];
  46. Type
  47. { TJSONScanner }
  48. TJSONScanner = class
  49. private
  50. FSource: RawByteString;
  51. FCurPos : PAnsiChar; // Position inside total string
  52. FCurRow: Integer;
  53. FCurToken: TJSONToken;
  54. FCurTokenString: string;
  55. FCurLine: PChar;
  56. FTokenStr: PAnsiChar; // position inside FCurLine
  57. FEOL : PAnsiChar; // EOL
  58. FOptions : TJSONOptions;
  59. function GetCurColumn: Integer; inline;
  60. function GetCurLine: string;
  61. function GetO(AIndex: TJSONOption): Boolean;
  62. procedure SetO(AIndex: TJSONOption; AValue: Boolean);
  63. protected
  64. procedure Error(const Msg: string);overload;
  65. procedure Error(const Msg: string; Const Args: array of const);overload;
  66. // function DoFetchToken: TJSONToken; inline;
  67. public
  68. constructor Create(Source : TStream; AUseUTF8 : Boolean = True); overload; deprecated 'use options form instead';
  69. constructor Create(Source: TStream; AOptions: TJSONOptions); overload;
  70. constructor Create(const aSource : RawByteString; AUseUTF8 : Boolean = True); overload; deprecated 'use options form instead';
  71. constructor Create(const aSource: RawByteString; AOptions: TJSONOptions); overload;
  72. function FetchToken: TJSONToken;
  73. property CurLine: string read GetCurLine;
  74. property CurRow: Integer read FCurRow;
  75. property CurColumn: Integer read GetCurColumn;
  76. property CurToken: TJSONToken read FCurToken;
  77. property CurTokenString: string read FCurTokenString;
  78. // Use strict JSON: " for strings, object members are strings, not identifiers
  79. Property Strict : Boolean Index joStrict Read GetO Write SetO ; deprecated 'use options instead';
  80. // if set to TRUE, then strings will be converted to UTF8 ansistrings, not system codepage ansistrings.
  81. Property UseUTF8 : Boolean index joUTF8 Read GetO Write SetO; deprecated 'Use options instead';
  82. // Parsing options
  83. Property Options : TJSONOptions Read FOptions Write FOptions;
  84. end;
  85. const
  86. TokenInfos: array[TJSONToken] of string = (
  87. 'EOF',
  88. 'Whitespace',
  89. 'String',
  90. 'Number',
  91. 'True',
  92. 'False',
  93. 'Null',
  94. ',',
  95. ':',
  96. '{',
  97. '}',
  98. '[',
  99. ']',
  100. 'identifier',
  101. 'comment',
  102. ''
  103. );
  104. implementation
  105. constructor TJSONScanner.Create(Source : TStream; AUseUTF8 : Boolean = True);
  106. Var
  107. O : TJSONOptions;
  108. begin
  109. O:=DefaultOptions;
  110. if AUseUTF8 then
  111. Include(O,joUTF8)
  112. else
  113. Exclude(O,joUTF8);
  114. Create(Source,O);
  115. end;
  116. constructor TJSONScanner.Create(Source: TStream; AOptions: TJSONOptions);
  117. procedure SkipStreamBOM;
  118. Var
  119. OldPos : integer;
  120. Header : array[0..3] of byte;
  121. begin
  122. OldPos := Source.Position;
  123. FillChar(Header, SizeOf(Header), 0);
  124. if Source.Read(Header, 3) = 3 then
  125. if (Header[0]=$EF) and (Header[1]=$BB) and (Header[2]=$BF) then
  126. exit;
  127. Source.Position := OldPos;
  128. end;
  129. Var
  130. S : RawByteString;
  131. begin
  132. if (joBOMCheck in aOptions) then
  133. SkipStreamBom;
  134. S:='';
  135. SetLength(S,Source.Size-Source.Position);
  136. if Length(S)>0 then
  137. Source.ReadBuffer(S[1],Length(S));
  138. Create(S,AOptions)
  139. end;
  140. constructor TJSONScanner.Create(const aSource : RawByteString; AUseUTF8 : Boolean = True);
  141. Var
  142. O : TJSONOptions;
  143. begin
  144. O:=DefaultOptions;
  145. if AUseUTF8 then
  146. Include(O,joUTF8)
  147. else
  148. Exclude(O,joUTF8);
  149. Create(aSource,O);
  150. end;
  151. constructor TJSONScanner.Create(const aSource: RawByteString; AOptions: TJSONOptions);
  152. begin
  153. FSource:=aSource;
  154. FCurPos:=PAnsiChar(FSource);
  155. if FCurPos<>Nil then
  156. FCurRow:=1;
  157. FOptions:=AOptions;
  158. end;
  159. function TJSONScanner.GetCurColumn: Integer;
  160. begin
  161. Result := FTokenStr - FCurLine;
  162. end;
  163. procedure TJSONScanner.Error(const Msg: string);
  164. begin
  165. raise EScannerError.Create(Msg);
  166. end;
  167. procedure TJSONScanner.Error(const Msg: string; const Args: array of const);
  168. begin
  169. raise EScannerError.CreateFmt(Msg, Args);
  170. end;
  171. function TJSONScanner.FetchToken: TJSONToken;
  172. (*
  173. procedure dumpcurrent;
  174. begin
  175. Writeln('Start of line : ',FCurLine);
  176. Writeln('Cur pos : ',FCurPos);
  177. Writeln('Start of token : ',FTokenstr);
  178. Writeln('End of line : ',FTokenstr);
  179. end;
  180. *)
  181. function FetchLine: Boolean;
  182. begin
  183. Result:=(FCurPos<>Nil) and (FCurPos^<>#0);
  184. if Result then
  185. begin
  186. FCurLine:=FCurPos;
  187. FTokenStr:=FCurPos;
  188. While Not (FCurPos^ in [#0,#10,#13]) do
  189. Inc(FCurPos);
  190. FEOL:=FCurPos;
  191. If (FCurPos^<>#0) then
  192. // While (FCurPos^<>#0) and (FCurPos^ in [#10,#13]) do
  193. begin
  194. if (FCurPos^=#13) and (FCurPos[1]=#10) then
  195. Inc(FCurPos); // Skip CR-LF
  196. Inc(FCurPos); // To start of next line
  197. Inc(FCurRow); // Increase line index
  198. end;
  199. // Len:=FEOL-FTokenStr;
  200. // FTokenStr:=FCurPos;
  201. end
  202. else
  203. begin
  204. FCurLine:=Nil;
  205. FTokenStr:=nil;
  206. end;
  207. end;
  208. var
  209. TokenStart: PChar;
  210. it : TJSONToken;
  211. I : Integer;
  212. OldLength, SectionLength, tstart,tcol, u1,u2: Integer;
  213. C , c2: char;
  214. S : String[8];
  215. Line : String;
  216. IsStar,EOC: Boolean;
  217. Procedure MaybeAppendUnicode;
  218. Var
  219. u : UTF8String;
  220. begin
  221. // if there is a leftover \u, append
  222. if (u1<>0) then
  223. begin
  224. if (joUTF8 in Options) or (DefaultSystemCodePage=CP_UTF8) then
  225. U:=Utf8Encode(WideString(WideChar(u1))) // ToDo: use faster function
  226. else
  227. U:=String(WideChar(u1)); // WideChar converts the encoding. Should it warn on loss?
  228. FCurTokenString:=FCurTokenString+U;
  229. OldLength:=Length(FCurTokenString);
  230. u1:=0;
  231. u2:=0;
  232. end;
  233. end;
  234. begin
  235. if (FTokenStr = nil) or (FTokenStr=FEOL) then
  236. begin
  237. if not FetchLine then
  238. begin
  239. Result := tkEOF;
  240. FCurToken := Result;
  241. exit;
  242. end;
  243. end;
  244. FCurTokenString := '';
  245. case FTokenStr^ of
  246. #0: // Empty line
  247. begin
  248. FetchLine;
  249. Result := tkWhitespace;
  250. end;
  251. #9, ' ', #10, #13:
  252. begin
  253. Result := tkWhitespace;
  254. repeat
  255. if FTokenStr = FEOL then
  256. begin
  257. if not FetchLine then
  258. begin
  259. FCurToken := Result;
  260. exit;
  261. end
  262. end
  263. else
  264. Inc(FTokenStr);
  265. until not (FTokenStr[0] in [#9, ' ']);
  266. end;
  267. '"','''':
  268. begin
  269. C:=FTokenStr^;
  270. If (C='''') and (joStrict in Options) then
  271. Error(SErrInvalidCharacter, [CurRow,CurColumn,FTokenStr[0]]);
  272. Inc(FTokenStr);
  273. TokenStart := FTokenStr;
  274. OldLength := 0;
  275. FCurTokenString := '';
  276. u1:=0;
  277. while not (FTokenStr^ in [#0,C]) do
  278. begin
  279. if (FTokenStr^='\') then
  280. begin
  281. // Save length
  282. SectionLength := FTokenStr - TokenStart;
  283. Inc(FTokenStr);
  284. // Read escaped token
  285. Case FTokenStr^ of
  286. '"' : S:='"';
  287. '''' : S:='''';
  288. 't' : S:=#9;
  289. 'b' : S:=#8;
  290. 'n' : S:=#10;
  291. 'r' : S:=#13;
  292. 'f' : S:=#12;
  293. '\' : S:='\';
  294. '/' : S:='/';
  295. 'u' : begin
  296. u2:=0;
  297. For I:=1 to 4 do
  298. begin
  299. Inc(FTokenStr);
  300. c2:=FTokenStr^;
  301. Case c2 of
  302. '0'..'9': u2:=u2*16+ord(c2)-ord('0');
  303. 'A'..'F': u2:=u2*16+ord(c2)-ord('A')+10;
  304. 'a'..'f': u2:=u2*16+ord(c2)-ord('a')+10;
  305. else
  306. Error(SErrInvalidCharacter, [CurRow,CurColumn,FTokenStr[0]]);
  307. end;
  308. end;
  309. // ToDo: 4-bytes UTF16
  310. if u1<>0 then
  311. begin
  312. if (joUTF8 in Options) or (DefaultSystemCodePage=CP_UTF8) then
  313. S:=Utf8Encode(WideString(WideChar(u1)+WideChar(u2))) // ToDo: use faster function
  314. else
  315. S:=String(WideChar(u1)+WideChar(u2)); // WideChar converts the encoding. Should it warn on loss?
  316. u1:=0;
  317. end
  318. else
  319. begin
  320. S:='';
  321. u1:=u2;
  322. end
  323. end;
  324. #0 : Error(SErrOpenString,[FCurRow]);
  325. else
  326. Error(SErrInvalidCharacter, [CurRow,CurColumn,FTokenStr[0]]);
  327. end;
  328. I:=Length(S);
  329. if (SectionLength+I>0) then
  330. begin
  331. // If length=1, we know it was not \uXX, but u1 can be nonzero, and we must first append it.
  332. // example: \u00f8\"
  333. if (I=1) and (u1<>0) then
  334. MaybeAppendUnicode;
  335. SetLength(FCurTokenString, OldLength + SectionLength+i);
  336. if SectionLength > 0 then
  337. Move(TokenStart^, FCurTokenString[OldLength + 1], SectionLength);
  338. if I>0 then
  339. Move(S[1],FCurTokenString[OldLength + SectionLength+1],i);
  340. Inc(OldLength, SectionLength+I);
  341. end;
  342. // Next char
  343. TokenStart := FTokenStr+1;
  344. end
  345. else if u1<>0 then
  346. MaybeAppendUnicode;
  347. if FTokenStr^ < #$20 then
  348. if FTokenStr^ = #0 then Error(SErrOpenString,[FCurRow])
  349. else if joStrict in Options then Error(SErrInvalidCharacter, [CurRow,CurColumn,FTokenStr[0]]);
  350. Inc(FTokenStr);
  351. end;
  352. if FTokenStr^ = #0 then
  353. Error(SErrOpenString,[FCurRow]);
  354. if u1<>0 then
  355. MaybeAppendUnicode;
  356. SectionLength := FTokenStr - TokenStart;
  357. SetLength(FCurTokenString, OldLength + SectionLength);
  358. if SectionLength > 0 then
  359. Move(TokenStart^, FCurTokenString[OldLength + 1], SectionLength);
  360. Inc(FTokenStr);
  361. Result := tkString;
  362. end;
  363. ',':
  364. begin
  365. Inc(FTokenStr);
  366. Result := tkComma;
  367. end;
  368. '0'..'9','.','-':
  369. begin
  370. TokenStart := FTokenStr;
  371. if FTokenStr^ = '-' then inc(FTokenStr);
  372. case FTokenStr^ of
  373. '1'..'9': Inc(FTokenStr);
  374. '0': begin
  375. Inc(FTokenStr);
  376. if (joStrict in Options) and (FTokenStr^ in ['0'..'9']) then
  377. Error(SErrInvalidCharacter, [CurRow,CurColumn,FTokenStr[0]]);
  378. end;
  379. '.': if joStrict in Options then
  380. Error(SErrInvalidCharacter, [CurRow,CurColumn,FTokenStr[0]]);
  381. else
  382. Error(SErrInvalidCharacter, [CurRow,CurColumn,FTokenStr[0]]);
  383. end;
  384. while true do
  385. begin
  386. case FTokenStr^ of
  387. '0'..'9': inc(FTokenStr);
  388. '.':
  389. begin
  390. case FTokenStr[1] of
  391. '0'..'9': Inc(FTokenStr, 2);
  392. 'e', 'E': begin
  393. if joStrict in Options then
  394. Error(SErrInvalidCharacter, [CurRow,CurColumn,FTokenStr[0]]);
  395. Inc(FTokenStr);
  396. end;
  397. else Error(SErrInvalidCharacter, [CurRow,CurColumn,FTokenStr[0]]);
  398. end;
  399. while FTokenStr^ in ['0'..'9'] do
  400. inc(FTokenStr);
  401. break;
  402. end;
  403. else
  404. break;
  405. end;
  406. end;
  407. if FTokenStr^ in ['e', 'E'] then begin
  408. Inc(FTokenStr);
  409. if FTokenStr^ in ['-','+'] then
  410. Inc(FTokenStr);
  411. if not (FTokenStr^ in ['0'..'9']) then
  412. Error(SErrInvalidCharacter, [CurRow,CurColumn,FTokenStr[0]]);
  413. repeat
  414. Inc(FTokenStr);
  415. until not (FTokenStr^ in ['0'..'9']);
  416. end;
  417. if {(FTokenStr<>FEOL) and }not (FTokenStr^ in [#13,#10,#0,'}',']',',',#9,' ']) then
  418. Error(SErrInvalidCharacter, [CurRow,CurColumn,FTokenStr[0]]);
  419. SectionLength := FTokenStr - TokenStart;
  420. FCurTokenString:='';
  421. SetString(FCurTokenString, TokenStart, SectionLength);
  422. If (FCurTokenString[1]='.') then
  423. FCurTokenString:='0'+FCurTokenString;
  424. Result := tkNumber;
  425. end;
  426. ':':
  427. begin
  428. Inc(FTokenStr);
  429. Result := tkColon;
  430. end;
  431. '{':
  432. begin
  433. Inc(FTokenStr);
  434. Result := tkCurlyBraceOpen;
  435. end;
  436. '}':
  437. begin
  438. Inc(FTokenStr);
  439. Result := tkCurlyBraceClose;
  440. end;
  441. '[':
  442. begin
  443. Inc(FTokenStr);
  444. Result := tkSquaredBraceOpen;
  445. end;
  446. ']':
  447. begin
  448. Inc(FTokenStr);
  449. Result := tkSquaredBraceClose;
  450. end;
  451. '/' :
  452. begin
  453. if Not (joComments in Options) then
  454. Error(SErrInvalidCharacter, [CurRow,CurCOlumn,FTokenStr[0]]);
  455. TokenStart:=FTokenStr;
  456. Inc(FTokenStr);
  457. Case FTokenStr^ of
  458. '/' : begin
  459. FCurTokenString:='';
  460. Inc(FTokenStr);
  461. TokenStart:=FTokenStr;
  462. SectionLength := PChar(FEOL)-TokenStart;
  463. SetString(FCurTokenString, TokenStart, SectionLength);
  464. FetchLine;
  465. end;
  466. '*' :
  467. begin
  468. IsStar:=False;
  469. Inc(FTokenStr);
  470. TokenStart:=FTokenStr;
  471. Repeat
  472. While (FTokenStr=FEOL) do
  473. begin
  474. SectionLength := (FTokenStr - TokenStart);
  475. Line:='';
  476. SetString(Line, TokenStart, SectionLength);
  477. FCurtokenString:=FCurtokenString+Line+sLineBreak;
  478. if not fetchLine then
  479. Error(SUnterminatedComment, [CurRow,CurCOlumn,FTokenStr[0]]);
  480. TokenStart:=FTokenStr;
  481. end;
  482. IsStar:=FTokenStr^='*';
  483. Inc(FTokenStr);
  484. EOC:=(isStar and (FTokenStr^='/'));
  485. Until EOC;
  486. if EOC then
  487. begin
  488. SectionLength := (FTokenStr - TokenStart-1);
  489. Line:='';
  490. SetString(Line, TokenStart, SectionLength);
  491. FCurtokenString:=FCurtokenString+Line;
  492. Inc(FTokenStr);
  493. end;
  494. end;
  495. else
  496. Error(SErrInvalidCharacter, [CurRow,CurCOlumn,FTokenStr[0]]);
  497. end;
  498. Result:=tkComment;
  499. end;
  500. 'a'..'z','A'..'Z','_':
  501. begin
  502. tstart:=CurRow;
  503. Tcol:=CurColumn;
  504. TokenStart := FTokenStr;
  505. Result:=tkIdentifier;
  506. case TokenStart^ of
  507. 't': if (TokenStart[1] = 'r') and (TokenStart[2] = 'u') and (TokenStart[3] = 'e') then
  508. Result:=tkTrue;
  509. 'f': if (TokenStart[1] = 'a') and (TokenStart[2] = 'l') and (TokenStart[3] = 's') and (TokenStart[4] = 'e') then
  510. Result:=tkFalse;
  511. 'n': if (TokenStart[1] = 'u') and (TokenStart[2] = 'l') and (TokenStart[3] = 'l') then
  512. Result:=tkNull;
  513. end;
  514. if result <> tkIdentifier then inc(FTokenStr, length(TokenInfos[result]) - 1);
  515. repeat
  516. Inc(FTokenStr);
  517. until not (FTokenStr^ in ['A'..'Z', 'a'..'z', '0'..'9', '_']);
  518. SectionLength := FTokenStr - TokenStart;
  519. FCurTokenString:='';
  520. SetString(FCurTokenString, TokenStart, SectionLength);
  521. if (result = tkIdentifier) or (SectionLength <> length(TokenInfos[result])) then begin
  522. if (joStrict in Options) then
  523. Error(SErrInvalidCharacter, [tStart,tcol,TokenStart[0]]);
  524. for it := tkTrue to tkNull do
  525. if CompareText(CurTokenString, TokenInfos[it]) = 0 then
  526. begin
  527. Result := it;
  528. FCurToken := Result;
  529. exit;
  530. end;
  531. end;
  532. end;
  533. else
  534. Error(SErrInvalidCharacter, [CurRow,CurColumn,FTokenStr[0]]);
  535. end;
  536. FCurToken := Result;
  537. end;
  538. {function TJSONScanner.FetchToken: TJSONToken;
  539. begin
  540. Result:=DoFetchToken;
  541. end;}
  542. function TJSONScanner.GetCurLine: string;
  543. begin
  544. Result:='';
  545. if FCurLine<>Nil then
  546. begin
  547. SetLength(Result,FEOL-FCurLine);
  548. if Length(Result)>0 then
  549. Move(FCurLine^,Result[1],Length(Result));
  550. end;
  551. end;
  552. function TJSONScanner.GetO(AIndex: TJSONOption): Boolean;
  553. begin
  554. Result:=AIndex in FOptions;
  555. end;
  556. procedure TJSONScanner.SetO(AIndex: TJSONOption; AValue: Boolean);
  557. begin
  558. If AValue then
  559. Include(Foptions,AIndex)
  560. else
  561. Exclude(Foptions,AIndex)
  562. end;
  563. end.