jsonscanner.pp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569
  1. {
  2. This file is part of the Free Component Library
  3. JSON source lexical scanner
  4. Copyright (c) 2007 by Michael Van Canneyt [email protected]
  5. See the file COPYING.FPC, included in this distribution,
  6. for details about the copyright.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  10. **********************************************************************}
  11. {$mode objfpc}
  12. {$h+}
  13. { $INLINE ON}
  14. unit jsonscanner;
  15. interface
  16. uses SysUtils, Classes;
  17. resourcestring
  18. SErrInvalidCharacter = 'Invalid character at line %d, pos %d: ''%s''';
  19. SUnterminatedComment = 'Unterminated comment at line %d, pos %d: ''%s''';
  20. SErrOpenString = 'string exceeds end of line %d';
  21. type
  22. TJSONToken = (
  23. tkEOF,
  24. tkWhitespace,
  25. tkString,
  26. tkNumber,
  27. tkTrue,
  28. tkFalse,
  29. tkNull,
  30. // Simple (one-character) tokens
  31. tkComma, // ','
  32. tkColon, // ':'
  33. tkCurlyBraceOpen, // '{'
  34. tkCurlyBraceClose, // '}'
  35. tkSquaredBraceOpen, // '['
  36. tkSquaredBraceClose, // ']'
  37. tkIdentifier, // Any Javascript identifier
  38. tkComment,
  39. tkUnknown
  40. );
  41. EScannerError = class(EParserError);
  42. TJSONOption = (joUTF8,joStrict,joComments,joIgnoreTrailingComma);
  43. TJSONOptions = set of TJSONOption;
  44. Const
  45. DefaultOptions = [joUTF8];
  46. Type
  47. { TJSONScanner }
  48. TJSONScanner = class
  49. private
  50. FSource: RawByteString;
  51. FCurPos : PAnsiChar; // Position inside total string
  52. FCurRow: Integer;
  53. FCurToken: TJSONToken;
  54. FCurTokenString: string;
  55. FCurLine: PChar;
  56. FTokenStr: PAnsiChar; // position inside FCurLine
  57. FEOL : PAnsiChar; // EOL
  58. FOptions : TJSONOptions;
  59. function GetCurColumn: Integer; inline;
  60. function GetCurLine: string;
  61. function GetO(AIndex: TJSONOption): Boolean;
  62. procedure SetO(AIndex: TJSONOption; AValue: Boolean);
  63. protected
  64. procedure Error(const Msg: string);overload;
  65. procedure Error(const Msg: string; Const Args: array of const);overload;
  66. // function DoFetchToken: TJSONToken; inline;
  67. public
  68. constructor Create(Source : TStream; AUseUTF8 : Boolean = True); overload; deprecated 'use options form instead';
  69. constructor Create(Source: TStream; AOptions: TJSONOptions); overload;
  70. constructor Create(const aSource : RawByteString; AUseUTF8 : Boolean = True); overload; deprecated 'use options form instead';
  71. constructor Create(const aSource: RawByteString; AOptions: TJSONOptions); overload;
  72. function FetchToken: TJSONToken;
  73. property CurLine: string read GetCurLine;
  74. property CurRow: Integer read FCurRow;
  75. property CurColumn: Integer read GetCurColumn;
  76. property CurToken: TJSONToken read FCurToken;
  77. property CurTokenString: string read FCurTokenString;
  78. // Use strict JSON: " for strings, object members are strings, not identifiers
  79. Property Strict : Boolean Index joStrict Read GetO Write SetO ; deprecated 'use options instead';
  80. // if set to TRUE, then strings will be converted to UTF8 ansistrings, not system codepage ansistrings.
  81. Property UseUTF8 : Boolean index joUTF8 Read GetO Write SetO; deprecated 'Use options instead';
  82. // Parsing options
  83. Property Options : TJSONOptions Read FOptions Write FOptions;
  84. end;
  85. const
  86. TokenInfos: array[TJSONToken] of string = (
  87. 'EOF',
  88. 'Whitespace',
  89. 'String',
  90. 'Number',
  91. 'True',
  92. 'False',
  93. 'Null',
  94. ',',
  95. ':',
  96. '{',
  97. '}',
  98. '[',
  99. ']',
  100. 'identifier',
  101. 'comment',
  102. ''
  103. );
  104. implementation
  105. constructor TJSONScanner.Create(Source : TStream; AUseUTF8 : Boolean = True);
  106. Var
  107. O : TJSONOptions;
  108. begin
  109. O:=DefaultOptions;
  110. if AUseUTF8 then
  111. Include(O,joUTF8)
  112. else
  113. Exclude(O,joUTF8);
  114. Create(Source,O);
  115. end;
  116. constructor TJSONScanner.Create(Source: TStream; AOptions: TJSONOptions);
  117. Var
  118. S : RawByteString;
  119. begin
  120. S:='';
  121. SetLength(S,Source.Size);
  122. if Length(S)>0 then
  123. Source.ReadBuffer(S[1],Length(S));
  124. Create(S,AOptions)
  125. end;
  126. constructor TJSONScanner.Create(const aSource : RawByteString; AUseUTF8 : Boolean = True);
  127. Var
  128. O : TJSONOptions;
  129. begin
  130. O:=DefaultOptions;
  131. if AUseUTF8 then
  132. Include(O,joUTF8)
  133. else
  134. Exclude(O,joUTF8);
  135. Create(aSource,O);
  136. end;
  137. constructor TJSONScanner.Create(const aSource: RawByteString; AOptions: TJSONOptions);
  138. begin
  139. FSource:=aSource;
  140. FCurPos:=PAnsiChar(FSource);
  141. FOptions:=AOptions;
  142. end;
  143. function TJSONScanner.GetCurColumn: Integer;
  144. begin
  145. Result := FTokenStr - FCurLine;
  146. end;
  147. procedure TJSONScanner.Error(const Msg: string);
  148. begin
  149. raise EScannerError.Create(Msg);
  150. end;
  151. procedure TJSONScanner.Error(const Msg: string; const Args: array of const);
  152. begin
  153. raise EScannerError.CreateFmt(Msg, Args);
  154. end;
  155. function TJSONScanner.FetchToken: TJSONToken;
  156. (*
  157. procedure dumpcurrent;
  158. begin
  159. Writeln('Start of line : ',FCurLine);
  160. Writeln('Cur pos : ',FCurPos);
  161. Writeln('Start of token : ',FTokenstr);
  162. Writeln('End of line : ',FTokenstr);
  163. end;
  164. *)
  165. function FetchLine: Boolean;
  166. begin
  167. Result:=(FCurPos<>Nil) and (FCurPos^<>#0);
  168. if Result then
  169. begin
  170. FCurLine:=FCurPos;
  171. FTokenStr:=FCurPos;
  172. While Not (FCurPos^ in [#0,#10,#13]) do
  173. Inc(FCurPos);
  174. FEOL:=FCurPos;
  175. While (FCurPos^<>#0) and (FCurPos^ in [#10,#13]) do
  176. begin
  177. if (FCurPos^=#13) and (FCurPos[1]=#10) then
  178. Inc(FCurPos); // Skip CR-LF
  179. Inc(FCurPos); // To start of next line
  180. Inc(FCurRow); // Increase line index
  181. end;
  182. // Len:=FEOL-FTokenStr;
  183. // FTokenStr:=FCurPos;
  184. end
  185. else
  186. begin
  187. FCurLine:=Nil;
  188. FTokenStr:=nil;
  189. end;
  190. end;
  191. var
  192. TokenStart: PChar;
  193. it : TJSONToken;
  194. I : Integer;
  195. OldLength, SectionLength, tstart,tcol, u1,u2: Integer;
  196. C , c2: char;
  197. S : String[4];
  198. IsStar,EOC: Boolean;
  199. Procedure MaybeAppendUnicode;
  200. Var
  201. u : UTF8String;
  202. begin
  203. // if there is a leftover \u, append
  204. if (u1<>0) then
  205. begin
  206. if (joUTF8 in Options) or (DefaultSystemCodePage=CP_UTF8) then
  207. U:=Utf8Encode(WideString(WideChar(u1))) // ToDo: use faster function
  208. else
  209. U:=String(WideChar(u1)); // WideChar converts the encoding. Should it warn on loss?
  210. FCurTokenString:=FCurTokenString+U;
  211. OldLength:=Length(FCurTokenString);
  212. u1:=0;
  213. end;
  214. end;
  215. begin
  216. if (FTokenStr = nil) or (FTokenStr=FEOL) then
  217. begin
  218. if not FetchLine then
  219. begin
  220. Result := tkEOF;
  221. FCurToken := Result;
  222. exit;
  223. end;
  224. end;
  225. FCurTokenString := '';
  226. case FTokenStr^ of
  227. #0: // Empty line
  228. begin
  229. FetchLine;
  230. Result := tkWhitespace;
  231. end;
  232. #9, ' ', #10, #13:
  233. begin
  234. Result := tkWhitespace;
  235. repeat
  236. if FTokenStr = FEOL then
  237. begin
  238. if not FetchLine then
  239. begin
  240. FCurToken := Result;
  241. exit;
  242. end
  243. end
  244. else
  245. Inc(FTokenStr);
  246. until not (FTokenStr[0] in [#9, ' ']);
  247. end;
  248. '"','''':
  249. begin
  250. C:=FTokenStr^;
  251. If (C='''') and (joStrict in Options) then
  252. Error(SErrInvalidCharacter, [CurRow,CurColumn,FTokenStr[0]]);
  253. Inc(FTokenStr);
  254. TokenStart := FTokenStr;
  255. OldLength := 0;
  256. FCurTokenString := '';
  257. u1:=0;
  258. while not (FTokenStr^ in [#0,C]) do
  259. begin
  260. if (FTokenStr^='\') then
  261. begin
  262. // Save length
  263. SectionLength := FTokenStr - TokenStart;
  264. Inc(FTokenStr);
  265. // Read escaped token
  266. Case FTokenStr^ of
  267. '"' : S:='"';
  268. '''' : S:='''';
  269. 't' : S:=#9;
  270. 'b' : S:=#8;
  271. 'n' : S:=#10;
  272. 'r' : S:=#13;
  273. 'f' : S:=#12;
  274. '\' : S:='\';
  275. '/' : S:='/';
  276. 'u' : begin
  277. S:='0000';
  278. u2:=0;
  279. For I:=1 to 4 do
  280. begin
  281. Inc(FTokenStr);
  282. c2:=FTokenStr^;
  283. Case c2 of
  284. '0'..'9': u2:=u2*16+ord(c2)-ord('0');
  285. 'A'..'F': u2:=u2*16+ord(c2)-ord('A')+10;
  286. 'a'..'f': u2:=u2*16+ord(c2)-ord('a')+10;
  287. else
  288. Error(SErrInvalidCharacter, [CurRow,CurColumn,FTokenStr[0]]);
  289. end;
  290. end;
  291. // ToDo: 4-bytes UTF16
  292. if u1<>0 then
  293. begin
  294. if (joUTF8 in Options) or (DefaultSystemCodePage=CP_UTF8) then
  295. S:=Utf8Encode(WideString(WideChar(u1)+WideChar(u2))) // ToDo: use faster function
  296. else
  297. S:=String(WideChar(u1)+WideChar(u2)); // WideChar converts the encoding. Should it warn on loss?
  298. u1:=0;
  299. end
  300. else
  301. begin
  302. S:='';
  303. u1:=u2;
  304. end
  305. end;
  306. #0 : Error(SErrOpenString,[FCurRow]);
  307. else
  308. Error(SErrInvalidCharacter, [CurRow,CurColumn,FTokenStr[0]]);
  309. end;
  310. I:=Length(S);
  311. if (SectionLength+I>0) then
  312. begin
  313. // If length=1, we know it was not \uXX, but u1 can be nonzero, and we must first append it.
  314. // example: \u00f8\"
  315. if (I=1) and (u1<>0) then
  316. MaybeAppendUnicode;
  317. SetLength(FCurTokenString, OldLength + SectionLength+i);
  318. if SectionLength > 0 then
  319. Move(TokenStart^, FCurTokenString[OldLength + 1], SectionLength);
  320. if I>0 then
  321. Move(S[1],FCurTokenString[OldLength + SectionLength+1],i);
  322. Inc(OldLength, SectionLength+I);
  323. end;
  324. // Next char
  325. TokenStart := FTokenStr+1;
  326. end
  327. else if u1<>0 then
  328. MaybeAppendUnicode;
  329. if FTokenStr^ = #0 then
  330. Error(SErrOpenString,[FCurRow]);
  331. Inc(FTokenStr);
  332. end;
  333. if FTokenStr^ = #0 then
  334. Error(SErrOpenString,[FCurRow]);
  335. if u1<>0 then
  336. MaybeAppendUnicode;
  337. SectionLength := FTokenStr - TokenStart;
  338. SetLength(FCurTokenString, OldLength + SectionLength);
  339. if SectionLength > 0 then
  340. Move(TokenStart^, FCurTokenString[OldLength + 1], SectionLength);
  341. Inc(FTokenStr);
  342. Result := tkString;
  343. end;
  344. ',':
  345. begin
  346. Inc(FTokenStr);
  347. Result := tkComma;
  348. end;
  349. '0'..'9','.','-':
  350. begin
  351. TokenStart := FTokenStr;
  352. while true do
  353. begin
  354. Inc(FTokenStr);
  355. case FTokenStr^ of
  356. '.':
  357. begin
  358. if FTokenStr[1] in ['0'..'9', 'e', 'E'] then
  359. begin
  360. Inc(FTokenStr);
  361. repeat
  362. Inc(FTokenStr);
  363. until not (FTokenStr^ in ['0'..'9', 'e', 'E','-','+']);
  364. end;
  365. break;
  366. end;
  367. '0'..'9': ;
  368. 'e', 'E':
  369. begin
  370. Inc(FTokenStr);
  371. if FTokenStr^ in ['-','+'] then
  372. Inc(FTokenStr);
  373. while FTokenStr^ in ['0'..'9'] do
  374. Inc(FTokenStr);
  375. break;
  376. end;
  377. else
  378. if {(FTokenStr<>FEOL) and }not (FTokenStr^ in [#13,#10,#0,'}',']',',',#9,' ']) then
  379. Error(SErrInvalidCharacter, [CurRow,CurColumn,FTokenStr[0]]);
  380. break;
  381. end;
  382. end;
  383. SectionLength := FTokenStr - TokenStart;
  384. FCurTokenString:='';
  385. SetString(FCurTokenString, TokenStart, SectionLength);
  386. If (FCurTokenString[1]='.') then
  387. FCurTokenString:='0'+FCurTokenString;
  388. Result := tkNumber;
  389. end;
  390. ':':
  391. begin
  392. Inc(FTokenStr);
  393. Result := tkColon;
  394. end;
  395. '{':
  396. begin
  397. Inc(FTokenStr);
  398. Result := tkCurlyBraceOpen;
  399. end;
  400. '}':
  401. begin
  402. Inc(FTokenStr);
  403. Result := tkCurlyBraceClose;
  404. end;
  405. '[':
  406. begin
  407. Inc(FTokenStr);
  408. Result := tkSquaredBraceOpen;
  409. end;
  410. ']':
  411. begin
  412. Inc(FTokenStr);
  413. Result := tkSquaredBraceClose;
  414. end;
  415. '/' :
  416. begin
  417. if Not (joComments in Options) then
  418. Error(SErrInvalidCharacter, [CurRow,CurCOlumn,FTokenStr[0]]);
  419. TokenStart:=FTokenStr;
  420. Inc(FTokenStr);
  421. Case FTokenStr^ of
  422. '/' : begin
  423. FCurTokenString:='';
  424. Inc(FTokenStr);
  425. TokenStart:=FTokenStr;
  426. SectionLength := PChar(FEOL)-TokenStart;
  427. SetString(FCurTokenString, TokenStart, SectionLength);
  428. FTokenStr:=FCurPos;
  429. end;
  430. '*' :
  431. begin
  432. IsStar:=False;
  433. Inc(FTokenStr);
  434. TokenStart:=FTokenStr;
  435. Repeat
  436. if (FTokenStr^=#0) then
  437. begin
  438. SectionLength := (FTokenStr - TokenStart);
  439. S:='';
  440. SetString(S, TokenStart, SectionLength);
  441. FCurtokenString:=FCurtokenString+S;
  442. if not fetchLine then
  443. Error(SUnterminatedComment, [CurRow,CurCOlumn,FTokenStr[0]]);
  444. TokenStart:=FTokenStr;
  445. end;
  446. IsStar:=FTokenStr^='*';
  447. Inc(FTokenStr);
  448. EOC:=(isStar and (FTokenStr^='/'));
  449. Until EOC;
  450. if EOC then
  451. begin
  452. SectionLength := (FTokenStr - TokenStart-1);
  453. S:='';
  454. SetString(S, TokenStart, SectionLength);
  455. FCurtokenString:=FCurtokenString+S;
  456. Inc(FTokenStr);
  457. end;
  458. end;
  459. else
  460. Error(SErrInvalidCharacter, [CurRow,CurCOlumn,FTokenStr[0]]);
  461. end;
  462. Result:=tkComment;
  463. end;
  464. 'a'..'z','A'..'Z','_':
  465. begin
  466. tstart:=CurRow;
  467. Tcol:=CurColumn;
  468. TokenStart := FTokenStr;
  469. repeat
  470. Inc(FTokenStr);
  471. until not (FTokenStr^ in ['A'..'Z', 'a'..'z', '0'..'9', '_']);
  472. SectionLength := FTokenStr - TokenStart;
  473. FCurTokenString:='';
  474. SetString(FCurTokenString, TokenStart, SectionLength);
  475. for it := tkTrue to tkNull do
  476. if CompareText(CurTokenString, TokenInfos[it]) = 0 then
  477. begin
  478. Result := it;
  479. FCurToken := Result;
  480. exit;
  481. end;
  482. if (joStrict in Options) then
  483. Error(SErrInvalidCharacter, [tStart,tcol,TokenStart[0]])
  484. else
  485. Result:=tkIdentifier;
  486. end;
  487. else
  488. Error(SErrInvalidCharacter, [CurRow,CurColumn,FTokenStr[0]]);
  489. end;
  490. FCurToken := Result;
  491. end;
  492. {function TJSONScanner.FetchToken: TJSONToken;
  493. begin
  494. Result:=DoFetchToken;
  495. end;}
  496. function TJSONScanner.GetCurLine: string;
  497. begin
  498. Result:='';
  499. if FCurLine<>Nil then
  500. begin
  501. SetLength(Result,FEOL-FCurLine);
  502. if Length(Result)>0 then
  503. Move(FCurLine^,Result[1],Length(Result));
  504. end;
  505. end;
  506. function TJSONScanner.GetO(AIndex: TJSONOption): Boolean;
  507. begin
  508. Result:=AIndex in FOptions;
  509. end;
  510. procedure TJSONScanner.SetO(AIndex: TJSONOption; AValue: Boolean);
  511. begin
  512. If AValue then
  513. Include(Foptions,AIndex)
  514. else
  515. Exclude(Foptions,AIndex)
  516. end;
  517. end.