Browse Source

Use varints for TJSONScanner.FParts to save memory.

Rika Ichinose 1 year ago
parent
commit
9918bb5619
1 changed files with 122 additions and 57 deletions
  1. 122 57
      packages/fcl-json/src/jsonscanner.pp

+ 122 - 57
packages/fcl-json/src/jsonscanner.pp

@@ -65,20 +65,20 @@ Const
 
 
 Type
 Type
 
 
-  { TJSONScanner }
-
   TJSONScanner = class
   TJSONScanner = class
   private
   private
     FCurPos, FSourceStart: PAnsiChar;
     FCurPos, FSourceStart: PAnsiChar;
     FCurRow: Integer;
     FCurRow: Integer;
     FCurToken: TJSONToken;
     FCurToken: TJSONToken;
-    FCurTokenString: AnsiString; // Calculated lazily from FParts. FNParts = -1 if ready.
+    FCurTokenString: AnsiString; // Calculated lazily from FParts. FPartsBytes = -1 if ready.
 
 
-    // Describes how to build FCurTokenString if asked.
-    // FParts[i] >= 0: piece with start = FSourceStart + FParts[i] and length = FParts[i + 1].
-    // FParts[i] = -1 - N: Unicode codepoint N.
-    FParts: array of SizeInt;
-    FNParts: SizeInt; // -1 if FCurTokenString is ready.
+    // Sequence of varints that describes how to build FCurTokenString if asked.
+    // Number X with lowest bit 0 means piece with start = FPartsSrcPos + X shr 1, length = next number; FPartsSrcPos advances to start + length.
+    //   I.e. offsets are relative to the previous piece, so AAA\nBBB\nCCC results in tiny 1-byte offsets (and lengths) no matter how long the entire string is.
+    // Number X with lowest bit 1 means Unicode codepoint X shr 1.
+    FParts: array of uint8;
+    FPartsBytes: SizeInt; // -1 if FCurTokenString is ready.
+    FPartsSrcPos: PAnsiChar; // Used during FetchToken to form FParts with relative offsets, and then during BuildCurTokenString to interpret them.
 
 
     FOptions : TJSONOptions;
     FOptions : TJSONOptions;
     FCurLine, FCurLineEnd: PAnsiChar; // FCurLineEnd = nil if needs to be calculated.
     FCurLine, FCurLineEnd: PAnsiChar; // FCurLineEnd = nil if needs to be calculated.
@@ -92,9 +92,14 @@ Type
     procedure SetO(AIndex: TJSONOption; AValue: Boolean);
     procedure SetO(AIndex: TJSONOption; AValue: Boolean);
     function CountChars(start, ed: PAnsiChar): SizeInt;
     function CountChars(start, ed: PAnsiChar): SizeInt;
 
 
-    function GrowParts(by: SizeInt): PSizeInt;
-    procedure AddPiece(start, ed: PAnsiChar);
+  const
+    MaxViLen = 1 + sizeof(SizeUint);
+    class function ViRead(p: PUint8; out v: SizeUint): SizeUint; static;
+    class function ViWrite(p: PUint8; v: SizeUint): SizeUint; static;
+
+    function EnsurePartsSpace(nBytes: SizeInt): PUint8;
     procedure AddCodepoint(cp: uint32);
     procedure AddCodepoint(cp: uint32);
+    procedure AddPiece(start, ed: PAnsiChar);
     function GetCurTokenString: ansistring;
     function GetCurTokenString: ansistring;
     procedure BuildCurTokenString;
     procedure BuildCurTokenString;
     class function CodepointToASCII(cp: uint32; Rp: PAnsiChar): SizeInt; static;
     class function CodepointToASCII(cp: uint32; Rp: PAnsiChar): SizeInt; static;
@@ -222,11 +227,13 @@ function TJSONScanner.FetchToken: TJSONToken;
 var
 var
   Sp, Start: PAnsiChar;
   Sp, Start: PAnsiChar;
 begin
 begin
-  FNParts := 0;
+  FPartsBytes := 0;
   Sp := FCurPos;
   Sp := FCurPos;
   // Don't consider such newline a tkWhitespace.
   // Don't consider such newline a tkWhitespace.
   if Sp^ in [#13, #10] then
   if Sp^ in [#13, #10] then
     Sp := ScanNewline(Sp);
     Sp := ScanNewline(Sp);
+  Start := Sp;
+  FPartsSrcPos := Sp; // AddPiece uses it to write relative offsets to FParts.
   case Sp^ of
   case Sp^ of
     #0: Result := tkEOF;
     #0: Result := tkEOF;
     #9, ' ', #13, #10:
     #9, ' ', #13, #10:
@@ -289,7 +296,6 @@ begin
       end;
       end;
     'a'..'z','A'..'Z','_':
     'a'..'z','A'..'Z','_':
       begin
       begin
-        Start := Sp;
         repeat
         repeat
           Inc(Sp);
           Inc(Sp);
         until not (Sp^ in ['A'..'Z', 'a'..'z', '0'..'9', '_']);
         until not (Sp^ in ['A'..'Z', 'a'..'z', '0'..'9', '_']);
@@ -299,6 +305,7 @@ begin
     else
     else
       InvalidCharacter(Sp);
       InvalidCharacter(Sp);
   end;
   end;
+  FPartsSrcPos := Start; // BuildCurTokenString uses it to read relative offsets from FParts.
   FCurPos := Sp;
   FCurPos := Sp;
   FCurToken := Result;
   FCurToken := Result;
 end;
 end;
@@ -348,36 +355,88 @@ begin
     Result := ed - start;
     Result := ed - start;
 end;
 end;
 
 
-function TJSONScanner.GrowParts(by: SizeInt): PSizeInt;
+// Varint uses: 1 byte for 0..127, 2 bytes for 128..16511, 3 bytes for 16512..2113663, otherwise 1 + sizeof(SizeUint) bytes for full SizeUint (very rare).
+class function TJSONScanner.ViRead(p: PUint8; out v: SizeUint): SizeUint;
 var
 var
-  newNParts: SizeInt;
+  val: SizeUint;
 begin
 begin
-  newNParts := FNParts + by;
-  if newNParts > length(FParts) then
-    SetLength(FParts, 4 + newNParts + SizeUint(newNParts) div 4);
-  Result := @FParts[FNParts];
-  FNParts := newNParts;
+  val := p^;
+  if val < %10000000 then
+    result := 1
+  else if val < %11000000 then
+  begin
+    val := 128 + (val and %111111 shl 8 or p[1]);
+    result := 2;
+  end
+  else if val < %11100000 then
+  begin
+    val := (128 + 16384) + (val and %11111 shl 16 or unaligned(PUint16(p + 1)^));
+    result := 3;
+  end else
+  begin
+    val := unaligned(PSizeUint(p + 1)^);
+    result := 1 + sizeof(SizeUint);
+  end;
+  v := val;
 end;
 end;
 
 
-procedure TJSONScanner.AddPiece(start, ed: PAnsiChar);
-var
-  pp: PSizeInt;
+class function TJSONScanner.ViWrite(p: PUint8; v: SizeUint): SizeUint;
 begin
 begin
-  if start = ed then
-    exit;
-  pp := GrowParts(2);
-  pp[0] := start - FSourceStart;
-  pp[1] := ed - start;
+  if v <= 127 then
+  begin
+    p[0] := v;
+    result := 1;
+  end
+  else if v <= 16511 then
+  begin
+    v := v - 128;
+    p[0] := %10000000 or v shr 8;
+    p[1] := byte(v);
+    result := 2;
+  end
+  else if v <= 2113663 then
+  begin
+    v := v - (128 + 16384);
+    p[0] := %11000000 or v shr 16;
+    unaligned(PUint16(p + 1)^) := uint16(v);
+    result := 3;
+  end else
+  begin
+    p[0] := 255;
+    unaligned(PSizeUint(p + 1)^) := v;
+    result := 1 + sizeof(SizeUint);
+  end;
+end;
+
+function TJSONScanner.EnsurePartsSpace(nBytes: SizeInt): PUint8;
+begin
+  if FPartsBytes + nBytes > length(FParts) then
+    SetLength(FParts, 4 + FPartsBytes + nBytes + SizeUint(FPartsBytes + nBytes) div 4);
+  Result := @FParts[FPartsBytes];
 end;
 end;
 
 
 procedure TJSONScanner.AddCodepoint(cp: uint32);
 procedure TJSONScanner.AddCodepoint(cp: uint32);
 begin
 begin
-  GrowParts(1)^ := -1 - SizeInt(cp);
+  inc(FPartsBytes, ViWrite(EnsurePartsSpace(MaxViLen), cp shl 1 or 1));
+end;
+
+procedure TJSONScanner.AddPiece(start, ed: PAnsiChar);
+var
+  pp: PUint8;
+  n: SizeUint;
+begin
+  if start = ed then
+    exit;
+  // Can also store a piece consisting of a single ASCII character as a codepoint, but savings are very minor; often both variants require 2 bytes.
+  pp := EnsurePartsSpace(2 * MaxViLen);
+  n := ViWrite(pp, (start - FPartsSrcPos) shl 1);
+  inc(FPartsBytes, n + ViWrite(pp + n, ed - start));
+  FPartsSrcPos := ed;
 end;
 end;
 
 
 function TJSONScanner.GetCurTokenString: ansistring;
 function TJSONScanner.GetCurTokenString: ansistring;
 begin
 begin
-  if FNParts >= 0 then
+  if FPartsBytes >= 0 then
     BuildCurTokenString;
     BuildCurTokenString;
   result := FCurTokenString;
   result := FCurTokenString;
 end;
 end;
@@ -385,44 +444,50 @@ end;
 procedure TJSONScanner.BuildCurTokenString;
 procedure TJSONScanner.BuildCurTokenString;
 var
 var
   utf8: boolean;
   utf8: boolean;
-  iPart, len: SizeInt;
+  partp, parte: PUint8;
+  len: SizeInt;
+  v, v2: SizeUint;
   cp: uint32;
   cp: uint32;
-  Rp: PAnsiChar;
+  Srcp, Rp: PAnsiChar;
 begin
 begin
   utf8 := (joUTF8 in Options) or (DefaultSystemCodePage=CP_UTF8);
   utf8 := (joUTF8 in Options) or (DefaultSystemCodePage=CP_UTF8);
   len := 0;
   len := 0;
   // Prepass for length. Exact if utf8, otherwise ceiling.
   // Prepass for length. Exact if utf8, otherwise ceiling.
-  iPart := 0;
-  while iPart < FNParts do
+  partp := PUint8(FParts);
+  parte := partp + FPartsBytes;
+  while partp < parte do
   begin
   begin
-    if FParts[iPart] >= 0 then
+    inc(partp, ViRead(partp, v));
+    if v and 1 = 0 then
     begin
     begin
-      inc(len, FParts[iPart + 1]);
-      inc(iPart, 2);
+      inc(partp, ViRead(partp, v));
+      inc(len, v);
     end else
     end else
     begin
     begin
-      cp := -(FParts[iPart] + 1);
+      cp := v shr 1;
       if cp <= $7F then inc(len) // First 128 characters use 1 byte both in UTF-8 or ANSI encodings.
       if cp <= $7F then inc(len) // First 128 characters use 1 byte both in UTF-8 or ANSI encodings.
       // Use 2 in non-utf8 mode as ceiling value, assuming ANSI encodings use at most 2 bytes per codepoint. (Eg: Shift JIS uses 1 or 2.)
       // Use 2 in non-utf8 mode as ceiling value, assuming ANSI encodings use at most 2 bytes per codepoint. (Eg: Shift JIS uses 1 or 2.)
       else if (cp <= $7FF) or not utf8 then inc(len, 2)
       else if (cp <= $7FF) or not utf8 then inc(len, 2)
       else if cp <= $FFFF then inc(len, 3)
       else if cp <= $FFFF then inc(len, 3)
       else inc(len, 4);
       else inc(len, 4);
-      inc(iPart);
     end;
     end;
   end;
   end;
   SetLength(FCurTokenString, len);
   SetLength(FCurTokenString, len);
+  Srcp := FPartsSrcPos;
   Rp := PAnsiChar(Pointer(FCurTokenString));
   Rp := PAnsiChar(Pointer(FCurTokenString));
-  iPart := 0;
-  while iPart < FNParts do
+  partp := PUint8(FParts);
+  while partp < parte do
   begin
   begin
-    if FParts[iPart] >= 0 then
+    inc(partp, ViRead(partp, v));
+    if v and 1 = 0 then
     begin
     begin
-      Move(FSourceStart[FParts[iPart]], Rp^, FParts[iPart + 1]);
-      inc(Rp, FParts[iPart + 1]);
-      inc(iPart, 2);
+      inc(partp, ViRead(partp, v2));
+      Move(Srcp[v shr 1], Rp^, v2);
+      inc(Rp, v2);
+      inc(Srcp, v shr 1 + v2);
     end else
     end else
     begin
     begin
-      cp := -(FParts[iPart] + 1);
+      cp := v shr 1;
       if cp <= $7F then
       if cp <= $7F then
       begin
       begin
         byte(Rp^) := cp;
         byte(Rp^) := cp;
@@ -451,11 +516,11 @@ begin
           end
           end
         else
         else
           Inc(Rp, CodepointToASCII(cp, Rp));
           Inc(Rp, CodepointToASCII(cp, Rp));
-      inc(iPart);
     end;
     end;
   end;
   end;
-  SetLength(FCurTokenString, Rp - PAnsiChar(Pointer(FCurTokenString)));
-  FNParts := -1;
+  if not utf8 then
+    SetLength(FCurTokenString, Rp - PAnsiChar(Pointer(FCurTokenString)));
+  FPartsBytes := -1;
 end;
 end;
 
 
 class function TJSONScanner.CodepointToASCII(cp: uint32; Rp: PAnsiChar): SizeInt;
 class function TJSONScanner.CodepointToASCII(cp: uint32; Rp: PAnsiChar): SizeInt;
@@ -480,8 +545,8 @@ end;
 
 
 function TJSONScanner.ScanString(Sp: PAnsiChar): PAnsiChar;
 function TJSONScanner.ScanString(Sp: PAnsiChar): PAnsiChar;
 const
 const
-  SimpleEscapes_Spell: array[0 .. 8] of ansichar = 'tbnrf"''\/';
-  SimpleEscapes_Meant: array[0 .. High(SimpleEscapes_Spell)] of ansichar = #9#8#10#13#12'"''\/';
+  SimpleEscapes_Spell: array[0 .. 8] of ansichar = 'tbnrf"''\/'; // Symbols starting from length(SimpleEscapes_Meant) are mapped to themselves.
+  SimpleEscapes_Meant: array[0 .. 4] of ansichar = #9#8#10#13#12;
 var
 var
   StartChar: AnsiChar;
   StartChar: AnsiChar;
   LiteralStart: PAnsiChar;
   LiteralStart: PAnsiChar;
@@ -499,6 +564,9 @@ begin
     if not (Sp^ in [#0 .. #31, '\', '''', '"']) then
     if not (Sp^ in [#0 .. #31, '\', '''', '"']) then
       continue;
       continue;
 
 
+    if Sp^ = StartChar then
+      break;
+
     if Sp^ = '\' then
     if Sp^ = '\' then
     begin
     begin
       AddPiece(LiteralStart, Sp);
       AddPiece(LiteralStart, Sp);
@@ -528,10 +596,10 @@ begin
       begin
       begin
         Inc(Sp);
         Inc(Sp);
         LiteralStart := Sp + 1;
         LiteralStart := Sp + 1;
-        if SimpleEscapes_Meant[iEsc] = SimpleEscapes_Spell[iEsc] then
-          dec(LiteralStart) // Just start next literal from this very character instead of handling it explicitly somehow.
+        if iEsc < length(SimpleEscapes_Meant) then // Escaped character maps to something else?
+          AddCodepoint(ord(SimpleEscapes_Meant[iEsc]))
         else
         else
-          GrowParts(1)^ := -1 - ord(SimpleEscapes_Meant[iEsc]);
+          dec(LiteralStart); // Just start next literal from this very character instead of handling it explicitly somehow.
         continue;
         continue;
       end;
       end;
 
 
@@ -541,9 +609,6 @@ begin
         InvalidCharacter(Sp + 1);
         InvalidCharacter(Sp + 1);
     end;
     end;
 
 
-    if Sp^ = StartChar then
-      break;
-
     if Sp^ < #20 then
     if Sp^ < #20 then
       if Sp^ = #0 then
       if Sp^ = #0 then
         Error(SErrOpenString, [FCurRow])
         Error(SErrOpenString, [FCurRow])
@@ -615,7 +680,7 @@ begin
   if not (Sp^ in [#13, #10, #0, '}', ']', ',', #9, ' ']) then
   if not (Sp^ in [#13, #10, #0, '}', ']', ',', #9, ' ']) then
     InvalidCharacter(Sp);
     InvalidCharacter(Sp);
   if Start^ = '.' then
   if Start^ = '.' then
-    GrowParts(1)^ := -1 - ord('0');
+    AddCodepoint(ord('0'));
   AddPiece(Start, Sp);
   AddPiece(Start, Sp);
   Result := Sp;
   Result := Sp;
 end;
 end;