Browse Source

* xmlread.pp, modified attribute parsing code to use DOM-independent data structures (second part)
* ExpectAttValue() now builds both plain string value and first-level node chain.
* Normalize() procedure moved to xmlutils.pp and made publicly available as BufNormalize.
* ParseLiteral() cleaned of attribute-specific code; at this point it is clear it won't be used for parsing attributes.

git-svn-id: trunk@16186 -

sergei 15 years ago
parent
commit
7e5ff8ca89
2 changed files with 93 additions and 102 deletions
  1. 56 102
      packages/fcl-xml/src/xmlread.pp
  2. 37 0
      packages/fcl-xml/src/xmlutils.pp

+ 56 - 102
packages/fcl-xml/src/xmlread.pp

@@ -338,7 +338,7 @@ type
     PrefixLen: Integer;  // to avoid recalculation
   end;
 
-  TLiteralType = (ltPlain, ltAttr, ltTokAttr, ltPubid, ltEntity);
+  TLiteralType = (ltPlain, ltPubid, ltEntity);
 
   TXMLReader = class
   private
@@ -437,8 +437,8 @@ type
     procedure CheckNCName;
     function  ExpectName: WideString;                                   // [5]
     function ParseLiteral(var ToFill: TWideCharBuf; aType: TLiteralType;
-      Required: Boolean; Normalized: PBoolean = nil): Boolean;
-    procedure ExpectAttValue(attrData: PNodeData);                      // [10]
+      Required: Boolean): Boolean;
+    function ExpectAttValue(attrData: PNodeData; NonCDATA: Boolean): Boolean; // [10]
     procedure ParseComment(discard: Boolean);                           // [15]
     procedure ParsePI;                                                  // [16]
     procedure CreatePINode;
@@ -1590,17 +1590,24 @@ const
   AttrDelims: TSetOfChar = [#0, '<', '&', '''', '"', #9, #10, #13];
   GT_Delim: TSetOfChar = [#0, '>'];
 
-procedure TXMLReader.ExpectAttValue(AttrData: PNodeData);
+{ Parse attribute literal, producing plain string value in AttrData.FValueStr.
+  If entity references are encountered and FExpandEntities=False, also builds
+  a node chain starting from AttrData.FNext. Node chain is built only for the
+  first level. If NonCDATA=True, additionally normalizes whitespace in string value.
+  Returns True if value actually needed normalization }
+function TXMLReader.ExpectAttValue(AttrData: PNodeData; NonCDATA: Boolean): Boolean;
 var
   wc: WideChar;
   Delim: WideChar;
   ent: TDOMEntityEx;
   start: TObject;
   curr: PNodeData;
+  StartPos: Integer;
 begin
   SkipQuote(Delim);
   curr := AttrData;
   FValue.Length := 0;
+  StartPos := 0;
   start := FSource.FEntity;
   repeat
     wc := FSource.SkipUntil(FValue, AttrDelims);
@@ -1612,14 +1619,14 @@ begin
         Continue;
 
       ent := EntityCheck(True);
-      if (ent = nil) or (not FExpandEntities) then
+      if ((ent = nil) or (not FExpandEntities)) and (FSource.FEntity = start) then
       begin
-        if FValue.Length > 0 then
+        if FValue.Length > StartPos then
         begin
           curr := AllocAttributeValueChunk(curr);
           curr^.FNodeType := ntText;
-          SetString(curr^.FValueStr, FValue.Buffer, FValue.Length);
-          FValue.Length := 0;
+          // without PWideChar typecast and in {$T-}, FPC treats '@' result as PAnsiChar...
+          SetString(curr^.FValueStr, PWideChar(@FValue.Buffer[StartPos]), FValue.Length-StartPos);
         end;
         curr := AllocAttributeValueChunk(curr);
         curr^.FNodeType := ntEntityReference;
@@ -1628,8 +1635,9 @@ begin
           SetString(curr^.FValueStr, FName.Buffer, FName.Length)
         else
           curr^.FValueStr := ent.FName;
-      end
-      else
+      end;
+      StartPos := FValue.Length;
+      if Assigned(ent) then
         ContextPush(ent);
     end
     else if wc <> #0 then
@@ -1641,22 +1649,28 @@ begin
         wc := #32;
       BufAppend(FValue, wc);
     end
-    else if (FSource.FEntity = start) or not ContextPop then    // #0
-      FatalError('Literal has no closing quote', -1);
+    else
+    begin
+      if (FSource.FEntity = start) or not ContextPop then    // #0
+        FatalError('Literal has no closing quote', -1);
+      StartPos := FValue.Length;
+    end;
   until False;
-  if Assigned(attrData^.FNext) then  // complex case
+  if Assigned(attrData^.FNext) then
   begin
     FAttrCleanupFlag := True;
-    if FValue.Length > 0 then
+    if FValue.Length > StartPos then
     begin
       curr := AllocAttributeValueChunk(curr);
       curr^.FNodeType := ntText;
-      SetString(curr^.FValueStr, FValue.Buffer, FValue.Length);
+      SetString(curr^.FValueStr, PWideChar(@FValue.Buffer[StartPos]), FValue.Length-StartPos);
     end;
-  end
+  end;
+  if nonCDATA then
+    BufNormalize(FValue, Result)
   else
-    SetString(attrData^.FValueStr, FValue.Buffer, FValue.Length);
-  FValue.Length := 0;
+    Result := False;
+  SetString(attrData^.FValueStr, FValue.Buffer, FValue.Length);
 end;
 
 const
@@ -1830,58 +1844,19 @@ begin
   end;
 end;
 
-procedure Normalize(var Buf: TWideCharBuf; Modified: PBoolean);
-var
-  Dst, Src: Integer;
-begin
-  Dst := 0;
-  Src := 0;
-  // skip leading space if any
-  while (Src < Buf.Length) and (Buf.Buffer[Src] = ' ') do
-    Inc(Src);
-
-  while Src < Buf.Length do
-  begin
-    if Buf.Buffer[Src] = ' ' then
-    begin
-      // Dst cannot be 0 here, because leading space is already skipped
-      if Buf.Buffer[Dst-1] <> ' ' then
-      begin
-        Buf.Buffer[Dst] := ' ';
-        Inc(Dst);
-      end;
-    end
-    else
-    begin
-      Buf.Buffer[Dst] := Buf.Buffer[Src];
-      Inc(Dst);
-    end;
-    Inc(Src);
-  end;
-  // trailing space (only one possible due to compression)
-  if (Dst > 0) and (Buf.Buffer[Dst-1] = ' ') then
-    Dec(Dst);
-
-  if Assigned(Modified) then
-    Modified^ := Dst <> Buf.Length;
-  Buf.Length := Dst;
-end;
-
 const
   LiteralDelims: array[TLiteralType] of TSetOfChar = (
     [#0, '''', '"'],                          // ltPlain
-    [#0, '<', '&', '''', '"', #9, #10, #13],  // ltAttr
-    [#0, '<', '&', '''', '"', #9, #10, #13],  // ltTokAttr
     [#0, '''', '"', #13, #10],                // ltPubid
     [#0, '%', '&', '''', '"']                 // ltEntity
   );
 
 function TXMLReader.ParseLiteral(var ToFill: TWideCharBuf; aType: TLiteralType;
-  Required: Boolean; Normalized: PBoolean): Boolean;
+  Required: Boolean): Boolean;
 var
   start: TObject;
   wc, Delim: WideChar;
-  ent: TDOMEntityEx;
+  dummy: Boolean;
 begin
   SkipQuote(Delim, Required);
   Result := (Delim <> #0);
@@ -1900,32 +1875,18 @@ begin
         FatalError('PE reference not allowed here in internal subset', FName.Length+2);
       StartPE;
     end
-    else if wc = '&' then  { ltAttr, ltTokAttr, ltEntity }
+    else if wc = '&' then  { ltEntity }
     begin
       if ParseRef(ToFill) then   // charRefs always expanded
         Continue;
-      if aType = ltEntity then   // bypass
-      begin
-        BufAppend(ToFill, '&');
-        BufAppendChunk(ToFill, FName.Buffer, FName.Buffer + FName.Length);
-        BufAppend(ToFill, ';');
-      end
-      else                       // include
-      begin
-        if ResolvePredefined then
-          Continue;
-        ent := EntityCheck(True);
-        if ent = nil then
-          Continue;
-        ContextPush(ent);
-      end;
+      BufAppend(ToFill, '&');
+      BufAppendChunk(ToFill, FName.Buffer, FName.Buffer + FName.Length);
+      BufAppend(ToFill, ';');
     end
-    else if wc = '<' then
-      FatalError('Character ''<'' is not allowed in attribute value')
     else if wc <> #0 then
     begin
       FSource.NextChar;
-      if (wc = #10) or (wc = #13) or (wc = #9) then
+      if (wc = #10) or (wc = #13) then
         wc := #32
       // terminating delimiter must be in the same context as the starting one
       else if (wc = Delim) and (start = FSource.FEntity) then
@@ -1935,8 +1896,8 @@ begin
     else if (FSource.FEntity = start) or not ContextPop then    // #0
       FatalError('Literal has no closing quote', -1);
   until False;
-  if aType in [ltTokAttr, ltPubid] then
-    Normalize(ToFill, Normalized);
+  if aType = ltPubid then
+    BufNormalize(ToFill, dummy);
 end;
 
 function TXMLReader.SkipUntilSeq(const Delim: TSetOfChar; c1: WideChar; c2: WideChar = #0): Boolean;
@@ -2527,7 +2488,7 @@ begin
 
 // See comments to valid-sa-094: PE expansion should be disabled in AttDef.
         attrData := AllocAttributeData(nil);
-        ExpectAttValue(attrData);
+        ExpectAttValue(attrData, dt <> dtCDATA);
 
         LoadAttribute(attrData, AttDef);   // convert to DOM form
         CleanupAttributeData;
@@ -3152,30 +3113,17 @@ var
   attrData: PNodeData;
   AttDef: TDOMAttrDef;
   i: Integer;
+  normalized: Boolean;
 
+{ still needs a temp to store AttDef.Value }
 procedure CheckValue;
-var
-  AttValue, OldValue: WideString;
 begin
-  if FStandalone and AttDef.ExternallyDeclared then
-  begin
-    OldValue := Attr.Value;
-    Attr.DataType := AttDef.DataType;
-    AttValue := Attr.Value;
-    if AttValue <> OldValue then
-      StandaloneError(-1);
-  end
-  else
-  begin
-    Attr.DataType := AttDef.DataType;
-    AttValue := Attr.Value;
-  end;
   // TODO: what about normalization of AttDef.Value? (Currently it IS normalized)
-  if (AttDef.Default = adFixed) and (AttDef.Value <> AttValue) then
-    ValidationError('Value of attribute ''%s'' does not match its #FIXED default',[AttDef.Name], -1);
-  if not ValidateAttrSyntax(AttDef, AttValue) then
-    ValidationError('Attribute ''%s'' type mismatch', [AttDef.Name], -1);
-  ValidateAttrValue(Attr, AttValue);
+  if (AttDef.Default = adFixed) and (AttDef.Value <> attrData^.FValueStr) then
+    ValidationError('Value of attribute ''%s'' does not match its #FIXED default',[attrData^.FQName^.Key], -1);
+  if not ValidateAttrSyntax(AttDef, attrData^.FValueStr) then
+    ValidationError('Attribute ''%s'' type mismatch', [attrData^.FQName^.Key], -1);
+  ValidateAttrValue(Attr, attrData^.FValueStr);
 end;
 
 begin
@@ -3201,13 +3149,19 @@ begin
       FatalError('Duplicate attribute', FName.Length);
 
   ExpectEq;
-  ExpectAttValue(attrData);
+  normalized := ExpectAttValue(attrData, Assigned(AttDef) and (AttDef.DataType <> dtCDATA));
 
   LoadAttribute(attrData, attr);
 
   elem.Attributes.SetNamedItem(attr);
   if Assigned(AttDef) and ((AttDef.DataType <> dtCdata) or (AttDef.Default = adFixed)) then
+  begin
+    Attr.DataType := AttDef.DataType;
+    if normalized and FStandalone and AttDef.ExternallyDeclared then
+      StandaloneError(-1);
+
     CheckValue;
+  end;
 end;
 
 procedure TXMLReader.AddForwardRef(aList: TFPList; Buf: PWideChar; Length: Integer);

+ 37 - 0
packages/fcl-xml/src/xmlutils.pp

@@ -159,6 +159,7 @@ procedure BufAllocate(var ABuffer: TWideCharBuf; ALength: Integer);
 procedure BufAppend(var ABuffer: TWideCharBuf; wc: WideChar);
 procedure BufAppendChunk(var ABuf: TWideCharBuf; pstart, pend: PWideChar);
 function BufEquals(const ABuf: TWideCharBuf; const Arg: WideString): Boolean;
+procedure BufNormalize(var Buf: TWideCharBuf; out Modified: Boolean);
 
 { Built-in decoder functions for UTF-8, UTF-16 and ISO-8859-1 }
 
@@ -912,6 +913,42 @@ begin
     CompareMem(ABuf.Buffer, Pointer(Arg), ABuf.Length*sizeof(WideChar));
 end;
 
+procedure BufNormalize(var Buf: TWideCharBuf; out Modified: Boolean);
+var
+  Dst, Src: Integer;
+begin
+  Dst := 0;
+  Src := 0;
+  // skip leading space if any
+  while (Src < Buf.Length) and (Buf.Buffer[Src] = ' ') do
+    Inc(Src);
+
+  while Src < Buf.Length do
+  begin
+    if Buf.Buffer[Src] = ' ' then
+    begin
+      // Dst cannot be 0 here, because leading space is already skipped
+      if Buf.Buffer[Dst-1] <> ' ' then
+      begin
+        Buf.Buffer[Dst] := ' ';
+        Inc(Dst);
+      end;
+    end
+    else
+    begin
+      Buf.Buffer[Dst] := Buf.Buffer[Src];
+      Inc(Dst);
+    end;
+    Inc(Src);
+  end;
+  // trailing space (only one possible due to compression)
+  if (Dst > 0) and (Buf.Buffer[Dst-1] = ' ') then
+    Dec(Dst);
+
+  Modified := Dst <> Buf.Length;
+  Buf.Length := Dst;
+end;
+
 { standard decoders }
 
 function Decode_UCS2(Context: Pointer; InBuf: PChar; var InCnt: Cardinal; OutBuf: PWideChar; var OutCnt: Cardinal): Integer; stdcall;