Răsfoiți Sursa

htmldefs.pp:
+ Define elements which may omit end-tag (except HTML, HEAD and BODY which may also omit the start-tag)
+ Define which elements may close other elements (modelled after libxml2).
* DIV may have #PCDATA content.

sax_html.pp:
* Improve the parser to report startElement/endElement events properly. Should resolve Mantis #14073 and related element hierarchy issues.

git-svn-id: trunk@13357 -

sergei 16 ani în urmă
părinte
comite
fcd96805fa
2 a modificat fișierele cu 235 adăugiri și 83 ștergeri
  1. 106 14
      packages/fcl-xml/src/htmldefs.pp
  2. 129 69
      packages/fcl-xml/src/sax_html.pp

+ 106 - 14
packages/fcl-xml/src/htmldefs.pp

@@ -97,8 +97,9 @@ type
     efPCDATAContent,                    // may have PCDATA content
     efPreserveWhitespace,               // preserve all whitespace
     efDeprecated,                       // can be dropped in future versions
-    efNoChecks                          // Checks (attributes,subtags,...) can only be implemented in descendants
-    );
+    efNoChecks,                         // Checks (attributes,subtags,...) can only be implemented in descendants
+    efEndTagOptional
+  );
   THTMLElementFlags = set of THTMLElementFlag;
 
   PHTMLElementProps = ^THTMLElementProps;
@@ -184,10 +185,10 @@ const
     (Name: 'col';       Flags: [];
      Attributes: atsattrs+atscellhalign+[atvalign,atspan,atwidth]),
 
-    (Name: 'colgroup';  Flags: [efSubelementContent];
+    (Name: 'colgroup';  Flags: [efSubelementContent, efEndTagOptional];
      Attributes: atsattrs+atscellhalign+[atvalign,atspan,atwidth]),
 
-    (Name: 'dd';        Flags: efSubcontent; Attributes: atsattrs),
+    (Name: 'dd';        Flags: efSubcontent+[efEndTagOptional]; Attributes: atsattrs),
 
     (Name: 'del';       Flags: [efSubelementContent]; Attributes: atsattrs+[atcite,atdatetime]),
 
@@ -195,11 +196,11 @@ const
 
     (Name: 'dir';       Flags: [efSubelementContent,efDeprecated]; Attributes: atsattrs),
 
-    (Name: 'div';       Flags: [efSubelementContent]; Attributes: atsattrs),
+    (Name: 'div';       Flags: efSubContent; Attributes: atsattrs),
 
     (Name: 'dl';        Flags: [efSubelementContent]; Attributes: atsattrs),
 
-    (Name: 'dt';        Flags: [efPCDataContent]; Attributes: atsattrs),
+    (Name: 'dt';        Flags: [efPCDataContent, efEndTagOptional]; Attributes: atsattrs),
 
     (Name: 'em';        Flags: efSubcontent; Attributes: atsattrs),
 
@@ -260,7 +261,7 @@ const
 
     (Name: 'legend';    Flags: efSubcontent; Attributes: atsattrs+[ataccesskey]),
 
-    (Name: 'li';        Flags: efSubcontent; Attributes: atsattrs),
+    (Name: 'li';        Flags: efSubcontent+[efEndTagOptional]; Attributes: atsattrs),
 
     (Name: 'link';      Flags: [];
      Attributes: atsattrs+[atcharset,athref,athreflang,attype,atrel,atrev,atmedia]),
@@ -283,10 +284,10 @@ const
 
     (Name: 'optgroup';  Flags: efSubcontent; Attributes: atsattrs+[atdisabled,atlabel]),
 
-    (Name: 'option';    Flags: efSubcontent;
+    (Name: 'option';    Flags: efSubcontent+[efEndTagOptional];
      Attributes: atsattrs+[atselected,atdisabled,atlabel,atvalue]),
 
-    (Name: 'p';         Flags: efSubcontent; Attributes: atsattrs),
+    (Name: 'p';         Flags: efSubcontent+[efEndTagOptional]; Attributes: atsattrs),
 
     (Name: 'param';     Flags: []; Attributes: [atid,atname,atvalue,atvaluetype,attype]),
 
@@ -324,23 +325,23 @@ const
 
     (Name: 'tbody';     Flags: [efSubelementContent]; Attributes: atsattrs+atscellhalign+[atvalign]),
 
-    (Name: 'td';        Flags: efSubcontent;
+    (Name: 'td';        Flags: efSubcontent+[efEndTagOptional];
      Attributes: atsattrs+atscellhalign+[atvalign,atabbr,ataxis,atheaders,atscope,atrowspan,atcolspan]),
 
     (Name: 'textarea';  Flags: [efPCDATAContent];
      Attributes: atsattrs+[atname,atrows,atcols,atdisabled,atreadonly,attabindex,
                  ataccesskey,atonfocus,atonblur,atonselect,atonchange]),
 
-    (Name: 'tfoot';     Flags: [efSubelementContent]; Attributes: atsattrs+atscellhalign+[atvalign]),
+    (Name: 'tfoot';     Flags: [efSubelementContent,efEndTagOptional]; Attributes: atsattrs+atscellhalign+[atvalign]),
 
-    (Name: 'th';        Flags: efSubcontent;
+    (Name: 'th';        Flags: efSubcontent+[efEndTagOptional];
      Attributes: atsattrs+atscellhalign+[atvalign,atabbr,ataxis,atheaders,atscope,atrowspan,atcolspan]),
 
-    (Name: 'thead';     Flags: [efSubelementContent]; Attributes: atsattrs+atscellhalign+[atvalign]),
+    (Name: 'thead';     Flags: [efSubelementContent, efEndTagOptional]; Attributes: atsattrs+atscellhalign+[atvalign]),
 
     (Name: 'title';     Flags: efSubcontent; Attributes: atsi18n),
 
-    (Name: 'tr';        Flags: [efSubelementContent];
+    (Name: 'tr';        Flags: [efSubelementContent, efEndTagOptional];
      Attributes: atsattrs+atscellhalign+[atvalign]),
 
     (Name: 'tt';        Flags: efSubcontent; Attributes: atsattrs),
@@ -559,12 +560,81 @@ const
 function ResolveHTMLEntityReference(const Name: String;
   var Entity: WideChar): Boolean;
 
+function IsAutoClose(NewTag, OldTag: THTMLElementTag): Boolean;
 
 
 implementation
 
 uses SysUtils;
 
+{ Define which elements auto-close other elements, modelled after libxml2.
+  This is an array of variable-length lists, each terminated by etUnknown.
+  Indices to first element of each list are provided by AutoCloseIndex array,
+  which *must* be updated after any change. }
+const
+  AutoCloseTab: array[0..277] of THTMLElementTag = (
+
+  etform,       etform, etp, ethr, eth1, eth2, eth3, eth4, eth5, eth6,
+                etdl, etul, etol, etmenu, etdir, etaddress, etpre,
+                ethead, etUnknown,
+  ethead,       etp, etUnknown,
+  ettitle,      etp, etUnknown,
+  etbody,       ethead, etstyle, etlink, ettitle, etp, etUnknown,
+  etframeset,   ethead, etstyle, etlink, ettitle, etp, etUnknown,
+  etli,         etp, eth1, eth2, eth3, eth4, eth5, eth6, etdl, etaddress,
+                etpre, ethead, etli, etUnknown,
+  ethr,         etp, ethead, etUnknown,
+  eth1,         etp, ethead, etUnknown,
+  eth2,         etp, ethead, etUnknown,
+  eth3,         etp, ethead, etUnknown,
+  eth4,         etp, ethead, etUnknown,
+  eth5,         etp, ethead, etUnknown,
+  eth6,         etp, ethead, etUnknown,
+  etdir,        etp, ethead, etUnknown,
+  etaddress,    etp, ethead, etul, etUnknown,
+  etpre,        etp, ethead, etul, etUnknown,
+  etblockquote, etp, ethead, etUnknown,
+  etdl,         etp, etdt, etmenu, etdir, etaddress, etpre,
+                ethead, etUnknown,
+  etdt,         etp, etmenu, etdir, etaddress, etpre,
+                ethead, etdd, etUnknown,
+  etdd,         etp, etmenu, etdir, etaddress, etpre,
+                ethead, etdt, etUnknown,
+  etul,         etp, ethead, etol, etmenu, etdir, etaddress, etpre, etUnknown,
+  etol,         etp, ethead, etul, etUnknown,
+  etmenu,       etp, ethead, etul, etUnknown,
+  etp,          etp, ethead, eth1, eth2, eth3, eth4, eth5, eth6, etUnknown,
+  etdiv,        etp, ethead, etUnknown,
+  etnoscript,   etp, ethead, etUnknown,
+  etcenter,     etfont, etb, eti, etp, ethead, etUnknown,
+  eta,          eta, etUnknown,
+  etcaption,    etp, etUnknown,
+  etcolgroup,   etcaption, etcolgroup, etcol, etp, etUnknown,
+  etcol,        etcaption, etcol, etp, etUnknown,
+  ettable,      etp, ethead, eth1, eth2, eth3, eth4, eth5, eth6, etpre,
+                eta, etUnknown,
+  etth,         etth, ettd, etp, etspan, etfont, eta, etb, eti, etu, etUnknown,
+  ettd,         etth, ettd, etp, etspan, etfont, eta, etb, eti, etu, etUnknown,
+  ettr,         etth, ettd, ettr, etcaption, etcol, etcolgroup, etp, etUnknown,
+  etthead,      etcaption, etcol, etcolgroup, etUnknown,
+  ettfoot,      etth, ettd, ettr, etcaption, etcol, etcolgroup, etthead,
+                ettbody, etp, etUnknown,
+  ettbody,      etth, ettd, ettr, etcaption, etcol, etcolgroup, etthead,
+                ettfoot, ettbody, etp, etUnknown,
+  etoptgroup,   etoption, etUnknown,
+  etoption,     etoption, etUnknown,
+  etfieldset,   etlegend, etp, ethead, eth1, eth2, eth3, eth4, eth5, eth6,
+                etpre, eta, etUnknown,
+  etUnknown);
+
+  AutoCloseIndex: array[0..40] of Integer = (
+    0, 19, 22, 25, 32, 39, 53, 57, 61, 65, 69,
+    73, 77, 81, 85, 90, 95, 99, 108, 117, 126,
+    135, 140, 145, 155, 159, 163, 170, 173, 176,
+    182, 187, 199, 210, 221, 230, 235, 246, 258,
+    261, 264
+  );
+
 function ResolveHTMLEntityReference(const Name: String;
   var Entity: WideChar): Boolean;
 var
@@ -639,4 +709,26 @@ begin
   end;
 end;
 
+function IsAutoClose(NewTag, OldTag: THTMLElementTag): Boolean;
+var
+  i, j: Integer;
+begin
+  Result := False;
+  for i := 0 to high(AutoCloseIndex) do
+    if NewTag = AutoCloseTab[AutoCloseIndex[i]] then
+    begin
+      j := AutoCloseIndex[i]+1;
+      while AutoCloseTab[j] <> etUnknown do
+      begin
+        if AutoCloseTab[j] = OldTag then
+        begin
+          Result := True;
+          Exit;
+        end;
+        Inc(j);
+      end;
+      Exit;
+    end;
+end;
+
 end.

+ 129 - 69
packages/fcl-xml/src/sax_html.pp

@@ -52,6 +52,11 @@ type
     FTokenText: SAXString;
     FCurStringValueDelimiter: Char;
     FAttrNameRead: Boolean;
+    FStack: array of THTMLElementTag;
+    FNesting: Integer;
+    procedure AutoClose(const aName: string);
+    procedure NamePush(const aName: string);
+    procedure NamePop;
   protected
     procedure EnterNewScannerContext(NewContext: THTMLScannerContext);
   public
@@ -122,6 +127,7 @@ constructor THTMLReader.Create;
 begin
   inherited Create;
   FScannerContext := scUnknown;
+  SetLength(FStack, 16);
 end;
 
 destructor THTMLReader.Destroy;
@@ -265,89 +271,135 @@ begin
   end;
 end;
 
-procedure THTMLReader.EnterNewScannerContext(NewContext: THTMLScannerContext);
+function LookupTag(const aName: string): THTMLElementTag;
+var
+  j: THTMLElementTag;
+begin
+  for j := Low(THTMLElementTag) to High(THTMLElementTag) do
+    if SameText(HTMLElementProps[j].Name, aName) then
+    begin
+      Result := j;
+      Exit;
+    end;
+  Result := etUnknown;
+end;
 
-  function SplitTagString(const s: String; var Attr: TSAXAttributes): String;
-  var
-    i, j: Integer;
-    AttrName: String;
-    ValueDelimiter: Char;
-    DoIncJ: Boolean;
+procedure THTMLReader.AutoClose(const aName: string);
+var
+  newTag: THTMLElementTag;
+begin
+  newTag := LookupTag(aName);
+  while (FNesting > 0) and IsAutoClose(newTag, FStack[FNesting-1]) do
   begin
-    Attr := nil;
-    i := Pos(' ', s);
-    if i <= 0 then
-      Result := LowerCase(s)
-    else
-    begin
-      Result := LowerCase(Copy(s, 1, i - 1));
-      Attr := TSAXAttributes.Create;
+    DoEndElement('', HTMLElementProps[FStack[FNesting-1]].Name, '');
+    namePop;
+  end;
+end;
 
-      Inc(i);
+procedure THTMLReader.NamePush(const aName: string);
+var
+  tag: THTMLElementTag;
+begin
+  tag := LookupTag(aName);
+  if FNesting >= Length(FStack) then
+    SetLength(FStack, FNesting * 2);
+  FStack[FNesting] := tag;
+  Inc(FNesting);
+end;
+
+procedure THTMLReader.NamePop;
+begin
+  if FNesting <= 0 then
+    Exit;
+  Dec(FNesting);
+  FStack[FNesting] := etUnknown;
+end;
+
+function SplitTagString(const s: String; var Attr: TSAXAttributes): String;
+var
+  i, j: Integer;
+  AttrName: String;
+  ValueDelimiter: Char;
+  DoIncJ: Boolean;
+begin
+  Attr := nil;
+  i := Pos(' ', s);
+  if i <= 0 then
+    Result := LowerCase(s)
+  else
+  begin
+    Result := LowerCase(Copy(s, 1, i - 1));
+    Attr := TSAXAttributes.Create;
+    Inc(i);
 
-      while (i <= Length(s)) and (s[i] in WhitespaceChars) do
-        Inc(i);
+    while (i <= Length(s)) and (s[i] in WhitespaceChars) do
+      Inc(i);
 
-      SetLength(AttrName, 0);
-      j := i;
+    SetLength(AttrName, 0);
+    j := i;
 
-      while j <= Length(s) do
-        if s[j] = '=' then
+    while j <= Length(s) do
+      if s[j] = '=' then
+      begin
+        AttrName := LowerCase(Copy(s, i, j - i));
+        Inc(j);
+        if (j < Length(s)) and ((s[j] = '''') or (s[j] = '"')) then
         begin
-          AttrName := LowerCase(Copy(s, i, j - i));
+          ValueDelimiter := s[j];
           Inc(j);
-          if (j < Length(s)) and ((s[j] = '''') or (s[j] = '"')) then
+        end else
+          ValueDelimiter := #0;
+        i := j;
+        DoIncJ := False;
+        while j <= Length(s) do
+          if ValueDelimiter = #0 then
+            if s[j] in WhitespaceChars then
+              break
+            else
+              Inc(j)
+          else if s[j] = ValueDelimiter then
           begin
-            ValueDelimiter := s[j];
-            Inc(j);
+            DoIncJ := True;
+            break
           end else
-            ValueDelimiter := #0;
-          i := j;
-          DoIncJ := False;
-          while j <= Length(s) do
-            if ValueDelimiter = #0 then
-              if s[j] in WhitespaceChars then
-                break
-              else
-                Inc(j)
-            else if s[j] = ValueDelimiter then
-            begin
-              DoIncJ := True;
-              break
-            end else
-              Inc(j);
+            Inc(j);
 
-          Attr.AddAttribute('', AttrName, '', '', Copy(s, i, j - i));
+        Attr.AddAttribute('', AttrName, '', '', Copy(s, i, j - i));
 
-          if DoIncJ then
-            Inc(j);
+        if DoIncJ then
+          Inc(j);
 
-          while (j <= Length(s)) and (s[j] in WhitespaceChars) do
-            Inc(j);
-          i := j;
-        end
-        else if s[j] in WhitespaceChars then
-        begin
-          Attr.AddAttribute('', Copy(s, i, j - i), '', '', '');
+        while (j <= Length(s)) and (s[j] in WhitespaceChars) do
           Inc(j);
-          while (j <= Length(s)) and (s[j] in WhitespaceChars) do
-            Inc(j);
-          i := j;
-        end else
+        i := j;
+      end
+      else if s[j] in WhitespaceChars then
+      begin
+        Attr.AddAttribute('', Copy(s, i, j - i), '', '', '');
+        Inc(j);
+        while (j <= Length(s)) and (s[j] in WhitespaceChars) do
           Inc(j);
-    end;
+        i := j;
+      end else
+        Inc(j);
   end;
+end;
 
+procedure THTMLReader.EnterNewScannerContext(NewContext: THTMLScannerContext);
 var
   Attr: TSAXAttributes;
   TagName: String;
   Found: Boolean;
   Ent: SAXChar;
   i: Integer;
+  elTag: THTMLElementTag;
 begin
   case ScannerContext of
     scWhitespace:
-      DoIgnorableWhitespace(PSAXChar(TokenText), 1, Length(TokenText));
+      if (FNesting > 0) and (efPCDataContent in HTMLElementProps[FStack[FNesting-1]].Flags) then
+        DoCharacters(PSAXChar(TokenText), 0, Length(TokenText))
+      else
+        DoIgnorableWhitespace(PSAXChar(TokenText), 0, Length(TokenText));
     scText:
       DoCharacters(PSAXChar(TokenText), 0, Length(TokenText));
     scEntityReference:
@@ -382,18 +434,35 @@ begin
           setlength(fTokenText,length(fTokenText)-1);
           // Do NOT combine to a single line, as Attr is an output value!
           TagName := SplitTagString(TokenText, Attr);
+          AutoClose(TagName);
           DoStartElement('', TagName, '', Attr);
           DoEndElement('', TagName, '');
         end
         else if TokenText[1] = '/' then
         begin
-          DoEndElement('',
-            SplitTagString(Copy(TokenText, 2, Length(TokenText)), Attr), '');
+          Delete(FTokenText, 1, 1);
+          TagName := SplitTagString(TokenText, Attr);
+          elTag := LookupTag(TagName);
+          i := FNesting-1;
+          while (i >= 0) and (FStack[i] <> elTag) and
+            (efEndTagOptional in HTMLElementProps[FStack[i]].Flags) do
+            Dec(i);
+          if (i>=0) and (FStack[i] = elTag) then
+            while FStack[FNesting-1] <> elTag do
+            begin
+              DoEndElement('', HTMLElementProps[FStack[FNesting-1]].Name, '');
+              namePop;
+            end;
+
+          DoEndElement('', TagName, '');
+          namePop;
         end
         else if TokenText[1] <> '!' then
         begin
           // Do NOT combine to a single line, as Attr is an output value!
           TagName := SplitTagString(TokenText, Attr);
+          AutoClose(TagName);
+          namePush(TagName);
           DoStartElement('', TagName, '', Attr);
         end;
         if Assigned(Attr) then
@@ -427,16 +496,7 @@ end;
 constructor THTMLToDOMConverter.CreateFragment(AReader: THTMLReader;
   AFragmentRoot: TDOMNode);
 begin
-  inherited Create;
-  FReader := AReader;
-  FReader.OnCharacters := @ReaderCharacters;
-  FReader.OnIgnorableWhitespace := @ReaderIgnorableWhitespace;
-  FReader.OnSkippedEntity := @ReaderSkippedEntity;
-  FReader.OnStartElement := @ReaderStartElement;
-  FReader.OnEndElement := @ReaderEndElement;
-  FDocument := AFragmentRoot.OwnerDocument;
-  FElementStack := TList.Create;
-  FNodeBuffer := TList.Create;
+  Create(AReader, AFragmentRoot.OwnerDocument);
   FragmentRoot := AFragmentRoot;
   IsFragmentMode := True;
 end;