Browse Source

Reworked entity reference handling in SAX parsers:
* htmldefs.pp - no longer limited to Latin-1; uses binary search instead of linear search.
* sax_html.pp - no longer emits SkippedEntity events; any reference is either resolved or handled as text.
* sax_xml.pp - in contrast to HTML, never handles entities as text (either resolved or passed to SkippedEntity).

git-svn-id: trunk@13368 -

sergei 16 years ago
parent
commit
173a0647a3
3 changed files with 337 additions and 257 deletions
  1. 330 205
      packages/fcl-xml/src/htmldefs.pp
  2. 3 33
      packages/fcl-xml/src/sax_html.pp
  3. 4 19
      packages/fcl-xml/src/sax_xml.pp

+ 330 - 205
packages/fcl-xml/src/htmldefs.pp

@@ -390,174 +390,7 @@ const
       'radio','submit','reset','file','hidden','image','button');
   HTMLbuttontype : array [THTMLbuttontype] of string = ('','submit','reset','button');
 
-
-  // ISO8859-1 mapping:
-  HTMLEntities: array[#160..#255] of String = (
-    // 160-191
-    'nbsp', 'iexcl', 'cent', 'pound', 'curren', 'yen', 'brvbar', 'sect',
-    'uml', 'copy', 'ordf', 'laquo', 'not', 'shy', 'reg', 'macr',
-    'deg', 'plusmn', 'sup2', 'sup3', 'acute', 'micro', 'para', 'middot',
-    'cedil', 'sup1', 'ordm', 'raquo', 'frac14', 'frac12', 'frac34', 'iquest',
-    // 192-223
-    'Agrave', 'Aacute', 'Acirc', 'Atilde', 'Auml', 'Aring', 'AElig', 'Ccedil',
-    'Egrave', 'Eacute', 'Ecirc', 'Euml', 'Igrave', 'Iacute', 'Icirc', 'Iuml',
-    'ETH', 'Ntilde', 'Ograve', 'Oacute', 'Ocirc', 'Otilde', 'Ouml', 'times',
-    'Oslash', 'Ugrave', 'Uacute', 'Ucirc', 'Uuml', 'Yacute', 'THORN', 'szlig',
-    // 224-255
-    'agrave', 'aacute', 'acirc', 'atilde', 'auml', 'aring', 'aelig', 'ccedil',
-    'egrave', 'eacute', 'ecirc', 'euml', 'igrave', 'iacute', 'icirc', 'iuml',
-    'eth', 'ntilde', 'ograve', 'oacute', 'ocirc', 'otilde', 'ouml', 'divide',
-    'oslash', 'ugrave', 'uacute', 'ucirc', 'uuml', 'yacute', 'thorn', 'yuml');
-
-
-  UnicodeHTMLEntities: array[0..141] of String = (
-    'Alpha',    // #913
-    'Beta',     // #914
-    'Gamma',    // #915
-    'Delta',    // #916
-    'Epsilon',  // #917
-    'Zeta',     // #918
-    'Eta',      // #919
-    'Theta',    // #920
-    'Iota',     // #921
-    'Kappa',    // #922
-    'Lambda',   // #923
-    'Mu',       // #924
-    'Nu',       // #925
-    'Xi',       // #926
-    'Omicron',  // #927
-    'Pi',       // #928
-    'Rho',      // #929
-    'Sigma',    // #931
-    'Tau',      // #932
-    'Upsilon',  // #933
-    'Phi',      // #934
-    'Chi',      // #935
-    'Psi',      // #936
-    'Omega',    // #937
-    'alpha',    // #945
-    'beta',     // #946
-    'gamma',    // #947
-    'delta',    // #948
-    'epsilon',  // #949
-    'zeta',     // #950
-    'eta',      // #951
-    'theta',    // #952
-    'iota',     // #953
-    'kappa',    // #954
-    'lambda',   // #955
-    'mu',       // #956
-    'nu',       // #957
-    'xi',       // #958
-    'omicron',  // #959
-    'pi',       // #960
-    'rho',      // #961
-    'sigmaf',   // #962
-    'sigma',    // #963
-    'tau',      // #964
-    'upsilon',  // #965
-    'phi',      // #966
-    'chi',      // #967
-    'psi',      // #968
-    'omega',    // #969
-    'thetasym', // #977
-    'upsih',    // #978
-    'piv',      // #982
-    'ensp',     // #8194
-    'emsp',     // #8195
-    'thinsp',   // #8201
-    'zwnj',     // #8204
-    'zwj',      // #8205
-    'lrm',      // #8206
-    'rlm',      // #8207
-    'ndash',    // #8211
-    'mdash',    // #8212
-    'lsquo',    // #8216
-    'rsquo',    // #8217
-    'sbquo',    // #8218
-    'ldquo',    // #8220
-    'rdquo',    // #8221
-    'bdquo',    // #8222
-    'dagger',   // #8224
-    'Dagger',   // #8225
-    'bull',     // #8226
-    'hellip',   // #8230
-    'permil',   // #8240
-    'prime',    // #8242
-    'lsaquo',   // #8249
-    'rsaquo',   // #8250
-    'oline',    // #8254
-    'frasl',    // #8260
-    'image',    // #8465
-    'weierp',   // #8472
-    'real',     // #8476
-    'trade',    // #8482
-    'alefsym',  // #8501
-    'larr',     // #8592
-    'uarr',     // #8593
-    'rarr',     // #8594
-    'darr',     // #8595
-    'harr',     // #8596
-    'crarr',    // #8629
-    'lArr',     // #8656
-    'uArr',     // #8657
-    'rArr',     // #8658
-    'dArr',     // #8659
-    'hArr',     // #8660
-    'forall',   // #8704
-    'part',     // #8706
-    'exist',    // #8707
-    'empty',    // #8709
-    'nabla',    // #8711
-    'isin',     // #8712
-    'notin',    // #8713
-    'ni',       // #8715
-    'prod',     // #8719
-    'sum',      // #8721
-    'minus',    // #8722
-    'lowast',   // #8727
-    'radic',    // #8730
-    'prop',     // #8733
-    'infin',    // #8734
-    'ang',      // #8736
-    'and',      // #8743
-    'or',       // #8744
-    'cap',      // #8745
-    'cup',      // #8746
-    'int',      // #8747
-    'there4',   // #8756
-    'sim',      // #8764
-    'cong',     // #8773
-    'asymp',    // #8776
-    'ne',       // #8800
-    'equiv',    // #8801
-    'le',       // #8804
-    'ge',       // #8805
-    'sub',      // #8834
-    'sup',      // #8835
-    'nsub',     // #8836
-    'sube',     // #8838
-    'supe',     // #8839
-    'oplus',    // #8853
-    'otimes',   // #8855
-    'perp',     // #8869
-    'sdot',     // #8901
-    'lceil',    // #8968
-    'rceil',    // #8969
-    'lfloor',   // #8970
-    'rfloor',   // #8971
-    'lang',     // #9001
-    'rang',     // #9002
-    'loz',      // #9674
-    'spades',   // #9824
-    'clubs',    // #9827
-    'hearts',   // #9829
-    'diams'     // #9830
-  );
-
-
-
-function ResolveHTMLEntityReference(const Name: String;
+function ResolveHTMLEntityReference(const Name: WideString;
   var Entity: WideChar): Boolean;
 
 function IsAutoClose(NewTag, OldTag: THTMLElementTag): Boolean;
@@ -635,40 +468,333 @@ const
     261, 264
   );
 
-function ResolveHTMLEntityReference(const Name: String;
+{ HTML entities, each preceded by its 2-byte code (high byte first). There is a
+  separate list per name length, each sorted by name (ordinal order) for BSearch.
+  The sole purpose of using AnsiString here is staying compatible with Delphi 7,
+  which is totally broken with respect to handling wide literals.
+}
+
+  ent_2 =
+    #3#$9C  + 'Mu'+
+    #3#$9D  + 'Nu'+
+    #3#$A0  + 'Pi'+
+    #3#$9E  + 'Xi'+
+    #$22#$65+ 'ge'+
+    #0#62   + 'gt'+
+    #$22#$64+ 'le'+
+    #0#60   + 'lt'+
+    #3#$BC  + 'mu'+
+    #$22#$60+ 'ne'+
+    #$22#$0B+ 'ni'+
+    #3#$BD  + 'nu'+
+    #$22#$28+ 'or'+
+    #3#$C0  + 'pi'+
+    #3#$BE  + 'xi';
+
+  ent_3 =
+    #3#$A7  + 'Chi'+
+    #0#208  + 'ETH'+
+    #3#$97  + 'Eta'+
+    #3#$A6  + 'Phi'+
+    #3#$A8  + 'Psi'+
+    #3#$A1  + 'Rho'+
+    #3#$A4  + 'Tau'+
+    #0#38   + 'amp'+
+    #$22#$27+ 'and'+
+    #$22#$20+ 'ang'+
+    #$22#$29+ 'cap'+
+    #3#$C7  + 'chi'+
+    #$22#$2A+ 'cup'+
+    #0#176  + 'deg'+
+    #3#$B7  + 'eta'+
+    #0#240  + 'eth'+
+    #$22#$2B+ 'int'+
+    #$25#$CA+ 'loz'+
+    #$20#$0E+ 'lrm'+
+    #0#172  + 'not'+
+    #3#$C6  + 'phi'+
+    #3#$D6  + 'piv'+
+    #3#$C8  + 'psi'+
+    #0#174  + 'reg'+
+    #3#$C1  + 'rho'+
+    #$20#$0F+ 'rlm'+
+    #0#173  + 'shy'+
+    #$22#$3C+ 'sim'+
+    #$22#$82+ 'sub'+
+    #$22#$11+ 'sum'+
+    #$22#$83+ 'sup'+
+    #3#$C4  + 'tau'+
+    #0#168  + 'uml'+
+    #0#165  + 'yen'+
+    #$20#$0D+ 'zwj';
+
+  ent_4 =
+    #0#196  + 'Auml'+
+    #3#$92  + 'Beta'+
+    #0#203  + 'Euml'+
+    #3#$99  + 'Iota'+
+    #0#207  + 'Iuml'+
+    #0#214  + 'Ouml'+
+    #0#220  + 'Uuml'+
+    #1#$78  + 'Yuml'+
+    #3#$96  + 'Zeta'+
+
+    #0#228  + 'auml'+
+    #3#$B2  + 'beta'+
+    #$20#$22+ 'bull'+
+    #0#162  + 'cent'+
+    #2#$C6  + 'circ'+
+    #$22#$45+ 'cong'+
+    #0#169  + 'copy'+
+    #$21#$D3+ 'dArr'+
+    #$21#$93+ 'darr'+
+    #$20#$03+ 'emsp'+
+    #$20#$02+ 'ensp'+
+    #0#235  + 'euml'+
+    #$20#$AC+ 'euro'+
+    #1#$92  + 'fnof'+
+    #$21#$D4+ 'hArr'+
+    #$21#$94+ 'harr'+
+    #3#$B9  + 'iota'+
+    #$22#$08+ 'isin'+
+    #0#239  + 'iuml'+
+    #$21#$D0+ 'lArr'+
+    #$23#$29+ 'lang'+
+    #$21#$90+ 'larr'+
+    #0#175  + 'macr'+
+    #0#160  + 'nbsp'+
+    #$22#$84+ 'nsub'+
+    #0#170  + 'ordf'+
+    #0#186  + 'ordm'+
+    #0#246  + 'ouml'+
+    #0#182  + 'para'+
+    #$22#$02+ 'part'+
+    #$22#$A5+ 'perp'+
+    #$22#$0F+ 'prod'+
+    #$22#$1D+ 'prop'+
+    #0#34   + 'quot'+
+    #$21#$D2+ 'rArr'+
+    #$23#$2A+ 'rang'+
+    #$21#$92+ 'rarr'+
+    #$21#$1C+ 'real'+
+    #$22#$C5+ 'sdot'+
+    #0#167  + 'sect'+
+    #$22#$86+ 'sube'+
+    #0#185  + 'sup1'+
+    #0#178  + 'sup2'+
+    #0#179  + 'sup3'+
+    #$22#$87+ 'supe'+
+    #$21#$D1+ 'uArr'+
+    #$21#$91+ 'uarr'+
+    #0#252  + 'uuml'+
+    #0#255  + 'yuml'+
+    #3#$B6  + 'zeta'+
+    #$20#$0C+ 'zwnj';
+
+  ent_5 =
+    #0#198  + 'AElig'+
+    #0#194  + 'Acirc'+
+    #3#$91  + 'Alpha'+
+    #0#197  + 'Aring'+
+    #3#$94  + 'Delta'+
+    #0#202  + 'Ecirc'+
+    #3#$93  + 'Gamma'+
+    #0#206  + 'Icirc'+
+    #3#$9A  + 'Kappa'+
+    #1#$52  + 'OElig'+
+    #0#212  + 'Ocirc'+
+    #3#$A9  + 'Omega'+
+    #$20#$33+ 'Prime'+
+    #3#$A3  + 'Sigma'+
+    #0#222  + 'THORN'+
+    #3#$98  + 'Theta'+
+    #0#219  + 'Ucirc'+
+
+    #0#226  + 'acirc'+
+    #0#180  + 'acute'+
+    #0#230  + 'aelig'+
+    #3#$B1  + 'alpha'+
+    #0#229  + 'aring'+
+    #$22#$48+ 'asymp'+
+    #$20#$1E+ 'bdquo'+
+    #0#184  + 'cedil'+
+    #$26#$63+ 'clubs'+
+    #$21#$B5+ 'crarr'+
+    #3#$B4  + 'delta'+
+    #$26#$66+ 'diams'+
+    #0#234  + 'ecirc'+
+    #$22#$05+ 'empty'+
+    #$22#$61+ 'equiv'+
+    #$22#$03+ 'exist'+
+    #$20#$44+ 'frasl'+
+    #3#$B3  + 'gamma'+
+    #0#238  + 'icirc'+
+    #0#161  + 'iexcl'+
+    #$21#$11+ 'image'+
+    #$22#$1E+ 'infin'+
+    #3#$BA  + 'kappa'+
+    #0#171  + 'laquo'+
+    #$23#$08+ 'lceil'+
+    #$20#$1C+ 'ldquo'+
+    #$20#$18+ 'lsquo'+
+    #$20#$14+ 'mdash'+
+    #0#181  + 'micro'+
+    #$22#$12+ 'minus'+
+    #$22#$07+ 'nabla'+
+    #$20#$13+ 'ndash'+
+    #$22#$09+ 'notin'+
+    #0#244  + 'ocirc'+
+    #1#$53  + 'oelig'+
+    #$20#$3E+ 'oline'+
+    #3#$C9  + 'omega'+
+    #$22#$95+ 'oplus'+
+    #0#163  + 'pound'+
+    #$20#$32+ 'prime'+
+    #$22#$1A+ 'radic'+
+    #0#187  + 'raquo'+
+    #$23#$09+ 'rceil'+
+    #$20#$1D+ 'rdquo'+
+    #$20#$19+ 'rsquo'+
+    #$20#$1A+ 'sbquo'+
+    #3#$C3  + 'sigma'+
+    #0#223  + 'szlig'+
+    #3#$B8  + 'theta'+
+    #0#254  + 'thorn'+
+    #2#$DC  + 'tilde'+
+    #0#215  + 'times'+
+    #$21#$22+ 'trade'+
+    #0#251  + 'ucirc'+
+    #3#$D2  + 'upsih';
+
+  ent_6 =
+    #0#193  + 'Aacute'+
+    #0#192  + 'Agrave'+
+    #0#195  + 'Atilde'+
+    #0#199  + 'Ccedil'+
+    #$20#$21+ 'Dagger'+
+    #0#201  + 'Eacute'+
+    #0#200  + 'Egrave'+
+    #0#205  + 'Iacute'+
+    #0#204  + 'Igrave'+
+    #3#$9B  + 'Lambda'+
+    #0#209  + 'Ntilde'+
+    #0#211  + 'Oacute'+
+    #0#210  + 'Ograve'+
+    #0#216  + 'Oslash'+
+    #0#213  + 'Otilde'+
+    #1#$60  + 'Scaron'+
+    #0#218  + 'Uacute'+
+    #0#217  + 'Ugrave'+
+    #0#221  + 'Yacute'+
+
+    #0#225  + 'aacute'+
+    #0#224  + 'agrave'+
+    #0#227  + 'atilde'+
+    #0#166  + 'brvbar'+
+    #0#231  + 'ccedil'+
+    #0#164  + 'curren'+
+    #$20#$20+ 'dagger'+
+    #0#247  + 'divide'+
+    #0#233  + 'eacute'+
+    #0#232  + 'egrave'+
+    #$22#$00+ 'forall'+
+    #0#189  + 'frac12'+
+    #0#188  + 'frac14'+
+    #0#190  + 'frac34'+
+    #$26#$65+ 'hearts'+
+    #$20#$26+ 'hellip'+
+    #0#237  + 'iacute'+
+    #0#236  + 'igrave'+
+    #0#191  + 'iquest'+
+    #3#$BB  + 'lambda'+
+    #$23#$0A+ 'lfloor'+
+    #$22#$17+ 'lowast'+
+    #$20#$39+ 'lsaquo'+
+    #0#183  + 'middot'+
+    #0#241  + 'ntilde'+
+    #0#243  + 'oacute'+
+    #0#242  + 'ograve'+
+    #0#248  + 'oslash'+
+    #0#245  + 'otilde'+
+    #$22#$97+ 'otimes'+
+    #$20#$30+ 'permil'+
+    #0#177  + 'plusmn'+
+    #$23#$0B+ 'rfloor'+
+    #$20#$3A+ 'rsaquo'+
+    #1#$61  + 'scaron'+
+    #3#$C2  + 'sigmaf'+
+    #$26#$60+ 'spades'+
+    #$22#$34+ 'there4'+
+    #$20#$09+ 'thinsp'+
+    #0#250  + 'uacute'+
+    #0#249  + 'ugrave'+
+    #$21#$18+ 'weierp'+
+    #0#253  + 'yacute';
+
+  ent_7 =
+    #3#$95  + 'Epsilon'+
+    #3#$9F  + 'Omicron'+
+    #3#$A5  + 'Upsilon'+
+    #$21#$35+ 'alefsym'+
+    #3#$B5  + 'epsilon'+
+    #3#$BF  + 'omicron'+
+    #3#$C5  + 'upsilon';
+
+  ent_8 =
+    #3#$D1  + 'thetasym';
+
+  strs: array[2..8] of string = (
+    ent_2, ent_3, ent_4, ent_5, ent_6, ent_7, ent_8
+  );
+
+{ Binary search for an entity name in one of the fixed-width tables.
+    P    - entity name to look for; exactly Len characters are compared.
+    Len  - length of the name; every record in data occupies Len+2 bytes.
+    data - concatenated records, each made of two code bytes (high byte
+           first) followed by a Len-character name; records must be sorted
+           by name in ordinal character order.
+  Returns the character code stored with the matching record, or #0 when
+  the name is not present (or data is empty). }
+function BSearch(P: PWideChar; Len: Integer; const data: string): WideChar;
+var
+  L, H, mid, J, C: Integer;
+begin
+  Result := #0;
+  L := 0;
+  // Index of the LAST record. The former bound, (Length(data)+1) div (Len+2),
+  // was the record count itself, so a name sorting after the last entry made
+  // the loop probe a non-existent record and read past the end of data.
+  H := (Length(data) div (Len+2)) - 1;
+  while L <= H do
+  begin
+    mid := L + ((H - L) shr 1);
+    // Compare the name with record number mid, character by character;
+    // the name starts at offset 3 of the record (1-based), after the code.
+    J := 0;
+    repeat
+      C := ord(P[J]) - ord(data[mid*(Len+2)+3+J]);
+      Inc(J);
+    until (C <> 0) or (J >= Len);
+    if C > 0 then
+      L := mid + 1
+    else if C < 0 then
+      H := mid - 1
+    else
+    begin
+      // Match: rebuild the 16-bit code from its two leading bytes.
+      Result := WideChar((ord(data[mid*(Len+2)+1]) shl 8) or ord(data[mid*(Len+2)+2]));
+      Exit;
+    end;
+  end;
+end;
+
+{
+  Remaining issues:
+  1) UTF-16 surrogate pairs
+  2) HTML accepts uppercase 'X' for hex notation, but XML does not.
+  3) 'apos' is used in xml/xhtml, but not in HTML 4.01
+}
+
+function ResolveHTMLEntityReference(const Name: WideString;
   var Entity: WideChar): Boolean;
 var
-  Ent: WideChar;
-  i: Integer;
+  i, L: Integer;
   value: Integer;
 begin
-  if Name = 'quot' then
-  begin
-    Entity := '"';
-    Result := True;
-  end else if Name = 'apos' then
-  begin
-    Entity := '''';
-    Result := True;
-  end else if Name = 'amp' then
-  begin
-    Entity := '&';
-    Result := True;
-  end else if Name = 'lt' then
-  begin
-    Entity := '<';
-    Result := True;
-  end else if Name = 'gt' then
-  begin
-    Entity := '>';
-    Result := True;
-  end else if (Length(Name) > 1) and (Name[1] = '#') then
+  L := Length(Name);
+  if (L > 1) and (Name[1] = '#') then
   begin
     value := 0;
-    if Name[2] in ['x', 'X'] then
+    if (Name[2] = 'x') or (Name[2] = 'X') then
     begin
       i := 3;
-      while i <= Length(Name) do
+      while i <= L do
       begin
         case Name[i] of
           '0'..'9': Value := Value * 16 + Ord(Name[i]) - Ord('0');
@@ -683,7 +809,7 @@ begin
     else
     begin
       i := 2;
-      while i <= Length(Name) do
+      while i <= L do
       begin
         case Name[i] of
           '0'..'9': Value := Value * 10 + Ord(Name[i]) - Ord('0');
@@ -693,19 +819,18 @@ begin
         Inc(i);
       end;
     end;
-    Result := (i = Length(Name)+1);
+    Result := (i = L+1);
     if Result then
       Entity := WideChar(Value);
-  end else
+  end
+  else
   begin
-    for Ent := Low(HTMLEntities) to High(HTMLEntities) do
-      if HTMLEntities[Ent] = Name then
-      begin
-        Entity := Ent;
-        Result := True;
-        exit;
-      end;
-    Result := False;
+    case L of
+      2..8: Entity := BSearch(PWideChar(Name), L, strs[L]);
+    else
+      Entity := #0;
+    end;
+    Result := (Entity <> #0);
   end;
 end;
 

+ 3 - 33
packages/fcl-xml/src/sax_html.pp

@@ -93,7 +93,6 @@ type
       Start, Count: Integer);
     procedure ReaderIgnorableWhitespace(Sender: TObject; const ch: PSAXChar;
       Start, Count: Integer);
-    procedure ReaderSkippedEntity(Sender: TObject; const Name: SAXString);
     procedure ReaderStartElement(Sender: TObject;
       const NamespaceURI, LocalName, RawName: SAXString; Attr: TSAXAttributes);
     procedure ReaderEndElement(Sender: TObject;
@@ -389,7 +388,6 @@ procedure THTMLReader.EnterNewScannerContext(NewContext: THTMLScannerContext);
 var
   Attr: TSAXAttributes;
   TagName: String;
-  Found: Boolean;
   Ent: SAXChar;
   i: Integer;
   elTag: THTMLElementTag;
@@ -405,25 +403,9 @@ begin
     scEntityReference:
       begin
         if ResolveHTMLEntityReference(TokenText, Ent) then
-        begin
-          DoCharacters(@Ent, 0, 1);
-        end else
-        begin
-          { Is this a predefined Unicode character entity? We must check this,
-            as undefined entities must be handled as text, for compatiblity
-            to popular browsers... }
-          Found := False;
-          for i := Low(UnicodeHTMLEntities) to High(UnicodeHTMLEntities) do
-            if UnicodeHTMLEntities[i] = TokenText then
-            begin
-              Found := True;
-              break;
-            end;
-          if Found then
-            DoSkippedEntity(TokenText)
-          else
-            DoCharacters(PSAXChar('&' + TokenText), 0, Length(TokenText) + 2);
-        end;
+          DoCharacters(@Ent, 0, 1)
+        else
+          DoCharacters(PSAXChar('&' + TokenText + ';'), 0, Length(TokenText) + 2);
       end;
     scTag:
       if Length(TokenText) > 0 then
@@ -485,7 +467,6 @@ begin
   FReader := AReader;
   FReader.OnCharacters := @ReaderCharacters;
   FReader.OnIgnorableWhitespace := @ReaderIgnorableWhitespace;
-  FReader.OnSkippedEntity := @ReaderSkippedEntity;
   FReader.OnStartElement := @ReaderStartElement;
   FReader.OnEndElement := @ReaderEndElement;
   FDocument := ADocument;
@@ -544,17 +525,6 @@ begin
   FNodeBuffer.Add(NodeInfo);
 end;
 
-procedure THTMLToDOMConverter.ReaderSkippedEntity(Sender: TObject;
-  const Name: SAXString);
-var
-  NodeInfo: THTMLNodeInfo;
-begin
-  NodeInfo := THTMLNodeInfo.Create;
-  NodeInfo.NodeType := ntEntityReference;
-  NodeInfo.DOMNode := FDocument.CreateEntityReference(Name);
-  FNodeBuffer.Add(NodeInfo);
-end;
-
 procedure THTMLToDOMConverter.ReaderStartElement(Sender: TObject;
   const NamespaceURI, LocalName, RawName: SAXString; Attr: TSAXAttributes);
 var

+ 4 - 19
packages/fcl-xml/src/sax_xml.pp

@@ -343,26 +343,11 @@ begin
       DoCharacters(PSAXChar(TokenText), 0, Length(TokenText));
     scEntityReference:
       begin
+        // TODO: xml must NOT recognize HTML entities, except 5 defined for xml.
         if ResolveHTMLEntityReference(TokenText, Ent) then
-        begin
-          DoCharacters(@Ent, 0, 1);
-        end else
-        begin
-          { Is this a predefined Unicode character entity? We must check this,
-            as undefined entities must be handled as text, for compatiblity
-            to popular browsers... }
-          Found := False;
-          for i := Low(UnicodeHTMLEntities) to High(UnicodeHTMLEntities) do
-            if UnicodeHTMLEntities[i] = TokenText then
-            begin
-              Found := True;
-              break;
-            end;
-          if Found then
-            DoSkippedEntity(TokenText)
-          else
-            DoCharacters(PSAXChar('&' + TokenText), 0, Length(TokenText) + 2);
-        end;
+          DoCharacters(@Ent, 0, 1)
+        else
+          DoSkippedEntity(TokenText);
       end;
     scTag:
       if Length(TokenText) > 0 then