Browse Source

* HTML parser: in case of malformed input, do not create attributes with invalid names (Mantis #16916).
* Along the way, eliminated one layer of useless string conversion from wide to ANSI and back.

git-svn-id: trunk@15564 -

sergei 15 years ago
parent
commit
7e2f713d09
2 changed files with 40 additions and 32 deletions
  1. 30 32
      packages/fcl-xml/src/sax_html.pp
  2. 10 0
      packages/fcl-xml/src/xmlutils.pp

+ 30 - 32
packages/fcl-xml/src/sax_html.pp

@@ -31,7 +31,7 @@ unit SAX_HTML;
 
 
 interface
 interface
 
 
-uses SysUtils, Classes, SAX, DOM, DOM_HTML,htmldefs;
+uses SysUtils, Classes, SAX, DOM, DOM_HTML,htmldefs,xmlutils;
 
 
 type
 type
 
 
@@ -54,8 +54,8 @@ type
     FAttrNameRead: Boolean;
     FAttrNameRead: Boolean;
     FStack: array of THTMLElementTag;
     FStack: array of THTMLElementTag;
     FNesting: Integer;
     FNesting: Integer;
-    procedure AutoClose(const aName: string);
-    procedure NamePush(const aName: string);
+    procedure AutoClose(const aName: SAXString);
+    procedure NamePush(const aName: SAXString);
     procedure NamePop;
     procedure NamePop;
   protected
   protected
     procedure EnterNewScannerContext(NewContext: THTMLScannerContext);
     procedure EnterNewScannerContext(NewContext: THTMLScannerContext);
@@ -271,12 +271,14 @@ begin
   end;
   end;
 end;
 end;
 
 
-function LookupTag(const aName: string): THTMLElementTag;
+function LookupTag(const aName: SAXString): THTMLElementTag;
 var
 var
   j: THTMLElementTag;
   j: THTMLElementTag;
+  ansiName: string;
 begin
 begin
+  ansiName := aName;
   for j := Low(THTMLElementTag) to High(THTMLElementTag) do
   for j := Low(THTMLElementTag) to High(THTMLElementTag) do
-    if SameText(HTMLElementProps[j].Name, aName) then
+    if SameText(HTMLElementProps[j].Name, ansiName) then
     begin
     begin
       Result := j;
       Result := j;
       Exit;
       Exit;
@@ -284,7 +286,7 @@ begin
   Result := etUnknown;
   Result := etUnknown;
 end;
 end;
 
 
-procedure THTMLReader.AutoClose(const aName: string);
+procedure THTMLReader.AutoClose(const aName: SAXString);
 var
 var
   newTag: THTMLElementTag;
   newTag: THTMLElementTag;
 begin
 begin
@@ -296,7 +298,7 @@ begin
   end;
   end;
 end;
 end;
 
 
-procedure THTMLReader.NamePush(const aName: string);
+procedure THTMLReader.NamePush(const aName: SAXString);
 var
 var
   tag: THTMLElementTag;
   tag: THTMLElementTag;
 begin
 begin
@@ -315,27 +317,27 @@ begin
   FStack[FNesting] := etUnknown;
   FStack[FNesting] := etUnknown;
 end;
 end;
 
 
-function SplitTagString(const s: String; var Attr: TSAXAttributes): String;
+function SplitTagString(const s: SAXString; var Attr: TSAXAttributes): SAXString;
 var
 var
   i, j: Integer;
   i, j: Integer;
-  AttrName: String;
-  ValueDelimiter: Char;
+  AttrName: SAXString;
+  ValueDelimiter: WideChar;
   DoIncJ: Boolean;
   DoIncJ: Boolean;
 begin
 begin
   Attr := nil;
   Attr := nil;
   i := 1;
   i := 1;
-  while (i <= Length(s)) and not (s[i] in WhitespaceChars) do
+  while (i <= Length(s)) and not IsXMLWhitespace(s[i]) do
     Inc(i);
     Inc(i);
 
 
   if i = Length(s) then
   if i = Length(s) then
-    Result := LowerCase(s)
+    Result := s
   else
   else
   begin
   begin
-    Result := LowerCase(Copy(s, 1, i - 1));
+    Result := Copy(s, 1, i - 1);
     Attr := TSAXAttributes.Create;
     Attr := TSAXAttributes.Create;
     Inc(i);
     Inc(i);
 
 
-    while (i <= Length(s)) and (s[i] in WhitespaceChars) do
+    while (i <= Length(s)) and IsXMLWhitespace(s[i]) do
       Inc(i);
       Inc(i);
 
 
     SetLength(AttrName, 0);
     SetLength(AttrName, 0);
@@ -344,7 +346,8 @@ begin
     while j <= Length(s) do
     while j <= Length(s) do
       if s[j] = '=' then
       if s[j] = '=' then
       begin
       begin
-        AttrName := LowerCase(Copy(s, i, j - i));
+        AttrName := Copy(s, i, j - i);
+        WStrLower(AttrName);
         Inc(j);
         Inc(j);
         if (j < Length(s)) and ((s[j] = '''') or (s[j] = '"')) then
         if (j < Length(s)) and ((s[j] = '''') or (s[j] = '"')) then
         begin
         begin
@@ -356,7 +359,7 @@ begin
         DoIncJ := False;
         DoIncJ := False;
         while j <= Length(s) do
         while j <= Length(s) do
           if ValueDelimiter = #0 then
           if ValueDelimiter = #0 then
-            if s[j] in WhitespaceChars then
+            if IsXMLWhitespace(s[j]) then
               break
               break
             else
             else
               Inc(j)
               Inc(j)
@@ -367,31 +370,34 @@ begin
           end else
           end else
             Inc(j);
             Inc(j);
 
 
-        Attr.AddAttribute('', AttrName, '', '', Copy(s, i, j - i));
+        if IsXMLName(AttrName) then
+          Attr.AddAttribute('', AttrName, '', '', Copy(s, i, j - i));
 
 
         if DoIncJ then
         if DoIncJ then
           Inc(j);
           Inc(j);
 
 
-        while (j <= Length(s)) and (s[j] in WhitespaceChars) do
+        while (j <= Length(s)) and IsXMLWhitespace(s[j]) do
           Inc(j);
           Inc(j);
         i := j;
         i := j;
       end
       end
-      else if s[j] in WhitespaceChars then
+      else if IsXMLWhitespace(s[j]) then
       begin
       begin
-        Attr.AddAttribute('', Copy(s, i, j - i), '', '', '');
+        if IsXMLName(@s[i], j-i) then
+          Attr.AddAttribute('', Copy(s, i, j - i), '', '', '');
         Inc(j);
         Inc(j);
-        while (j <= Length(s)) and (s[j] in WhitespaceChars) do
+        while (j <= Length(s)) and IsXMLWhitespace(s[j]) do
           Inc(j);
           Inc(j);
         i := j;
         i := j;
       end else
       end else
         Inc(j);
         Inc(j);
   end;
   end;
+  WStrLower(result);
 end;
 end;
 
 
 procedure THTMLReader.EnterNewScannerContext(NewContext: THTMLScannerContext);
 procedure THTMLReader.EnterNewScannerContext(NewContext: THTMLScannerContext);
 var
 var
   Attr: TSAXAttributes;
   Attr: TSAXAttributes;
-  TagName: String;
+  TagName: SAXString;
   Ent: SAXChar;
   Ent: SAXChar;
   i: Integer;
   i: Integer;
   elTag: THTMLElementTag;
   elTag: THTMLElementTag;
@@ -502,30 +508,22 @@ end;
 procedure THTMLToDOMConverter.ReaderCharacters(Sender: TObject;
 procedure THTMLToDOMConverter.ReaderCharacters(Sender: TObject;
   const ch: PSAXChar; Start, Count: Integer);
   const ch: PSAXChar; Start, Count: Integer);
 var
 var
-  s: SAXString;
   NodeInfo: THTMLNodeInfo;
   NodeInfo: THTMLNodeInfo;
 begin
 begin
-  SetLength(s, Count);
-  Move(ch^, s[1], Count * SizeOf(SAXChar));
-
   NodeInfo := THTMLNodeInfo.Create;
   NodeInfo := THTMLNodeInfo.Create;
   NodeInfo.NodeType := ntText;
   NodeInfo.NodeType := ntText;
-  NodeInfo.DOMNode := FDocument.CreateTextNode(s);
+  NodeInfo.DOMNode := FDocument.CreateTextNodeBuf(ch, Count, False);
   FNodeBuffer.Add(NodeInfo);
   FNodeBuffer.Add(NodeInfo);
 end;
 end;
 
 
 procedure THTMLToDOMConverter.ReaderIgnorableWhitespace(Sender: TObject;
 procedure THTMLToDOMConverter.ReaderIgnorableWhitespace(Sender: TObject;
   const ch: PSAXChar; Start, Count: Integer);
   const ch: PSAXChar; Start, Count: Integer);
 var
 var
-  s: SAXString;
   NodeInfo: THTMLNodeInfo;
   NodeInfo: THTMLNodeInfo;
 begin
 begin
-  SetLength(s, Count);
-  Move(ch^, s[1], Count * SizeOf(SAXChar));
-
   NodeInfo := THTMLNodeInfo.Create;
   NodeInfo := THTMLNodeInfo.Create;
   NodeInfo.NodeType := ntWhitespace;
   NodeInfo.NodeType := ntWhitespace;
-  NodeInfo.DOMNode := FDocument.CreateTextNode(s);
+  NodeInfo.DOMNode := FDocument.CreateTextNodeBuf(ch, Count, False);
   FNodeBuffer.Add(NodeInfo);
   FNodeBuffer.Add(NodeInfo);
 end;
 end;
 
 

+ 10 - 0
packages/fcl-xml/src/xmlutils.pp

@@ -35,6 +35,7 @@ function IsXmlWhiteSpace(c: WideChar): Boolean;
 function Hash(InitValue: LongWord; Key: PWideChar; KeyLen: Integer): LongWord;
 function Hash(InitValue: LongWord; Key: PWideChar; KeyLen: Integer): LongWord;
 { beware, works in ASCII range only }
 { beware, works in ASCII range only }
 function WStrLIComp(S1, S2: PWideChar; Len: Integer): Integer;
 function WStrLIComp(S1, S2: PWideChar; Len: Integer): Integer;
+procedure WStrLower(var S: WideString);
 
 
 type
 type
   TXMLVersion = (xmlVersionUnknown, xmlVersion10, xmlVersion11);
   TXMLVersion = (xmlVersionUnknown, xmlVersion10, xmlVersion11);
@@ -385,6 +386,15 @@ begin
   result := c1 - c2;
   result := c1 - c2;
 end;
 end;
 
 
+procedure WStrLower(var S: WideString); { Lowercases S in place; only characters in 'A'..'Z' are changed, everything else is left as-is }
+var
+  i: Integer;
+begin
+  for i := 1 to Length(S) do
+    if (S[i] >= 'A') and (S[i] <= 'Z') then
+      Inc(word(S[i]), 32); { adding 32 to the character code maps 'A'..'Z' onto 'a'..'z' }
+end;
+
 function Hash(InitValue: LongWord; Key: PWideChar; KeyLen: Integer): LongWord;
 function Hash(InitValue: LongWord; Key: PWideChar; KeyLen: Integer): LongWord;
 begin
 begin
   Result := InitValue;
   Result := InitValue;