You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2019/09/24 16:29:23 UTC

svn commit: r1867468 - in /manifoldcf/trunk: ./ connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ framework/connector-common/src/main/java/org/apache/manifoldcf/connectorcommon/fuzzyml/

Author: kwright
Date: Tue Sep 24 16:29:23 2019
New Revision: 1867468

URL: http://svn.apache.org/viewvc?rev=1867468&view=rev
Log:
Fix for CONNECTORS-1623.

Modified:
    manifoldcf/trunk/CHANGES.txt
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java
    manifoldcf/trunk/framework/connector-common/src/main/java/org/apache/manifoldcf/connectorcommon/fuzzyml/TagParseState.java

Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1867468&r1=1867467&r2=1867468&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Tue Sep 24 16:29:23 2019
@@ -6,6 +6,9 @@ $Id$
 
 ======================= Release 2.14 =====================
 
+CONNECTORS-1623: Harden HTML parser against stuff in SCRIPT tags that looks like tags.
+(Julien Massiera, Karl Wright)
+
 CONNECTORS-1566: Develop csws connector to replace deprecated LiveLink connector.
 (Markus Schuch, Karl Wright)
 

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java?rev=1867468&r1=1867467&r2=1867468&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java Tue Sep 24 16:29:23 2019
@@ -63,6 +63,11 @@ public class ScriptParseState extends HT
   }
 
   @Override
+  protected boolean acceptNewTag() {
+    return scriptParseState != SCRIPTPARSESTATE_INSCRIPT;
+  }
+    
+  @Override
   protected boolean noteTagEnd(String tagName)
     throws ManifoldCFException
   {

Modified: manifoldcf/trunk/framework/connector-common/src/main/java/org/apache/manifoldcf/connectorcommon/fuzzyml/TagParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/connector-common/src/main/java/org/apache/manifoldcf/connectorcommon/fuzzyml/TagParseState.java?rev=1867468&r1=1867467&r2=1867468&view=diff
==============================================================================
--- manifoldcf/trunk/framework/connector-common/src/main/java/org/apache/manifoldcf/connectorcommon/fuzzyml/TagParseState.java (original)
+++ manifoldcf/trunk/framework/connector-common/src/main/java/org/apache/manifoldcf/connectorcommon/fuzzyml/TagParseState.java Tue Sep 24 16:29:23 2019
@@ -285,7 +285,7 @@ public class TagParseState extends Singl
       }
       else if (bTagDepth == 0)
       {
-        if (isWhitespace(thisChar))
+        if (isWhitespace(thisChar) || !acceptNewTag())
         {
           // Not a tag.
           currentState = TAGPARSESTATE_NORMAL;
@@ -937,6 +937,12 @@ public class TagParseState extends Singl
     return false;
   }
 
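+  /** Return true if a new tag may be started at this point; subclasses may override to return false, in which case the text is treated as not a tag.
+   */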
+  protected boolean acceptNewTag() {
+    return true;
+  }
+    
   /** Allocate the buffer.
   */
   protected StringBuilder newBuffer()