You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2019/09/24 16:30:38 UTC

svn commit: r1867469 - in /manifoldcf/branches/release-2.14-branch: ./ connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ framework/connector-common/src/main/java/org/apache/manifoldcf/connectorcommon/fuzzyml/

Author: kwright
Date: Tue Sep 24 16:30:38 2019
New Revision: 1867469

URL: http://svn.apache.org/viewvc?rev=1867469&view=rev
Log:
Pull up fix for CONNECTORS-1623 from trunk

Modified:
    manifoldcf/branches/release-2.14-branch/   (props changed)
    manifoldcf/branches/release-2.14-branch/CHANGES.txt
    manifoldcf/branches/release-2.14-branch/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java
    manifoldcf/branches/release-2.14-branch/framework/connector-common/src/main/java/org/apache/manifoldcf/connectorcommon/fuzzyml/TagParseState.java

Propchange: manifoldcf/branches/release-2.14-branch/
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Tue Sep 24 16:30:38 2019
@@ -155,4 +155,4 @@
 /manifoldcf/branches/CONNECTORS-981:1605049-1605773
 /manifoldcf/branches/CONNECTORS-989:1611600-1612101
 /manifoldcf/branches/CONNECTORS-990:1610284-1610707
-/manifoldcf/trunk:1867444,1867447
+/manifoldcf/trunk:1867444,1867447,1867468

Modified: manifoldcf/branches/release-2.14-branch/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/branches/release-2.14-branch/CHANGES.txt?rev=1867469&r1=1867468&r2=1867469&view=diff
==============================================================================
--- manifoldcf/branches/release-2.14-branch/CHANGES.txt (original)
+++ manifoldcf/branches/release-2.14-branch/CHANGES.txt Tue Sep 24 16:30:38 2019
@@ -3,6 +3,9 @@ $Id$
 
 ======================= Release 2.14 =====================
 
+CONNECTORS-1623: Harden HTML parser against stuff in SCRIPT tags that looks like tags.
+(Julien Massiera, Karl Wright)
+
 CONNECTORS-1566: Develop csws connector to replace deprecated LiveLink connector.
 (Markus Schuch, Karl Wright)
 

Modified: manifoldcf/branches/release-2.14-branch/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/release-2.14-branch/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java?rev=1867469&r1=1867468&r2=1867469&view=diff
==============================================================================
--- manifoldcf/branches/release-2.14-branch/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java (original)
+++ manifoldcf/branches/release-2.14-branch/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java Tue Sep 24 16:30:38 2019
@@ -63,6 +63,11 @@ public class ScriptParseState extends HT
   }
 
   @Override
+  protected boolean acceptNewTag() {
+    return scriptParseState != SCRIPTPARSESTATE_INSCRIPT;
+  }
+    
+  @Override
   protected boolean noteTagEnd(String tagName)
     throws ManifoldCFException
   {

Modified: manifoldcf/branches/release-2.14-branch/framework/connector-common/src/main/java/org/apache/manifoldcf/connectorcommon/fuzzyml/TagParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/release-2.14-branch/framework/connector-common/src/main/java/org/apache/manifoldcf/connectorcommon/fuzzyml/TagParseState.java?rev=1867469&r1=1867468&r2=1867469&view=diff
==============================================================================
--- manifoldcf/branches/release-2.14-branch/framework/connector-common/src/main/java/org/apache/manifoldcf/connectorcommon/fuzzyml/TagParseState.java (original)
+++ manifoldcf/branches/release-2.14-branch/framework/connector-common/src/main/java/org/apache/manifoldcf/connectorcommon/fuzzyml/TagParseState.java Tue Sep 24 16:30:38 2019
@@ -285,7 +285,7 @@ public class TagParseState extends Singl
       }
       else if (bTagDepth == 0)
       {
-        if (isWhitespace(thisChar))
+        if (isWhitespace(thisChar) || !acceptNewTag())
         {
           // Not a tag.
           currentState = TAGPARSESTATE_NORMAL;
@@ -937,6 +937,12 @@ public class TagParseState extends Singl
     return false;
   }
 
+  /** Allow parsing within tag.
+   */
+  protected boolean acceptNewTag() {
+    return true;
+  }
+    
   /** Allocate the buffer.
   */
   protected StringBuilder newBuffer()