You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2015/04/30 02:42:03 UTC

svn commit: r1676883 - in /manifoldcf/branches/dev_1x: ./ connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/

Author: kwright
Date: Thu Apr 30 00:42:03 2015
New Revision: 1676883

URL: http://svn.apache.org/r1676883
Log:
Pull up part 1 of fix for CONNECTORS-1192 from trunk

Modified:
    manifoldcf/branches/dev_1x/   (props changed)
    manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java
    manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLFormHandler.java
    manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLHrefHandler.java
    manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java
    manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java
    manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java

Propchange: manifoldcf/branches/dev_1x/
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Thu Apr 30 00:42:03 2015
@@ -123,4 +123,4 @@
 /manifoldcf/branches/CONNECTORS-981:1605049-1605773
 /manifoldcf/branches/CONNECTORS-989:1611600-1612101
 /manifoldcf/branches/CONNECTORS-990:1610284-1610707
-/manifoldcf/trunk:1620703,1620748,1620812,1620862,1621449,1621613,1621855,1622213,1622740,1622850,1622853-1622854,1623249,1623251,1623314,1623599,1623951,1623953-1623954,1623956,1623972,1624058,1624085,1624174,1624236,1624377,1624384,1624399,1624449,1624464,1624504,1624729-1624731,1624906,1624909-1624910,1624982,1625023,1625095,1625103,1625108,1625264,1625270,1625394,1625400,1625910,1626090,1626097,1626102,1626638-1626639,1626973,1627687,1627690,1627959,1628046,1628066,1628106,1628168,1628188,1628699,1628798,1628808,1628845,1628905,1629122,1629374-1629375,1629379,1629541,1629994,1630188,1630535,1630623,1630671,1630812,1630885,1631039,1631162,1631164,1631252,1631750,1631953,1632013,1632225,1632289,1632562,1632844,1632847,1632854,1633062-1633063,1633108,1633193,1633202,1633282,1633284,1633295,1633336,1633339,1633345,1633348,1633364,1633378,1633383,1633432,1633546,1633590,1633634,1633668,1633727,1633760,1633764,1633786,1633910,1633923,1634021,1634028,1634067,1634132,1634145,1634148,163
 4155,1634188,1634202,1634264,1634373,1634530,1634688,1634850,1634857,1635103,1635116,1635421,1635438,1635478,1635481,1635484,1635490,1635809,1635939,1636146,1636167,1636180,1636207,1636215,1636232,1636334,1636519,1636570,1636684,1636940,1637011,1637310,1637350,1637364,1637373,1637378,1639259,1639593,1639600,1640018,1640101,1640199,1640204,1640208,1640314,1640319,1640749,1640772,1640805,1640888,1640925,1640941-1640942,1641222,1641328,1641557,1641559,1641629,1641633,1641724,1641754,1641911,1642163,1642255,1642318,1642531,1642650,1642658,1642673,1642716,1644197,1644399,1644538,1644920,1644931,1646317,1646397,1646403,1646408,1646640,1646947,1647574,1647585,1647608,1648686,1648976,1649201,1649203,1649529,1649605,1649628,1649794,1650351,1650722,1650741-1650742,1650745,1650747,1650911,1650954,1651332,1651539,1651907,1651921,1652071,1652974,1653175,1653899,1654651,1655205,1655261,1655264,1655377,1655411,1655618,1655914,1657346,1657443,1658004,1658036,1658121,1658155,1658188,1658463,1658476,
 1660258,1660276,1661454,1665848,1666160,1666781,1666820,1668312,1669100,1669238,1669487,1669523,1669586,1669660,1670614,1670625,1670715,1671496,1672169,1672301,1672616,1672737,1673559,1673573,1673579,1673722,1675781,1675898,1676094
+/manifoldcf/trunk:1620703,1620748,1620812,1620862,1621449,1621613,1621855,1622213,1622740,1622850,1622853-1622854,1623249,1623251,1623314,1623599,1623951,1623953-1623954,1623956,1623972,1624058,1624085,1624174,1624236,1624377,1624384,1624399,1624449,1624464,1624504,1624729-1624731,1624906,1624909-1624910,1624982,1625023,1625095,1625103,1625108,1625264,1625270,1625394,1625400,1625910,1626090,1626097,1626102,1626638-1626639,1626973,1627687,1627690,1627959,1628046,1628066,1628106,1628168,1628188,1628699,1628798,1628808,1628845,1628905,1629122,1629374-1629375,1629379,1629541,1629994,1630188,1630535,1630623,1630671,1630812,1630885,1631039,1631162,1631164,1631252,1631750,1631953,1632013,1632225,1632289,1632562,1632844,1632847,1632854,1633062-1633063,1633108,1633193,1633202,1633282,1633284,1633295,1633336,1633339,1633345,1633348,1633364,1633378,1633383,1633432,1633546,1633590,1633634,1633668,1633727,1633760,1633764,1633786,1633910,1633923,1634021,1634028,1634067,1634132,1634145,1634148,163
 4155,1634188,1634202,1634264,1634373,1634530,1634688,1634850,1634857,1635103,1635116,1635421,1635438,1635478,1635481,1635484,1635490,1635809,1635939,1636146,1636167,1636180,1636207,1636215,1636232,1636334,1636519,1636570,1636684,1636940,1637011,1637310,1637350,1637364,1637373,1637378,1639259,1639593,1639600,1640018,1640101,1640199,1640204,1640208,1640314,1640319,1640749,1640772,1640805,1640888,1640925,1640941-1640942,1641222,1641328,1641557,1641559,1641629,1641633,1641724,1641754,1641911,1642163,1642255,1642318,1642531,1642650,1642658,1642673,1642716,1644197,1644399,1644538,1644920,1644931,1646317,1646397,1646403,1646408,1646640,1646947,1647574,1647585,1647608,1648686,1648976,1649201,1649203,1649529,1649605,1649628,1649794,1650351,1650722,1650741-1650742,1650745,1650747,1650911,1650954,1651332,1651539,1651907,1651921,1652071,1652974,1653175,1653899,1654651,1655205,1655261,1655264,1655377,1655411,1655618,1655914,1657346,1657443,1658004,1658036,1658121,1658155,1658188,1658463,1658476,
 1660258,1660276,1661454,1665848,1666160,1666781,1666820,1668312,1669100,1669238,1669487,1669523,1669586,1669660,1670614,1670625,1670715,1671496,1672169,1672301,1672616,1672737,1673559,1673573,1673579,1673722,1675781,1675898,1676094,1676882

Modified: manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java?rev=1676883&r1=1676882&r2=1676883&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java (original)
+++ manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java Thu Apr 30 00:42:03 2015
@@ -60,10 +60,7 @@ public class FindContentHandler extends
       contentBuffer.append(textCharacter);
     else
     {
-      String bufferContents = contentBuffer.toString();
-      contentBuffer.setLength(0);
-      if (contentPattern.matcher(bufferContents).find())
-        targetURI = "";
+      processBuffer();
     }
   }
 
@@ -123,5 +120,22 @@ public class FindContentHandler extends
   {
   }
 
+  /** Finish up all processing.  Called ONLY if we haven't already aborted.
+  */
+  @Override
+  public void finishUp()
+    throws ManifoldCFException
+  {
+    if (targetURI == null)
+      processBuffer();
+  }
+
+  protected void processBuffer()
+  {
+    String bufferContents = contentBuffer.toString();
+    contentBuffer.setLength(0);
+    if (contentPattern.matcher(bufferContents).find())
+      targetURI = "";
+  }
 
 }

Modified: manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLFormHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLFormHandler.java?rev=1676883&r1=1676882&r2=1676883&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLFormHandler.java (original)
+++ manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLFormHandler.java Thu Apr 30 00:42:03 2015
@@ -188,4 +188,10 @@ public class FindHTMLFormHandler extends
   {
   }
 
+  @Override
+  public void finishUp()
+    throws ManifoldCFException
+  {
+  }
+
 }

Modified: manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLHrefHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLHrefHandler.java?rev=1676883&r1=1676882&r2=1676883&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLHrefHandler.java (original)
+++ manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLHrefHandler.java Thu Apr 30 00:42:03 2015
@@ -144,4 +144,10 @@ public class FindHTMLHrefHandler extends
     noteDiscoveredLink(rawURL);
   }
 
+  @Override
+  public void finishUp()
+    throws ManifoldCFException
+  {
+  }
+
 }

Modified: manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java?rev=1676883&r1=1676882&r2=1676883&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java (original)
+++ manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java Thu Apr 30 00:42:03 2015
@@ -59,4 +59,9 @@ public interface IHTMLHandler extends ID
   public void noteTextCharacter(char textCharacter)
     throws ManifoldCFException;
 
+  /** Done with the document.
+  */
+  public void finishUp()
+    throws ManifoldCFException;
+
 }

Modified: manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java?rev=1676883&r1=1676882&r2=1676883&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java (original)
+++ manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java Thu Apr 30 00:42:03 2015
@@ -67,4 +67,12 @@ public class LinkParseState extends Meta
     return false;
   }
 
+  @Override
+  public void finishUp()
+    throws ManifoldCFException
+  {
+    handler.finishUp();
+    super.finishUp();
+  }
+  
 }

Modified: manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1676883&r1=1676882&r2=1676883&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Thu Apr 30 00:42:03 2015
@@ -6418,6 +6418,12 @@ public class WebcrawlerConnector extends
         noteDiscoveredLink(rawURL);
     }
 
+    @Override
+    public void finishUp()
+      throws ManifoldCFException
+    {
+    }
+
   }
 
   /** Class that describes XML handling */