You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2010/09/08 15:10:07 UTC

svn commit: r995042 - in /incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/acf/crawler/connectors/webcrawler: WebcrawlerConfig.java WebcrawlerConnector.java

Author: kwright
Date: Wed Sep  8 13:10:06 2010
New Revision: 995042

URL: http://svn.apache.org/viewvc?rev=995042&view=rev
Log:
Add feature to limit the domain hosts to just those represented by seeds.  CONNECTORS-104.

Modified:
    incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/acf/crawler/connectors/webcrawler/WebcrawlerConfig.java
    incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/acf/crawler/connectors/webcrawler/WebcrawlerConnector.java

Modified: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/acf/crawler/connectors/webcrawler/WebcrawlerConfig.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/acf/crawler/connectors/webcrawler/WebcrawlerConfig.java?rev=995042&r1=995041&r2=995042&view=diff
==============================================================================
--- incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/acf/crawler/connectors/webcrawler/WebcrawlerConfig.java (original)
+++ incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/acf/crawler/connectors/webcrawler/WebcrawlerConfig.java Wed Sep  8 13:10:06 2010
@@ -113,7 +113,9 @@ public class WebcrawlerConfig
   * if any one matches, causes the URL to be excluded.  These
   * regexps are newline separated, and # starts a comment.  */
   public static final String NODE_EXCLUDES = "excludes";
-
+  /** Limit to seeds.  When value attribute is true, only seed domains will be permitted. */
+  public static final String NODE_LIMITTOSEEDS = "limittoseeds";
+  
 }
 
 

Modified: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/acf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/acf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=995042&r1=995041&r2=995042&view=diff
==============================================================================
--- incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/acf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/acf/crawler/connectors/webcrawler/WebcrawlerConnector.java Wed Sep  8 13:10:06 2010
@@ -573,7 +573,7 @@ public class WebcrawlerConnector extends
     {
       String documentIdentifier = documentIdentifiers[i];
       // Verify that the url is legal
-      if (filter.isDocumentLegal(documentIdentifier))
+      if (filter.isDocumentAndHostLegal(documentIdentifier))
       {
         // The first thing we need to know is whether this url is part of a session-protected area.  We'll use that information
         // later to detect redirection to login.
@@ -3390,7 +3390,8 @@ public class WebcrawlerConnector extends
     String seeds = "";
     String inclusions = ".*\n";
     String exclusions = "";
-
+    boolean includeMatching = false;
+    
     // Now, loop through description
     i = 0;
     while (i < ds.getChildCount())
@@ -3414,6 +3415,14 @@ public class WebcrawlerConnector extends
         if (exclusions == null)
           exclusions = "";
       }
+      else if (sn.getType().equals(org.apache.acf.crawler.connectors.webcrawler.WebcrawlerConfig.NODE_LIMITTOSEEDS))
+      {
+        String value = sn.getAttributeValue(WebcrawlerConfig.ATTR_VALUE);
+        if (value == null || value.equals("false"))
+          includeMatching = false;
+        else
+          includeMatching = true;
+      }
     }
 
     // Seeds tab
@@ -3594,7 +3603,6 @@ public class WebcrawlerConnector extends
     }
 
     // Inclusions tab
-
     if (tabName.equals("Inclusions"))
     {
       out.print(
@@ -3605,13 +3613,21 @@ public class WebcrawlerConnector extends
 "      <textarea rows=\"25\" cols=\"80\" name=\"inclusions\">"+org.apache.acf.ui.util.Encoder.bodyEscape(inclusions)+"</textarea>\n"+
 "    </td>\n"+
 "  </tr>\n"+
+"  <tr>\n"+
+"    <td class=\"description\"><nobr>Include only hosts matching seeds?</nobr></td>\n"+
+"    <td class=\"value\">\n"+
+"      <input type=\"checkbox\" name=\"matchinghosts\" value=\"true\""+(includeMatching?" checked=\"yes\"":"")+"/>\n"+
+"      <input type=\"hidden\" name=\"matchinghosts_present\" value=\"true\"/>\n"+
+"    </td>\n"+
 "</table>\n"
       );
     }
     else
     {
       out.print(
-"<input type=\"hidden\" name=\"inclusions\" value=\""+org.apache.acf.ui.util.Encoder.attributeEscape(inclusions)+"\"/>\n"
+"<input type=\"hidden\" name=\"inclusions\" value=\""+org.apache.acf.ui.util.Encoder.attributeEscape(inclusions)+"\"/>\n"+
+"<input type=\"hidden\" name=\"matchinghosts\" value=\""+(includeMatching?"true":"false")+"\"/>\n"+
+"<input type=\"hidden\" name=\"matchinghosts_present\" value=\"true\"/>\n"
       );
     }
 
@@ -3873,6 +3889,27 @@ public class WebcrawlerConnector extends
       ds.addChild(ds.getChildCount(),cn);
     }
 
+    // Handle the seeds-only switch
+    String matchingHostsPresent = variableContext.getParameter("matchinghosts_present");
+    if (matchingHostsPresent != null)
+    {
+      // Delete existing switch record first
+      int i = 0;
+      while (i < ds.getChildCount())
+      {
+        SpecificationNode sn = ds.getChild(i);
+        if (sn.getType().equals(org.apache.acf.crawler.connectors.webcrawler.WebcrawlerConfig.NODE_LIMITTOSEEDS))
+          ds.removeChild(i);
+        else
+          i++;
+      }
+
+      String matchingHosts = variableContext.getParameter("matchinghosts");
+      SpecificationNode cn = new SpecificationNode(org.apache.acf.crawler.connectors.webcrawler.WebcrawlerConfig.NODE_LIMITTOSEEDS);
+      cn.setAttribute(org.apache.acf.crawler.connectors.webcrawler.WebcrawlerConfig.ATTR_VALUE,(matchingHosts==null||matchingHosts.equals("false"))?"false":"true");
+      ds.addChild(ds.getChildCount(),cn);
+    }
+    
     // Get the exclusions
     String exclusions = variableContext.getParameter("exclusions");
     if (exclusions != null)
@@ -4091,7 +4128,8 @@ public class WebcrawlerConnector extends
     String seeds = "";
     String inclusions = ".*\n";
     String exclusions = "";
-
+    boolean includeMatching = false;
+    
     int i = 0;
     while (i < ds.getChildCount())
     {
@@ -4114,6 +4152,14 @@ public class WebcrawlerConnector extends
         if (exclusions == null)
           exclusions = "";
       }
+      else if (sn.getType().equals(org.apache.acf.crawler.connectors.webcrawler.WebcrawlerConfig.NODE_LIMITTOSEEDS))
+      {
+        String value = sn.getAttributeValue(WebcrawlerConfig.ATTR_VALUE);
+        if (value == null || value.equals("false"))
+          includeMatching = false;
+        else
+          includeMatching = true;
+      }
     }
     out.print(
 "<table class=\"displaytable\">\n"+
@@ -4236,6 +4282,16 @@ public class WebcrawlerConnector extends
     out.print(
 "  <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+
 "  <tr>\n"+
+"    <td class=\"description\"><nobr>Include only hosts mentioned in seeds?</nobr></td>\n"+
+"    <td class=\"value\">\n"+
+"    "+(includeMatching?"yes":"no")+"\n"+
+"    </td>\n"+
+"  </tr>\n"
+    );
+
+    out.print(
+"  <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+
+"  <tr>\n"+
 "    <td class=\"description\"><nobr>Includes:</nobr></td>\n"+
 "    <td class=\"value\">\n"
     );
@@ -4737,7 +4793,13 @@ public class WebcrawlerConnector extends
           Logging.connectors.debug("WEB: Can't use url '"+rawURL+"' because it has an unsupported protocol '"+protocol+"'");
         return null;
       }
-
+      if (!filter.isHostLegal(host))
+      {
+        if (Logging.connectors.isDebugEnabled())
+          Logging.connectors.debug("WEB: Can't use url '"+rawURL+"' because its host is not found in the seeds ('"+host+"')");
+        return null;
+      }
+      
       // Canonicalization procedure.
       // The query part of the URL may contain bad parameters (session id's, for instance), or may be ordered in such a
       // way as to prevent an effectively identical URL from being matched.  The anchor part of the URL should also be stripped.
@@ -7121,6 +7183,9 @@ public class WebcrawlerConnector extends
     protected ArrayList includePatterns = new ArrayList();
     /** The arraylist of exclude patterns */
     protected ArrayList excludePatterns = new ArrayList();
+    /** The hash map of seed hosts (each host maps to itself); urls are limited to these hosts when non-null, and are not limited by host when null */
+    protected HashMap seedHosts = null;
+    
     /** Canonicalization policies */
     protected CanonicalizationPolicies canonicalizationPolicies = new CanonicalizationPolicies();
 
@@ -7134,11 +7199,20 @@ public class WebcrawlerConnector extends
     {
       String includes = "";
       String excludes = "";
+      String seeds = "";
+      boolean limitToSeeds = false;
       int i = 0;
       while (i < spec.getChildCount())
       {
         SpecificationNode sn = spec.getChild(i++);
-        if (sn.getType().equals(WebcrawlerConfig.NODE_INCLUDES))
+        if (sn.getType().equals(WebcrawlerConfig.NODE_SEEDS))
+        {
+          // Save the seeds aside; we'll parse them only if we need to.
+          seeds = sn.getValue();
+          if (seeds == null)
+            seeds = "";
+        }
+        else if (sn.getType().equals(WebcrawlerConfig.NODE_INCLUDES))
         {
           includes = sn.getValue();
           if (includes == null)
@@ -7150,6 +7224,14 @@ public class WebcrawlerConnector extends
           if (excludes == null)
             excludes = "";
         }
+        else if (sn.getType().equals(WebcrawlerConfig.NODE_LIMITTOSEEDS))
+        {
+          String value = sn.getAttributeValue(WebcrawlerConfig.ATTR_VALUE);
+          if (value == null || value.equals("false"))
+            limitToSeeds = false;
+          else
+            limitToSeeds = true;
+        }
         else if (sn.getType().equals("urlspec"))
         {
           String urlRegexp = sn.getAttributeValue("regexp");
@@ -7230,8 +7312,76 @@ public class WebcrawlerConnector extends
       compileList(includePatterns,list);
       list = stringToArray(excludes);
       compileList(excludePatterns,list);
+      
+      if (limitToSeeds)
+      {
+        seedHosts = new HashMap();
+        // Parse all URLs, and put their hosts into the hash table.
+        // Break up the seeds string and iterate over the results.
+        list = stringToArray(seeds);
+        // Only seeds that parse as URIs and have a host contribute an entry; anything else is skipped.
+        int index = 0;
+        while (index < list.size())
+        {
+          String urlCandidate = (String)list.get(index++);
+          try
+          {
+            java.net.URI url = new java.net.URI(urlCandidate);
+
+            String host = url.getHost();
+
+            if (host != null)
+              seedHosts.put(host,host);
+          }
+          catch (java.net.URISyntaxException e)
+          {
+            // Skip the entry
+          }
+          catch (java.lang.IllegalArgumentException e)
+          {
+            // Skip the entry
+          }
+
+        }
+      }
     }
 
+    /** Check if both a document and host are legal.  The url must pass isDocumentLegal(); then, if seed-host limiting is in effect (seedHosts non-null), its host must also pass isHostLegal().
+    * Returns false when the url cannot be parsed as a java.net.URI or has no host component. */
+    public boolean isDocumentAndHostLegal(String url)
+    {
+      if (!isDocumentLegal(url))
+        return false;
+      if (seedHosts == null)  // null map: no host limiting, so the document check alone decides
+        return true;
+      try
+      {
+        java.net.URI uri = new java.net.URI(url);
+        String host = uri.getHost();
+        if (host == null)  // no host component to compare against the seed hosts
+          return false;
+        return isHostLegal(host);
+      }
+      catch (java.net.URISyntaxException e)
+      {
+        return false;  // unparseable url is treated as not legal
+      }
+      catch (java.lang.IllegalArgumentException e)
+      {
+        return false;  // likewise treated as not legal
+      }
+
+    }
+    
+    /** Check if a host is legal.  Every host is legal when seedHosts is null; otherwise the host must be present as a key in seedHosts (an exact String match).
+    * NOTE(review): the match is case-sensitive, while host names are not -- confirm that callers and the seed parsing agree on case. */
+    public boolean isHostLegal(String host)
+    {
+      if (seedHosts == null)  // null map: host limiting is not in effect
+        return true;
+      return seedHosts.get(host) != null;
+    }
+    
     /** Check if the document identifier is legal.
     */
     public boolean isDocumentLegal(String url)