You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2015/05/26 12:40:16 UTC
svn commit: r1681735 - in /manifoldcf/trunk: ./ connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler...

Author: kwright
Date: Tue May 26 10:40:15 2015
New Revision: 1681735

URL: http://svn.apache.org/r1681735
Log:
Fix for CONNECTORS-1193.  Committed on behalf of Arcadius Ahouansou.

Added:
    manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/DocumentContentExclusionIT.java
Modified:
    manifoldcf/trunk/CHANGES.txt
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_zh_CN.properties
    manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IOutputHistoryActivity.java
    manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IHistoryActivity.java

Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1681735&r1=1681734&r2=1681735&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Tue May 26 10:40:15 2015
@@ -3,6 +3,10 @@ $Id$
 
 ======================= 2.2-dev =====================
 
+CONNECTORS-1193: Add ability to discard web content based on a 
+set of regular expressions.
+(Arcadius Ahouansou)
+
 CONNECTORS-1199: SearchBlox connector formatting non-standard.
 (Karl Wright)
 

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java?rev=1681735&r1=1681734&r2=1681735&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java Tue May 26 10:40:15 2015
@@ -18,15 +18,17 @@
 */
 package org.apache.manifoldcf.crawler.connectors.webcrawler;
 
-import org.apache.manifoldcf.core.interfaces.*;
-import org.apache.manifoldcf.crawler.system.Logging;
-import java.util.regex.*;
-import java.util.*;
+import org.apache.manifoldcf.core.interfaces.ManifoldCFException;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Pattern;
+import static java.util.Arrays.asList;
+import static org.apache.manifoldcf.crawler.system.Logging.connectors;
 
 /** This class is the handler for HTML content grepping during state transitions */
 public class FindContentHandler extends FindHandler implements IHTMLHandler
 {
-  protected final Pattern contentPattern;
+  protected final List<Pattern> contentPatterns;
   protected final StringBuilder contentBuffer = new StringBuilder();
 
   protected final static int MAX_LENGTH = 65536;
@@ -35,7 +37,13 @@ public class FindContentHandler extends
   public FindContentHandler(String parentURI, Pattern contentPattern)
   {
     super(parentURI);
-    this.contentPattern = contentPattern;
+    this.contentPatterns = asList(contentPattern);
+  }
+
+  public FindContentHandler(String parentURI, List<Pattern> contentPatterns)
+  {
+    super(parentURI);
+    this.contentPatterns = contentPatterns;
   }
 
   /** Apply overrides */
@@ -69,10 +77,14 @@ public class FindContentHandler extends
         // continuity
         String bufferContents = contentBuffer.toString();
         contentBuffer.setLength(0);
-        if (contentPattern.matcher(bufferContents).find())
-          targetURI = "";
-        else
-        {
+        for (Pattern contentPattern : contentPatterns) {
+          if (contentPattern.matcher(bufferContents).find()) {
+            targetURI = "";
+            break;
+          }
+        }
+
+        if(targetURI == null) {
           contentBuffer.append(bufferContents.substring(bufferContents.length() - OVERLAP_AMOUNT));
         }
       }
@@ -153,8 +165,12 @@ public class FindContentHandler extends
   {
     String bufferContents = contentBuffer.toString();
     contentBuffer.setLength(0);
-    if (contentPattern.matcher(bufferContents).find())
-      targetURI = "";
+    for(Pattern contentPattern: contentPatterns) {
+      if (contentPattern.matcher(bufferContents).find()) {
+        targetURI = "";
+        return;
+      }
+    }
   }
 
 }

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java?rev=1681735&r1=1681734&r2=1681735&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java Tue May 26 10:40:15 2015
@@ -137,6 +137,12 @@ public class WebcrawlerConfig
   * if any one matches, causes the URL to be excluded from indexing.  These
   * regexps are newline separated, and # starts a comment.  */
   public static final String NODE_EXCLUDESINDEX = "excludesindex";
+
+  /**
+   * Exclude any page containing specified regex in their body from index
+   */
+  public static final String NODE_EXCLUDESCONTENTINDEX = "excludescontentindex";
+
   /** Limit to seeds.  When value attribute is true, only seed domains will be permitted. */
   public static final String NODE_LIMITTOSEEDS = "limittoseeds";
   /** Canonicalization rule.  Attributes are regexp, description, reorder, 

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1681735&r1=1681734&r2=1681735&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Tue May 26 10:40:15 2015
@@ -29,10 +29,6 @@ import org.apache.manifoldcf.connectorco
 import org.apache.manifoldcf.connectorcommon.fuzzyml.*;
 
 import org.apache.http.conn.ConnectTimeoutException;
-import org.apache.http.client.RedirectException;
-import org.apache.http.client.CircularRedirectException;
-import org.apache.http.NoHttpResponseException;
-import org.apache.http.HttpException;
 
 import java.io.*;
 import java.nio.charset.StandardCharsets;
@@ -508,7 +504,7 @@ public class WebcrawlerConnector extends
   *@param activities is the interface this method should use to perform whatever framework actions are desired.
   *@param spec is a document specification (that comes from the job).
   *@param seedTime is the end of the time range of documents to consider, exclusive.
-  *@param lastSeedVersionString is the last seeding version string for this job, or null if the job has no previous seeding version string.
+  *@param lastSeedVersion is the last seeding version string for this job, or null if the job has no previous seeding version string.
   *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
   *@return an updated seeding version string, to be stored with the job.
   */
@@ -1303,7 +1299,7 @@ public class WebcrawlerConnector extends
         activities.noDocument(documentIdentifier,versionString);
         return;
       }
-      
+
       if (activities.checkURLIndexable(documentIdentifier) == false)
       {
         if (Logging.connectors.isDebugEnabled())
@@ -1355,7 +1351,15 @@ public class WebcrawlerConnector extends
         activities.noDocument(documentIdentifier,versionString);
         return;
       }
-      
+
+      if(!filter.isDocumentContentIndexable(documentIdentifier)){
+        if (Logging.connectors.isDebugEnabled())
+          Logging.connectors.debug("Web: For document '"+documentIdentifier+"', not indexing because document content matched document content exclusion rule");
+        errorCode = activities.EXCLUDED_CONTENT;
+        errorDesc = "Rejected due to content exclusion rule";
+        activities.noDocument(documentIdentifier,versionString);
+        return;
+      }
       // Ingest the document
       if (Logging.connectors.isDebugEnabled())
         Logging.connectors.debug("WEB: Decided to ingest '"+documentIdentifier+"'");
@@ -3811,6 +3815,8 @@ public class WebcrawlerConnector extends
     String exclusions = "";
     String inclusionsIndex = ".*\n";
     String exclusionsIndex = "";
+    String exclusionsContentIndex = "";
+    
     boolean includeMatching = true;
     Set<String> excludedHeaders = new HashSet<String>();
     
@@ -3861,6 +3867,12 @@ public class WebcrawlerConnector extends
         if (exclusionsIndex == null)
           exclusionsIndex = "";
       }
+      else if (sn.getType().equals(WebcrawlerConfig.NODE_EXCLUDESCONTENTINDEX))
+      {
+        exclusionsContentIndex = sn.getValue();
+        if (exclusionsContentIndex == null)
+        	exclusionsContentIndex = "";
+      }
       else if (sn.getType().equals(WebcrawlerConfig.NODE_LIMITTOSEEDS))
       {
         String value = sn.getAttributeValue(WebcrawlerConfig.ATTR_VALUE);
@@ -4218,14 +4230,21 @@ public class WebcrawlerConnector extends
 "      <textarea rows=\"10\" cols=\"60\" name=\""+seqPrefix+"exclusionsindex\">"+Encoder.bodyEscape(exclusionsIndex)+"</textarea>\n"+
 "    </td>\n"+
 "  </tr>\n"+
+"  <tr>\n"+
+"    <td class=\"description\" colspan=\"1\"><nobr>" + Messages.getBodyString(locale,"WebcrawlerConnector.ExcludeContentFromIndex") + "</nobr></td>\n"+
+"    <td class=\"value\" colspan=\"1\">\n"+
+"      <textarea rows=\"10\" cols=\"60\" name=\""+seqPrefix+"exclusionscontentindex\">"+Encoder.bodyEscape(exclusionsContentIndex)+"</textarea>\n"+
+"    </td>\n"+
+"  </tr>\n"+
 "</table>\n"
       );
     }
     else
     {
       out.print(
-"<input type=\"hidden\" name=\""+seqPrefix+"exclusions\" value=\""+Encoder.attributeEscape(exclusions)+"\"/>\n"+
-"<input type=\"hidden\" name=\""+seqPrefix+"exclusionsindex\" value=\""+Encoder.attributeEscape(exclusionsIndex)+"\"/>\n"
+              "<input type=\"hidden\" name=\"" + seqPrefix + "exclusions\" value=\"" + Encoder.attributeEscape(exclusions) + "\"/>\n" +
+                      "<input type=\"hidden\" name=\"" + seqPrefix + "exclusionsindex\" value=\"" + Encoder.attributeEscape(exclusionsIndex) + "\"/>\n" +
+                      "<input type=\"hidden\" name=\"" + seqPrefix + "exclusionscontentindex\" value=\"" + Encoder.attributeEscape(exclusionsContentIndex) + "\"/>\n"
       );
     }
   
@@ -4586,6 +4605,26 @@ public class WebcrawlerConnector extends
       ds.addChild(ds.getChildCount(),cn);
     }
 
+    // Get the content index exclusions
+    String exclusionsContentIndex = variableContext.getParameter(seqPrefix+"exclusionscontentindex");
+    if (exclusionsContentIndex != null)
+    {
+      // Delete existing content exclusions record first
+      int i = 0;
+      while (i < ds.getChildCount())
+      {
+        SpecificationNode sn = ds.getChild(i);
+        if (sn.getType().equals(WebcrawlerConfig.NODE_EXCLUDESCONTENTINDEX))
+          ds.removeChild(i);
+        else
+          i++;
+      }
+
+      SpecificationNode cn = new SpecificationNode(WebcrawlerConfig.NODE_EXCLUDESCONTENTINDEX);
+      cn.setValue(exclusionsContentIndex);
+      ds.addChild(ds.getChildCount(),cn);
+    }
+
     // Read the url specs
     String urlRegexpCount = variableContext.getParameter(seqPrefix+"urlregexpcount");
     if (urlRegexpCount != null && urlRegexpCount.length() > 0)
@@ -4742,6 +4781,8 @@ public class WebcrawlerConnector extends
     String exclusions = "";
     String inclusionsIndex = ".*\n";
     String exclusionsIndex = "";
+    String exclusionsContentIndex = "";
+
     boolean includeMatching = false;
     Set<String> excludedHeaders = new HashSet<String>();
     
@@ -4779,6 +4820,12 @@ public class WebcrawlerConnector extends
         if (exclusionsIndex == null)
           exclusionsIndex = "";
       }
+      else if (sn.getType().equals(WebcrawlerConfig.NODE_EXCLUDESCONTENTINDEX))
+      {
+        exclusionsContentIndex = sn.getValue();
+        if (exclusionsContentIndex == null)
+        	exclusionsContentIndex = "";
+      }
       else if (sn.getType().equals(WebcrawlerConfig.NODE_LIMITTOSEEDS))
       {
         String value = sn.getAttributeValue(WebcrawlerConfig.ATTR_VALUE);
@@ -5157,6 +5204,48 @@ public class WebcrawlerConnector extends
     out.print(
 "    </td>\n"+
 "  </tr>\n"+
+"  <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+
+"  <tr>\n"+
+"    <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"WebcrawlerConnector.ExcludeContentFromIndex") + "</nobr></td>\n"+
+"    <td class=\"value\">\n"
+    );
+    try
+    {
+      java.io.Reader str = new java.io.StringReader(exclusionsContentIndex);
+      try
+      {
+        java.io.BufferedReader is = new java.io.BufferedReader(str);
+        try
+        {
+          while (true)
+          {
+            String nextString = is.readLine();
+            if (nextString == null)
+              break;
+            if (nextString.length() == 0)
+              continue;
+            out.print(
+"      <nobr>"+Encoder.bodyEscape(nextString)+"</nobr><br/>\n"
+            );
+          }
+        }
+        finally
+        {
+          is.close();
+        }
+      }
+      finally
+      {
+        str.close();
+      }
+    }
+    catch (java.io.IOException e)
+    {
+      throw new ManifoldCFException("IO error: "+e.getMessage(),e);
+    }
+    out.print(
+"    </td>\n"+
+"  </tr>\n"+
 "    \n"+
 "  <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"
     );
@@ -5936,7 +6025,7 @@ public class WebcrawlerConnector extends
     handler.applyOverrides(lp);
     return handler.getTargetURI();
   }
-  
+
   /** Find HTML link URI, if present, making sure specified preference is matched. */
   protected String findHTMLLinkURI(String currentURI, LoginParameters lp)
     throws ManifoldCFException
@@ -7732,7 +7821,7 @@ public class WebcrawlerConnector extends
 
   /** This class describes the url filtering information (for crawling and indexing) obtained from a digested DocumentSpecification.
   */
-  protected static class DocumentURLFilter
+  protected class DocumentURLFilter
   {
     /** The version string */
     protected String versionString;
@@ -7748,7 +7837,10 @@ public class WebcrawlerConnector extends
     protected final List<Pattern> excludeIndexPatterns = new ArrayList<Pattern>();
     /** The hash map of seed hosts, to limit urls by, if non-null */
     protected Set<String> seedHosts = null;
-    
+
+    /**List of content exclusion pattern*/
+    protected final List<Pattern> excludeContentIndexPatterns = new ArrayList<Pattern>();
+
     /** Canonicalization policies */
     protected final CanonicalizationPolicies canonicalizationPolicies = new CanonicalizationPolicies();
 
@@ -7764,6 +7856,7 @@ public class WebcrawlerConnector extends
       String excludes = "";
       String includesIndex = ".*";
       String excludesIndex = "";
+      String excludesContentIndex = "";
       String seeds = "";
       List<String> packList = new ArrayList<String>();
       String[] packStuff = new String[2];
@@ -7895,12 +7988,19 @@ public class WebcrawlerConnector extends
             throw new ManifoldCFException("Canonicalization regular expression '"+urlRegexp+"' is illegal: "+e.getMessage(),e);
           }
         }
+        else if (sn.getType().equals(WebcrawlerConfig.NODE_EXCLUDESCONTENTINDEX))
+        {
+          excludesContentIndex = sn.getValue();
+          if (excludesContentIndex == null)
+            excludesContentIndex = "";
+        }
       }
 
       // Note: format change since MCF 1.7 release
       StringBuilder versionBuffer = new StringBuilder();
       pack(versionBuffer,includesIndex,'+');
       pack(versionBuffer,excludesIndex,'+');
+      pack(versionBuffer,excludesContentIndex,'+');
       packList(versionBuffer,packList,'+');
       versionString = versionBuffer.toString();
       
@@ -7913,7 +8013,9 @@ public class WebcrawlerConnector extends
       compileList(includeIndexPatterns,list);
       list = stringToArray(excludesIndex);
       compileList(excludeIndexPatterns,list);
-      
+      list = stringToArray(excludesContentIndex);
+      compileList(excludeContentIndexPatterns,list);
+
       if (limitToSeeds)
       {
         seedHosts = new HashSet<String>();
@@ -8084,6 +8186,30 @@ public class WebcrawlerConnector extends
       return canonicalizationPolicies;
     }
 
+    public boolean isDocumentContentIndexable(String documentIdentifier) throws ManifoldCFException {
+        String content = findSpecifiedContent(documentIdentifier, excludeContentIndexPatterns);
+        if (content != null) {
+          if (Logging.connectors.isDebugEnabled())
+            Logging.connectors.debug("WEB: Url '" + documentIdentifier + "' is not indexable because content exclusion pattern was matched");
+
+          return false;
+      }
+      return true;
+    }
+
+    protected String findSpecifiedContent(String currentURI, List<Pattern> patterns) throws ManifoldCFException
+    {
+      if (excludeContentIndexPatterns.isEmpty()) {
+        if (Logging.connectors.isDebugEnabled())
+          Logging.connectors.debug("WEB: no content exclusion rule supplied... returning");
+        return null;
+      }
+
+      FindContentHandler handler = new FindContentHandler(currentURI, patterns);
+      handleHTML(currentURI, handler);
+      return handler.getTargetURI();
+    }
+
   }
 
   protected static class FetchStatus

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties?rev=1681735&r1=1681734&r2=1681735&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties Tue May 26 10:40:15 2015
@@ -76,6 +76,7 @@ WebcrawlerConnector.IncludeInIndex=Inclu
 WebcrawlerConnector.IncludeOnlyHostsMatchingSeeds=Include only hosts matching seeds?
 WebcrawlerConnector.ExcludeFromCrawl=Exclude from crawl:
 WebcrawlerConnector.ExcludeFromIndex=Exclude from index:
+WebcrawlerConnector.ExcludeContentFromIndex=Exclude content from index:
 WebcrawlerConnector.DeleteToken=Delete token #
 WebcrawlerConnector.NoAccessTokensPresent=No access tokens present
 WebcrawlerConnector.AddAccessToken=Add access token

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties?rev=1681735&r1=1681734&r2=1681735&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties Tue May 26 10:40:15 2015
@@ -77,6 +77,7 @@ WebcrawlerConnector.IncludeOnlyHostsMatc
 WebcrawlerConnector.ExcludeFromCrawl=クロールから除外：
 WebcrawlerConnector.ExcludeFromIndex=索引を除外：
 WebcrawlerConnector.DeleteToken=トークンを削除 #
+WebcrawlerConnector.ExcludeContentFromIndex=Exclude content from index:
 WebcrawlerConnector.NoAccessTokensPresent=アクセストークンがありません
 WebcrawlerConnector.AddAccessToken=アクセストークンを追加
 WebcrawlerConnector.DeleteMetadata=メタデータを削除 #

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_zh_CN.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_zh_CN.properties?rev=1681735&r1=1681734&r2=1681735&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_zh_CN.properties (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_zh_CN.properties Tue May 26 10:40:15 2015
@@ -77,6 +77,7 @@ WebcrawlerConnector.IncludeOnlyHostsMatc
 WebcrawlerConnector.ExcludeFromCrawl=排除于爬虫外: 
 WebcrawlerConnector.ExcludeFromIndex=排除于索引外: 
 WebcrawlerConnector.DeleteToken=删除令牌 #
+WebcrawlerConnector.ExcludeContentFromIndex=Exclude content from index:
 WebcrawlerConnector.NoAccessTokensPresent=访问令牌不存在
 WebcrawlerConnector.AddAccessToken=添加访问令牌
 WebcrawlerConnector.DeleteMetadata=删除元数据 #

Added: manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/DocumentContentExclusionIT.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/DocumentContentExclusionIT.java?rev=1681735&view=auto
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/DocumentContentExclusionIT.java (added)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/DocumentContentExclusionIT.java Tue May 26 10:40:15 2015
@@ -0,0 +1,235 @@
+package org.apache.manifoldcf.crawler.connectors.webcrawler;
+
+import org.apache.manifoldcf.agents.interfaces.IOutputConnection;
+import org.apache.manifoldcf.agents.interfaces.IOutputConnectionManager;
+import org.apache.manifoldcf.agents.interfaces.OutputConnectionManagerFactory;
+import org.apache.manifoldcf.core.interfaces.ConfigParams;
+import org.apache.manifoldcf.core.interfaces.IResultRow;
+import org.apache.manifoldcf.core.interfaces.IResultSet;
+import org.apache.manifoldcf.core.interfaces.IThreadContext;
+import org.apache.manifoldcf.core.interfaces.ManifoldCFException;
+import org.apache.manifoldcf.core.interfaces.Specification;
+import org.apache.manifoldcf.core.interfaces.SpecificationNode;
+import org.apache.manifoldcf.core.interfaces.ThreadContextFactory;
+import org.apache.manifoldcf.crawler.connectors.webcrawler.tests.BaseITHSQLDB;
+import org.apache.manifoldcf.crawler.interfaces.*;
+import org.eclipse.jetty.server.Server;
+import org.eclipse.jetty.server.ServerConnector;
+import org.eclipse.jetty.servlet.ServletContextHandler;
+import org.eclipse.jetty.util.thread.QueuedThreadPool;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServlet;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+import java.io.IOException;
+
+import static org.hamcrest.core.Is.is;
+import static org.hamcrest.core.IsNull.nullValue;
+import static org.junit.Assert.assertThat;
+
+public class DocumentContentExclusionIT extends BaseITHSQLDB {
+
+    private static final int MAX_DOC_COUNT = 3;
+
+    public static final String CONTENTFILTER_SERVLET_PATH = "/contentexclusiontest";
+    private static final int PORT = 8191;
+    public static final long MAX_WAIT_TIME = 60 * 1000L;
+    public static final String WEB_CONNECTION = "Web Connection";
+    static String baseUrl = "http://127.0.0.1:" + PORT + CONTENTFILTER_SERVLET_PATH + "?page=";
+
+    private Server server = null;
+    private IJobManager jobManager;
+    private IOutputConnectionManager outputConnectionManager;
+
+
+    private IRepositoryConnectionManager repoConnectionManager;
+
+
+    @Before
+    public void beforeDocumentContentFilterTest() throws Exception {
+        server = new Server(new QueuedThreadPool(20));
+        ServerConnector connector = new ServerConnector(server);
+        connector.setPort(PORT);
+        connector.setIdleTimeout(60000);// important for Http KeepAlive
+        server.addConnector(connector);
+        ServletContextHandler context = new ServletContextHandler(ServletContextHandler.SESSIONS);
+        context.addServlet(ContentFilterTestServlet.class, CONTENTFILTER_SERVLET_PATH);
+        server.setHandler(context);
+        server.start();
+
+        IThreadContext tc = ThreadContextFactory.make();
+        repoConnectionManager = RepositoryConnectionManagerFactory.make(tc);
+        outputConnectionManager = OutputConnectionManagerFactory.make(tc);
+        jobManager = JobManagerFactory.make(tc);
+        createRepoConnector();
+        createOutputConnector();
+    }
+
+
+    @Test
+    public void testDocumentContentExclusion() throws Exception {
+        //No content exclusion rule
+        IJobDescription job = setupContentFilterJob();
+        runContentFilterJob(job);
+        checkContentFilterHistory(false);
+        cleanupContentFilterJobs(job);
+
+        //With exclusion rule
+        job = setupContentFilterJob();
+        //add content exclusion rule
+        addContentExclusionRule(job);
+        runContentFilterJob(job);
+        checkContentFilterHistory(true);
+        cleanupContentFilterJobs(job);
+    }
+
+    private void checkContentFilterHistory(boolean hasContentExcluded) throws Exception {
+        FilterCriteria filter = new FilterCriteria(new String[]{"process"}, 0l, Long.MAX_VALUE, new RegExpCriteria(".*\\" + CONTENTFILTER_SERVLET_PATH + ".*", true), null);
+        SortOrder sortOrderValue = new SortOrder();
+        sortOrderValue.addCriteria("entityid", SortOrder.SORT_ASCENDING);
+        IResultSet result = repoConnectionManager.genHistorySimple(WEB_CONNECTION, filter, sortOrderValue, 0, 20);
+        assertThat(result.getRowCount(), is(MAX_DOC_COUNT));
+
+        for (int i = 0; i < MAX_DOC_COUNT; i++) {
+            IResultRow row = result.getRow(i);
+            assertThat((String) row.getValue("identifier"), is(baseUrl + i));
+            if (hasContentExcluded && i == 1) {
+                //if excluding, only page 1 will be excluded
+                assertThat((String) row.getValue("resultcode"), is("EXCLUDEDCONTENT"));
+                assertThat((String) row.getValue("resultdesc"), is("Rejected due to content exclusion rule"));
+            } else {
+                assertThat((String) row.getValue("resultcode"), is("OK"));
+                assertThat(row.getValue("resultdesc"), is(nullValue()));
+            }
+        }
+    }
+
+    @After
+    public void tearDownDocumentContentFilterTest() throws Exception {
+        if (server != null) {
+            server.stop();
+        }
+    }
+
+
+    private IJobDescription setupContentFilterJob() throws Exception {
+
+        // Create a job.
+        IJobDescription job = jobManager.createJob();
+        job.setDescription("Test Job");
+        job.setConnectionName(WEB_CONNECTION);
+        job.addPipelineStage(-1, true, "Null Connection", "");
+        job.setType(job.TYPE_SPECIFIED);
+        job.setStartMethod(job.START_DISABLE);
+        job.setHopcountMode(job.HOPCOUNT_NEVERDELETE);
+
+        Specification jobSpec = job.getSpecification();
+
+        // 3 seeds only
+        SpecificationNode sn = new SpecificationNode(WebcrawlerConfig.NODE_SEEDS);
+        StringBuilder sb = new StringBuilder();
+        for (int i = 0; i < MAX_DOC_COUNT; i++) {
+            sb.append(baseUrl + i + "\n");
+        }
+        sn.setValue(sb.toString());
+        jobSpec.addChild(jobSpec.getChildCount(), sn);
+
+        sn = new SpecificationNode(WebcrawlerConfig.NODE_INCLUDES);
+        sn.setValue(".*\n");
+        jobSpec.addChild(jobSpec.getChildCount(), sn);
+
+        sn = new SpecificationNode(WebcrawlerConfig.NODE_INCLUDESINDEX);
+        sn.setValue(".*\n");
+        jobSpec.addChild(jobSpec.getChildCount(), sn);
+        // Save the job.
+        jobManager.save(job);
+
+        return job;
+
+    }
+
+    private void addContentExclusionRule(IJobDescription job) throws ManifoldCFException {
+        Specification jobSpec = job.getSpecification();
+        SpecificationNode sn;
+        sn = new SpecificationNode(WebcrawlerConfig.NODE_EXCLUDESCONTENTINDEX);
+        sn.setValue(".*expired.*\n");
+        jobSpec.addChild(jobSpec.getChildCount(), sn);
+        jobManager.save(job);
+    }
+
+    private IOutputConnection createOutputConnector() throws ManifoldCFException {
+        // Create a basic null output connection, and save it.
+        IOutputConnection outputConn = outputConnectionManager.create();
+        outputConn.setName("Null Connection");
+        outputConn.setDescription("Null Connection");
+        outputConn.setClassName("org.apache.manifoldcf.agents.tests.TestingOutputConnector");
+        outputConn.setMaxConnections(10);
+        // Now, save
+        outputConnectionManager.save(outputConn);
+
+        return outputConn;
+    }
+
+    private IRepositoryConnection createRepoConnector() throws ManifoldCFException {
+        //TODO: This is a copy/paste: Could we have common method for creating test jobs???
+        IRepositoryConnection repoConnection = repoConnectionManager.create();
+        repoConnection.setName("Web Connection");
+        repoConnection.setDescription("Web Connection");
+        repoConnection.setClassName("org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConnector");
+        repoConnection.setMaxConnections(50);
+        ConfigParams cp = repoConnection.getConfigParams();
+
+        cp.setParameter(WebcrawlerConfig.PARAMETER_EMAIL, "someone@somewhere.com");
+        cp.setParameter(WebcrawlerConfig.PARAMETER_ROBOTSUSAGE, "none");
+
+        repoConnectionManager.save(repoConnection);
+
+        return repoConnection;
+    }
+
+    private void cleanupContentFilterJobs(IJobDescription job) throws ManifoldCFException, InterruptedException {
+        repoConnectionManager.cleanUpHistoryData(WEB_CONNECTION);
+        jobManager.deleteJob(job.getID());
+        mcfInstance.waitJobDeletedNative(jobManager, job.getID(), MAX_WAIT_TIME);
+    }
+
+    private void runContentFilterJob(IJobDescription job) throws ManifoldCFException, InterruptedException {
+        jobManager.manualStart(job.getID());
+
+        try {
+            mcfInstance.waitJobInactiveNative(jobManager, job.getID(), MAX_WAIT_TIME);
+        } catch (ManifoldCFException e) {
+            System.err.println("Halting for inspection");
+            Thread.sleep(1000L);
+            throw e;
+        }
+        // Check to be sure we actually processed the right number of documents.
+        JobStatus status = jobManager.getStatus(job.getID());
+        System.err.println("doc processed: " + status.getDocumentsProcessed() + " Job status: " + status.getStatus());
+    }
+
+
+    public static class ContentFilterTestServlet extends HttpServlet {
+        @Override
+        protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException,
+                IOException {
+            response.setContentType("text/html; charset=utf-8");
+            //response.setHeader("Keep-Alive", "timeout=5, max=100");
+            response.setStatus(HttpServletResponse.SC_OK);
+            String page = request.getParameter("page");
+            page = (page == null) ? "unkown" : page;
+            response.getWriter().println("<html><head><title></title></head><body><h1>You are now on page " + page + " </h1>");
+            if ("1".equals(page)) {
+                //Only page 1 will contain the keyword "expired"
+                response.getWriter().println("<h1>Page 1 has expired. bye bye</h1>");
+            }
+            response.getWriter().println("</body>");
+            response.getWriter().flush();
+        }
+    }
+
+}
\ No newline at end of file

Modified: manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IOutputHistoryActivity.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IOutputHistoryActivity.java?rev=1681735&r1=1681734&r2=1681735&view=diff
==============================================================================
--- manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IOutputHistoryActivity.java (original)
+++ manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IOutputHistoryActivity.java Tue May 26 10:40:15 2015
@@ -35,6 +35,8 @@ public interface IOutputHistoryActivity
   public static final String EXCLUDED_LENGTH = "EXCLUDEDLENGTH";
   public static final String EXCLUDED_MIMETYPE = "EXCLUDEDMIMETYPE";
   public static final String EXCLUDED_DATE = "EXCLUDEDDATE";
+  public static final String EXCLUDED_CONTENT = "EXCLUDEDCONTENT";
+
   /**
    * Use this result code when security info is not recognized. 
    */

Modified: manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IHistoryActivity.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IHistoryActivity.java?rev=1681735&r1=1681734&r2=1681735&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IHistoryActivity.java (original)
+++ manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IHistoryActivity.java Tue May 26 10:40:15 2015
@@ -32,6 +32,8 @@ public interface IHistoryActivity
   public static final String EXCLUDED_LENGTH = IOutputHistoryActivity.EXCLUDED_LENGTH;
   public static final String EXCLUDED_MIMETYPE = IOutputHistoryActivity.EXCLUDED_MIMETYPE;
   public static final String EXCLUDED_DATE = IOutputHistoryActivity.EXCLUDED_DATE;
+  public static final String EXCLUDED_CONTENT = IOutputHistoryActivity.EXCLUDED_CONTENT;
+
   /**
    * Use this result code when you get URL value from repository and it is not valid.
    */