You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2015/05/26 12:40:16 UTC
svn commit: r1681735 - in /manifoldcf/trunk: ./
connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/
connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler...
Author: kwright
Date: Tue May 26 10:40:15 2015
New Revision: 1681735
URL: http://svn.apache.org/r1681735
Log:
Fix for CONNECTORS-1193. Committed on behalf of Arcadius Ahouansou.
Added:
manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/DocumentContentExclusionIT.java
Modified:
manifoldcf/trunk/CHANGES.txt
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties
manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_zh_CN.properties
manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IOutputHistoryActivity.java
manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IHistoryActivity.java
Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1681735&r1=1681734&r2=1681735&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Tue May 26 10:40:15 2015
@@ -3,6 +3,10 @@ $Id$
======================= 2.2-dev =====================
+CONNECTORS-1193: Add ability to discard web content based on a
+set of regular expressions.
+(Arcadius Ahouansou)
+
CONNECTORS-1199: SearchBlox connector formatting non-standard.
(Karl Wright)
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java?rev=1681735&r1=1681734&r2=1681735&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java Tue May 26 10:40:15 2015
@@ -18,15 +18,17 @@
*/
package org.apache.manifoldcf.crawler.connectors.webcrawler;
-import org.apache.manifoldcf.core.interfaces.*;
-import org.apache.manifoldcf.crawler.system.Logging;
-import java.util.regex.*;
-import java.util.*;
+import org.apache.manifoldcf.core.interfaces.ManifoldCFException;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Pattern;
+import static java.util.Arrays.asList;
+import static org.apache.manifoldcf.crawler.system.Logging.connectors;
/** This class is the handler for HTML content grepping during state transitions */
public class FindContentHandler extends FindHandler implements IHTMLHandler
{
- protected final Pattern contentPattern;
+ protected final List<Pattern> contentPatterns;
protected final StringBuilder contentBuffer = new StringBuilder();
protected final static int MAX_LENGTH = 65536;
@@ -35,7 +37,13 @@ public class FindContentHandler extends
public FindContentHandler(String parentURI, Pattern contentPattern)
{
super(parentURI);
- this.contentPattern = contentPattern;
+ this.contentPatterns = asList(contentPattern);
+ }
+
+ public FindContentHandler(String parentURI, List<Pattern> contentPatterns)
+ {
+ super(parentURI);
+ this.contentPatterns = contentPatterns;
}
/** Apply overrides */
@@ -69,10 +77,14 @@ public class FindContentHandler extends
// continuity
String bufferContents = contentBuffer.toString();
contentBuffer.setLength(0);
- if (contentPattern.matcher(bufferContents).find())
- targetURI = "";
- else
- {
+ for (Pattern contentPattern : contentPatterns) {
+ if (contentPattern.matcher(bufferContents).find()) {
+ targetURI = "";
+ break;
+ }
+ }
+
+ if(targetURI == null) {
contentBuffer.append(bufferContents.substring(bufferContents.length() - OVERLAP_AMOUNT));
}
}
@@ -153,8 +165,12 @@ public class FindContentHandler extends
{
String bufferContents = contentBuffer.toString();
contentBuffer.setLength(0);
- if (contentPattern.matcher(bufferContents).find())
- targetURI = "";
+ for(Pattern contentPattern: contentPatterns) {
+ if (contentPattern.matcher(bufferContents).find()) {
+ targetURI = "";
+ return;
+ }
+ }
}
}
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java?rev=1681735&r1=1681734&r2=1681735&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java Tue May 26 10:40:15 2015
@@ -137,6 +137,12 @@ public class WebcrawlerConfig
* if any one matches, causes the URL to be excluded from indexing. These
* regexps are newline separated, and # starts a comment. */
public static final String NODE_EXCLUDESINDEX = "excludesindex";
+
+ /**
+ * Exclude any page containing specified regex in their body from index
+ */
+ public static final String NODE_EXCLUDESCONTENTINDEX = "excludescontentindex";
+
/** Limit to seeds. When value attribute is true, only seed domains will be permitted. */
public static final String NODE_LIMITTOSEEDS = "limittoseeds";
/** Canonicalization rule. Attributes are regexp, description, reorder,
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1681735&r1=1681734&r2=1681735&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Tue May 26 10:40:15 2015
@@ -29,10 +29,6 @@ import org.apache.manifoldcf.connectorco
import org.apache.manifoldcf.connectorcommon.fuzzyml.*;
import org.apache.http.conn.ConnectTimeoutException;
-import org.apache.http.client.RedirectException;
-import org.apache.http.client.CircularRedirectException;
-import org.apache.http.NoHttpResponseException;
-import org.apache.http.HttpException;
import java.io.*;
import java.nio.charset.StandardCharsets;
@@ -508,7 +504,7 @@ public class WebcrawlerConnector extends
*@param activities is the interface this method should use to perform whatever framework actions are desired.
*@param spec is a document specification (that comes from the job).
*@param seedTime is the end of the time range of documents to consider, exclusive.
- *@param lastSeedVersionString is the last seeding version string for this job, or null if the job has no previous seeding version string.
+ *@param lastSeedVersion is the last seeding version string for this job, or null if the job has no previous seeding version string.
*@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
*@return an updated seeding version string, to be stored with the job.
*/
@@ -1303,7 +1299,7 @@ public class WebcrawlerConnector extends
activities.noDocument(documentIdentifier,versionString);
return;
}
-
+
if (activities.checkURLIndexable(documentIdentifier) == false)
{
if (Logging.connectors.isDebugEnabled())
@@ -1355,7 +1351,15 @@ public class WebcrawlerConnector extends
activities.noDocument(documentIdentifier,versionString);
return;
}
-
+
+ if(!filter.isDocumentContentIndexable(documentIdentifier)){
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("Web: For document '"+documentIdentifier+"', not indexing because document content matched document content exclusion rule");
+ errorCode = activities.EXCLUDED_CONTENT;
+ errorDesc = "Rejected due to content exclusion rule";
+ activities.noDocument(documentIdentifier,versionString);
+ return;
+ }
// Ingest the document
if (Logging.connectors.isDebugEnabled())
Logging.connectors.debug("WEB: Decided to ingest '"+documentIdentifier+"'");
@@ -3811,6 +3815,8 @@ public class WebcrawlerConnector extends
String exclusions = "";
String inclusionsIndex = ".*\n";
String exclusionsIndex = "";
+ String exclusionsContentIndex = "";
+
boolean includeMatching = true;
Set<String> excludedHeaders = new HashSet<String>();
@@ -3861,6 +3867,12 @@ public class WebcrawlerConnector extends
if (exclusionsIndex == null)
exclusionsIndex = "";
}
+ else if (sn.getType().equals(WebcrawlerConfig.NODE_EXCLUDESCONTENTINDEX))
+ {
+ exclusionsContentIndex = sn.getValue();
+ if (exclusionsContentIndex == null)
+ exclusionsContentIndex = "";
+ }
else if (sn.getType().equals(WebcrawlerConfig.NODE_LIMITTOSEEDS))
{
String value = sn.getAttributeValue(WebcrawlerConfig.ATTR_VALUE);
@@ -4218,14 +4230,21 @@ public class WebcrawlerConnector extends
" <textarea rows=\"10\" cols=\"60\" name=\""+seqPrefix+"exclusionsindex\">"+Encoder.bodyEscape(exclusionsIndex)+"</textarea>\n"+
" </td>\n"+
" </tr>\n"+
+" <tr>\n"+
+" <td class=\"description\" colspan=\"1\"><nobr>" + Messages.getBodyString(locale,"WebcrawlerConnector.ExcludeContentFromIndex") + "</nobr></td>\n"+
+" <td class=\"value\" colspan=\"1\">\n"+
+" <textarea rows=\"10\" cols=\"60\" name=\""+seqPrefix+"exclusionscontentindex\">"+Encoder.bodyEscape(exclusionsContentIndex)+"</textarea>\n"+
+" </td>\n"+
+" </tr>\n"+
"</table>\n"
);
}
else
{
out.print(
-"<input type=\"hidden\" name=\""+seqPrefix+"exclusions\" value=\""+Encoder.attributeEscape(exclusions)+"\"/>\n"+
-"<input type=\"hidden\" name=\""+seqPrefix+"exclusionsindex\" value=\""+Encoder.attributeEscape(exclusionsIndex)+"\"/>\n"
+ "<input type=\"hidden\" name=\"" + seqPrefix + "exclusions\" value=\"" + Encoder.attributeEscape(exclusions) + "\"/>\n" +
+ "<input type=\"hidden\" name=\"" + seqPrefix + "exclusionsindex\" value=\"" + Encoder.attributeEscape(exclusionsIndex) + "\"/>\n" +
+ "<input type=\"hidden\" name=\"" + seqPrefix + "exclusionscontentindex\" value=\"" + Encoder.attributeEscape(exclusionsContentIndex) + "\"/>\n"
);
}
@@ -4586,6 +4605,26 @@ public class WebcrawlerConnector extends
ds.addChild(ds.getChildCount(),cn);
}
+ // Get the content index exclusions
+ String exclusionsContentIndex = variableContext.getParameter(seqPrefix+"exclusionscontentindex");
+ if (exclusionsContentIndex != null)
+ {
+ // Delete existing content exclusions record first
+ int i = 0;
+ while (i < ds.getChildCount())
+ {
+ SpecificationNode sn = ds.getChild(i);
+ if (sn.getType().equals(WebcrawlerConfig.NODE_EXCLUDESCONTENTINDEX))
+ ds.removeChild(i);
+ else
+ i++;
+ }
+
+ SpecificationNode cn = new SpecificationNode(WebcrawlerConfig.NODE_EXCLUDESCONTENTINDEX);
+ cn.setValue(exclusionsContentIndex);
+ ds.addChild(ds.getChildCount(),cn);
+ }
+
// Read the url specs
String urlRegexpCount = variableContext.getParameter(seqPrefix+"urlregexpcount");
if (urlRegexpCount != null && urlRegexpCount.length() > 0)
@@ -4742,6 +4781,8 @@ public class WebcrawlerConnector extends
String exclusions = "";
String inclusionsIndex = ".*\n";
String exclusionsIndex = "";
+ String exclusionsContentIndex = "";
+
boolean includeMatching = false;
Set<String> excludedHeaders = new HashSet<String>();
@@ -4779,6 +4820,12 @@ public class WebcrawlerConnector extends
if (exclusionsIndex == null)
exclusionsIndex = "";
}
+ else if (sn.getType().equals(WebcrawlerConfig.NODE_EXCLUDESCONTENTINDEX))
+ {
+ exclusionsContentIndex = sn.getValue();
+ if (exclusionsContentIndex == null)
+ exclusionsContentIndex = "";
+ }
else if (sn.getType().equals(WebcrawlerConfig.NODE_LIMITTOSEEDS))
{
String value = sn.getAttributeValue(WebcrawlerConfig.ATTR_VALUE);
@@ -5157,6 +5204,48 @@ public class WebcrawlerConnector extends
out.print(
" </td>\n"+
" </tr>\n"+
+" <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+
+" <tr>\n"+
+" <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"WebcrawlerConnector.ExcludeContentFromIndex") + "</nobr></td>\n"+
+" <td class=\"value\">\n"
+ );
+ try
+ {
+ java.io.Reader str = new java.io.StringReader(exclusionsContentIndex);
+ try
+ {
+ java.io.BufferedReader is = new java.io.BufferedReader(str);
+ try
+ {
+ while (true)
+ {
+ String nextString = is.readLine();
+ if (nextString == null)
+ break;
+ if (nextString.length() == 0)
+ continue;
+ out.print(
+" <nobr>"+Encoder.bodyEscape(nextString)+"</nobr><br/>\n"
+ );
+ }
+ }
+ finally
+ {
+ is.close();
+ }
+ }
+ finally
+ {
+ str.close();
+ }
+ }
+ catch (java.io.IOException e)
+ {
+ throw new ManifoldCFException("IO error: "+e.getMessage(),e);
+ }
+ out.print(
+" </td>\n"+
+" </tr>\n"+
" \n"+
" <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"
);
@@ -5936,7 +6025,7 @@ public class WebcrawlerConnector extends
handler.applyOverrides(lp);
return handler.getTargetURI();
}
-
+
/** Find HTML link URI, if present, making sure specified preference is matched. */
protected String findHTMLLinkURI(String currentURI, LoginParameters lp)
throws ManifoldCFException
@@ -7732,7 +7821,7 @@ public class WebcrawlerConnector extends
/** This class describes the url filtering information (for crawling and indexing) obtained from a digested DocumentSpecification.
*/
- protected static class DocumentURLFilter
+ protected class DocumentURLFilter
{
/** The version string */
protected String versionString;
@@ -7748,7 +7837,10 @@ public class WebcrawlerConnector extends
protected final List<Pattern> excludeIndexPatterns = new ArrayList<Pattern>();
/** The hash map of seed hosts, to limit urls by, if non-null */
protected Set<String> seedHosts = null;
-
+
+ /**List of content exclusion pattern*/
+ protected final List<Pattern> excludeContentIndexPatterns = new ArrayList<Pattern>();
+
/** Canonicalization policies */
protected final CanonicalizationPolicies canonicalizationPolicies = new CanonicalizationPolicies();
@@ -7764,6 +7856,7 @@ public class WebcrawlerConnector extends
String excludes = "";
String includesIndex = ".*";
String excludesIndex = "";
+ String excludesContentIndex = "";
String seeds = "";
List<String> packList = new ArrayList<String>();
String[] packStuff = new String[2];
@@ -7895,12 +7988,19 @@ public class WebcrawlerConnector extends
throw new ManifoldCFException("Canonicalization regular expression '"+urlRegexp+"' is illegal: "+e.getMessage(),e);
}
}
+ else if (sn.getType().equals(WebcrawlerConfig.NODE_EXCLUDESCONTENTINDEX))
+ {
+ excludesContentIndex = sn.getValue();
+ if (excludesContentIndex == null)
+ excludesContentIndex = "";
+ }
}
// Note: format change since MCF 1.7 release
StringBuilder versionBuffer = new StringBuilder();
pack(versionBuffer,includesIndex,'+');
pack(versionBuffer,excludesIndex,'+');
+ pack(versionBuffer,excludesContentIndex,'+');
packList(versionBuffer,packList,'+');
versionString = versionBuffer.toString();
@@ -7913,7 +8013,9 @@ public class WebcrawlerConnector extends
compileList(includeIndexPatterns,list);
list = stringToArray(excludesIndex);
compileList(excludeIndexPatterns,list);
-
+ list = stringToArray(excludesContentIndex);
+ compileList(excludeContentIndexPatterns,list);
+
if (limitToSeeds)
{
seedHosts = new HashSet<String>();
@@ -8084,6 +8186,30 @@ public class WebcrawlerConnector extends
return canonicalizationPolicies;
}
+ public boolean isDocumentContentIndexable(String documentIdentifier) throws ManifoldCFException {
+ String content = findSpecifiedContent(documentIdentifier, excludeContentIndexPatterns);
+ if (content != null) {
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("WEB: Url '" + documentIdentifier + "' is not indexable because content exclusion pattern was matched");
+
+ return false;
+ }
+ return true;
+ }
+
+ protected String findSpecifiedContent(String currentURI, List<Pattern> patterns) throws ManifoldCFException
+ {
+ if (excludeContentIndexPatterns.isEmpty()) {
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("WEB: no content exclusion rule supplied... returning");
+ return null;
+ }
+
+ FindContentHandler handler = new FindContentHandler(currentURI, patterns);
+ handleHTML(currentURI, handler);
+ return handler.getTargetURI();
+ }
+
}
protected static class FetchStatus
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties?rev=1681735&r1=1681734&r2=1681735&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties Tue May 26 10:40:15 2015
@@ -76,6 +76,7 @@ WebcrawlerConnector.IncludeInIndex=Inclu
WebcrawlerConnector.IncludeOnlyHostsMatchingSeeds=Include only hosts matching seeds?
WebcrawlerConnector.ExcludeFromCrawl=Exclude from crawl:
WebcrawlerConnector.ExcludeFromIndex=Exclude from index:
+WebcrawlerConnector.ExcludeContentFromIndex=Exclude content from index:
WebcrawlerConnector.DeleteToken=Delete token #
WebcrawlerConnector.NoAccessTokensPresent=No access tokens present
WebcrawlerConnector.AddAccessToken=Add access token
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties?rev=1681735&r1=1681734&r2=1681735&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties Tue May 26 10:40:15 2015
@@ -77,6 +77,7 @@ WebcrawlerConnector.IncludeOnlyHostsMatc
WebcrawlerConnector.ExcludeFromCrawl=クロールから除外：
WebcrawlerConnector.ExcludeFromIndex=ç´¢å¼ãé¤å¤ï¼
WebcrawlerConnector.DeleteToken=トークンを削除 #
+WebcrawlerConnector.ExcludeContentFromIndex=Exclude content from index:
WebcrawlerConnector.NoAccessTokensPresent=アクセストークンがありません
WebcrawlerConnector.AddAccessToken=アクセストークンを追加
WebcrawlerConnector.DeleteMetadata=メタデータを削除 #
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_zh_CN.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_zh_CN.properties?rev=1681735&r1=1681734&r2=1681735&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_zh_CN.properties (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_zh_CN.properties Tue May 26 10:40:15 2015
@@ -77,6 +77,7 @@ WebcrawlerConnector.IncludeOnlyHostsMatc
WebcrawlerConnector.ExcludeFromCrawl=排除于爬虫外:
WebcrawlerConnector.ExcludeFromIndex=排除于索引外:
WebcrawlerConnector.DeleteToken=删除令牌 #
+WebcrawlerConnector.ExcludeContentFromIndex=Exclude content from index:
WebcrawlerConnector.NoAccessTokensPresent=访问令牌不存在
WebcrawlerConnector.AddAccessToken=添加访问令牌
WebcrawlerConnector.DeleteMetadata=删除元数据 #
Added: manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/DocumentContentExclusionIT.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/DocumentContentExclusionIT.java?rev=1681735&view=auto
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/DocumentContentExclusionIT.java (added)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/webcrawler/DocumentContentExclusionIT.java Tue May 26 10:40:15 2015
@@ -0,0 +1,235 @@
+package org.apache.manifoldcf.crawler.connectors.webcrawler;
+
+import org.apache.manifoldcf.agents.interfaces.IOutputConnection;
+import org.apache.manifoldcf.agents.interfaces.IOutputConnectionManager;
+import org.apache.manifoldcf.agents.interfaces.OutputConnectionManagerFactory;
+import org.apache.manifoldcf.core.interfaces.ConfigParams;
+import org.apache.manifoldcf.core.interfaces.IResultRow;
+import org.apache.manifoldcf.core.interfaces.IResultSet;
+import org.apache.manifoldcf.core.interfaces.IThreadContext;
+import org.apache.manifoldcf.core.interfaces.ManifoldCFException;
+import org.apache.manifoldcf.core.interfaces.Specification;
+import org.apache.manifoldcf.core.interfaces.SpecificationNode;
+import org.apache.manifoldcf.core.interfaces.ThreadContextFactory;
+import org.apache.manifoldcf.crawler.connectors.webcrawler.tests.BaseITHSQLDB;
+import org.apache.manifoldcf.crawler.interfaces.*;
+import org.eclipse.jetty.server.Server;
+import org.eclipse.jetty.server.ServerConnector;
+import org.eclipse.jetty.servlet.ServletContextHandler;
+import org.eclipse.jetty.util.thread.QueuedThreadPool;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServlet;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+import java.io.IOException;
+
+import static org.hamcrest.core.Is.is;
+import static org.hamcrest.core.IsNull.nullValue;
+import static org.junit.Assert.assertThat;
+
+public class DocumentContentExclusionIT extends BaseITHSQLDB {
+
+ private static final int MAX_DOC_COUNT = 3;
+
+ public static final String CONTENTFILTER_SERVLET_PATH = "/contentexclusiontest";
+ private static final int PORT = 8191;
+ public static final long MAX_WAIT_TIME = 60 * 1000L;
+ public static final String WEB_CONNECTION = "Web Connection";
+ static String baseUrl = "http://127.0.0.1:" + PORT + CONTENTFILTER_SERVLET_PATH + "?page=";
+
+ private Server server = null;
+ private IJobManager jobManager;
+ private IOutputConnectionManager outputConnectionManager;
+
+
+ private IRepositoryConnectionManager repoConnectionManager;
+
+
+ @Before
+ public void beforeDocumentContentFilterTest() throws Exception {
+ server = new Server(new QueuedThreadPool(20));
+ ServerConnector connector = new ServerConnector(server);
+ connector.setPort(PORT);
+ connector.setIdleTimeout(60000);// important for Http KeepAlive
+ server.addConnector(connector);
+ ServletContextHandler context = new ServletContextHandler(ServletContextHandler.SESSIONS);
+ context.addServlet(ContentFilterTestServlet.class, CONTENTFILTER_SERVLET_PATH);
+ server.setHandler(context);
+ server.start();
+
+ IThreadContext tc = ThreadContextFactory.make();
+ repoConnectionManager = RepositoryConnectionManagerFactory.make(tc);
+ outputConnectionManager = OutputConnectionManagerFactory.make(tc);
+ jobManager = JobManagerFactory.make(tc);
+ createRepoConnector();
+ createOutputConnector();
+ }
+
+
+ @Test
+ public void testDocumentContentExclusion() throws Exception {
+ //No content exclusion rule
+ IJobDescription job = setupContentFilterJob();
+ runContentFilterJob(job);
+ checkContentFilterHistory(false);
+ cleanupContentFilterJobs(job);
+
+ //With exclusion rule
+ job = setupContentFilterJob();
+ //add content exclusion rule
+ addContentExclusionRule(job);
+ runContentFilterJob(job);
+ checkContentFilterHistory(true);
+ cleanupContentFilterJobs(job);
+ }
+
+ private void checkContentFilterHistory(boolean hasContentExcluded) throws Exception {
+ FilterCriteria filter = new FilterCriteria(new String[]{"process"}, 0l, Long.MAX_VALUE, new RegExpCriteria(".*\\" + CONTENTFILTER_SERVLET_PATH + ".*", true), null);
+ SortOrder sortOrderValue = new SortOrder();
+ sortOrderValue.addCriteria("entityid", SortOrder.SORT_ASCENDING);
+ IResultSet result = repoConnectionManager.genHistorySimple(WEB_CONNECTION, filter, sortOrderValue, 0, 20);
+ assertThat(result.getRowCount(), is(MAX_DOC_COUNT));
+
+ for (int i = 0; i < MAX_DOC_COUNT; i++) {
+ IResultRow row = result.getRow(i);
+ assertThat((String) row.getValue("identifier"), is(baseUrl + i));
+ if (hasContentExcluded && i == 1) {
+ //if excluding, only page 1 will be excluded
+ assertThat((String) row.getValue("resultcode"), is("EXCLUDEDCONTENT"));
+ assertThat((String) row.getValue("resultdesc"), is("Rejected due to content exclusion rule"));
+ } else {
+ assertThat((String) row.getValue("resultcode"), is("OK"));
+ assertThat(row.getValue("resultdesc"), is(nullValue()));
+ }
+ }
+ }
+
+ @After
+ public void tearDownDocumentContentFilterTest() throws Exception {
+ if (server != null) {
+ server.stop();
+ }
+ }
+
+
+ private IJobDescription setupContentFilterJob() throws Exception {
+
+ // Create a job.
+ IJobDescription job = jobManager.createJob();
+ job.setDescription("Test Job");
+ job.setConnectionName(WEB_CONNECTION);
+ job.addPipelineStage(-1, true, "Null Connection", "");
+ job.setType(job.TYPE_SPECIFIED);
+ job.setStartMethod(job.START_DISABLE);
+ job.setHopcountMode(job.HOPCOUNT_NEVERDELETE);
+
+ Specification jobSpec = job.getSpecification();
+
+ // 3 seeds only
+ SpecificationNode sn = new SpecificationNode(WebcrawlerConfig.NODE_SEEDS);
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < MAX_DOC_COUNT; i++) {
+ sb.append(baseUrl + i + "\n");
+ }
+ sn.setValue(sb.toString());
+ jobSpec.addChild(jobSpec.getChildCount(), sn);
+
+ sn = new SpecificationNode(WebcrawlerConfig.NODE_INCLUDES);
+ sn.setValue(".*\n");
+ jobSpec.addChild(jobSpec.getChildCount(), sn);
+
+ sn = new SpecificationNode(WebcrawlerConfig.NODE_INCLUDESINDEX);
+ sn.setValue(".*\n");
+ jobSpec.addChild(jobSpec.getChildCount(), sn);
+ // Save the job.
+ jobManager.save(job);
+
+ return job;
+
+ }
+
+ private void addContentExclusionRule(IJobDescription job) throws ManifoldCFException {
+ Specification jobSpec = job.getSpecification();
+ SpecificationNode sn;
+ sn = new SpecificationNode(WebcrawlerConfig.NODE_EXCLUDESCONTENTINDEX);
+ sn.setValue(".*expired.*\n");
+ jobSpec.addChild(jobSpec.getChildCount(), sn);
+ jobManager.save(job);
+ }
+
+ private IOutputConnection createOutputConnector() throws ManifoldCFException {
+ // Create a basic null output connection, and save it.
+ IOutputConnection outputConn = outputConnectionManager.create();
+ outputConn.setName("Null Connection");
+ outputConn.setDescription("Null Connection");
+ outputConn.setClassName("org.apache.manifoldcf.agents.tests.TestingOutputConnector");
+ outputConn.setMaxConnections(10);
+ // Now, save
+ outputConnectionManager.save(outputConn);
+
+ return outputConn;
+ }
+
+ private IRepositoryConnection createRepoConnector() throws ManifoldCFException {
+ //TODO: This is a copy/paste: Could we have common method for creating test jobs???
+ IRepositoryConnection repoConnection = repoConnectionManager.create();
+ repoConnection.setName("Web Connection");
+ repoConnection.setDescription("Web Connection");
+ repoConnection.setClassName("org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConnector");
+ repoConnection.setMaxConnections(50);
+ ConfigParams cp = repoConnection.getConfigParams();
+
+ cp.setParameter(WebcrawlerConfig.PARAMETER_EMAIL, "someone@somewhere.com");
+ cp.setParameter(WebcrawlerConfig.PARAMETER_ROBOTSUSAGE, "none");
+
+ repoConnectionManager.save(repoConnection);
+
+ return repoConnection;
+ }
+
+ private void cleanupContentFilterJobs(IJobDescription job) throws ManifoldCFException, InterruptedException {
+ repoConnectionManager.cleanUpHistoryData(WEB_CONNECTION);
+ jobManager.deleteJob(job.getID());
+ mcfInstance.waitJobDeletedNative(jobManager, job.getID(), MAX_WAIT_TIME);
+ }
+
+ private void runContentFilterJob(IJobDescription job) throws ManifoldCFException, InterruptedException {
+ jobManager.manualStart(job.getID());
+
+ try {
+ mcfInstance.waitJobInactiveNative(jobManager, job.getID(), MAX_WAIT_TIME);
+ } catch (ManifoldCFException e) {
+ System.err.println("Halting for inspection");
+ Thread.sleep(1000L);
+ throw e;
+ }
+ // Check to be sure we actually processed the right number of documents.
+ JobStatus status = jobManager.getStatus(job.getID());
+ System.err.println("doc processed: " + status.getDocumentsProcessed() + " Job status: " + status.getStatus());
+ }
+
+
+ public static class ContentFilterTestServlet extends HttpServlet {
+ @Override
+ protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException,
+ IOException {
+ response.setContentType("text/html; charset=utf-8");
+ //response.setHeader("Keep-Alive", "timeout=5, max=100");
+ response.setStatus(HttpServletResponse.SC_OK);
+ String page = request.getParameter("page");
+ page = (page == null) ? "unkown" : page;
+ response.getWriter().println("<html><head><title></title></head><body><h1>You are now on page " + page + " </h1>");
+ if ("1".equals(page)) {
+ //Only page 1 will contain the keyword "expired"
+ response.getWriter().println("<h1>Page 1 has expired. bye bye</h1>");
+ }
+ response.getWriter().println("</body>");
+ response.getWriter().flush();
+ }
+ }
+
+}
\ No newline at end of file
Modified: manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IOutputHistoryActivity.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IOutputHistoryActivity.java?rev=1681735&r1=1681734&r2=1681735&view=diff
==============================================================================
--- manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IOutputHistoryActivity.java (original)
+++ manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IOutputHistoryActivity.java Tue May 26 10:40:15 2015
@@ -35,6 +35,8 @@ public interface IOutputHistoryActivity
public static final String EXCLUDED_LENGTH = "EXCLUDEDLENGTH";
public static final String EXCLUDED_MIMETYPE = "EXCLUDEDMIMETYPE";
public static final String EXCLUDED_DATE = "EXCLUDEDDATE";
+ public static final String EXCLUDED_CONTENT = "EXCLUDEDCONTENT";
+
/**
* Use this result code when security info is not recognized.
*/
Modified: manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IHistoryActivity.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IHistoryActivity.java?rev=1681735&r1=1681734&r2=1681735&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IHistoryActivity.java (original)
+++ manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IHistoryActivity.java Tue May 26 10:40:15 2015
@@ -32,6 +32,8 @@ public interface IHistoryActivity
public static final String EXCLUDED_LENGTH = IOutputHistoryActivity.EXCLUDED_LENGTH;
public static final String EXCLUDED_MIMETYPE = IOutputHistoryActivity.EXCLUDED_MIMETYPE;
public static final String EXCLUDED_DATE = IOutputHistoryActivity.EXCLUDED_DATE;
+ public static final String EXCLUDED_CONTENT = IOutputHistoryActivity.EXCLUDED_CONTENT;
+
/**
* Use this result code when you get URL value from repository and it is not valid.
*/