You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2011/07/25 00:23:26 UTC
svn commit: r1150505 - in /incubator/lcf/trunk: ./
connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/
Author: kwright
Date: Sun Jul 24 22:23:25 2011
New Revision: 1150505
URL: http://svn.apache.org/viewvc?rev=1150505&view=rev
Log:
Finish CONNECTORS-214. Add post-fetch filtering to the Inclusions and Exclusions tab of the web connector.
Modified:
incubator/lcf/trunk/CHANGES.txt
incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java
incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
Modified: incubator/lcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/CHANGES.txt?rev=1150505&r1=1150504&r2=1150505&view=diff
==============================================================================
--- incubator/lcf/trunk/CHANGES.txt (original)
+++ incubator/lcf/trunk/CHANGES.txt Sun Jul 24 22:23:25 2011
@@ -3,13 +3,17 @@ $Id$
======================= 0.3-dev =========================
+CONNECTORS-214: Add post-fetch regular expression filtering to Web
+connector.
+(Karl Wright)
+
CONNECTORS-225: Derby throws deadlock exceptions when indexing
rapidly.
(Karl Wright)
CONNECTORS-223: Move test classes to be compatible with maven
conventions.
-(Karl Wright)
+(Tobias Rübner, Karl Wright)
CONNECTORS-219: Update maven pom.xml files to include proper
dependencies and version numbers.
Modified: incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java?rev=1150505&r1=1150504&r2=1150505&view=diff
==============================================================================
--- incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java (original)
+++ incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java Sun Jul 24 22:23:25 2011
@@ -106,13 +106,21 @@ public class WebcrawlerConfig
* text area. */
public static final String NODE_SEEDS = "seeds";
/** Include regexps node. The value of this node contains the regexps that
- * must match the canonical URL in order for that URL to be included. These
+ * must match the canonical URL in order for that URL to be included in the crawl. These
* regexps are newline separated, and # starts a comment. */
public static final String NODE_INCLUDES = "includes";
/** Exclude regexps node. The value of this node contains the regexps that
- * if any one matches, causes the URL to be excluded. These
+ * if any one matches, causes the URL to be excluded from the crawl. These
* regexps are newline separated, and # starts a comment. */
public static final String NODE_EXCLUDES = "excludes";
+ /** Index include regexps node. The value of this node contains the regexps, at least one of
+ * which must match the canonical URL in order for that URL to be included for indexing. These
+ * regexps are newline separated, and # starts a comment. */
+ public static final String NODE_INCLUDESINDEX = "includesindex";
+ /** Index exclude regexps node. The value of this node contains the regexps that,
+ * if any one matches, cause the URL to be excluded from indexing. These
+ * regexps are newline separated, and # starts a comment. */
+ public static final String NODE_EXCLUDESINDEX = "excludesindex";
/** Limit to seeds. When value attribute is true, only seed domains will be permitted. */
public static final String NODE_LIMITTOSEEDS = "limittoseeds";
Modified: incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1150505&r1=1150504&r2=1150505&view=diff
==============================================================================
--- incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Sun Jul 24 22:23:25 2011
@@ -549,6 +549,8 @@ public class WebcrawlerConnector extends
// an object that knows how to do this.
DocumentURLFilter filter = new DocumentURLFilter(spec);
+ String filterVersion = filter.getVersionString();
+
String[] rval = new String[documentIdentifiers.length];
long currentTime = System.currentTimeMillis();
@@ -1054,6 +1056,9 @@ public class WebcrawlerConnector extends
packList(sb,metadata,'+');
// Done with the parseable part! Add the checksum.
sb.append(checkSum);
+ // Add the filter version
+ sb.append("+");
+ sb.append(filterVersion);
rval[i] = sb.toString();
break;
case RESULT_RETRY_DOCUMENT:
@@ -1139,7 +1144,7 @@ public class WebcrawlerConnector extends
// We can exclude it if it does not seem to be a kind of document that the ingestion system knows
// about.
if (indexDocument)
- indexDocument = isDataIngestable(activities,documentIdentifier);
+ indexDocument = isDataIngestable(activities,documentIdentifier,filter);
if (indexDocument)
{
@@ -3400,6 +3405,8 @@ public class WebcrawlerConnector extends
String seeds = "";
String inclusions = ".*\n";
String exclusions = "";
+ String inclusionsIndex = ".*\n";
+ String exclusionsIndex = "";
boolean includeMatching = false;
// Now, loop through description
@@ -3425,6 +3432,18 @@ public class WebcrawlerConnector extends
if (exclusions == null)
exclusions = "";
}
+ else if (sn.getType().equals(org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConfig.NODE_INCLUDESINDEX))
+ {
+ inclusionsIndex = sn.getValue();
+ if (inclusionsIndex == null)
+ inclusionsIndex = "";
+ }
+ else if (sn.getType().equals(org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConfig.NODE_EXCLUDESINDEX))
+ {
+ exclusionsIndex = sn.getValue();
+ if (exclusionsIndex == null)
+ exclusionsIndex = "";
+ }
else if (sn.getType().equals(org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConfig.NODE_LIMITTOSEEDS))
{
String value = sn.getAttributeValue(WebcrawlerConfig.ATTR_VALUE);
@@ -3619,8 +3638,15 @@ public class WebcrawlerConnector extends
"<table class=\"displaytable\">\n"+
" <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+
" <tr>\n"+
-" <td class=\"value\" colspan=\"2\">\n"+
-" <textarea rows=\"25\" cols=\"80\" name=\"inclusions\">"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(inclusions)+"</textarea>\n"+
+" <td class=\"description\" colspan=\"1\"><nobr>Include in crawl:</nobr></td>\n"+
+" <td class=\"value\" colspan=\"1\">\n"+
+" <textarea rows=\"25\" cols=\"60\" name=\"inclusions\">"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(inclusions)+"</textarea>\n"+
+" </td>\n"+
+" </tr>\n"+
+" <tr>\n"+
+" <td class=\"description\" colspan=\"1\"><nobr>Include in index:</nobr></td>\n"+
+" <td class=\"value\" colspan=\"1\">\n"+
+" <textarea rows=\"10\" cols=\"60\" name=\"inclusionsindex\">"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(inclusionsIndex)+"</textarea>\n"+
" </td>\n"+
" </tr>\n"+
" <tr>\n"+
@@ -3636,6 +3662,7 @@ public class WebcrawlerConnector extends
{
out.print(
"<input type=\"hidden\" name=\"inclusions\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(inclusions)+"\"/>\n"+
+"<input type=\"hidden\" name=\"inclusionsindex\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(inclusionsIndex)+"\"/>\n"+
"<input type=\"hidden\" name=\"matchinghosts\" value=\""+(includeMatching?"true":"false")+"\"/>\n"+
"<input type=\"hidden\" name=\"matchinghosts_present\" value=\"true\"/>\n"
);
@@ -3649,8 +3676,15 @@ public class WebcrawlerConnector extends
"<table class=\"displaytable\">\n"+
" <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+
" <tr>\n"+
-" <td class=\"value\" colspan=\"2\">\n"+
-" <textarea rows=\"25\" cols=\"80\" name=\"exclusions\">"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(exclusions)+"</textarea>\n"+
+" <td class=\"description\" colspan=\"1\"><nobr>Exclude from crawl:</nobr></td>\n"+
+" <td class=\"value\" colspan=\"1\">\n"+
+" <textarea rows=\"25\" cols=\"60\" name=\"exclusions\">"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(exclusions)+"</textarea>\n"+
+" </td>\n"+
+" </tr>\n"+
+" <tr>\n"+
+" <td class=\"description\" colspan=\"1\"><nobr>Exclude from index:</nobr></td>\n"+
+" <td class=\"value\" colspan=\"1\">\n"+
+" <textarea rows=\"10\" cols=\"60\" name=\"exclusionsindex\">"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(exclusionsIndex)+"</textarea>\n"+
" </td>\n"+
" </tr>\n"+
"</table>\n"
@@ -3659,7 +3693,8 @@ public class WebcrawlerConnector extends
else
{
out.print(
-"<input type=\"hidden\" name=\"exclusions\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(exclusions)+"\"/>\n"
+"<input type=\"hidden\" name=\"exclusions\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(exclusions)+"\"/>\n"+
+"<input type=\"hidden\" name=\"exclusionsindex\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(exclusionsIndex)+"\"/>\n"
);
}
@@ -3900,6 +3935,26 @@ public class WebcrawlerConnector extends
ds.addChild(ds.getChildCount(),cn);
}
+ // Get the index inclusions
+ String inclusionsIndex = variableContext.getParameter("inclusionsindex");
+ if (inclusionsIndex != null)
+ {
+ // Delete existing index inclusions record first
+ int i = 0;
+ while (i < ds.getChildCount())
+ {
+ SpecificationNode sn = ds.getChild(i);
+ if (sn.getType().equals(org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConfig.NODE_INCLUDESINDEX))
+ ds.removeChild(i);
+ else
+ i++;
+ }
+
+ SpecificationNode cn = new SpecificationNode(org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConfig.NODE_INCLUDESINDEX);
+ cn.setValue(inclusionsIndex);
+ ds.addChild(ds.getChildCount(),cn);
+ }
+
// Handle the seeds-only switch
String matchingHostsPresent = variableContext.getParameter("matchinghosts_present");
if (matchingHostsPresent != null)
@@ -3941,6 +3996,26 @@ public class WebcrawlerConnector extends
ds.addChild(ds.getChildCount(),cn);
}
+ // Get the index exclusions
+ String exclusionsIndex = variableContext.getParameter("exclusionsindex");
+ if (exclusionsIndex != null)
+ {
+ // Delete existing index exclusions record first
+ int i = 0;
+ while (i < ds.getChildCount())
+ {
+ SpecificationNode sn = ds.getChild(i);
+ if (sn.getType().equals(org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConfig.NODE_EXCLUDESINDEX))
+ ds.removeChild(i);
+ else
+ i++;
+ }
+
+ SpecificationNode cn = new SpecificationNode(org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConfig.NODE_EXCLUDESINDEX);
+ cn.setValue(exclusionsIndex);
+ ds.addChild(ds.getChildCount(),cn);
+ }
+
// Read the url specs
String urlRegexpCount = variableContext.getParameter("urlregexpcount");
if (urlRegexpCount != null && urlRegexpCount.length() > 0)
@@ -4140,6 +4215,8 @@ public class WebcrawlerConnector extends
String seeds = "";
String inclusions = ".*\n";
String exclusions = "";
+ String inclusionsIndex = ".*\n";
+ String exclusionsIndex = "";
boolean includeMatching = false;
int i = 0;
@@ -4164,6 +4241,18 @@ public class WebcrawlerConnector extends
if (exclusions == null)
exclusions = "";
}
+ else if (sn.getType().equals(org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConfig.NODE_INCLUDESINDEX))
+ {
+ inclusionsIndex = sn.getValue();
+ if (inclusionsIndex == null)
+ inclusionsIndex = "";
+ }
+ else if (sn.getType().equals(org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConfig.NODE_EXCLUDESINDEX))
+ {
+ exclusionsIndex = sn.getValue();
+ if (exclusionsIndex == null)
+ exclusionsIndex = "";
+ }
else if (sn.getType().equals(org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConfig.NODE_LIMITTOSEEDS))
{
String value = sn.getAttributeValue(WebcrawlerConfig.ATTR_VALUE);
@@ -4304,7 +4393,7 @@ public class WebcrawlerConnector extends
out.print(
" <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+
" <tr>\n"+
-" <td class=\"description\"><nobr>Includes:</nobr></td>\n"+
+" <td class=\"description\"><nobr>Include in crawl:</nobr></td>\n"+
" <td class=\"value\">\n"
);
try
@@ -4346,7 +4435,49 @@ public class WebcrawlerConnector extends
" </tr>\n"+
" <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+
" <tr>\n"+
-" <td class=\"description\"><nobr>Excludes:</nobr></td>\n"+
+" <td class=\"description\"><nobr>Include in index:</nobr></td>\n"+
+" <td class=\"value\">\n"
+ );
+ try
+ {
+ java.io.Reader str = new java.io.StringReader(inclusionsIndex);
+ try
+ {
+ java.io.BufferedReader is = new java.io.BufferedReader(str);
+ try
+ {
+ while (true)
+ {
+ String nextString = is.readLine();
+ if (nextString == null)
+ break;
+ if (nextString.length() == 0)
+ continue;
+ out.print(
+" <nobr>"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(nextString)+"</nobr><br/>\n"
+ );
+ }
+ }
+ finally
+ {
+ is.close();
+ }
+ }
+ finally
+ {
+ str.close();
+ }
+ }
+ catch (java.io.IOException e)
+ {
+ throw new ManifoldCFException("IO error: "+e.getMessage(),e);
+ }
+ out.print(
+" </td>\n"+
+" </tr>\n"+
+" <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+
+" <tr>\n"+
+" <td class=\"description\"><nobr>Exclude from crawl:</nobr></td>\n"+
" <td class=\"value\">\n"
);
try
@@ -4386,6 +4517,48 @@ public class WebcrawlerConnector extends
out.print(
" </td>\n"+
" </tr>\n"+
+" <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+
+" <tr>\n"+
+" <td class=\"description\"><nobr>Exclude from index:</nobr></td>\n"+
+" <td class=\"value\">\n"
+ );
+ try
+ {
+ java.io.Reader str = new java.io.StringReader(exclusionsIndex);
+ try
+ {
+ java.io.BufferedReader is = new java.io.BufferedReader(str);
+ try
+ {
+ while (true)
+ {
+ String nextString = is.readLine();
+ if (nextString == null)
+ break;
+ if (nextString.length() == 0)
+ continue;
+ out.print(
+" <nobr>"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(nextString)+"</nobr><br/>\n"
+ );
+ }
+ }
+ finally
+ {
+ is.close();
+ }
+ }
+ finally
+ {
+ str.close();
+ }
+ }
+ catch (java.io.IOException e)
+ {
+ throw new ManifoldCFException("IO error: "+e.getMessage(),e);
+ }
+ out.print(
+" </td>\n"+
+" </tr>\n"+
" \n"+
" <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"
);
@@ -5115,7 +5288,7 @@ public class WebcrawlerConnector extends
/** Code to check if an already-fetched document should be ingested.
*/
- protected boolean isDataIngestable(IFingerprintActivity activities, String documentIdentifier)
+ protected boolean isDataIngestable(IFingerprintActivity activities, String documentIdentifier, DocumentURLFilter filter)
throws ServiceInterruption, ManifoldCFException
{
if (cache.getResponseCode(documentIdentifier) != 200)
@@ -5127,6 +5300,9 @@ public class WebcrawlerConnector extends
if (activities.checkURLIndexable(documentIdentifier) == false)
return false;
+ if (filter.isDocumentIndexable(documentIdentifier) == false)
+ return false;
+
// Check if it's a recognized content type
String contentType = cache.getContentType(documentIdentifier);
@@ -6901,14 +7077,20 @@ public class WebcrawlerConnector extends
}
}
- /** This class describes the url filtering information obtained from a digested DocumentSpecification.
+ /** This class describes the url filtering information (for crawling and indexing) obtained from a digested DocumentSpecification.
*/
protected static class DocumentURLFilter
{
+ /** The version string */
+ protected String versionString;
/** The arraylist of include patterns */
protected ArrayList includePatterns = new ArrayList();
/** The arraylist of exclude patterns */
protected ArrayList excludePatterns = new ArrayList();
+ /** The arraylist of index include patterns */
+ protected ArrayList includeIndexPatterns = new ArrayList();
+ /** The arraylist of index exclude patterns */
+ protected ArrayList excludeIndexPatterns = new ArrayList();
/** The hash map of seed hosts, to limit urls by, if non-null */
protected HashMap seedHosts = null;
@@ -6923,8 +7105,10 @@ public class WebcrawlerConnector extends
public DocumentURLFilter(DocumentSpecification spec)
throws ManifoldCFException
{
- String includes = "";
+ String includes = ".*";
String excludes = "";
+ String includesIndex = ".*";
+ String excludesIndex = "";
String seeds = "";
boolean limitToSeeds = false;
int i = 0;
@@ -6950,6 +7134,18 @@ public class WebcrawlerConnector extends
if (excludes == null)
excludes = "";
}
+ else if (sn.getType().equals(WebcrawlerConfig.NODE_INCLUDESINDEX))
+ {
+ includesIndex = sn.getValue();
+ if (includesIndex == null)
+ includesIndex = "";
+ }
+ else if (sn.getType().equals(WebcrawlerConfig.NODE_EXCLUDESINDEX))
+ {
+ excludesIndex = sn.getValue();
+ if (excludesIndex == null)
+ excludesIndex = "";
+ }
else if (sn.getType().equals(WebcrawlerConfig.NODE_LIMITTOSEEDS))
{
String value = sn.getAttributeValue(WebcrawlerConfig.ATTR_VALUE);
@@ -7034,10 +7230,16 @@ public class WebcrawlerConnector extends
}
}
+ versionString = includesIndex + "+" + excludesIndex;
+
ArrayList list = stringToArray(includes);
compileList(includePatterns,list);
list = stringToArray(excludes);
compileList(excludePatterns,list);
+ list = stringToArray(includesIndex);
+ compileList(includeIndexPatterns,list);
+ list = stringToArray(excludesIndex);
+ compileList(excludeIndexPatterns,list);
if (limitToSeeds)
{
@@ -7072,6 +7274,14 @@ public class WebcrawlerConnector extends
}
}
+ /** Get whatever contribution to the version string should come from this data.
+ */
+ public String getVersionString()
+ {
+ // In practice, this is NOT what controls the set that is spidered, but rather the set that is indexed
+ return versionString;
+ }
+
/** Check if both a document and host are legal.
*/
public boolean isDocumentAndHostLegal(String url)
@@ -7147,6 +7357,45 @@ public class WebcrawlerConnector extends
return true;
}
+ /** Check if the document identifier is indexable.
+ */
+ public boolean isDocumentIndexable(String url)
+ {
+ // First, verify that the url matches one of the patterns in the include list.
+ int i = 0;
+ while (i < includeIndexPatterns.size())
+ {
+ Pattern p = (Pattern)includeIndexPatterns.get(i);
+ Matcher m = p.matcher(url);
+ if (m.find())
+ break;
+ i++;
+ }
+ if (i == includeIndexPatterns.size())
+ {
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("WEB: Url '"+url+"' is not indexable because no include patterns match it");
+ return false;
+ }
+
+ // Now make sure it's not in the exclude list.
+ i = 0;
+ while (i < excludeIndexPatterns.size())
+ {
+ Pattern p = (Pattern)excludeIndexPatterns.get(i);
+ Matcher m = p.matcher(url);
+ if (m.find())
+ {
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("WEB: Url '"+url+"' is not indexable because exclude pattern '"+p.toString()+"' matched it");
+ return false;
+ }
+ i++;
+ }
+
+ return true;
+ }
+
/** Get canonicalization policies */
public CanonicalizationPolicies getCanonicalizationPolicies()
{