You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by jm...@apache.org on 2022/01/24 09:48:55 UTC
svn commit: r1897405 - /manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
Author: jmssiera
Date: Mon Jan 24 09:48:55 2022
New Revision: 1897405
URL: http://svn.apache.org/viewvc?rev=1897405&view=rev
Log:
CONNECTORS-1665 WebConnector: Add activity records for excluded URLs
Modified:
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1897405&r1=1897404&r2=1897405&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Mon Jan 24 09:48:55 2022
@@ -545,7 +545,7 @@ public class WebcrawlerConnector extends
while (index < list.size())
{
String urlCandidate = (String)list.get(index++);
- String documentIdentifier = makeDocumentIdentifier(null,urlCandidate,filter);
+ String documentIdentifier = makeDocumentIdentifier(null,urlCandidate,filter, activities);
if (documentIdentifier == null)
{
// Bad seed. Log it, and continue!
@@ -616,7 +616,7 @@ public class WebcrawlerConnector extends
for (String documentIdentifier : documentIdentifiers)
{
// Verify that the url is legal
- if (!filter.isDocumentAndHostLegal(documentIdentifier))
+ if (!filter.isDocumentAndHostLegal(documentIdentifier,activities))
{
if (Logging.connectors.isDebugEnabled())
Logging.connectors.debug("WEB: Removing url '"+documentIdentifier+"' because it's not in the set of allowed ones");
@@ -1320,7 +1320,7 @@ public class WebcrawlerConnector extends
return;
}
- String ingestURL = filter.isDocumentIndexable(documentIdentifier);
+ String ingestURL = filter.isDocumentIndexable(documentIdentifier,activities);
if (ingestURL == null)
{
if (Logging.connectors.isDebugEnabled())
@@ -3400,7 +3400,7 @@ public class WebcrawlerConnector extends
*@param filter the filter object, used to remove unmatching URLs.
*@return the canonical URL (the document identifier), or null if the url was illegal.
*/
- protected String makeDocumentIdentifier(String parentIdentifier, String rawURL, DocumentURLFilter filter)
+ protected String makeDocumentIdentifier(String parentIdentifier, String rawURL, DocumentURLFilter filter, IHistoryActivity activities)
throws ManifoldCFException
{
try
@@ -3462,7 +3462,7 @@ public class WebcrawlerConnector extends
}
// Check to be sure the canonicalized URL is in fact one of the ones we want to include
- if (!filter.isDocumentLegal(id))
+ if (!filter.isDocumentLegal(id, activities))
return null;
return id;
@@ -3897,7 +3897,7 @@ public class WebcrawlerConnector extends
public void noteDiscoveredBase(String rawURL)
throws ManifoldCFException
{
- String newIdentifier = makeDocumentIdentifier(baseDocumentIdentifier,rawURL,filter);
+ String newIdentifier = makeDocumentIdentifier(baseDocumentIdentifier,rawURL,filter,activities);
if (newIdentifier != null)
baseDocumentIdentifier = newIdentifier;
}
@@ -3909,7 +3909,7 @@ public class WebcrawlerConnector extends
public void noteDiscoveredLink(String rawURL)
throws ManifoldCFException
{
- String newIdentifier = makeDocumentIdentifier(baseDocumentIdentifier,rawURL,filter);
+ String newIdentifier = makeDocumentIdentifier(baseDocumentIdentifier,rawURL,filter,activities);
if (newIdentifier != null)
{
if (Logging.connectors.isDebugEnabled())
@@ -5908,10 +5908,11 @@ public class WebcrawlerConnector extends
}
/** Check if both a document and host are legal.
+ * @throws ManifoldCFException if recording the exclusion activity for the url fails
*/
- public boolean isDocumentAndHostLegal(String url)
+ public boolean isDocumentAndHostLegal(String url, IHistoryActivity activities) throws ManifoldCFException
{
- if (!isDocumentLegal(url))
+ if (!isDocumentLegal(url, activities))
return false;
if (seedHosts == null)
return true;
@@ -5944,8 +5945,9 @@ public class WebcrawlerConnector extends
}
/** Check if the document identifier is legal.
+ * @throws ManifoldCFException if recording the exclusion activity for the url fails
*/
- public boolean isDocumentLegal(String url)
+ public boolean isDocumentLegal(String url, IHistoryActivity activities) throws ManifoldCFException
{
// First, verify that the url matches one of the patterns in the include list.
int i = 0;
@@ -5961,6 +5963,7 @@ public class WebcrawlerConnector extends
{
if (Logging.connectors.isDebugEnabled())
Logging.connectors.debug("WEB: Url '"+url+"' is illegal because no include patterns match it");
+ activities.recordActivity(System.currentTimeMillis(), ACTIVITY_FETCH, null, url, "EXCLUDED", "URL has been excluded as it does not match any include filter", null);
return false;
}
@@ -5974,6 +5977,7 @@ public class WebcrawlerConnector extends
{
if (Logging.connectors.isDebugEnabled())
Logging.connectors.debug("WEB: Url '"+url+"' is illegal because exclude pattern '"+p.toString()+"' matched it");
+ activities.recordActivity(System.currentTimeMillis(), ACTIVITY_FETCH, null, url, "EXCLUDED", "URL has been excluded as the exclude pattern " + p.toString() + " matched it", null);
return false;
}
i++;
@@ -5985,7 +5989,7 @@ public class WebcrawlerConnector extends
/** Check if the document identifier is indexable, and return the indexing URL if found.
* @return null if the url doesn't match or should not be ingested, or the new string if it does.
*/
- public String isDocumentIndexable(String url)
+ public String isDocumentIndexable(String url, IProcessActivity activities)
throws ManifoldCFException
{
// First, verify that the url matches one of the patterns in the include list.
@@ -6002,6 +6006,7 @@ public class WebcrawlerConnector extends
{
if (Logging.connectors.isDebugEnabled())
Logging.connectors.debug("WEB: Url '"+url+"' is not indexable because no include patterns match it");
+ activities.recordActivity(System.currentTimeMillis(), ACTIVITY_FETCH, null, url, "EXCLUDED", "URL has been excluded as it does not match any include filter", null);
return null;
}
@@ -6015,6 +6020,7 @@ public class WebcrawlerConnector extends
{
if (Logging.connectors.isDebugEnabled())
Logging.connectors.debug("WEB: Url '"+url+"' is not indexable because exclude pattern '"+p.toString()+"' matched it");
+ activities.recordActivity(System.currentTimeMillis(), ACTIVITY_FETCH, null, url, "EXCLUDED", "URL has been excluded as the exclude pattern " + p.toString() + " matched it", null);
return null;
}
i++;
@@ -6025,6 +6031,7 @@ public class WebcrawlerConnector extends
{
if (Logging.connectors.isDebugEnabled())
Logging.connectors.debug("WEB: Url '"+url+"' is not indexable because it did not match a mapping rule");
+ activities.recordActivity(System.currentTimeMillis(), ACTIVITY_FETCH, null, url, "EXCLUDED", "URL has been excluded because it did not match a mapping rule", null);
}
return rval;