You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by jm...@apache.org on 2022/01/24 09:48:55 UTC
svn commit: r1897405 - /manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java

Author: jmssiera
Date: Mon Jan 24 09:48:55 2022
New Revision: 1897405

URL: http://svn.apache.org/viewvc?rev=1897405&view=rev
Log:
CONNECTORS-1665 WebConnector: Add activity records for excluded URLs

Modified:
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1897405&r1=1897404&r2=1897405&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Mon Jan 24 09:48:55 2022
@@ -545,7 +545,7 @@ public class WebcrawlerConnector extends
     while (index < list.size())
     {
       String urlCandidate = (String)list.get(index++);
-      String documentIdentifier = makeDocumentIdentifier(null,urlCandidate,filter);
+      String documentIdentifier = makeDocumentIdentifier(null,urlCandidate,filter, activities);
       if (documentIdentifier == null)
       {
         // Bad seed.  Log it, and continue!
@@ -616,7 +616,7 @@ public class WebcrawlerConnector extends
     for (String documentIdentifier : documentIdentifiers)
     {
       // Verify that the url is legal
-      if (!filter.isDocumentAndHostLegal(documentIdentifier))
+      if (!filter.isDocumentAndHostLegal(documentIdentifier,activities))
       {
         if (Logging.connectors.isDebugEnabled())
           Logging.connectors.debug("WEB: Removing url '"+documentIdentifier+"' because it's not in the set of allowed ones");
@@ -1320,7 +1320,7 @@ public class WebcrawlerConnector extends
         return;
       }
 
-      String ingestURL = filter.isDocumentIndexable(documentIdentifier);
+      String ingestURL = filter.isDocumentIndexable(documentIdentifier,activities);
       if (ingestURL == null)
       {
         if (Logging.connectors.isDebugEnabled())
@@ -3400,7 +3400,7 @@ public class WebcrawlerConnector extends
   *@param filter the filter object, used to remove unmatching URLs.
   *@return the canonical URL (the document identifier), or null if the url was illegal.
   */
-  protected String makeDocumentIdentifier(String parentIdentifier, String rawURL, DocumentURLFilter filter)
+  protected String makeDocumentIdentifier(String parentIdentifier, String rawURL, DocumentURLFilter filter, IHistoryActivity activities)
     throws ManifoldCFException
   {
     try
@@ -3462,7 +3462,7 @@ public class WebcrawlerConnector extends
       }
 
       // Check to be sure the canonicalized URL is in fact one of the ones we want to include
-      if (!filter.isDocumentLegal(id))
+      if (!filter.isDocumentLegal(id, activities))
         return null;
 
       return id;
@@ -3897,7 +3897,7 @@ public class WebcrawlerConnector extends
     public void noteDiscoveredBase(String rawURL)
       throws ManifoldCFException
     {
-      String newIdentifier = makeDocumentIdentifier(baseDocumentIdentifier,rawURL,filter);
+      String newIdentifier = makeDocumentIdentifier(baseDocumentIdentifier,rawURL,filter,activities);
       if (newIdentifier != null)
         baseDocumentIdentifier = newIdentifier;
     }
@@ -3909,7 +3909,7 @@ public class WebcrawlerConnector extends
     public void noteDiscoveredLink(String rawURL)
       throws ManifoldCFException
     {
-      String newIdentifier = makeDocumentIdentifier(baseDocumentIdentifier,rawURL,filter);
+      String newIdentifier = makeDocumentIdentifier(baseDocumentIdentifier,rawURL,filter,activities);
       if (newIdentifier != null)
       {
         if (Logging.connectors.isDebugEnabled())
@@ -5908,10 +5908,11 @@ public class WebcrawlerConnector extends
     }
     
     /** Check if both a document and host are legal.
+     * @throws ManifoldCFException if the nested isDocumentLegal check throws it.
     */
-    public boolean isDocumentAndHostLegal(String url)
+    public boolean isDocumentAndHostLegal(String url, IHistoryActivity activities) throws ManifoldCFException
     {
-      if (!isDocumentLegal(url))
+      if (!isDocumentLegal(url, activities))
         return false;
       if (seedHosts == null)
         return true;
@@ -5944,8 +5945,9 @@ public class WebcrawlerConnector extends
     }
     
     /** Check if the document identifier is legal.
+     * @throws ManifoldCFException if recording the "EXCLUDED" activity for a rejected url fails.
     */
-    public boolean isDocumentLegal(String url)
+    public boolean isDocumentLegal(String url, IHistoryActivity activities) throws ManifoldCFException
     {
       // First, verify that the url matches one of the patterns in the include list.
       int i = 0;
@@ -5961,6 +5963,7 @@ public class WebcrawlerConnector extends
       {
         if (Logging.connectors.isDebugEnabled())
           Logging.connectors.debug("WEB: Url '"+url+"' is illegal because no include patterns match it");
+        activities.recordActivity(System.currentTimeMillis(), ACTIVITY_FETCH, null, url, "EXCLUDED", "URL has been excluded as it does not match any include filter", null);
         return false;
       }
 
@@ -5974,6 +5977,7 @@ public class WebcrawlerConnector extends
         {
           if (Logging.connectors.isDebugEnabled())
             Logging.connectors.debug("WEB: Url '"+url+"' is illegal because exclude pattern '"+p.toString()+"' matched it");
+          activities.recordActivity(System.currentTimeMillis(), ACTIVITY_FETCH, null, url, "EXCLUDED", "URL has been excluded as the exclude pattern " + p.toString() + " matched it", null);
           return false;
         }
         i++;
@@ -5985,7 +5989,7 @@ public class WebcrawlerConnector extends
     /** Check if the document identifier is indexable, and return the indexing URL if found.
     * @return null if the url doesn't match or should not be ingested, or the new string if it does.
     */
-    public String isDocumentIndexable(String url)
+    public String isDocumentIndexable(String url, IProcessActivity activities)
       throws ManifoldCFException
     {
       // First, verify that the url matches one of the patterns in the include list.
@@ -6002,6 +6006,7 @@ public class WebcrawlerConnector extends
       {
         if (Logging.connectors.isDebugEnabled())
           Logging.connectors.debug("WEB: Url '"+url+"' is not indexable because no include patterns match it");
+        activities.recordActivity(System.currentTimeMillis(), ACTIVITY_FETCH, null, url, "EXCLUDED", "URL has been excluded as it does not match any include filter", null);
         return null;
       }
 
@@ -6015,6 +6020,7 @@ public class WebcrawlerConnector extends
         {
           if (Logging.connectors.isDebugEnabled())
             Logging.connectors.debug("WEB: Url '"+url+"' is not indexable because exclude pattern '"+p.toString()+"' matched it");
+          activities.recordActivity(System.currentTimeMillis(), ACTIVITY_FETCH, null, url, "EXCLUDED", "URL has been excluded as the exclude pattern " + p.toString() + " matched it", null);
           return null;
         }
         i++;
@@ -6025,6 +6031,7 @@ public class WebcrawlerConnector extends
       {
         if (Logging.connectors.isDebugEnabled())
           Logging.connectors.debug("WEB: Url '"+url+"' is not indexable because it did not match a mapping rule");
+        activities.recordActivity(System.currentTimeMillis(), ACTIVITY_FETCH, null, url, "EXCLUDED", "URL has been excluded because it did not match a mapping rule", null);
       }
 
       return rval;