You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2011/08/23 01:56:04 UTC

svn commit: r1160504 - in /incubator/lcf/trunk: ./ connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/

Author: kwright
Date: Mon Aug 22 23:56:04 2011
New Revision: 1160504

URL: http://svn.apache.org/viewvc?rev=1160504&view=rev
Log:
Add ability for the web connector to transmit most response headers to the output connector, as metadata.  CONNECTORS-243.

Modified:
    incubator/lcf/trunk/CHANGES.txt
    incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IThrottledConnection.java
    incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java
    incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java

Modified: incubator/lcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/CHANGES.txt?rev=1160504&r1=1160503&r2=1160504&view=diff
==============================================================================
--- incubator/lcf/trunk/CHANGES.txt (original)
+++ incubator/lcf/trunk/CHANGES.txt Mon Aug 22 23:56:04 2011
@@ -3,6 +3,10 @@ $Id$
 
 ======================= 0.3-dev =========================
 
+CONNECTORS-243: Include most response headers in the metadata for
+each web connector document.
+(Jan Høydahl, Karl Wright)
+
 CONNECTORS-224: Add OpenSearchServer output connector.
 (Emmanuel Keller, Karl Wright)
 

Modified: incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IThrottledConnection.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IThrottledConnection.java?rev=1160504&r1=1160503&r2=1160504&view=diff
==============================================================================
--- incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IThrottledConnection.java (original)
+++ incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IThrottledConnection.java Mon Aug 22 23:56:04 2011
@@ -22,6 +22,7 @@ import org.apache.manifoldcf.core.interf
 import org.apache.manifoldcf.agents.interfaces.*;
 import org.apache.manifoldcf.crawler.interfaces.*;
 import java.io.*;
+import java.util.*;
 
 /** This interface represents an established connection to a URL.
 */
@@ -78,6 +79,12 @@ public interface IThrottledConnection
   public LoginCookies getLastFetchCookies()
     throws ManifoldCFException, ServiceInterruption;
 
+  /** Get all of the response headers.
+  *@return a map keyed by header name; each map value is the list of values for that header name.
+  */
+  public Map<String,List<String>> getResponseHeaders()
+    throws ManifoldCFException, ServiceInterruption;
+    
   /** Get a specified response header, if it exists.
   *@param headerName is the name of the header.
   *@return the header value, or null if it doesn't exist.

Modified: incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java?rev=1160504&r1=1160503&r2=1160504&view=diff
==============================================================================
--- incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java (original)
+++ incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java Mon Aug 22 23:56:04 2011
@@ -1630,6 +1630,31 @@ public class ThrottledFetcher
       return lastFetchCookies;
     }
 
+    /** Get all of the response headers from fetchMethod, grouped by header name.
+    *@return a map keyed by header name (exactly as Header.getName() returns it); each map value lists that name's values.
+    */
+    public Map<String,List<String>> getResponseHeaders()
+      throws ManifoldCFException, ServiceInterruption
+    {
+      Header[] headers = fetchMethod.getResponseHeaders();
+      Map<String,List<String>> rval = new HashMap<String,List<String>>();
+      int i = 0;
+      while (i < headers.length)
+      {
+        Header h = headers[i++];
+        String name = h.getName();
+        String value = h.getValue();
+        List<String> values = rval.get(name); // String keys: names differing only in case get separate entries
+        if (values == null)
+        {
+          values = new ArrayList<String>(); // first value seen for this header name
+          rval.put(name,values);
+        }
+        values.add(value); // a repeated header accumulates its values in array order
+      }
+      return rval;
+    }
+
     /** Get a specified response header, if it exists.
     *@param headerName is the name of the header.
     *@return the header value, or null if it doesn't exist.

Modified: incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1160504&r1=1160503&r2=1160504&view=diff
==============================================================================
--- incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Mon Aug 22 23:56:04 2011
@@ -141,6 +141,19 @@ public class WebcrawlerConnector extends
   protected final static String FETCH_STANDARD = "URL";
   protected final static String FETCH_LOGIN = "LOGIN";
 
+  // Reserved headers: lower-case names of response headers that are left out of the header metadata.
+  protected static Map<String,String> reservedHeaders; // each name maps to itself; lookups only test for null
+  static
+  {
+    reservedHeaders = new HashMap<String,String>();
+    reservedHeaders.put("age","age");
+    reservedHeaders.put("www-authenticate","www-authenticate");
+    reservedHeaders.put("proxy-authenticate","proxy-authenticate");
+    reservedHeaders.put("date","date");
+    reservedHeaders.put("set-cookie","set-cookie");
+    reservedHeaders.put("via","via");
+  }
+  
   /** Robots usage flag */
   protected int robotsUsage = ROBOTS_ALL;
   /** The user-agent for this connector instance */
@@ -607,7 +620,9 @@ public class WebcrawlerConnector extends
           Throwable contextException = null;
           // The checksum, which will be needed if resultSignal is RESULT_VERSION_NEEDED.
           String checkSum = null;
-
+          // The headers, which will be needed if resultSignal is RESULT_VERSION_NEEDED.
+          Map<String,List<String>> headerData = null;
+          
           while (true)
           {
             try
@@ -696,6 +711,7 @@ public class WebcrawlerConnector extends
                         {
                           // Treat it as real, and cache it.
                           checkSum = cache.addData(activities,currentURI,connection);
+                          headerData = connection.getResponseHeaders();
                           resultSignal = RESULT_VERSION_NEEDED;
                           activityResultCode = null;
                         }
@@ -1052,8 +1068,43 @@ public class WebcrawlerConnector extends
             else
               sb.append('-');
 
-            // Now, do the metadata
-            packList(sb,metadata,'+');
+            // Now, do the metadata.  This comes in two parts: first, the canned metadata, then the header data.
+            // They're all folded into the same part of the version string.
+            int headerCount = 0;
+            Iterator<String> headerIterator = headerData.keySet().iterator();
+            while (headerIterator.hasNext())
+            {
+              String headerName = headerIterator.next();
+              if (reservedHeaders.get(headerName.toLowerCase()) == null)
+                headerCount += headerData.get(headerName).size();
+            }
+            String[] fullMetadata = new String[metadata.length + headerCount];
+            headerCount = 0;
+            headerIterator = headerData.keySet().iterator();
+            while (headerIterator.hasNext())
+            {
+              String headerName = headerIterator.next();
+              if (reservedHeaders.get(headerName.toLowerCase()) == null)
+              {
+                List<String> headerValues = headerData.get(headerName);
+                for (String headerValue : headerValues)
+                {
+                  fixedListStrings[0] = "header-"+headerName;
+                  fixedListStrings[1] = headerValue;
+                  StringBuilder newsb = new StringBuilder();
+                  packFixedList(newsb,fixedListStrings,'=');
+                  fullMetadata[headerCount++] = newsb.toString();
+                }
+              }
+            }
+            int index = 0;
+            while (index < metadata.length)
+            {
+              fullMetadata[headerCount++] = metadata[index++];
+            }
+            java.util.Arrays.sort(fullMetadata);
+            
+            packList(sb,fullMetadata,'+');
             // Done with the parseable part!  Add the checksum.
             sb.append(checkSum);
             // Add the filter version