You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2011/08/23 01:56:04 UTC
svn commit: r1160504 - in /incubator/lcf/trunk: ./
connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/
Author: kwright
Date: Mon Aug 22 23:56:04 2011
New Revision: 1160504
URL: http://svn.apache.org/viewvc?rev=1160504&view=rev
Log:
Add ability for the web connector to transmit most response headers to the output connector, as metadata. CONNECTORS-243.
Modified:
incubator/lcf/trunk/CHANGES.txt
incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IThrottledConnection.java
incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java
incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
Modified: incubator/lcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/CHANGES.txt?rev=1160504&r1=1160503&r2=1160504&view=diff
==============================================================================
--- incubator/lcf/trunk/CHANGES.txt (original)
+++ incubator/lcf/trunk/CHANGES.txt Mon Aug 22 23:56:04 2011
@@ -3,6 +3,10 @@ $Id$
======================= 0.3-dev =========================
+CONNECTORS-243: Include most response headers in the metadata for
+each web connector document.
+(Jan Høydahl, Karl Wright)
+
CONNECTORS-224: Add OpenSearchServer output connector.
(Emmanuel Keller, Karl Wright)
Modified: incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IThrottledConnection.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IThrottledConnection.java?rev=1160504&r1=1160503&r2=1160504&view=diff
==============================================================================
--- incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IThrottledConnection.java (original)
+++ incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IThrottledConnection.java Mon Aug 22 23:56:04 2011
@@ -22,6 +22,7 @@ import org.apache.manifoldcf.core.interf
import org.apache.manifoldcf.agents.interfaces.*;
import org.apache.manifoldcf.crawler.interfaces.*;
import java.io.*;
+import java.util.*;
/** This interface represents an established connection to a URL.
*/
@@ -78,6 +79,12 @@ public interface IThrottledConnection
public LoginCookies getLastFetchCookies()
throws ManifoldCFException, ServiceInterruption;
+ /** Get all response headers, grouped by header name.
+ *@return a map keyed by header name; each value is the list of values for that header.
+ */
+ public Map<String,List<String>> getResponseHeaders()
+ throws ManifoldCFException, ServiceInterruption;
+
/** Get a specified response header, if it exists.
*@param headerName is the name of the header.
*@return the header value, or null if it doesn't exist.
Modified: incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java?rev=1160504&r1=1160503&r2=1160504&view=diff
==============================================================================
--- incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java (original)
+++ incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java Mon Aug 22 23:56:04 2011
@@ -1630,6 +1630,31 @@ public class ThrottledFetcher
return lastFetchCookies;
}
+ /** Get all response headers held by fetchMethod, grouped by header name.
+ *@return a map keyed by header name (exactly as Header.getName() returns it; no case folding is applied here) whose value lists that header's values in the order they appear in the header array.
+ */
+ public Map<String,List<String>> getResponseHeaders()
+ throws ManifoldCFException, ServiceInterruption
+ {
+ Header[] headers = fetchMethod.getResponseHeaders(); // NOTE(review): assumes fetchMethod is non-null at this point - confirm against callers
+ Map<String,List<String>> rval = new HashMap<String,List<String>>();
+ int i = 0;
+ while (i < headers.length)
+ {
+ Header h = headers[i++];
+ String name = h.getName();
+ String value = h.getValue();
+ List<String> values = rval.get(name); // A header name that repeats shares a single list
+ if (values == null)
+ {
+ values = new ArrayList<String>(); // First occurrence of this name: start its list
+ rval.put(name,values);
+ }
+ values.add(value);
+ }
+ return rval;
+ }
+
/** Get a specified response header, if it exists.
*@param headerName is the name of the header.
*@return the header value, or null if it doesn't exist.
Modified: incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1160504&r1=1160503&r2=1160504&view=diff
==============================================================================
--- incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Mon Aug 22 23:56:04 2011
@@ -141,6 +141,19 @@ public class WebcrawlerConnector extends
protected final static String FETCH_STANDARD = "URL";
protected final static String FETCH_LOGIN = "LOGIN";
+ // Reserved headers: lower-case names of response headers that are left out when header data is folded into the document's metadata/version string.
+ protected static Map<String,String> reservedHeaders;
+ static
+ {
+ reservedHeaders = new HashMap<String,String>(); // Used as a set: each value just mirrors its key
+ reservedHeaders.put("age","age");
+ reservedHeaders.put("www-authenticate","www-authenticate");
+ reservedHeaders.put("proxy-authenticate","proxy-authenticate");
+ reservedHeaders.put("date","date");
+ reservedHeaders.put("set-cookie","set-cookie");
+ reservedHeaders.put("via","via"); // Keys must stay lower-case; lookups lower-case the header name first
+ }
+
/** Robots usage flag */
protected int robotsUsage = ROBOTS_ALL;
/** The user-agent for this connector instance */
@@ -607,7 +620,9 @@ public class WebcrawlerConnector extends
Throwable contextException = null;
// The checksum, which will be needed if resultSignal is RESULT_VERSION_NEEDED.
String checkSum = null;
-
+ // The headers, which will be needed if resultSignal is RESULT_VERSION_NEEDED.
+ Map<String,List<String>> headerData = null;
+
while (true)
{
try
@@ -696,6 +711,7 @@ public class WebcrawlerConnector extends
{
// Treat it as real, and cache it.
checkSum = cache.addData(activities,currentURI,connection);
+ headerData = connection.getResponseHeaders();
resultSignal = RESULT_VERSION_NEEDED;
activityResultCode = null;
}
@@ -1052,8 +1068,43 @@ public class WebcrawlerConnector extends
else
sb.append('-');
- // Now, do the metadata
- packList(sb,metadata,'+');
+ // Now, do the metadata. This comes in two parts: first, the canned metadata, then the header data.
+ // They're all folded into the same part of the version string.
+ int headerCount = 0;
+ Iterator<String> headerIterator = headerData.keySet().iterator();
+ while (headerIterator.hasNext())
+ {
+ String headerName = headerIterator.next();
+ if (reservedHeaders.get(headerName.toLowerCase()) == null)
+ headerCount += headerData.get(headerName).size();
+ }
+ String[] fullMetadata = new String[metadata.length + headerCount];
+ headerCount = 0;
+ headerIterator = headerData.keySet().iterator();
+ while (headerIterator.hasNext())
+ {
+ String headerName = headerIterator.next();
+ if (reservedHeaders.get(headerName.toLowerCase()) == null)
+ {
+ List<String> headerValues = headerData.get(headerName);
+ for (String headerValue : headerValues)
+ {
+ fixedListStrings[0] = "header-"+headerName;
+ fixedListStrings[1] = headerValue;
+ StringBuilder newsb = new StringBuilder();
+ packFixedList(newsb,fixedListStrings,'=');
+ fullMetadata[headerCount++] = newsb.toString();
+ }
+ }
+ }
+ int index = 0;
+ while (index < metadata.length)
+ {
+ fullMetadata[headerCount++] = metadata[index++];
+ }
+ java.util.Arrays.sort(fullMetadata);
+
+ packList(sb,fullMetadata,'+');
// Done with the parseable part! Add the checksum.
sb.append(checkSum);
// Add the filter version