You are viewing a plain text version of this content; the canonical hyperlink was removed during plain-text conversion.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2012/06/18 20:24:06 UTC
svn commit: r1351447 - in /manifoldcf/trunk: ./
connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/
Author: kwright
Date: Mon Jun 18 18:24:05 2012
New Revision: 1351447
URL: http://svn.apache.org/viewvc?rev=1351447&view=rev
Log:
Fix for CONNECTORS-482. Include a portion of the response data when an HTTP error code is returned.
Modified:
manifoldcf/trunk/CHANGES.txt
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IThrottledConnection.java
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1351447&r1=1351446&r2=1351447&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Mon Jun 18 18:24:05 2012
@@ -3,6 +3,10 @@ $Id$
======================= 0.6-dev =====================
+CONNECTORS-482: Need to include at least a portion of the HTTP
+body in history message whenever a non-200 HTTP code comes back.
+(Karl Wright)
+
CONNECTORS-481: Documentation of API is wrong in a few spots.
(Adrian Conlon, Karl Wright)
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IThrottledConnection.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IThrottledConnection.java?rev=1351447&r1=1351446&r2=1351447&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IThrottledConnection.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IThrottledConnection.java Mon Jun 18 18:24:05 2012
@@ -98,6 +98,11 @@ public interface IThrottledConnection
public InputStream getResponseBodyStream()
throws ManifoldCFException, ServiceInterruption;
+ /** Get limited response as a string.
+ */
+ public String getLimitedResponseBody(int maxSize, String encoding)
+ throws ManifoldCFException, ServiceInterruption;
+
/** Note that the connection fetch was interrupted by something.
*/
public void noteInterrupted(Throwable e);
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java?rev=1351447&r1=1351446&r2=1351447&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java Mon Jun 18 18:24:05 2012
@@ -1726,6 +1726,53 @@ public class ThrottledFetcher
}
}
+ /** Get limited response as a string.
+ */
+ public String getLimitedResponseBody(int maxSize, String encoding)
+ throws ManifoldCFException, ServiceInterruption
+ {
+ try
+ {
+ InputStream is = getResponseBodyStream();
+ try
+ {
+ Reader r = new InputStreamReader(is,encoding);
+ char[] buffer = new char[maxSize];
+ int amt = r.read(buffer);
+ return new String(buffer,0,amt);
+ }
+ finally
+ {
+ is.close();
+ }
+ }
+ catch (java.net.SocketTimeoutException e)
+ {
+ Logging.connectors.debug("Web: Socket timeout exception reading response stream for '"+myUrl+"', retrying");
+ throw new ServiceInterruption("Socket timeout exception reading response stream: "+e.getMessage(),e,System.currentTimeMillis()+TIME_5MIN,-1L,2,false);
+ }
+ catch (org.apache.commons.httpclient.ConnectTimeoutException e)
+ {
+ Logging.connectors.debug("Web: Connect timeout exception reading response stream for '"+myUrl+"', retrying");
+ throw new ServiceInterruption("Connect timeout exception reading response stream: "+e.getMessage(),e,System.currentTimeMillis()+TIME_5MIN,-1L,2,false);
+ }
+ catch (InterruptedIOException e)
+ {
+ //Logging.connectors.warn("IO interruption seen: "+e.getMessage(),e);
+ throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+ }
+ catch (IOException e)
+ {
+ Logging.connectors.debug("Web: IO exception reading response stream for '"+myUrl+"', retrying");
+ throw new ServiceInterruption("IO exception reading response stream: "+e.getMessage(),e,System.currentTimeMillis()+TIME_5MIN,-1L,2,false);
+ }
+ catch (IllegalStateException e)
+ {
+ Logging.connectors.debug("Web: State error reading response body for '"+myUrl+"', retrying");
+ throw new ServiceInterruption("State error reading response body: "+e.getMessage(),e,TIME_5MIN,-1L,2,false);
+ }
+ }
+
/** Note that the connection fetch was interrupted by something.
*/
public void noteInterrupted(Throwable e)
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1351447&r1=1351446&r2=1351447&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Mon Jun 18 18:24:05 2012
@@ -696,16 +696,7 @@ public class WebcrawlerConnector extends
// Basically, we want to get rid of everything that we (a) don't know what
// to do with in the ingestion system, and (b) we can't get useful links from.
- String contentType = connection.getResponseHeader("Content-Type");
- // Some sites have multiple content types. We just look at the LAST one in that case.
- if (contentType != null)
- {
- String[] contentTypes = contentType.split(",");
- if (contentTypes.length > 0)
- contentType = contentTypes[contentTypes.length-1].trim();
- else
- contentType = null;
- }
+ String contentType = extractContentType(connection.getResponseHeader("Content-Type"));
if (isContentInteresting(activities,currentURI,response,contentType))
{
@@ -719,7 +710,7 @@ public class WebcrawlerConnector extends
{
contextMessage = "it had the wrong content type";
resultSignal = RESULT_NO_DOCUMENT;
- activityResultCode = null;
+ activityResultCode = "-13";//null;
}
}
else
@@ -728,9 +719,31 @@ public class WebcrawlerConnector extends
// We don't want to remove it from the queue entirely, because that would cause us to lose track of the item, and therefore lose
// control of all scheduling around it. Instead, we leave it on the queue and give it an empty version string; that will lead it to be
// reprocessed without fail on the next scheduled check.
- contextMessage = "it failed to fetch (status="+Integer.toString(response)+")";
+ // Decode response body to the extent we can
+ String contentType = extractContentType(connection.getResponseHeader("Content-Type"));
+ String encoding = extractEncoding(contentType);
+ if (encoding == null)
+ encoding = "utf-8";
+ String decodedResponse = "undecodable";
+ try
+ {
+ decodedResponse = "'"+connection.getLimitedResponseBody(1024,encoding)+"'";
+ }
+ catch (ManifoldCFException e)
+ {
+ // Eat this exception unless it is an interrupt
+ if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
+ throw e;
+ connection.noteInterrupted(e);
+ }
+ catch (ServiceInterruption e)
+ {
+ // Eat this exception too
+ connection.noteInterrupted(e);
+ }
+ contextMessage = "it failed to fetch (status="+Integer.toString(response)+", message="+decodedResponse+")";
resultSignal = RESULT_NO_VERSION;
- activityResultCode = null;
+ activityResultCode = Integer.toString(response);//null;
}
}
catch (ManifoldCFException e)
@@ -1145,6 +1158,45 @@ public class WebcrawlerConnector extends
return rval;
}
+ protected static String extractContentType(String contentType)
+ {
+ // Some sites have multiple content types. We just look at the LAST one in that case.
+ if (contentType != null)
+ {
+ String[] contentTypes = contentType.split(",");
+ if (contentTypes.length > 0)
+ contentType = contentTypes[contentTypes.length-1].trim();
+ else
+ contentType = null;
+ }
+ return contentType;
+ }
+
+ protected static String extractEncoding(String contentType)
+ {
+ if (contentType == null)
+ return null;
+ int semiIndex = contentType.indexOf(";");
+ if (semiIndex == -1)
+ return null;
+ String suffix = contentType.substring(semiIndex+1);
+ suffix = suffix.trim();
+ if (suffix.startsWith("charset="))
+ return suffix.substring("charset=".length());
+ return null;
+ }
+
+ protected static String extractMimeType(String contentType)
+ {
+ if (contentType == null)
+ return null;
+ int semiIndex = contentType.indexOf(";");
+ if (semiIndex != -1)
+ contentType = contentType.substring(0,semiIndex);
+ contentType = contentType.trim();
+ return contentType;
+ }
+
/** Process a set of documents.
* This is the method that should cause each document to be fetched, processed, and the results either added
* to the queue of documents for the current job, and/or entered into the incremental ingestion manager.
@@ -6065,37 +6117,18 @@ public class WebcrawlerConnector extends
return;
// We ONLY look for XML if the content type *says* it is XML.
- String contentType = cache.getContentType(documentURI);
- // Some sites have multiple content types. We just look at the LAST one in that case.
- if (contentType != null)
- {
- String[] contentTypes = contentType.split(",");
- if (contentTypes.length > 0)
- contentType = contentTypes[contentTypes.length-1].trim();
- else
- contentType = null;
- }
- if (contentType == null)
- return;
-
- int semiIndex = contentType.indexOf(";");
- String suffix = null;
- if (semiIndex != -1)
- {
- suffix = contentType.substring(semiIndex+1);
- contentType = contentType.substring(0,semiIndex);
- }
- contentType = contentType.trim();
+ String contentType = extractContentType(cache.getContentType(documentURI));
+ String mimeType = extractMimeType(contentType);
boolean isXML =
- contentType.equals("text/xml") ||
- contentType.equals("application/rss+xml") ||
- contentType.equals("application/xml") ||
- contentType.equals("application/atom+xml") ||
- contentType.equals("application/xhtml+xml") ||
- contentType.equals("text/XML") ||
- contentType.equals("application/rdf+xml") ||
- contentType.equals("text/application") ||
- contentType.equals("XML");
+ mimeType.equals("text/xml") ||
+ mimeType.equals("application/rss+xml") ||
+ mimeType.equals("application/xml") ||
+ mimeType.equals("application/atom+xml") ||
+ mimeType.equals("application/xhtml+xml") ||
+ mimeType.equals("text/XML") ||
+ mimeType.equals("application/rdf+xml") ||
+ mimeType.equals("text/application") ||
+ mimeType.equals("XML");
if (!isXML)
return;
@@ -6103,13 +6136,9 @@ public class WebcrawlerConnector extends
// OK, it's XML. Now what? Well, we get the encoding, and we verify that it is text, then we try to get links
// from it presuming it is an RSS feed.
- String encoding = "utf-8";
- if (suffix != null)
- {
- suffix = suffix.trim();
- if (suffix.startsWith("charset="))
- encoding = suffix.substring("charset=".length());
- }
+ String encoding = extractEncoding(contentType);
+ if (encoding == null)
+ encoding = "utf-8";
InputStream is = cache.getData(documentURI);
if (is == null)
@@ -6835,31 +6864,11 @@ public class WebcrawlerConnector extends
return;
}
// Grab the content-type so we know how to decode.
- String encoding = "utf-8";
- String contentType = cache.getContentType(documentURI);
- // Some sites have multiple content types. We just look at the LAST one in that case.
- if (contentType != null)
- {
- String[] contentTypes = contentType.split(",");
- if (contentTypes.length > 0)
- contentType = contentTypes[contentTypes.length-1].trim();
- else
- contentType = null;
- }
-
- if (contentType != null)
- {
- int pos = contentType.indexOf(";");
- if (pos != -1)
- {
- contentType = contentType.substring(pos+1).trim();
- if (contentType.startsWith("charset="))
- {
- encoding = contentType.substring("charset=".length());
- }
- }
- }
-
+ String contentType = extractContentType(cache.getContentType(documentURI));
+ String encoding = extractEncoding(contentType);
+ if (encoding == null)
+ encoding = "utf-8";
+
// Search for A HREF tags in the document stream. This is brain-dead link location
InputStream is = cache.getData(documentURI);
if (is == null)