You are viewing a plain text version of this content. The canonical (HTML) version, including the original hyperlink, is available from the Apache mailing list archives.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2012/06/18 20:24:06 UTC

svn commit: r1351447 - in /manifoldcf/trunk: ./ connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/

Author: kwright
Date: Mon Jun 18 18:24:05 2012
New Revision: 1351447

URL: http://svn.apache.org/viewvc?rev=1351447&view=rev
Log:
Fix for CONNECTORS-482.  Include a portion of the response data when an HTTP error code is returned.

Modified:
    manifoldcf/trunk/CHANGES.txt
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IThrottledConnection.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java

Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1351447&r1=1351446&r2=1351447&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Mon Jun 18 18:24:05 2012
@@ -3,6 +3,10 @@ $Id$
 
 ======================= 0.6-dev =====================
 
+CONNECTORS-482: Need to include at least a portion of the HTTP
+body in history message whenever a non-200 HTTP code comes back.
+(Karl Wright)
+
 CONNECTORS-481: Documentation of API is wrong in a few spots.
 (Adrian Conlon, Karl Wright)
 

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IThrottledConnection.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IThrottledConnection.java?rev=1351447&r1=1351446&r2=1351447&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IThrottledConnection.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IThrottledConnection.java Mon Jun 18 18:24:05 2012
@@ -98,6 +98,11 @@ public interface IThrottledConnection
   public InputStream getResponseBodyStream()
     throws ManifoldCFException, ServiceInterruption;
 
+  /** Get limited response as a string.
+  */
+  public String getLimitedResponseBody(int maxSize, String encoding)
+    throws ManifoldCFException, ServiceInterruption;
+
   /** Note that the connection fetch was interrupted by something.
   */
   public void noteInterrupted(Throwable e);

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java?rev=1351447&r1=1351446&r2=1351447&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java Mon Jun 18 18:24:05 2012
@@ -1726,6 +1726,53 @@ public class ThrottledFetcher
       }
     }
 
+    /** Get limited response as a string.
+    */
+    public String getLimitedResponseBody(int maxSize, String encoding)
+      throws ManifoldCFException, ServiceInterruption
+    {
+      try
+      {
+        InputStream is = getResponseBodyStream();
+        try
+        {
+          Reader r = new InputStreamReader(is,encoding);
+          char[] buffer = new char[maxSize];
+          int amt = r.read(buffer);
+          return new String(buffer,0,amt);
+        }
+        finally
+        {
+          is.close();
+        }
+      }
+      catch (java.net.SocketTimeoutException e)
+      {
+        Logging.connectors.debug("Web: Socket timeout exception reading response stream for '"+myUrl+"', retrying");
+        throw new ServiceInterruption("Socket timeout exception reading response stream: "+e.getMessage(),e,System.currentTimeMillis()+TIME_5MIN,-1L,2,false);
+      }
+      catch (org.apache.commons.httpclient.ConnectTimeoutException e)
+      {
+        Logging.connectors.debug("Web: Connect timeout exception reading response stream for '"+myUrl+"', retrying");
+        throw new ServiceInterruption("Connect timeout exception reading response stream: "+e.getMessage(),e,System.currentTimeMillis()+TIME_5MIN,-1L,2,false);
+      }
+      catch (InterruptedIOException e)
+      {
+        //Logging.connectors.warn("IO interruption seen: "+e.getMessage(),e);
+        throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+      }
+      catch (IOException e)
+      {
+        Logging.connectors.debug("Web: IO exception reading response stream for '"+myUrl+"', retrying");
+        throw new ServiceInterruption("IO exception reading response stream: "+e.getMessage(),e,System.currentTimeMillis()+TIME_5MIN,-1L,2,false);
+      }
+      catch (IllegalStateException e)
+      {
+        Logging.connectors.debug("Web: State error reading response body for '"+myUrl+"', retrying");
+        throw new ServiceInterruption("State error reading response body: "+e.getMessage(),e,TIME_5MIN,-1L,2,false);
+      }
+    }
+
     /** Note that the connection fetch was interrupted by something.
     */
     public void noteInterrupted(Throwable e)

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1351447&r1=1351446&r2=1351447&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Mon Jun 18 18:24:05 2012
@@ -696,16 +696,7 @@ public class WebcrawlerConnector extends
                         // Basically, we want to get rid of everything that we (a) don't know what
                         // to do with in the ingestion system, and (b) we can't get useful links from.
 
-                        String contentType = connection.getResponseHeader("Content-Type");
-                        // Some sites have multiple content types.  We just look at the LAST one in that case.
-                        if (contentType != null)
-                        {
-                          String[] contentTypes = contentType.split(",");
-                          if (contentTypes.length > 0)
-                            contentType = contentTypes[contentTypes.length-1].trim();
-                          else
-                            contentType = null;
-                        }
+                        String contentType = extractContentType(connection.getResponseHeader("Content-Type"));
 
                         if (isContentInteresting(activities,currentURI,response,contentType))
                         {
@@ -719,7 +710,7 @@ public class WebcrawlerConnector extends
                         {
                           contextMessage = "it had the wrong content type";
                           resultSignal = RESULT_NO_DOCUMENT;
-                          activityResultCode = null;
+                          activityResultCode = "-13";//null;
                         }
                       }
                       else
@@ -728,9 +719,31 @@ public class WebcrawlerConnector extends
                         // We don't want to remove it from the queue entirely, because that would cause us to lose track of the item, and therefore lose
                         // control of all scheduling around it.  Instead, we leave it on the queue and give it an empty version string; that will lead it to be
                         // reprocessed without fail on the next scheduled check.
-                        contextMessage = "it failed to fetch (status="+Integer.toString(response)+")";
+                        // Decode response body to the extent we can
+                        String contentType = extractContentType(connection.getResponseHeader("Content-Type"));
+                        String encoding = extractEncoding(contentType);
+                        if (encoding == null)
+                          encoding = "utf-8";
+                        String decodedResponse = "undecodable";
+                        try
+                        {
+                          decodedResponse = "'"+connection.getLimitedResponseBody(1024,encoding)+"'";
+                        }
+                        catch (ManifoldCFException e)
+                        {
+                          // Eat this exception unless it is an interrupt
+                          if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
+                            throw e;
+                          connection.noteInterrupted(e);
+                        }
+                        catch (ServiceInterruption e)
+                        {
+                          // Eat this exception too
+                          connection.noteInterrupted(e);
+                        }
+                        contextMessage = "it failed to fetch (status="+Integer.toString(response)+", message="+decodedResponse+")";
                         resultSignal = RESULT_NO_VERSION;
-                        activityResultCode = null;
+                        activityResultCode = Integer.toString(response);//null;
                       }
                     }
                     catch (ManifoldCFException e)
@@ -1145,6 +1158,45 @@ public class WebcrawlerConnector extends
     return rval;
   }
 
+  protected static String extractContentType(String contentType)
+  {
+    // Some sites have multiple content types.  We just look at the LAST one in that case.
+    if (contentType != null)
+    {
+      String[] contentTypes = contentType.split(",");
+      if (contentTypes.length > 0)
+        contentType = contentTypes[contentTypes.length-1].trim();
+      else
+        contentType = null;
+    }
+    return contentType;
+  }
+
+  protected static String extractEncoding(String contentType)
+  {
+    if (contentType == null)
+      return null;
+    int semiIndex = contentType.indexOf(";");
+    if (semiIndex == -1)
+      return null;
+    String suffix = contentType.substring(semiIndex+1);
+    suffix = suffix.trim();
+    if (suffix.startsWith("charset="))
+      return suffix.substring("charset=".length());
+    return null;
+  }
+  
+  protected static String extractMimeType(String contentType)
+  {
+    if (contentType == null)
+      return null;
+    int semiIndex = contentType.indexOf(";");
+    if (semiIndex != -1)
+      contentType = contentType.substring(0,semiIndex);
+    contentType = contentType.trim();
+    return contentType;
+  }
+  
   /** Process a set of documents.
   * This is the method that should cause each document to be fetched, processed, and the results either added
   * to the queue of documents for the current job, and/or entered into the incremental ingestion manager.
@@ -6065,37 +6117,18 @@ public class WebcrawlerConnector extends
         return;
 
       // We ONLY look for XML if the content type *says* it is XML.
-      String contentType = cache.getContentType(documentURI);
-      // Some sites have multiple content types.  We just look at the LAST one in that case.
-      if (contentType != null)
-      {
-        String[] contentTypes = contentType.split(",");
-        if (contentTypes.length > 0)
-          contentType = contentTypes[contentTypes.length-1].trim();
-        else
-          contentType = null;
-      }
-      if (contentType == null)
-        return;
-
-      int semiIndex = contentType.indexOf(";");
-      String suffix = null;
-      if (semiIndex != -1)
-      {
-        suffix = contentType.substring(semiIndex+1);
-        contentType = contentType.substring(0,semiIndex);
-      }
-      contentType = contentType.trim();
+      String contentType = extractContentType(cache.getContentType(documentURI));
+      String mimeType = extractMimeType(contentType);
       boolean isXML =
-        contentType.equals("text/xml") ||
-        contentType.equals("application/rss+xml") ||
-        contentType.equals("application/xml") ||
-        contentType.equals("application/atom+xml") ||
-        contentType.equals("application/xhtml+xml") ||
-        contentType.equals("text/XML") ||
-        contentType.equals("application/rdf+xml") ||
-        contentType.equals("text/application") ||
-        contentType.equals("XML");
+        mimeType.equals("text/xml") ||
+        mimeType.equals("application/rss+xml") ||
+        mimeType.equals("application/xml") ||
+        mimeType.equals("application/atom+xml") ||
+        mimeType.equals("application/xhtml+xml") ||
+        mimeType.equals("text/XML") ||
+        mimeType.equals("application/rdf+xml") ||
+        mimeType.equals("text/application") ||
+        mimeType.equals("XML");
 
       if (!isXML)
         return;
@@ -6103,13 +6136,9 @@ public class WebcrawlerConnector extends
       // OK, it's XML.  Now what?  Well, we get the encoding, and we verify that it is text, then we try to get links
       // from it presuming it is an RSS feed.
 
-      String encoding = "utf-8";
-      if (suffix != null)
-      {
-        suffix = suffix.trim();
-        if (suffix.startsWith("charset="))
-          encoding = suffix.substring("charset=".length());
-      }
+      String encoding = extractEncoding(contentType);
+      if (encoding == null)
+        encoding = "utf-8";
 
       InputStream is = cache.getData(documentURI);
       if (is == null)
@@ -6835,31 +6864,11 @@ public class WebcrawlerConnector extends
         return;
       }
       // Grab the content-type so we know how to decode.
-      String encoding = "utf-8";
-      String contentType = cache.getContentType(documentURI);
-      // Some sites have multiple content types.  We just look at the LAST one in that case.
-      if (contentType != null)
-      {
-        String[] contentTypes = contentType.split(",");
-        if (contentTypes.length > 0)
-          contentType = contentTypes[contentTypes.length-1].trim();
-        else
-          contentType = null;
-      }
-
-      if (contentType != null)
-      {
-        int pos = contentType.indexOf(";");
-        if (pos != -1)
-        {
-          contentType = contentType.substring(pos+1).trim();
-          if (contentType.startsWith("charset="))
-          {
-            encoding = contentType.substring("charset=".length());
-          }
-        }
-      }
-
+      String contentType = extractContentType(cache.getContentType(documentURI));
+      String encoding = extractEncoding(contentType);
+      if (encoding == null)
+        encoding = "utf-8";
+      
       // Search for A HREF tags in the document stream.  This is brain-dead link location
       InputStream is = cache.getData(documentURI);
       if (is == null)