You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2015/01/30 03:27:53 UTC

svn commit: r1655914 - in /manifoldcf/trunk: CHANGES.txt connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java

Author: kwright
Date: Fri Jan 30 02:27:53 2015
New Revision: 1655914

URL: http://svn.apache.org/r1655914
Log:
Second fix for CONNECTORS-1154.

Modified:
    manifoldcf/trunk/CHANGES.txt
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java

Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1655914&r1=1655913&r2=1655914&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Fri Jan 30 02:27:53 2015
@@ -3,6 +3,11 @@ $Id$
 
 ======================= 2.1-dev =====================
 
+CONNECTORS-1154: Changes to the flow through the web connector's
+deflate/unzip logic, designed to prevent gzip headers from being read
+if the stream's contents are not read.
+(Li Minhui, Karl Wright)
+
 CONNECTORS-1153: Broken authority name comparison is preventing
 incremental when no authority is used.
 (Aeham Abushwashi, Karl Wright)

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java?rev=1655914&r1=1655913&r2=1655914&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java Fri Jan 30 02:27:53 2015
@@ -904,7 +904,12 @@ public class ThrottledFetcher
         throw new ManifoldCFException("Attempt to get an input stream when no method thread");
       try
       {
-        return methodThread.getSafeInputStream();
+        InputStream bodyStream = methodThread.getSafeInputStream();
+        if (methodThread.isGZipStream())
+          bodyStream = new GZIPInputStream(bodyStream);
+        else if (methodThread.isDeflateStream())
+          bodyStream = new DeflateInputStream(bodyStream);
+        return bodyStream;
       }
       catch (InterruptedException e)
       {
@@ -1374,6 +1379,8 @@ public class ThrottledFetcher
     protected boolean streamCreated = false;
     protected Throwable streamException = null;
     protected boolean abortThread = false;
+    protected boolean gzip = false;
+    protected boolean deflate = false;
 
     protected Throwable shutdownException = null;
 
@@ -1458,8 +1465,6 @@ public class ThrottledFetcher
               {
                 try
                 {
-                  boolean gzip = false;
-                  boolean deflate = false;
                   Header ceheader = response.getEntity().getContentEncoding();
                   if (ceheader != null)
                   {
@@ -1484,10 +1489,6 @@ public class ThrottledFetcher
                   if (bodyStream != null)
                   {
                     bodyStream = new ThrottledInputstream(fetchThrottler.createFetchStream(),theConnection,bodyStream);
-                    if (gzip)
-                      bodyStream = new GZIPInputStream(bodyStream);
-                    else if (deflate)
-                      bodyStream = new DeflateInputStream(bodyStream);
                     threadStream = new XThreadInputStream(bodyStream);
                   }
                   streamCreated = true;
@@ -1645,6 +1646,46 @@ public class ThrottledFetcher
         }
       }
     }
+
+    public boolean isGZipStream()
+      throws InterruptedException, IOException, HttpException
+    {
+      // Must wait until stream is created, or until we note an exception was thrown.
+      while (true)
+      {
+        synchronized (this)
+        {
+          if (responseException != null)
+            throw new IllegalStateException("Check for response before getting stream");
+          if (cookieException != null)
+            throw new IllegalStateException("Check for cookies before getting stream");
+          checkException(streamException);
+          if (streamCreated)
+            return gzip;
+          wait();
+        }
+      }
+    }    
+
+    public boolean isDeflateStream()
+      throws InterruptedException, IOException, HttpException
+    {
+      // Must wait until stream is created, or until we note an exception was thrown.
+      while (true)
+      {
+        synchronized (this)
+        {
+          if (responseException != null)
+            throw new IllegalStateException("Check for response before getting stream");
+          if (cookieException != null)
+            throw new IllegalStateException("Check for cookies before getting stream");
+          checkException(streamException);
+          if (streamCreated)
+            return deflate;
+          wait();
+        }
+      }
+    }    
     
     public InputStream getSafeInputStream()
       throws InterruptedException, IOException, HttpException