You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2013/06/02 13:36:32 UTC

svn commit: r1488672 - in /manifoldcf/trunk: ./ CHANGES.txt connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java

Author: kwright
Date: Sun Jun  2 11:36:31 2013
New Revision: 1488672

URL: http://svn.apache.org/r1488672
Log:
Fix for CONNECTORS-693.

Modified:
    manifoldcf/trunk/   (props changed)
    manifoldcf/trunk/CHANGES.txt
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java

Propchange: manifoldcf/trunk/
------------------------------------------------------------------------------
  Merged /manifoldcf/branches/CONNECTORS-693:r1487471-1488671

Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1488672&r1=1488671&r2=1488672&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Sun Jun  2 11:36:31 2013
@@ -3,6 +3,10 @@ $Id$
 
 ======================= 1.3-dev =====================
 
+CONNECTORS-693: Support for gzip and deflate encoding for web
+connector.
+(Maciej Liżewski, Karl Wright)
+
 CONNECTORS-694: Add Google Drive connector.
 (Andrew Janowczyk, Karl Wright)
 

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java?rev=1488672&r1=1488671&r2=1488672&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java Sun Jun  2 11:36:31 2013
@@ -19,6 +19,7 @@
 package org.apache.manifoldcf.crawler.connectors.webcrawler;
 
 import org.apache.manifoldcf.core.interfaces.*;
+import org.apache.manifoldcf.core.common.DeflateInputStream;
 import org.apache.manifoldcf.core.common.XThreadInputStream;
 import org.apache.manifoldcf.agents.interfaces.*;
 import org.apache.manifoldcf.crawler.interfaces.*;
@@ -27,6 +28,7 @@ import org.apache.manifoldcf.crawler.sys
 import java.util.*;
 import java.io.*;
 import java.net.*;
+import java.util.zip.GZIPInputStream;
 import java.util.concurrent.TimeUnit;
 
 import org.apache.http.conn.ClientConnectionManager;
@@ -55,6 +57,7 @@ import org.apache.http.params.CoreConnec
 import org.apache.http.HttpStatus;
 import org.apache.http.HttpHost;
 import org.apache.http.Header;
+import org.apache.http.HeaderElement;
 import org.apache.http.conn.params.ConnRoutePNames;
 import org.apache.http.message.BasicHeader;
 import org.apache.http.client.params.ClientPNames;
@@ -1571,7 +1574,8 @@ public class ThrottledFetcher
       fetchMethod.setHeader(new BasicHeader("User-Agent",userAgent));
       fetchMethod.setHeader(new BasicHeader("From",from));
       fetchMethod.setHeader(new BasicHeader("Accept","*/*"));
-        
+      fetchMethod.setHeader(new BasicHeader("Accept-Encoding","gzip,deflate"));
+
       // Use a custom cookie store
       CookieStore cookieStore = new OurBasicCookieStore();
       // If we have any cookies to set, set them.
@@ -2565,10 +2569,36 @@ public class ThrottledFetcher
               {
                 try
                 {
+                  boolean gzip = false;
+                  boolean deflate = false;
+                  Header ceheader = response.getEntity().getContentEncoding();
+                  if (ceheader != null)
+                  {
+                    HeaderElement[] codecs = ceheader.getElements();
+                    for (int i = 0; i < codecs.length; i++)
+                    {
+                      if (codecs[i].getName().equalsIgnoreCase("gzip"))
+                      {
+                        // GZIP
+                        gzip = true;
+                        break;
+                      }
+                      else if (codecs[i].getName().equalsIgnoreCase("deflate"))
+                      {
+                        // Deflate
+                        deflate = true;
+                        break;
+                      }
+                    }
+                  }
                   bodyStream = response.getEntity().getContent();
                   if (bodyStream != null)
                   {
                     bodyStream = new ThrottledInputstream(theConnection,bodyStream);
+                    if (gzip)
+                      bodyStream = new GZIPInputStream(bodyStream);
+                    else if (deflate)
+                      bodyStream = new DeflateInputStream(bodyStream);
                     threadStream = new XThreadInputStream(bodyStream);
                   }
                   streamCreated = true;