You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2013/05/29 19:50:50 UTC
svn commit: r1487565 -
/manifoldcf/branches/CONNECTORS-693/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java
Author: kwright
Date: Wed May 29 17:50:50 2013
New Revision: 1487565
URL: http://svn.apache.org/r1487565
Log:
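Add support for gzip and deflate content encodings: send an Accept-Encoding request header and decompress response bodies accordingly.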
Modified:
manifoldcf/branches/CONNECTORS-693/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java
Modified: manifoldcf/branches/CONNECTORS-693/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-693/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java?rev=1487565&r1=1487564&r2=1487565&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-693/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java (original)
+++ manifoldcf/branches/CONNECTORS-693/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java Wed May 29 17:50:50 2013
@@ -19,6 +19,7 @@
package org.apache.manifoldcf.crawler.connectors.webcrawler;
import org.apache.manifoldcf.core.interfaces.*;
+import org.apache.manifoldcf.core.common.DeflateInputStream;
import org.apache.manifoldcf.core.common.XThreadInputStream;
import org.apache.manifoldcf.agents.interfaces.*;
import org.apache.manifoldcf.crawler.interfaces.*;
@@ -27,6 +28,7 @@ import org.apache.manifoldcf.crawler.sys
import java.util.*;
import java.io.*;
import java.net.*;
+import java.util.zip.GZIPInputStream;
import java.util.concurrent.TimeUnit;
import org.apache.http.conn.ClientConnectionManager;
@@ -55,6 +57,7 @@ import org.apache.http.params.CoreConnec
import org.apache.http.HttpStatus;
import org.apache.http.HttpHost;
import org.apache.http.Header;
+import org.apache.http.HeaderElement;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.message.BasicHeader;
import org.apache.http.client.params.ClientPNames;
@@ -1571,7 +1574,8 @@ public class ThrottledFetcher
fetchMethod.setHeader(new BasicHeader("User-Agent",userAgent));
fetchMethod.setHeader(new BasicHeader("From",from));
fetchMethod.setHeader(new BasicHeader("Accept","*/*"));
-
+ fetchMethod.setHeader(new BasicHeader("Accept-Encoding","gzip,deflate"));
+
// Use a custom cookie store
CookieStore cookieStore = new OurBasicCookieStore();
// If we have any cookies to set, set them.
@@ -2565,10 +2569,36 @@ public class ThrottledFetcher
{
try
{
+ boolean gzip = false;
+ boolean deflate = false;
+ Header ceheader = response.getEntity().getContentEncoding();
+ if (ceheader != null)
+ {
+ HeaderElement[] codecs = ceheader.getElements();
+ for (int i = 0; i < codecs.length; i++)
+ {
+ if (codecs[i].getName().equalsIgnoreCase("gzip"))
+ {
+ // GZIP
+ gzip = true;
+ break;
+ }
+ else if (codecs[i].getName().equalsIgnoreCase("deflate"))
+ {
+ // Deflate
+ deflate = true;
+ break;
+ }
+ }
+ }
bodyStream = response.getEntity().getContent();
if (bodyStream != null)
{
bodyStream = new ThrottledInputstream(theConnection,bodyStream);
+ if (gzip)
+ bodyStream = new GZIPInputStream(bodyStream);
+ else if (deflate)
+ bodyStream = new DeflateInputStream(bodyStream);
threadStream = new XThreadInputStream(bodyStream);
}
streamCreated = true;