You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2008/09/22 18:02:41 UTC

svn commit: r697878 - in /lucene/nutch/trunk: ./ src/java/org/apache/nutch/util/ src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/ src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/ src/plugin/protocol-httpclient/src/java/...

Author: ab
Date: Mon Sep 22 09:02:40 2008
New Revision: 697878

URL: http://svn.apache.org/viewvc?rev=697878&view=rev
Log:
NUTCH-375 - Add support for Content-Encoding: deflate.

Added:
    lucene/nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java   (with props)
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
    lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
    lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
    lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=697878&r1=697877&r2=697878&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Sep 22 09:02:40 2008
@@ -268,6 +268,9 @@
 98. NUTCH-651 - Remove bin/{start|stop}-balancer.sh from svn
     tracking. (dogacan)
 
+99. NUTCH-375 - Add support for Content-Encoding: deflate
+    (Pascal Beis, ab)
+
 Release 0.9 - 2007-04-02
 
  1. Changed log4j confiquration to log to stdout on commandline

Added: lucene/nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java?rev=697878&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java Mon Sep 22 09:02:40 2008
@@ -0,0 +1,142 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.io.ByteArrayOutputStream;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.util.zip.Inflater;
+import java.util.zip.InflaterInputStream;
+import java.util.zip.DeflaterOutputStream;
+
+// Commons Logging imports
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+/**
+ *  A collection of utility methods for working on deflated data.
+ */
+public class DeflateUtils {
+  
+  private static final Log LOG = LogFactory.getLog(DeflateUtils.class);
+  private static final int EXPECTED_COMPRESSION_RATIO = 5;
+  private static final int BUF_SIZE = 4096;
+
+  /**
+   * Returns an inflated copy of the input array, with no size limit
+   * (delegates with <code>Integer.MAX_VALUE</code>).  If the deflated
+   * input has been truncated or corrupted, a best-effort attempt is made to
+   * inflate as much as possible; if nothing is read the array is empty.
+   */
+  public static final byte[] inflateBestEffort(byte[] in) {
+    return inflateBestEffort(in, Integer.MAX_VALUE);
+  }
+
+  /**
+   * Returns an inflated copy of the input array, truncated to
+   * <code>sizeLimit</code> bytes, if necessary.  If the deflated input
+   * has been truncated or corrupted, a best-effort attempt is made to
+   * inflate as much as possible.  The result is never <code>null</code>;
+   * if no data can be extracted an empty array is returned.
+   */
+  public static final byte[] inflateBestEffort(byte[] in, int sizeLimit) {
+    // decompress using InflaterInputStream; the buffer is pre-sized from the expected ratio
+    ByteArrayOutputStream outStream = 
+      new ByteArrayOutputStream(EXPECTED_COMPRESSION_RATIO * in.length);
+
+    // "true" because HTTP does not provide zlib headers
+    Inflater inflater = new Inflater(true); // nowrap: raw deflate data, no zlib header/checksum
+    InflaterInputStream inStream = 
+      new InflaterInputStream(new ByteArrayInputStream(in), inflater); // NOTE(review): neither inStream nor inflater is closed/ended in this method
+
+    byte[] buf = new byte[BUF_SIZE];
+    int written = 0; // bytes copied to outStream so far
+    while (true) {
+      try {
+	int size = inStream.read(buf);
+	if (size <= 0) // end of stream (or nothing read): stop
+	  break;
+	if ((written + size) > sizeLimit) { // this chunk would exceed the limit
+	  outStream.write(buf, 0, sizeLimit - written); // keep only the bytes that still fit
+	  break;
+	}
+	outStream.write(buf, 0, size);
+	written+= size;
+      } catch (Exception e) { // best effort: log, stop reading, return what was inflated so far
+	LOG.info( "Caught Exception in inflateBestEffort" );
+        e.printStackTrace(LogUtil.getWarnStream(LOG));
+	break;
+      }
+    }
+    try {
+      outStream.close();
+    } catch (IOException e) { // a failure from close() is ignored
+    }
+
+    return outStream.toByteArray();
+  }
+
+
+  /**
+   * Returns an inflated copy of the input array, read with a default Inflater.
+   * @throws IOException if the input cannot be properly decompressed
+   */
+  public static final byte[] inflate(byte[] in) throws IOException {
+    // decompress using InflaterInputStream; the buffer is pre-sized from the expected ratio
+    ByteArrayOutputStream outStream = 
+      new ByteArrayOutputStream(EXPECTED_COMPRESSION_RATIO * in.length);
+
+    InflaterInputStream inStream = 
+      new InflaterInputStream ( new ByteArrayInputStream(in) ); // default Inflater, unlike the Inflater(true) used by inflateBestEffort
+
+    byte[] buf = new byte[BUF_SIZE];
+    while (true) {
+      int size = inStream.read(buf); // an IOException here propagates to the caller
+      if (size <= 0) // end of stream (or nothing read): stop
+        break;
+      outStream.write(buf, 0, size);
+    }
+    outStream.close();
+
+    return outStream.toByteArray();
+  }
+
+  /**
+   * Returns a deflated copy of the input array; write/close errors are only logged.
+   */
+  public static final byte[] deflate(byte[] in) {
+    // compress using DeflaterOutputStream; the buffer is pre-sized from the expected ratio
+    ByteArrayOutputStream byteOut = 
+      new ByteArrayOutputStream(in.length / EXPECTED_COMPRESSION_RATIO);
+
+    DeflaterOutputStream outStream = new DeflaterOutputStream(byteOut); // default Deflater
+
+    try {
+      outStream.write(in);
+    } catch (Exception e) { // logged to the warn stream, not rethrown
+      e.printStackTrace(LogUtil.getWarnStream(LOG));
+    }
+
+    try {
+      outStream.close(); // closing writes any remaining compressed data to byteOut
+    } catch (IOException e) { // logged to the warn stream, not rethrown
+      e.printStackTrace(LogUtil.getWarnStream(LOG));
+    }
+
+    return byteOut.toByteArray();
+  }
+}

Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=697878&r1=697877&r2=697878&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original)
+++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Mon Sep 22 09:02:40 2008
@@ -38,6 +38,7 @@
 import org.apache.nutch.protocol.ProtocolStatus;
 import org.apache.nutch.protocol.RobotRules;
 import org.apache.nutch.util.GZIPUtils;
+import org.apache.nutch.util.DeflateUtils;
 import org.apache.nutch.util.LogUtil;
 
 // Hadoop imports
@@ -498,7 +499,24 @@
     }
     return content;
   }
-  
+
+  public byte[] processDeflateEncoded(byte[] compressed, URL url) throws IOException {
+    // Inflates a "deflate"-encoded body, passing getMaxContent() as the size limit.
+    if (LOGGER.isTraceEnabled()) { LOGGER.trace("inflating...."); }
+
+    byte[] content = DeflateUtils.inflateBestEffort(compressed, getMaxContent());
+    // NOTE(review): inflateBestEffort returns outStream.toByteArray(), so this guard looks unreachable - confirm
+    if (content == null)
+      throw new IOException("inflateBestEffort returned null");
+
+    if (LOGGER.isTraceEnabled()) {
+      LOGGER.trace("fetched " + compressed.length
+                 + " bytes of compressed content (expanded to "
+                 + content.length + " bytes) from " + url);
+    }
+    return content;
+  }
+
   protected static void main(HttpBase http, String[] args) throws Exception {
     boolean verbose = false;
     String url = null;

Modified: lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=697878&r1=697877&r2=697878&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Mon Sep 22 09:02:40 2008
@@ -113,7 +113,7 @@
       reqStr.append(portString);
       reqStr.append("\r\n");
 
-      reqStr.append("Accept-Encoding: x-gzip, gzip\r\n");
+      reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");
 
       String userAgent = http.getUserAgent();
       if ((userAgent == null) || (userAgent.length() == 0)) {
@@ -156,6 +156,8 @@
       String contentEncoding = getHeader(Response.CONTENT_ENCODING);
       if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
         content = http.processGzipEncoded(content, url);
+      } else if ("deflate".equals(contentEncoding)) {
+       content = http.processDeflateEncoded(content, url);
       } else {
         if (Http.LOG.isTraceEnabled()) {
           Http.LOG.trace("fetched " + content.length + " bytes from " + url);

Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=697878&r1=697877&r2=697878&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java Mon Sep 22 09:02:40 2008
@@ -188,7 +188,7 @@
     headers.add(new Header("Accept",
             "text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"));
     // accept gzipped content
-    headers.add(new Header("Accept-Encoding", "x-gzip, gzip"));
+    headers.add(new Header("Accept-Encoding", "x-gzip, gzip, deflate"));
     hostConf.getParams().setParameter("http.default-headers", headers);
 
     // HTTP proxy server details

Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=697878&r1=697877&r2=697878&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Mon Sep 22 09:02:40 2008
@@ -150,7 +150,7 @@
         if (getHeader(Response.LOCATION) != null)
           fetchTrace.append("; Location: " + getHeader(Response.LOCATION));
       }
-      // Extract gzip and x-gzip files
+      // Extract gzip, x-gzip and deflate content
       if (content != null) {
         // check if we have to uncompress it
         String contentEncoding = headers.get(Response.CONTENT_ENCODING);
@@ -161,6 +161,10 @@
           content = http.processGzipEncoded(content, url);
           if (Http.LOG.isTraceEnabled())
             fetchTrace.append("; extracted to " + content.length + " bytes");
+        } else if ("deflate".equals(contentEncoding)) {
+          content = http.processDeflateEncoded(content, url);
+          if (Http.LOG.isTraceEnabled())
+            fetchTrace.append("; extracted to " + content.length + " bytes");
         }
       }