You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2008/09/22 18:02:41 UTC
svn commit: r697878 - in /lucene/nutch/trunk: ./
src/java/org/apache/nutch/util/
src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/
src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/
src/plugin/protocol-httpclient/src/java/...
Author: ab
Date: Mon Sep 22 09:02:40 2008
New Revision: 697878
URL: http://svn.apache.org/viewvc?rev=697878&view=rev
Log:
NUTCH-375 - Add support for Content-Encoding: deflate.
Added:
lucene/nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java (with props)
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=697878&r1=697877&r2=697878&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Sep 22 09:02:40 2008
@@ -268,6 +268,9 @@
98. NUTCH-651 - Remove bin/{start|stop}-balancer.sh from svn
tracking. (dogacan)
+99. NUTCH-375 - Add support for Content-Encoding: deflate
+ (Pascal Beis, ab)
+
Release 0.9 - 2007-04-02
1. Changed log4j confiquration to log to stdout on commandline
Added: lucene/nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java?rev=697878&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java Mon Sep 22 09:02:40 2008
@@ -0,0 +1,142 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.io.ByteArrayOutputStream;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.util.zip.Inflater;
+import java.util.zip.InflaterInputStream;
+import java.util.zip.DeflaterOutputStream;
+
+// Commons Logging imports
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+/**
+ * A collection of utility methods for working on deflated data.
+ */
+public class DeflateUtils {
+
+ private static final Log LOG = LogFactory.getLog(DeflateUtils.class);
+ private static final int EXPECTED_COMPRESSION_RATIO = 5;
+ private static final int BUF_SIZE = 4096;
+
+ /**
+ * Returns an inflated copy of the input array. If the deflated
+ * input has been truncated or corrupted, a best-effort attempt is
+ * made to inflate as much as possible. If no data can be extracted
+ * an empty array is returned (this method never returns <code>null</code>).
+ */
+ public static final byte[] inflateBestEffort(byte[] in) {
+ return inflateBestEffort(in, Integer.MAX_VALUE);
+ }
+
+ /**
+ * Returns an inflated copy of the input array, truncated to
+ * <code>sizeLimit</code> bytes, if necessary. If the deflated input
+ * has been truncated or corrupted, a best-effort attempt is made to
+ * inflate as much as possible. If no data can be extracted
+ * an empty array is returned (this method never returns <code>null</code>).
+ */
+ public static final byte[] inflateBestEffort(byte[] in, int sizeLimit) {
+ // decompress using InflaterInputStream
+ ByteArrayOutputStream outStream =
+ new ByteArrayOutputStream(EXPECTED_COMPRESSION_RATIO * in.length);
+
+ // "true" because HTTP does not provide zlib headers
+ Inflater inflater = new Inflater(true);
+ InflaterInputStream inStream =
+ new InflaterInputStream(new ByteArrayInputStream(in), inflater);
+
+ byte[] buf = new byte[BUF_SIZE];
+ int written = 0;
+ while (true) {
+ try {
+ int size = inStream.read(buf);
+ if (size <= 0)
+ break;
+ if ((written + size) > sizeLimit) {
+ outStream.write(buf, 0, sizeLimit - written);
+ break;
+ }
+ outStream.write(buf, 0, size);
+ written+= size;
+ } catch (Exception e) {
+ LOG.info( "Caught Exception in inflateBestEffort" );
+ e.printStackTrace(LogUtil.getWarnStream(LOG));
+ break;
+ }
+ }
+ try {
+ outStream.close();
+ } catch (IOException e) {
+ }
+
+ return outStream.toByteArray();
+ }
+
+
+ /**
+ * Returns an inflated copy of the input array.
+ * @throws IOException if the input cannot be properly decompressed
+ */
+ public static final byte[] inflate(byte[] in) throws IOException {
+ // decompress using InflaterInputStream
+ ByteArrayOutputStream outStream =
+ new ByteArrayOutputStream(EXPECTED_COMPRESSION_RATIO * in.length);
+
+ InflaterInputStream inStream =
+ new InflaterInputStream ( new ByteArrayInputStream(in) );
+
+ byte[] buf = new byte[BUF_SIZE];
+ while (true) {
+ int size = inStream.read(buf);
+ if (size <= 0)
+ break;
+ outStream.write(buf, 0, size);
+ }
+ outStream.close();
+
+ return outStream.toByteArray();
+ }
+
+ /**
+ * Returns a deflated copy of the input array.
+ */
+ public static final byte[] deflate(byte[] in) {
+ // compress using DeflaterOutputStream
+ ByteArrayOutputStream byteOut =
+ new ByteArrayOutputStream(in.length / EXPECTED_COMPRESSION_RATIO);
+
+ DeflaterOutputStream outStream = new DeflaterOutputStream(byteOut);
+
+ try {
+ outStream.write(in);
+ } catch (Exception e) {
+ e.printStackTrace(LogUtil.getWarnStream(LOG));
+ }
+
+ try {
+ outStream.close();
+ } catch (IOException e) {
+ e.printStackTrace(LogUtil.getWarnStream(LOG));
+ }
+
+ return byteOut.toByteArray();
+ }
+}
Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=697878&r1=697877&r2=697878&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original)
+++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Mon Sep 22 09:02:40 2008
@@ -38,6 +38,7 @@
import org.apache.nutch.protocol.ProtocolStatus;
import org.apache.nutch.protocol.RobotRules;
import org.apache.nutch.util.GZIPUtils;
+import org.apache.nutch.util.DeflateUtils;
import org.apache.nutch.util.LogUtil;
// Hadoop imports
@@ -498,7 +499,24 @@
}
return content;
}
-
+
+ public byte[] processDeflateEncoded(byte[] compressed, URL url) throws IOException {
+
+ if (LOGGER.isTraceEnabled()) { LOGGER.trace("inflating...."); }
+
+ byte[] content = DeflateUtils.inflateBestEffort(compressed, getMaxContent());
+
+ if (content == null)
+ throw new IOException("inflateBestEffort returned null");
+
+ if (LOGGER.isTraceEnabled()) {
+ LOGGER.trace("fetched " + compressed.length
+ + " bytes of compressed content (expanded to "
+ + content.length + " bytes) from " + url);
+ }
+ return content;
+ }
+
protected static void main(HttpBase http, String[] args) throws Exception {
boolean verbose = false;
String url = null;
Modified: lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=697878&r1=697877&r2=697878&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Mon Sep 22 09:02:40 2008
@@ -113,7 +113,7 @@
reqStr.append(portString);
reqStr.append("\r\n");
- reqStr.append("Accept-Encoding: x-gzip, gzip\r\n");
+ reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");
String userAgent = http.getUserAgent();
if ((userAgent == null) || (userAgent.length() == 0)) {
@@ -156,6 +156,8 @@
String contentEncoding = getHeader(Response.CONTENT_ENCODING);
if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
content = http.processGzipEncoded(content, url);
+ } else if ("deflate".equals(contentEncoding)) {
+ content = http.processDeflateEncoded(content, url);
} else {
if (Http.LOG.isTraceEnabled()) {
Http.LOG.trace("fetched " + content.length + " bytes from " + url);
Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=697878&r1=697877&r2=697878&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java Mon Sep 22 09:02:40 2008
@@ -188,7 +188,7 @@
headers.add(new Header("Accept",
"text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"));
// accept gzipped content
- headers.add(new Header("Accept-Encoding", "x-gzip, gzip"));
+ headers.add(new Header("Accept-Encoding", "x-gzip, gzip, deflate"));
hostConf.getParams().setParameter("http.default-headers", headers);
// HTTP proxy server details
Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=697878&r1=697877&r2=697878&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Mon Sep 22 09:02:40 2008
@@ -150,7 +150,7 @@
if (getHeader(Response.LOCATION) != null)
fetchTrace.append("; Location: " + getHeader(Response.LOCATION));
}
- // Extract gzip and x-gzip files
+ // Extract gzip, x-gzip and deflate content
if (content != null) {
// check if we have to uncompress it
String contentEncoding = headers.get(Response.CONTENT_ENCODING);
@@ -161,6 +161,10 @@
content = http.processGzipEncoded(content, url);
if (Http.LOG.isTraceEnabled())
fetchTrace.append("; extracted to " + content.length + " bytes");
+ } else if ("deflate".equals(contentEncoding)) {
+ content = http.processDeflateEncoded(content, url);
+ if (Http.LOG.isTraceEnabled())
+ fetchTrace.append("; extracted to " + content.length + " bytes");
}
}