You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/08/15 20:10:27 UTC

svn commit: r232841 - in /lucene/nutch/branches/mapred/src/java/org/apache/nutch: io/CompressedWritable.java protocol/Content.java

Author: cutting
Date: Mon Aug 15 11:10:23 2005
New Revision: 232841

URL: http://svn.apache.org/viewcvs?rev=232841&view=rev
Log:
Lazily decompress content.

Added:
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/CompressedWritable.java
Modified:
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java

Added: lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/CompressedWritable.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/CompressedWritable.java?rev=232841&view=auto
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/CompressedWritable.java (added)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/CompressedWritable.java Mon Aug 15 11:10:23 2005
@@ -0,0 +1,105 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.io;
+
+import java.io.IOException;
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.DataOutputStream;
+import java.io.DataInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.ByteArrayInputStream;
+import java.util.zip.DeflaterOutputStream;
+import java.util.zip.InflaterInputStream;
+
+/** A base-class for Writables which store themselves compressed and lazily
+ * inflate on field access.  This is useful for large objects whose fields are
+ * not altered during a map or reduce operation: leaving the field data
+ * compressed makes copying the instance from one file to another much
+ * faster.
+ *
+ * <p>The serialized form is a four-byte length followed by that many bytes of
+ * deflated field data, as produced by {@link #writeCompressed(DataOutput)}. */
+public abstract class CompressedWritable implements Writable {
+  // if non-null, the compressed field data of this instance.
+  private byte[] compressed;
+
+  public CompressedWritable() {}
+
+  /** Reads the length-prefixed compressed field data without inflating it.
+   * Inflation is deferred until {@link #ensureInflated()} is called.
+   * @throws IOException if the recorded length is negative or the input ends
+   * before that many bytes have been read. */
+  public final void readFields(DataInput in) throws IOException {
+    int length = in.readInt();
+    if (length < 0)                               // corrupt or misaligned input
+      throw new IOException("Negative compressed length: " + length);
+    byte[] data = new byte[length];
+    in.readFully(data, 0, length);
+    compressed = data;                            // replace only once fully read
+  }
+
+  /** Must be called by all methods which access fields to ensure that the data
+   * has been uncompressed.  Does nothing when there is no pending compressed
+   * data.
+   * @throws RuntimeException wrapping any IOException raised while inflating. */
+  protected void ensureInflated() {
+    if (compressed == null)
+      return;
+    InflaterInputStream inflater =
+      new InflaterInputStream(new ByteArrayInputStream(compressed));
+    try {
+      readFieldsCompressed(new DataInputStream(inflater));
+      compressed = null;
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    } finally {
+      try {
+        inflater.close();                         // frees the Inflater eagerly
+      } catch (IOException ignored) {
+        // nothing useful to do: the underlying stream is an in-memory array
+      }
+    }
+  }
+
+  /** Subclasses implement this instead of {@link #readFields(DataInput)}. */
+  protected abstract void readFieldsCompressed(DataInput in)
+    throws IOException;
+
+  /** Writes the length-prefixed compressed field data.  If no compressed form
+   * is cached, the fields are first deflated via
+   * {@link #writeCompressed(DataOutput)} and the result is retained. */
+  public final void write(DataOutput out) throws IOException {
+    if (compressed == null) {
+      ByteArrayOutputStream deflated = new ByteArrayOutputStream();
+      DataOutputStream deflater =
+        new DataOutputStream(new DeflaterOutputStream(deflated));
+      try {
+        writeCompressed(deflater);
+      } finally {
+        deflater.close();                         // finishes and frees Deflater
+      }
+      compressed = deflated.toByteArray();
+    }
+    out.writeInt(compressed.length);
+    out.write(compressed);
+  }
+
+  /** Subclasses implement this instead of {@link #write(DataOutput)}. */
+  protected abstract void writeCompressed(DataOutput out) throws IOException;
+
+}

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java?rev=232841&r1=232840&r2=232841&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java Mon Aug 15 11:10:23 2005
@@ -23,12 +23,13 @@
 import org.apache.nutch.fs.*;
 import org.apache.nutch.util.*;
 
-public final class Content extends VersionedWritable {
+public final class Content extends CompressedWritable {
 
   public static final String DIR_NAME = "content";
 
   private final static byte VERSION = 1;
 
+  private byte version;
   private String url;
   private String base;
   private byte[] content;
@@ -53,15 +54,16 @@
     this.metadata = metadata;
   }
 
-  public byte getVersion() { return VERSION; }
-
-  public final void readFields(DataInput in) throws IOException {
-    super.readFields(in);                         // check version
+  protected final void readFieldsCompressed(DataInput in) throws IOException {
+    version = in.readByte();
+    if (version > VERSION)
+      throw new VersionMismatchException(VERSION, version);
 
     url = UTF8.readString(in);                    // read url
     base = UTF8.readString(in);                   // read base
 
-    content = WritableUtils.readCompressedByteArray(in);
+    content = new byte[in.readInt()];             // read content
+    in.readFully(content);
 
     contentType = UTF8.readString(in);            // read contentType
 
@@ -72,13 +74,14 @@
     }
   }
 
-  public final void write(DataOutput out) throws IOException {
-    super.write(out);                             // write version
+  protected final void writeCompressed(DataOutput out) throws IOException {
+    out.writeByte(version);
 
     UTF8.writeString(out, url);                   // write url
     UTF8.writeString(out, base);                  // write base
 
-    WritableUtils.writeCompressedByteArray(out, content); // write content
+    out.writeInt(content.length);                 // write content
+    out.write(content);
 
     UTF8.writeString(out, contentType);           // write contentType
     
@@ -102,32 +105,55 @@
   //
 
   /** The url fetched. */
-  public String getUrl() { return url; }
+  public String getUrl() {
+    ensureInflated();
+    return url;
+  }
 
   /** The base url for relative links contained in the content.
   * May be different from url if the request redirected.
    */
-  public String getBaseUrl() { return base; }
+  public String getBaseUrl() {
+    ensureInflated();
+    return base;
+  }
 
   /** The binary content retrieved. */
-  public byte[] getContent() { return content; }
-  public void setContent(byte[] content) { this.content = content; }
+  public byte[] getContent() {
+    ensureInflated();
+    return content;
+  }
+  public void setContent(byte[] content) {
+    ensureInflated();
+    this.content = content;
+  }
 
   /** The media type of the retrieved content.
    * @see http://www.iana.org/assignments/media-types/
    */
-  public String getContentType() { return contentType; }
+  public String getContentType() {
+    ensureInflated();
+    return contentType;
+  }
   public void setContentType(String contentType) {
+    ensureInflated();
     this.contentType = contentType;
   }
 
   /** Other protocol-specific data. */
-  public Properties getMetadata() { return metadata; }
+  public Properties getMetadata() {
+    ensureInflated();
+    return metadata;
+  }
 
   /** Return the value of a metadata property. */
-  public String get(String name) { return getMetadata().getProperty(name); }
+  public String get(String name) {
+    ensureInflated();
+    return getMetadata().getProperty(name);
+  }
 
   public boolean equals(Object o) {
+    ensureInflated();
     if (!(o instanceof Content)){
       return false;
     }
@@ -141,6 +167,7 @@
   }
 
   public String toString() {
+    ensureInflated();
     StringBuffer buffer = new StringBuffer();
 
     buffer.append("url: " + url + "\n" );