You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/08/15 20:10:27 UTC
svn commit: r232841 - in
/lucene/nutch/branches/mapred/src/java/org/apache/nutch:
io/CompressedWritable.java protocol/Content.java
Author: cutting
Date: Mon Aug 15 11:10:23 2005
New Revision: 232841
URL: http://svn.apache.org/viewcvs?rev=232841&view=rev
Log:
Lazily decompress content.
Added:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/CompressedWritable.java
Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java
Added: lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/CompressedWritable.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/CompressedWritable.java?rev=232841&view=auto
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/CompressedWritable.java (added)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/CompressedWritable.java Mon Aug 15 11:10:23 2005
@@ -0,0 +1,81 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.io;
+
+import java.io.IOException;
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.DataOutputStream;
+import java.io.DataInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.ByteArrayInputStream;
+import java.util.zip.DeflaterOutputStream;
+import java.util.zip.InflaterInputStream;
+
+/** A base-class for Writables which store themselves compressed and lazily
+ * inflate on field access. This is useful for large objects whose fields are
+ * not be altered during a map or reduce operation: leaving the field data
+ * compressed makes copying the instance from one file to another much
+ * faster. */
+public abstract class CompressedWritable implements Writable {
+ // if non-null, the compressed field data of this instance.
+ private byte[] compressed;
+
+ public CompressedWritable() {}
+
+ public final void readFields(DataInput in) throws IOException {
+ compressed = new byte[in.readInt()];
+ in.readFully(compressed, 0, compressed.length);
+ }
+
+ /** Must be called by all methods which access fields to ensure that the data
+ * has been uncompressed. */
+ protected void ensureInflated() {
+ if (compressed != null) {
+ try {
+ ByteArrayInputStream deflated = new ByteArrayInputStream(compressed);
+ DataInput inflater =
+ new DataInputStream(new InflaterInputStream(deflated));
+ readFieldsCompressed(inflater);
+ compressed = null;
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ }
+
+ /** Subclasses implement this instead of {@link #readFields(DataInput)}. */
+ protected abstract void readFieldsCompressed(DataInput in)
+ throws IOException;
+
+ public final void write(DataOutput out) throws IOException {
+ if (compressed == null) {
+ ByteArrayOutputStream deflated = new ByteArrayOutputStream();
+ DataOutputStream deflater =
+ new DataOutputStream(new DeflaterOutputStream(deflated));
+ writeCompressed(deflater);
+ deflater.close();
+ compressed = deflated.toByteArray();
+ }
+ out.writeInt(compressed.length);
+ out.write(compressed);
+ }
+
+ /** Subclasses implement this instead of {@link #write(DataOutput)}. */
+ protected abstract void writeCompressed(DataOutput out) throws IOException;
+
+}
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java?rev=232841&r1=232840&r2=232841&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java Mon Aug 15 11:10:23 2005
@@ -23,12 +23,13 @@
import org.apache.nutch.fs.*;
import org.apache.nutch.util.*;
-public final class Content extends VersionedWritable {
+public final class Content extends CompressedWritable {
public static final String DIR_NAME = "content";
private final static byte VERSION = 1;
+ private byte version;
private String url;
private String base;
private byte[] content;
@@ -53,15 +54,16 @@
this.metadata = metadata;
}
- public byte getVersion() { return VERSION; }
-
- public final void readFields(DataInput in) throws IOException {
- super.readFields(in); // check version
+ protected final void readFieldsCompressed(DataInput in) throws IOException {
+ version = in.readByte();
+ if (version > VERSION)
+ throw new VersionMismatchException(VERSION, version);
url = UTF8.readString(in); // read url
base = UTF8.readString(in); // read base
- content = WritableUtils.readCompressedByteArray(in);
+ content = new byte[in.readInt()]; // read content
+ in.readFully(content);
contentType = UTF8.readString(in); // read contentType
@@ -72,13 +74,14 @@
}
}
- public final void write(DataOutput out) throws IOException {
- super.write(out); // write version
+ protected final void writeCompressed(DataOutput out) throws IOException {
+ out.writeByte(version);
UTF8.writeString(out, url); // write url
UTF8.writeString(out, base); // write base
- WritableUtils.writeCompressedByteArray(out, content); // write content
+ out.writeInt(content.length); // write content
+ out.write(content);
UTF8.writeString(out, contentType); // write contentType
@@ -102,32 +105,55 @@
//
/** The url fetched. */
- public String getUrl() { return url; }
+  public String getUrl() {
+    // Fields are populated lazily; inflate before reading.
+    this.ensureInflated();
+    return this.url;
+  }
/** The base url for relative links contained in the content.
* May be different from url if the request redirected.
*/
- public String getBaseUrl() { return base; }
+  public String getBaseUrl() {
+    // Fields are populated lazily; inflate before reading.
+    this.ensureInflated();
+    return this.base;
+  }
/** The binary content retrieved. */
- public byte[] getContent() { return content; }
- public void setContent(byte[] content) { this.content = content; }
+  public byte[] getContent() {
+    // Fields are populated lazily; inflate before reading.
+    this.ensureInflated();
+    return this.content;
+  }
+  public void setContent(byte[] content) {
+    // Inflate first so that pending compressed data cannot later overwrite
+    // the value assigned here.
+    this.ensureInflated();
+    this.content = content;
+  }
/** The media type of the retrieved content.
* @see <a href="http://www.iana.org/assignments/media-types/">IANA media types</a>
*/
- public String getContentType() { return contentType; }
+  public String getContentType() {
+    // Fields are populated lazily; inflate before reading.
+    this.ensureInflated();
+    return this.contentType;
+  }
public void setContentType(String contentType) {
+ // Inflate first so that pending compressed data cannot later overwrite
+ // the value assigned here.
+ ensureInflated();
this.contentType = contentType;
}
/** Other protocol-specific data. */
- public Properties getMetadata() { return metadata; }
+  public Properties getMetadata() {
+    // Fields are populated lazily; inflate before reading.
+    this.ensureInflated();
+    return this.metadata;
+  }
/** Return the value of a metadata property. */
- public String get(String name) { return getMetadata().getProperty(name); }
+ public String get(String name) {
+ ensureInflated();
+ return getMetadata().getProperty(name);
+ }
public boolean equals(Object o) {
+ ensureInflated();
if (!(o instanceof Content)){
return false;
}
@@ -141,6 +167,7 @@
}
public String toString() {
+ ensureInflated();
StringBuffer buffer = new StringBuffer();
buffer.append("url: " + url + "\n" );