Posted to common-commits@hadoop.apache.org by st...@apache.org on 2009/11/28 20:53:40 UTC
svn commit: r885142 [4/6] - in /hadoop/common/branches/HADOOP-6194: ./
.eclipse.templates/ bin/ ivy/ lib/jdiff/ src/ src/contrib/ src/contrib/ec2/
src/docs/ src/docs/src/documentation/
src/docs/src/documentation/content/xdocs/ src/docs/src/documentatio...
Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/BZip2Codec.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/BZip2Codec.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/BZip2Codec.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/BZip2Codec.java Sat Nov 28 19:53:33 2009
@@ -23,6 +23,9 @@
import java.io.InputStream;
import java.io.OutputStream;
+
+import org.apache.hadoop.fs.Seekable;
+import org.apache.hadoop.io.compress.bzip2.BZip2Constants;
import org.apache.hadoop.io.compress.bzip2.BZip2DummyCompressor;
import org.apache.hadoop.io.compress.bzip2.BZip2DummyDecompressor;
import org.apache.hadoop.io.compress.bzip2.CBZip2InputStream;
@@ -35,17 +38,17 @@
* CompressionCodec which have a Compressor or Decompressor type argument, throw
* UnsupportedOperationException.
*/
-public class BZip2Codec implements
- org.apache.hadoop.io.compress.CompressionCodec {
+public class BZip2Codec implements SplittableCompressionCodec {
private static final String HEADER = "BZ";
private static final int HEADER_LEN = HEADER.length();
+ private static final String SUB_HEADER = "h9";
+ private static final int SUB_HEADER_LEN = SUB_HEADER.length();
/**
* Creates a new instance of BZip2Codec
*/
- public BZip2Codec() {
- }
+ public BZip2Codec() { }
/**
* Creates CompressionOutputStream for BZip2
@@ -62,10 +65,10 @@
}
/**
- * This functionality is currently not supported.
+ * Creates a CompressionOutputStream using the given OutputStream and Compressor.
*
- * @throws java.lang.UnsupportedOperationException
- * Throws UnsupportedOperationException
+ * @return CompressionOutputStream
+ * @throws java.io.IOException
*/
public CompressionOutputStream createOutputStream(OutputStream out,
Compressor compressor) throws IOException {
@@ -75,8 +78,7 @@
/**
* This functionality is currently not supported.
*
- * @throws java.lang.UnsupportedOperationException
- * Throws UnsupportedOperationException
+ * @return BZip2DummyCompressor.class
*/
public Class<? extends org.apache.hadoop.io.compress.Compressor> getCompressorType() {
return BZip2DummyCompressor.class;
@@ -85,8 +87,7 @@
/**
* This functionality is currently not supported.
*
- * @throws java.lang.UnsupportedOperationException
- * Throws UnsupportedOperationException
+ * @return Compressor
*/
public Compressor createCompressor() {
return new BZip2DummyCompressor();
@@ -109,8 +110,7 @@
/**
* This functionality is currently not supported.
*
- * @throws java.lang.UnsupportedOperationException
- * Throws UnsupportedOperationException
+ * @return CompressionInputStream
*/
public CompressionInputStream createInputStream(InputStream in,
Decompressor decompressor) throws IOException {
@@ -118,10 +118,64 @@
}
/**
+ * Creates a CompressionInputStream to be used to read off uncompressed data
+ * in one of the two reading modes, i.e. continuous or by-block reading modes.
+ *
+ * @param seekableIn The InputStream
+ * @param start The start offset into the compressed stream
+ * @param end The end offset into the compressed stream
+ * @param readMode Controls whether progress is reported continuously or
+ * only at block boundaries.
+ *
+ * @return CompressionInputStream for BZip2 aligned at block boundaries
+ */
+ public SplitCompressionInputStream createInputStream(InputStream seekableIn,
+ Decompressor decompressor, long start, long end, READ_MODE readMode)
+ throws IOException {
+
+ if (!(seekableIn instanceof Seekable)) {
+ throw new IOException("seekableIn must be an instance of " +
+ Seekable.class.getName());
+ }
+
+ //find the position of first BZip2 start up marker
+ ((Seekable)seekableIn).seek(0);
+
+ // BZip2 start-of-block markers are 6 bytes long. But the very first block
+ // also has "BZh9", making it 10 bytes. This is the common case, though at
+ // times the stream might start without a leading BZ.
+ final long FIRST_BZIP2_BLOCK_MARKER_POSITION =
+ CBZip2InputStream.numberOfBytesTillNextMarker(seekableIn);
+ long adjStart = Math.max(0L, start - FIRST_BZIP2_BLOCK_MARKER_POSITION);
+
+ ((Seekable)seekableIn).seek(adjStart);
+ SplitCompressionInputStream in =
+ new BZip2CompressionInputStream(seekableIn, adjStart, end, readMode);
+
+
+ // The following if clause handles the following case:
+ // Assume the following scenario in BZip2 compressed stream where
+ // . represents compressed data.
+ // .....[48 bit Block].....[48 bit Block].....[48 bit Block]...
+ // ........................[47 bits][1 bit].....[48 bit Block]...
+ // ................................^[Assume a Byte alignment here]
+ // ........................................^^[current position of stream]
+ // .....................^^[We go back 10 Bytes in stream and find a Block marker]
+ // ........................................^^[We align at wrong position!]
+ // ...........................................................^^[While this pos is correct]
+
+ if (in.getPos() <= start) {
+ ((Seekable)seekableIn).seek(start);
+ in = new BZip2CompressionInputStream(seekableIn, start, end, readMode);
+ }
+
+ return in;
+ }
+
+ /**
* This functionality is currently not supported.
*
- * @throws java.lang.UnsupportedOperationException
- * Throws UnsupportedOperationException
+ * @return BZip2DummyDecompressor.class
*/
public Class<? extends org.apache.hadoop.io.compress.Decompressor> getDecompressorType() {
return BZip2DummyDecompressor.class;
@@ -130,8 +184,7 @@
/**
* This functionality is currently not supported.
*
- * @throws java.lang.UnsupportedOperationException
- * Throws UnsupportedOperationException
+ * @return Decompressor
*/
public Decompressor createDecompressor() {
return new BZip2DummyDecompressor();
@@ -146,7 +199,8 @@
return ".bz2";
}
- private static class BZip2CompressionOutputStream extends CompressionOutputStream {
+ private static class BZip2CompressionOutputStream extends
+ CompressionOutputStream {
// class data starts here//
private CBZip2OutputStream output;
@@ -221,26 +275,79 @@
}// end of class BZip2CompressionOutputStream
- private static class BZip2CompressionInputStream extends CompressionInputStream {
+ /**
+ * This class can decompress BZip2 data in two modes:
+ * CONTINUOUS and BYBLOCK. BYBLOCK mode makes it possible to
+ * start decompression at any arbitrary position in the stream.
+ *
+ * So this facility can easily be used to parallelize decompression
+ * of a large BZip2 file for performance reasons. (The Hadoop
+ * framework does exactly that; see LineRecordReader for an
+ * example.) One can thus break the file (logically, of course) into
+ * chunks for parallel processing. These "splits" should be like
+ * default Hadoop splits (e.g. as in FileInputFormat's getSplits method).
+ * This code is designed and tested for FileInputFormat's way
+ * of splitting only.
+ */
+
+ private static class BZip2CompressionInputStream extends
+ SplitCompressionInputStream {
// class data starts here//
private CBZip2InputStream input;
boolean needsReset;
+ private BufferedInputStream bufferedIn;
+ private boolean isHeaderStripped = false;
+ private boolean isSubHeaderStripped = false;
+ private READ_MODE readMode = READ_MODE.CONTINUOUS;
+ private long startingPos = 0L;
+
+ // Following state machine handles different states of compressed stream
+ // position
+ // HOLD : Don't advertise compressed stream position
+ // ADVERTISE : Read 1 more character and advertise stream position
+ // See more comments about it before updatePos method.
+ private enum POS_ADVERTISEMENT_STATE_MACHINE {
+ HOLD, ADVERTISE
+ };
+
+ POS_ADVERTISEMENT_STATE_MACHINE posSM = POS_ADVERTISEMENT_STATE_MACHINE.HOLD;
+ long compressedStreamPosition = 0;
+
// class data ends here//
public BZip2CompressionInputStream(InputStream in) throws IOException {
+ this(in, 0L, Long.MAX_VALUE, READ_MODE.CONTINUOUS);
+ }
- super(in);
- needsReset = true;
+ public BZip2CompressionInputStream(InputStream in, long start, long end,
+ READ_MODE readMode) throws IOException {
+ super(in, start, end);
+ needsReset = false;
+ bufferedIn = new BufferedInputStream(super.in);
+ this.startingPos = super.getPos();
+ this.readMode = readMode;
+ if (this.startingPos == 0) {
+ // We only strip header if it is start of file
+ bufferedIn = readStreamHeader();
+ }
+ input = new CBZip2InputStream(bufferedIn, readMode);
+ if (this.isHeaderStripped) {
+ input.updateReportedByteCount(HEADER_LEN);
+ }
+
+ if (this.isSubHeaderStripped) {
+ input.updateReportedByteCount(SUB_HEADER_LEN);
+ }
+
+ this.updatePos(false);
}
private BufferedInputStream readStreamHeader() throws IOException {
// We are flexible enough to allow the compressed stream not to
// start with the header of BZ. So it works fine whether we have
// the header or not.
- BufferedInputStream bufferedIn = null;
if (super.in != null) {
- bufferedIn = new BufferedInputStream(super.in);
bufferedIn.mark(HEADER_LEN);
byte[] headerBytes = new byte[HEADER_LEN];
int actualRead = bufferedIn.read(headerBytes, 0, HEADER_LEN);
@@ -248,6 +355,17 @@
String header = new String(headerBytes);
if (header.compareTo(HEADER) != 0) {
bufferedIn.reset();
+ } else {
+ this.isHeaderStripped = true;
+ // In case of BYBLOCK mode, we also want to strip off
+ // the remaining two characters of the header.
+ if (this.readMode == READ_MODE.BYBLOCK) {
+ actualRead = bufferedIn.read(headerBytes, 0,
+ SUB_HEADER_LEN);
+ if (actualRead != -1) {
+ this.isSubHeaderStripped = true;
+ }
+ }
}
}
}
@@ -267,33 +385,96 @@
}
}
+ /**
+ * This method updates the compressed stream position exactly when the
+ * client of this code has read off at least one byte past any BZip2
+ * end-of-block marker.
+ *
+ * This mechanism is very helpful to deal with data level record
+ * boundaries. Please see the constructor and next methods of
+ * org.apache.hadoop.mapred.LineRecordReader as an example usage of this
+ * feature. We elaborate it with an example in the following:
+ *
+ * Assume two different scenarios of the BZip2 compressed stream, where
+ * [m] represents end of block, \n is the line delimiter and . represents
+ * compressed data.
+ *
+ * ............[m]......\n.......
+ *
+ * ..........\n[m]......\n.......
+ *
+ * Assume that end is right after [m]. In the first case the reading
+ * will stop at \n and there is no need to read one more line. (The
+ * reason for reading one more line in the next() method is explained in
+ * LineRecordReader.) In the second example, however, LineRecordReader
+ * needs to read one more line (till the second \n). Since BZip2Codec
+ * only updates the position once at least one byte past a marker has
+ * been read, it is straightforward to differentiate between the two
+ * cases mentioned.
+ */
+
public int read(byte[] b, int off, int len) throws IOException {
if (needsReset) {
internalReset();
}
- return this.input.read(b, off, len);
+ int result = 0;
+ result = this.input.read(b, off, len);
+ if (result == BZip2Constants.END_OF_BLOCK) {
+ this.posSM = POS_ADVERTISEMENT_STATE_MACHINE.ADVERTISE;
+ }
+
+ if (this.posSM == POS_ADVERTISEMENT_STATE_MACHINE.ADVERTISE) {
+ result = this.input.read(b, off, 1);
+ // This is the precise time to update compressed stream position
+ // to the client of this code.
+ this.updatePos(true);
+ this.posSM = POS_ADVERTISEMENT_STATE_MACHINE.HOLD;
+ }
+
+ return result;
+
+ }
+
+ public int read() throws IOException {
+ byte b[] = new byte[1];
+ int result = this.read(b, 0, 1);
+ return (result < 0) ? result : (b[0] & 0xff);
}
private void internalReset() throws IOException {
if (needsReset) {
needsReset = false;
BufferedInputStream bufferedIn = readStreamHeader();
- input = new CBZip2InputStream(bufferedIn);
+ input = new CBZip2InputStream(bufferedIn, this.readMode);
}
}
public void resetState() throws IOException {
- // Cannot read from bufferedIn at this point because bufferedIn might not be ready
+ // Cannot read from bufferedIn at this point because bufferedIn
+ // might not be ready
// yet, as in SequenceFile.Reader implementation.
needsReset = true;
}
- public int read() throws IOException {
- if (needsReset) {
- internalReset();
+ public long getPos() {
+ return this.compressedStreamPosition;
}
- return this.input.read();
+
+ /*
+ * As the comments before the read method explain, the
+ * compressed stream position is advertised when at least
+ * one byte past EOB has been read off. But
+ * there is an exception to this rule: when we
+ * construct the stream we advertise the position
+ * exactly at EOB. In the following method the
+ * shouldAddOn boolean captures this exception.
+ *
+ */
+ private void updatePos(boolean shouldAddOn) {
+ int addOn = shouldAddOn ? 1 : 0;
+ this.compressedStreamPosition = this.startingPos
+ + this.input.getProcessedByteCount() + addOn;
}
}// end of BZip2CompressionInputStream
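
For orientation, here is a minimal usage sketch of the new splittable API. This is not part of the commit; the file path and split offsets are hypothetical placeholders, and error handling is omitted:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FSDataInputStream;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.compress.BZip2Codec;
    import org.apache.hadoop.io.compress.SplitCompressionInputStream;
    import org.apache.hadoop.io.compress.SplittableCompressionCodec;

    public class BZip2SplitReadSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Hypothetical input file and split boundaries.
        FSDataInputStream raw = fs.open(new Path("/data/big.bz2"));
        long splitStart = 64L * 1024 * 1024;
        long splitEnd = 128L * 1024 * 1024;

        BZip2Codec codec = new BZip2Codec();
        // BYBLOCK mode: the stream aligns itself on a BZip2 block marker and
        // advertises its compressed position only on block boundaries.
        SplitCompressionInputStream in = codec.createInputStream(
            raw, codec.createDecompressor(), splitStart, splitEnd,
            SplittableCompressionCodec.READ_MODE.BYBLOCK);

        byte[] buf = new byte[4096];
        // Read until the advertised compressed position passes the split end.
        while (in.getPos() <= splitEnd) {
          if (in.read(buf, 0, buf.length) == -1) {
            break; // end of stream
          }
        }
        in.close();
      }
    }

FSDataInputStream satisfies the Seekable check in createInputStream, and in BYBLOCK mode getPos() only advances on block boundaries, which is what lets a reader decide when its split is exhausted.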
Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/BlockDecompressorStream.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/BlockDecompressorStream.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/BlockDecompressorStream.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/BlockDecompressorStream.java Sat Nov 28 19:53:33 2009
@@ -38,9 +38,10 @@
* @param in input stream
* @param decompressor decompressor to use
* @param bufferSize size of buffer
+ * @throws IOException
*/
public BlockDecompressorStream(InputStream in, Decompressor decompressor,
- int bufferSize) {
+ int bufferSize) throws IOException {
super(in, decompressor, bufferSize);
}
@@ -49,12 +50,13 @@
*
* @param in input stream
* @param decompressor decompressor to use
+ * @throws IOException
*/
- public BlockDecompressorStream(InputStream in, Decompressor decompressor) {
+ public BlockDecompressorStream(InputStream in, Decompressor decompressor) throws IOException {
super(in, decompressor);
}
- protected BlockDecompressorStream(InputStream in) {
+ protected BlockDecompressorStream(InputStream in) throws IOException {
super(in);
}
Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/CodecPool.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/CodecPool.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/CodecPool.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/CodecPool.java Sat Nov 28 19:53:33 2009
@@ -24,6 +24,7 @@
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ReflectionUtils;
/**
@@ -91,20 +92,26 @@
*
* @param codec the <code>CompressionCodec</code> for which to get the
* <code>Compressor</code>
+ * @param conf the <code>Configuration</code> object which contains the settings used to create or reinitialize the compressor
* @return <code>Compressor</code> for the given
* <code>CompressionCodec</code> from the pool or a new one
*/
- public static Compressor getCompressor(CompressionCodec codec) {
+ public static Compressor getCompressor(CompressionCodec codec, Configuration conf) {
Compressor compressor = borrow(compressorPool, codec.getCompressorType());
if (compressor == null) {
compressor = codec.createCompressor();
LOG.info("Got brand-new compressor");
} else {
+ compressor.reinit(conf);
LOG.debug("Got recycled compressor");
}
return compressor;
}
+ public static Compressor getCompressor(CompressionCodec codec) {
+ return getCompressor(codec, null);
+ }
+
/**
* Get a {@link Decompressor} for the given {@link CompressionCodec} from the
* pool or a new one.
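
A brief sketch of the new two-argument getCompressor (assuming GzipCodec and a default Configuration): a pooled compressor is reinitialized from the passed conf before reuse.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.io.compress.CodecPool;
    import org.apache.hadoop.io.compress.Compressor;
    import org.apache.hadoop.io.compress.GzipCodec;
    import org.apache.hadoop.util.ReflectionUtils;

    public class CodecPoolSketch {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        GzipCodec codec = ReflectionUtils.newInstance(GzipCodec.class, conf);

        // First borrow typically creates a brand-new compressor.
        Compressor c = CodecPool.getCompressor(codec, conf);
        CodecPool.returnCompressor(c);

        // The next borrow reuses the pooled instance, and getCompressor
        // calls compressor.reinit(conf) on the recycled one.
        c = CodecPool.getCompressor(codec, conf);
        CodecPool.returnCompressor(c);
      }
    }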
Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/CompressionInputStream.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/CompressionInputStream.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/CompressionInputStream.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/CompressionInputStream.java Sat Nov 28 19:53:33 2009
@@ -21,6 +21,8 @@
import java.io.IOException;
import java.io.InputStream;
+import org.apache.hadoop.fs.PositionedReadable;
+import org.apache.hadoop.fs.Seekable;
/**
* A compression input stream.
*
@@ -28,19 +30,25 @@
* reposition the underlying input stream then call {@link #resetState()},
* without having to also synchronize client buffers.
*/
-public abstract class CompressionInputStream extends InputStream {
+
+public abstract class CompressionInputStream extends InputStream implements Seekable {
/**
* The input stream to be compressed.
*/
protected final InputStream in;
+ protected long maxAvailableData = 0L;
/**
* Create a compression input stream that reads
* the decompressed bytes from the given stream.
*
* @param in The input stream to be compressed.
+ * @throws IOException
*/
- protected CompressionInputStream(InputStream in) {
+ protected CompressionInputStream(InputStream in) throws IOException {
+ if (!(in instanceof Seekable) || !(in instanceof PositionedReadable)) {
+ this.maxAvailableData = in.available();
+ }
this.in = in;
}
@@ -60,4 +68,40 @@
*/
public abstract void resetState() throws IOException;
+ /**
+ * This method returns the current position in the stream.
+ *
+ * @return Current position in stream as a long
+ */
+ public long getPos() throws IOException {
+ if (!(in instanceof Seekable) || !(in instanceof PositionedReadable)){
+ //This way of getting the current position will not work for files
+ //whose size does not fit in an int and hence cannot be returned by
+ //the available method.
+ return (this.maxAvailableData - this.in.available());
+ }
+ else{
+ return ((Seekable)this.in).getPos();
+ }
+
+ }
+
+ /**
+ * This method is currently not supported.
+ *
+ * @throws UnsupportedOperationException
+ */
+
+ public void seek(long pos) throws UnsupportedOperationException {
+ throw new UnsupportedOperationException();
+ }
+
+ /**
+ * This method is currently not supported.
+ *
+ * @throws UnsupportedOperationException
+ */
+ public boolean seekToNewSource(long targetPos) throws UnsupportedOperationException {
+ throw new UnsupportedOperationException();
+ }
}
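
A toy illustration (plain JDK code, not Hadoop) of the available()-based position fallback that getPos() uses for streams that are neither Seekable nor PositionedReadable:

    import java.io.ByteArrayInputStream;
    import java.io.IOException;
    import java.io.InputStream;

    public class PosFallbackSketch {
      public static void main(String[] args) throws IOException {
        InputStream in = new ByteArrayInputStream(new byte[100]);
        // Captured at construction time, as in the constructor above.
        long maxAvailableData = in.available(); // 100

        in.read(new byte[37]); // consume 37 bytes
        // Position is approximated as total minus what is still available.
        long pos = maxAvailableData - in.available();
        System.out.println(pos); // prints 37
      }
    }

As the in-code comment warns, this only works when the stream's size fits in the int returned by available().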
Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/Compressor.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/Compressor.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/Compressor.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/Compressor.java Sat Nov 28 19:53:33 2009
@@ -20,6 +20,8 @@
import java.io.IOException;
+import org.apache.hadoop.conf.Configuration;
+
/**
* Specification of a stream-based 'compressor' which can be
* plugged into a {@link CompressionOutputStream} to compress data.
@@ -102,5 +104,13 @@
/**
* Closes the compressor and discards any unprocessed input.
*/
- public void end();
+ public void end();
+
+ /**
+ * Prepare the compressor to be used in a new stream with settings defined in
+ * the given Configuration
+ *
+ * @param conf Configuration from which new settings are fetched
+ */
+ public void reinit(Configuration conf);
}
Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/DecompressorStream.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/DecompressorStream.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/DecompressorStream.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/DecompressorStream.java Sat Nov 28 19:53:33 2009
@@ -30,7 +30,7 @@
protected boolean eof = false;
protected boolean closed = false;
- public DecompressorStream(InputStream in, Decompressor decompressor, int bufferSize) {
+ public DecompressorStream(InputStream in, Decompressor decompressor, int bufferSize) throws IOException {
super(in);
if (in == null || decompressor == null) {
@@ -43,7 +43,7 @@
buffer = new byte[bufferSize];
}
- public DecompressorStream(InputStream in, Decompressor decompressor) {
+ public DecompressorStream(InputStream in, Decompressor decompressor) throws IOException {
this(in, decompressor, 512);
}
@@ -51,8 +51,9 @@
* Allow derived classes to directly set the underlying stream.
*
* @param in Underlying input stream.
+ * @throws IOException
*/
- protected DecompressorStream(InputStream in) {
+ protected DecompressorStream(InputStream in) throws IOException {
super(in);
}
Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/GzipCodec.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/GzipCodec.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/GzipCodec.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/GzipCodec.java Sat Nov 28 19:53:33 2009
@@ -22,8 +22,11 @@
import java.util.zip.GZIPOutputStream;
import java.util.zip.GZIPInputStream;
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.io.compress.zlib.*;
+import org.apache.hadoop.io.compress.zlib.ZlibCompressor.CompressionLevel;
+import org.apache.hadoop.io.compress.zlib.ZlibCompressor.CompressionStrategy;
/**
* This class creates gzip compressors/decompressors.
@@ -103,8 +106,9 @@
/**
* Allow subclasses to directly set the inflater stream.
+ * @throws IOException
*/
- protected GzipInputStream(DecompressorStream in) {
+ protected GzipInputStream(DecompressorStream in) throws IOException {
super(in);
}
@@ -154,7 +158,7 @@
public Compressor createCompressor() {
return (ZlibFactory.isNativeZlibLoaded(conf))
- ? new GzipZlibCompressor()
+ ? new GzipZlibCompressor(conf)
: null;
}
@@ -205,6 +209,13 @@
ZlibCompressor.CompressionStrategy.DEFAULT_STRATEGY,
ZlibCompressor.CompressionHeader.GZIP_FORMAT, 64*1024);
}
+
+ public GzipZlibCompressor(Configuration conf) {
+ super(ZlibFactory.getCompressionLevel(conf),
+ ZlibFactory.getCompressionStrategy(conf),
+ ZlibCompressor.CompressionHeader.GZIP_FORMAT,
+ 64 * 1024);
+ }
}
static final class GzipZlibDecompressor extends ZlibDecompressor {
Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/bzip2/BZip2Constants.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/bzip2/BZip2Constants.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/bzip2/BZip2Constants.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/bzip2/BZip2Constants.java Sat Nov 28 19:53:33 2009
@@ -44,6 +44,14 @@
int N_ITERS = 4;
int MAX_SELECTORS = (2 + (900000 / G_SIZE));
int NUM_OVERSHOOT_BYTES = 20;
+ /**
+ * End of a BZip2 block
+ */
+ public static final int END_OF_BLOCK = -2;
+ /**
+ * End of BZip2 stream.
+ */
+ public static final int END_OF_STREAM = -1;
/**
* This array really shouldn't be here. Again, for historical purposes it
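
A hedged sketch of consuming these constants from CBZip2InputStream in BYBLOCK mode (the input path is hypothetical): in this commit, the array read returns END_OF_BLOCK at block boundaries and END_OF_STREAM at end of file.

    import java.io.FileInputStream;
    import java.io.IOException;
    import org.apache.hadoop.io.compress.SplittableCompressionCodec.READ_MODE;
    import org.apache.hadoop.io.compress.bzip2.BZip2Constants;
    import org.apache.hadoop.io.compress.bzip2.CBZip2InputStream;

    public class ByBlockReadSketch {
      public static void main(String[] args) throws IOException {
        FileInputStream compressed = new FileInputStream("/tmp/data.bz2");
        // In BYBLOCK mode the constructor skips ahead to the next block marker.
        CBZip2InputStream in = new CBZip2InputStream(compressed, READ_MODE.BYBLOCK);

        byte[] buf = new byte[4096];
        int n;
        while ((n = in.read(buf, 0, buf.length)) != BZip2Constants.END_OF_STREAM) {
          if (n == BZip2Constants.END_OF_BLOCK) {
            // Block boundary: the processed-byte statistic is refreshed here.
            System.out.println("block ends at compressed offset "
                + in.getProcessedByteCount());
          }
          // n > 0 means n decompressed bytes were placed in buf.
        }
        in.close();
      }
    }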
Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/bzip2/BZip2DummyCompressor.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/bzip2/BZip2DummyCompressor.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/bzip2/BZip2DummyCompressor.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/bzip2/BZip2DummyCompressor.java Sat Nov 28 19:53:33 2009
@@ -20,6 +20,7 @@
import java.io.IOException;
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.Compressor;
/**
@@ -77,4 +78,9 @@
throw new UnsupportedOperationException();
}
+ @Override
+ public void reinit(Configuration conf) {
+ // do nothing
+ }
+
}
Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/bzip2/CBZip2InputStream.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/bzip2/CBZip2InputStream.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/bzip2/CBZip2InputStream.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/bzip2/CBZip2InputStream.java Sat Nov 28 19:53:33 2009
@@ -23,9 +23,13 @@
*/
package org.apache.hadoop.io.compress.bzip2;
+import java.io.BufferedInputStream;
import java.io.InputStream;
import java.io.IOException;
+import org.apache.hadoop.io.compress.SplittableCompressionCodec.READ_MODE;
+
+
/**
* An input stream that decompresses from the BZip2 format (without the file
* header chars) to be read as any other stream.
@@ -45,30 +49,43 @@
* </p>
*
* <p>
+ * This Ant code was enhanced so that it can decompress blocks of bzip2 data.
+ * The current position in the stream is an important statistic for Hadoop. For
+ * example, in LineRecordReader we solely depend on the current position in the
+ * stream to know about the progress. The notion of position becomes complicated
+ * for compressed files. The Hadoop splitting is done in terms of the compressed
+ * file, but a compressed file inflates to a large amount of data. So we have
+ * handled this problem in the following way.
+ *
+ * At object creation time, we find the next block start delimiter. Once such a
+ * marker is found, the stream stops there (we discard any compressed data read
+ * in this process) and the position is updated (i.e. the caller of this class
+ * will find out the stream location). At this point we are ready for actual
+ * reading (i.e. decompression) of data.
+ *
+ * The subsequent read calls give out data. The position is updated when the
+ * caller of this class has read off the current block + 1 bytes. In between
+ * block reads, the position is not updated. (We can only update the position on
+ * block boundaries.)
+ * </p>
+ *
+ * <p>
* Instances of this class are not threadsafe.
* </p>
*/
public class CBZip2InputStream extends InputStream implements BZip2Constants {
- private static void reportCRCError() throws IOException {
- throw new IOException("BZip2 CRC error");
-
- }
-
- private void makeMaps() {
- final boolean[] inUse = this.data.inUse;
- final byte[] seqToUnseq = this.data.seqToUnseq;
-
- int nInUseShadow = 0;
-
- for (int i = 0; i < 256; i++) {
- if (inUse[i])
- seqToUnseq[nInUseShadow++] = (byte) i;
- }
-
- this.nInUse = nInUseShadow;
- }
+ public static final long BLOCK_DELIMITER = 0X314159265359L;// start of block
+ public static final long EOS_DELIMITER = 0X177245385090L;// end of bzip2 stream
+ private static final int DELIMITER_BIT_LENGTH = 48;
+ READ_MODE readMode = READ_MODE.CONTINUOUS;
+ // The variable records the current advertised position of the stream.
+ private long reportedBytesReadFromCompressedStream = 0L;
+ // The following variable keeps a record of the compressed bytes read.
+ private long bytesReadFromCompressedStream = 0L;
+ private boolean lazyInitialization = false;
+ private byte array[] = new byte[1];
/**
* Index of the last char in the block, so the block size == last + 1.
@@ -86,32 +103,34 @@
*/
private int blockSize100k;
- private boolean blockRandomised;
+ private boolean blockRandomised = false;
- private int bsBuff;
- private int bsLive;
+ private long bsBuff;
+ private long bsLive;
private final CRC crc = new CRC();
private int nInUse;
- private InputStream in;
+ private BufferedInputStream in;
private int currentChar = -1;
- private static final int EOF = 0;
- private static final int START_BLOCK_STATE = 1;
- private static final int RAND_PART_A_STATE = 2;
- private static final int RAND_PART_B_STATE = 3;
- private static final int RAND_PART_C_STATE = 4;
- private static final int NO_RAND_PART_A_STATE = 5;
- private static final int NO_RAND_PART_B_STATE = 6;
- private static final int NO_RAND_PART_C_STATE = 7;
+ /**
+ * A state machine to keep track of the current state of the decoder.
+ *
+ */
+ public enum STATE {
+ EOF, START_BLOCK_STATE, RAND_PART_A_STATE, RAND_PART_B_STATE, RAND_PART_C_STATE, NO_RAND_PART_A_STATE, NO_RAND_PART_B_STATE, NO_RAND_PART_C_STATE, NO_PROCESS_STATE
+ };
- private int currentState = START_BLOCK_STATE;
+ private STATE currentState = STATE.START_BLOCK_STATE;
private int storedBlockCRC, storedCombinedCRC;
private int computedBlockCRC, computedCombinedCRC;
+ private boolean skipResult = false;// used by skipToNextMarker
+ private static boolean skipDecompression = false;
+
// Variables used by setup* methods exclusively
private int su_count;
@@ -130,6 +149,121 @@
private CBZip2InputStream.Data data;
/**
+ * This method reports the processed bytes so far. Please note that this
+ * statistic is only updated on block boundaries and only when the stream is
+ * initiated in BYBLOCK mode.
+ */
+ public long getProcessedByteCount() {
+ return reportedBytesReadFromCompressedStream;
+ }
+
+ /**
+ * This method keeps track of raw processed compressed
+ * bytes.
+ *
+ * @param count count is the number of bytes to be
+ * added to raw processed bytes
+ */
+
+ protected void updateProcessedByteCount(int count) {
+ this.bytesReadFromCompressedStream += count;
+ }
+
+ /**
+ * This method is called by the client of this
+ * class in case there are any corrections in
+ * the stream position. One common example is
+ * when client of this code removes starting BZ
+ * characters from the compressed stream.
+ *
+ * @param count count bytes are added to the reported bytes
+ *
+ */
+ public void updateReportedByteCount(int count) {
+ this.reportedBytesReadFromCompressedStream += count;
+ this.updateProcessedByteCount(count);
+ }
+
+ /**
+ * This method reads a Byte from the compressed stream. Whenever we need to
+ * read from the underlying compressed stream, this method should be called
+ * instead of directly calling the read method of the underlying compressed
+ * stream. This method does the important record keeping needed to track
+ * how many bytes have been read off the compressed stream.
+ */
+ private int readAByte(InputStream inStream) throws IOException {
+ int read = inStream.read();
+ if (read >= 0) {
+ this.updateProcessedByteCount(1);
+ }
+ return read;
+ }
+
+ /**
+ * This method tries to find the marker (passed to it as the first parameter)
+ * in the stream. It can find bit patterns of length <= 63 bits. Specifically
+ * this method is used in CBZip2InputStream to find the end of block (EOB)
+ * delimiter in the stream, starting from the current position of the stream.
+ * If marker is found, the stream position will be right after marker at the
+ * end of this call.
+ *
+ * @param marker The bit pattern to be found in the stream
+ * @param markerBitLength No of bits in the marker
+ *
+ * @throws IOException
+ * @throws IllegalArgumentException if markerBitLength is greater than 63
+ */
+ public boolean skipToNextMarker(long marker, int markerBitLength)
+ throws IOException, IllegalArgumentException {
+ try {
+ if (markerBitLength > 63) {
+ throw new IllegalArgumentException(
+ "skipToNextMarker can not find patterns greater than 63 bits");
+ }
+ // pick next markerBitLength bits in the stream
+ long bytes = 0;
+ bytes = this.bsR(markerBitLength);
+ if (bytes == -1) {
+ return false;
+ }
+ while (true) {
+ if (bytes == marker) {
+ return true;
+
+ } else {
+ bytes = bytes << 1;
+ bytes = bytes & ((1L << markerBitLength) - 1);
+ int oneBit = (int) this.bsR(1);
+ if (oneBit != -1) {
+ bytes = bytes | oneBit;
+ } else
+ return false;
+ }
+ }
+ } catch (IOException ex) {
+ return false;
+ }
+ }
+
+ protected void reportCRCError() throws IOException {
+ throw new IOException("crc error");
+ }
+
+ private void makeMaps() {
+ final boolean[] inUse = this.data.inUse;
+ final byte[] seqToUnseq = this.data.seqToUnseq;
+
+ int nInUseShadow = 0;
+
+ for (int i = 0; i < 256; i++) {
+ if (inUse[i])
+ seqToUnseq[nInUseShadow++] = (byte) i;
+ }
+
+ this.nInUse = nInUseShadow;
+ }
+
+ /**
* Constructs a new CBZip2InputStream which decompresses bytes read from the
* specified stream.
*
@@ -145,21 +279,99 @@
* @throws NullPointerException
* if <tt>in == null</tt>
*/
- public CBZip2InputStream(final InputStream in) throws IOException {
- super();
+ public CBZip2InputStream(final InputStream in, READ_MODE readMode)
+ throws IOException {
- this.in = in;
+ super();
+ int blockSize = 0X39;// i.e. '9'
+ this.blockSize100k = blockSize - '0';
+ this.in = new BufferedInputStream(in, 1024 * 9);// 9 KB buffer
+ this.readMode = readMode;
+ if (readMode == READ_MODE.CONTINUOUS) {
+ currentState = STATE.START_BLOCK_STATE;
+ lazyInitialization = (in.available() == 0);
+ if(!lazyInitialization){
init();
}
+ } else if (readMode == READ_MODE.BYBLOCK) {
+ this.currentState = STATE.NO_PROCESS_STATE;
+ skipResult = this.skipToNextMarker(CBZip2InputStream.BLOCK_DELIMITER,DELIMITER_BIT_LENGTH);
+ this.reportedBytesReadFromCompressedStream = this.bytesReadFromCompressedStream;
+ if(!skipDecompression){
+ changeStateToProcessABlock();
+ }
+ }
+ }
+
+ /**
+ * Returns the number of bytes between the current stream position
+ * and the immediate next BZip2 block marker.
+ *
+ * @param in
+ * The InputStream
+ *
+ * @return long Number of bytes between current stream position and the
+ * next BZip2 block start marker.
+ * @throws IOException
+ *
+ */
+ public static long numberOfBytesTillNextMarker(final InputStream in) throws IOException{
+ CBZip2InputStream.skipDecompression = true;
+ CBZip2InputStream anObject = null;
+
+ anObject = new CBZip2InputStream(in, READ_MODE.BYBLOCK);
+
+ return anObject.getProcessedByteCount();
+ }
+
+ public CBZip2InputStream(final InputStream in) throws IOException {
+ this(in, READ_MODE.CONTINUOUS);
+ }
+
+ private void changeStateToProcessABlock() throws IOException {
+ if (skipResult) {
+ initBlock();
+ setupBlock();
+ } else {
+ this.currentState = STATE.EOF;
+ }
+ }
+
public int read() throws IOException {
+
if (this.in != null) {
- return read0();
+ int result = this.read(array, 0, 1);
+ int value = 0XFF & array[0];
+ return (result > 0 ? value : result);
+
} else {
throw new IOException("stream closed");
}
}
+ /**
+ * In CONTINUOUS reading mode, this read method starts at the
+ * beginning of the compressed stream and ends at the end of file by
+ * emitting uncompressed data. In this mode stream positioning
+ * is not announced and should be ignored.
+ *
+ * In BYBLOCK reading mode, this read method signals the end
+ * of a BZip2 block by returning EOB. At this event, the compressed
+ * stream position is also announced. This announcement tells
+ * how much of the compressed stream has been decompressed and read
+ * out of this class. In between EOB events, the stream position is
+ * not updated.
+ *
+ * @throws IOException
+ * if the stream content is malformed or an I/O error occurs.
+ *
+ * @return int Return values greater than 0 are the number of bytes read. A
+ * value of -1 means end of stream, while -2 represents end of block.
+ */
+
+
public int read(final byte[] dest, final int offs, final int len)
throws IOException {
if (offs < 0) {
@@ -176,13 +388,39 @@
throw new IOException("stream closed");
}
+ if(lazyInitialization){
+ this.init();
+ this.lazyInitialization = false;
+ }
+
+ if(skipDecompression){
+ changeStateToProcessABlock();
+ CBZip2InputStream.skipDecompression = false;
+ }
+
final int hi = offs + len;
int destOffs = offs;
- for (int b; (destOffs < hi) && ((b = read0()) >= 0);) {
+ int b = 0;
+
+ for (; (destOffs < hi) && ((b = read0()) >= 0);) {
dest[destOffs++] = (byte) b;
+
}
- return (destOffs == offs) ? -1 : (destOffs - offs);
+ int result = destOffs - offs;
+ if (result == 0) {
+ //report 'end of block' or 'end of stream'
+ result = b;
+
+ skipResult = this.skipToNextMarker(CBZip2InputStream.BLOCK_DELIMITER, DELIMITER_BIT_LENGTH);
+ //Exactly when we are about to start a new block, we advertise the stream position.
+ this.reportedBytesReadFromCompressedStream = this.bytesReadFromCompressedStream;
+
+ changeStateToProcessABlock();
+ }
+ return result;
}
private int read0() throws IOException {
@@ -190,7 +428,10 @@
switch (this.currentState) {
case EOF:
- return -1;
+ return END_OF_STREAM;// return -1
+
+ case NO_PROCESS_STATE:
+ return END_OF_BLOCK;// return -2
case START_BLOCK_STATE:
throw new IllegalStateException();
@@ -225,13 +466,13 @@
}
private void init() throws IOException {
- int magic2 = this.in.read();
+ int magic2 = this.readAByte(in);
if (magic2 != 'h') {
throw new IOException("Stream is not BZip2 formatted: expected 'h'"
+ " as first byte but got '" + (char) magic2 + "'");
}
- int blockSize = this.in.read();
+ int blockSize = this.readAByte(in);
if ((blockSize < '1') || (blockSize > '9')) {
throw new IOException("Stream is not BZip2 formatted: illegal "
+ "blocksize " + (char) blockSize);
@@ -244,6 +485,27 @@
}
private void initBlock() throws IOException {
+ if (this.readMode == READ_MODE.BYBLOCK) {
+ // this.checkBlockIntegrity();
+ this.storedBlockCRC = bsGetInt();
+ this.blockRandomised = bsR(1) == 1;
+
+ /**
+ * Allocate data here instead in constructor, so we do not allocate
+ * it if the input file is empty.
+ */
+ if (this.data == null) {
+ this.data = new Data(this.blockSize100k);
+ }
+
+ // currBlockNo++;
+ getAndMoveToFrontDecode();
+
+ this.crc.initialiseCRC();
+ this.currentState = STATE.START_BLOCK_STATE;
+ return;
+ }
+
char magic0 = bsGetUByte();
char magic1 = bsGetUByte();
char magic2 = bsGetUByte();
@@ -261,7 +523,7 @@
magic4 != 0x53 || // 'S'
magic5 != 0x59 // 'Y'
) {
- this.currentState = EOF;
+ this.currentState = STATE.EOF;
throw new IOException("bad block header");
} else {
this.storedBlockCRC = bsGetInt();
@@ -279,7 +541,7 @@
getAndMoveToFrontDecode();
this.crc.initialiseCRC();
- this.currentState = START_BLOCK_STATE;
+ this.currentState = STATE.START_BLOCK_STATE;
}
}
@@ -304,7 +566,7 @@
private void complete() throws IOException {
this.storedCombinedCRC = bsGetInt();
- this.currentState = EOF;
+ this.currentState = STATE.EOF;
this.data = null;
if (this.storedCombinedCRC != this.computedCombinedCRC) {
@@ -326,14 +588,14 @@
}
}
- private int bsR(final int n) throws IOException {
- int bsLiveShadow = this.bsLive;
- int bsBuffShadow = this.bsBuff;
+ private long bsR(final long n) throws IOException {
+ long bsLiveShadow = this.bsLive;
+ long bsBuffShadow = this.bsBuff;
if (bsLiveShadow < n) {
final InputStream inShadow = this.in;
do {
- int thech = inShadow.read();
+ int thech = readAByte(inShadow);
if (thech < 0) {
throw new IOException("unexpected end of stream");
@@ -347,15 +609,15 @@
}
this.bsLive = bsLiveShadow - n;
- return (bsBuffShadow >> (bsLiveShadow - n)) & ((1 << n) - 1);
+ return (bsBuffShadow >> (bsLiveShadow - n)) & ((1L << n) - 1);
}
private boolean bsGetBit() throws IOException {
- int bsLiveShadow = this.bsLive;
- int bsBuffShadow = this.bsBuff;
+ long bsLiveShadow = this.bsLive;
+ long bsBuffShadow = this.bsBuff;
if (bsLiveShadow < 1) {
- int thech = this.in.read();
+ int thech = this.readAByte(in);
if (thech < 0) {
throw new IOException("unexpected end of stream");
@@ -375,7 +637,7 @@
}
private int bsGetInt() throws IOException {
- return (((((bsR(8) << 8) | bsR(8)) << 8) | bsR(8)) << 8) | bsR(8);
+ return (int) ((((((bsR(8) << 8) | bsR(8)) << 8) | bsR(8)) << 8) | bsR(8));
}
/**
@@ -454,8 +716,8 @@
final int alphaSize = this.nInUse + 2;
/* Now the selectors */
- final int nGroups = bsR(3);
- final int nSelectors = bsR(15);
+ final int nGroups = (int) bsR(3);
+ final int nSelectors = (int) bsR(15);
for (int i = 0; i < nSelectors; i++) {
int j = 0;
@@ -486,7 +748,7 @@
/* Now the coding tables */
for (int t = 0; t < nGroups; t++) {
- int curr = bsR(5);
+ int curr = (int) bsR(5);
final char[] len_t = len[t];
for (int i = 0; i < alphaSize; i++) {
while (bsGetBit()) {
@@ -532,7 +794,7 @@
}
private void getAndMoveToFrontDecode() throws IOException {
- this.origPtr = bsR(24);
+ this.origPtr = (int) bsR(24);
recvDecodingTables();
final InputStream inShadow = this.in;
@@ -562,8 +824,8 @@
int groupPos = G_SIZE - 1;
final int eob = this.nInUse + 1;
int nextSym = getAndMoveToFrontDecode0(0);
- int bsBuffShadow = this.bsBuff;
- int bsLiveShadow = this.bsLive;
+ int bsBuffShadow = (int) this.bsBuff;
+ int bsLiveShadow = (int) this.bsLive;
int lastShadow = -1;
int zt = selector[groupNo] & 0xff;
int[] base_zt = base[zt];
@@ -597,10 +859,8 @@
int zn = minLens_zt;
- // Inlined:
- // int zvec = bsR(zn);
while (bsLiveShadow < zn) {
- final int thech = inShadow.read();
+ final int thech = readAByte(inShadow);
if (thech >= 0) {
bsBuffShadow = (bsBuffShadow << 8) | thech;
bsLiveShadow += 8;
@@ -609,14 +869,14 @@
throw new IOException("unexpected end of stream");
}
}
- int zvec = (bsBuffShadow >> (bsLiveShadow - zn))
+ long zvec = (bsBuffShadow >> (bsLiveShadow - zn))
& ((1 << zn) - 1);
bsLiveShadow -= zn;
while (zvec > limit_zt[zn]) {
zn++;
while (bsLiveShadow < 1) {
- final int thech = inShadow.read();
+ final int thech = readAByte(inShadow);
if (thech >= 0) {
bsBuffShadow = (bsBuffShadow << 8) | thech;
bsLiveShadow += 8;
@@ -630,7 +890,7 @@
zvec = (zvec << 1)
| ((bsBuffShadow >> bsLiveShadow) & 1);
}
- nextSym = perm_zt[zvec - base_zt[zn]];
+ nextSym = perm_zt[(int) (zvec - base_zt[zn])];
}
final byte ch = seqToUnseq[yy[0]];
@@ -680,10 +940,8 @@
int zn = minLens_zt;
- // Inlined:
- // int zvec = bsR(zn);
while (bsLiveShadow < zn) {
- final int thech = inShadow.read();
+ final int thech = readAByte(inShadow);
if (thech >= 0) {
bsBuffShadow = (bsBuffShadow << 8) | thech;
bsLiveShadow += 8;
@@ -699,7 +957,7 @@
while (zvec > limit_zt[zn]) {
zn++;
while (bsLiveShadow < 1) {
- final int thech = inShadow.read();
+ final int thech = readAByte(inShadow);
if (thech >= 0) {
bsBuffShadow = (bsBuffShadow << 8) | thech;
bsLiveShadow += 8;
@@ -709,7 +967,7 @@
}
}
bsLiveShadow--;
- zvec = (zvec << 1) | ((bsBuffShadow >> bsLiveShadow) & 1);
+ zvec = ((zvec << 1) | ((bsBuffShadow >> bsLiveShadow) & 1));
}
nextSym = perm_zt[zvec - base_zt[zn]];
}
@@ -726,14 +984,14 @@
final int zt = dataShadow.selector[groupNo] & 0xff;
final int[] limit_zt = dataShadow.limit[zt];
int zn = dataShadow.minLens[zt];
- int zvec = bsR(zn);
- int bsLiveShadow = this.bsLive;
- int bsBuffShadow = this.bsBuff;
+ int zvec = (int) bsR(zn);
+ int bsLiveShadow = (int) this.bsLive;
+ int bsBuffShadow = (int) this.bsBuff;
while (zvec > limit_zt[zn]) {
zn++;
while (bsLiveShadow < 1) {
- final int thech = inShadow.read();
+ final int thech = readAByte(inShadow);
if (thech >= 0) {
bsBuffShadow = (bsBuffShadow << 8) | thech;
@@ -807,12 +1065,16 @@
this.su_ch2 = su_ch2Shadow ^= (this.su_rNToGo == 1) ? 1 : 0;
this.su_i2++;
this.currentChar = su_ch2Shadow;
- this.currentState = RAND_PART_B_STATE;
+ this.currentState = STATE.RAND_PART_B_STATE;
this.crc.updateCRC(su_ch2Shadow);
} else {
endBlock();
+ if (readMode == READ_MODE.CONTINUOUS) {
initBlock();
setupBlock();
+ } else if (readMode == READ_MODE.BYBLOCK) {
+ this.currentState = STATE.NO_PROCESS_STATE;
+ }
}
}
@@ -824,19 +1086,23 @@
this.su_tPos = this.data.tt[this.su_tPos];
this.su_i2++;
this.currentChar = su_ch2Shadow;
- this.currentState = NO_RAND_PART_B_STATE;
+ this.currentState = STATE.NO_RAND_PART_B_STATE;
this.crc.updateCRC(su_ch2Shadow);
} else {
- this.currentState = NO_RAND_PART_A_STATE;
+ this.currentState = STATE.NO_RAND_PART_A_STATE;
endBlock();
+ if (readMode == READ_MODE.CONTINUOUS) {
initBlock();
setupBlock();
+ } else if (readMode == READ_MODE.BYBLOCK) {
+ this.currentState = STATE.NO_PROCESS_STATE;
+ }
}
}
private void setupRandPartB() throws IOException {
if (this.su_ch2 != this.su_chPrev) {
- this.currentState = RAND_PART_A_STATE;
+ this.currentState = STATE.RAND_PART_A_STATE;
this.su_count = 1;
setupRandPartA();
} else if (++this.su_count >= 4) {
@@ -851,13 +1117,13 @@
this.su_rNToGo--;
}
this.su_j2 = 0;
- this.currentState = RAND_PART_C_STATE;
+ this.currentState = STATE.RAND_PART_C_STATE;
if (this.su_rNToGo == 1) {
this.su_z ^= 1;
}
setupRandPartC();
} else {
- this.currentState = RAND_PART_A_STATE;
+ this.currentState = STATE.RAND_PART_A_STATE;
setupRandPartA();
}
}
@@ -868,7 +1134,7 @@
this.crc.updateCRC(this.su_ch2);
this.su_j2++;
} else {
- this.currentState = RAND_PART_A_STATE;
+ this.currentState = STATE.RAND_PART_A_STATE;
this.su_i2++;
this.su_count = 0;
setupRandPartA();
@@ -895,7 +1161,7 @@
this.currentChar = su_ch2Shadow;
this.crc.updateCRC(su_ch2Shadow);
this.su_j2++;
- this.currentState = NO_RAND_PART_C_STATE;
+ this.currentState = STATE.NO_RAND_PART_C_STATE;
} else {
this.su_i2++;
this.su_count = 0;
Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/zlib/BuiltInZlibDeflater.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/zlib/BuiltInZlibDeflater.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/zlib/BuiltInZlibDeflater.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/zlib/BuiltInZlibDeflater.java Sat Nov 28 19:53:33 2009
@@ -21,7 +21,9 @@
import java.io.IOException;
import java.util.zip.Deflater;
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.Compressor;
+import org.mortbay.log.Log;
/**
* A wrapper around java.util.zip.Deflater to make it conform
@@ -46,4 +48,30 @@
throws IOException {
return super.deflate(b, off, len);
}
+
+ /**
+ * Reinitialize the compressor with the given configuration. It will reset the
+ * compressor's compression level and compression strategy. Unlike
+ * <tt>ZlibCompressor</tt>, <tt>BuiltInZlibDeflater</tt> only supports three
+ * kinds of compression strategy: FILTERED, HUFFMAN_ONLY and DEFAULT_STRATEGY.
+ * It will use DEFAULT_STRATEGY as the default if the configured compression
+ * strategy is not supported.
+ */
+ @Override
+ public void reinit(Configuration conf) {
+ reset();
+ if (conf == null) {
+ return;
+ }
+ setLevel(ZlibFactory.getCompressionLevel(conf).compressionLevel());
+ final ZlibCompressor.CompressionStrategy strategy =
+ ZlibFactory.getCompressionStrategy(conf);
+ try {
+ setStrategy(strategy.compressionStrategy());
+ } catch (IllegalArgumentException ill) {
+ Log.warn(strategy + " not supported by BuiltInZlibDeflater.");
+ setStrategy(DEFAULT_STRATEGY);
+ }
+ Log.debug("Reinit compressor with new compression configuration");
+ }
}
Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/zlib/ZlibCompressor.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/zlib/ZlibCompressor.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/zlib/ZlibCompressor.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/zlib/ZlibCompressor.java Sat Nov 28 19:53:33 2009
@@ -22,8 +22,10 @@
import java.nio.Buffer;
import java.nio.ByteBuffer;
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.Compressor;
import org.apache.hadoop.util.NativeCodeLoader;
+import org.mortbay.log.Log;
/**
* A {@link Compressor} based on the popular
@@ -40,7 +42,7 @@
private long stream;
private CompressionLevel level;
private CompressionStrategy strategy;
- private CompressionHeader windowBits;
+ private final CompressionHeader windowBits;
private int directBufferSize;
private byte[] userBuf = null;
private int userBufOff = 0, userBufLen = 0;
@@ -178,6 +180,31 @@
return nativeZlibLoaded;
}
+ protected final void construct(CompressionLevel level, CompressionStrategy strategy,
+ CompressionHeader header, int directBufferSize) {
+ }
+
+ /**
+ * Creates a new compressor with the default compression level.
+ * Compressed data will be generated in ZLIB format.
+ */
+ public ZlibCompressor() {
+ this(CompressionLevel.DEFAULT_COMPRESSION,
+ CompressionStrategy.DEFAULT_STRATEGY,
+ CompressionHeader.DEFAULT_HEADER,
+ DEFAULT_DIRECT_BUFFER_SIZE);
+ }
+
+ /**
+ * Creates a new compressor, taking settings from the configuration.
+ */
+ public ZlibCompressor(Configuration conf) {
+ this(ZlibFactory.getCompressionLevel(conf),
+ ZlibFactory.getCompressionStrategy(conf),
+ CompressionHeader.DEFAULT_HEADER,
+ DEFAULT_DIRECT_BUFFER_SIZE);
+ }
+
/**
* Creates a new compressor using the specified compression level.
* Compressed data will be generated in ZLIB format.
@@ -192,28 +219,38 @@
this.level = level;
this.strategy = strategy;
this.windowBits = header;
+ stream = init(this.level.compressionLevel(),
+ this.strategy.compressionStrategy(),
+ this.windowBits.windowBits());
+
this.directBufferSize = directBufferSize;
-
uncompressedDirectBuf = ByteBuffer.allocateDirect(directBufferSize);
compressedDirectBuf = ByteBuffer.allocateDirect(directBufferSize);
compressedDirectBuf.position(directBufferSize);
-
- stream = init(this.level.compressionLevel(),
- this.strategy.compressionStrategy(),
- this.windowBits.windowBits());
}
-
+
/**
- * Creates a new compressor with the default compression level.
- * Compressed data will be generated in ZLIB format.
+ * Prepare the compressor to be used in a new stream with settings defined in
+ * the given Configuration. It will reset the compressor's compression level
+ * and compression strategy.
+ *
+ * @param conf Configuration storing new settings
*/
- public ZlibCompressor() {
- this(CompressionLevel.DEFAULT_COMPRESSION,
- CompressionStrategy.DEFAULT_STRATEGY,
- CompressionHeader.DEFAULT_HEADER,
- DEFAULT_DIRECT_BUFFER_SIZE);
+ @Override
+ public synchronized void reinit(Configuration conf) {
+ reset();
+ if (conf == null) {
+ return;
+ }
+ end(stream);
+ level = ZlibFactory.getCompressionLevel(conf);
+ strategy = ZlibFactory.getCompressionStrategy(conf);
+ stream = init(level.compressionLevel(),
+ strategy.compressionStrategy(),
+ windowBits.windowBits());
+ Log.debug("Reinit compressor with new compression configuration");
}
-
+
public synchronized void setInput(byte[] b, int off, int len) {
if (b== null) {
throw new NullPointerException();
Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/zlib/ZlibFactory.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/zlib/ZlibFactory.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/zlib/ZlibFactory.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/zlib/ZlibFactory.java Sat Nov 28 19:53:33 2009
@@ -23,7 +23,10 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.Compressor;
import org.apache.hadoop.io.compress.Decompressor;
+import org.apache.hadoop.io.compress.zlib.ZlibCompressor.CompressionLevel;
+import org.apache.hadoop.io.compress.zlib.ZlibCompressor.CompressionStrategy;
import org.apache.hadoop.util.NativeCodeLoader;
+import org.apache.hadoop.fs.CommonConfigurationKeys;
/**
* A collection of factories to create the right
@@ -58,7 +61,9 @@
* and can be loaded for this job, else <code>false</code>
*/
public static boolean isNativeZlibLoaded(Configuration conf) {
- return nativeZlibLoaded && conf.getBoolean("hadoop.native.lib", true);
+ return nativeZlibLoaded && conf.getBoolean(
+ CommonConfigurationKeys.IO_NATIVE_LIB_AVAILABLE_KEY,
+ CommonConfigurationKeys.IO_NATIVE_LIB_AVAILABLE_DEFAULT);
}
/**
@@ -106,5 +111,25 @@
return (isNativeZlibLoaded(conf)) ?
new ZlibDecompressor() : new BuiltInZlibInflater();
}
-
+
+ public static void setCompressionStrategy(Configuration conf,
+ CompressionStrategy strategy) {
+ conf.setEnum("zlib.compress.strategy", strategy);
+ }
+
+ public static CompressionStrategy getCompressionStrategy(Configuration conf) {
+ return conf.getEnum("zlib.compress.strategy",
+ CompressionStrategy.DEFAULT_STRATEGY);
+ }
+
+ public static void setCompressionLevel(Configuration conf,
+ CompressionLevel level) {
+ conf.setEnum("zlib.compress.level", level);
+ }
+
+ public static CompressionLevel getCompressionLevel(Configuration conf) {
+ return conf.getEnum("zlib.compress.level",
+ CompressionLevel.DEFAULT_COMPRESSION);
+ }
+
}
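
A brief sketch tying the new ZlibFactory setters to Compressor.reinit (assuming the built-in deflater): settings are stored in the Configuration as enums and read back on reinit.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.io.compress.zlib.BuiltInZlibDeflater;
    import org.apache.hadoop.io.compress.zlib.ZlibCompressor.CompressionLevel;
    import org.apache.hadoop.io.compress.zlib.ZlibCompressor.CompressionStrategy;
    import org.apache.hadoop.io.compress.zlib.ZlibFactory;

    public class ZlibConfSketch {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        ZlibFactory.setCompressionLevel(conf, CompressionLevel.BEST_SPEED);
        ZlibFactory.setCompressionStrategy(conf, CompressionStrategy.HUFFMAN_ONLY);

        // A recycled compressor picks the settings up via reinit(conf).
        BuiltInZlibDeflater deflater = new BuiltInZlibDeflater();
        deflater.reinit(conf);
      }
    }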
Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/file/tfile/TFile.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/file/tfile/TFile.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/file/tfile/TFile.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/file/tfile/TFile.java Sat Nov 28 19:53:33 2009
@@ -34,6 +34,7 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.io.BoundedByteArrayOutputStream;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
@@ -668,10 +669,10 @@
* TFile Reader. Users may only read TFiles by creating TFile.Reader.Scanner.
* objects. A scanner may scan the whole TFile ({@link Reader#createScanner()}
* ) , a portion of TFile based on byte offsets (
- * {@link Reader#createScanner(long, long)}), or a portion of TFile with keys
+ * {@link Reader#createScannerByByteRange(long, long)}), or a portion of TFile with keys
* fall in a certain key range (for sorted TFile only,
- * {@link Reader#createScanner(byte[], byte[])} or
- * {@link Reader#createScanner(RawComparable, RawComparable)}).
+ * {@link Reader#createScannerByKey(byte[], byte[])} or
+ * {@link Reader#createScannerByKey(RawComparable, RawComparable)}).
*/
public static class Reader implements Closeable {
// The underlying BCFile reader.
@@ -985,6 +986,16 @@
return new Location(blkIndex, 0);
}
+ Location getLocationByRecordNum(long recNum) throws IOException {
+ checkTFileDataIndex();
+ return tfileIndex.getLocationByRecordNum(recNum);
+ }
+
+ long getRecordNumByLocation(Location location) throws IOException {
+ checkTFileDataIndex();
+ return tfileIndex.getRecordNumByLocation(location);
+ }
+
int compareKeys(byte[] a, int o1, int l1, byte[] b, int o2, int l2) {
if (!isSorted()) {
throw new RuntimeException("Cannot compare keys for unsorted TFiles.");
@@ -1016,6 +1027,21 @@
}
/**
+ * Get the RecordNum for the first key-value pair in a compressed block
+ * whose byte offset in the TFile is greater than or equal to the specified
+ * offset.
+ *
+ * @param offset
+ * the user supplied offset.
+ * @return the RecordNum to the corresponding entry. If no such entry
+ * exists, it returns the total entry count.
+ * @throws IOException
+ */
+ public long getRecordNumNear(long offset) throws IOException {
+ return getRecordNumByLocation(getLocationNear(offset));
+ }
+
+ /**
* Get a sample key that is within a block whose starting offset is greater
* than or equal to the specified offset.
*
@@ -1057,7 +1083,7 @@
* contains zero key-value pairs even if length is positive.
* @throws IOException
*/
- public Scanner createScanner(long offset, long length) throws IOException {
+ public Scanner createScannerByByteRange(long offset, long length) throws IOException {
return new Scanner(this, offset, offset + length);
}
@@ -1073,10 +1099,31 @@
* @return The actual coverage of the returned scanner will cover all keys
* greater than or equal to the beginKey and less than the endKey.
* @throws IOException
+ *
+ * @deprecated Use {@link #createScannerByKey(byte[], byte[])} instead.
*/
+ @Deprecated
public Scanner createScanner(byte[] beginKey, byte[] endKey)
+ throws IOException {
+ return createScannerByKey(beginKey, endKey);
+ }
+
+ /**
+ * Get a scanner that covers a portion of TFile based on keys.
+ *
+ * @param beginKey
+ * Begin key of the scan (inclusive). If null, scan from the first
+ * key-value entry of the TFile.
+ * @param endKey
+ * End key of the scan (exclusive). If null, scan up to the last
+ * key-value entry of the TFile.
+ * @return The actual coverage of the returned scanner will cover all keys
+ * greater than or equal to the beginKey and less than the endKey.
+ * @throws IOException
+ */
+ public Scanner createScannerByKey(byte[] beginKey, byte[] endKey)
throws IOException {
- return createScanner((beginKey == null) ? null : new ByteArray(beginKey,
+ return createScannerByKey((beginKey == null) ? null : new ByteArray(beginKey,
0, beginKey.length), (endKey == null) ? null : new ByteArray(endKey,
0, endKey.length));
}
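
The renamed key-range entry point keeps the old semantics. A sketch of a range scan over a sorted TFile (the key literals and the BytesWritable read-out are illustrative):

    import java.io.IOException;
    import org.apache.hadoop.io.BytesWritable;
    import org.apache.hadoop.io.file.tfile.TFile;

    // Scan all entries with keys in ["row100", "row200"), reader being an
    // open, sorted TFile.Reader.
    static void scanRange(TFile.Reader reader) throws IOException {
      TFile.Reader.Scanner scanner = reader.createScannerByKey(
          "row100".getBytes("UTF-8"), "row200".getBytes("UTF-8"));
      try {
        BytesWritable key = new BytesWritable();
        BytesWritable value = new BytesWritable();
        while (!scanner.atEnd()) {
          scanner.entry().get(key, value); // copy the current pair out
          // ... process key/value ...
          scanner.advance();
        }
      } finally {
        scanner.close();
      }
    }
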
@@ -1093,9 +1140,31 @@
* @return The actual coverage of the returned scanner will cover all keys
* greater than or equal to the beginKey and less than the endKey.
* @throws IOException
+ *
+ * @deprecated Use {@link #createScannerByKey(RawComparable, RawComparable)}
+ * instead.
*/
+ @Deprecated
public Scanner createScanner(RawComparable beginKey, RawComparable endKey)
throws IOException {
+ return createScannerByKey(beginKey, endKey);
+ }
+
+ /**
+ * Get a scanner that covers a specific key range.
+ *
+ * @param beginKey
+ * Begin key of the scan (inclusive). If null, scan from the first
+ * key-value entry of the TFile.
+ * @param endKey
+ * End key of the scan (exclusive). If null, scan up to the last
+ * key-value entry of the TFile.
+ * @return The actual coverage of the returned scanner will cover all keys
+ * greater than or equal to the beginKey and less than the endKey.
+ * @throws IOException
+ */
+ public Scanner createScannerByKey(RawComparable beginKey, RawComparable endKey)
+ throws IOException {
if ((beginKey != null) && (endKey != null)
&& (compareKeys(beginKey, endKey) >= 0)) {
return new Scanner(this, beginKey, beginKey);
@@ -1104,6 +1173,27 @@
}
/**
+ * Create a scanner that covers a range of records.
+ *
+ * @param beginRecNum
+ * The RecordNum for the first record (inclusive).
+ * @param endRecNum
+ * The RecordNum one past the last record of the scan (exclusive). To
+ * scan the whole file, either specify endRecNum==-1 or
+ * endRecNum==getEntryCount().
+ * @return The TFile scanner that covers the specified range of records.
+ * @throws IOException
+ */
+ public Scanner createScannerByRecordNum(long beginRecNum, long endRecNum)
+ throws IOException {
+ if (beginRecNum < 0) beginRecNum = 0;
+ if (endRecNum < 0 || endRecNum > getEntryCount()) {
+ endRecNum = getEntryCount();
+ }
+ return new Scanner(this, getLocationByRecordNum(beginRecNum),
+ getLocationByRecordNum(endRecNum));
+ }
+
+ /**
* The TFile Scanner. The Scanner has an implicit cursor, which, upon
* creation, points to the first key-value pair in the scan range. If the
* scan range is empty, the cursor will point to the end of the scan range.
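
Record-number scanners make even partitioning straightforward, since ranges can be computed arithmetically instead of by sampling keys or byte offsets. A sketch (worker count is illustrative; only methods introduced above are used):

    // reader is an open TFile.Reader; split its records across nWorkers.
    static void partition(TFile.Reader reader, int nWorkers)
        throws java.io.IOException {
      long total = reader.getEntryCount();
      for (int i = 0; i < nWorkers; i++) {
        long begin = total * i / nWorkers;
        long end = total * (i + 1) / nWorkers; // exclusive; final end == total
        TFile.Reader.Scanner s = reader.createScannerByRecordNum(begin, end);
        // ... hand s to worker i; the ranges are disjoint and cover all records.
      }
    }
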
@@ -1523,6 +1613,15 @@
}
/**
+ * Get the RecordNum corresponding to the entry pointed by the cursor.
+ * @return The RecordNum corresponding to the entry pointed by the cursor.
+ * @throws IOException
+ */
+ public long getRecordNum() throws IOException {
+ return reader.getRecordNumByLocation(currentLocation);
+ }
+
+ /**
* Internal API. Comparing the key at cursor to user-specified key.
*
* @param other
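
Because the cursor now maps back to a RecordNum, a scan can report exact progress cheaply. A hedged sketch (the reporting interval is illustrative):

    static void scanWithProgress(TFile.Reader reader)
        throws java.io.IOException {
      long total = reader.getEntryCount();
      TFile.Reader.Scanner scanner = reader.createScanner();
      try {
        while (!scanner.atEnd()) {
          long done = scanner.getRecordNum();
          if (done % 100000 == 0) {
            System.out.println(done + " / " + total + " records");
          }
          // ... process scanner.entry() ...
          scanner.advance();
        }
      } finally {
        scanner.close();
      }
    }
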
@@ -2020,8 +2119,10 @@
final static String BLOCK_NAME = "TFile.index";
private ByteArray firstKey;
private final ArrayList<TFileIndexEntry> index;
+ private final ArrayList<Long> recordNumIndex;
private final BytesComparator comparator;
-
+ private long sum = 0;
+
/**
* For reading from file.
*
@@ -2030,6 +2131,7 @@
public TFileIndex(int entryCount, DataInput in, BytesComparator comparator)
throws IOException {
index = new ArrayList<TFileIndexEntry>(entryCount);
+ recordNumIndex = new ArrayList<Long>(entryCount);
int size = Utils.readVInt(in); // size for the first key entry.
if (size > 0) {
byte[] buffer = new byte[size];
@@ -2051,6 +2153,8 @@
new TFileIndexEntry(new DataInputStream(new ByteArrayInputStream(
buffer, 0, size)));
index.add(idx);
+ sum += idx.entries();
+ recordNumIndex.add(sum);
}
} else {
if (entryCount != 0) {
@@ -2082,6 +2186,12 @@
return ret;
}
+ /**
+ * @param key
+ * input key.
+ * @return the ID of the first block containing a key greater than the
+ * input key, or -1 if no such block exists.
+ */
public int upperBound(RawComparable key) {
if (comparator == null) {
throw new RuntimeException("Cannot search in unsorted TFile");
@@ -2103,13 +2213,26 @@
*/
public TFileIndex(BytesComparator comparator) {
index = new ArrayList<TFileIndexEntry>();
+ recordNumIndex = new ArrayList<Long>();
this.comparator = comparator;
}
public RawComparable getFirstKey() {
return firstKey;
}
+
+ public Reader.Location getLocationByRecordNum(long recNum) {
+ int idx = Utils.upperBound(recordNumIndex, recNum);
+ long lastRecNum = (idx == 0) ? 0 : recordNumIndex.get(idx-1);
+ return new Reader.Location(idx, recNum - lastRecNum);
+ }
+ public long getRecordNumByLocation(Reader.Location location) {
+ int blkIndex = location.getBlockIndex();
+ long lastRecNum = (blkIndex == 0) ? 0 : recordNumIndex.get(blkIndex-1);
+ return lastRecNum + location.getRecordIndex();
+ }
+
public void setFirstKey(byte[] key, int offset, int length) {
firstKey = new ByteArray(new byte[length]);
System.arraycopy(key, offset, firstKey.buffer(), 0, length);
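
recordNumIndex holds the running total of entries through each block, so both lookups reduce to a binary search plus an offset. A worked example with illustrative counts (Utils.upperBound is assumed to return the first index whose element is strictly greater than the probe):

    // Three blocks holding 100, 150, 150 records => recordNumIndex == [100, 250, 400].
    //
    // getLocationByRecordNum(300):
    //   idx        = upperBound([100, 250, 400], 300) == 2  // 400 is first sum > 300
    //   lastRecNum = recordNumIndex.get(1)            == 250
    //   result     = Location(blockIndex=2, recordIndex=300 - 250) == (2, 50)
    //
    // getRecordNumByLocation(Location(2, 50)):
    //   250 + 50 == 300, recovering the original RecordNum.
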
@@ -2124,6 +2247,8 @@
public void addEntry(TFileIndexEntry keyEntry) {
index.add(keyEntry);
+ sum += keyEntry.entries();
+ recordNumIndex.add(sum);
}
public TFileIndexEntry getEntry(int bid) {
Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/Deserializer.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/Deserializer.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/Deserializer.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/Deserializer.java Sat Nov 28 19:53:33 2009
@@ -34,6 +34,7 @@
* </p>
* @param <T>
*/
+@Deprecated
public interface Deserializer<T> {
/**
* <p>Prepare the deserializer for reading.</p>
Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/DeserializerComparator.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/DeserializerComparator.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/DeserializerComparator.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/DeserializerComparator.java Sat Nov 28 19:53:33 2009
@@ -52,6 +52,13 @@
this.deserializer.open(buffer);
}
+ protected DeserializerComparator(DeserializerBase<T> deserializer)
+ throws IOException {
+
+ this.deserializer = deserializer;
+ this.deserializer.open(buffer);
+ }
+
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
try {
Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/Serialization.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/Serialization.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/Serialization.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/Serialization.java Sat Nov 28 19:53:33 2009
@@ -24,6 +24,7 @@
* </p>
* @param <T>
*/
+@Deprecated
public interface Serialization<T> {
/**
Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/SerializationFactory.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/SerializationFactory.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/SerializationFactory.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/SerializationFactory.java Sat Nov 28 19:53:33 2009
@@ -20,11 +20,13 @@
import java.util.ArrayList;
import java.util.List;
+import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.io.serializer.avro.AvroGenericSerialization;
import org.apache.hadoop.io.serializer.avro.AvroReflectSerialization;
import org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization;
import org.apache.hadoop.util.ReflectionUtils;
@@ -32,7 +34,7 @@
/**
* <p>
- * A factory for {@link Serialization}s.
+ * A factory for {@link SerializationBase}s.
* </p>
*/
public class SerializationFactory extends Configured {
@@ -40,7 +42,10 @@
private static final Log LOG =
LogFactory.getLog(SerializationFactory.class.getName());
- private List<Serialization<?>> serializations = new ArrayList<Serialization<?>>();
+ private List<SerializationBase<?>> serializations =
+ new ArrayList<SerializationBase<?>>();
+ private List<SerializationBase<?>> legacySerializations =
+ new ArrayList<SerializationBase<?>>();
/**
* <p>
@@ -54,7 +59,8 @@
for (String serializerName : conf.getStrings("io.serializations",
new String[]{WritableSerialization.class.getName(),
AvroSpecificSerialization.class.getName(),
- AvroReflectSerialization.class.getName()})) {
+ AvroReflectSerialization.class.getName(),
+ AvroGenericSerialization.class.getName()})) {
add(conf, serializerName);
}
}
@@ -62,30 +68,62 @@
@SuppressWarnings("unchecked")
private void add(Configuration conf, String serializationName) {
try {
-
- Class<? extends Serialization> serializionClass =
- (Class<? extends Serialization>) conf.getClassByName(serializationName);
- serializations.add((Serialization)
- ReflectionUtils.newInstance(serializionClass, getConf()));
+ Class<?> serializationClass = conf.getClassByName(serializationName);
+ if (SerializationBase.class.isAssignableFrom(serializationClass)) {
+ serializations.add((SerializationBase)
+ ReflectionUtils.newInstance(serializationClass, getConf()));
+ } else if (Serialization.class.isAssignableFrom(serializationClass)) {
+ Serialization serialization = (Serialization)
+ ReflectionUtils.newInstance(serializationClass, getConf());
+ legacySerializations.add(new LegacySerialization(serialization,
+ getConf()));
+ } else {
+ LOG.warn("Serialization class " + serializationName + " is not an " +
+ "instance of Serialization or BaseSerialization.");
+ }
} catch (ClassNotFoundException e) {
- LOG.warn("Serilization class not found: " +
+ LOG.warn("Serialization class not found: " +
StringUtils.stringifyException(e));
}
}
+ @Deprecated
public <T> Serializer<T> getSerializer(Class<T> c) {
return getSerialization(c).getSerializer(c);
}
+ @Deprecated
public <T> Deserializer<T> getDeserializer(Class<T> c) {
return getSerialization(c).getDeserializer(c);
}
- @SuppressWarnings("unchecked")
+ @Deprecated
public <T> Serialization<T> getSerialization(Class<T> c) {
- for (Serialization serialization : serializations) {
- if (serialization.accept(c)) {
- return (Serialization<T>) serialization;
+ return getSerialization(SerializationBase.getMetadataFromClass(c));
+ }
+
+ public <T> SerializerBase<T> getSerializer(Map<String, String> metadata) {
+ SerializationBase<T> serialization = getSerialization(metadata);
+ return serialization.getSerializer(metadata);
+ }
+
+ public <T> DeserializerBase<T> getDeserializer(Map<String, String> metadata) {
+ SerializationBase<T> serialization = getSerialization(metadata);
+ return serialization.getDeserializer(metadata);
+ }
+
+ @SuppressWarnings("unchecked")
+ public <T> SerializationBase<T> getSerialization(Map<String, String> metadata) {
+ for (SerializationBase serialization : serializations) {
+ if (serialization.accept(metadata)) {
+ return (SerializationBase<T>) serialization;
+ }
+ }
+ // Look in the legacy serializations last, since they ignore
+ // non-class metadata
+ for (SerializationBase serialization : legacySerializations) {
+ if (serialization.accept(metadata)) {
+ return (SerializationBase<T>) serialization;
}
}
return null;
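
With the factory now keyed on metadata maps, class-based callers go through SerializationBase.getMetadataFromClass, as the deprecated overload above does. A sketch of direct metadata-driven lookup (Text is just a convenient Writable):

    import java.util.Map;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.serializer.SerializationBase;
    import org.apache.hadoop.io.serializer.SerializationFactory;
    import org.apache.hadoop.io.serializer.SerializerBase;

    Configuration conf = new Configuration();
    SerializationFactory factory = new SerializationFactory(conf);

    // Class-only metadata; WritableSerialization accepts it for Writables.
    Map<String, String> meta = SerializationBase.getMetadataFromClass(Text.class);
    SerializerBase<Text> serializer = factory.getSerializer(meta);

Note the lookup order: metadata-aware serializations are consulted first, and the wrapped legacy ones only afterwards, since they can only match on the class entry.
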
Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/Serializer.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/Serializer.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/Serializer.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/Serializer.java Sat Nov 28 19:53:33 2009
@@ -34,6 +34,7 @@
* </p>
* @param <T>
*/
+@Deprecated
public interface Serializer<T> {
/**
* <p>Prepare the serializer for writing.</p>
Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/WritableSerialization.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/WritableSerialization.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/WritableSerialization.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/WritableSerialization.java Sat Nov 28 19:53:33 2009
@@ -23,22 +23,20 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
+import java.util.Map;
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;
/**
- * A {@link Serialization} for {@link Writable}s that delegates to
+ * A {@link SerializationBase} for {@link Writable}s that delegates to
* {@link Writable#write(java.io.DataOutput)} and
* {@link Writable#readFields(java.io.DataInput)}.
*/
-public class WritableSerialization extends Configured
- implements Serialization<Writable> {
+public class WritableSerialization extends SerializationBase<Writable> {
- static class WritableDeserializer extends Configured
- implements Deserializer<Writable> {
+ static class WritableDeserializer extends DeserializerBase<Writable> {
private Class<?> writableClass;
private DataInputStream dataIn;
@@ -48,6 +46,7 @@
this.writableClass = c;
}
+ @Override
public void open(InputStream in) {
if (in instanceof DataInputStream) {
dataIn = (DataInputStream) in;
@@ -56,6 +55,7 @@
}
}
+ @Override
public Writable deserialize(Writable w) throws IOException {
Writable writable;
if (w == null) {
@@ -68,16 +68,23 @@
return writable;
}
+ @Override
public void close() throws IOException {
dataIn.close();
}
}
- static class WritableSerializer implements Serializer<Writable> {
-
+ static class WritableSerializer extends SerializerBase<Writable> {
+
+ private Map<String, String> metadata;
private DataOutputStream dataOut;
+ public WritableSerializer(Map<String, String> metadata) {
+ this.metadata = metadata;
+ }
+
+ @Override
public void open(OutputStream out) {
if (out instanceof DataOutputStream) {
dataOut = (DataOutputStream) out;
@@ -86,26 +93,41 @@
}
}
+ @Override
public void serialize(Writable w) throws IOException {
w.write(dataOut);
}
+ @Override
public void close() throws IOException {
dataOut.close();
}
- }
+ @Override
+ public Map<String, String> getMetadata() throws IOException {
+ return metadata;
+ }
- public boolean accept(Class<?> c) {
- return Writable.class.isAssignableFrom(c);
}
- public Deserializer<Writable> getDeserializer(Class<Writable> c) {
- return new WritableDeserializer(getConf(), c);
+ @Override
+ public boolean accept(Map<String, String> metadata) {
+ if (getClass().getName().equals(metadata.get(SERIALIZATION_KEY))) {
+ return true;
+ }
+ Class<?> c = getClassFromMetadata(metadata);
+ return c == null ? false : Writable.class.isAssignableFrom(c);
}
- public Serializer<Writable> getSerializer(Class<Writable> c) {
- return new WritableSerializer();
+ @Override
+ public SerializerBase<Writable> getSerializer(Map<String, String> metadata) {
+ return new WritableSerializer(metadata);
+ }
+
+ @Override
+ public DeserializerBase<Writable> getDeserializer(Map<String, String> metadata) {
+ Class<?> c = getClassFromMetadata(metadata);
+ return new WritableDeserializer(getConf(), c);
}
}
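
A hedged end-to-end sketch of the metadata path for Writables (the buffer wiring is illustrative; deserialize(null) relies on WritableDeserializer allocating the instance itself, as shown above):

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.util.Map;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.serializer.DeserializerBase;
    import org.apache.hadoop.io.serializer.SerializationBase;
    import org.apache.hadoop.io.serializer.SerializationFactory;
    import org.apache.hadoop.io.serializer.SerializerBase;

    SerializationFactory factory = new SerializationFactory(new Configuration());
    Map<String, String> meta = SerializationBase.getMetadataFromClass(Text.class);

    // Serialize.
    SerializerBase<Text> ser = factory.getSerializer(meta);
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    ser.open(bytes);
    ser.serialize(new Text("hello"));
    ser.close();

    // Deserialize; null asks the deserializer to allocate the object.
    DeserializerBase<Text> deser = factory.getDeserializer(meta);
    deser.open(new ByteArrayInputStream(bytes.toByteArray()));
    Text roundTripped = deser.deserialize(null);
    deser.close();
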
Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/avro/AvroReflectSerialization.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/avro/AvroReflectSerialization.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/avro/AvroReflectSerialization.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/avro/AvroReflectSerialization.java Sat Nov 28 19:53:33 2009
@@ -19,6 +19,7 @@
package org.apache.hadoop.io.serializer.avro;
import java.util.HashSet;
+import java.util.Map;
import java.util.Set;
import org.apache.avro.Schema;
@@ -27,6 +28,7 @@
import org.apache.avro.reflect.ReflectData;
import org.apache.avro.reflect.ReflectDatumReader;
import org.apache.avro.reflect.ReflectDatumWriter;
+import org.apache.avro.specific.SpecificRecord;
/**
* Serialization for Avro Reflect classes. For a class to be accepted by this
@@ -47,10 +49,18 @@
private Set<String> packages;
- public synchronized boolean accept(Class<?> c) {
+ @Override
+ public synchronized boolean accept(Map<String, String> metadata) {
if (packages == null) {
getPackages();
}
+ if (getClass().getName().equals(metadata.get(SERIALIZATION_KEY))) {
+ return true;
+ }
+ Class<?> c = getClassFromMetadata(metadata);
+ if (c == null) {
+ return false;
+ }
return AvroReflectSerializable.class.isAssignableFrom(c) ||
packages.contains(c.getPackage().getName());
}
@@ -65,24 +75,22 @@
}
}
- protected DatumReader getReader(Class<Object> clazz) {
+ @Override
+ protected DatumReader getReader(Map<String, String> metadata) {
try {
- String prefix =
- ((clazz.getEnclosingClass() == null
- || "null".equals(clazz.getEnclosingClass().getName())) ?
- clazz.getPackage().getName() + "."
- : (clazz.getEnclosingClass().getName() + "$"));
- return new ReflectDatumReader(ReflectData.getSchema(clazz), prefix);
+ return new ReflectDatumReader(getClassFromMetadata(metadata));
} catch (Exception e) {
throw new RuntimeException(e);
}
}
- protected Schema getSchema(Object t) {
- return ReflectData.getSchema(t.getClass());
+ @Override
+ protected Schema getSchema(Object t, Map<String, String> metadata) {
+ return ReflectData.get().getSchema(t.getClass());
}
- protected DatumWriter getWriter(Class<Object> clazz) {
+ @Override
+ protected DatumWriter getWriter(Map<String, String> metadata) {
return new ReflectDatumWriter();
}
Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/avro/AvroSerialization.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/avro/AvroSerialization.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/avro/AvroSerialization.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/avro/AvroSerialization.java Sat Nov 28 19:53:33 2009
@@ -21,92 +21,105 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
+import java.util.Map;
import org.apache.avro.Schema;
import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.BinaryEncoder;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DatumWriter;
-import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.io.serializer.Deserializer;
-import org.apache.hadoop.io.serializer.Serialization;
-import org.apache.hadoop.io.serializer.Serializer;
+import org.apache.hadoop.io.serializer.DeserializerBase;
+import org.apache.hadoop.io.serializer.SerializationBase;
+import org.apache.hadoop.io.serializer.SerializerBase;
/**
* Base class for providing serialization to Avro types.
*/
-public abstract class AvroSerialization<T> extends Configured
- implements Serialization<T>{
+public abstract class AvroSerialization<T> extends SerializationBase<T> {
+
+ public static final String AVRO_SCHEMA_KEY = "Avro-Schema";
- public Deserializer<T> getDeserializer(Class<T> c) {
- return new AvroDeserializer(c);
+ public DeserializerBase<T> getDeserializer(Map<String, String> metadata) {
+ return new AvroDeserializer(metadata);
}
- public Serializer<T> getSerializer(Class<T> c) {
- return new AvroSerializer(c);
+ public SerializerBase<T> getSerializer(Map<String, String> metadata) {
+ return new AvroSerializer(metadata);
}
/**
- * Return an Avro Schema instance for the given class.
+ * Return an Avro Schema instance for the given class and metadata.
*/
- protected abstract Schema getSchema(T t);
+ protected abstract Schema getSchema(T t, Map<String, String> metadata);
/**
- * Create and return Avro DatumWriter for the given class.
+ * Create and return Avro DatumWriter for the given metadata.
*/
- protected abstract DatumWriter<T> getWriter(Class<T> clazz);
+ protected abstract DatumWriter<T> getWriter(Map<String, String> metadata);
/**
- * Create and return Avro DatumReader for the given class.
+ * Create and return Avro DatumReader for the given metadata.
*/
- protected abstract DatumReader<T> getReader(Class<T> clazz);
+ protected abstract DatumReader<T> getReader(Map<String, String> metadata);
- class AvroSerializer implements Serializer<T> {
+ class AvroSerializer extends SerializerBase<T> {
+ private Map<String, String> metadata;
private DatumWriter<T> writer;
private BinaryEncoder encoder;
private OutputStream outStream;
- protected Class<T> clazz;
- AvroSerializer(Class<T> clazz) {
- writer = getWriter(clazz);
+ AvroSerializer(Map<String, String> metadata) {
+ this.metadata = metadata;
+ writer = getWriter(metadata);
}
+ @Override
public void close() throws IOException {
encoder.flush();
outStream.close();
}
+ @Override
public void open(OutputStream out) throws IOException {
outStream = out;
encoder = new BinaryEncoder(out);
}
+ @Override
public void serialize(T t) throws IOException {
- writer.setSchema(getSchema(t));
+ writer.setSchema(getSchema(t, metadata));
writer.write(t, encoder);
}
+ @Override
+ public Map<String, String> getMetadata() throws IOException {
+ return metadata;
+ }
+
}
- class AvroDeserializer implements Deserializer<T> {
+ class AvroDeserializer extends DeserializerBase<T> {
private DatumReader<T> reader;
private BinaryDecoder decoder;
private InputStream inStream;
- AvroDeserializer(Class<T> clazz) {
- this.reader = getReader(clazz);
+ AvroDeserializer(Map<String, String> metadata) {
+ this.reader = getReader(metadata);
}
+ @Override
public void close() throws IOException {
inStream.close();
}
+ @Override
public T deserialize(T t) throws IOException {
return reader.read(t, decoder);
}
+ @Override
public void open(InputStream in) throws IOException {
inStream = in;
decoder = new BinaryDecoder(in);