Posted to common-commits@hadoop.apache.org by st...@apache.org on 2009/11/28 20:53:40 UTC

svn commit: r885142 [4/6] - in /hadoop/common/branches/HADOOP-6194: ./ .eclipse.templates/ bin/ ivy/ lib/jdiff/ src/ src/contrib/ src/contrib/ec2/ src/docs/ src/docs/src/documentation/ src/docs/src/documentation/content/xdocs/ src/docs/src/documentatio...

Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/BZip2Codec.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/BZip2Codec.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/BZip2Codec.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/BZip2Codec.java Sat Nov 28 19:53:33 2009
@@ -23,6 +23,9 @@
 import java.io.InputStream;
 import java.io.OutputStream;
 
+
+import org.apache.hadoop.fs.Seekable;
+import org.apache.hadoop.io.compress.bzip2.BZip2Constants;
 import org.apache.hadoop.io.compress.bzip2.BZip2DummyCompressor;
 import org.apache.hadoop.io.compress.bzip2.BZip2DummyDecompressor;
 import org.apache.hadoop.io.compress.bzip2.CBZip2InputStream;
@@ -35,17 +38,17 @@
  * CompressionCodec which have a Compressor or Decompressor type argument, throw
  * UnsupportedOperationException.
  */
-public class BZip2Codec implements
-    org.apache.hadoop.io.compress.CompressionCodec {
+public class BZip2Codec implements SplittableCompressionCodec {
 
   private static final String HEADER = "BZ";
   private static final int HEADER_LEN = HEADER.length();
+  private static final String SUB_HEADER = "h9";
+  private static final int SUB_HEADER_LEN = SUB_HEADER.length();
 
   /**
   * Creates a new instance of BZip2Codec
   */
-  public BZip2Codec() {
-  }
+  public BZip2Codec() { }
 
   /**
   * Creates CompressionOutputStream for BZip2
@@ -62,10 +65,10 @@
   }
 
   /**
-   * This functionality is currently not supported.
+  * Creates a CompressionOutputStream for BZip2 using the given OutputStream.
    *
-   * @throws java.lang.UnsupportedOperationException
-   *             Throws UnsupportedOperationException
+  * @return CompressionOutputStream
+  * @throws java.io.IOException
    */
   public CompressionOutputStream createOutputStream(OutputStream out,
       Compressor compressor) throws IOException {
@@ -75,8 +78,7 @@
   /**
   * This functionality is currently not supported.
   *
-  * @throws java.lang.UnsupportedOperationException
-  *             Throws UnsupportedOperationException
+  * @return BZip2DummyCompressor.class
   */
   public Class<? extends org.apache.hadoop.io.compress.Compressor> getCompressorType() {
     return BZip2DummyCompressor.class;
@@ -85,8 +87,7 @@
   /**
   * This functionality is currently not supported.
   *
-  * @throws java.lang.UnsupportedOperationException
-  *             Throws UnsupportedOperationException
+  * @return Compressor
   */
   public Compressor createCompressor() {
     return new BZip2DummyCompressor();
@@ -109,8 +110,7 @@
   /**
   * This functionality is currently not supported.
   *
-  * @throws java.lang.UnsupportedOperationException
-  *             Throws UnsupportedOperationException
+  * @return CompressionInputStream
   */
   public CompressionInputStream createInputStream(InputStream in,
       Decompressor decompressor) throws IOException {
@@ -118,10 +118,64 @@
   }
 
   /**
+   * Creates a CompressionInputStream to be used to read uncompressed data
+   * in one of the two reading modes, i.e. CONTINUOUS or BYBLOCK.
+   *
+   * @param seekableIn The InputStream
+   * @param start The start offset into the compressed stream
+   * @param end The end offset into the compressed stream
+   * @param readMode Controls whether progress is reported continuously or
+   *                 only at block boundaries.
+   *
+   * @return CompressionInputStream for BZip2 aligned at block boundaries
+   */
+  public SplitCompressionInputStream createInputStream(InputStream seekableIn,
+      Decompressor decompressor, long start, long end, READ_MODE readMode)
+      throws IOException {
+
+    if (!(seekableIn instanceof Seekable)) {
+      throw new IOException("seekableIn must be an instance of " +
+          Seekable.class.getName());
+    }
+
+    // find the position of the first BZip2 block start marker
+    ((Seekable)seekableIn).seek(0);
+
+    // BZip2 start-of-block markers are 6 bytes long.  But the very first block
+    // also has the "BZh9" header, making it 10 bytes.  This is the common case;
+    // at times, though, the stream might start without a leading BZ.
+    final long FIRST_BZIP2_BLOCK_MARKER_POSITION =
+      CBZip2InputStream.numberOfBytesTillNextMarker(seekableIn);
+    long adjStart = Math.max(0L, start - FIRST_BZIP2_BLOCK_MARKER_POSITION);
+
+    ((Seekable)seekableIn).seek(adjStart);
+    SplitCompressionInputStream in =
+      new BZip2CompressionInputStream(seekableIn, adjStart, end, readMode);
+
+
+    // The following if clause handles the following case:
+    // Assume the following scenario in a BZip2 compressed stream, where
+    // '.' represents compressed data.
+    // .....[48 bit Block].....[48 bit   Block].....[48 bit Block]...
+    // ........................[47 bits][1 bit].....[48 bit Block]...
+    // ................................^[Assume a Byte alignment here]
+    // ........................................^^[current position of stream]
+    // .....................^^[We go back 10 Bytes in stream and find a Block marker]
+    // ........................................^^[We align at wrong position!]
+    // ...........................................................^^[While this pos is correct]
+
+    if (in.getPos() <= start) {
+      ((Seekable)seekableIn).seek(start);
+      in = new BZip2CompressionInputStream(seekableIn, start, end, readMode);
+    }
+
+    return in;
+  }
+
+  /**
   * This functionality is currently not supported.
   *
-  * @throws java.lang.UnsupportedOperationException
-  *             Throws UnsupportedOperationException
+  * @return BZip2DummyDecompressor.class
   */
   public Class<? extends org.apache.hadoop.io.compress.Decompressor> getDecompressorType() {
     return BZip2DummyDecompressor.class;
@@ -130,8 +184,7 @@
   /**
   * This functionality is currently not supported.
   *
-  * @throws java.lang.UnsupportedOperationException
-  *             Throws UnsupportedOperationException
+  * @return Decompressor
   */
   public Decompressor createDecompressor() {
     return new BZip2DummyDecompressor();
@@ -146,7 +199,8 @@
     return ".bz2";
   }
 
-  private static class BZip2CompressionOutputStream extends CompressionOutputStream {
+  private static class BZip2CompressionOutputStream extends
+      CompressionOutputStream {
 
     // class data starts here//
     private CBZip2OutputStream output;
@@ -221,26 +275,79 @@
 
   }// end of class BZip2CompressionOutputStream
 
-  private static class BZip2CompressionInputStream extends CompressionInputStream {
+  /**
+   * This class is capable of decompressing BZip2 data in two modes,
+   * CONTINUOUS and BYBLOCK.  BYBLOCK mode makes it possible to
+   * start decompression at any arbitrary position in the stream.
+   *
+   * This facility can therefore be used to parallelize decompression
+   * of a large BZip2 file for performance reasons.  (The Hadoop
+   * framework does exactly this; see LineRecordReader for an
+   * example.)  The file can be broken, logically, into
+   * chunks for parallel processing.  These "splits" should be like
+   * the default Hadoop splits (e.g. as in FileInputFormat's getSplits method).
+   * This code is designed and tested for FileInputFormat's way
+   * of splitting only.
+   */
+
+  private static class BZip2CompressionInputStream extends
+      SplitCompressionInputStream {
 
     // class data starts here//
     private CBZip2InputStream input;
     boolean needsReset;
+    private BufferedInputStream bufferedIn;
+    private boolean isHeaderStripped = false;
+    private boolean isSubHeaderStripped = false;
+    private READ_MODE readMode = READ_MODE.CONTINUOUS;
+    private long startingPos = 0L;
+
+    // The following state machine handles the different states of the
+    // compressed stream position:
+    // HOLD : Don't advertise compressed stream position
+    // ADVERTISE : Read 1 more character and advertise stream position
+    // See more comments about it before updatePos method.
+    private enum POS_ADVERTISEMENT_STATE_MACHINE {
+      HOLD, ADVERTISE
+    };
+
+    POS_ADVERTISEMENT_STATE_MACHINE posSM = POS_ADVERTISEMENT_STATE_MACHINE.HOLD;
+    long compressedStreamPosition = 0;
+
     // class data ends here//
 
     public BZip2CompressionInputStream(InputStream in) throws IOException {
+      this(in, 0L, Long.MAX_VALUE, READ_MODE.CONTINUOUS);
+    }
 
-      super(in);
-      needsReset = true;
+    public BZip2CompressionInputStream(InputStream in, long start, long end,
+        READ_MODE readMode) throws IOException {
+      super(in, start, end);
+      needsReset = false;
+      bufferedIn = new BufferedInputStream(super.in);
+      this.startingPos = super.getPos();
+      this.readMode = readMode;
+      if (this.startingPos == 0) {
+        // We only strip header if it is start of file
+        bufferedIn = readStreamHeader();
+      }
+      input = new CBZip2InputStream(bufferedIn, readMode);
+      if (this.isHeaderStripped) {
+        input.updateReportedByteCount(HEADER_LEN);
+      }
+
+      if (this.isSubHeaderStripped) {
+        input.updateReportedByteCount(SUB_HEADER_LEN);
+      }
+
+      this.updatePos(false);
     }
 
     private BufferedInputStream readStreamHeader() throws IOException {
       // We are flexible enough to allow the compressed stream not to
       // start with the header of BZ. So it works fine either we have
       // the header or not.
-      BufferedInputStream bufferedIn = null;
       if (super.in != null) {
-        bufferedIn = new BufferedInputStream(super.in);
         bufferedIn.mark(HEADER_LEN);
         byte[] headerBytes = new byte[HEADER_LEN];
         int actualRead = bufferedIn.read(headerBytes, 0, HEADER_LEN);
@@ -248,6 +355,17 @@
           String header = new String(headerBytes);
           if (header.compareTo(HEADER) != 0) {
             bufferedIn.reset();
+          } else {
+            this.isHeaderStripped = true;
+            // In case of BYBLOCK mode, we also want to strip off
+            // remaining two character of the header.
+            if (this.readMode == READ_MODE.BYBLOCK) {
+              actualRead = bufferedIn.read(headerBytes, 0,
+                  SUB_HEADER_LEN);
+              if (actualRead != -1) {
+                this.isSubHeaderStripped = true;
+              }
+            }
           }
         }
       }
@@ -267,33 +385,96 @@
       }
     }
 
+    /**
+    * This method updates the compressed stream position exactly when the
+    * client of this code has read at least one byte past any BZip2
+    * end-of-block marker.
+    *
+    * This mechanism is very helpful for dealing with data-level record
+    * boundaries. Please see the constructor and next() methods of
+    * org.apache.hadoop.mapred.LineRecordReader as an example usage of this
+    * feature.  We elaborate with an example in the following:
+    *
+    * Assume two different scenarios of the BZip2 compressed stream, where
+    * [m] represents the end of a block, \n is the line delimiter and . represents
+    * compressed data.
+    *
+    * ............[m]......\n.......
+    *
+    * ..........\n[m]......\n.......
+    *
+    * Assume that end is right after [m].  In the first case the reading
+    * will stop at \n and there is no need to read one more line.  (The
+    * reason for reading one more line in the next() method is explained in LineRecordReader.)
+    * In the second example, however, LineRecordReader needs to read one more line
+    * (till the second \n).  Since BZip2Codec only updates the position once
+    * at least one byte past a marker has been read, it is straightforward to
+    * differentiate between the two cases mentioned.
+    *
+    */
+
     public int read(byte[] b, int off, int len) throws IOException {
       if (needsReset) {
         internalReset();
       }
-      return this.input.read(b, off, len);
 
+      int result = 0;
+      result = this.input.read(b, off, len);
+      if (result == BZip2Constants.END_OF_BLOCK) {
+        this.posSM = POS_ADVERTISEMENT_STATE_MACHINE.ADVERTISE;
+      }
+
+      if (this.posSM == POS_ADVERTISEMENT_STATE_MACHINE.ADVERTISE) {
+        result = this.input.read(b, off, off + 1);
+        // This is the precise time to update compressed stream position
+        // to the client of this code.
+        this.updatePos(true);
+        this.posSM = POS_ADVERTISEMENT_STATE_MACHINE.HOLD;
+      }
+
+      return result;
+
+    }
+
+    public int read() throws IOException {
+      byte b[] = new byte[1];
+      int result = this.read(b, 0, 1);
+      return (result < 0) ? result : b[0];
     }
 
     private void internalReset() throws IOException {
       if (needsReset) {
         needsReset = false;
         BufferedInputStream bufferedIn = readStreamHeader();
-        input = new CBZip2InputStream(bufferedIn);
+        input = new CBZip2InputStream(bufferedIn, this.readMode);
       }
     }    
     
     public void resetState() throws IOException {
-      // Cannot read from bufferedIn at this point because bufferedIn might not be ready
+      // Cannot read from bufferedIn at this point because bufferedIn
+      // might not be ready
       // yet, as in SequenceFile.Reader implementation.
       needsReset = true;
     }
 
-    public int read() throws IOException {
-      if (needsReset) {
-        internalReset();
+    public long getPos() {
+      return this.compressedStreamPosition;
       }
-      return this.input.read();
+
+    /*
+     * As the comments before the read method explain, the
+     * compressed stream position is advertised only when at least
+     * one byte past an EOB marker has been read off.  But
+     * there is an exception to this rule: when we
+     * construct the stream we advertise the position
+     * exactly at EOB.  In the following method the
+     * shouldAddOn boolean captures this exception.
+     *
+     */
+    private void updatePos(boolean shouldAddOn) {
+      int addOn = shouldAddOn ? 1 : 0;
+      this.compressedStreamPosition = this.startingPos
+          + this.input.getProcessedByteCount() + addOn;
     }
 
   }// end of BZip2CompressionInputStream
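
    Illustration only, not part of this patch: a minimal sketch of how a
    record reader might drive the splittable API added above.  The file path,
    split offsets and class name are hypothetical, and error handling is
    omitted; the point is the block-aligned getPos() contract described in
    the updatePos() comments.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FSDataInputStream;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.compress.BZip2Codec;
    import org.apache.hadoop.io.compress.SplitCompressionInputStream;
    import org.apache.hadoop.io.compress.SplittableCompressionCodec;

    public class BZip2SplitReadSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path file = new Path("/data/input.bz2");      // hypothetical input
        long start = 64L * 1024 * 1024;               // hypothetical split start
        long end = 128L * 1024 * 1024;                // hypothetical split end

        FSDataInputStream raw = fs.open(file);        // FSDataInputStream is Seekable
        BZip2Codec codec = new BZip2Codec();

        // The codec seeks to the first block marker at or near 'start' (see the
        // alignment logic in createInputStream above); in BYBLOCK mode the stream
        // only advertises a new compressed position once at least one byte past
        // an end-of-block marker has been read.
        SplitCompressionInputStream in = codec.createInputStream(raw,
            codec.createDecompressor(), start, end,
            SplittableCompressionCodec.READ_MODE.BYBLOCK);

        byte[] buf = new byte[64 * 1024];
        while (in.getPos() <= end) {
          int n = in.read(buf, 0, buf.length);
          if (n < 0) {
            break;                                    // end of stream
          }
          // process n uncompressed bytes from buf here
        }
        in.close();
      }
    }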

Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/BlockDecompressorStream.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/BlockDecompressorStream.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/BlockDecompressorStream.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/BlockDecompressorStream.java Sat Nov 28 19:53:33 2009
@@ -38,9 +38,10 @@
    * @param in input stream
    * @param decompressor decompressor to use
    * @param bufferSize size of buffer
+ * @throws IOException
    */
   public BlockDecompressorStream(InputStream in, Decompressor decompressor, 
-                                 int bufferSize) {
+                                 int bufferSize) throws IOException {
     super(in, decompressor, bufferSize);
   }
   
@@ -49,12 +50,13 @@
    * 
    * @param in input stream
    * @param decompressor decompressor to use
+ * @throws IOException
    */
-  public BlockDecompressorStream(InputStream in, Decompressor decompressor) {
+  public BlockDecompressorStream(InputStream in, Decompressor decompressor) throws IOException {
     super(in, decompressor);
   }
 
-  protected BlockDecompressorStream(InputStream in) {
+  protected BlockDecompressorStream(InputStream in) throws IOException {
     super(in);
   }
 

Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/CodecPool.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/CodecPool.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/CodecPool.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/CodecPool.java Sat Nov 28 19:53:33 2009
@@ -24,6 +24,7 @@
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.util.ReflectionUtils;
 
 /**
@@ -91,20 +92,26 @@
    *
    * @param codec the <code>CompressionCodec</code> for which to get the 
    *              <code>Compressor</code>
+   * @param conf the <code>Configuration</code> object used when creating or
+   *              reinitializing the compressor
    * @return <code>Compressor</code> for the given 
    *         <code>CompressionCodec</code> from the pool or a new one
    */
-  public static Compressor getCompressor(CompressionCodec codec) {
+  public static Compressor getCompressor(CompressionCodec codec, Configuration conf) {
     Compressor compressor = borrow(compressorPool, codec.getCompressorType());
     if (compressor == null) {
       compressor = codec.createCompressor();
       LOG.info("Got brand-new compressor");
     } else {
+      compressor.reinit(conf);
       LOG.debug("Got recycled compressor");
     }
     return compressor;
   }
   
+  public static Compressor getCompressor(CompressionCodec codec) {
+    return getCompressor(codec, null);
+  }
+  
   /**
    * Get a {@link Decompressor} for the given {@link CompressionCodec} from the
    * pool or a new one.
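
    Illustration only, not part of this patch: a rough sketch of the new
    conf-aware getCompressor() overload above.  The output path and sketch
    class name are hypothetical; GzipCodec is used here because its compressor
    picks up the zlib settings carried in the Configuration (see the
    ZlibCompressor/BuiltInZlibDeflater changes later in this commit).

    import java.io.FileOutputStream;
    import java.io.OutputStream;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.io.compress.CodecPool;
    import org.apache.hadoop.io.compress.CompressionOutputStream;
    import org.apache.hadoop.io.compress.Compressor;
    import org.apache.hadoop.io.compress.GzipCodec;
    import org.apache.hadoop.util.ReflectionUtils;

    public class CodecPoolConfSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        GzipCodec codec = ReflectionUtils.newInstance(GzipCodec.class, conf);

        // A recycled compressor is re-configured from 'conf' via
        // Compressor.reinit(conf) before being handed out; a brand-new one is
        // created by the codec itself.
        Compressor compressor = CodecPool.getCompressor(codec, conf);
        try {
          OutputStream raw = new FileOutputStream("/tmp/out.gz");   // hypothetical path
          CompressionOutputStream out = codec.createOutputStream(raw, compressor);
          out.write("hello, codec pool\n".getBytes());
          out.close();
        } finally {
          CodecPool.returnCompressor(compressor);     // return it to the pool
        }
      }
    }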

Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/CompressionInputStream.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/CompressionInputStream.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/CompressionInputStream.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/CompressionInputStream.java Sat Nov 28 19:53:33 2009
@@ -21,6 +21,8 @@
 import java.io.IOException;
 import java.io.InputStream;
 
+import org.apache.hadoop.fs.PositionedReadable;
+import org.apache.hadoop.fs.Seekable;
 /**
  * A compression input stream.
  *
@@ -28,19 +30,25 @@
  * reposition the underlying input stream then call {@link #resetState()},
  * without having to also synchronize client buffers.
  */
-public abstract class CompressionInputStream extends InputStream {
+
+public abstract class CompressionInputStream extends InputStream implements Seekable {
   /**
    * The input stream to be compressed. 
    */
   protected final InputStream in;
+  protected long maxAvailableData = 0L;
 
   /**
    * Create a compression input stream that reads
    * the decompressed bytes from the given stream.
    * 
    * @param in The input stream to be compressed.
+   * @throws IOException
    */
-  protected CompressionInputStream(InputStream in) {
+  protected CompressionInputStream(InputStream in) throws IOException {
+    if (!(in instanceof Seekable) || !(in instanceof PositionedReadable)) {
+        this.maxAvailableData = in.available();
+    }
     this.in = in;
   }
 
@@ -60,4 +68,40 @@
    */
   public abstract void resetState() throws IOException;
   
+  /**
+   * This method returns the current position in the stream.
+   *
+   * @return Current position in stream as a long
+   */
+  public long getPos() throws IOException {
+    if (!(in instanceof Seekable) || !(in instanceof PositionedReadable)){
+      // This way of getting the current position only works for sources
+      // whose size fits in an int, since it relies on what the
+      // available() method can report.
+      return (this.maxAvailableData - this.in.available());
+    }
+    else{
+      return ((Seekable)this.in).getPos();
+    }
+
+  }
+
+  /**
+   * This method is currently not supported.
+   *
+   * @throws UnsupportedOperationException
+   */
+
+  public void seek(long pos) throws UnsupportedOperationException {
+    throw new UnsupportedOperationException();
+  }
+
+  /**
+   * This method is currently not supported.
+   *
+   * @throws UnsupportedOperationException
+   */
+  public boolean seekToNewSource(long targetPos) throws UnsupportedOperationException {
+    throw new UnsupportedOperationException();
+  }
 }
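
    Illustration only, not part of this patch: a small round-trip sketch of the
    getPos() fallback above.  A ByteArrayInputStream is neither Seekable nor
    PositionedReadable, so the position is derived from available(); as the
    comment in getPos() notes, this only works while the source length fits in
    an int.  The class name is hypothetical.

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;

    import org.apache.hadoop.io.compress.BZip2Codec;
    import org.apache.hadoop.io.compress.CompressionInputStream;
    import org.apache.hadoop.io.compress.CompressionOutputStream;

    public class GetPosFallbackSketch {
      public static void main(String[] args) throws Exception {
        BZip2Codec codec = new BZip2Codec();

        // Compress a small payload into memory.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        CompressionOutputStream cout = codec.createOutputStream(bytes);
        cout.write("some sample text\n".getBytes());
        cout.close();

        // Read it back through a non-Seekable source; getPos() reports how many
        // compressed bytes have been consumed from the underlying stream.
        CompressionInputStream cin =
            codec.createInputStream(new ByteArrayInputStream(bytes.toByteArray()));
        byte[] buf = new byte[16];
        while (cin.read(buf, 0, buf.length) >= 0) {
          System.out.println("compressed position: " + cin.getPos());
        }
        cin.close();
      }
    }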

Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/Compressor.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/Compressor.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/Compressor.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/Compressor.java Sat Nov 28 19:53:33 2009
@@ -20,6 +20,8 @@
 
 import java.io.IOException;
 
+import org.apache.hadoop.conf.Configuration;
+
 /**
  * Specification of a stream-based 'compressor' which can be  
  * plugged into a {@link CompressionOutputStream} to compress data.
@@ -102,5 +104,13 @@
   /**
    * Closes the compressor and discards any unprocessed input.
    */
-  public void end(); 
+  public void end();
+
+  /**
+   * Prepare the compressor to be used in a new stream with settings defined in
+   * the given Configuration
+   * 
+   * @param conf Configuration from which new settings are fetched
+   */
+  public void reinit(Configuration conf);
 }

Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/DecompressorStream.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/DecompressorStream.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/DecompressorStream.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/DecompressorStream.java Sat Nov 28 19:53:33 2009
@@ -30,7 +30,7 @@
   protected boolean eof = false;
   protected boolean closed = false;
   
-  public DecompressorStream(InputStream in, Decompressor decompressor, int bufferSize) {
+  public DecompressorStream(InputStream in, Decompressor decompressor, int bufferSize) throws IOException {
     super(in);
 
     if (in == null || decompressor == null) {
@@ -43,7 +43,7 @@
     buffer = new byte[bufferSize];
   }
 
-  public DecompressorStream(InputStream in, Decompressor decompressor) {
+  public DecompressorStream(InputStream in, Decompressor decompressor) throws IOException {
     this(in, decompressor, 512);
   }
 
@@ -51,8 +51,9 @@
    * Allow derived classes to directly set the underlying stream.
    * 
    * @param in Underlying input stream.
+ * @throws IOException
    */
-  protected DecompressorStream(InputStream in) {
+  protected DecompressorStream(InputStream in) throws IOException {
     super(in);
   }
   

Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/GzipCodec.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/GzipCodec.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/GzipCodec.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/GzipCodec.java Sat Nov 28 19:53:33 2009
@@ -22,8 +22,11 @@
 import java.util.zip.GZIPOutputStream;
 import java.util.zip.GZIPInputStream;
 
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.compress.DefaultCodec;
 import org.apache.hadoop.io.compress.zlib.*;
+import org.apache.hadoop.io.compress.zlib.ZlibCompressor.CompressionLevel;
+import org.apache.hadoop.io.compress.zlib.ZlibCompressor.CompressionStrategy;
 
 /**
  * This class creates gzip compressors/decompressors. 
@@ -103,8 +106,9 @@
     
     /**
      * Allow subclasses to directly set the inflater stream.
+     * @throws IOException
      */
-    protected GzipInputStream(DecompressorStream in) {
+    protected GzipInputStream(DecompressorStream in) throws IOException {
       super(in);
     }
 
@@ -154,7 +158,7 @@
 
   public Compressor createCompressor() {
     return (ZlibFactory.isNativeZlibLoaded(conf))
-      ? new GzipZlibCompressor()
+      ? new GzipZlibCompressor(conf)
       : null;
   }
 
@@ -205,6 +209,13 @@
           ZlibCompressor.CompressionStrategy.DEFAULT_STRATEGY,
           ZlibCompressor.CompressionHeader.GZIP_FORMAT, 64*1024);
     }
+    
+    public GzipZlibCompressor(Configuration conf) {
+      super(ZlibFactory.getCompressionLevel(conf),
+           ZlibFactory.getCompressionStrategy(conf),
+           ZlibCompressor.CompressionHeader.GZIP_FORMAT,
+           64 * 1024);
+    }
   }
 
   static final class GzipZlibDecompressor extends ZlibDecompressor {

Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/bzip2/BZip2Constants.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/bzip2/BZip2Constants.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/bzip2/BZip2Constants.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/bzip2/BZip2Constants.java Sat Nov 28 19:53:33 2009
@@ -44,6 +44,14 @@
   int N_ITERS = 4;
   int MAX_SELECTORS = (2 + (900000 / G_SIZE));
   int NUM_OVERSHOOT_BYTES = 20;
+  /**
+   * End of a BZip2 block
+   */
+  public static final int END_OF_BLOCK = -2;
+  /**
+   * End of BZip2 stream.
+   */
+  public static final int END_OF_STREAM = -1;
 
   /**
   * This array really shouldn't be here. Again, for historical purposes it

Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/bzip2/BZip2DummyCompressor.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/bzip2/BZip2DummyCompressor.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/bzip2/BZip2DummyCompressor.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/bzip2/BZip2DummyCompressor.java Sat Nov 28 19:53:33 2009
@@ -20,6 +20,7 @@
 
 import java.io.IOException;
 
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.compress.Compressor;
 
 /**
@@ -77,4 +78,9 @@
     throw new UnsupportedOperationException();
   }
 
+  @Override
+  public void reinit(Configuration conf) {
+    // do nothing
+  }
+
 }

Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/bzip2/CBZip2InputStream.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/bzip2/CBZip2InputStream.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/bzip2/CBZip2InputStream.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/bzip2/CBZip2InputStream.java Sat Nov 28 19:53:33 2009
@@ -23,9 +23,13 @@
  */
 package org.apache.hadoop.io.compress.bzip2;
 
+import java.io.BufferedInputStream;
 import java.io.InputStream;
 import java.io.IOException;
 
+import org.apache.hadoop.io.compress.SplittableCompressionCodec.READ_MODE;
+
+
 /**
  * An input stream that decompresses from the BZip2 format (without the file
  * header chars) to be read as any other stream.
@@ -45,30 +49,43 @@
  * </p>
  *
  * <p>
+ * This Ant code was enhanced so that it can decompress blocks of bzip2 data.
+ * The current position in the stream is an important statistic for Hadoop. For
+ * example in LineRecordReader, we solely depend on the current position in the
+ * stream to know about the progress. The notion of position becomes complicated
+ * for compressed files. The Hadoop splitting is done in terms of the compressed
+ * file, but a compressed file inflates to a much larger amount of data. So we
+ * have handled this problem in the following way.
+ *
+ * At object creation time, we find the next block start delimiter. Once such a
+ * marker is found, the stream stops there (we discard any compressed data read
+ * in this process) and the position is updated (i.e. the caller of this class
+ * will find out the stream location). At this point we are ready for actual
+ * reading (i.e. decompression) of data.
+ *
+ * The subsequent read calls give out data. The position is updated when the
+ * caller of this class has read at least one byte past the current block. In
+ * between block reads, the position is not updated. (We can only update the
+ * position on block boundaries.)
+ * </p>
+ *
+ * <p>
  * Instances of this class are not threadsafe.
  * </p>
  */
 public class CBZip2InputStream extends InputStream implements BZip2Constants {
 
-  private static void reportCRCError() throws IOException {
 
-    throw new IOException("BZip2 CRC error");
-
-  }
-
-  private void makeMaps() {
-    final boolean[] inUse = this.data.inUse;
-    final byte[] seqToUnseq = this.data.seqToUnseq;
-
-    int nInUseShadow = 0;
-
-    for (int i = 0; i < 256; i++) {
-      if (inUse[i])
-        seqToUnseq[nInUseShadow++] = (byte) i;
-    }
-
-    this.nInUse = nInUseShadow;
-  }
+  public static final long BLOCK_DELIMITER = 0X314159265359L;// start of block
+  public static final long EOS_DELIMITER = 0X177245385090L;// end of bzip2 stream
+  private static final int DELIMITER_BIT_LENGTH = 48;
+  READ_MODE readMode = READ_MODE.CONTINUOUS;
+  // The variable records the current advertised position of the stream.
+  private long reportedBytesReadFromCompressedStream = 0L;
+  // The following variable keeps a record of the compressed bytes read.
+  private long bytesReadFromCompressedStream = 0L;
+  private boolean lazyInitialization = false;
+  private byte array[] = new byte[1];
 
   /**
   * Index of the last char in the block, so the block size == last + 1.
@@ -86,32 +103,34 @@
   */
   private int blockSize100k;
 
-  private boolean blockRandomised;
+  private boolean blockRandomised = false;
 
-  private int bsBuff;
-  private int bsLive;
+  private long bsBuff;
+  private long bsLive;
   private final CRC crc = new CRC();
 
   private int nInUse;
 
-  private InputStream in;
+  private BufferedInputStream in;
 
   private int currentChar = -1;
 
-  private static final int EOF = 0;
-  private static final int START_BLOCK_STATE = 1;
-  private static final int RAND_PART_A_STATE = 2;
-  private static final int RAND_PART_B_STATE = 3;
-  private static final int RAND_PART_C_STATE = 4;
-  private static final int NO_RAND_PART_A_STATE = 5;
-  private static final int NO_RAND_PART_B_STATE = 6;
-  private static final int NO_RAND_PART_C_STATE = 7;
+  /**
+   * A state machine to keep track of the current state of the decoder
+   *
+   */
+  public enum STATE {
+    EOF, START_BLOCK_STATE, RAND_PART_A_STATE, RAND_PART_B_STATE, RAND_PART_C_STATE, NO_RAND_PART_A_STATE, NO_RAND_PART_B_STATE, NO_RAND_PART_C_STATE, NO_PROCESS_STATE
+  };
 
-  private int currentState = START_BLOCK_STATE;
+  private STATE currentState = STATE.START_BLOCK_STATE;
 
   private int storedBlockCRC, storedCombinedCRC;
   private int computedBlockCRC, computedCombinedCRC;
 
+  private boolean skipResult = false;// used by skipToNextMarker
+  private static boolean skipDecompression = false;
+
   // Variables used by setup* methods exclusively
 
   private int su_count;
@@ -130,6 +149,121 @@
   private CBZip2InputStream.Data data;
 
   /**
+  * This method reports the processed bytes so far. Please note that this
+  * statistic is only updated on block boundaries and only when the stream is
+  * initiated in BYBLOCK mode.
+  */
+  public long getProcessedByteCount() {
+    return reportedBytesReadFromCompressedStream;
+  }
+
+  /**
+   * This method keeps track of raw processed compressed
+   * bytes.
+   *
+   * @param count the number of bytes to be
+   *           added to the raw processed byte count
+   */
+
+  protected void updateProcessedByteCount(int count) {
+    this.bytesReadFromCompressedStream += count;
+  }
+
+  /**
+   * This method is called by the client of this
+   * class in case there are any corrections in
+   * the stream position.  One common example is
+   * when the client of this code removes the starting BZ
+   * characters from the compressed stream.
+   *
+   * @param count the number of bytes added to the reported byte count
+   *
+   */
+  public void updateReportedByteCount(int count) {
+    this.reportedBytesReadFromCompressedStream += count;
+    this.updateProcessedByteCount(count);
+  }
+
+  /**
+  * This method reads a byte from the compressed stream. Whenever we need to
+  * read from the underlying compressed stream, this method should be called
+  * instead of directly calling the read method of the underlying compressed
+  * stream. This method does the important record keeping needed to track
+  * how many bytes have been read off the compressed stream.
+  */
+  private int readAByte(InputStream inStream) throws IOException {
+    int read = inStream.read();
+    if (read >= 0) {
+      this.updateProcessedByteCount(1);
+    }
+    return read;
+  }
+
+  /**
+  * This method tries to find the marker (passed to it as the first parameter)
+  * in the stream.  It can find bit patterns of length <= 63 bits.  Specifically
+  * this method is used in CBZip2InputStream to find the end of block (EOB)
+  * delimiter in the stream, starting from the current position of the stream.
+  * If marker is found, the stream position will be right after marker at the
+  * end of this call.
+  *
+  * @param marker  The bit pattern to be found in the stream
+  * @param markerBitLength  No of bits in the marker
+  *
+  * @throws IOException
+  * @throws IllegalArgumentException  if markerBitLength is greater than 63
+  */
+  public boolean skipToNextMarker(long marker, int markerBitLength)
+      throws IOException, IllegalArgumentException {
+    try {
+      if (markerBitLength > 63) {
+        throw new IllegalArgumentException(
+            "skipToNextMarker can not find patterns greater than 63 bits");
+      }
+      // pick next marketBitLength bits in the stream
+      long bytes = 0;
+      bytes = this.bsR(markerBitLength);
+      if (bytes == -1) {
+        return false;
+      }
+      while (true) {
+        if (bytes == marker) {
+          return true;
+
+        } else {
+          bytes = bytes << 1;
+          bytes = bytes & ((1L << markerBitLength) - 1);
+          int oneBit = (int) this.bsR(1);
+          if (oneBit != -1) {
+            bytes = bytes | oneBit;
+          } else
+            return false;
+        }
+      }
+    } catch (IOException ex) {
+      return false;
+    }
+  }
+
+  protected void reportCRCError() throws IOException {
+    throw new IOException("crc error");
+  }
+
+  private void makeMaps() {
+    final boolean[] inUse = this.data.inUse;
+    final byte[] seqToUnseq = this.data.seqToUnseq;
+
+    int nInUseShadow = 0;
+
+    for (int i = 0; i < 256; i++) {
+      if (inUse[i])
+        seqToUnseq[nInUseShadow++] = (byte) i;
+    }
+
+    this.nInUse = nInUseShadow;
+  }
+
+  /**
   * Constructs a new CBZip2InputStream which decompresses bytes read from the
   * specified stream.
   *
@@ -145,21 +279,99 @@
   * @throws NullPointerException
   *             if <tt>in == null</tt>
   */
-  public CBZip2InputStream(final InputStream in) throws IOException {
-    super();
+  public CBZip2InputStream(final InputStream in, READ_MODE readMode)
+      throws IOException {
 
-    this.in = in;
+    super();
+    int blockSize = 0X39;// i.e 9
+    this.blockSize100k = blockSize - '0';
+    this.in = new BufferedInputStream(in, 1024 * 9);// 9K buffer
+    this.readMode = readMode;
+    if (readMode == READ_MODE.CONTINUOUS) {
+      currentState = STATE.START_BLOCK_STATE;
+      lazyInitialization = (in.available() == 0)?true:false;
+      if(!lazyInitialization){
     init();
   }
+    } else if (readMode == READ_MODE.BYBLOCK) {
+      this.currentState = STATE.NO_PROCESS_STATE;
+      skipResult = this.skipToNextMarker(CBZip2InputStream.BLOCK_DELIMITER,DELIMITER_BIT_LENGTH);
+      this.reportedBytesReadFromCompressedStream = this.bytesReadFromCompressedStream;
+      if(!skipDecompression){
+        changeStateToProcessABlock();
+      }
+    }
+  }
+
+  /**
+   * Returns the number of bytes between the current stream position
+   * and the immediate next BZip2 block marker.
+   *
+   * @param in
+   *             The InputStream
+   *
+   * @return long Number of bytes between current stream position and the
+   * next BZip2 block start marker.
+ * @throws IOException
+   *
+   */
+  public static long numberOfBytesTillNextMarker(final InputStream in) throws IOException{
+    CBZip2InputStream.skipDecompression = true;
+    CBZip2InputStream anObject = null;
+
+    anObject = new CBZip2InputStream(in, READ_MODE.BYBLOCK);
+
+    return anObject.getProcessedByteCount();
+  }
+
+  public CBZip2InputStream(final InputStream in) throws IOException {
+    this(in, READ_MODE.CONTINUOUS);
+  }
+
+  private void changeStateToProcessABlock() throws IOException {
+    if (skipResult == true) {
+      initBlock();
+      setupBlock();
+    } else {
+      this.currentState = STATE.EOF;
+    }
+  }
+
 
   public int read() throws IOException {
+
     if (this.in != null) {
-      return read0();
+      int result = this.read(array, 0, 1);
+      int value = 0XFF & array[0];
+      return (result > 0 ? value : result);
+
     } else {
       throw new IOException("stream closed");
     }
   }
 
+  /**
+   * In CONTINUOUS reading mode, this read method starts from the
+   * start of the compressed stream and ends at the end of file,
+   * emitting uncompressed data.  In this mode stream positioning
+   * is not announced and should be ignored.
+   *
+   * In BYBLOCK reading mode, this read method signals the end
+   * of a BZip2 block by returning EOB.  At this event, the compressed
+   * stream position is also announced.  This announcement tells
+   * how much of the compressed stream has been decompressed and read
+   * out of this class.  In between EOB events, the stream position is
+   * not updated.
+   *
+   *
+   * @throws IOException
+   *             if the stream content is malformed or an I/O error occurs.
+   *
+   * @return int A return value greater than 0 is the number of bytes read.  A
+   * value of -1 means end of stream, while -2 represents end of block.
+   */
+
+
   public int read(final byte[] dest, final int offs, final int len)
       throws IOException {
     if (offs < 0) {
@@ -176,13 +388,39 @@
       throw new IOException("stream closed");
     }
 
+    if(lazyInitialization){
+      this.init();
+      this.lazyInitialization = false;
+    }
+
+    if(skipDecompression){
+      changeStateToProcessABlock();
+      CBZip2InputStream.skipDecompression = false;
+    }
+
     final int hi = offs + len;
     int destOffs = offs;
-    for (int b; (destOffs < hi) && ((b = read0()) >= 0);) {
+    int b = 0;
+
+
+
+    for (; ((destOffs < hi) && ((b = read0())) >= 0);) {
       dest[destOffs++] = (byte) b;
+
     }
 
-    return (destOffs == offs) ? -1 : (destOffs - offs);
+    int result = destOffs - offs;
+    if (result == 0) {
+      //report 'end of block' or 'end of stream'
+      result = b;
+
+      skipResult = this.skipToNextMarker(CBZip2InputStream.BLOCK_DELIMITER, DELIMITER_BIT_LENGTH);
+      //Exactly when we are about to start a new block, we advertise the stream position.
+      this.reportedBytesReadFromCompressedStream = this.bytesReadFromCompressedStream;
+
+      changeStateToProcessABlock();
+    }
+    return result;
   }
 
   private int read0() throws IOException {
@@ -190,7 +428,10 @@
 
     switch (this.currentState) {
     case EOF:
-      return -1;
+      return END_OF_STREAM;// return -1
+
+    case NO_PROCESS_STATE:
+      return END_OF_BLOCK;// return -2
 
     case START_BLOCK_STATE:
       throw new IllegalStateException();
@@ -225,13 +466,13 @@
   }
 
   private void init() throws IOException {
-    int magic2 = this.in.read();
+    int magic2 = this.readAByte(in);
     if (magic2 != 'h') {
       throw new IOException("Stream is not BZip2 formatted: expected 'h'"
           + " as first byte but got '" + (char) magic2 + "'");
     }
 
-    int blockSize = this.in.read();
+    int blockSize = this.readAByte(in);
     if ((blockSize < '1') || (blockSize > '9')) {
       throw new IOException("Stream is not BZip2 formatted: illegal "
           + "blocksize " + (char) blockSize);
@@ -244,6 +485,27 @@
   }
 
   private void initBlock() throws IOException {
+    if (this.readMode == READ_MODE.BYBLOCK) {
+      // this.checkBlockIntegrity();
+      this.storedBlockCRC = bsGetInt();
+      this.blockRandomised = bsR(1) == 1;
+
+      /**
+      * Allocate data here instead in constructor, so we do not allocate
+      * it if the input file is empty.
+      */
+      if (this.data == null) {
+        this.data = new Data(this.blockSize100k);
+      }
+
+      // currBlockNo++;
+      getAndMoveToFrontDecode();
+
+      this.crc.initialiseCRC();
+      this.currentState = STATE.START_BLOCK_STATE;
+      return;
+    }
+
     char magic0 = bsGetUByte();
     char magic1 = bsGetUByte();
     char magic2 = bsGetUByte();
@@ -261,7 +523,7 @@
         magic4 != 0x53 || // 'S'
         magic5 != 0x59 // 'Y'
     ) {
-      this.currentState = EOF;
+      this.currentState = STATE.EOF;
       throw new IOException("bad block header");
     } else {
       this.storedBlockCRC = bsGetInt();
@@ -279,7 +541,7 @@
       getAndMoveToFrontDecode();
 
       this.crc.initialiseCRC();
-      this.currentState = START_BLOCK_STATE;
+      this.currentState = STATE.START_BLOCK_STATE;
     }
   }
 
@@ -304,7 +566,7 @@
 
   private void complete() throws IOException {
     this.storedCombinedCRC = bsGetInt();
-    this.currentState = EOF;
+    this.currentState = STATE.EOF;
     this.data = null;
 
     if (this.storedCombinedCRC != this.computedCombinedCRC) {
@@ -326,14 +588,14 @@
     }
   }
 
-  private int bsR(final int n) throws IOException {
-    int bsLiveShadow = this.bsLive;
-    int bsBuffShadow = this.bsBuff;
+  private long bsR(final long n) throws IOException {
+    long bsLiveShadow = this.bsLive;
+    long bsBuffShadow = this.bsBuff;
 
     if (bsLiveShadow < n) {
       final InputStream inShadow = this.in;
       do {
-        int thech = inShadow.read();
+        int thech = readAByte(inShadow);
 
         if (thech < 0) {
           throw new IOException("unexpected end of stream");
@@ -347,15 +609,15 @@
     }
 
     this.bsLive = bsLiveShadow - n;
-    return (bsBuffShadow >> (bsLiveShadow - n)) & ((1 << n) - 1);
+    return (bsBuffShadow >> (bsLiveShadow - n)) & ((1L << n) - 1);
   }
 
   private boolean bsGetBit() throws IOException {
-    int bsLiveShadow = this.bsLive;
-    int bsBuffShadow = this.bsBuff;
+    long bsLiveShadow = this.bsLive;
+    long bsBuffShadow = this.bsBuff;
 
     if (bsLiveShadow < 1) {
-      int thech = this.in.read();
+      int thech = this.readAByte(in);
 
       if (thech < 0) {
         throw new IOException("unexpected end of stream");
@@ -375,7 +637,7 @@
   }
 
   private int bsGetInt() throws IOException {
-    return (((((bsR(8) << 8) | bsR(8)) << 8) | bsR(8)) << 8) | bsR(8);
+    return (int) ((((((bsR(8) << 8) | bsR(8)) << 8) | bsR(8)) << 8) | bsR(8));
   }
 
   /**
@@ -454,8 +716,8 @@
     final int alphaSize = this.nInUse + 2;
 
     /* Now the selectors */
-    final int nGroups = bsR(3);
-    final int nSelectors = bsR(15);
+    final int nGroups = (int) bsR(3);
+    final int nSelectors = (int) bsR(15);
 
     for (int i = 0; i < nSelectors; i++) {
       int j = 0;
@@ -486,7 +748,7 @@
 
     /* Now the coding tables */
     for (int t = 0; t < nGroups; t++) {
-      int curr = bsR(5);
+      int curr = (int) bsR(5);
       final char[] len_t = len[t];
       for (int i = 0; i < alphaSize; i++) {
         while (bsGetBit()) {
@@ -532,7 +794,7 @@
   }
 
   private void getAndMoveToFrontDecode() throws IOException {
-    this.origPtr = bsR(24);
+    this.origPtr = (int) bsR(24);
     recvDecodingTables();
 
     final InputStream inShadow = this.in;
@@ -562,8 +824,8 @@
     int groupPos = G_SIZE - 1;
     final int eob = this.nInUse + 1;
     int nextSym = getAndMoveToFrontDecode0(0);
-    int bsBuffShadow = this.bsBuff;
-    int bsLiveShadow = this.bsLive;
+    int bsBuffShadow = (int) this.bsBuff;
+    int bsLiveShadow = (int) this.bsLive;
     int lastShadow = -1;
     int zt = selector[groupNo] & 0xff;
     int[] base_zt = base[zt];
@@ -597,10 +859,8 @@
 
           int zn = minLens_zt;
 
-          // Inlined:
-          // int zvec = bsR(zn);
           while (bsLiveShadow < zn) {
-            final int thech = inShadow.read();
+            final int thech = readAByte(inShadow);
             if (thech >= 0) {
               bsBuffShadow = (bsBuffShadow << 8) | thech;
               bsLiveShadow += 8;
@@ -609,14 +869,14 @@
               throw new IOException("unexpected end of stream");
             }
           }
-          int zvec = (bsBuffShadow >> (bsLiveShadow - zn))
+          long zvec = (bsBuffShadow >> (bsLiveShadow - zn))
               & ((1 << zn) - 1);
           bsLiveShadow -= zn;
 
           while (zvec > limit_zt[zn]) {
             zn++;
             while (bsLiveShadow < 1) {
-              final int thech = inShadow.read();
+              final int thech = readAByte(inShadow);
               if (thech >= 0) {
                 bsBuffShadow = (bsBuffShadow << 8) | thech;
                 bsLiveShadow += 8;
@@ -630,7 +890,7 @@
             zvec = (zvec << 1)
                 | ((bsBuffShadow >> bsLiveShadow) & 1);
           }
-          nextSym = perm_zt[zvec - base_zt[zn]];
+          nextSym = perm_zt[(int) (zvec - base_zt[zn])];
         }
 
         final byte ch = seqToUnseq[yy[0]];
@@ -680,10 +940,8 @@
 
         int zn = minLens_zt;
 
-        // Inlined:
-        // int zvec = bsR(zn);
         while (bsLiveShadow < zn) {
-          final int thech = inShadow.read();
+          final int thech = readAByte(inShadow);
           if (thech >= 0) {
             bsBuffShadow = (bsBuffShadow << 8) | thech;
             bsLiveShadow += 8;
@@ -699,7 +957,7 @@
         while (zvec > limit_zt[zn]) {
           zn++;
           while (bsLiveShadow < 1) {
-            final int thech = inShadow.read();
+            final int thech = readAByte(inShadow);
             if (thech >= 0) {
               bsBuffShadow = (bsBuffShadow << 8) | thech;
               bsLiveShadow += 8;
@@ -709,7 +967,7 @@
             }
           }
           bsLiveShadow--;
-          zvec = (zvec << 1) | ((bsBuffShadow >> bsLiveShadow) & 1);
+          zvec = ((zvec << 1) | ((bsBuffShadow >> bsLiveShadow) & 1));
         }
         nextSym = perm_zt[zvec - base_zt[zn]];
       }
@@ -726,14 +984,14 @@
     final int zt = dataShadow.selector[groupNo] & 0xff;
     final int[] limit_zt = dataShadow.limit[zt];
     int zn = dataShadow.minLens[zt];
-    int zvec = bsR(zn);
-    int bsLiveShadow = this.bsLive;
-    int bsBuffShadow = this.bsBuff;
+    int zvec = (int) bsR(zn);
+    int bsLiveShadow = (int) this.bsLive;
+    int bsBuffShadow = (int) this.bsBuff;
 
     while (zvec > limit_zt[zn]) {
       zn++;
       while (bsLiveShadow < 1) {
-        final int thech = inShadow.read();
+        final int thech = readAByte(inShadow);
 
         if (thech >= 0) {
           bsBuffShadow = (bsBuffShadow << 8) | thech;
@@ -807,12 +1065,16 @@
       this.su_ch2 = su_ch2Shadow ^= (this.su_rNToGo == 1) ? 1 : 0;
       this.su_i2++;
       this.currentChar = su_ch2Shadow;
-      this.currentState = RAND_PART_B_STATE;
+      this.currentState = STATE.RAND_PART_B_STATE;
       this.crc.updateCRC(su_ch2Shadow);
     } else {
       endBlock();
+      if (readMode == READ_MODE.CONTINUOUS) {
       initBlock();
       setupBlock();
+      } else if (readMode == READ_MODE.BYBLOCK) {
+        this.currentState = STATE.NO_PROCESS_STATE;
+      }
     }
   }
 
@@ -824,19 +1086,23 @@
       this.su_tPos = this.data.tt[this.su_tPos];
       this.su_i2++;
       this.currentChar = su_ch2Shadow;
-      this.currentState = NO_RAND_PART_B_STATE;
+      this.currentState = STATE.NO_RAND_PART_B_STATE;
       this.crc.updateCRC(su_ch2Shadow);
     } else {
-      this.currentState = NO_RAND_PART_A_STATE;
+      this.currentState = STATE.NO_RAND_PART_A_STATE;
       endBlock();
+      if (readMode == READ_MODE.CONTINUOUS) {
       initBlock();
       setupBlock();
+      } else if (readMode == READ_MODE.BYBLOCK) {
+        this.currentState = STATE.NO_PROCESS_STATE;
+      }
     }
   }
 
   private void setupRandPartB() throws IOException {
     if (this.su_ch2 != this.su_chPrev) {
-      this.currentState = RAND_PART_A_STATE;
+      this.currentState = STATE.RAND_PART_A_STATE;
       this.su_count = 1;
       setupRandPartA();
     } else if (++this.su_count >= 4) {
@@ -851,13 +1117,13 @@
         this.su_rNToGo--;
       }
       this.su_j2 = 0;
-      this.currentState = RAND_PART_C_STATE;
+      this.currentState = STATE.RAND_PART_C_STATE;
       if (this.su_rNToGo == 1) {
         this.su_z ^= 1;
       }
       setupRandPartC();
     } else {
-      this.currentState = RAND_PART_A_STATE;
+      this.currentState = STATE.RAND_PART_A_STATE;
       setupRandPartA();
     }
   }
@@ -868,7 +1134,7 @@
       this.crc.updateCRC(this.su_ch2);
       this.su_j2++;
     } else {
-      this.currentState = RAND_PART_A_STATE;
+      this.currentState = STATE.RAND_PART_A_STATE;
       this.su_i2++;
       this.su_count = 0;
       setupRandPartA();
@@ -895,7 +1161,7 @@
       this.currentChar = su_ch2Shadow;
       this.crc.updateCRC(su_ch2Shadow);
       this.su_j2++;
-      this.currentState = NO_RAND_PART_C_STATE;
+      this.currentState = STATE.NO_RAND_PART_C_STATE;
     } else {
       this.su_i2++;
       this.su_count = 0;
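
    Illustration only, not part of this patch: a minimal sketch of BYBLOCK
    reading directly against CBZip2InputStream, showing the END_OF_BLOCK (-2)
    sentinel and the block-boundary position reporting described in the class
    comments above.  The input path and class name are hypothetical.

    import java.io.FileInputStream;
    import java.io.InputStream;

    import org.apache.hadoop.io.compress.SplittableCompressionCodec.READ_MODE;
    import org.apache.hadoop.io.compress.bzip2.BZip2Constants;
    import org.apache.hadoop.io.compress.bzip2.CBZip2InputStream;

    public class ByBlockReadSketch {
      public static void main(String[] args) throws Exception {
        // In BYBLOCK mode the constructor first scans forward to the next 48-bit
        // block delimiter, so a leading "BZh9" header (or a mid-stream starting
        // offset) is skipped transparently.
        InputStream raw = new FileInputStream("/tmp/input.bz2");
        CBZip2InputStream in = new CBZip2InputStream(raw, READ_MODE.BYBLOCK);

        byte[] buf = new byte[64 * 1024];
        while (true) {
          int n = in.read(buf, 0, buf.length);
          if (n == BZip2Constants.END_OF_BLOCK) {
            // A block has been fully emitted; the advertised position
            // (getProcessedByteCount) is refreshed only on these boundaries.
            System.out.println("block boundary, compressed offset now "
                + in.getProcessedByteCount());
          } else if (n < 0) {
            break;                      // BZip2Constants.END_OF_STREAM (-1)
          }
          // otherwise n uncompressed bytes are available in buf
        }
        in.close();
      }
    }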

Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/zlib/BuiltInZlibDeflater.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/zlib/BuiltInZlibDeflater.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/zlib/BuiltInZlibDeflater.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/zlib/BuiltInZlibDeflater.java Sat Nov 28 19:53:33 2009
@@ -21,7 +21,9 @@
 import java.io.IOException;
 import java.util.zip.Deflater;
 
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.compress.Compressor;
+import org.mortbay.log.Log;
 
 /**
  * A wrapper around java.util.zip.Deflater to make it conform 
@@ -46,4 +48,30 @@
     throws IOException {
     return super.deflate(b, off, len);
   }
+
+  /**
+   * Reinitialize the compressor with the given configuration. It will reset the
+   * compressor's compression level and compression strategy. Unlike
+   * <tt>ZlibCompressor</tt>, <tt>BuiltInZlibDeflater</tt> only supports three
+   * kinds of compression strategy: FILTERED, HUFFMAN_ONLY and DEFAULT_STRATEGY.
+   * It will fall back to DEFAULT_STRATEGY if the configured compression
+   * strategy is not supported.
+   */
+  @Override
+  public void reinit(Configuration conf) {
+    reset();
+    if (conf == null) {
+      return;
+    }
+    setLevel(ZlibFactory.getCompressionLevel(conf).compressionLevel());
+    final ZlibCompressor.CompressionStrategy strategy =
+      ZlibFactory.getCompressionStrategy(conf);
+    try {
+      setStrategy(strategy.compressionStrategy());
+    } catch (IllegalArgumentException ill) {
+      Log.warn(strategy + " not supported by BuiltInZlibDeflater.");
+      setStrategy(DEFAULT_STRATEGY);
+    }
+    Log.debug("Reinit compressor with new compression configuration");
+  }
 }

Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/zlib/ZlibCompressor.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/zlib/ZlibCompressor.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/zlib/ZlibCompressor.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/zlib/ZlibCompressor.java Sat Nov 28 19:53:33 2009
@@ -22,8 +22,10 @@
 import java.nio.Buffer;
 import java.nio.ByteBuffer;
 
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.compress.Compressor;
 import org.apache.hadoop.util.NativeCodeLoader;
+import org.mortbay.log.Log;
 
 /**
  * A {@link Compressor} based on the popular 
@@ -40,7 +42,7 @@
   private long stream;
   private CompressionLevel level;
   private CompressionStrategy strategy;
-  private CompressionHeader windowBits;
+  private final CompressionHeader windowBits;
   private int directBufferSize;
   private byte[] userBuf = null;
   private int userBufOff = 0, userBufLen = 0;
@@ -178,6 +180,31 @@
     return nativeZlibLoaded;
   }
 
+  protected final void construct(CompressionLevel level, CompressionStrategy strategy,
+      CompressionHeader header, int directBufferSize) {
+  }
+
+  /**
+   * Creates a new compressor with the default compression level.
+   * Compressed data will be generated in ZLIB format.
+   */
+  public ZlibCompressor() {
+    this(CompressionLevel.DEFAULT_COMPRESSION,
+         CompressionStrategy.DEFAULT_STRATEGY,
+         CompressionHeader.DEFAULT_HEADER,
+         DEFAULT_DIRECT_BUFFER_SIZE);
+  }
+
+  /**
+   * Creates a new compressor, taking settings from the configuration.
+   */
+  public ZlibCompressor(Configuration conf) {
+    this(ZlibFactory.getCompressionLevel(conf),
+         ZlibFactory.getCompressionStrategy(conf),
+         CompressionHeader.DEFAULT_HEADER,
+         DEFAULT_DIRECT_BUFFER_SIZE);
+  }
+
   /** 
    * Creates a new compressor using the specified compression level.
    * Compressed data will be generated in ZLIB format.
@@ -192,28 +219,38 @@
     this.level = level;
     this.strategy = strategy;
     this.windowBits = header;
+    stream = init(this.level.compressionLevel(), 
+                  this.strategy.compressionStrategy(), 
+                  this.windowBits.windowBits());
+
     this.directBufferSize = directBufferSize;
-    
     uncompressedDirectBuf = ByteBuffer.allocateDirect(directBufferSize);
     compressedDirectBuf = ByteBuffer.allocateDirect(directBufferSize);
     compressedDirectBuf.position(directBufferSize);
-    
-    stream = init(this.level.compressionLevel(), 
-                  this.strategy.compressionStrategy(), 
-                  this.windowBits.windowBits());
   }
-  
+
   /**
-   * Creates a new compressor with the default compression level.
-   * Compressed data will be generated in ZLIB format.
+   * Prepare the compressor to be used in a new stream with settings defined in
+   * the given Configuration. It will reset the compressor's compression level
+   * and compression strategy.
+   * 
+   * @param conf Configuration storing new settings
    */
-  public ZlibCompressor() {
-    this(CompressionLevel.DEFAULT_COMPRESSION, 
-         CompressionStrategy.DEFAULT_STRATEGY, 
-         CompressionHeader.DEFAULT_HEADER, 
-         DEFAULT_DIRECT_BUFFER_SIZE);
+  @Override
+  public synchronized void reinit(Configuration conf) {
+    reset();
+    if (conf == null) {
+      return;
+    }
+    end(stream);
+    level = ZlibFactory.getCompressionLevel(conf);
+    strategy = ZlibFactory.getCompressionStrategy(conf);
+    stream = init(level.compressionLevel(), 
+                  strategy.compressionStrategy(), 
+                  windowBits.windowBits());
+    Log.debug("Reinit compressor with new compression configuration");
   }
-  
+
   public synchronized void setInput(byte[] b, int off, int len) {
     if (b== null) {
       throw new NullPointerException();

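[editor's note] A minimal sketch (not part of this commit) of how a caller might drive the new Configuration-based constructor and reinit(Configuration) hook shown above. It assumes the existing CompressionLevel/CompressionStrategy enum constants (BEST_SPEED, BEST_COMPRESSION, DEFAULT_STRATEGY) and uses the ZlibFactory helpers added in the next file:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.zlib.ZlibCompressor;
import org.apache.hadoop.io.compress.zlib.ZlibCompressor.CompressionLevel;
import org.apache.hadoop.io.compress.zlib.ZlibCompressor.CompressionStrategy;
import org.apache.hadoop.io.compress.zlib.ZlibFactory;

public class ZlibReinitSketch {
  public static void main(String[] args) {
    // Job A: fast compression, settings carried in the job configuration.
    Configuration jobA = new Configuration();
    ZlibFactory.setCompressionLevel(jobA, CompressionLevel.BEST_SPEED);
    ZlibFactory.setCompressionStrategy(jobA, CompressionStrategy.DEFAULT_STRATEGY);
    ZlibCompressor compressor = new ZlibCompressor(jobA);

    // Job B: a pooled compressor is re-targeted without re-allocation;
    // reinit() tears down and re-creates the native stream with the new
    // level and strategy read from the configuration.
    Configuration jobB = new Configuration();
    ZlibFactory.setCompressionLevel(jobB, CompressionLevel.BEST_COMPRESSION);
    compressor.reinit(jobB);

    compressor.end();  // release the native zlib stream when done
  }
}
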
Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/zlib/ZlibFactory.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/zlib/ZlibFactory.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/zlib/ZlibFactory.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/compress/zlib/ZlibFactory.java Sat Nov 28 19:53:33 2009
@@ -23,7 +23,10 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.compress.Compressor;
 import org.apache.hadoop.io.compress.Decompressor;
+import org.apache.hadoop.io.compress.zlib.ZlibCompressor.CompressionLevel;
+import org.apache.hadoop.io.compress.zlib.ZlibCompressor.CompressionStrategy;
 import org.apache.hadoop.util.NativeCodeLoader;
+import org.apache.hadoop.fs.CommonConfigurationKeys;
 
 /**
  * A collection of factories to create the right 
@@ -58,7 +61,9 @@
    *         and can be loaded for this job, else <code>false</code>
    */
   public static boolean isNativeZlibLoaded(Configuration conf) {
-    return nativeZlibLoaded && conf.getBoolean("hadoop.native.lib", true); 
+    return nativeZlibLoaded && conf.getBoolean(
+                          CommonConfigurationKeys.IO_NATIVE_LIB_AVAILABLE_KEY, 
+                          CommonConfigurationKeys.IO_NATIVE_LIB_AVAILABLE_DEFAULT);
   }
   
   /**
@@ -106,5 +111,25 @@
     return (isNativeZlibLoaded(conf)) ? 
       new ZlibDecompressor() : new BuiltInZlibInflater(); 
   }
-  
+
+  public static void setCompressionStrategy(Configuration conf,
+      CompressionStrategy strategy) {
+    conf.setEnum("zlib.compress.strategy", strategy);
+  }
+
+  public static CompressionStrategy getCompressionStrategy(Configuration conf) {
+    return conf.getEnum("zlib.compress.strategy",
+        CompressionStrategy.DEFAULT_STRATEGY);
+  }
+
+  public static void setCompressionLevel(Configuration conf,
+      CompressionLevel level) {
+    conf.setEnum("zlib.compress.level", level);
+  }
+
+  public static CompressionLevel getCompressionLevel(Configuration conf) {
+    return conf.getEnum("zlib.compress.level",
+        CompressionLevel.DEFAULT_COMPRESSION);
+  }
+
 }

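[editor's note] The new ZlibFactory accessors above store the enum names under "zlib.compress.level" and "zlib.compress.strategy", so the same settings can also come from a configuration file. A small sketch (not part of this commit; the HUFFMAN_ONLY constant is assumed from the existing CompressionStrategy enum):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.zlib.ZlibCompressor.CompressionLevel;
import org.apache.hadoop.io.compress.zlib.ZlibCompressor.CompressionStrategy;
import org.apache.hadoop.io.compress.zlib.ZlibFactory;

public class ZlibFactoryConfigSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();

    // Programmatic form ...
    ZlibFactory.setCompressionLevel(conf, CompressionLevel.BEST_COMPRESSION);
    ZlibFactory.setCompressionStrategy(conf, CompressionStrategy.HUFFMAN_ONLY);

    // ... is equivalent to setting the raw keys, e.g. in a site file:
    //   zlib.compress.level=BEST_COMPRESSION
    //   zlib.compress.strategy=HUFFMAN_ONLY
    System.out.println(ZlibFactory.getCompressionLevel(conf));
    System.out.println(ZlibFactory.getCompressionStrategy(conf));

    // Native availability is now keyed off CommonConfigurationKeys
    // rather than the literal "hadoop.native.lib" string.
    System.out.println(ZlibFactory.isNativeZlibLoaded(conf));
  }
}
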
Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/file/tfile/TFile.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/file/tfile/TFile.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/file/tfile/TFile.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/file/tfile/TFile.java Sat Nov 28 19:53:33 2009
@@ -34,6 +34,7 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.io.BoundedByteArrayOutputStream;
 import org.apache.hadoop.io.BytesWritable;
 import org.apache.hadoop.io.DataInputBuffer;
 import org.apache.hadoop.io.DataOutputBuffer;
@@ -668,10 +669,10 @@
    * TFile Reader. Users may only read TFiles by creating TFile.Reader.Scanner.
    * objects. A scanner may scan the whole TFile ({@link Reader#createScanner()}
    * ) , a portion of TFile based on byte offsets (
-   * {@link Reader#createScanner(long, long)}), or a portion of TFile with keys
+   * {@link Reader#createScannerByByteRange(long, long)}), or a portion of TFile whose keys
    * fall in a certain key range (for sorted TFile only,
-   * {@link Reader#createScanner(byte[], byte[])} or
-   * {@link Reader#createScanner(RawComparable, RawComparable)}).
+   * {@link Reader#createScannerByKey(byte[], byte[])} or
+   * {@link Reader#createScannerByKey(RawComparable, RawComparable)}).
    */
   public static class Reader implements Closeable {
     // The underlying BCFile reader.
@@ -985,6 +986,16 @@
       return new Location(blkIndex, 0);
     }
 
+    Location getLocationByRecordNum(long recNum) throws IOException {
+      checkTFileDataIndex();
+      return tfileIndex.getLocationByRecordNum(recNum);
+    }
+
+    long getRecordNumByLocation(Location location) throws IOException {
+      checkTFileDataIndex();
+      return tfileIndex.getRecordNumByLocation(location);      
+    }
+    
     int compareKeys(byte[] a, int o1, int l1, byte[] b, int o2, int l2) {
       if (!isSorted()) {
         throw new RuntimeException("Cannot compare keys for unsorted TFiles.");
@@ -1016,6 +1027,21 @@
     }
 
     /**
+     * Get the RecordNum for the first key-value pair in a compressed block
+     * whose byte offset in the TFile is greater than or equal to the specified
+     * offset.
+     * 
+     * @param offset
+     *          the user supplied offset.
+     * @return the RecordNum of the corresponding entry. If no such entry
+     *         exists, it returns the total entry count.
+     * @throws IOException
+     */
+    public long getRecordNumNear(long offset) throws IOException {
+      return getRecordNumByLocation(getLocationNear(offset));
+    }
+    
+    /**
      * Get a sample key that is within a block whose starting offset is greater
      * than or equal to the specified offset.
      * 
@@ -1057,7 +1083,7 @@
      *         contains zero key-value pairs even if length is positive.
      * @throws IOException
      */
-    public Scanner createScanner(long offset, long length) throws IOException {
+    public Scanner createScannerByByteRange(long offset, long length) throws IOException {
       return new Scanner(this, offset, offset + length);
     }
 
@@ -1073,10 +1099,31 @@
      * @return The actual coverage of the returned scanner will cover all keys
      *         greater than or equal to the beginKey and less than the endKey.
      * @throws IOException
+     * 
+     * @deprecated Use {@link #createScannerByKey(byte[], byte[])} instead.
      */
+    @Deprecated
     public Scanner createScanner(byte[] beginKey, byte[] endKey)
+      throws IOException {
+      return createScannerByKey(beginKey, endKey);
+    }
+    
+    /**
+     * Get a scanner that covers a portion of TFile based on keys.
+     * 
+     * @param beginKey
+     *          Begin key of the scan (inclusive). If null, scan from the first
+     *          key-value entry of the TFile.
+     * @param endKey
+     *          End key of the scan (exclusive). If null, scan up to the last
+     *          key-value entry of the TFile.
+     * @return The actual coverage of the returned scanner will cover all keys
+     *         greater than or equal to the beginKey and less than the endKey.
+     * @throws IOException
+     */
+    public Scanner createScannerByKey(byte[] beginKey, byte[] endKey)
         throws IOException {
-      return createScanner((beginKey == null) ? null : new ByteArray(beginKey,
+      return createScannerByKey((beginKey == null) ? null : new ByteArray(beginKey,
           0, beginKey.length), (endKey == null) ? null : new ByteArray(endKey,
           0, endKey.length));
     }
@@ -1093,9 +1140,31 @@
      * @return The actual coverage of the returned scanner will cover all keys
      *         greater than or equal to the beginKey and less than the endKey.
      * @throws IOException
+     * 
+     * @deprecated Use {@link #createScannerByKey(RawComparable, RawComparable)}
+     *             instead.
      */
+    @Deprecated
     public Scanner createScanner(RawComparable beginKey, RawComparable endKey)
         throws IOException {
+      return createScannerByKey(beginKey, endKey);
+    }
+
+    /**
+     * Get a scanner that covers a specific key range.
+     * 
+     * @param beginKey
+     *          Begin key of the scan (inclusive). If null, scan from the first
+     *          key-value entry of the TFile.
+     * @param endKey
+     *          End key of the scan (exclusive). If null, scan up to the last
+     *          key-value entry of the TFile.
+     * @return The actual coverage of the returned scanner will cover all keys
+     *         greater than or equal to the beginKey and less than the endKey.
+     * @throws IOException
+     */
+    public Scanner createScannerByKey(RawComparable beginKey, RawComparable endKey)
+        throws IOException {
       if ((beginKey != null) && (endKey != null)
           && (compareKeys(beginKey, endKey) >= 0)) {
         return new Scanner(this, beginKey, beginKey);
@@ -1104,6 +1173,27 @@
     }
 
     /**
+     * Create a scanner that covers a range of records.
+     * 
+     * @param beginRecNum
+     *          The RecordNum for the first record (inclusive).
+     * @param endRecNum
+     *          The RecordNum for the last record (exclusive). To scan the whole
+     *          file, either specify endRecNum==-1 or endRecNum==getEntryCount().
+     * @return The TFile scanner that covers the specified range of records.
+     * @throws IOException
+     */
+    public Scanner createScannerByRecordNum(long beginRecNum, long endRecNum)
+        throws IOException {
+      if (beginRecNum < 0) beginRecNum = 0;
+      if (endRecNum < 0 || endRecNum > getEntryCount()) {
+        endRecNum = getEntryCount();
+      }
+      return new Scanner(this, getLocationByRecordNum(beginRecNum),
+          getLocationByRecordNum(endRecNum));
+    }
+
+    /**
      * The TFile Scanner. The Scanner has an implicit cursor, which, upon
      * creation, points to the first key-value pair in the scan range. If the
      * scan range is empty, the cursor will point to the end of the scan range.
@@ -1523,6 +1613,15 @@
       }
 
       /**
+       * Get the RecordNum corresponding to the entry pointed by the cursor.
+       * @return The RecordNum corresponding to the entry pointed by the cursor.
+       * @throws IOException
+       */
+      public long getRecordNum() throws IOException {
+        return reader.getRecordNumByLocation(currentLocation);
+      }
+      
+      /**
        * Internal API. Comparing the key at cursor to user-specified key.
        * 
        * @param other
@@ -2020,8 +2119,10 @@
     final static String BLOCK_NAME = "TFile.index";
     private ByteArray firstKey;
     private final ArrayList<TFileIndexEntry> index;
+    private final ArrayList<Long> recordNumIndex;
     private final BytesComparator comparator;
-
+    private long sum = 0;
+    
     /**
      * For reading from file.
      * 
@@ -2030,6 +2131,7 @@
     public TFileIndex(int entryCount, DataInput in, BytesComparator comparator)
         throws IOException {
       index = new ArrayList<TFileIndexEntry>(entryCount);
+      recordNumIndex = new ArrayList<Long>(entryCount);
       int size = Utils.readVInt(in); // size for the first key entry.
       if (size > 0) {
         byte[] buffer = new byte[size];
@@ -2051,6 +2153,8 @@
               new TFileIndexEntry(new DataInputStream(new ByteArrayInputStream(
                   buffer, 0, size)));
           index.add(idx);
+          sum += idx.entries();
+          recordNumIndex.add(sum);
         }
       } else {
         if (entryCount != 0) {
@@ -2082,6 +2186,12 @@
       return ret;
     }
 
+    /**
+     * @param key
+     *          input key.
+     * @return the ID of the first block that contains a key greater than the
+     *         input key, or -1 if no such block exists.
+     */
     public int upperBound(RawComparable key) {
       if (comparator == null) {
         throw new RuntimeException("Cannot search in unsorted TFile");
@@ -2103,13 +2213,26 @@
      */
     public TFileIndex(BytesComparator comparator) {
       index = new ArrayList<TFileIndexEntry>();
+      recordNumIndex = new ArrayList<Long>();
       this.comparator = comparator;
     }
 
     public RawComparable getFirstKey() {
       return firstKey;
     }
+    
+    public Reader.Location getLocationByRecordNum(long recNum) {
+      int idx = Utils.upperBound(recordNumIndex, recNum);
+      long lastRecNum = (idx == 0)? 0: recordNumIndex.get(idx-1);
+      return new Reader.Location(idx, recNum-lastRecNum);
+    }
 
+    public long getRecordNumByLocation(Reader.Location location) {
+      int blkIndex = location.getBlockIndex();
+      long lastRecNum = (blkIndex == 0) ? 0: recordNumIndex.get(blkIndex-1);
+      return lastRecNum + location.getRecordIndex();
+    }
+    
     public void setFirstKey(byte[] key, int offset, int length) {
       firstKey = new ByteArray(new byte[length]);
       System.arraycopy(key, offset, firstKey.buffer(), 0, length);
@@ -2124,6 +2247,8 @@
 
     public void addEntry(TFileIndexEntry keyEntry) {
       index.add(keyEntry);
+      sum += keyEntry.entries();
+      recordNumIndex.add(sum);
     }
 
     public TFileIndexEntry getEntry(int bid) {

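[editor's note] A minimal sketch (not part of this commit) of reading a slice of a TFile by record number with the APIs added above. The path and record range are placeholders, and it assumes the existing TFile.Reader(FSDataInputStream, long, Configuration) constructor:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.file.tfile.TFile;

public class TFileRecordNumSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path path = new Path("/tmp/example.tfile");           // hypothetical input
    FileSystem fs = path.getFileSystem(conf);
    long length = fs.getFileStatus(path).getLen();
    FSDataInputStream in = fs.open(path);

    TFile.Reader reader = new TFile.Reader(in, length, conf);
    try {
      // Records [1000, 2000): resolved to block locations through the
      // new per-block cumulative record-count index (recordNumIndex).
      TFile.Reader.Scanner scanner = reader.createScannerByRecordNum(1000, 2000);
      try {
        while (!scanner.atEnd()) {
          // Absolute record number of the cursor, via getRecordNumByLocation().
          System.out.println("at record " + scanner.getRecordNum());
          scanner.advance();
        }
      } finally {
        scanner.close();
      }
    } finally {
      reader.close();
      in.close();
    }
  }
}
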
Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/Deserializer.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/Deserializer.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/Deserializer.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/Deserializer.java Sat Nov 28 19:53:33 2009
@@ -34,6 +34,7 @@
  * </p>
  * @param <T>
  */
+@Deprecated
 public interface Deserializer<T> {
   /**
    * <p>Prepare the deserializer for reading.</p>

Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/DeserializerComparator.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/DeserializerComparator.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/DeserializerComparator.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/DeserializerComparator.java Sat Nov 28 19:53:33 2009
@@ -52,6 +52,13 @@
     this.deserializer.open(buffer);
   }
 
+  protected DeserializerComparator(DeserializerBase<T> deserializer)
+    throws IOException {
+    
+    this.deserializer = deserializer;
+    this.deserializer.open(buffer);
+  }
+
   public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
     try {
       

Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/Serialization.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/Serialization.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/Serialization.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/Serialization.java Sat Nov 28 19:53:33 2009
@@ -24,6 +24,7 @@
  * </p>
  * @param <T>
  */
+@Deprecated
 public interface Serialization<T> {
   
   /**

Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/SerializationFactory.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/SerializationFactory.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/SerializationFactory.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/SerializationFactory.java Sat Nov 28 19:53:33 2009
@@ -20,11 +20,13 @@
 
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Map;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.io.serializer.avro.AvroGenericSerialization;
 import org.apache.hadoop.io.serializer.avro.AvroReflectSerialization;
 import org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization;
 import org.apache.hadoop.util.ReflectionUtils;
@@ -32,7 +34,7 @@
 
 /**
  * <p>
- * A factory for {@link Serialization}s.
+ * A factory for {@link SerializationBase}s.
  * </p>
  */
 public class SerializationFactory extends Configured {
@@ -40,7 +42,10 @@
   private static final Log LOG =
     LogFactory.getLog(SerializationFactory.class.getName());
 
-  private List<Serialization<?>> serializations = new ArrayList<Serialization<?>>();
+  private List<SerializationBase<?>> serializations =
+    new ArrayList<SerializationBase<?>>();
+  private List<SerializationBase<?>> legacySerializations =
+    new ArrayList<SerializationBase<?>>();
   
   /**
    * <p>
@@ -54,7 +59,8 @@
     for (String serializerName : conf.getStrings("io.serializations", 
       new String[]{WritableSerialization.class.getName(), 
         AvroSpecificSerialization.class.getName(), 
-        AvroReflectSerialization.class.getName()})) {
+        AvroReflectSerialization.class.getName(),
+        AvroGenericSerialization.class.getName()})) {
       add(conf, serializerName);
     }
   }
@@ -62,30 +68,62 @@
   @SuppressWarnings("unchecked")
   private void add(Configuration conf, String serializationName) {
     try {
-      
-      Class<? extends Serialization> serializionClass =
-        (Class<? extends Serialization>) conf.getClassByName(serializationName);
-      serializations.add((Serialization)
-          ReflectionUtils.newInstance(serializionClass, getConf()));
+      Class<?> serializationClass = conf.getClassByName(serializationName);
+      if (SerializationBase.class.isAssignableFrom(serializationClass)) {
+        serializations.add((SerializationBase)
+            ReflectionUtils.newInstance(serializationClass, getConf()));
+      } else if (Serialization.class.isAssignableFrom(serializationClass)) {
+        Serialization serialization = (Serialization)
+            ReflectionUtils.newInstance(serializationClass, getConf());
+        legacySerializations.add(new LegacySerialization(serialization,
+            getConf()));
+      } else {
+        LOG.warn("Serialization class " + serializationName + " is not an " +
+            "instance of Serialization or SerializationBase.");
+      }
     } catch (ClassNotFoundException e) {
-      LOG.warn("Serilization class not found: " +
+      LOG.warn("Serialization class not found: " +
           StringUtils.stringifyException(e));
     }
   }
 
+  @Deprecated
   public <T> Serializer<T> getSerializer(Class<T> c) {
     return getSerialization(c).getSerializer(c);
   }
 
+  @Deprecated
   public <T> Deserializer<T> getDeserializer(Class<T> c) {
     return getSerialization(c).getDeserializer(c);
   }
 
-  @SuppressWarnings("unchecked")
+  @Deprecated
   public <T> Serialization<T> getSerialization(Class<T> c) {
-    for (Serialization serialization : serializations) {
-      if (serialization.accept(c)) {
-        return (Serialization<T>) serialization;
+    return getSerialization(SerializationBase.getMetadataFromClass(c));
+  }
+  
+  public <T> SerializerBase<T> getSerializer(Map<String, String> metadata) {
+    SerializationBase<T> serialization = getSerialization(metadata);
+    return serialization.getSerializer(metadata);
+  }
+    
+  public <T> DeserializerBase<T> getDeserializer(Map<String, String> metadata) {
+    SerializationBase<T> serialization = getSerialization(metadata);
+    return serialization.getDeserializer(metadata);
+  }
+    
+  @SuppressWarnings("unchecked")
+  public <T> SerializationBase<T> getSerialization(Map<String, String> metadata) {
+    for (SerializationBase serialization : serializations) {
+      if (serialization.accept(metadata)) {
+        return (SerializationBase<T>) serialization;
+      }
+    }
+    // Look in the legacy serializations last, since they ignore
+    // non-class metadata
+    for (SerializationBase serialization : legacySerializations) {
+      if (serialization.accept(metadata)) {
+        return (SerializationBase<T>) serialization;
       }
     }
     return null;

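[editor's note] A hedged sketch (not part of this commit) of the metadata-driven lookup the factory now performs. It assumes SerializationBase.getMetadataFromClass() returns a map carrying the class name, as used by the deprecated getSerialization(Class) overload above:

import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.serializer.DeserializerBase;
import org.apache.hadoop.io.serializer.SerializationBase;
import org.apache.hadoop.io.serializer.SerializationFactory;
import org.apache.hadoop.io.serializer.SerializerBase;

public class MetadataLookupSketch {
  public static void main(String[] args) {
    SerializationFactory factory = new SerializationFactory(new Configuration());

    // Class-based callers are funnelled through a metadata map.
    Map<String, String> metadata =
        SerializationBase.getMetadataFromClass(LongWritable.class);

    // SerializationBase implementations are consulted first; wrapped legacy
    // Serializations (which only understand the class entry) come last.
    SerializerBase<LongWritable> serializer = factory.getSerializer(metadata);
    DeserializerBase<LongWritable> deserializer = factory.getDeserializer(metadata);
  }
}
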
Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/Serializer.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/Serializer.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/Serializer.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/Serializer.java Sat Nov 28 19:53:33 2009
@@ -34,6 +34,7 @@
  * </p>
  * @param <T>
  */
+@Deprecated
 public interface Serializer<T> {
   /**
    * <p>Prepare the serializer for writing.</p>

Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/WritableSerialization.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/WritableSerialization.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/WritableSerialization.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/WritableSerialization.java Sat Nov 28 19:53:33 2009
@@ -23,22 +23,20 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
+import java.util.Map;
 
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.util.ReflectionUtils;
 
 /**
- * A {@link Serialization} for {@link Writable}s that delegates to
+ * A {@link SerializationBase} for {@link Writable}s that delegates to
  * {@link Writable#write(java.io.DataOutput)} and
  * {@link Writable#readFields(java.io.DataInput)}.
  */
-public class WritableSerialization extends Configured 
-  implements Serialization<Writable> {
+public class WritableSerialization extends SerializationBase<Writable> {
   
-  static class WritableDeserializer extends Configured 
-    implements Deserializer<Writable> {
+  static class WritableDeserializer extends DeserializerBase<Writable> {
 
     private Class<?> writableClass;
     private DataInputStream dataIn;
@@ -48,6 +46,7 @@
       this.writableClass = c;
     }
     
+    @Override
     public void open(InputStream in) {
       if (in instanceof DataInputStream) {
         dataIn = (DataInputStream) in;
@@ -56,6 +55,7 @@
       }
     }
     
+    @Override
     public Writable deserialize(Writable w) throws IOException {
       Writable writable;
       if (w == null) {
@@ -68,16 +68,23 @@
       return writable;
     }
 
+    @Override
     public void close() throws IOException {
       dataIn.close();
     }
     
   }
   
-  static class WritableSerializer implements Serializer<Writable> {
-
+  static class WritableSerializer extends SerializerBase<Writable> {
+    
+    private Map<String, String> metadata;
     private DataOutputStream dataOut;
     
+    public WritableSerializer(Map<String, String> metadata) {
+      this.metadata = metadata;
+    }
+    
+    @Override
     public void open(OutputStream out) {
       if (out instanceof DataOutputStream) {
         dataOut = (DataOutputStream) out;
@@ -86,26 +93,41 @@
       }
     }
 
+    @Override
     public void serialize(Writable w) throws IOException {
       w.write(dataOut);
     }
 
+    @Override
     public void close() throws IOException {
       dataOut.close();
     }
 
-  }
+    @Override
+    public Map<String, String> getMetadata() throws IOException {
+      return metadata;
+    }
 
-  public boolean accept(Class<?> c) {
-    return Writable.class.isAssignableFrom(c);
   }
 
-  public Deserializer<Writable> getDeserializer(Class<Writable> c) {
-    return new WritableDeserializer(getConf(), c);
+  @Override
+  public boolean accept(Map<String, String> metadata) {
+    if (getClass().getName().equals(metadata.get(SERIALIZATION_KEY))) {
+      return true;
+    }
+    Class<?> c = getClassFromMetadata(metadata);
+    return c == null ? false : Writable.class.isAssignableFrom(c);
   }
 
-  public Serializer<Writable> getSerializer(Class<Writable> c) {
-    return new WritableSerializer();
+  @Override
+  public SerializerBase<Writable> getSerializer(Map<String, String> metadata) {
+    return new WritableSerializer(metadata);
+  }
+  
+  @Override
+  public DeserializerBase<Writable> getDeserializer(Map<String, String> metadata) {
+    Class<?> c = getClassFromMetadata(metadata);
+    return new WritableDeserializer(getConf(), c);
   }
 
 }

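[editor's note] A minimal round-trip sketch (not part of this commit) showing the reworked WritableSerialization through the metadata-based SerializerBase/DeserializerBase API; Text and the in-memory streams are placeholders:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.serializer.DeserializerBase;
import org.apache.hadoop.io.serializer.SerializationBase;
import org.apache.hadoop.io.serializer.SerializationFactory;
import org.apache.hadoop.io.serializer.SerializerBase;

public class WritableRoundTripSketch {
  public static void main(String[] args) throws Exception {
    SerializationFactory factory = new SerializationFactory(new Configuration());
    Map<String, String> metadata = SerializationBase.getMetadataFromClass(Text.class);

    // Serialize with the metadata-aware SerializerBase.
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    SerializerBase<Text> serializer = factory.getSerializer(metadata);
    serializer.open(bytes);
    serializer.serialize(new Text("hello"));
    serializer.close();

    // Deserialize the same bytes back into a Writable; passing null lets
    // WritableDeserializer instantiate the class named in the metadata.
    DeserializerBase<Text> deserializer = factory.getDeserializer(metadata);
    deserializer.open(new ByteArrayInputStream(bytes.toByteArray()));
    Text copy = deserializer.deserialize(null);
    deserializer.close();
    System.out.println(copy);  // prints "hello"
  }
}
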
Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/avro/AvroReflectSerialization.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/avro/AvroReflectSerialization.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/avro/AvroReflectSerialization.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/avro/AvroReflectSerialization.java Sat Nov 28 19:53:33 2009
@@ -19,6 +19,7 @@
 package org.apache.hadoop.io.serializer.avro;
 
 import java.util.HashSet;
+import java.util.Map;
 import java.util.Set;
 
 import org.apache.avro.Schema;
@@ -27,6 +28,7 @@
 import org.apache.avro.reflect.ReflectData;
 import org.apache.avro.reflect.ReflectDatumReader;
 import org.apache.avro.reflect.ReflectDatumWriter;
+import org.apache.avro.specific.SpecificRecord;
 
 /**
  * Serialization for Avro Reflect classes. For a class to be accepted by this 
@@ -47,10 +49,18 @@
 
   private Set<String> packages; 
 
-  public synchronized boolean accept(Class<?> c) {
+  @Override
+  public synchronized boolean accept(Map<String, String> metadata) {
     if (packages == null) {
       getPackages();
     }
+    if (getClass().getName().equals(metadata.get(SERIALIZATION_KEY))) {
+      return true;
+    }
+    Class<?> c = getClassFromMetadata(metadata);
+    if (c == null) {
+      return false;
+    }
     return AvroReflectSerializable.class.isAssignableFrom(c) || 
       packages.contains(c.getPackage().getName());
   }
@@ -65,24 +75,22 @@
     }
   }
 
-  protected DatumReader getReader(Class<Object> clazz) {
+  @Override
+  protected DatumReader getReader(Map<String, String> metadata) {
     try {
-      String prefix =  
-        ((clazz.getEnclosingClass() == null 
-            || "null".equals(clazz.getEnclosingClass().getName())) ? 
-              clazz.getPackage().getName() + "." 
-              : (clazz.getEnclosingClass().getName() + "$"));
-      return new ReflectDatumReader(ReflectData.getSchema(clazz), prefix);
+      return new ReflectDatumReader(getClassFromMetadata(metadata));
     } catch (Exception e) {
       throw new RuntimeException(e);
     }
   }
 
-  protected Schema getSchema(Object t) {
-    return ReflectData.getSchema(t.getClass());
+  @Override
+  protected Schema getSchema(Object t, Map<String, String> metadata) {
+    return ReflectData.get().getSchema(t.getClass());
   }
 
-  protected DatumWriter getWriter(Class<Object> clazz) {
+  @Override
+  protected DatumWriter getWriter(Map<String, String> metadata) {
     return new ReflectDatumWriter();
   }
 

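[editor's note] A hedged sketch (not part of this commit) of how a reflect-serializable type now reaches AvroReflectSerialization through metadata. It assumes AvroReflectSerializable remains a marker interface and that Temperature is a hypothetical user class; the package-based registration path is not shown:

import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.serializer.SerializationBase;
import org.apache.hadoop.io.serializer.SerializationFactory;
import org.apache.hadoop.io.serializer.SerializerBase;
import org.apache.hadoop.io.serializer.avro.AvroReflectSerializable;

public class AvroReflectSketch {
  // Implementing the marker interface is enough for accept() to match.
  public static class Temperature implements AvroReflectSerializable {
    public String station;
    public int reading;
  }

  public static void main(String[] args) {
    SerializationFactory factory = new SerializationFactory(new Configuration());
    Map<String, String> metadata =
        SerializationBase.getMetadataFromClass(Temperature.class);
    // Resolves to AvroReflectSerialization; the DatumReader/DatumWriter are
    // now built from the class named in the metadata rather than from a
    // Class argument.
    SerializerBase<Temperature> serializer = factory.getSerializer(metadata);
  }
}
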
Modified: hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/avro/AvroSerialization.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/avro/AvroSerialization.java?rev=885142&r1=885141&r2=885142&view=diff
==============================================================================
--- hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/avro/AvroSerialization.java (original)
+++ hadoop/common/branches/HADOOP-6194/src/java/org/apache/hadoop/io/serializer/avro/AvroSerialization.java Sat Nov 28 19:53:33 2009
@@ -21,92 +21,105 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
+import java.util.Map;
 
 import org.apache.avro.Schema;
 import org.apache.avro.io.BinaryDecoder;
 import org.apache.avro.io.BinaryEncoder;
 import org.apache.avro.io.DatumReader;
 import org.apache.avro.io.DatumWriter;
-import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.io.serializer.Deserializer;
-import org.apache.hadoop.io.serializer.Serialization;
-import org.apache.hadoop.io.serializer.Serializer;
+import org.apache.hadoop.io.serializer.DeserializerBase;
+import org.apache.hadoop.io.serializer.SerializationBase;
+import org.apache.hadoop.io.serializer.SerializerBase;
 
 /**
  * Base class for providing serialization to Avro types.
  */
-public abstract class AvroSerialization<T> extends Configured 
-                                        implements Serialization<T>{
+public abstract class AvroSerialization<T> extends SerializationBase<T> {
+  
+  public static final String AVRO_SCHEMA_KEY = "Avro-Schema";
 
-  public Deserializer<T> getDeserializer(Class<T> c) {
-    return new AvroDeserializer(c);
+  public DeserializerBase<T> getDeserializer(Map<String, String> metadata) {
+    return new AvroDeserializer(metadata);
   }
 
-  public Serializer<T> getSerializer(Class<T> c) {
-    return new AvroSerializer(c);
+  public SerializerBase<T> getSerializer(Map<String, String> metadata) {
+    return new AvroSerializer(metadata);
   }
 
   /**
-   * Return an Avro Schema instance for the given class.
+   * Return an Avro Schema instance for the given class and metadata.
    */
-  protected abstract Schema getSchema(T t);
+  protected abstract Schema getSchema(T t, Map<String, String> metadata);
 
   /**
-   * Create and return Avro DatumWriter for the given class.
+   * Create and return Avro DatumWriter for the given metadata.
    */
-  protected abstract DatumWriter<T> getWriter(Class<T> clazz);
+  protected abstract DatumWriter<T> getWriter(Map<String, String> metadata);
 
   /**
-   * Create and return Avro DatumReader for the given class.
+   * Create and return Avro DatumReader for the given metadata.
    */
-  protected abstract DatumReader<T> getReader(Class<T> clazz);
+  protected abstract DatumReader<T> getReader(Map<String, String> metadata);
 
-  class AvroSerializer implements Serializer<T> {
+  class AvroSerializer extends SerializerBase<T> {
 
+    private Map<String, String> metadata;
     private DatumWriter<T> writer;
     private BinaryEncoder encoder;
     private OutputStream outStream;
-    protected Class<T> clazz;
 
-    AvroSerializer(Class<T> clazz) {
-      writer = getWriter(clazz);
+    AvroSerializer(Map<String, String> metadata) {
+      this.metadata = metadata;
+      writer = getWriter(metadata);
     }
 
+    @Override
     public void close() throws IOException {
       encoder.flush();
       outStream.close();
     }
 
+    @Override
     public void open(OutputStream out) throws IOException {
       outStream = out;
       encoder = new BinaryEncoder(out);
     }
 
+    @Override
     public void serialize(T t) throws IOException {
-      writer.setSchema(getSchema(t));
+      writer.setSchema(getSchema(t, metadata));
       writer.write(t, encoder);
     }
 
+    @Override
+    public Map<String, String> getMetadata() throws IOException {
+      return metadata;
+    }
+
   }
 
-  class AvroDeserializer implements Deserializer<T> {
+  class AvroDeserializer extends DeserializerBase<T> {
 
     private DatumReader<T> reader;
     private BinaryDecoder decoder;
     private InputStream inStream;
 
-    AvroDeserializer(Class<T> clazz) {
-      this.reader = getReader(clazz);
+    AvroDeserializer(Map<String, String> metadata) {
+      this.reader = getReader(metadata);
     }
 
+    @Override
     public void close() throws IOException {
       inStream.close();
     }
 
+    @Override
     public T deserialize(T t) throws IOException {
       return reader.read(t, decoder);
     }
 
+    @Override
     public void open(InputStream in) throws IOException {
       inStream = in;
       decoder = new BinaryDecoder(in);