You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by ga...@apache.org on 2009/11/21 16:36:14 UTC

svn commit: r882929 [4/7] - in /hadoop/pig/branches/branch-0.6/contrib/zebra: ./ src/java/org/apache/hadoop/zebra/io/ src/java/org/apache/hadoop/zebra/mapred/ src/java/org/apache/hadoop/zebra/schema/ src/java/org/apache/hadoop/zebra/tfile/ src/java/org...

Added: hadoop/pig/branches/branch-0.6/contrib/zebra/src/java/org/apache/hadoop/zebra/tfile/TFileDumper.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/contrib/zebra/src/java/org/apache/hadoop/zebra/tfile/TFileDumper.java?rev=882929&view=auto
==============================================================================
--- hadoop/pig/branches/branch-0.6/contrib/zebra/src/java/org/apache/hadoop/zebra/tfile/TFileDumper.java (added)
+++ hadoop/pig/branches/branch-0.6/contrib/zebra/src/java/org/apache/hadoop/zebra/tfile/TFileDumper.java Sat Nov 21 15:36:12 2009
@@ -0,0 +1,295 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package org.apache.hadoop.zebra.tfile;
+
+import java.io.IOException;
+import java.io.PrintStream;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.zebra.tfile.BCFile.BlockRegion;
+import org.apache.hadoop.zebra.tfile.BCFile.MetaIndexEntry;
+import org.apache.hadoop.zebra.tfile.TFile.TFileIndexEntry;
+import org.apache.hadoop.zebra.tfile.Utils.Version;
+
+/**
+ * Dumps a human-readable summary of a TFile to a PrintStream.
+ */
+class TFileDumper {
+  static final Log LOG = LogFactory.getLog(TFileDumper.class);
+
+  private TFileDumper() {
+    // namespace object not constructable.
+  }
+
+  /** Padding helpers used to lay out the fixed-width columns of the dump. */
+  private enum Align {
+    LEFT, CENTER, RIGHT, ZERO_PADDED;
+    /** Pads s with spaces up to width; s is returned as is if already that wide. */
+    static String format(String s, int width, Align align) {
+      if (s.length() >= width) return s;
+      int room = width - s.length();
+      Align alignAdjusted = align;
+      // One spare column always pads right: CENTER would build an invalid "%0s".
+      if (room == 1) {
+        alignAdjusted = LEFT;
+      }
+      if (alignAdjusted == LEFT) {
+        return s + String.format("%" + room + "s", "");
+      }
+      if (alignAdjusted == RIGHT) {
+        return String.format("%" + room + "s", "") + s;
+      }
+      if (alignAdjusted == CENTER) {
+        int half = room / 2;
+        return String.format("%" + half + "s", "") + s
+            + String.format("%" + (room - half) + "s", "");
+      }
+      throw new IllegalArgumentException("Unsupported alignment");
+    }
+
+    /** Formats l in decimal: zero-filled for ZERO_PADDED, else space-padded. */
+    static String format(long l, int width, Align align) {
+      if (align == ZERO_PADDED) {
+        return String.format("%0" + width + "d", l);
+      }
+      return format(Long.toString(l), width, align);
+    }
+
+    /** Returns the width that fits both the caption and the decimal form of max. */
+    static int calculateWidth(String caption, long max) {
+      return Math.max(caption.length(), Long.toString(max).length());
+    }
+  }
+
+  /**
+   * Dump the properties, the data-block table and the meta-block table of a TFile.
+   * 
+   * @param file
+   *          Path string of the TFile
+   * @param out
+   *          PrintStream to output the information.
+   * @param conf
+   *          The configuration object.
+   * @throws IOException on failure to open or read the file.
+   */
+  static public void dumpInfo(String file, PrintStream out, Configuration conf)
+      throws IOException {
+    final int maxKeySampleLen = 16;
+    Path path = new Path(file);
+    FileSystem fs = path.getFileSystem(conf);
+    long length = fs.getFileStatus(path).getLen();
+    FSDataInputStream fsdis = fs.open(path);
+    // Created inside the try so fsdis is still closed if the constructor throws.
+    TFile.Reader reader = null;
+    try {
+      reader = new TFile.Reader(fsdis, length, conf);
+      LinkedHashMap<String, String> properties =
+          new LinkedHashMap<String, String>();
+      int blockCnt = reader.readerBCF.getBlockCount();
+      int metaBlkCnt = reader.readerBCF.metaIndex.index.size();
+      properties.put("BCFile Version", reader.readerBCF.version.toString());
+      properties.put("TFile Version", reader.tfileMeta.version.toString());
+      properties.put("File Length", Long.toString(length));
+      properties.put("Data Compression", reader.readerBCF
+          .getDefaultCompressionName());
+      properties.put("Record Count", Long.toString(reader.getEntryCount()));
+      properties.put("Sorted", Boolean.toString(reader.isSorted()));
+      if (reader.isSorted()) {
+        properties.put("Comparator", reader.getComparatorName());
+      }
+      properties.put("Data Block Count", Integer.toString(blockCnt));
+      long dataSize = 0, dataSizeUncompressed = 0;
+      if (blockCnt > 0) {
+        for (int i = 0; i < blockCnt; ++i) {
+          BlockRegion region =
+              reader.readerBCF.dataIndex.getBlockRegionList().get(i);
+          dataSize += region.getCompressedSize();
+          dataSizeUncompressed += region.getRawSize();
+        }
+        properties.put("Data Block Bytes", Long.toString(dataSize));
+        // Compare by value; != on Strings only tests reference identity.
+        if (!"none".equals(reader.readerBCF.getDefaultCompressionName())) {
+          properties.put("Data Block Uncompressed Bytes", Long
+              .toString(dataSizeUncompressed));
+          properties.put("Data Block Compression Ratio", String.format(
+              "1:%.1f", (double) dataSizeUncompressed / dataSize));
+        }
+      }
+
+      properties.put("Meta Block Count", Integer.toString(metaBlkCnt));
+      long metaSize = 0, metaSizeUncompressed = 0;
+      if (metaBlkCnt > 0) {
+        Collection<MetaIndexEntry> metaBlks =
+            reader.readerBCF.metaIndex.index.values();
+        boolean calculateCompression = false;
+        for (MetaIndexEntry e : metaBlks) {
+          metaSize += e.getRegion().getCompressedSize();
+          metaSizeUncompressed += e.getRegion().getRawSize();
+          if (e.getCompressionAlgorithm() != Compression.Algorithm.NONE) {
+            calculateCompression = true;
+          }
+        }
+        properties.put("Meta Block Bytes", Long.toString(metaSize));
+        if (calculateCompression) {
+          properties.put("Meta Block Uncompressed Bytes", Long
+              .toString(metaSizeUncompressed));
+          properties.put("Meta Block Compression Ratio", String.format(
+              "1:%.1f", (double) metaSizeUncompressed / metaSize));
+        }
+      }
+      properties.put("Meta-Data Size Ratio", String.format("1:%.1f",
+          (double) dataSize / metaSize));
+      long leftOverBytes = length - dataSize - metaSize;
+      long miscSize =
+          BCFile.Magic.size() * 2 + Long.SIZE / Byte.SIZE + Version.size();
+      long metaIndexSize = leftOverBytes - miscSize;
+      properties.put("Meta Block Index Bytes", Long.toString(metaIndexSize));
+      properties.put("Headers Etc Bytes", Long.toString(miscSize));
+      // Now output the properties table.
+      int maxKeyLength = 0;
+      Set<Map.Entry<String, String>> entrySet = properties.entrySet();
+      for (Map.Entry<String, String> e : entrySet) {
+        if (e.getKey().length() > maxKeyLength) {
+          maxKeyLength = e.getKey().length();
+        }
+      }
+      for (Map.Entry<String, String> e : entrySet) {
+        out.printf("%s : %s\n", Align.format(e.getKey(), maxKeyLength,
+            Align.LEFT), e.getValue());
+      }
+      out.println();
+      reader.checkTFileDataIndex();
+      if (blockCnt > 0) {
+        String blkID = "Data-Block";
+        int blkIDWidth = Align.calculateWidth(blkID, blockCnt);
+        int blkIDWidth2 = Align.calculateWidth("", blockCnt);
+        String offset = "Offset";
+        int offsetWidth = Align.calculateWidth(offset, length);
+        String blkLen = "Length";
+        int blkLenWidth =
+            Align.calculateWidth(blkLen, dataSize / blockCnt * 10);
+        String rawSize = "Raw-Size";
+        int rawSizeWidth =
+            Align.calculateWidth(rawSize, dataSizeUncompressed / blockCnt * 10);
+        String records = "Records";
+        int recordsWidth =
+            Align.calculateWidth(records, reader.getEntryCount() / blockCnt
+                * 10);
+        String endKey = "End-Key";
+        int endKeyWidth = Math.max(endKey.length(), maxKeySampleLen * 2 + 5);
+
+        out.printf("%s %s %s %s %s %s\n", Align.format(blkID, blkIDWidth,
+            Align.CENTER), Align.format(offset, offsetWidth, Align.CENTER),
+            Align.format(blkLen, blkLenWidth, Align.CENTER), Align.format(
+                rawSize, rawSizeWidth, Align.CENTER), Align.format(records,
+                recordsWidth, Align.CENTER), Align.format(endKey, endKeyWidth,
+                Align.LEFT));
+
+        for (int i = 0; i < blockCnt; ++i) {
+          BlockRegion region =
+              reader.readerBCF.dataIndex.getBlockRegionList().get(i);
+          TFileIndexEntry indexEntry = reader.tfileIndex.getEntry(i);
+          out.printf("%s %s %s %s %s ", Align.format(Align.format(i,
+              blkIDWidth2, Align.ZERO_PADDED), blkIDWidth, Align.LEFT), Align
+              .format(region.getOffset(), offsetWidth, Align.LEFT), Align
+              .format(region.getCompressedSize(), blkLenWidth, Align.LEFT),
+              Align.format(region.getRawSize(), rawSizeWidth, Align.LEFT),
+              Align.format(indexEntry.kvEntries, recordsWidth, Align.LEFT));
+          byte[] key = indexEntry.key;
+          boolean asAscii = true;
+          int sampleLen = Math.min(maxKeySampleLen, key.length);
+          for (int j = 0; j < sampleLen; ++j) {
+            byte b = key[j];
+            if ((b < 32 && b != 9) || (b == 127)) {
+              asAscii = false;
+            }
+          }
+          if (!asAscii) {
+            // Two hex digits per byte, matching the maxKeySampleLen * 2 column width.
+            out.print("0X");
+            for (int j = 0; j < sampleLen; ++j) {
+              byte b = key[j];
+              out.printf("%02X", b);
+            }
+          } else {
+            out.print(new String(key, 0, sampleLen));
+          }
+          if (sampleLen < key.length) {
+            out.print("...");
+          }
+          out.println();
+        }
+      }
+
+      out.println();
+      if (metaBlkCnt > 0) {
+        String name = "Meta-Block";
+        int maxNameLen = 0;
+        Set<Map.Entry<String, MetaIndexEntry>> metaBlkEntrySet =
+            reader.readerBCF.metaIndex.index.entrySet();
+        for (Map.Entry<String, MetaIndexEntry> e : metaBlkEntrySet) {
+          if (e.getKey().length() > maxNameLen) {
+            maxNameLen = e.getKey().length();
+          }
+        }
+        int nameWidth = Math.max(name.length(), maxNameLen);
+        String offset = "Offset";
+        int offsetWidth = Align.calculateWidth(offset, length);
+        String blkLen = "Length";
+        int blkLenWidth =
+            Align.calculateWidth(blkLen, metaSize / metaBlkCnt * 10);
+        String rawSize = "Raw-Size";
+        int rawSizeWidth =
+            Align.calculateWidth(rawSize, metaSizeUncompressed / metaBlkCnt
+                * 10);
+        String compression = "Compression";
+        int compressionWidth = compression.length();
+        out.printf("%s %s %s %s %s\n", Align.format(name, nameWidth,
+            Align.CENTER), Align.format(offset, offsetWidth, Align.CENTER),
+            Align.format(blkLen, blkLenWidth, Align.CENTER), Align.format(
+                rawSize, rawSizeWidth, Align.CENTER), Align.format(compression,
+                compressionWidth, Align.LEFT));
+
+        for (Map.Entry<String, MetaIndexEntry> e : metaBlkEntrySet) {
+          String blkName = e.getValue().getMetaName();
+          BlockRegion region = e.getValue().getRegion();
+          String blkCompression =
+              e.getValue().getCompressionAlgorithm().getName();
+          out.printf("%s %s %s %s %s\n", Align.format(blkName, nameWidth,
+              Align.LEFT), Align.format(region.getOffset(), offsetWidth,
+              Align.LEFT), Align.format(region.getCompressedSize(),
+              blkLenWidth, Align.LEFT), Align.format(region.getRawSize(),
+              rawSizeWidth, Align.LEFT), Align.format(blkCompression,
+              compressionWidth, Align.LEFT));
+        }
+      }
+    } finally {
+      IOUtils.cleanup(LOG, reader, fsdis);
+    }
+  }
+}

Added: hadoop/pig/branches/branch-0.6/contrib/zebra/src/java/org/apache/hadoop/zebra/tfile/Utils.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/contrib/zebra/src/java/org/apache/hadoop/zebra/tfile/Utils.java?rev=882929&view=auto
==============================================================================
--- hadoop/pig/branches/branch-0.6/contrib/zebra/src/java/org/apache/hadoop/zebra/tfile/Utils.java (added)
+++ hadoop/pig/branches/branch-0.6/contrib/zebra/src/java/org/apache/hadoop/zebra/tfile/Utils.java Sat Nov 21 15:36:12 2009
@@ -0,0 +1,517 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package org.apache.hadoop.zebra.tfile;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Comparator;
+import java.util.List;
+
+import org.apache.hadoop.io.Text;
+
+/**
+ * Supporting Utility classes used by TFile, and shared by users of TFile.
+ */
+public final class Utils {
+
+  /**
+   * Prevent the instantiation of Utils.
+   */
+  private Utils() {
+    // nothing
+  }
+
+  /**
+   * Encoding an integer into a variable-length encoding format. Synonymous to
+   * <code>Utils#writeVLong(out, n)</code>.
+   * 
+   * @param out
+   *          output stream
+   * @param n
+   *          The integer to be encoded
+   * @throws IOException
+   * @see Utils#writeVLong(DataOutput, long)
+   */
+  public static void writeVInt(DataOutput out, int n) throws IOException {
+    writeVLong(out, n);
+  }
+
+  /**
+   * Encoding a Long integer into a variable-length encoding format.
+   * <ul>
+   * <li>if n in [-32, 128): encode in one byte with the actual value.
+   * Otherwise,
+   * <li>if n in [-20*2^8, 20*2^8): encode in two bytes: byte[0] = n/256 - 52;
+   * byte[1]=n&0xff. Otherwise,
+   * <li>if n in [-16*2^16, 16*2^16): encode in three bytes: byte[0]=n/2^16 -
+   * 88; byte[1]=(n>>8)&0xff; byte[2]=n&0xff. Otherwise,
+   * <li>if n in [-8*2^24, 8*2^24): encode in four bytes: byte[0]=n/2^24 - 112;
+   * byte[1] = (n>>16)&0xff; byte[2] = (n>>8)&0xff; byte[3]=n&0xff. Otherwise:
+   * <li>if n in [-2^31, 2^31): encode in five bytes: byte[0]=-125; byte[1] =
+   * (n>>24)&0xff; byte[2]=(n>>16)&0xff; byte[3]=(n>>8)&0xff; byte[4]=n&0xff;
+   * <li>if n in [-2^39, 2^39): encode in six bytes: byte[0]=-124; byte[1] =
+   * (n>>32)&0xff; byte[2]=(n>>24)&0xff; byte[3]=(n>>16)&0xff;
+   * byte[4]=(n>>8)&0xff; byte[5]=n&0xff
+   * <li>if n in [-2^47, 2^47): encode in seven bytes: byte[0]=-123; byte[1] =
+   * (n>>40)&0xff; byte[2]=(n>>32)&0xff; byte[3]=(n>>24)&0xff;
+   * byte[4]=(n>>16)&0xff; byte[5]=(n>>8)&0xff; byte[6]=n&0xff;
+   * <li>if n in [-2^55, 2^55): encode in eight bytes: byte[0]=-122; byte[1] =
+   * (n>>48)&0xff; byte[2] = (n>>40)&0xff; byte[3]=(n>>32)&0xff;
+   * byte[4]=(n>>24)&0xff; byte[5]=(n>>16)&0xff; byte[6]=(n>>8)&0xff;
+   * byte[7]=n&0xff;
+   * <li>if n in [-2^63, 2^63): encode in nine bytes: byte[0]=-121; byte[1] =
+   * (n>>56)&0xff; byte[2] = (n>>48)&0xff; byte[3] = (n>>40)&0xff;
+   * byte[4]=(n>>32)&0xff; byte[5]=(n>>24)&0xff; byte[6]=(n>>16)&0xff;
+   * byte[7]=(n>>8)&0xff; byte[8]=n&0xff;
+   * </ul>
+   * 
+   * @param out
+   *          output stream
+   * @param n
+   *          the integer number
+   * @throws IOException
+   */
+  @SuppressWarnings("fallthrough")
+  public static void writeVLong(DataOutput out, long n) throws IOException {
+    if ((n < 128) && (n >= -32)) {
+      out.writeByte((int) n);
+      return;
+    }
+
+    long un = (n < 0) ? ~n : n;
+    // how many bytes do we need to represent the number with sign bit?
+    int len = (Long.SIZE - Long.numberOfLeadingZeros(un)) / 8 + 1;
+    int firstByte = (int) (n >> ((len - 1) * 8));
+    switch (len) {
+      case 1:
+        // fall it through to firstByte==-1, len=2.
+        firstByte >>= 8;
+      case 2:
+        if ((firstByte < 20) && (firstByte >= -20)) {
+          out.writeByte(firstByte - 52);
+          out.writeByte((int) n);
+          return;
+        }
+        // fall it through to firstByte==0/-1, len=3.
+        firstByte >>= 8;
+      case 3:
+        if ((firstByte < 16) && (firstByte >= -16)) {
+          out.writeByte(firstByte - 88);
+          out.writeShort((int) n);
+          return;
+        }
+        // fall it through to firstByte==0/-1, len=4.
+        firstByte >>= 8;
+      case 4:
+        if ((firstByte < 8) && (firstByte >= -8)) {
+          out.writeByte(firstByte - 112);
+          out.writeShort(((int) n) >>> 8);
+          out.writeByte((int) n);
+          return;
+        }
+        out.writeByte(len - 129);
+        out.writeInt((int) n);
+        return;
+      case 5:
+        out.writeByte(len - 129);
+        out.writeInt((int) (n >>> 8));
+        out.writeByte((int) n);
+        return;
+      case 6:
+        out.writeByte(len - 129);
+        out.writeInt((int) (n >>> 16));
+        out.writeShort((int) n);
+        return;
+      case 7:
+        out.writeByte(len - 129);
+        out.writeInt((int) (n >>> 24));
+        out.writeShort((int) (n >>> 8));
+        out.writeByte((int) n);
+        return;
+      case 8:
+        out.writeByte(len - 129);
+        out.writeLong(n);
+        return;
+      default:
+        throw new RuntimeException("Internal error");
+    }
+  }
+
+  /**
+   * Decoding the variable-length integer. Synonymous to
+   * <code>(int)Utils#readVLong(in)</code>.
+   * 
+   * @param in
+   *          input stream
+   * @return the decoded integer
+   * @throws IOException
+   * 
+   * @see Utils#readVLong(DataInput)
+   */
+  public static int readVInt(DataInput in) throws IOException {
+    long ret = readVLong(in);
+    if ((ret > Integer.MAX_VALUE) || (ret < Integer.MIN_VALUE)) {
+      throw new RuntimeException(
+          "Number too large to be represented as Integer");
+    }
+    return (int) ret;
+  }
+
+  /**
+   * Decoding the variable-length integer. Suppose the value of the first byte
+   * is FB, and the following bytes are NB[*].
+   * <ul>
+   * <li>if (FB >= -32), return (long)FB;
+   * <li>if (FB in [-72, -33]), return (FB+52)<<8 + NB[0]&0xff;
+   * <li>if (FB in [-104, -73]), return (FB+88)<<16 + (NB[0]&0xff)<<8 +
+   * NB[1]&0xff;
+   * <li>if (FB in [-120, -105]), return (FB+112)<<24 + (NB[0]&0xff)<<16 +
+   * (NB[1]&0xff)<<8 + NB[2]&0xff;
+   * <li>if (FB in [-128, -121]), interpret the next FB+129 bytes as a signed
+   * big-endian integer and return it.
+   * </ul>
+   * @param in
+   *          input stream
+   * @return the decoded long integer.
+   * @throws IOException
+   */
+  public static long readVLong(DataInput in) throws IOException {
+    int firstByte = in.readByte();
+    if (firstByte >= -32) {
+      return firstByte;
+    }
+
+    switch ((firstByte + 128) / 8) {
+      case 11:
+      case 10:
+      case 9:
+      case 8:
+      case 7:
+        return ((firstByte + 52) << 8) | in.readUnsignedByte();
+      case 6:
+      case 5:
+      case 4:
+      case 3:
+        return ((firstByte + 88) << 16) | in.readUnsignedShort();
+      case 2:
+      case 1:
+        return ((firstByte + 112) << 24) | (in.readUnsignedShort() << 8)
+            | in.readUnsignedByte();
+      case 0:
+        int len = firstByte + 129;
+        switch (len) {
+          case 4:
+            return in.readInt();
+          case 5:
+            return ((long) in.readInt()) << 8 | in.readUnsignedByte();
+          case 6:
+            return ((long) in.readInt()) << 16 | in.readUnsignedShort();
+          case 7:
+            return ((long) in.readInt()) << 24 | (in.readUnsignedShort() << 8)
+                | in.readUnsignedByte();
+          case 8:
+            return in.readLong();
+          default:
+            throw new IOException("Corrupted VLong encoding");
+        }
+      default:
+        throw new RuntimeException("Internal error");
+    }
+  }
+
+  /**
+   * Write a String as a VInt n, followed by n Bytes as in Text format.
+   * 
+   * @param out
+   * @param s
+   * @throws IOException
+   */
+  public static void writeString(DataOutput out, String s) throws IOException {
+    if (s != null) {
+      Text text = new Text(s);
+      byte[] buffer = text.getBytes();
+      int len = text.getLength();
+      writeVInt(out, len);
+      out.write(buffer, 0, len);
+    } else {
+      writeVInt(out, -1);
+    }
+  }
+
+  /**
+   * Read a String as a VInt n, followed by n Bytes in Text format.
+   * 
+   * @param in
+   *          The input stream.
+   * @return The string, or null if the encoded length is -1.
+   * @throws IOException if reading fails or the encoded length is below -1.
+   */
+  public static String readString(DataInput in) throws IOException {
+    int length = readVInt(in);
+    if (length == -1) return null;
+    if (length < 0) throw new IOException("Bad string length: " + length);
+    byte[] buffer = new byte[length];
+    in.readFully(buffer);
+    return Text.decode(buffer);
+  }
+
+  /**
+   * A generic Version class. We suggest applications built on top of TFile use
+   * this class to maintain version information in their meta blocks.
+   * 
+   * A version number consists of a major version and a minor version. The
+   * suggested usage of major and minor version number is to increment major
+   * version number when the new storage format is not backward compatible, and
+   * increment the minor version otherwise.
+   */
+  public static final class Version implements Comparable<Version> {
+    private final short major;
+    private final short minor;
+
+    /**
+     * Construct the Version object by reading from the input stream.
+     * 
+     * @param in
+     *          input stream
+     * @throws IOException
+     */
+    public Version(DataInput in) throws IOException {
+      major = in.readShort();
+      minor = in.readShort();
+    }
+
+    /**
+     * Constructor.
+     * 
+     * @param major
+     *          major version.
+     * @param minor
+     *          minor version.
+     */
+    public Version(short major, short minor) {
+      this.major = major;
+      this.minor = minor;
+    }
+
+    /**
+     * Write the object to a DataOutput. The serialized format of the Version is
+     * major version followed by minor version, both as big-endian short
+     * integers.
+     * 
+     * @param out
+     *          The DataOutput object.
+     * @throws IOException
+     */
+    public void write(DataOutput out) throws IOException {
+      out.writeShort(major);
+      out.writeShort(minor);
+    }
+
+    /**
+     * Get the major version.
+     * 
+     * @return Major version.
+     */
+    public int getMajor() {
+      return major;
+    }
+
+    /**
+     * Get the minor version.
+     * 
+     * @return The minor version.
+     */
+    public int getMinor() {
+      return minor;
+    }
+
+    /**
+     * Get the size of the serialized Version object.
+     * 
+     * @return serialized size of the version object.
+     */
+    public static int size() {
+      return (Short.SIZE + Short.SIZE) / Byte.SIZE;
+    }
+
+    /**
+     * Return a string representation of the version.
+     */
+    @Override
+    public String toString() {
+      return "v" + major + "." + minor;
+    }
+
+    /**
+     * Test compatibility.
+     * 
+     * @param other
+     *          The Version object to test compatibility with.
+     * @return true if both versions have the same major version number; false
+     *         otherwise.
+     */
+    public boolean compatibleWith(Version other) {
+      return major == other.major;
+    }
+
+    /**
+     * Compare this version with another version.
+     */
+    @Override
+    public int compareTo(Version that) {
+      if (major != that.major) {
+        return major - that.major;
+      }
+      return minor - that.minor;
+    }
+
+    @Override
+    public boolean equals(Object other) {
+      if (this == other) return true;
+      if (!(other instanceof Version)) return false;
+      return compareTo((Version) other) == 0;
+    }
+
+    @Override
+    public int hashCode() {
+      // Parenthesized on purpose: '+' binds tighter than '<<' in Java.
+      return (major << 16) + minor;
+    }
+  }
+
+  /**
+   * Lower bound binary search. Find the index to the first element in the list
+   * that compares greater than or equal to key.
+   * 
+   * @param <T>
+   *          Type of the input key.
+   * @param list
+   *          The list
+   * @param key
+   *          The input key.
+   * @param cmp
+   *          Comparator for the key.
+   * @return The index to the desired element if it exists; or list.size()
+   *         otherwise.
+   */
+  public static <T> int lowerBound(List<? extends T> list, T key,
+      Comparator<? super T> cmp) {
+    int low = 0;
+    int high = list.size();
+
+    while (low < high) {
+      int mid = (low + high) >>> 1;
+      T midVal = list.get(mid);
+      int ret = cmp.compare(midVal, key);
+      if (ret < 0)
+        low = mid + 1;
+      else high = mid;
+    }
+    return low;
+  }
+
+  /**
+   * Upper bound binary search. Find the index to the first element in the list
+   * that compares greater than the input key.
+   * 
+   * @param <T>
+   *          Type of the input key.
+   * @param list
+   *          The list
+   * @param key
+   *          The input key.
+   * @param cmp
+   *          Comparator for the key.
+   * @return The index to the desired element if it exists; or list.size()
+   *         otherwise.
+   */
+  public static <T> int upperBound(List<? extends T> list, T key,
+      Comparator<? super T> cmp) {
+    int low = 0;
+    int high = list.size();
+
+    while (low < high) {
+      int mid = (low + high) >>> 1;
+      T midVal = list.get(mid);
+      int ret = cmp.compare(midVal, key);
+      if (ret <= 0)
+        low = mid + 1;
+      else high = mid;
+    }
+    return low;
+  }
+
+  /**
+   * Lower bound binary search. Find the index to the first element in the list
+   * that compares greater than or equal to key.
+   * 
+   * @param <T>
+   *          Type of the input key.
+   * @param list
+   *          The list
+   * @param key
+   *          The input key.
+   * @return The index to the desired element if it exists; or list.size()
+   *         otherwise.
+   */
+  public static <T> int lowerBound(List<? extends Comparable<? super T>> list,
+      T key) {
+    int low = 0;
+    int high = list.size();
+
+    while (low < high) {
+      int mid = (low + high) >>> 1;
+      Comparable<? super T> midVal = list.get(mid);
+      int ret = midVal.compareTo(key);
+      if (ret < 0)
+        low = mid + 1;
+      else high = mid;
+    }
+    return low;
+  }
+
+  /**
+   * Upper bound binary search. Find the index to the first element in the list
+   * that compares greater than the input key.
+   * 
+   * @param <T>
+   *          Type of the input key.
+   * @param list
+   *          The list
+   * @param key
+   *          The input key.
+   * @return The index to the desired element if it exists; or list.size()
+   *         otherwise.
+   */
+  public static <T> int upperBound(List<? extends Comparable<? super T>> list,
+      T key) {
+    int low = 0;
+    int high = list.size();
+
+    while (low < high) {
+      int mid = (low + high) >>> 1;
+      Comparable<? super T> midVal = list.get(mid);
+      int ret = midVal.compareTo(key);
+      if (ret <= 0)
+        low = mid + 1;
+      else high = mid;
+    }
+    return low;
+  }
+}

Modified: hadoop/pig/branches/branch-0.6/contrib/zebra/src/java/org/apache/hadoop/zebra/types/CGSchema.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/contrib/zebra/src/java/org/apache/hadoop/zebra/types/CGSchema.java?rev=882929&r1=882928&r2=882929&view=diff
==============================================================================
--- hadoop/pig/branches/branch-0.6/contrib/zebra/src/java/org/apache/hadoop/zebra/types/CGSchema.java (original)
+++ hadoop/pig/branches/branch-0.6/contrib/zebra/src/java/org/apache/hadoop/zebra/types/CGSchema.java Sat Nov 21 15:36:12 2009
@@ -27,7 +27,7 @@
 
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.WritableUtils;
-import org.apache.hadoop.io.file.tfile.Utils.Version;
+import org.apache.hadoop.zebra.tfile.Utils.Version;
 import org.apache.hadoop.zebra.schema.Schema;
 import org.apache.hadoop.zebra.parser.ParseException;
 

Modified: hadoop/pig/branches/branch-0.6/contrib/zebra/src/java/org/apache/hadoop/zebra/types/SortInfo.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/contrib/zebra/src/java/org/apache/hadoop/zebra/types/SortInfo.java?rev=882929&r1=882928&r2=882929&view=diff
==============================================================================
--- hadoop/pig/branches/branch-0.6/contrib/zebra/src/java/org/apache/hadoop/zebra/types/SortInfo.java (original)
+++ hadoop/pig/branches/branch-0.6/contrib/zebra/src/java/org/apache/hadoop/zebra/types/SortInfo.java Sat Nov 21 15:36:12 2009
@@ -21,7 +21,7 @@
 import java.io.IOException;
 import org.apache.hadoop.zebra.schema.Schema;
 import org.apache.hadoop.zebra.schema.ColumnType;
-import org.apache.hadoop.io.file.tfile.TFile;
+import org.apache.hadoop.zebra.tfile.TFile;
 
 /**
  * Sortness related Information

Modified: hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestBasicTable.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestBasicTable.java?rev=882929&r1=882928&r2=882929&view=diff
==============================================================================
--- hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestBasicTable.java (original)
+++ hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestBasicTable.java Sat Nov 21 15:36:12 2009
@@ -31,7 +31,7 @@
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.BytesWritable;
-import org.apache.hadoop.io.file.tfile.RawComparable;
+import org.apache.hadoop.zebra.tfile.RawComparable;
 import org.apache.hadoop.zebra.io.BasicTable;
 import org.apache.hadoop.zebra.io.BasicTableStatus;
 import org.apache.hadoop.zebra.io.KeyDistribution;
@@ -81,7 +81,7 @@
     return String.format("%s%09d", prefix, random.nextInt(max));
   }
 
-  static int createBasicTable(int parts, int rows, String strSchema, String storage, String sortColumns,
+  public static int createBasicTable(int parts, int rows, String strSchema, String storage, String sortColumns,
       Path path, boolean properClose) throws IOException {
     if (fs.exists(path)) {
       BasicTable.drop(path, conf);
@@ -126,8 +126,11 @@
     if (properClose) {
       writer = new BasicTable.Writer(path, conf);
       writer.close();
-      BasicTableStatus status = getStatus(path);
-      Assert.assertEquals(total, status.getRows());
+      /* We can only test number of rows on sorted tables.*/
+      if (sorted) {
+        BasicTableStatus status = getStatus(path);
+        Assert.assertEquals(total, status.getRows());
+      }
     }
 
     return total;

Modified: hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestCollection.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestCollection.java?rev=882929&r1=882928&r2=882929&view=diff
==============================================================================
--- hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestCollection.java (original)
+++ hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestCollection.java Sat Nov 21 15:36:12 2009
@@ -1018,4 +1018,4 @@
       System.out.println(e);
     }
   }
-}
\ No newline at end of file
+}

Modified: hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestColumnGroup.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestColumnGroup.java?rev=882929&r1=882928&r2=882929&view=diff
==============================================================================
--- hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestColumnGroup.java (original)
+++ hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestColumnGroup.java Sat Nov 21 15:36:12 2009
@@ -30,7 +30,7 @@
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.BytesWritable;
-import org.apache.hadoop.io.file.tfile.RawComparable;
+import org.apache.hadoop.zebra.tfile.RawComparable;
 import org.apache.hadoop.zebra.io.BasicTableStatus;
 import org.apache.hadoop.zebra.io.ColumnGroup;
 import org.apache.hadoop.zebra.io.KeyDistribution;
@@ -149,8 +149,11 @@
     if (properClose) {
       writer = new ColumnGroup.Writer(path, conf);
       writer.close();
-      BasicTableStatus status = getStatus(path);
-      Assert.assertEquals(total, status.getRows());
+      /* We can only test number of rows on sorted tables.*/
+      if (sorted) {
+        BasicTableStatus status = getStatus(path);
+        Assert.assertEquals(total, status.getRows());
+      }
     }
 
     return total;

Modified: hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestColumnGroupName1.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestColumnGroupName1.java?rev=882929&r1=882928&r2=882929&view=diff
==============================================================================
--- hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestColumnGroupName1.java (original)
+++ hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestColumnGroupName1.java Sat Nov 21 15:36:12 2009
@@ -170,4 +170,4 @@
 
     reader.close();
   }
-}
\ No newline at end of file
+}

Modified: hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestColumnGroupName2.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestColumnGroupName2.java?rev=882929&r1=882928&r2=882929&view=diff
==============================================================================
--- hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestColumnGroupName2.java (original)
+++ hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestColumnGroupName2.java Sat Nov 21 15:36:12 2009
@@ -110,4 +110,4 @@
   public void test1() throws IOException, Exception {
     BasicTable.dumpInfo(path.toString(), System.out, conf);
   }
-}
\ No newline at end of file
+}

Modified: hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestColumnGroupName3.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestColumnGroupName3.java?rev=882929&r1=882928&r2=882929&view=diff
==============================================================================
--- hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestColumnGroupName3.java (original)
+++ hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestColumnGroupName3.java Sat Nov 21 15:36:12 2009
@@ -110,4 +110,4 @@
   public void test1() throws IOException, Exception {
     BasicTable.dumpInfo(path.toString(), System.out, conf);
   }
-}
\ No newline at end of file
+}

Modified: hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestColumnGroupName4.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestColumnGroupName4.java?rev=882929&r1=882928&r2=882929&view=diff
==============================================================================
--- hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestColumnGroupName4.java (original)
+++ hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestColumnGroupName4.java Sat Nov 21 15:36:12 2009
@@ -110,4 +110,4 @@
   public void test1() throws IOException, Exception {
     BasicTable.dumpInfo(path.toString(), System.out, conf);
   }
-}
\ No newline at end of file
+}

Modified: hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestMap.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestMap.java?rev=882929&r1=882928&r2=882929&view=diff
==============================================================================
--- hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestMap.java (original)
+++ hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestMap.java Sat Nov 21 15:36:12 2009
@@ -280,4 +280,4 @@
 
     reader.close();
   }
-}
\ No newline at end of file
+}

Modified: hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestMapOfRecord.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestMapOfRecord.java?rev=882929&r1=882928&r2=882929&view=diff
==============================================================================
--- hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestMapOfRecord.java (original)
+++ hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestMapOfRecord.java Sat Nov 21 15:36:12 2009
@@ -388,4 +388,4 @@
     reader.close();
 
   }
-}
\ No newline at end of file
+}

Modified: hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestNegative.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestNegative.java?rev=882929&r1=882928&r2=882929&view=diff
==============================================================================
--- hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestNegative.java (original)
+++ hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestNegative.java Sat Nov 21 15:36:12 2009
@@ -616,4 +616,4 @@
     }
   }
 
-}
\ No newline at end of file
+}

Modified: hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestRecord.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestRecord.java?rev=882929&r1=882928&r2=882929&view=diff
==============================================================================
--- hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestRecord.java (original)
+++ hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestRecord.java Sat Nov 21 15:36:12 2009
@@ -412,4 +412,4 @@
     reader.close();
 
   }
-}
\ No newline at end of file
+}

Modified: hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestSimple.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestSimple.java?rev=882929&r1=882928&r2=882929&view=diff
==============================================================================
--- hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestSimple.java (original)
+++ hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/io/TestSimple.java Sat Nov 21 15:36:12 2009
@@ -184,4 +184,4 @@
     reader.close();
   }
 
-}
\ No newline at end of file
+}

Modified: hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/mapred/ArticleGenerator.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/mapred/ArticleGenerator.java?rev=882929&r1=882928&r2=882929&view=diff
==============================================================================
--- hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/mapred/ArticleGenerator.java (original)
+++ hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/mapred/ArticleGenerator.java Sat Nov 21 15:36:12 2009
@@ -26,8 +26,8 @@
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.file.tfile.RandomDistribution.DiscreteRNG;
-import org.apache.hadoop.io.file.tfile.RandomDistribution.Flat;
+import org.apache.hadoop.zebra.tfile.RandomDistribution.DiscreteRNG;
+import org.apache.hadoop.zebra.tfile.RandomDistribution.Flat;
 
 /**
  * Generate some input text files.

Modified: hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/mapred/Dictionary.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/mapred/Dictionary.java?rev=882929&r1=882928&r2=882929&view=diff
==============================================================================
--- hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/mapred/Dictionary.java (original)
+++ hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/mapred/Dictionary.java Sat Nov 21 15:36:12 2009
@@ -24,9 +24,9 @@
 import java.util.Random;
 import java.util.Set;
 
-import org.apache.hadoop.io.file.tfile.RandomDistribution.Binomial;
-import org.apache.hadoop.io.file.tfile.RandomDistribution.DiscreteRNG;
-import org.apache.hadoop.io.file.tfile.RandomDistribution.Zipf;
+import org.apache.hadoop.zebra.tfile.RandomDistribution.Binomial;
+import org.apache.hadoop.zebra.tfile.RandomDistribution.DiscreteRNG;
+import org.apache.hadoop.zebra.tfile.RandomDistribution.Zipf;
 
 /**
  * A dictionary that generates English words, whose frequency follows Zipf

Modified: hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/mapred/TableMRSample2.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/mapred/TableMRSample2.java?rev=882929&r1=882928&r2=882929&view=diff
==============================================================================
--- hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/mapred/TableMRSample2.java (original)
+++ hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/mapred/TableMRSample2.java Sat Nov 21 15:36:12 2009
@@ -113,4 +113,4 @@
       JobClient.runJob(jobConf);
     }
   }
-}
\ No newline at end of file
+}

Modified: hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/mapred/TableMapReduceExample.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/mapred/TableMapReduceExample.java?rev=882929&r1=882928&r2=882929&view=diff
==============================================================================
--- hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/mapred/TableMapReduceExample.java (original)
+++ hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/mapred/TableMapReduceExample.java Sat Nov 21 15:36:12 2009
@@ -225,4 +225,4 @@
         args);
     System.exit(res);
   }
-}
\ No newline at end of file
+}

Modified: hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/mapred/TestBasicTableIOFormatLocalFS.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/mapred/TestBasicTableIOFormatLocalFS.java?rev=882929&r1=882928&r2=882929&view=diff
==============================================================================
--- hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/mapred/TestBasicTableIOFormatLocalFS.java (original)
+++ hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/mapred/TestBasicTableIOFormatLocalFS.java Sat Nov 21 15:36:12 2009
@@ -46,7 +46,7 @@
 import org.apache.hadoop.io.RawComparator;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.io.file.tfile.Utils;
+import org.apache.hadoop.zebra.tfile.Utils;
 import org.apache.hadoop.mapred.FileInputFormat;
 import org.apache.hadoop.mapred.FileSplit;
 import org.apache.hadoop.mapred.JobClient;
@@ -163,6 +163,9 @@
   JobConf getJobConf(String name) {
     JobConf jobConf = mr.createJobConf();
     jobConf.setJobName(name);
+    jobConf.setInt("table.input.split.minSize", 1);
+    options.minTableSplitSize = 1; // force more splits
+    jobConf.setInt("dfs.block.size", 1024); // force multiple blocks
     jobConf.set("table.output.tfile.compression", options.compression);
     jobConf.setInt("mapred.app.freqWords.count", options.numFreqWords);
     return jobConf;

Modified: hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/mapred/TestCheckin.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/mapred/TestCheckin.java?rev=882929&r1=882928&r2=882929&view=diff
==============================================================================
--- hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/mapred/TestCheckin.java (original)
+++ hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/mapred/TestCheckin.java Sat Nov 21 15:36:12 2009
@@ -24,7 +24,8 @@
 @RunWith(Suite.class)
 @Suite.SuiteClasses({
   TestBasicTableIOFormatLocalFS.class,
-  TestBasicTableIOFormatDFS.class
+  TestBasicTableIOFormatDFS.class,
+  TestTfileSplit.class
 })
 
 public class TestCheckin {

Added: hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/mapred/TestTfileSplit.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/mapred/TestTfileSplit.java?rev=882929&view=auto
==============================================================================
--- hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/mapred/TestTfileSplit.java (added)
+++ hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/mapred/TestTfileSplit.java Sat Nov 21 15:36:12 2009
@@ -0,0 +1,130 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package org.apache.hadoop.zebra.mapred;
+
+import java.io.IOException;
+import java.util.StringTokenizer;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.InputSplit;
+import org.apache.hadoop.zebra.mapred.TableInputFormat;
+import org.apache.hadoop.zebra.io.BasicTable;
+import org.apache.hadoop.zebra.io.TestBasicTable;
+import org.apache.hadoop.zebra.parser.ParseException;
+import org.junit.AfterClass;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Tests the input splits that {@link TableInputFormat} produces when the
+ * projection names no existing column or when column groups have been dropped.
+ */
+public class TestTfileSplit {
+  private static Configuration conf;
+  private static Path path;
+
+  /** Reuses the configuration and root path set up by TestBasicTable. */
+  @BeforeClass
+  public static void setUpOnce() throws IOException {
+    TestBasicTable.setUpOnce();
+    conf = TestBasicTable.conf;
+    path = new Path(TestBasicTable.rootPath, "TfileSplitTest");
+  }
+
+  /** Drops the test table once all tests have run. */
+  @AfterClass
+  public static void tearDown() throws IOException {
+    BasicTable.drop(path, conf);
+  }
+
+  /**
+   * Reads the column group index out of the first split's string form: the
+   * third space-separated token on the first line, minus its last character,
+   * parsed as an integer.
+   */
+  private static int firstSplitColumnGroupIndex(InputSplit[] splits) {
+    Assert.assertTrue("expected at least one split", splits.length > 0);
+    RowTableSplit split = (RowTableSplit) splits[0];
+    String firstLine = new StringTokenizer(split.getSplit().toString(), "\n").nextToken();
+    StringTokenizer tokens = new StringTokenizer(firstLine, " ");
+    tokens.nextToken();
+    tokens.nextToken();
+    String indexToken = tokens.nextToken();
+    return Integer.parseInt(indexToken.substring(0, indexToken.length() - 1));
+  }
+
+  /* In this test, we test creating input splits for the projection on non-existing column case.
+   * The first non-deleted column group should be used for split. */
+  @Test
+  public void testTfileSplit1()
+          throws IOException, ParseException {
+    BasicTable.drop(path, conf);
+    TestBasicTable.createBasicTable(1, 100, "a, b, c, d, e, f", "[a, b]; [c, d]", null, path, true);
+
+    TableInputFormat inputFormat = new TableInputFormat();
+    JobConf jobConf = new JobConf(conf);
+    inputFormat.setInputPaths(jobConf, path);
+    inputFormat.setMinSplitSize(jobConf, 100);
+    inputFormat.setProjection(jobConf, "aa");
+    InputSplit[] splits = inputFormat.getSplits(jobConf, 40);
+
+    Assert.assertEquals(0, firstSplitColumnGroupIndex(splits));
+  }
+
+  /* In this test, we test creating input splits when dropped column groups are around.
+   * Here the projection involves all columns and only one valid column group is present.
+   * As such, that column group should be used for split.*/
+  @Test
+  public void testTfileSplit2()
+          throws IOException, ParseException {
+    BasicTable.drop(path, conf);
+    TestBasicTable.createBasicTable(1, 100, "a, b, c, d, e, f", "[a, b]; [c, d]", null, path, true);
+    BasicTable.dropColumnGroup(path, conf, "CG0");
+    BasicTable.dropColumnGroup(path, conf, "CG2");
+
+    TableInputFormat inputFormat = new TableInputFormat();
+    JobConf jobConf = new JobConf(conf);
+    inputFormat.setInputPaths(jobConf, path);
+    inputFormat.setMinSplitSize(jobConf, 100);
+    InputSplit[] splits = inputFormat.getSplits(jobConf, 40);
+
+    Assert.assertEquals(1, firstSplitColumnGroupIndex(splits));
+  }
+
+  /* In this test, we test creating input splits when there is no valid column group present.
+   * Should return 0 splits. */
+  @Test
+  public void testTfileSplit3()
+          throws IOException, ParseException {
+    BasicTable.drop(path, conf);
+    TestBasicTable.createBasicTable(1, 100, "a, b, c, d, e, f", "[a, b]; [c, d]", null, path, true);
+    BasicTable.dropColumnGroup(path, conf, "CG0");
+    BasicTable.dropColumnGroup(path, conf, "CG1");
+    BasicTable.dropColumnGroup(path, conf, "CG2");
+
+    TableInputFormat inputFormat = new TableInputFormat();
+    JobConf jobConf = new JobConf(conf);
+    inputFormat.setInputPaths(jobConf, path);
+    inputFormat.setMinSplitSize(jobConf, 100);
+    InputSplit[] splits = inputFormat.getSplits(jobConf, 40);
+
+    Assert.assertEquals(0, splits.length);
+  }
+}

Modified: hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/pig/TestSimpleType.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/pig/TestSimpleType.java?rev=882929&r1=882928&r2=882929&view=diff
==============================================================================
--- hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/pig/TestSimpleType.java (original)
+++ hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/pig/TestSimpleType.java Sat Nov 21 15:36:12 2009
@@ -165,7 +165,7 @@
     t = tok.nextToken(); // '...<caller to getCurrentRoutine>'
     return t;
   }
-
+  
   // @Test
   public void testReadSimpleStitch() throws IOException, ParseException {
     String query = "records = LOAD '" + path.toString()
@@ -242,9 +242,9 @@
   @Test
   // Store same table
   public void testStorer() throws ExecException, IOException {
-    /*
-     * Use pig LOAD to load testing data for store
-     */
+    //
+    // Use pig LOAD to load testing data for store
+    //
     String query = "records = LOAD '"
         + path.toString()
         + "' USING org.apache.hadoop.zebra.pig.TableLoader() as (s1,s2,s3,s4,s5,s6);";
@@ -256,9 +256,9 @@
       System.out.println(RowValue);
     }
 
-    /*
-     * Use pig STORE to store testing data
-     */
+    //
+    // Use pig STORE to store testing data
+    //
     Path newPath = new Path(getCurrentMethodName());
     pigServer
         .store(
@@ -528,19 +528,19 @@
 
   @Test
   // Store negative, store to same path. Store should fail
-  public void testStorer5() throws ExecException, IOException {
-    /*
-     * Use pig LOAD to load testing data for store
-     */
+  public void testStorer5() throws ExecException, IOException {   
+    //
+    // Use pig LOAD to load testing data for store
+    //
     String query = "records = LOAD '"
         + path.toString()
         + "' USING org.apache.hadoop.zebra.pig.TableLoader() as (s1,s2,s3,s4,s5,s6);";
     pigServer.registerQuery(query);
 
-    /*
-     * Use pig STORE to store testing data
-     */
-    
+    //
+    // Use pig STORE to store testing data
+    //
+    System.out.println("path = " + path);
     ExecJob pigJob = pigServer
         .store(
             "records",
@@ -548,7 +548,6 @@
             TableStorer.class.getCanonicalName()
                 + "('[s1, s2]; [s3, s4]')");
     Assert.assertNotNull(pigJob.getException());
-    System.out.println(pigJob.getException());
+    System.out.println("pig job exception : " + pigJob.getException());
   }
-
 }

Modified: hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/pig/TestUnionMixedTypes.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/pig/TestUnionMixedTypes.java?rev=882929&r1=882928&r2=882929&view=diff
==============================================================================
--- hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/pig/TestUnionMixedTypes.java (original)
+++ hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/pig/TestUnionMixedTypes.java Sat Nov 21 15:36:12 2009
@@ -309,16 +309,15 @@
 
         if (j == 1) {
           System.out.println("j is : " + j);
-          Assert.assertEquals(j + "." + j, cur2.get(0));
-          Assert.assertEquals(j + "." + j + j, cur2.get(1));
+          /* The order of t1 and t2 in the union result can vary across runs. */
+          Assert.assertTrue("1.1".equals(cur2.get(0)) || "3.3".equals(cur2.get(0)));
+          Assert.assertTrue("1.11".equals(cur2.get(1)) || "3.33".equals(cur2.get(1)));
         }
         if (j == 2) {
           System.out.println("j is : " + j);
 
-          Assert.assertEquals((j - 1) + "." + (j - 1) + (j - 1) + (j - 1), cur2
-              .get(0));
-          Assert.assertEquals((j - 1) + "." + (j - 1) + (j - 1) + (j - 1)
-              + (j - 1), cur2.get(1));
+          Assert.assertTrue("1.111".equals(cur2.get(0)) || "3.333".equals(cur2.get(0)));
+          Assert.assertTrue("1.1111".equals(cur2.get(1)) || "3.3333".equals(cur2.get(1)));
         }
 
         TypesUtils.resetTuple(cur2);
@@ -326,37 +325,37 @@
       }// inner while
       if (i == 1) {
         System.out.println("i is : " + i);
-
-        Assert.assertEquals("k11", ((Map) cur.get(1)).get("k1"));
+        Assert.assertTrue("k11".equals(((Map) cur.get(1)).get("k1")) || "k13".equals(((Map) cur.get(1)).get("k1")));
         Assert.assertEquals(null, ((Map) cur.get(1)).get("k2"));
-        Assert.assertEquals("1", cur.get(2));
+        Assert.assertTrue("1".equals(cur.get(2)) || "3".equals(cur.get(2)));
       }
 
       if (i == 2) {
         System.out.println("i should see this line. ");
-        Assert.assertEquals("k12", ((Map) cur.get(1)).get("k1"));
-        Assert.assertEquals("k22", ((Map) cur.get(1)).get("k2"));
-        Assert.assertEquals("2", cur.get(2));
+        Assert.assertTrue("k12".equals(((Map) cur.get(1)).get("k1")) || "k14".equals(((Map) cur.get(1)).get("k1")));
+        Assert.assertTrue("k22".equals(((Map) cur.get(1)).get("k2")) || "k24".equals(((Map) cur.get(1)).get("k2")));
+        Assert.assertTrue("2".equals(cur.get(2)) || "4".equals(cur.get(2)));
       }
       if (i == 3) {
         System.out.println("i is : " + i);
 
-        Assert.assertEquals("k13", ((Map) cur.get(1)).get("k1"));
+        Assert.assertTrue("k11".equals(((Map) cur.get(1)).get("k1")) || "k13".equals(((Map) cur.get(1)).get("k1")));
         Assert.assertEquals(null, ((Map) cur.get(1)).get("k2"));
-        Assert.assertEquals("3", cur.get(2));
+        Assert.assertTrue("1".equals(cur.get(2)) || "3".equals(cur.get(2)));
       }
 
       if (i == 4) {
         System.out.println("i should see this line. ");
-        Assert.assertEquals("k14", ((Map) cur.get(1)).get("k1"));
-        Assert.assertEquals("k24", ((Map) cur.get(1)).get("k2"));
-        Assert.assertEquals("4", cur.get(2));
+        Assert.assertTrue("k12".equals(((Map) cur.get(1)).get("k1")) || "k14".equals(((Map) cur.get(1)).get("k1")));
+        Assert.assertTrue("k22".equals(((Map) cur.get(1)).get("k2")) || "k24".equals(((Map) cur.get(1)).get("k2")));
+        Assert.assertTrue("2".equals(cur.get(2)) || "4".equals(cur.get(2)));
       }
     }// outer while
 
     Assert.assertEquals(4, i);
   }
 
+
   @Test
   // one common field only
   public void testReader2() throws ExecException, IOException {
@@ -399,7 +398,7 @@
         Assert.assertEquals(null, ((Map) cur.get(0)).get("k2"));
         try {
           cur.get(1);
-          Assert.fail("should throw index out of bound excepiotn");
+          Assert.fail("should throw index out of bound exception");
         } catch (Exception e) {
           e.printStackTrace();
         }
@@ -411,7 +410,7 @@
         Assert.assertEquals(null, ((Map) cur.get(0)).get("k2"));
         try {
           cur.get(1);
-          Assert.fail("should throw index out of bound excepiotn");
+          Assert.fail("should throw index out of bound exception");
         } catch (Exception e) {
           e.printStackTrace();
         }
@@ -423,7 +422,7 @@
         Assert.assertEquals(null, ((Map) cur.get(0)).get("k2"));
         try {
           cur.get(1);
-          Assert.fail("should throw index out of bound excepiotn");
+          Assert.fail("should throw index out of bound exception");
         } catch (Exception e) {
           e.printStackTrace();
         }
@@ -455,34 +454,35 @@
 
       i++;
       System.out.println(" line : " + i + " : " + cur.toString());
+      
+      
       if (i == 1) {
         System.out.println("i is : " + i);
 
-        Assert.assertEquals("world1", cur.get(0));
+        Assert.assertTrue("world1".equals(cur.get(0)) || cur.get(0) == null);
         try {
           cur.get(1);
-          Assert.fail("should throw index out of bound excepiotn");
+          Assert.fail("should throw index out of bound exception");
         } catch (Exception e) {
           e.printStackTrace();
         }
       }
 
       if (i == 2) {
-
-        Assert.assertEquals("world2", cur.get(0));
+        Assert.assertTrue("world2".equals(cur.get(0)) || cur.get(0) == null);
         try {
           cur.get(1);
-          Assert.fail("should throw index out of bound excepiotn");
+          Assert.fail("should throw index out of bound exception");
         } catch (Exception e) {
           e.printStackTrace();
         }
       }
       if (i == 3) {
 
-        Assert.assertEquals(null, cur.get(0));
+        Assert.assertTrue("world1".equals(cur.get(0)) || cur.get(0) == null);
         try {
           cur.get(1);
-          Assert.fail("should throw index out of bound excepiotn");
+          Assert.fail("should throw index out of bound exception");
         } catch (Exception e) {
           e.printStackTrace();
         }
@@ -490,10 +490,10 @@
 
       if (i == 4) {
 
-        Assert.assertEquals(null, cur.get(0));
+        Assert.assertTrue("world2".equals(cur.get(0)) || cur.get(0) == null);
         try {
           cur.get(1);
-          Assert.fail("should throw index out of bound excepiotn");
+          Assert.fail("should throw index out of bound exception");
         } catch (Exception e) {
           e.printStackTrace();
         }

Added: hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/tfile/KVGenerator.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/tfile/KVGenerator.java?rev=882929&view=auto
==============================================================================
--- hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/tfile/KVGenerator.java (added)
+++ hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/tfile/KVGenerator.java Sat Nov 21 15:36:12 2009
@@ -0,0 +1,105 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package org.apache.hadoop.zebra.tfile;
+
+import java.util.Random;
+
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.WritableComparator;
+import org.apache.hadoop.zebra.tfile.RandomDistribution.DiscreteRNG;
+
+/**
+ * Generate random <key, value> pairs.
+ */
+class KVGenerator {
+  private final Random random;
+  private final byte[][] dict;
+  private final boolean sorted;
+  private final DiscreteRNG keyLenRNG, valLenRNG;
+  private BytesWritable lastKey;
+  private static final int MIN_KEY_LEN = 4;
+  private final byte prefix[] = new byte[MIN_KEY_LEN];
+
+  public KVGenerator(Random random, boolean sorted, DiscreteRNG keyLenRNG,
+      DiscreteRNG valLenRNG, DiscreteRNG wordLenRNG, int dictSize) {
+    this.random = random;
+    dict = new byte[dictSize][];
+    this.sorted = sorted;
+    this.keyLenRNG = keyLenRNG;
+    this.valLenRNG = valLenRNG;
+    for (int i = 0; i < dictSize; ++i) {
+      int wordLen = wordLenRNG.nextInt();
+      dict[i] = new byte[wordLen];
+      random.nextBytes(dict[i]);
+    }
+    lastKey = new BytesWritable();
+    fillKey(lastKey);
+  }
+  
+  private void fillKey(BytesWritable o) {
+    int len = keyLenRNG.nextInt();
+    if (len < MIN_KEY_LEN) len = MIN_KEY_LEN;
+    o.setSize(len);
+    int n = MIN_KEY_LEN;
+    while (n < len) {
+      byte[] word = dict[random.nextInt(dict.length)];
+      int l = Math.min(word.length, len - n);
+      System.arraycopy(word, 0, o.get(), n, l);
+      n += l;
+    }
+    if (sorted
+        && WritableComparator.compareBytes(lastKey.get(), MIN_KEY_LEN, lastKey
+            .getSize()
+            - MIN_KEY_LEN, o.get(), MIN_KEY_LEN, o.getSize() - MIN_KEY_LEN) > 0) {
+      incrementPrefix();
+    }
+
+    System.arraycopy(prefix, 0, o.get(), 0, MIN_KEY_LEN);
+    lastKey.set(o);
+  }
+
+  private void fillValue(BytesWritable o) {
+    int len = valLenRNG.nextInt();
+    o.setSize(len);
+    int n = 0;
+    while (n < len) {
+      byte[] word = dict[random.nextInt(dict.length)];
+      int l = Math.min(word.length, len - n);
+      System.arraycopy(word, 0, o.get(), n, l);
+      n += l;
+    }
+  }
+  
+  private void incrementPrefix() {
+    for (int i = MIN_KEY_LEN - 1; i >= 0; --i) {
+      ++prefix[i];
+      if (prefix[i] != 0) return;
+    }
+    
+    throw new RuntimeException("Prefix overflown");
+  }
+  
+  public void next(BytesWritable key, BytesWritable value, boolean dupKey) {
+    if (dupKey) {
+      key.set(lastKey);
+    }
+    else {
+      fillKey(key);
+    }
+    fillValue(value);
+  }
+}

Added: hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/tfile/KeySampler.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/tfile/KeySampler.java?rev=882929&view=auto
==============================================================================
--- hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/tfile/KeySampler.java (added)
+++ hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/tfile/KeySampler.java Sat Nov 21 15:36:12 2009
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package org.apache.hadoop.zebra.tfile;
+
+import java.io.IOException;
+import java.util.Random;
+
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.zebra.tfile.RandomDistribution.DiscreteRNG;
+
+class KeySampler {
+  Random random;
+  int min, max;
+  DiscreteRNG keyLenRNG;
+  private static final int MIN_KEY_LEN = 4;
+
+  public KeySampler(Random random, RawComparable first, RawComparable last,
+      DiscreteRNG keyLenRNG) throws IOException {
+    this.random = random;
+    min = keyPrefixToInt(first);
+    max = keyPrefixToInt(last);
+    this.keyLenRNG = keyLenRNG;
+  }
+
+  private int keyPrefixToInt(RawComparable key) throws IOException {
+    byte[] b = key.buffer();
+    int o = key.offset();
+    return (b[o] & 0xff) << 24 | (b[o + 1] & 0xff) << 16
+        | (b[o + 2] & 0xff) << 8 | (b[o + 3] & 0xff);
+  }
+  
+  public void next(BytesWritable key) {
+    // Produces a random key of at least MIN_KEY_LEN bytes whose first four bytes
+    // are overwritten with a prefix n, most significant byte first.
+    key.setSize(Math.max(MIN_KEY_LEN, keyLenRNG.nextInt()));
+    byte[] b = key.get();
+    random.nextBytes(b);
+    // Random.nextInt throws unless its bound is positive; when max - min is not, use min.
+    int n = (max - min > 0) ? random.nextInt(max - min) + min : min;
+    for (int i = 0; i < MIN_KEY_LEN; ++i) b[i] = (byte) (n >> (8 * (MIN_KEY_LEN - 1 - i)));
+  }
+}

Added: hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/tfile/NanoTimer.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/tfile/NanoTimer.java?rev=882929&view=auto
==============================================================================
--- hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/tfile/NanoTimer.java (added)
+++ hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/tfile/NanoTimer.java Sat Nov 21 15:36:12 2009
@@ -0,0 +1,193 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package org.apache.hadoop.zebra.tfile;
+
+/**
+ * A nano-second timer.
+ */
+public class NanoTimer {
+  private long last = -1;
+  private boolean started = false;
+  private long cumulate = 0;
+
+  /**
+   * Constructor
+   * 
+   * @param start
+   *          Start the timer upon construction.
+   */
+  public NanoTimer(boolean start) {
+    if (start) this.start();
+  }
+
+  /**
+   * Start the timer.
+   * 
+   * Note: No effect if timer is already started.
+   */
+  public void start() {
+    if (!this.started) {
+      this.last = System.nanoTime();
+      this.started = true;
+    }
+  }
+
+  /**
+   * Stop the timer.
+   * 
+   * Note: No effect if timer is already stopped.
+   */
+  public void stop() {
+    if (this.started) {
+      this.started = false;
+      this.cumulate += System.nanoTime() - this.last;
+    }
+  }
+
+  /**
+   * Read the timer.
+   * 
+   * @return the time in nano-seconds accumulated by completed start/stop
+   *         cycles, or -1 if not started since construction or last reset.
+   */
+  public long read() {
+    if (!readable()) return -1;
+
+    return this.cumulate;
+  }
+
+  /**
+   * Reset the timer.
+   */
+  public void reset() {
+    this.last = -1;
+    this.started = false;
+    this.cumulate = 0;
+  }
+
+  /**
+   * Checking whether the timer is started
+   * 
+   * @return true if timer is started.
+   */
+  public boolean isStarted() {
+    return this.started;
+  }
+
+  /**
+   * Format the elapsed time to a human understandable string.
+   * 
+   * Note: If timer is never started, "ERR" will be returned.
+   */
+  public String toString() {
+    if (!readable()) {
+      return "ERR";
+    }
+
+    return NanoTimer.nanoTimeToString(this.cumulate);
+  }
+
+  /**
+   * A utility method to format a time duration in nano seconds into a human
+   * understandable string.
+   * 
+   * @param t
+   *          Time duration in nano seconds.
+   * @return String representation.
+   */
+  public static String nanoTimeToString(long t) {
+    if (t < 0) return "ERR";
+
+    if (t == 0) return "0";
+
+    if (t < 1000) {
+      return t + "ns";
+    }
+
+    double us = (double) t / 1000;
+    if (us < 1000) {
+      return String.format("%.2fus", us);
+    }
+
+    double ms = us / 1000;
+    if (ms < 1000) {
+      return String.format("%.2fms", ms);
+    }
+
+    double ss = ms / 1000;
+    if (ss < 1000) {
+      return String.format("%.2fs", ss);
+    }
+
+    long mm = (long) ss / 60;
+    ss -= mm * 60;
+    long hh = mm / 60;
+    mm -= hh * 60;
+    long dd = hh / 24;
+    hh -= dd * 24;
+
+    if (dd > 0) {
+      return String.format("%dd%dh", dd, hh);
+    }
+
+    if (hh > 0) {
+      return String.format("%dh%dm", hh, mm);
+    }
+
+    if (mm > 0) {
+      return String.format("%dm%.1fs", mm, ss);
+    }
+
+    return String.format("%.2fs", ss);
+
+    /**
+     * StringBuilder sb = new StringBuilder(); String sep = "";
+     * 
+     * if (dd > 0) { String unit = (dd > 1) ? "days" : "day";
+     * sb.append(String.format("%s%d%s", sep, dd, unit)); sep = " "; }
+     * 
+     * if (hh > 0) { String unit = (hh > 1) ? "hrs" : "hr";
+     * sb.append(String.format("%s%d%s", sep, hh, unit)); sep = " "; }
+     * 
+     * if (mm > 0) { String unit = (mm > 1) ? "mins" : "min";
+     * sb.append(String.format("%s%d%s", sep, mm, unit)); sep = " "; }
+     * 
+     * if (ss > 0) { String unit = (ss > 1) ? "secs" : "sec";
+     * sb.append(String.format("%s%.3f%s", sep, ss, unit)); sep = " "; }
+     * 
+     * return sb.toString();
+     */
+  }
+
+  private boolean readable() {
+    return this.last != -1;
+  }
+
+  /**
+   * Simple tester.
+   * 
+   * @param args
+   */
+  public static void main(String[] args) {
+    long i = 7;
+
+    for (int x = 0; x < 20; ++x, i *= 7) {
+      System.out.println(NanoTimer.nanoTimeToString(i));
+    }
+  }
+}
+

Added: hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/tfile/RandomDistribution.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/tfile/RandomDistribution.java?rev=882929&view=auto
==============================================================================
--- hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/tfile/RandomDistribution.java (added)
+++ hadoop/pig/branches/branch-0.6/contrib/zebra/src/test/org/apache/hadoop/zebra/tfile/RandomDistribution.java Sat Nov 21 15:36:12 2009
@@ -0,0 +1,266 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package org.apache.hadoop.zebra.tfile;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Random;
+
+/**
+ * A class that generates random numbers that follow some distribution.
+ */
+public class RandomDistribution {
+  /**
+   * Interface for discrete (integer) random distributions.
+   */
+  public static interface DiscreteRNG {
+    /**
+     * Get the next random number
+     * 
+     * @return the next random number.
+     */
+    public int nextInt();
+  }
+
+  /**
+   * P(i)=1/(max-min)
+   */
+  public static final class Flat implements DiscreteRNG {
+    private final Random random;
+    private final int min;
+    private final int max;
+
+    /**
+     * Generate random integers from min (inclusive) to max (exclusive)
+     * following even distribution.
+     * 
+     * @param random
+     *          The basic random number generator.
+     * @param min
+     *          Minimum integer
+     * @param max
+     *          maximum integer (exclusive).
+     * 
+     */
+    public Flat(Random random, int min, int max) {
+      if (min >= max) {
+        throw new IllegalArgumentException("Invalid range");
+      }
+      this.random = random;
+      this.min = min;
+      this.max = max;
+    }
+    
+    /**
+     * @see DiscreteRNG#nextInt()
+     */
+    @Override
+    public int nextInt() {
+      return random.nextInt(max - min) + min;
+    }
+  }
+
+  /**
+   * Zipf distribution. The ratio of the probabilities of integer i and j is
+   * defined as follows:
+   * 
+   * P(i)/P(j)=((j-min+1)/(i-min+1))^sigma.
+   */
+  public static final class Zipf implements DiscreteRNG {
+    private static final double DEFAULT_EPSILON = 0.001;
+    private final Random random;
+    private final ArrayList<Integer> k;
+    private final ArrayList<Double> v;
+
+    /**
+     * Constructor
+     * 
+     * @param r
+     *          The random number generator.
+     * @param min
+     *          minimum integer (inclusive)
+     * @param max
+     *          maximum integer (exclusive)
+     * @param sigma
+     *          parameter sigma. (sigma > 1.0)
+     */
+    public Zipf(Random r, int min, int max, double sigma) {
+      this(r, min, max, sigma, DEFAULT_EPSILON);
+    }
+
+    /**
+     * Constructor.
+     * 
+     * @param r
+     *          The random number generator.
+     * @param min
+     *          minimum integer (inclusive)
+     * @param max
+     *          maximum integer (exclusive)
+     * @param sigma
+     *          parameter sigma. (sigma > 1.0)
+     * @param epsilon
+     *          Allowable error percentage (0 < epsilon < 0.5).
+     */
+    public Zipf(Random r, int min, int max, double sigma, double epsilon) {
+      if ((max <= min) || (sigma <= 1) || (epsilon <= 0)
+          || (epsilon >= 0.5)) {
+        throw new IllegalArgumentException("Invalid arguments");
+      }
+      random = r;
+      k = new ArrayList<Integer>();
+      v = new ArrayList<Double>();
+
+      double sum = 0;
+      int last = -1;
+      for (int i = min; i < max; ++i) {
+        sum += Math.exp(-sigma * Math.log(i - min + 1));
+        if ((last == -1) || i * (1 - epsilon) > last) {
+          k.add(i);
+          v.add(sum);
+          last = i;
+        }
+      }
+
+      if (last != max - 1) {
+        k.add(max - 1);
+        v.add(sum);
+      }
+
+      v.set(v.size() - 1, 1.0);
+
+      for (int i = v.size() - 2; i >= 0; --i) {
+        v.set(i, v.get(i) / sum);
+      }
+    }
+
+    /**
+     * Draws d in [0, 1) and maps it through the cumulative table v; see {@link DiscreteRNG#nextInt()}.
+     */
+    @Override
+    public int nextInt() {
+      double d = random.nextDouble();
+      int idx = Collections.binarySearch(v, d);
+      // idx >= 0 means d equals v[idx] exactly (index 0 included): use the next bucket.
+      if (idx >= 0) {
+        ++idx;
+      }
+      else {
+        idx = -(idx + 1);
+      }
+
+      if (idx >= v.size()) {
+        idx = v.size() - 1;
+      }
+
+      if (idx == 0) {
+        return k.get(0);
+      }
+
+      int ceiling = k.get(idx);
+      int lower = k.get(idx - 1);
+
+      return ceiling - random.nextInt(ceiling - lower);
+    }
+  }
+
+  /**
+   * Binomial distribution.
+   * 
+   * P(k)=select(n, k)*p^k*(1-p)^(n-k) (k = 0, 1, ..., n)
+   * 
+   * P(k)=select(max-min-1, k-min)*p^(k-min)*(1-p)^(max-k-1) (k = min, ..., max-1)
+   */
+  public static final class Binomial implements DiscreteRNG {
+    private final Random random;
+    private final int min;
+    private final int n;
+    private final double[] v;
+
+    private static double select(int n, int k) {
+      double ret = 1.0;
+      for (int i = k + 1; i <= n; ++i) {
+        ret *= (double) i / (i - k);
+      }
+      return ret;
+    }
+    
+    private static double power(double p, int k) {
+      return Math.exp(k * Math.log(p));
+    }
+
+    /**
+     * Generate random integers from min (inclusive) to max (exclusive)
+     * following Binomial distribution.
+     * 
+     * @param random
+     *          The basic random number generator.
+     * @param min
+     *          Minimum integer
+     * @param max
+     *          maximum integer (exclusive).
+     * @param p
+     *          parameter.
+     * 
+     */
+    public Binomial(Random random, int min, int max, double p) {
+      if (min >= max) {
+        throw new IllegalArgumentException("Invalid range");
+      }
+      this.random = random;
+      this.min = min;
+      this.n = max - min - 1;
+      if (n > 0) {
+        v = new double[n + 1];
+        double sum = 0.0;
+        for (int i = 0; i <= n; ++i) {
+          sum += select(n, i) * power(p, i) * power(1 - p, n - i);
+          v[i] = sum;
+        }
+        for (int i = 0; i <= n; ++i) {
+          v[i] /= sum;
+        }
+      }
+      else {
+        v = null;
+      }
+    }
+
+    /**
+     * Draws d in [0, 1) and maps it through the cumulative table v; see {@link DiscreteRNG#nextInt()}.
+     */
+    @Override
+    public int nextInt() {
+      if (v == null) {
+        return min;
+      }
+      double d = random.nextDouble();
+      int idx = Arrays.binarySearch(v, d);
+      if (idx >= 0) { // exact hit on v[idx], index 0 included: step to the next bucket
+        ++idx;
+      } else {
+        idx = -(idx + 1);
+      }
+
+      if (idx >= v.length) {
+        idx = v.length - 1;
+      }
+      return idx + min;
+    }
+  }
+}