You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by cu...@apache.org on 2006/07/26 10:06:53 UTC
svn commit: r425661 - in /lucene/hadoop/trunk: ./ src/java/org/apache/hadoop/io/ src/java/org/apache/hadoop/record/ src/test/org/apache/hadoop/io/

Author: cutting
Date: Wed Jul 26 01:06:53 2006
New Revision: 425661

URL: http://svn.apache.org/viewvc?rev=425661&view=rev
Log:
HADOOP-302.  Add new Text class to replace UTF8.  Also refactor utility methods for zero-compressed integers.

Added:
    lucene/hadoop/trunk/src/java/org/apache/hadoop/io/Text.java
    lucene/hadoop/trunk/src/test/org/apache/hadoop/io/TestText.java
Modified:
    lucene/hadoop/trunk/CHANGES.txt
    lucene/hadoop/trunk/src/java/org/apache/hadoop/io/UTF8.java
    lucene/hadoop/trunk/src/java/org/apache/hadoop/io/WritableComparator.java
    lucene/hadoop/trunk/src/java/org/apache/hadoop/io/WritableUtils.java
    lucene/hadoop/trunk/src/java/org/apache/hadoop/record/BinaryInputArchive.java
    lucene/hadoop/trunk/src/java/org/apache/hadoop/record/BinaryOutputArchive.java
    lucene/hadoop/trunk/src/java/org/apache/hadoop/record/Utils.java

Modified: lucene/hadoop/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/CHANGES.txt?rev=425661&r1=425660&r2=425661&view=diff
==============================================================================
--- lucene/hadoop/trunk/CHANGES.txt (original)
+++ lucene/hadoop/trunk/CHANGES.txt Wed Jul 26 01:06:53 2006
@@ -85,6 +85,11 @@
 24. HADOOP-385.  Fix some bugs in record io code generation.
     (Milind Bhandarkar via cutting)
 
+25. HADOOP-302.  Add new Text class to replace UTF8, removing
+    limitations of that class.  Also refactor utility methods for
+    writing zero-compressed integers (VInts and VLongs).
+    (Hairong Kuang via cutting)
+
 
 Release 0.4.0 - 2006-06-28
 

Added: lucene/hadoop/trunk/src/java/org/apache/hadoop/io/Text.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/io/Text.java?rev=425661&view=auto
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/io/Text.java (added)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/io/Text.java Wed Jul 26 01:06:53 2006
@@ -0,0 +1,568 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.io;
+
+import java.io.IOException;
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CodingErrorAction;
+import java.nio.charset.MalformedInputException;
+import java.text.CharacterIterator;
+import java.text.StringCharacterIterator;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+/** This class stores text using standard UTF8 encoding.  It provides methods
+ * to serialize, deserialize, and compare texts at byte level.  The type of
+ * length is integer and is serialized using zero-compressed format.  <p>In
+ * addition, it provides methods for string traversal without converting the
+ * byte array to a string.  <p>Also includes utilities for
+ * serializing/deserializing a string, coding/decoding a string, checking if a
+ * byte array contains valid UTF8 code, calculating the length of an encoded
+ * string.
+ */
+public class Text implements WritableComparable {
+  private static final Log LOG= LogFactory.getLog("org.apache.hadoop.io.Text");
+  
+  private static final CharsetDecoder DECODER = 
+    Charset.forName("UTF-8").newDecoder().
+    onMalformedInput(CodingErrorAction.REPORT).
+    onUnmappableCharacter(CodingErrorAction.REPORT);
+  private static final CharsetEncoder ENCODER = 
+    Charset.forName("UTF-8").newEncoder().
+    onMalformedInput(CodingErrorAction.REPORT).
+    onUnmappableCharacter(CodingErrorAction.REPORT);
+
+  private static final byte [] EMPTY_BYTES = new byte[0];
+  
+  private byte[] bytes;
+  private int length;
+
+  public Text() {
+    bytes = EMPTY_BYTES; // shared zero-length array; length keeps its default 0
+  }
+
+  /** Construct from a string. 
+   * @exception CharacterCodingException if the string contains 
+   *            invalid codepoints or unpaired surrogates
+   */
+  public Text(String string) throws CharacterCodingException {
+    set(string);
+  }
+
+  /** Construct as a copy of the valid bytes of another text. */
+  public Text(Text utf8) {
+    set(utf8);
+  }
+
+  /** Construct from a byte array.
+   * @exception CharacterCodingException if the array has invalid UTF8 bytes 
+   */
+  public Text(byte[] utf8) throws CharacterCodingException {
+    set(utf8);
+  }
+  
+  /** Returns the raw bytes; only the first getLength() bytes are valid. */
+  public byte[] getBytes() {
+    return bytes;
+  }
+
+  /** Returns the number of valid bytes in the byte array */ 
+  public int getLength() {
+    return length;
+  }
+  
+  /**
+   * Returns the Unicode Scalar Value (32-bit integer value)
+   * for the character at <code>position</code>. Note that this
+   * method avoids using the converter or doing String instantiation.
+   * @param position byte offset of the character's lead byte
+   * @return the Unicode scalar value at position or -1
+   *          if the position is invalid or points to a
+   *          trailing byte
+   */
+  public int charAt(int position) {
+    if (position < 0 || position >= this.length) return -1; // out of range
+    // view only the valid bytes; the backing array may be longer than length
+    ByteBuffer bb = ByteBuffer.wrap(bytes, position, length - position);
+    return bytesToCodePoint(bb.slice());
+  }
+  
+  public int find(String what) {
+    return find(what, 0); // search from the first byte of the buffer
+  }
+  
+  /**
+   * Finds any occurrence of <code>what</code> in the backing
+   * buffer, starting at position <code>start</code>. The starting
+   * position is measured in bytes and the return value is in
+   * terms of byte position in the buffer. The backing buffer is
+   * not converted to a string for this operation.
+   * @param what string to search for; an empty string matches at start
+   * @param start byte offset at which the search begins
+   * @return byte position of the first occurrence of the search
+   *         string in the UTF-8 buffer or -1 if not found or if
+   *         <code>start</code> lies outside the valid bytes
+   */
+  public int find(String what, int start) {
+    if (start < 0 || start > length) return -1; // nothing to search
+    try {
+      // only the first length bytes of the backing array are valid
+      ByteBuffer src = ByteBuffer.wrap(this.bytes, 0, this.length);
+      ByteBuffer tgt = encode(what);
+      if (!tgt.hasRemaining()) return start; // empty target matches here
+      byte b = tgt.get();
+      src.position(start);
+      while (src.hasRemaining()) {
+        if (b == src.get()) { // matching first byte
+          src.mark(); // save position in loop
+          tgt.mark(); // save position in target
+          boolean found = true;
+          int pos = src.position()-1;
+          while (tgt.hasRemaining()) {
+            // src expired first, or the next bytes differ: no match here
+            if (!src.hasRemaining() || tgt.get() != src.get()) {
+              tgt.reset();
+              src.reset();
+              found = false;
+              break;
+            }
+          }
+          if (found) return pos;
+        }
+      }
+      return -1; // not found
+    } catch (CharacterCodingException e) {
+      // what could not be encoded (e.g. unpaired surrogate): no match
+      LOG.warn("Cannot encode search string as UTF-8", e);
+      return -1;
+    }
+  }  
+  /** Replace the contents with the UTF-8 encoding of a string.
+   * @exception CharacterCodingException if the string contains 
+   *       invalid codepoints or unpaired surrogate
+   */
+  public void set(String string) throws CharacterCodingException {
+    final ByteBuffer encoded = encode(string);
+    this.bytes = encoded.array();   // adopt the encoder's array as is
+    this.length = encoded.limit();  // the limit marks the end of the data
+  }
+
+  /** Set to a copy of a utf8 byte array, which is validated first
+   * @exception CharacterCodingException if the array contains invalid UTF8 code  
+   */
+  public void set(byte[] utf8) throws CharacterCodingException {
+    validateUTF8(utf8);
+    set(utf8, utf8.length);
+  }
+  
+  /** Copy the valid bytes of another text into this one. */
+  public void set(Text other) {
+    set(other.bytes, other.length);
+  }
+
+  private void set(byte[] utf8, int len ) { // copies the first len bytes
+    setCapacity(len);
+    System.arraycopy(utf8, 0, bytes, 0, len);
+    this.length = len;
+  }
+
+  /*
+   * Sets the capacity of this Text object to <em>at least</em>
+   * <code>len</code> bytes. If the current buffer is already that
+   * long, the buffer and its content are left unchanged.
+   * If <code>len</code> is larger than the current capacity, a new
+   * buffer of exactly <code>len</code> bytes is allocated and the
+   * existing contents of the buffer (if any) are discarded.
+   */
+  private void setCapacity( int len ) {
+    // compare against the requested size, not the current length field
+    if (bytes == null || bytes.length < len)
+      bytes = new byte[len];      
+  }
+   
+  /** 
+   * Convert text back to string
+   * @see java.lang.Object#toString()
+   */
+  public String toString() {
+    try {
+      // decode only the valid bytes; the backing array may be longer
+      return decode(ByteBuffer.wrap(bytes, 0, length), false);
+    } catch (CharacterCodingException e) { 
+      // bytes is supposed to be valid utf8, so this should never happen
+      return null;
+    }
+  }
+  
+  /** Deserialize: reads a zero-compressed length and that many bytes;
+   * throws MalformedInputException if they are not valid UTF-8.
+   * @see Writable#readFields(DataInput) */
+  public void readFields(DataInput in) throws IOException {
+    length = WritableUtils.readVInt(in);
+    setCapacity(length);
+    in.readFully(bytes, 0, length);
+    byte[] valid = (bytes.length == length) ? bytes : new byte[length];
+    if (valid != bytes) System.arraycopy(bytes, 0, valid, 0, length); // no stale tail
+    validateUTF8(valid);
+  }
+
+  /** Skips over one Text in the input. */
+  public static void skip(DataInput in) throws IOException {
+    int length = WritableUtils.readVInt(in);
+    in.skipBytes(length); // NOTE(review): skipBytes may skip fewer bytes than asked
+  }
+
+  /** serialize
+   * write this object to out
+   * length uses zero-compressed encoding
+   * @see Writable#write(DataOutput)
+   */
+  public void write(DataOutput out) throws IOException {
+    WritableUtils.writeVInt(out, length); // length prefix, zero-compressed
+    out.write(bytes, 0, length);
+  }
+
+  /** Compare two Texts bytewise using standard UTF8 ordering. */
+  public int compareTo(Object o) {
+    Text other = (Text)o;
+    if (other == this) {
+      return 0;                       // same instance, nothing to compare
+    }
+    return WritableComparator.compareBytes(this.bytes, 0, this.length,
+                                           other.bytes, 0, other.length);
+  }
+
+  /** Returns true iff <code>o</code> is a Text holding the same bytes. */
+  public boolean equals(Object o) {
+    if (o == this) {
+      return true;                    // identical instance
+    }
+    if (!(o instanceof Text)) {
+      return false;                   // null or a different type
+    }
+    Text other = (Text)o;
+    return length == other.length     // unequal lengths can never match
+      && WritableComparator.compareBytes(bytes, 0, length,
+                                         other.bytes, 0, other.length) == 0;
+  }
+
+  /** hash function: delegates to WritableComparator.hashBytes */
+  public int hashCode() {
+    return WritableComparator.hashBytes(bytes, length);
+  }
+
+  /** A WritableComparator optimized for Text keys. */
+  public static class Comparator extends WritableComparator {
+    public Comparator() { super(Text.class); }
+
+    /** Compares two serialized Texts by their payload bytes. */
+    public int compare(byte[] b1, int s1, int l1,
+                       byte[] b2, int s2, int l2) {
+      try {
+        int len1 = readVInt(b1, s1);  // payload sizes from the prefixes
+        int len2 = readVInt(b2, s2);
+        int from1 = s1 + WritableUtils.getVIntSize(len1);
+        int from2 = s2 + WritableUtils.getVIntSize(len2);
+        return compareBytes(b1, from1, len1, b2, from2, len2);
+      }catch(IOException e) {
+        LOG.warn(e);
+        throw new RuntimeException(e);
+      }
+    }
+  }
+
+  static {
+    // register the comparator above as the WritableComparator for Text
+    WritableComparator.define(Text.class, new Comparator());
+  }
+
+  /// STATIC UTILITIES FROM HERE DOWN
+  /**
+   * Converts the provided byte array to a String using the
+   * UTF-8 encoding. The whole array is decoded. If the input
+   * is malformed, throws a MalformedInputException.
+   */
+  public static String decode(byte[] utf8) throws CharacterCodingException {
+    return decode(ByteBuffer.wrap(utf8), false);
+  }
+  
+  /**
+   * Converts the provided byte array (all of it) to a String
+   * using the UTF-8 encoding. If <code>replace</code> is true,
+   * then malformed input is replaced with the
+   * substitution character, which is U+FFFD. Otherwise the
+   * method throws a MalformedInputException.
+   */
+  public static String decode(byte[] utf8, boolean replace) 
+    throws CharacterCodingException {
+    return decode(ByteBuffer.wrap(utf8), replace);
+  }
+  
+  /** Decodes with the shared DECODER while holding its monitor; replace
+   * selects REPLACE instead of REPORT for malformed input. */
+  private static String decode(ByteBuffer utf8, boolean replace) 
+    throws CharacterCodingException {
+    synchronized(DECODER) {
+      if (replace) {
+        DECODER.onMalformedInput(CodingErrorAction.REPLACE);
+        DECODER.onUnmappableCharacter(CodingErrorAction.REPLACE);
+      }
+      try {
+        return DECODER.decode(utf8).toString();
+      } finally {
+        // even on failure, leave the decoder in its default mode: REPORT
+        DECODER.onMalformedInput(CodingErrorAction.REPORT);
+        DECODER.onUnmappableCharacter(CodingErrorAction.REPORT);
+      }
+    }
+  }
+
+  /**
+   * Converts the provided String to bytes using the
+   * UTF-8 encoding. If the input is malformed,
+   * throws a MalformedInputException.
+   * @return ByteBuffer: bytes stored at ByteBuffer.array() 
+   *                     and length is ByteBuffer.limit()
+   */
+
+  public static ByteBuffer encode(String string)
+    throws CharacterCodingException {
+    return encode(string, false);
+  }
+
+  /**
+   * Converts the provided String to bytes using the UTF-8 encoding.
+   * If <code>replace</code> is true, malformed input (such as an
+   * unpaired surrogate) is replaced with the encoder's replacement
+   * bytes. Otherwise the method throws a MalformedInputException.
+   * @return ByteBuffer: bytes stored at ByteBuffer.array() 
+   *                     and length is ByteBuffer.limit()
+   */
+  public static ByteBuffer encode(String string, boolean replace)
+    throws CharacterCodingException {
+    synchronized(ENCODER) {
+      if (replace) {
+        ENCODER.onMalformedInput(CodingErrorAction.REPLACE);
+        ENCODER.onUnmappableCharacter(CodingErrorAction.REPLACE);
+      }
+      try {
+        return ENCODER.encode(CharBuffer.wrap(string.toCharArray()));
+      } finally {
+        // even on failure, leave the encoder in its default mode: REPORT
+        ENCODER.onMalformedInput(CodingErrorAction.REPORT);
+        ENCODER.onUnmappableCharacter(CodingErrorAction.REPORT);
+      }
+    }
+  }
+
+  /** Read a UTF8 encoded string from in: a zero-compressed byte count,
+   * then that many bytes, which are validated and decoded. */
+  public static String readString(DataInput in) throws IOException {
+    final int count = WritableUtils.readVInt(in);
+    final byte[] raw = new byte[count];
+    in.readFully(raw, 0, count);
+    validateUTF8(raw);
+    return decode(raw);
+  }
+
+  /** Write a UTF8 encoded string to out, prefixed by its zero-compressed
+   * byte count.  Returns the number of encoded bytes (prefix excluded). */
+  public static int writeString(DataOutput out, String s) throws IOException {
+    final ByteBuffer encoded = encode(s);
+    final int count = encoded.limit();
+    WritableUtils.writeVInt(out, count);
+    out.write(encoded.array(), 0, count);
+    return count;
+  }
+
+  ////// states for validateUTF8
+  
+  private static final int LEAD_BYTE = 0;
+
+  private static final int TRAIL_BYTE_1 = 1;
+
+  private static final int TRAIL_BYTE = 2;
+
+  /** 
+   * Check if a byte array contains valid utf-8. Input that ends in the
+   * middle of a multi-byte sequence is rejected as well.
+   * @param utf8 byte array
+   * @exception MalformedInputException if the byte array contains invalid utf-8
+   */
+  public static void validateUTF8(byte[] utf8) 
+    throws MalformedInputException {
+    int count = 0;
+    int leadByte = 0;
+    int length = 0;
+    int state = LEAD_BYTE;
+    while (count < utf8.length) {
+      int aByte = ((int) utf8[count] & 0xFF);
+
+      switch (state) {
+      case LEAD_BYTE:
+        leadByte = aByte;
+        length = bytesFromUTF8[aByte];
+
+        switch (length) {
+        case 0: // check for ASCII
+          if (leadByte > 0x7F)
+            throw new MalformedInputException(count);
+          // a one-byte character is complete: stay in LEAD_BYTE state
+          break;
+        case 1:
+          if (leadByte < 0xC2 || leadByte > 0xDF)
+            throw new MalformedInputException(count);
+          state = TRAIL_BYTE_1;
+          break;
+        case 2:
+          if (leadByte < 0xE0 || leadByte > 0xEF)
+            throw new MalformedInputException(count);
+          state = TRAIL_BYTE_1;
+          break;
+        case 3:
+          if (leadByte < 0xF0 || leadByte > 0xF4)
+            throw new MalformedInputException(count);
+          state = TRAIL_BYTE_1;
+          break;
+        default:
+          // too long! Longest valid UTF-8 is 4 bytes (lead + three)
+          // or if < 0 we got a trail byte in the lead byte position
+          throw new MalformedInputException(count);
+        } // switch (length)
+        break;
+
+      case TRAIL_BYTE_1:
+        if (leadByte == 0xF0 && aByte < 0x90)
+          throw new MalformedInputException(count);
+        if (leadByte == 0xF4 && aByte > 0x8F)
+          throw new MalformedInputException(count);
+        if (leadByte == 0xE0 && aByte < 0xA0)
+          throw new MalformedInputException(count);
+        if (leadByte == 0xED && aByte > 0x9F)
+          throw new MalformedInputException(count);
+        // falls through to regular trail-byte test!!
+      case TRAIL_BYTE:
+        if (aByte < 0x80 || aByte > 0xBF)
+          throw new MalformedInputException(count);
+        // after the last trail byte of a character, expect a new lead byte
+        state = (--length == 0) ? LEAD_BYTE : TRAIL_BYTE;
+        break;
+      } // switch (state)
+      count++;
+    }
+    if (state != LEAD_BYTE) // input ended inside a multi-byte sequence
+      throw new MalformedInputException(count);
+  }
+
+  /**
+   * Magic numbers for UTF-8. These are the number of bytes
+   * that <em>follow</em> a given lead byte. Trailing bytes
+   * have the value -1. The values 4 and 5 are presented in
+   * this table, even though valid UTF-8 cannot include the
+   * five and six byte sequences.
+   */
+  static final int[] bytesFromUTF8 =
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0,
+    // trail bytes
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
+    3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 };
+
+  /**
+   * Returns the next code point at the current position in the buffer
+   * (-1, without advancing, if the byte there is a trailing byte).
+   * Otherwise the buffer's position is advanced past the bytes read.
+   * Any mark set on this buffer will be changed by this method!
+   */
+  public static int bytesToCodePoint(ByteBuffer bytes) {
+    bytes.mark();
+    byte b = bytes.get();
+    bytes.reset(); // lead byte was only peeked at; the switch re-reads it
+    int extraBytesToRead = bytesFromUTF8[(int)(b & 0xFF)];
+    if (extraBytesToRead < 0) return -1; // trailing byte!
+    int ch = 0;
+
+    switch (extraBytesToRead) { // every case falls through to the next
+    case 5: ch += (int)(bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
+    case 4: ch += (int)(bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
+    case 3: ch += (int)(bytes.get() & 0xFF); ch <<= 6;
+    case 2: ch += (int)(bytes.get() & 0xFF); ch <<= 6;
+    case 1: ch += (int)(bytes.get() & 0xFF); ch <<= 6;
+    case 0: ch += (int)(bytes.get() & 0xFF);
+    }
+    ch -= offsetsFromUTF8[extraBytesToRead]; // subtract the per-length offset
+
+    return ch;
+  }
+
+  
+  static final int offsetsFromUTF8[] =
+  { 0x00000000, 0x00003080,
+    0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };
+
+  /**
+   * For the given string, returns the number of UTF-8 bytes
+   * required to encode the string. A valid surrogate pair counts
+   * as one 4-byte sequence and an unpaired surrogate as 3 bytes.
+   * Every char of the string is counted, including U+FFFF, which
+   * has the same value as the CharacterIterator.DONE sentinel.
+   * @param string text to encode
+   * @return number of UTF-8 bytes required to encode
+   */
+  public static int utf8Length(String string) {
+    // scan by index: iterating until CharacterIterator.DONE would stop
+    // at the first U+FFFF contained in the string
+    int size = 0;
+    final int n = string.length();
+    for (int i = 0; i < n; i++) {
+      char ch = string.charAt(i);
+      if (ch < 0x80) {
+        size++;
+      } else if (ch < 0x800) {
+        size += 2;
+      } else if (ch >= 0xD800 && ch < 0xDC00
+                 && i + 1 < n
+                 && string.charAt(i + 1) >= 0xDC00
+                 && string.charAt(i + 1) < 0xE000) {
+        size += 4; // valid surrogate pair: one 4-byte sequence
+        i++;       // consume the low surrogate as well
+      } else {
+        // remaining BMP chars, and unpaired surrogates
+        size += 3;
+      }
+    }
+    return size;
+  }
+}

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/io/UTF8.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/io/UTF8.java?rev=425661&r1=425660&r2=425661&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/io/UTF8.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/io/UTF8.java Wed Jul 26 01:06:53 2006
@@ -28,6 +28,7 @@
  * <p>Also includes utilities for efficiently reading and writing UTF-8.
  *
  * @author Doug Cutting
+ * @deprecated replaced by Text
  */
 public class UTF8 implements WritableComparable {
   private static final Log LOG= LogFactory.getLog("org.apache.hadoop.io.UTF8");

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/io/WritableComparator.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/io/WritableComparator.java?rev=425661&r1=425660&r2=425661&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/io/WritableComparator.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/io/WritableComparator.java Wed Jul 26 01:06:53 2006
@@ -158,4 +158,38 @@
       (readInt(bytes, start+4) & 0xFFFFFFFFL);
   }
 
+  /**
+   * Reads a zero-compressed encoded long from a byte array and returns it.
+   * @param bytes byte array holding the encoded long
+   * @param start index of the first (marker) byte
+   * @throws java.io.IOException if the array ends before the value does
+   * @return deserialized long
+   */
+  static long readVLong(byte[] bytes, int start) throws IOException {
+      int len = bytes[start];
+      if (len >= -112) {
+          return len; // one-byte form: the byte is the value itself
+      }
+      boolean negative = (len < -120); // writeVLong cleared the sign bit
+      len = negative ? -(len + 120) : -(len + 112);
+      if (start+1+len>bytes.length)
+          throw new IOException(
+                  "Not enough number of bytes for a zero-compressed integer");
+      long i = 0;
+      for (int idx = 0; idx < len; idx++) {
+          i = (i << 8) | (bytes[start+1+idx] & 0xFF);
+      }
+      return negative ? (i | 0x8000000000000000L) : i;
+  }
+  
+  /**
+   * Reads a zero-compressed encoded integer from a byte array and returns it.
+   * @param bytes byte array with the encoded integer
+   * @param start start index
+   * @throws java.io.IOException 
+   * @return deserialized integer
+   */
+  static int readVInt(byte[] bytes, int start) throws IOException {
+      return (int) readVLong(bytes, start); // narrowing keeps the low 32 bits
+  }
 }

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/io/WritableUtils.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/io/WritableUtils.java?rev=425661&r1=425660&r2=425661&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/io/WritableUtils.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/io/WritableUtils.java Wed Jul 26 01:06:53 2006
@@ -17,6 +17,7 @@
 package org.apache.hadoop.io;
 
 import java.io.*;
+
 import org.apache.hadoop.mapred.JobConf;
 import java.util.zip.GZIPInputStream;
 import java.util.zip.GZIPOutputStream;
@@ -229,6 +230,131 @@
     } catch (IOException e) {
       throw new RuntimeException("Error writing/reading clone buffer", e);
     }
+  }
+ 
+  /**
+   * Serializes an integer to a binary stream with zero-compressed encoding.
+   * This delegates to writeVLong, so the layout is the same as for a long:
+   * for -112 <= i <= 127, only one byte is used with the actual value.
+   * For other values of i, the first byte value indicates whether the
+   * integer is positive or negative, and the number of bytes that follow.
+   * If the first byte value v is between -113 and -120, the value is
+   * positive and -(v+112) bytes follow; if v is between -121 and -128,
+   * the value is negative and -(v+120) bytes follow. Bytes are
+   * stored in the high-non-zero-byte-first order.
+   *
+   * @param stream Binary output stream
+   * @param i Integer to be serialized
+   * @throws java.io.IOException 
+   */
+  public static void writeVInt(DataOutput stream, int i) throws IOException {
+      writeVLong(stream, i);
+  }
+  
+  /**
+   * Serializes a long to a binary stream with zero-compressed encoding.
+   * For -112 <= i <= 127, only one byte is used with the actual value.
+   * For other values the first byte v gives the sign and the number of
+   * bytes that follow: -113..-120 means positive with -(v+112) bytes,
+   * -121..-128 means negative with -(v+120) bytes. A negative value has
+   * its sign bit cleared before its bytes are written. Bytes are
+   * stored in the high-non-zero-byte-first order.
+   * NOTE(review): Long.MIN_VALUE is 0 once the sign bit is cleared, so it
+   * is written as marker -120 plus eight zero bytes - confirm intent.
+   * 
+   * @param stream Binary output stream
+   * @param i Long to be serialized
+   * @throws java.io.IOException 
+   */
+  public static void writeVLong(DataOutput stream, long i) throws IOException {
+      if (i >= -112 && i <= 127) {
+          stream.writeByte((byte)i);
+          return;
+      }
+      
+      int len = -112;
+      if (i < 0) {
+          i &= 0x7FFFFFFFFFFFFFFFL; // reset the sign bit
+          len = -120;
+      }
+      
+      long tmp = i;
+      while (tmp != 0) { // one step down per significant byte
+          tmp = tmp >> 8;
+          len--;
+      }
+      
+      stream.writeByte((byte)len);
+      
+      len = (len < -120) ? -(len + 120) : -(len + 112);
+      
+      for (int idx = len; idx != 0; idx--) { // most significant byte first
+          int shiftbits = (idx - 1) * 8;
+          long mask = 0xFFL << shiftbits;
+          stream.writeByte((byte)((i & mask) >> shiftbits));
+      }
+  }
+  
+
+  /**
+   * Reads a zero-compressed encoded long from input stream and returns it.
+   * @param stream Binary input stream
+   * @throws java.io.IOException if the stream fails or ends early
+   * @return deserialized long from stream.
+   */
+  public static long readVLong(DataInput stream) throws IOException {
+      int len = stream.readByte();
+      if (len >= -112) {
+          return len; // one-byte form: the byte is the value itself
+      }
+      boolean negative = (len < -120); // writeVLong cleared the sign bit
+      len = negative ? -(len + 120) : -(len + 112);
+      byte[] barr = new byte[len];
+      stream.readFully(barr);
+      long i = 0;
+      for (int idx = 0; idx < len; idx++) {
+          i = (i << 8) | (barr[idx] & 0xFF);
+      }
+      return negative ? (i | 0x8000000000000000L) : i;
+  }
+
+  /**
+   * Reads a zero-compressed encoded integer from input stream and returns it.
+   * @param stream Binary input stream
+   * @throws java.io.IOException 
+   * @return deserialized integer from stream.
+   */
+  public static int readVInt(DataInput stream) throws IOException {
+      return (int) readVLong(stream); // narrowing keeps the low 32 bits
+  }
+  
+
+  /**
+   * Get the encoded length if an integer is stored in a variable-length format
+   * @param i a long
+   * @return the encoded length in bytes, marker byte included
+   */
+  
+  public static int getVIntSize(long i) {
+      if (i >= -112 && i <= 127) {
+          return 1;
+      }
+      
+      int len = -112;
+      if (i < 0) {
+          i &= 0x7FFFFFFFFFFFFFFFL; // reset the sign bit
+          len = -120;
+      }
+      
+      long tmp = i;
+      while (tmp != 0) { // one step down per significant byte
+          tmp = tmp >> 8;
+          len--;
+      }
+      
+      len = (len < -120) ? -(len + 120) : -(len + 112);
+      
+      return len+1; // payload bytes plus the leading marker byte
   }
   
 }

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/record/BinaryInputArchive.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/record/BinaryInputArchive.java?rev=425661&r1=425660&r2=425661&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/record/BinaryInputArchive.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/record/BinaryInputArchive.java Wed Jul 26 01:06:53 2006
@@ -22,6 +22,8 @@
 import java.io.DataInputStream;
 import java.io.InputStream;
 
+import org.apache.hadoop.io.WritableUtils;
+
 
 /**
  *
@@ -61,11 +63,11 @@
     }
     
     public int readInt(String tag) throws IOException {
-        return Utils.readInt(in);
+        return WritableUtils.readVInt(in);
     }
     
     public long readLong(String tag) throws IOException {
-        return Utils.readLong(in);
+        return WritableUtils.readVLong(in);
     }
     
     public float readFloat(String tag) throws IOException {
@@ -77,14 +79,14 @@
     }
     
     public String readString(String tag) throws IOException {
-        int len = Utils.readInt(in);
+        int len = readInt(tag);
         byte[] chars = new byte[len];
         in.readFully(chars);
         return new String(chars, "UTF-8");
     }
     
     public ByteArrayOutputStream readBuffer(String tag) throws IOException {
-        int len = Utils.readInt(in);
+        int len = readInt(tag);
         ByteArrayOutputStream buf = new ByteArrayOutputStream(len);
         byte[] arr = new byte[len];
         in.readFully(arr);
@@ -101,13 +103,13 @@
     public void endRecord(String tag) throws IOException {}
     
     public Index startVector(String tag) throws IOException {
-        return new BinaryIndex(Utils.readInt(in));
+        return new BinaryIndex(readInt(tag));
     }
     
     public void endVector(String tag) throws IOException {}
     
     public Index startMap(String tag) throws IOException {
-        return new BinaryIndex(Utils.readInt(in));
+        return new BinaryIndex(readInt(tag));
     }
     
     public void endMap(String tag) throws IOException {}

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/record/BinaryOutputArchive.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/record/BinaryOutputArchive.java?rev=425661&r1=425660&r2=425661&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/record/BinaryOutputArchive.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/record/BinaryOutputArchive.java Wed Jul 26 01:06:53 2006
@@ -24,6 +24,8 @@
 import java.io.DataOutputStream;
 import java.io.OutputStream;
 
+import org.apache.hadoop.io.WritableUtils;
+
 /**
  *
  * @author Milind Bhandarkar
@@ -50,11 +52,11 @@
     }
     
     public void writeInt(int i, String tag) throws IOException {
-        Utils.writeInt(out, i);
+        WritableUtils.writeVInt(out, i);
     }
     
     public void writeLong(long l, String tag) throws IOException {
-        Utils.writeLong(out, l);
+        WritableUtils.writeVLong(out, l);
     }
     
     public void writeFloat(float f, String tag) throws IOException {
@@ -67,14 +69,14 @@
     
     public void writeString(String s, String tag) throws IOException {
         byte[] chars = s.getBytes("UTF-8");
-        Utils.writeInt(out, chars.length);
+        writeInt(chars.length, tag);
         out.write(chars);
     }
     
     public void writeBuffer(ByteArrayOutputStream buf, String tag)
     throws IOException {
         byte[] barr = buf.toByteArray();
-        Utils.writeInt(out, barr.length);
+        writeInt(barr.length, tag);
         out.write(barr);
     }
     
@@ -87,13 +89,13 @@
     public void endRecord(Record r, String tag) throws IOException {}
     
     public void startVector(ArrayList v, String tag) throws IOException {
-        Utils.writeInt(out, v.size());
+        writeInt(v.size(), tag);
     }
     
     public void endVector(ArrayList v, String tag) throws IOException {}
     
     public void startMap(TreeMap v, String tag) throws IOException {
-        Utils.writeInt(out, v.size());
+        writeInt(v.size(), tag);
     }
     
     public void endMap(TreeMap v, String tag) throws IOException {}

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/record/Utils.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/record/Utils.java?rev=425661&r1=425660&r2=425661&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/record/Utils.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/record/Utils.java Wed Jul 26 01:06:53 2006
@@ -46,7 +46,8 @@
      * @param stream Binary output stream
      * @param i Integer to be serialized
      * @throws java.io.IOException 
-     */
+     * @deprecated replaced by {@link org.apache.hadoop.io.WritableUtils#writeVInt}
+      */
     static void writeInt(DataOutput stream, int i) throws IOException {
         if (i >= -120 && i <= 127) {
             stream.writeByte((byte)i);
@@ -89,7 +90,8 @@
      * 
      * @param stream Binary output stream
      * @param i Long to be serialized
-     * @throws java.io.IOException 
+     * @throws java.io.IOException
+     * @deprecated replaced by {@link org.apache.hadoop.io.WritableUtils#writeVLong}
      */
     static void writeLong(DataOutput stream, long i) throws IOException {
         if (i >= -112 && i <= 127) {
@@ -125,6 +127,7 @@
      * @param stream Binary input stream
      * @throws java.io.IOException 
      * @return deserialized integer from stream.
+     * @deprecated replaced by {@link org.apache.hadoop.io.WritableUtils#readVInt}
      */
     static int readInt(DataInput stream) throws IOException {
         int len = stream.readByte();
@@ -147,7 +150,8 @@
      * @param stream Binary input stream
      * @throws java.io.IOException 
      * @return deserialized long from stream.
-     */
+     * @deprecated replaced by {@link org.apache.hadoop.io.WritableUtils#readVLong}
+      */
     static long readLong(DataInput stream) throws IOException {
         int len = stream.readByte();
         if (len >= -112) {

Added: lucene/hadoop/trunk/src/test/org/apache/hadoop/io/TestText.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/test/org/apache/hadoop/io/TestText.java?rev=425661&view=auto
==============================================================================
--- lucene/hadoop/trunk/src/test/org/apache/hadoop/io/TestText.java (added)
+++ lucene/hadoop/trunk/src/test/org/apache/hadoop/io/TestText.java Wed Jul 26 01:06:53 2006
@@ -0,0 +1,223 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.io;
+
+import junit.framework.TestCase;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.util.Random;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+/** Unit tests for {@link Text}. */
+public class TestText extends TestCase {
+  private static final Log LOG= LogFactory.getLog("org.apache.hadoop.io.TestText");
+  private static final int NUM_ITERATIONS = 100;
+  public TestText(String name) { super(name); }
+  // Seeded with a constant, so the pseudo-random sequence is reproducible.
+  private static final Random RANDOM = new Random(1);
+  // Length argument that makes getTestString(int) choose a random length itself.
+  private static final int RAND_LEN = -1;
+  
+  /** Builds a random String of defined, non-surrogate BMP characters;
+   *  len is the char count, or RAND_LEN for a random count below 1000. */
+  private static String getTestString(int len) throws Exception {
+    StringBuffer buffer = new StringBuffer();
+    int length = (len==RAND_LEN) ? RANDOM.nextInt(1000) : len;
+    while (buffer.length()<length) {
+        // only BMP code points are wanted, so draw from that range directly
+        char candidate = (char) RANDOM.nextInt(Character.MIN_SUPPLEMENTARY_CODE_POINT);
+        // skip unassigned code points and surrogate code units
+        if (Character.isDefined(candidate) &&
+                !Character.isHighSurrogate(candidate) &&
+                !Character.isLowSurrogate(candidate)) {
+            // Append the single char itself; appending a char[2] scratch array
+            // whole would also add its unused NUL slot after every character.
+            buffer.append(candidate);
+        }
+    }
+    return buffer.toString();
+  }
+  
+  public static String getTestString() throws Exception {
+    return getTestString(RAND_LEN); // RAND_LEN asks for a randomly chosen length
+  }
+  
+  /** Returns a random string longer than Short.MAX_VALUE characters. */
+  public static String getLongString() throws Exception {
+      // seed must be non-empty: an empty one would never grow the buffer
+      String str = getTestString(1 + RANDOM.nextInt(999));
+      int length = Short.MAX_VALUE+str.length();
+      StringBuffer buffer = new StringBuffer();
+      while(buffer.length()<length) { buffer.append(str); }
+      return buffer.toString();
+  }
+
+  public void testWritable() throws Exception {
+    for (int i = 0; i < NUM_ITERATIONS; i++) {
+      try {
+        String str;
+        if(i == 0 )
+            str = getLongString();
+        else
+            str = getTestString();
+        TestWritable.testWritable(new Text(str));
+      } catch (IOException e) {
+          LOG.info(e);
+      }
+    }
+  }
+
+
+  public void testCoding() throws Exception {
+    for (int i = 0; i < NUM_ITERATIONS; i++) {
+      try {
+          // generate a random string
+          String before;
+          if(i == 0 )
+              before = getLongString();
+          else
+              before = getTestString();
+    
+          // test string to utf8
+          ByteBuffer bb = Text.encode(before);
+          
+          byte[] utf8Text = bb.array();
+          byte[] utf8Java = before.getBytes("UTF-8");
+          assertEquals(0, WritableComparator.compareBytes(
+                      utf8Text, 0, bb.limit(),
+                      utf8Java, 0, utf8Java.length));
+              
+          // test utf8 to string
+          String after = Text.decode(utf8Java);
+          assertTrue(before.equals(after));
+      }catch(CharacterCodingException e) {
+          LOG.info( e );
+      }
+    }
+  }
+  
+  
+  public void testIO() throws Exception {
+    DataOutputBuffer out = new DataOutputBuffer();
+    DataInputBuffer in = new DataInputBuffer();
+
+    for (int i = 0; i < NUM_ITERATIONS; i++) {
+        try {
+          // generate a random string
+          String before;          
+          if(i == 0 )
+              before = getLongString();
+          else
+              before = getTestString();
+
+          // write it
+          out.reset();
+          Text.writeString(out, before);
+
+          // test that it reads correctly
+          in.reset(out.getData(), out.getLength());
+          String after = Text.readString(in);
+          assertTrue(before.equals(after));
+    
+          // Test compatibility with Java's other decoder 
+          int strLenSize = WritableUtils.getVIntSize(Text.utf8Length(before));
+          String after2 = new String(out.getData(), strLenSize, 
+          out.getLength()-strLenSize, "UTF-8");
+              assertTrue(before.equals(after2));
+        }catch(IOException e) {
+            LOG.info(e);
+        }
+      }
+  }
+
+  /**
+   * Checks that Text.Comparator, working on serialized bytes, agrees with
+   * Text.compareTo, and that two Texts built from the same string compare
+   * as equal both ways.
+   *
+   * @throws Exception on any failure; an IOException is not caught, so an
+   *         I/O error fails the test instead of only being logged
+   */
+  public void testCompare() throws Exception {
+    DataOutputBuffer out1 = new DataOutputBuffer();
+    DataOutputBuffer out2 = new DataOutputBuffer();
+    DataOutputBuffer out3 = new DataOutputBuffer();
+    Text.Comparator comparator = new Text.Comparator();
+    for (int i=0; i<NUM_ITERATIONS; i++ ) {
+      // reset output buffer
+      out1.reset();
+      out2.reset();
+      out3.reset();
+
+      // generate two random strings, each drawn exactly once;
+      // iteration 0 uses long strings
+      String str1 = (i == 0) ? getLongString() : getTestString();
+      String str2 = (i == 0) ? getLongString() : getTestString();
+
+      // convert to texts
+      Text txt1 = new Text(str1);
+      Text txt2 = new Text(str2);
+      Text txt3 = new Text(str1);
+
+      // serialize them
+      txt1.write(out1);
+      txt2.write(out2);
+      txt3.write(out3);
+
+      // compare two strings by looking at their binary formats
+      int ret1 = comparator.compare(out1.getData(), 0, out1.getLength(),
+              out2.getData(), 0, out2.getLength());
+      // compare two strings
+      int ret2 = txt1.compareTo(txt2);
+
+      assertEquals(ret1, ret2);
+
+      // test equal: txt1 and txt3 come from the same string
+      assertEquals(0, txt1.compareTo(txt3));
+      // each getData() is paired with the getLength() of the same buffer
+      assertEquals(0, comparator.compare(out1.getData(), 0, out1.getLength(),
+              out3.getData(), 0, out3.getLength()));
+    }
+  }
+      
+  /**
+   * Checks Text.find on absent substrings and on a three-byte character;
+   * exceptions propagate so that they fail the test rather than being logged.
+   */
+  public void testFind() throws Exception {
+    Text text = new Text("abcd\u20acbdcd\u20ac");
+    assertEquals(-1, text.find("abd"));
+    assertEquals(-1, text.find("ac"));
+    assertEquals(4, text.find("\u20ac"));
+    assertEquals(11, text.find("\u20ac", 5));
+  }
+  
+  /** Runs the tests of this class one after another, without a test runner. */
+  public static void main(String[] args) throws Exception {
+    TestText tests = new TestText("main");
+    tests.testIO();
+    tests.testCompare();
+    tests.testCoding();
+    tests.testWritable();
+    tests.testFind();
+  }
+}