You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by cu...@apache.org on 2006/07/26 10:06:53 UTC
svn commit: r425661 - in /lucene/hadoop/trunk: ./
src/java/org/apache/hadoop/io/ src/java/org/apache/hadoop/record/
src/test/org/apache/hadoop/io/
Author: cutting
Date: Wed Jul 26 01:06:53 2006
New Revision: 425661
URL: http://svn.apache.org/viewvc?rev=425661&view=rev
Log:
HADOOP-302. Add new Text class to replace UTF8. Also refactor utility methods for zero-compressed integers.
Added:
lucene/hadoop/trunk/src/java/org/apache/hadoop/io/Text.java
lucene/hadoop/trunk/src/test/org/apache/hadoop/io/TestText.java
Modified:
lucene/hadoop/trunk/CHANGES.txt
lucene/hadoop/trunk/src/java/org/apache/hadoop/io/UTF8.java
lucene/hadoop/trunk/src/java/org/apache/hadoop/io/WritableComparator.java
lucene/hadoop/trunk/src/java/org/apache/hadoop/io/WritableUtils.java
lucene/hadoop/trunk/src/java/org/apache/hadoop/record/BinaryInputArchive.java
lucene/hadoop/trunk/src/java/org/apache/hadoop/record/BinaryOutputArchive.java
lucene/hadoop/trunk/src/java/org/apache/hadoop/record/Utils.java
Modified: lucene/hadoop/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/CHANGES.txt?rev=425661&r1=425660&r2=425661&view=diff
==============================================================================
--- lucene/hadoop/trunk/CHANGES.txt (original)
+++ lucene/hadoop/trunk/CHANGES.txt Wed Jul 26 01:06:53 2006
@@ -85,6 +85,11 @@
24. HADOOP-385. Fix some bugs in record io code generation.
(Milind Bhandarkar via cutting)
+25. HADOOP-302. Add new Text class to replace UTF8, removing
+ limitations of that class. Also refactor utility methods for
+ writing zero-compressed integers (VInts and VLongs).
+ (Hairong Kuang via cutting)
+
Release 0.4.0 - 2006-06-28
Added: lucene/hadoop/trunk/src/java/org/apache/hadoop/io/Text.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/io/Text.java?rev=425661&view=auto
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/io/Text.java (added)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/io/Text.java Wed Jul 26 01:06:53 2006
@@ -0,0 +1,568 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.io;
+
+import java.io.IOException;
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CodingErrorAction;
+import java.nio.charset.MalformedInputException;
+import java.text.CharacterIterator;
+import java.text.StringCharacterIterator;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+/** This class stores text using standard UTF8 encoding. It provides methods
+ * to serialize, deserialize, and compare texts at byte level. The type of
+ * length is integer and is serialized using zero-compressed format. <p>In
+ * addition, it provides methods for string traversal without converting the
+ * byte array to a string. <p>Also includes utilities for
+ * serializing/deserializing a string, encoding/decoding a string, checking
+ * if a byte array contains valid UTF8 code, and calculating the length of an
+ * encoded string.
+ */
+public class Text implements WritableComparable {
+ private static final Log LOG= LogFactory.getLog("org.apache.hadoop.io.Text");
+
+ private static final CharsetDecoder DECODER =
+ Charset.forName("UTF-8").newDecoder().
+ onMalformedInput(CodingErrorAction.REPORT).
+ onUnmappableCharacter(CodingErrorAction.REPORT);
+ private static final CharsetEncoder ENCODER =
+ Charset.forName("UTF-8").newEncoder().
+ onMalformedInput(CodingErrorAction.REPORT).
+ onUnmappableCharacter(CodingErrorAction.REPORT);
+
+ private static final byte [] EMPTY_BYTES = new byte[0];
+
+ private byte[] bytes;
+ private int length;
+
+ public Text() { // creates an empty text
+ bytes = EMPTY_BYTES; // zero-length array; the length field keeps its default of 0
+ }
+
+ /** Construct from a string.
+ * Delegates to {@link #set(String)}, which UTF-8 encodes the string.
+ * @param string the text to store
+ * @exception CharacterCodingException if the string contains
+ * invalid codepoints or unpaired surrogates
+ */
+ public Text(String string) throws CharacterCodingException {
+ set(string);
+ }
+
+ /** Construct from another text.
+ * Delegates to {@link #set(Text)}, which copies the other text's bytes.
+ * @param utf8 the text to copy
+ */
+ public Text(Text utf8) {
+ set(utf8);
+ }
+
+ /** Construct from a byte array.
+ * Delegates to {@link #set(byte[])}, which validates and then copies the
+ * array.
+ * @param utf8 UTF-8 encoded bytes
+ * @exception CharacterCodingException if the array has invalid UTF8 bytes
+ */
+ public Text(byte[] utf8) throws CharacterCodingException {
+ set(utf8);
+ }
+
+ /** Returns the raw bytes.
+ * This is the internal backing array itself, not a copy. Only the first
+ * {@link #getLength()} bytes hold the current text.
+ */
+ public byte[] getBytes() {
+ return bytes;
+ }
+
+ /** Returns the number of bytes of encoded text held in the byte array
+ * returned by {@link #getBytes()}.
+ */
+ public int getLength() {
+ return length;
+ }
+
+ /**
+ * Returns the Unicode Scalar Value (32-bit integer value)
+ * for the character at <code>position</code>. Note that this
+ * method avoids using the converter or doing String instantiation.
+ * @param position byte offset into the encoded text
+ * @return the Unicode scalar value at position or -1
+ * if the position is invalid or points to a
+ * trailing byte
+ */
+ public int charAt(int position) {
+   // Valid offsets are 0 .. length-1. Accepting position == length would
+   // read past the text (stale bytes) or underflow the buffer.
+   if (position < 0 || position >= this.length) return -1;
+
+   // Bound the view to the valid bytes so spare capacity is never decoded.
+   ByteBuffer bb = ByteBuffer.wrap(bytes, position, this.length - position);
+   return bytesToCodePoint(bb.slice());
+ }
+
+ public int find(String what) { // searches from byte position 0
+ return find(what, 0);
+ }
+
+ /**
+ * Finds the first occurrence of <code>what</code> in the backing
+ * buffer, starting at position <code>start</code>. The starting
+ * position is measured in bytes and the return value is in
+ * terms of byte position in the buffer. The backing buffer is
+ * not converted to a string for this operation. Only the first
+ * {@link #getLength()} bytes are searched.
+ * @param what the string to look for; it is UTF-8 encoded before matching
+ * @param start byte offset at which to begin searching
+ * @return byte position of the first occurrence of the search
+ * string in the UTF-8 buffer, or -1 if it is not found, if
+ * <code>start</code> is out of range, or if <code>what</code> cannot be
+ * encoded. An empty <code>what</code> matches at <code>start</code>.
+ */
+ public int find(String what, int start) {
+   if (start < 0 || start > length) return -1; // nothing to search
+   try {
+     ByteBuffer tgt = encode(what);
+     final int tgtStart = tgt.position();
+     final int tgtLen = tgt.remaining();
+     if (tgtLen == 0) return start; // empty pattern, as String.indexOf does
+
+     // Last offset at which the whole pattern still fits inside the text.
+     final int lastStart = length - tgtLen;
+     for (int pos = start; pos <= lastStart; pos++) {
+       int matched = 0;
+       while (matched < tgtLen
+              && bytes[pos + matched] == tgt.get(tgtStart + matched)) {
+         matched++;
+       }
+       if (matched == tgtLen) return pos;
+     }
+     return -1; // not found
+   } catch (CharacterCodingException e) {
+     // Reached when what cannot be encoded, e.g. an unpaired surrogate.
+     LOG.warn("Cannot UTF-8 encode the search string", e);
+     return -1;
+   }
+ }
+ /** Set to contain the contents of a string.
+ * The string is UTF-8 encoded by {@link #encode(String)}.
+ * @param string the text to store
+ * @exception CharacterCodingException if the string contains
+ * invalid codepoints or unpaired surrogate
+ */
+ public void set(String string) throws CharacterCodingException {
+ ByteBuffer bb = encode(string);
+ bytes = bb.array(); // adopt the encoder's array instead of copying it
+ length = bb.limit(); // number of encoded bytes
+ }
+
+ /** Set to a utf8 byte array.
+ * The whole array is validated first and then copied into this text.
+ * @param utf8 UTF-8 encoded bytes
+ * @exception CharacterCodingException if the array contains invalid UTF8 code
+ */
+ public void set(byte[] utf8) throws CharacterCodingException {
+ validateUTF8(utf8);
+ set(utf8, utf8.length);
+ }
+
+ /** Copy a text.
+ * @param other the text whose first <code>other.length</code> bytes are
+ * copied into this one
+ */
+ public void set(Text other) {
+ set(other.bytes, other.length);
+ }
+
+ private void set(byte[] utf8, int len ) { // copies the first len bytes of utf8
+ setCapacity(len); // make sure the backing array can hold len bytes
+ System.arraycopy(utf8, 0, bytes, 0, len);
+ this.length = len;
+ }
+
+ /*
+ * Sets the capacity of this Text object to <em>at least</em>
+ * <code>len</code> bytes. If the current buffer is already that
+ * long, then the capacity and existing content of the buffer are
+ * unchanged. If <code>len</code> is larger
+ * than the current capacity, a new buffer of exactly
+ * <code>len</code> bytes is allocated and the existing contents
+ * (if any) are discarded.
+ */
+ private void setCapacity( int len ) {
+   // Size against the requested capacity, not the current content length.
+   // Using the length field here left the buffer too small when growing.
+   if (bytes == null || bytes.length < len)
+     bytes = new byte[len];
+ }
+
+ /**
+ * Convert text back to string. Only the first {@link #getLength()}
+ * bytes of the backing array are decoded.
+ * @return the decoded string, or null if the bytes are not valid UTF-8
+ * @see java.lang.Object#toString()
+ */
+ public String toString() {
+   try {
+     // The backing array may be longer than the text (it is reused and
+     // never shrunk), so decode just the valid prefix.
+     return decode(ByteBuffer.wrap(bytes, 0, length), false);
+   } catch (CharacterCodingException e) {
+     // bytes is supposed to contain valid utf8, therefore,
+     // this should never happen
+     return null;
+   }
+ }
+
+ /** deserialize
+ * check if the received bytes are valid utf8 code.
+ * if not throws MalformedInputException
+ * @see Writable#readFields(DataInput)
+ */
+ public void readFields(DataInput in) throws IOException {
+ length = WritableUtils.readVInt(in);
+ setCapacity(length);
+ in.readFully(bytes, 0, length);
+ validateUTF8(bytes);
+ }
+
+ /** Skips over one Text in the input.
+ * @param in the input positioned at the start of a serialized Text
+ * @exception IOException if the bytes of the Text cannot all be skipped
+ */
+ public static void skip(DataInput in) throws IOException {
+   int remaining = WritableUtils.readVInt(in);
+   // DataInput.skipBytes may skip fewer bytes than requested, so keep
+   // going until the whole payload is consumed.
+   while (remaining > 0) {
+     int skipped = in.skipBytes(remaining);
+     if (skipped <= 0) {
+       throw new IOException("Not able to skip " + remaining
+                             + " bytes, possibly due to end of input.");
+     }
+     remaining -= skipped;
+   }
+ }
+
+ /** serialize
+ * write this object to out
+ * length uses zero-compressed encoding and is followed by the first
+ * <code>length</code> bytes of the backing array
+ * @param out the output to write to
+ * @see Writable#write(DataOutput)
+ */
+ public void write(DataOutput out) throws IOException {
+ WritableUtils.writeVInt(out, length); // length prefix, zero-compressed
+ out.write(bytes, 0, length);
+ }
+
+ /** Compare two Texts bytewise using standard UTF8 ordering. */
+ public int compareTo(Object o) {
+ Text that = (Text)o;
+ if(this == that)
+ return 0;
+ else
+ return WritableComparator.compareBytes(bytes, 0, length,
+ that.bytes, 0, that.length);
+ }
+
+ /** Returns true iff <code>o</code> is a Text with the same contents. */
+ public boolean equals(Object o) {
+ if (!(o instanceof Text))
+ return false;
+ Text that = (Text)o;
+ if (this == that)
+ return true;
+ else if (this.length != that.length)
+ return false;
+ else
+ return WritableComparator.compareBytes(bytes, 0, length,
+ that.bytes, 0, that.length) == 0;
+ }
+
+ /** hash function
+ * Hashes the first <code>length</code> bytes of the backing array via
+ * WritableComparator.hashBytes.
+ */
+ public int hashCode() {
+ return WritableComparator.hashBytes(bytes, length);
+ }
+
+ /** A WritableComparator for Text keys that compares serialized bytes. */
+ public static class Comparator extends WritableComparator {
+   public Comparator() {
+     super(Text.class);
+   }
+
+   /**
+    * Compares two serialized Texts: each starts with a zero-compressed
+    * length, followed by that many bytes of text.
+    */
+   public int compare(byte[] b1, int s1, int l1,
+                      byte[] b2, int s2, int l2) {
+     int textLen1;
+     int textLen2;
+     try {
+       textLen1 = readVInt(b1, s1);
+       textLen2 = readVInt(b2, s2);
+     } catch (IOException e) {
+       LOG.warn(e);
+       throw new RuntimeException(e);
+     }
+     // step over each length prefix to reach the text bytes
+     int textStart1 = s1 + WritableUtils.getVIntSize(textLen1);
+     int textStart2 = s2 + WritableUtils.getVIntSize(textLen2);
+     return compareBytes(b1, textStart1, textLen1,
+                         b2, textStart2, textLen2);
+   }
+ }
+
+ static {
+ // register this comparator for Text.class when the class is initialized
+ WritableComparator.define(Text.class, new Comparator());
+ }
+
+ /// STATIC UTILITIES FROM HERE DOWN
+ /**
+ * Converts the provided byte array to a String using the
+ * UTF-8 encoding. If the input is malformed,
+ * throws a MalformedInputException. The whole array is decoded.
+ * @param utf8 the bytes to decode
+ * @return the decoded string
+ * @exception CharacterCodingException if the bytes are not valid UTF-8
+ */
+ public static String decode(byte[] utf8) throws CharacterCodingException {
+ return decode(ByteBuffer.wrap(utf8), false);
+ }
+
+ /**
+ * Converts the provided byte array to a String using the
+ * UTF-8 encoding. If <code>replace</code> is true, then
+ * malformed input is replaced with the
+ * substitution character, which is U+FFFD. Otherwise the
+ * method throws a MalformedInputException. The whole array is decoded.
+ * @param utf8 the bytes to decode
+ * @param replace whether to substitute malformed input instead of failing
+ * @return the decoded string
+ * @exception CharacterCodingException if <code>replace</code> is false and
+ * the bytes are not valid UTF-8
+ */
+ public static String decode(byte[] utf8, boolean replace)
+ throws CharacterCodingException {
+ return decode(ByteBuffer.wrap(utf8), replace);
+ }
+
+ /**
+ * Decodes the remaining bytes of <code>utf8</code> with the shared
+ * decoder. The decoder is switched to REPLACE for the duration of the
+ * call when <code>replace</code> is true and is always put back to
+ * REPORT afterwards, even if decoding fails.
+ */
+ private static String decode(ByteBuffer utf8, boolean replace)
+   throws CharacterCodingException {
+   // DECODER is a single shared, stateful object: serialize all use of it.
+   synchronized(DECODER) {
+     if (replace) {
+       DECODER.onMalformedInput(CodingErrorAction.REPLACE);
+       DECODER.onUnmappableCharacter(CodingErrorAction.REPLACE);
+     }
+     try {
+       return DECODER.decode(utf8).toString();
+     } finally {
+       // set decoder back to its default value: REPORT
+       if (replace) {
+         DECODER.onMalformedInput(CodingErrorAction.REPORT);
+         DECODER.onUnmappableCharacter(CodingErrorAction.REPORT);
+       }
+     }
+   }
+ }
+
+ /**
+ * Converts the provided String to bytes using the
+ * UTF-8 encoding. If the input is malformed,
+ * throws a MalformedInputException.
+ * @param string the text to encode
+ * @return ByteBuffer: bytes stores at ByteBuffer.array()
+ * and length is ByteBuffer.limit()
+ * @exception CharacterCodingException if the string cannot be encoded
+ */
+
+ public static ByteBuffer encode(String string)
+ throws CharacterCodingException {
+ return encode(string, false);
+ }
+
+ /**
+ * Converts the provided String to bytes using the
+ * UTF-8 encoding. If <code>replace</code> is true, then
+ * malformed input is replaced with the encoder's replacement
+ * bytes. Otherwise the
+ * method throws a MalformedInputException.
+ * @param string the text to encode
+ * @param replace whether to substitute malformed input instead of failing
+ * @return ByteBuffer: bytes stores at ByteBuffer.array()
+ * and length is ByteBuffer.limit()
+ * @exception CharacterCodingException if <code>replace</code> is false and
+ * the string cannot be encoded
+ */
+ public static ByteBuffer encode(String string, boolean replace)
+   throws CharacterCodingException {
+   // ENCODER is a single shared, stateful object: serialize all use of it.
+   synchronized(ENCODER) {
+     if (replace) {
+       ENCODER.onMalformedInput(CodingErrorAction.REPLACE);
+       ENCODER.onUnmappableCharacter(CodingErrorAction.REPLACE);
+     }
+     try {
+       return ENCODER.encode(CharBuffer.wrap(string.toCharArray()));
+     } finally {
+       // always restore the default REPORT behaviour, even on failure
+       if (replace) {
+         ENCODER.onMalformedInput(CodingErrorAction.REPORT);
+         ENCODER.onUnmappableCharacter(CodingErrorAction.REPORT);
+       }
+     }
+   }
+ }
+
+ /**
+ * Reads a string that was written as a zero-compressed byte count
+ * followed by that many UTF8 bytes.
+ * @param in the input to read from
+ * @return the decoded string
+ */
+ public static String readString(DataInput in) throws IOException {
+   final int byteCount = WritableUtils.readVInt(in);
+   final byte[] encoded = new byte[byteCount];
+   in.readFully(encoded, 0, byteCount);
+   validateUTF8(encoded);
+   return decode(encoded);
+ }
+
+ /**
+ * Writes a string as a zero-compressed byte count followed by its
+ * UTF8 bytes.
+ * @param out the output to write to
+ * @param s the string to write
+ * @return the number of UTF8 bytes written, not counting the length prefix
+ */
+ public static int writeString(DataOutput out, String s) throws IOException {
+   final ByteBuffer encoded = encode(s);
+   final int byteCount = encoded.limit();
+   WritableUtils.writeVInt(out, byteCount);
+   out.write(encoded.array(), 0, byteCount);
+   return byteCount;
+ }
+
+ ////// states for validateUTF8
+
+ private static final int LEAD_BYTE = 0; // expecting the first byte of a character
+
+ private static final int TRAIL_BYTE_1 = 1; // expecting the first trail byte (range depends on the lead byte)
+
+ private static final int TRAIL_BYTE = 2; // expecting any further trail byte
+
+ /**
+ * Check if a byte array contains valid utf-8. The whole array is
+ * examined.
+ * @param utf8 byte array
+ * @exception MalformedInputException if the byte array contains invalid
+ * utf-8, including a multi-byte sequence that is cut off by the end of
+ * the array
+ */
+ public static void validateUTF8(byte[] utf8)
+   throws MalformedInputException {
+   int count = 0;
+   int leadByte = 0;
+   int length = 0; // trail bytes still expected for the current character
+   int state = LEAD_BYTE;
+   while (count < utf8.length) {
+     int aByte = ((int) utf8[count] & 0xFF);
+
+     switch (state) {
+     case LEAD_BYTE:
+       leadByte = aByte;
+       length = bytesFromUTF8[aByte];
+
+       switch (length) {
+       case 0: // check for ASCII
+         // A single-byte character is complete on its own, so the next
+         // byte is again a lead byte. 0x7F (DEL) is valid ASCII.
+         if (leadByte > 0x7F)
+           throw new MalformedInputException(count);
+         break;
+       case 1:
+         if (leadByte < 0xC2 || leadByte > 0xDF)
+           throw new MalformedInputException(count);
+         state = TRAIL_BYTE_1;
+         break;
+       case 2:
+         if (leadByte < 0xE0 || leadByte > 0xEF)
+           throw new MalformedInputException(count);
+         state = TRAIL_BYTE_1;
+         break;
+       case 3:
+         if (leadByte < 0xF0 || leadByte > 0xF4)
+           throw new MalformedInputException(count);
+         state = TRAIL_BYTE_1;
+         break;
+       default:
+         // too long! Longest valid UTF-8 is 4 bytes (lead + three)
+         // or if < 0 we got a trail byte in the lead byte position
+         throw new MalformedInputException(count);
+       } // switch (length)
+       break;
+
+     case TRAIL_BYTE_1:
+       // The first trail byte has a narrower range after some lead bytes
+       // (rejects overlong forms, surrogates and values above U+10FFFF).
+       if (leadByte == 0xF0 && aByte < 0x90)
+         throw new MalformedInputException(count);
+       if (leadByte == 0xF4 && aByte > 0x8F)
+         throw new MalformedInputException(count);
+       if (leadByte == 0xE0 && aByte < 0xA0)
+         throw new MalformedInputException(count);
+       if (leadByte == 0xED && aByte > 0x9F)
+         throw new MalformedInputException(count);
+       // falls through to regular trail-byte test!!
+     case TRAIL_BYTE:
+       if (aByte < 0x80 || aByte > 0xBF)
+         throw new MalformedInputException(count);
+       if (--length == 0) {
+         state = LEAD_BYTE;
+       } else {
+         state = TRAIL_BYTE;
+       }
+       break;
+     } // switch (state)
+     count++;
+   }
+
+   // The input ended in the middle of a multi-byte character.
+   if (state != LEAD_BYTE)
+     throw new MalformedInputException(count);
+ }
+
+ /**
+ * Magic numbers for UTF-8. These are the number of bytes
+ * that <em>follow</em> a given lead byte. Trailing bytes
+ * have the value -1. The values 4 and 5 are present in
+ * this table, even though valid UTF-8 cannot include the
+ * five and six byte sequences.
+ */
+ static final int[] bytesFromUTF8 =
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0,
+ // trail bytes
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
+ 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 };
+
+ /**
+ * Returns the next code point at the current position in
+ * the buffer. The buffer's position will be incremented.
+ * Any mark set on this buffer will be changed by this method!
+ * The first byte is looked up in <code>bytesFromUTF8</code> to find how
+ * many bytes follow it; no validation of the following bytes is done.
+ * @param bytes buffer positioned at the first byte of a character
+ * @return the code point, or -1 if the buffer is positioned at a trailing
+ * byte (in which case the position is left unchanged)
+ */
+ public static int bytesToCodePoint(ByteBuffer bytes) {
+ bytes.mark();
+ byte b = bytes.get(); // peek at the lead byte ...
+ bytes.reset(); // ... and rewind so the switch below reads it again
+ int extraBytesToRead = bytesFromUTF8[(int)(b & 0xFF)];
+ if (extraBytesToRead < 0) return -1; // trailing byte!
+ int ch = 0;
+
+ switch (extraBytesToRead) { // cases fall through on purpose: one get() per byte of the character
+ case 5: ch += (int)(bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
+ case 4: ch += (int)(bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
+ case 3: ch += (int)(bytes.get() & 0xFF); ch <<= 6;
+ case 2: ch += (int)(bytes.get() & 0xFF); ch <<= 6;
+ case 1: ch += (int)(bytes.get() & 0xFF); ch <<= 6;
+ case 0: ch += (int)(bytes.get() & 0xFF);
+ }
+ ch -= offsetsFromUTF8[extraBytesToRead]; // remove the accumulated lead/trail marker bits
+
+ return ch;
+ }
+
+
+ static final int offsetsFromUTF8[] = // indexed by the number of bytes that follow the lead byte
+ { 0x00000000, 0x00003080,
+ 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 }; // subtracted from the raw sum in bytesToCodePoint
+
+ /**
+ * For the given string, returns the number of UTF-8 bytes
+ * required to encode the string. A valid surrogate pair counts as one
+ * four-byte character; an unpaired surrogate is counted as three bytes.
+ * @param string text to encode
+ * @return number of UTF-8 bytes required to encode
+ */
+ public static int utf8Length(String string) {
+   // Walk the string by index. A CharacterIterator cannot be used here
+   // because its DONE sentinel is U+FFFF, so a string that contains that
+   // character would stop the count early.
+   final int charCount = string.length();
+   int size = 0;
+   for (int i = 0; i < charCount; i++) {
+     char ch = string.charAt(i);
+     if (ch < 0x80) {
+       size++;
+     } else if (ch < 0x800) {
+       size += 2;
+     } else if ((ch >= 0xD800) && (ch < 0xDC00)
+                && (i + 1 < charCount)
+                && (string.charAt(i + 1) > 0xDBFF)
+                && (string.charAt(i + 1) < 0xE000)) {
+       // valid surrogate pair: one supplementary character, four bytes
+       size += 4;
+       i++; // the low surrogate has been consumed as well
+     } else {
+       // any other BMP char, or an unpaired surrogate
+       size += 3;
+     }
+   }
+   return size;
+ }
+}
Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/io/UTF8.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/io/UTF8.java?rev=425661&r1=425660&r2=425661&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/io/UTF8.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/io/UTF8.java Wed Jul 26 01:06:53 2006
@@ -28,6 +28,7 @@
* <p>Also includes utilities for efficiently reading and writing UTF-8.
*
* @author Doug Cutting
+ * @deprecated replaced by Text
*/
public class UTF8 implements WritableComparable {
private static final Log LOG= LogFactory.getLog("org.apache.hadoop.io.UTF8");
Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/io/WritableComparator.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/io/WritableComparator.java?rev=425661&r1=425660&r2=425661&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/io/WritableComparator.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/io/WritableComparator.java Wed Jul 26 01:06:53 2006
@@ -158,4 +158,38 @@
(readInt(bytes, start+4) & 0xFFFFFFFFL);
}
+ /**
+ * Reads a zero-compressed encoded long from a byte array and returns it.
+ * This is the inverse of WritableUtils.writeVLong: a first byte of -112
+ * or more is the value itself, otherwise it gives the sign and the
+ * number of magnitude bytes that follow.
+ * @param bytes byte array with the encoded long
+ * @param start starting index
+ * @throws java.io.IOException if the array is too short for the encoding
+ * @return deserialized long
+ */
+ static long readVLong(byte[] bytes, int start) throws IOException {
+   int len = bytes[start];
+   if (len >= -112) {
+     return len;
+   }
+   // First bytes -121..-128 mark a negative value, -113..-120 a positive one.
+   boolean isNegative = (len < -120);
+   len = isNegative ? -(len + 120) : -(len + 112);
+   if (start+1+len>bytes.length)
+     throw new IOException(
+                           "Not enough number of bytes for a zero-compressed integer");
+   long i = 0;
+   for (int idx = 0; idx < len; idx++) {
+     i = i << 8;
+     i = i | (bytes[start+1+idx] & 0xFF);
+   }
+   // The writer clears the sign bit of negative values before emitting the
+   // magnitude bytes, so it has to be put back here.
+   return isNegative ? (i | 0x8000000000000000L) : i;
+ }
+
+ /**
+ * Reads a zero-compressed encoded integer from a byte array and returns it.
+ * The value is read with {@link #readVLong(byte[], int)} and narrowed to
+ * an int.
+ * @param bytes byte array with the encoded integer
+ * @param start start index
+ * @throws java.io.IOException
+ * @return deserialized integer
+ */
+ static int readVInt(byte[] bytes, int start) throws IOException {
+ return (int) readVLong(bytes, start);
+ }
}
Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/io/WritableUtils.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/io/WritableUtils.java?rev=425661&r1=425660&r2=425661&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/io/WritableUtils.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/io/WritableUtils.java Wed Jul 26 01:06:53 2006
@@ -17,6 +17,7 @@
package org.apache.hadoop.io;
import java.io.*;
+
import org.apache.hadoop.mapred.JobConf;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
@@ -229,6 +230,131 @@
} catch (IOException e) {
throw new RuntimeException("Error writing/reading clone buffer", e);
}
+ }
+
+ /**
+ * Serializes an integer to a binary stream with zero-compressed encoding.
+ * The value is widened to a long and written by
+ * {@link #writeVLong(DataOutput, long)}, so the format is the same as
+ * for longs:
+ * For -112 <= i <= 127, only one byte is used with the actual value.
+ * For other values of i, the first byte value indicates whether the
+ * integer is positive or negative, and the number of bytes that follow.
+ * If the first byte value v is between -113 and -120, the following integer
+ * is positive, with number of bytes that follow are -(v+112).
+ * If the first byte value v is between -121 and -128, the following integer
+ * is negative, with number of bytes that follow are -(v+120). Bytes are
+ * stored in the high-non-zero-byte-first order.
+ *
+ * @param stream Binary output stream
+ * @param i Integer to be serialized
+ * @throws java.io.IOException
+ */
+ public static void writeVInt(DataOutput stream, int i) throws IOException {
+ writeVLong(stream, i);
+ }
+
+ /**
+ * Serializes a long to a binary stream with zero-compressed encoding.
+ * For -112 <= i <= 127, only one byte is used with the actual value.
+ * For other values of i, the first byte value indicates whether the
+ * long is positive or negative, and the number of bytes that follow.
+ * If the first byte value v is between -113 and -120, the following long
+ * is positive, with number of bytes that follow are -(v+112).
+ * If the first byte value v is between -121 and -128, the following long
+ * is negative, with number of bytes that follow are -(v+120). Bytes are
+ * stored in the high-non-zero-byte-first order. For a negative value the
+ * sign bit is cleared before the bytes are counted and written.
+ *
+ * @param stream Binary output stream
+ * @param i Long to be serialized
+ * @throws java.io.IOException
+ */
+ public static void writeVLong(DataOutput stream, long i) throws IOException {
+ if (i >= -112 && i <= 127) {
+ stream.writeByte((byte)i); // small values are stored directly in one byte
+ return;
+ }
+
+ int len = -112; // header base for positive values
+ if (i < 0) {
+ i &= 0x7FFFFFFFFFFFFFFFL; // reset the sign bit
+ len = -120; // header base for negative values
+ }
+
+ long tmp = i;
+ while (tmp != 0) { // one step down from the base per non-zero byte of i
+ tmp = tmp >> 8;
+ len--;
+ }
+
+ stream.writeByte((byte)len); // header byte: sign and byte count
+
+ len = (len < -120) ? -(len + 120) : -(len + 112); // turn the header back into a byte count
+
+ for (int idx = len; idx != 0; idx--) { // most significant byte first
+ int shiftbits = (idx - 1) * 8;
+ long mask = 0xFFL << shiftbits;
+ stream.writeByte((byte)((i & mask) >> shiftbits));
+ }
+ }
+
+
+ /**
+ * Reads a zero-compressed encoded long from input stream and returns it.
+ * This is the inverse of {@link #writeVLong(DataOutput, long)}: a first
+ * byte of -112 or more is the value itself, otherwise it gives the sign
+ * and the number of magnitude bytes that follow.
+ * @param stream Binary input stream
+ * @throws java.io.IOException
+ * @return deserialized long from stream.
+ */
+ public static long readVLong(DataInput stream) throws IOException {
+   int len = stream.readByte();
+   if (len >= -112) {
+     return len;
+   }
+   // First bytes -121..-128 mark a negative value, -113..-120 a positive one.
+   boolean isNegative = (len < -120);
+   len = isNegative ? -(len + 120) : -(len + 112);
+   byte[] barr = new byte[len];
+   stream.readFully(barr);
+   long i = 0;
+   for (int idx = 0; idx < len; idx++) {
+     i = i << 8;
+     i = i | (barr[idx] & 0xFF);
+   }
+   // writeVLong clears the sign bit of negative values before emitting the
+   // magnitude bytes, so it has to be put back here.
+   return isNegative ? (i | 0x8000000000000000L) : i;
+ }
+
+ /**
+ * Reads a zero-compressed encoded integer from input stream and returns it.
+ * The value is read with {@link #readVLong(DataInput)} and narrowed to
+ * an int.
+ * @param stream Binary input stream
+ * @throws java.io.IOException
+ * @return deserialized integer from stream.
+ */
+ public static int readVInt(DataInput stream) throws IOException {
+ return (int) readVLong(stream);
+ }
+
+
+ /**
+ * Get the encoded length if an integer is stored in a variable-length format.
+ * The computation follows the same steps as
+ * {@link #writeVLong(DataOutput, long)}.
+ * @param i a long
+ * @return the encoded length in bytes, including the leading header byte
+ */
+
+ public static int getVIntSize(long i) {
+ if (i >= -112 && i <= 127) {
+ return 1; // stored directly in a single byte
+ }
+
+ int len = -112; // header base for positive values
+ if (i < 0) {
+ i &= 0x7FFFFFFFFFFFFFFFL; // reset the sign bit
+ len = -120; // header base for negative values
+ }
+
+ long tmp = i;
+ while (tmp != 0) { // one step down from the base per non-zero byte of i
+ tmp = tmp >> 8;
+ len--;
+ }
+
+ len = (len < -120) ? -(len + 120) : -(len + 112); // number of bytes after the header
+
+ return len+1; // plus one for the header byte
}
}
Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/record/BinaryInputArchive.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/record/BinaryInputArchive.java?rev=425661&r1=425660&r2=425661&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/record/BinaryInputArchive.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/record/BinaryInputArchive.java Wed Jul 26 01:06:53 2006
@@ -22,6 +22,8 @@
import java.io.DataInputStream;
import java.io.InputStream;
+import org.apache.hadoop.io.WritableUtils;
+
/**
*
@@ -61,11 +63,11 @@
}
public int readInt(String tag) throws IOException {
- return Utils.readInt(in);
+ return WritableUtils.readVInt(in);
}
public long readLong(String tag) throws IOException {
- return Utils.readLong(in);
+ return WritableUtils.readVLong(in);
}
public float readFloat(String tag) throws IOException {
@@ -77,14 +79,14 @@
}
public String readString(String tag) throws IOException {
- int len = Utils.readInt(in);
+ int len = readInt(tag);
byte[] chars = new byte[len];
in.readFully(chars);
return new String(chars, "UTF-8");
}
public ByteArrayOutputStream readBuffer(String tag) throws IOException {
- int len = Utils.readInt(in);
+ int len = readInt(tag);
ByteArrayOutputStream buf = new ByteArrayOutputStream(len);
byte[] arr = new byte[len];
in.readFully(arr);
@@ -101,13 +103,13 @@
public void endRecord(String tag) throws IOException {}
public Index startVector(String tag) throws IOException {
- return new BinaryIndex(Utils.readInt(in));
+ return new BinaryIndex(readInt(tag));
}
public void endVector(String tag) throws IOException {}
public Index startMap(String tag) throws IOException {
- return new BinaryIndex(Utils.readInt(in));
+ return new BinaryIndex(readInt(tag));
}
public void endMap(String tag) throws IOException {}
Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/record/BinaryOutputArchive.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/record/BinaryOutputArchive.java?rev=425661&r1=425660&r2=425661&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/record/BinaryOutputArchive.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/record/BinaryOutputArchive.java Wed Jul 26 01:06:53 2006
@@ -24,6 +24,8 @@
import java.io.DataOutputStream;
import java.io.OutputStream;
+import org.apache.hadoop.io.WritableUtils;
+
/**
*
* @author Milind Bhandarkar
@@ -50,11 +52,11 @@
}
public void writeInt(int i, String tag) throws IOException {
- Utils.writeInt(out, i);
+ WritableUtils.writeVInt(out, i);
}
public void writeLong(long l, String tag) throws IOException {
- Utils.writeLong(out, l);
+ WritableUtils.writeVLong(out, l);
}
public void writeFloat(float f, String tag) throws IOException {
@@ -67,14 +69,14 @@
public void writeString(String s, String tag) throws IOException {
byte[] chars = s.getBytes("UTF-8");
- Utils.writeInt(out, chars.length);
+ writeInt(chars.length, tag);
out.write(chars);
}
public void writeBuffer(ByteArrayOutputStream buf, String tag)
throws IOException {
byte[] barr = buf.toByteArray();
- Utils.writeInt(out, barr.length);
+ writeInt(barr.length, tag);
out.write(barr);
}
@@ -87,13 +89,13 @@
public void endRecord(Record r, String tag) throws IOException {}
public void startVector(ArrayList v, String tag) throws IOException {
- Utils.writeInt(out, v.size());
+ writeInt(v.size(), tag);
}
public void endVector(ArrayList v, String tag) throws IOException {}
public void startMap(TreeMap v, String tag) throws IOException {
- Utils.writeInt(out, v.size());
+ writeInt(v.size(), tag);
}
public void endMap(TreeMap v, String tag) throws IOException {}
Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/record/Utils.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/record/Utils.java?rev=425661&r1=425660&r2=425661&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/record/Utils.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/record/Utils.java Wed Jul 26 01:06:53 2006
@@ -46,7 +46,8 @@
* @param stream Binary output stream
* @param i Integer to be serialized
* @throws java.io.IOException
- */
+ * @deprecated replaced by {@link org.apache.hadoop.io.WritableUtils#writeVInt}
+ */
static void writeInt(DataOutput stream, int i) throws IOException {
if (i >= -120 && i <= 127) {
stream.writeByte((byte)i);
@@ -89,7 +90,8 @@
*
* @param stream Binary output stream
* @param i Long to be serialized
- * @throws java.io.IOException
+ * @throws java.io.IOException
+ * @deprecated replaced by {@link org.apache.hadoop.io.WritableUtils#writeVLong}
*/
static void writeLong(DataOutput stream, long i) throws IOException {
if (i >= -112 && i <= 127) {
@@ -125,6 +127,7 @@
* @param stream Binary input stream
* @throws java.io.IOException
* @return deserialized integer from stream.
+ * @deprecated replaced by {@link org.apache.hadoop.io.WritableUtils#readVInt}
*/
static int readInt(DataInput stream) throws IOException {
int len = stream.readByte();
@@ -147,7 +150,8 @@
* @param stream Binary input stream
* @throws java.io.IOException
* @return deserialized long from stream.
- */
+ * @deprecated replaced by {@link org.apache.hadoop.io.WritableUtils#readVLong}
+ */
static long readLong(DataInput stream) throws IOException {
int len = stream.readByte();
if (len >= -112) {
Added: lucene/hadoop/trunk/src/test/org/apache/hadoop/io/TestText.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/test/org/apache/hadoop/io/TestText.java?rev=425661&view=auto
==============================================================================
--- lucene/hadoop/trunk/src/test/org/apache/hadoop/io/TestText.java (added)
+++ lucene/hadoop/trunk/src/test/org/apache/hadoop/io/TestText.java Wed Jul 26 01:06:53 2006
@@ -0,0 +1,223 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.io;
+
+import junit.framework.TestCase;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.util.Random;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+/** Unit tests for {@link Text}. */
+public class TestText extends TestCase {
+ /** Number of random strings each randomized test works through. */
+ private static final int NUM_ITERATIONS = 100;
+
+ /** Length argument meaning "choose a random length". */
+ private static final int RAND_LEN = -1;
+
+ /** Source of test data; seeded with a constant so runs are repeatable. */
+ private static final Random RANDOM = new Random(1);
+
+ /** Logger registered under this test class's fully qualified name. */
+ private static final Log LOG =
+   LogFactory.getLog("org.apache.hadoop.io.TestText");
+
+ /** Creates the test case with the given JUnit test name. */
+ public TestText(String name) {
+   super(name);
+ }
+
+ // generate a valid java String
+ private static String getTestString(int len) throws Exception {
+ StringBuffer buffer = new StringBuffer();
+ int length = (len==RAND_LEN) ? RANDOM.nextInt(1000) : len;
+ while (buffer.length()<length) {
+ int codePoint = RANDOM.nextInt(Character.MAX_CODE_POINT);
+ char tmpStr[] = new char[2];
+ if(Character.isDefined(codePoint)) {
+ //unpaired surrogate
+ if(codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT &&
+ !Character.isHighSurrogate((char)codePoint) &&
+ !Character.isLowSurrogate((char)codePoint) ) {
+ Character.toChars(codePoint, tmpStr, 0);
+ buffer.append(tmpStr);
+ }
+ }
+ }
+ return buffer.toString();
+ }
+
+ /** Generates a valid Java string whose length is picked at random. */
+ public static String getTestString() throws Exception {
+   final int anyLength = RAND_LEN;
+   return getTestString(anyLength);
+ }
+
+ /**
+  * Builds a long string by repeating one random seed string until the
+  * result holds at least Short.MAX_VALUE plus the seed's length chars.
+  *
+  * @return the repeated string
+  */
+ public static String getLongString() throws Exception {
+   String str = getTestString();
+   // getTestString() returns "" when the random length is 0. Repeating an
+   // empty seed can never reach the target length, so draw again instead
+   // of looping forever.
+   while (str.length() == 0) {
+     str = getTestString();
+   }
+   int length = Short.MAX_VALUE+str.length();
+   StringBuilder buffer = new StringBuilder();
+   while(buffer.length()<length) {
+     buffer.append(str);
+   }
+   return buffer.toString();
+ }
+
+ public void testWritable() throws Exception {
+ for (int i = 0; i < NUM_ITERATIONS; i++) {
+ try {
+ String str;
+ if(i == 0 )
+ str = getLongString();
+ else
+ str = getTestString();
+ TestWritable.testWritable(new Text(str));
+ } catch (IOException e) {
+ LOG.info(e);
+ }
+ }
+ }
+
+
+ public void testCoding() throws Exception {
+ for (int i = 0; i < NUM_ITERATIONS; i++) {
+ try {
+ // generate a random string
+ String before;
+ if(i == 0 )
+ before = getLongString();
+ else
+ before = getTestString();
+
+ // test string to utf8
+ ByteBuffer bb = Text.encode(before);
+
+ byte[] utf8Text = bb.array();
+ byte[] utf8Java = before.getBytes("UTF-8");
+ assertEquals(0, WritableComparator.compareBytes(
+ utf8Text, 0, bb.limit(),
+ utf8Java, 0, utf8Java.length));
+
+ // test utf8 to string
+ String after = Text.decode(utf8Java);
+ assertTrue(before.equals(after));
+ }catch(CharacterCodingException e) {
+ LOG.info( e );
+ }
+ }
+ }
+
+
+ public void testIO() throws Exception {
+ DataOutputBuffer out = new DataOutputBuffer();
+ DataInputBuffer in = new DataInputBuffer();
+
+ for (int i = 0; i < NUM_ITERATIONS; i++) {
+ try {
+ // generate a random string
+ String before;
+ if(i == 0 )
+ before = getLongString();
+ else
+ before = getTestString();
+
+ // write it
+ out.reset();
+ Text.writeString(out, before);
+
+ // test that it reads correctly
+ in.reset(out.getData(), out.getLength());
+ String after = Text.readString(in);
+ assertTrue(before.equals(after));
+
+ // Test compatibility with Java's other decoder
+ int strLenSize = WritableUtils.getVIntSize(Text.utf8Length(before));
+ String after2 = new String(out.getData(), strLenSize,
+ out.getLength()-strLenSize, "UTF-8");
+ assertTrue(before.equals(after2));
+ }catch(IOException e) {
+ LOG.info(e);
+ }
+ }
+ }
+
+ /**
+  * Checks that Text.Comparator, applied to serialized bytes, returns the
+  * same result as Text.compareTo, and that both treat two texts built from
+  * the same string as equal.
+  */
+ public void testCompare() throws Exception {
+   DataOutputBuffer out1 = new DataOutputBuffer();
+   DataOutputBuffer out2 = new DataOutputBuffer();
+   DataOutputBuffer out3 = new DataOutputBuffer();
+   Text.Comparator comparator = new Text.Comparator();
+   for (int i=0; i<NUM_ITERATIONS; i++ ) {
+     // reset output buffer
+     out1.reset();
+     out2.reset();
+     out3.reset();
+
+     // generate two random strings (long ones on the first iteration);
+     // each string is generated once, not generated and then overwritten
+     String str1 = (i == 0) ? getLongString() : getTestString();
+     String str2 = (i == 0) ? getLongString() : getTestString();
+
+     // convert to texts; txt3 is a second Text built from str1
+     Text txt1 = new Text(str1);
+     Text txt2 = new Text(str2);
+     Text txt3 = new Text(str1);
+
+     // serialize them; an IOException now propagates and fails the test
+     // instead of being logged and ignored
+     txt1.write(out1);
+     txt2.write(out2);
+     txt3.write(out3);
+
+     // compare two strings by looking at their binary formats
+     int ret1 = comparator.compare(out1.getData(), 0, out1.getLength(),
+                                   out2.getData(), 0, out2.getLength());
+     // compare two strings
+     int ret2 = txt1.compareTo(txt2);
+
+     assertEquals(ret1, ret2);
+
+     // test equal; each buffer is paired with its own length
+     assertEquals(0, txt1.compareTo(txt3));
+     assertEquals(0, comparator.compare(out1.getData(), 0, out1.getLength(),
+                                        out3.getData(), 0, out3.getLength()));
+   }
+ }
+
+ /**
+  * Checks Text.find for substrings that are absent and present. The
+  * expected hits (4 and 11) are consistent with offsets into the UTF-8
+  * bytes, where the euro sign occupies three bytes.
+  */
+ public void testFind() throws Exception {
+   Text text = new Text("abcd\u20acbdcd\u20ac");
+   // A CharacterCodingException now propagates and fails the test instead
+   // of being logged and ignored. assertEquals reports the offset actually
+   // returned when a check fails.
+   assertEquals(-1, text.find("abd"));
+   assertEquals(-1, text.find("ac"));
+   assertEquals(4, text.find("\u20ac"));
+   assertEquals(11, text.find("\u20ac", 5));
+ }
+
+ /**
+  * Runs the five tests of this class directly, in a fixed order, on one
+  * TestText instance named "main".
+  */
+ public static void main(String[] args) throws Exception {
+   final TestText suite = new TestText("main");
+
+   suite.testIO();
+   suite.testCompare();
+   suite.testCoding();
+   suite.testWritable();
+   suite.testFind();
+ }
+}