You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by cu...@apache.org on 2006/10/04 19:16:53 UTC
svn commit: r452941 - in /lucene/hadoop/trunk: CHANGES.txt
src/java/org/apache/hadoop/io/Text.java
src/java/org/apache/hadoop/record/Utils.java
src/test/org/apache/hadoop/io/TestTextNonUTF8.java
Author: cutting
Date: Wed Oct 4 10:16:52 2006
New Revision: 452941
URL: http://svn.apache.org/viewvc?view=rev&rev=452941
Log:
HADOOP-550. Disable automatic UTF-8 validation in Text. Contributed by Hairong & Mahadev.
Added:
lucene/hadoop/trunk/src/test/org/apache/hadoop/io/TestTextNonUTF8.java
Modified:
lucene/hadoop/trunk/CHANGES.txt
lucene/hadoop/trunk/src/java/org/apache/hadoop/io/Text.java
lucene/hadoop/trunk/src/java/org/apache/hadoop/record/Utils.java
Modified: lucene/hadoop/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/CHANGES.txt?view=diff&rev=452941&r1=452940&r2=452941
==============================================================================
--- lucene/hadoop/trunk/CHANGES.txt (original)
+++ lucene/hadoop/trunk/CHANGES.txt Wed Oct 4 10:16:52 2006
@@ -128,6 +128,10 @@
tasktracker correctly report the task as failed to the jobtracker,
so that it will be rescheduled. (omalley via cutting)
+31. HADOOP-550. Disable automatic UTF-8 validation in Text. This
+ permits, e.g., TextInputFormat to again operate on non-UTF-8 data.
+ (Hairong and Mahadev via cutting)
+
Release 0.6.2 - 2006-09-18
Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/io/Text.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/io/Text.java?view=diff&rev=452941&r1=452940&r2=452941
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/io/Text.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/io/Text.java Wed Oct 4 10:16:52 2006
@@ -64,10 +64,8 @@
}
/** Construct from a string.
- * @exception CharacterCodingExcetpion if the string contains
- * invalid codepoints or unpaired surrogates
*/
- public Text(String string) throws CharacterCodingException {
+ public Text(String string) {
set(string);
}
@@ -77,9 +75,8 @@
}
/** Construct from a byte array.
- * @exception CharacterCodingExcetpion if the array has invalid UTF8 bytes
*/
- public Text(byte[] utf8) throws CharacterCodingException {
+ public Text(byte[] utf8) {
set(utf8);
}
@@ -160,29 +157,26 @@
}
}
/** Set to contain the contents of a string.
- * @exception CharacterCodingException if the string contains
- * invalid codepoints or unpaired surrogate
*/
- public void set(String string) throws CharacterCodingException {
- ByteBuffer bb = encode(string);
- bytes = bb.array();
- length = bb.limit();
+ public void set(String string) {
+ try {
+ ByteBuffer bb = encode(string, true);
+ bytes = bb.array();
+ length = bb.limit();
+ }catch(CharacterCodingException e) {
+ throw new RuntimeException("Should not have happened " + e.toString());
+ }
}
/** Set to a utf8 byte array
- * @exception CharacterCodingException if the array contains invalid UTF8 code
*/
- public void set(byte[] utf8) throws CharacterCodingException {
+ public void set(byte[] utf8) {
set(utf8, 0, utf8.length);
}
/** copy a text. */
public void set(Text other) {
- try {
- set(other.bytes, 0, other.length);
- } catch (CharacterCodingException e) {
- throw new RuntimeException("bad Text UTF8 encoding", e);
- }
+ set(other.bytes, 0, other.length);
}
/**
@@ -191,9 +185,7 @@
* @param start the first position of the new string
* @param len the number of bytes of the new string
*/
- public void set(byte[] utf8, int start, int len
- ) throws CharacterCodingException{
- validateUTF8(utf8, start, len);
+ public void set(byte[] utf8, int start, int len) {
setCapacity(len);
System.arraycopy(utf8, start, bytes, 0, len);
this.length = len;
@@ -221,22 +213,16 @@
try {
return decode(bytes, 0, length);
} catch (CharacterCodingException e) {
- //bytes is supposed to contain valid utf8, therefore,
- // this should never happen
return null;
}
}
/** deserialize
- * check if the received bytes are valid utf8 code.
- * if not throws MalformedInputException
- * @see Writable#readFields(DataInput)
*/
public void readFields(DataInput in) throws IOException {
length = WritableUtils.readVInt(in);
setCapacity(length);
in.readFully(bytes, 0, length);
- validateUTF8(bytes);
}
/** Skips over one Text in the input. */
@@ -251,7 +237,7 @@
* @see Writable#write(DataOutput)
*/
public void write(DataOutput out) throws IOException {
- WritableUtils.writeVInt(out, length); // out.writeInt(length);
+ WritableUtils.writeVInt(out, length);
out.write(bytes, 0, length);
}
@@ -313,15 +299,15 @@
/**
* Converts the provided byte array to a String using the
* UTF-8 encoding. If the input is malformed,
- * throws a MalformedInputException.
+ * malformed bytes are replaced by a default value.
*/
public static String decode(byte[] utf8) throws CharacterCodingException {
- return decode(ByteBuffer.wrap(utf8), false);
+ return decode(ByteBuffer.wrap(utf8), true);
}
public static String decode(byte[] utf8, int start, int length)
throws CharacterCodingException {
- return decode(ByteBuffer.wrap(utf8, start, length), false);
+ return decode(ByteBuffer.wrap(utf8, start, length), true);
}
/**
@@ -358,14 +344,14 @@
/**
* Converts the provided String to bytes using the
* UTF-8 encoding. If the input is malformed,
- * throws a MalformedInputException.
+ * invalid chars are replaced by a default value.
* @return ByteBuffer: bytes stores at ByteBuffer.array()
* and length is ByteBuffer.limit()
*/
public static ByteBuffer encode(String string)
throws CharacterCodingException {
- return encode(string, false);
+ return encode(string, true);
}
/**
@@ -399,7 +385,6 @@
int length = WritableUtils.readVInt(in);
byte [] bytes = new byte[length];
in.readFully(bytes, 0, length);
- validateUTF8(bytes);
return decode(bytes);
}
Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/record/Utils.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/record/Utils.java?view=diff&rev=452941&r1=452940&r2=452941
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/record/Utils.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/record/Utils.java Wed Oct 4 10:16:52 2006
@@ -255,12 +255,8 @@
sb.append(ch);
}
}
- try {
- return new Text(sb.toString());
- } catch (CharacterCodingException ex) {
- ex.printStackTrace();
- return new Text();
- }
+
+ return new Text(sb.toString());
}
/**
Added: lucene/hadoop/trunk/src/test/org/apache/hadoop/io/TestTextNonUTF8.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/test/org/apache/hadoop/io/TestTextNonUTF8.java?view=auto&rev=452941
==============================================================================
--- lucene/hadoop/trunk/src/test/org/apache/hadoop/io/TestTextNonUTF8.java (added)
+++ lucene/hadoop/trunk/src/test/org/apache/hadoop/io/TestTextNonUTF8.java Wed Oct 4 10:16:52 2006
@@ -0,0 +1,54 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.io;
+
+import junit.framework.TestCase;
+
+import java.nio.charset.MalformedInputException;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.util.*;
+import java.util.Arrays;
+
+/** Unit tests for NonUTF8. */
+public class TestTextNonUTF8 extends TestCase {
+  private static final Log LOG= LogFactory.getLog("org.apache.hadoop.io.TestTextNonUTF8");
+
+  /** Text must accept bytes that are not valid UTF-8 and keep them unchanged. */
+  public void testNonUTF8() throws Exception{
+    // this is a non UTF8 byte array
+    byte b[] = {-0x01, -0x01, -0x01, -0x01, -0x01, -0x01, -0x01};
+    boolean nonUTF8 = false;
+    Text t = new Text(b);
+    try{
+      Text.validateUTF8(b);
+    }catch(MalformedInputException me){
+      nonUTF8 = true;
+    }
+    // asserting that the byte array is non utf8
+    assertTrue(nonUTF8);
+    byte ret[] = t.getBytes();
+    // asserting that the bytes are unchanged by constructing the Text
+    assertTrue(Arrays.equals(b, ret));
+  }
+
+  public static void main(String[] args) throws Exception
+  {
+    TestTextNonUTF8 test = new TestTextNonUTF8();
+    test.testNonUTF8();
+  }
+}