You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by cu...@apache.org on 2006/10/04 19:16:53 UTC
svn commit: r452941 - in /lucene/hadoop/trunk: CHANGES.txt src/java/org/apache/hadoop/io/Text.java src/java/org/apache/hadoop/record/Utils.java src/test/org/apache/hadoop/io/TestTextNonUTF8.java

Author: cutting
Date: Wed Oct  4 10:16:52 2006
New Revision: 452941

URL: http://svn.apache.org/viewvc?view=rev&rev=452941
Log:
HADOOP-550.  Disable automatic UTF-8 validation in Text.  Contributed by Hairong & Mahadev.

Added:
    lucene/hadoop/trunk/src/test/org/apache/hadoop/io/TestTextNonUTF8.java
Modified:
    lucene/hadoop/trunk/CHANGES.txt
    lucene/hadoop/trunk/src/java/org/apache/hadoop/io/Text.java
    lucene/hadoop/trunk/src/java/org/apache/hadoop/record/Utils.java

Modified: lucene/hadoop/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/CHANGES.txt?view=diff&rev=452941&r1=452940&r2=452941
==============================================================================
--- lucene/hadoop/trunk/CHANGES.txt (original)
+++ lucene/hadoop/trunk/CHANGES.txt Wed Oct  4 10:16:52 2006
@@ -128,6 +128,10 @@
     tasktracker correctly report the task as failed to the jobtracker,
     so that it will be rescheduled.  (omalley via cutting)
 
+31. HADOOP-550.  Disable automatic UTF-8 validation in Text.  This
+    permits, e.g., TextInputFormat to again operate on non-UTF-8 data.
+    (Hairong and Mahadev via cutting)
+
 
 Release 0.6.2 - 2006-09-18
 

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/io/Text.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/io/Text.java?view=diff&rev=452941&r1=452940&r2=452941
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/io/Text.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/io/Text.java Wed Oct  4 10:16:52 2006
@@ -64,10 +64,8 @@
   }
 
   /** Construct from a string. 
-   * @exception CharacterCodingExcetpion if the string contains 
-   *            invalid codepoints or unpaired surrogates
    */
-  public Text(String string) throws CharacterCodingException {
+  public Text(String string) {
     set(string);
   }
 
@@ -77,9 +75,8 @@
   }
 
   /** Construct from a byte array.
-   * @exception CharacterCodingExcetpion if the array has invalid UTF8 bytes 
    */
-  public Text(byte[] utf8) throws CharacterCodingException {
+  public Text(byte[] utf8)  {
     set(utf8);
   }
   
@@ -160,29 +157,26 @@
     }
   }  
   /** Set to contain the contents of a string. 
-   * @exception CharacterCodingException if the string contains 
-   *       invalid codepoints or unpaired surrogate
    */
-  public void set(String string) throws CharacterCodingException {
-    ByteBuffer bb = encode(string);
-    bytes = bb.array();
-    length = bb.limit();
+  public void set(String string) {
+    try {
+      ByteBuffer bb = encode(string, true); // true: malformed input is replaced, not rejected
+      bytes = bb.array();
+      length = bb.limit();
+    }catch(CharacterCodingException e) {
+      throw new RuntimeException("Should not have happened " + e.toString(), e); // keep cause
+    }
+  }
 
   /** Set to a utf8 byte array
-   * @exception CharacterCodingException if the array contains invalid UTF8 code  
    */
-  public void set(byte[] utf8) throws CharacterCodingException {
+  public void set(byte[] utf8) {
     set(utf8, 0, utf8.length);
   }
   
   /** copy a text. */
   public void set(Text other) {
-    try {
-      set(other.bytes, 0, other.length);
-    } catch (CharacterCodingException e) {
-      throw new RuntimeException("bad Text UTF8 encoding", e);
-    }
+    set(other.bytes, 0, other.length);
   }
 
   /**
@@ -191,9 +185,7 @@
    * @param start the first position of the new string
    * @param len the number of bytes of the new string
    */
-  public void set(byte[] utf8, int start, int len 
-                  ) throws CharacterCodingException{
-    validateUTF8(utf8, start, len);
+  public void set(byte[] utf8, int start, int len) {
     setCapacity(len);
     System.arraycopy(utf8, start, bytes, 0, len);
     this.length = len;
@@ -221,22 +213,16 @@
     try {
       return decode(bytes, 0, length);
     } catch (CharacterCodingException e) { 
-      //bytes is supposed to contain valid utf8, therefore, 
-      // this should never happen
       return null;
     }
   }
   
   /** deserialize 
-   * check if the received bytes are valid utf8 code. 
-   * if not throws MalformedInputException
-   * @see Writable#readFields(DataInput)
    */
   public void readFields(DataInput in) throws IOException {
     length = WritableUtils.readVInt(in);
     setCapacity(length);
     in.readFully(bytes, 0, length);
-    validateUTF8(bytes);
   }
 
   /** Skips over one Text in the input. */
@@ -251,7 +237,7 @@
    * @see Writable#write(DataOutput)
    */
   public void write(DataOutput out) throws IOException {
-    WritableUtils.writeVInt(out, length); // out.writeInt(length);
+    WritableUtils.writeVInt(out, length);
     out.write(bytes, 0, length);
   }
 
@@ -313,15 +299,15 @@
   /**
    * Converts the provided byte array to a String using the
    * UTF-8 encoding. If the input is malformed,
-   * throws a MalformedInputException.
+   * the invalid bytes are replaced by a default value.
    */
   public static String decode(byte[] utf8) throws CharacterCodingException {
-    return decode(ByteBuffer.wrap(utf8), false);
+    return decode(ByteBuffer.wrap(utf8), true);
   }
   
   public static String decode(byte[] utf8, int start, int length) 
       throws CharacterCodingException {
-      return decode(ByteBuffer.wrap(utf8, start, length), false);
+      return decode(ByteBuffer.wrap(utf8, start, length), true);
   }
   
   /**
@@ -358,14 +344,14 @@
   /**
    * Converts the provided String to bytes using the
    * UTF-8 encoding. If the input is malformed,
-   * throws a MalformedInputException.
+   * invalid chars are replaced by a default value.
    * @return ByteBuffer: bytes stores at ByteBuffer.array() 
    *                     and length is ByteBuffer.limit()
    */
 
   public static ByteBuffer encode(String string)
     throws CharacterCodingException {
-    return encode(string, false);
+    return encode(string, true);
   }
 
   /**
@@ -399,7 +385,6 @@
     int length = WritableUtils.readVInt(in);
     byte [] bytes = new byte[length];
     in.readFully(bytes, 0, length);
-    validateUTF8(bytes);
     return decode(bytes);
   }
 

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/record/Utils.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/record/Utils.java?view=diff&rev=452941&r1=452940&r2=452941
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/record/Utils.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/record/Utils.java Wed Oct  4 10:16:52 2006
@@ -255,12 +255,8 @@
             sb.append(ch);
           }
         }
-        try {
-          return new Text(sb.toString());
-        } catch (CharacterCodingException ex) {
-          ex.printStackTrace();
-          return new Text();
-        }
+        
+        return new Text(sb.toString());
     }
     
     /**

Added: lucene/hadoop/trunk/src/test/org/apache/hadoop/io/TestTextNonUTF8.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/test/org/apache/hadoop/io/TestTextNonUTF8.java?view=auto&rev=452941
==============================================================================
--- lucene/hadoop/trunk/src/test/org/apache/hadoop/io/TestTextNonUTF8.java (added)
+++ lucene/hadoop/trunk/src/test/org/apache/hadoop/io/TestTextNonUTF8.java Wed Oct  4 10:16:52 2006
@@ -0,0 +1,54 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.io;
+
+import junit.framework.TestCase;
+
+import java.nio.charset.MalformedInputException;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.util.*;
+import java.util.Arrays;
+
+/** Verifies that Text accepts and preserves bytes that are not valid UTF-8. */
+public class TestTextNonUTF8 extends TestCase {
+  private static final Log LOG= LogFactory.getLog("org.apache.hadoop.io.TestTextNonUTF8");
+
+  /** Text must store a non-UTF-8 array verbatim even though validation rejects it. */
+  public void testNonUTF8() throws Exception{
+   // 0xFF can never occur in UTF-8, so this array is not valid UTF-8
+   byte b[] = {-0x01, -0x01, -0x01, -0x01, -0x01, -0x01, -0x01};
+   boolean nonUTF8 = false;
+   Text t = new Text(b);
+   try{
+     Text.validateUTF8(b);
+   }catch(MalformedInputException me){
+     nonUTF8 = true;
+   }
+   // explicit validation must still reject the array
+   assertTrue(nonUTF8);
+   byte ret[] = t.getBytes();
+   // the Text object must hold exactly the bytes it was created from
+   assertTrue(Arrays.equals(b, ret));
+  }
+
+  public static void main(String[] args)  throws Exception
+  {
+    TestTextNonUTF8 test = new TestTextNonUTF8();
+    test.testNonUTF8();
+  }
+}