You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by sm...@apache.org on 2009/01/31 03:14:10 UTC

svn commit: r739492 - in /hadoop/pig/trunk: ./ src/org/apache/pig/data/ test/org/apache/pig/test/ test/org/apache/pig/test/utils/

Author: sms
Date: Sat Jan 31 02:14:10 2009
New Revision: 739492

URL: http://svn.apache.org/viewvc?rev=739492&view=rev
Log:
PIG-560: UTFDataFormatException (encoded string too long) is thrown when storing strings > 65536 bytes (in UTF8 form) using BinStorage()

Modified:
    hadoop/pig/trunk/CHANGES.txt
    hadoop/pig/trunk/src/org/apache/pig/data/DataReaderWriter.java
    hadoop/pig/trunk/src/org/apache/pig/data/DataType.java
    hadoop/pig/trunk/test/org/apache/pig/test/TestEvalPipeline.java
    hadoop/pig/trunk/test/org/apache/pig/test/TestPackage.java
    hadoop/pig/trunk/test/org/apache/pig/test/utils/GenRandomData.java

Modified: hadoop/pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/CHANGES.txt?rev=739492&r1=739491&r2=739492&view=diff
==============================================================================
--- hadoop/pig/trunk/CHANGES.txt (original)
+++ hadoop/pig/trunk/CHANGES.txt Sat Jan 31 02:14:10 2009
@@ -401,3 +401,6 @@
 
     PIG-649: RandomSampleLoader does not handle skipping correctly in
     getNext() (pradeepkth)
+
+    PIG-560: UTFDataFormatException (encoded string too long) is thrown when
+    storing strings > 65536 bytes (in UTF8 form) using BinStorage() (sms)

Modified: hadoop/pig/trunk/src/org/apache/pig/data/DataReaderWriter.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/data/DataReaderWriter.java?rev=739492&r1=739491&r2=739492&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/data/DataReaderWriter.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/data/DataReaderWriter.java Sat Jan 31 02:14:10 2009
@@ -35,6 +35,8 @@
 public class DataReaderWriter {
     private static TupleFactory mTupleFactory = TupleFactory.getInstance();
     private static BagFactory mBagFactory = BagFactory.getInstance();
+    static final int UNSIGNED_SHORT_MAX = 65535;
+    static final String UTF8 = "UTF-8";
 
     public static Object readDatum(DataInput in) throws IOException, ExecException {
         // Read the data type
@@ -102,8 +104,20 @@
                 return new DataByteArray(ba);
                                      }
 
-            case DataType.CHARARRAY:
-                return in.readUTF();
+            case DataType.BIGCHARARRAY: {
+                int size = in.readInt();
+                byte[] ba = new byte[size];
+                in.readFully(ba);
+            	return new String(ba, DataReaderWriter.UTF8);
+            }
+
+            case DataType.CHARARRAY: {
+                int size = in.readUnsignedShort();
+                byte[] ba = new byte[size];
+                in.readFully(ba);
+            	return new String(ba, DataReaderWriter.UTF8);
+            }
+
 
             case DataType.NULL:
                 return null;
@@ -186,8 +200,19 @@
                                      }
 
             case DataType.CHARARRAY: {
-                out.writeByte(DataType.CHARARRAY);
-                out.writeUTF((String)val);
+                String s = (String)val;
+                byte[] utfBytes = s.getBytes(DataReaderWriter.UTF8);
+                int length = utfBytes.length;
+                
+                if(length < DataReaderWriter.UNSIGNED_SHORT_MAX) {
+                    out.writeByte(DataType.CHARARRAY);
+                    out.writeShort(length);
+                    out.write(utfBytes);
+                } else {
+                	out.writeByte(DataType.BIGCHARARRAY);
+                	out.writeInt(length);
+                	out.write(utfBytes);
+                }
                 break;
                                      }
 

Modified: hadoop/pig/trunk/src/org/apache/pig/data/DataType.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/data/DataType.java?rev=739492&r1=739491&r2=739492&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/data/DataType.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/data/DataType.java Sat Jan 31 02:14:10 2009
@@ -62,6 +62,7 @@
     public static final byte DOUBLE    =  25;
     public static final byte BYTEARRAY =  50;
     public static final byte CHARARRAY =  55;
+    public static final byte BIGCHARARRAY =  60; //internal use only; for storing/loading chararray whose UTF-8 encoding is 65535 bytes or longer in BinStorage
     public static final byte MAP       = 100;
     public static final byte TUPLE     = 110;
     public static final byte BAG       = 120;
@@ -143,13 +144,13 @@
         return types.length;
     }
     public static byte[] genAllTypes(){
-        byte[] types = { DataType.BAG, DataType.BOOLEAN, DataType.BYTE, DataType.BYTEARRAY, DataType.CHARARRAY, 
+        byte[] types = { DataType.BAG, DataType.BIGCHARARRAY, DataType.BOOLEAN, DataType.BYTE, DataType.BYTEARRAY, DataType.CHARARRAY, 
                 DataType.DOUBLE, DataType.FLOAT, DataType.INTEGER, DataType.LONG, DataType.MAP, DataType.TUPLE};
         return types;
     }
     
     private static String[] genAllTypeNames(){
-        String[] names = { "BAG", "BOOLEAN", "BYTE", "BYTEARRAY", "CHARARRAY", "DOUBLE", "FLOAT", "INTEGER", "LONG", 
+        String[] names = { "BAG", "BIGCHARARRAY", "BOOLEAN", "BYTE", "BYTEARRAY", "CHARARRAY", "DOUBLE", "FLOAT", "INTEGER", "LONG", 
                 "MAP", "TUPLE" };
         return names;
     }
@@ -198,6 +199,7 @@
         case FLOAT:     return "float";
         case DOUBLE:    return "double";
         case BYTEARRAY: return "bytearray";
+        case BIGCHARARRAY: return "bigchararray";
         case CHARARRAY: return "chararray";
         case MAP:       return "map";
         case TUPLE:     return "tuple";
@@ -228,12 +230,13 @@
     /**
      * Determine whether this data type is atomic.
      * @param dataType Data type code to test.
-     * @return true if dataType is bytearray, chararray, integer, long,
+     * @return true if dataType is bytearray, bigchararray, chararray, integer, long,
      * float, or boolean.
      */
     public static boolean isAtomic(byte dataType) {
         return ((dataType == BYTEARRAY) ||
                 (dataType == CHARARRAY) ||
+                (dataType == BIGCHARARRAY) ||
                 (dataType == INTEGER) ||
                 (dataType == LONG) ||
                 (dataType == FLOAT) ||

Modified: hadoop/pig/trunk/test/org/apache/pig/test/TestEvalPipeline.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/test/org/apache/pig/test/TestEvalPipeline.java?rev=739492&r1=739491&r2=739492&view=diff
==============================================================================
--- hadoop/pig/trunk/test/org/apache/pig/test/TestEvalPipeline.java (original)
+++ hadoop/pig/trunk/test/org/apache/pig/test/TestEvalPipeline.java Sat Jan 31 02:14:10 2009
@@ -51,6 +51,7 @@
 import org.apache.pig.impl.logicalLayer.schema.Schema;
 import org.apache.pig.impl.logicalLayer.FrontendException;
 import org.apache.pig.impl.util.Pair;
+import org.apache.pig.test.utils.GenRandomData;
 import org.apache.pig.test.utils.Identity;
 
 import junit.framework.TestCase;
@@ -1088,5 +1089,45 @@
         Util.deleteFile(cluster, "table");        
     }
 
+    @Test
+    public void testBinStorageWithLargeStrings() throws Exception {
+        // Create input file with large strings
+    	int testSize = 100;
+    	String[] stringArray = new String[testSize];
+    	Random random = new Random();
+    	stringArray[0] = GenRandomData.genRandLargeString(random, 65534);
+    	for(int i = 1; i < stringArray.length; ++i) {
+    		//generate a large string at every 25th record
+    		if((i % 25) == 0) {
+    			stringArray[i] = GenRandomData.genRandLargeString(random, 65535 + i);    			
+    		} else {
+    			stringArray[i] = GenRandomData.genRandString(random);
+    		}
+    	}
+        
+    	Util.createInputFile(cluster, "table", stringArray);
+        
+    	//test with BinStorage
+        pigServer.registerQuery("a = load 'table' using PigStorage() " +
+                "as (c: chararray);");
+        String output = "/pig/out/TestEvalPipeline-testBinStorageLargeStrings";
+        pigServer.deleteFile(output);
+        pigServer.store("a", output, BinStorage.class.getName());
+        
+        pigServer.registerQuery("b = load '" + output +"' using BinStorage() " +
+        "as (c:chararray);");
+        pigServer.registerQuery("c = foreach b generate c;");
+        
+        Iterator<Tuple> it = pigServer.openIterator("c");
+        int counter = 0;
+        while(it.hasNext()) {
+            Tuple tup = it.next();
+            String resultString = (String)tup.get(0);
+            String expectedString = stringArray[counter];
+          	assertTrue(expectedString.equals(resultString));
+            ++counter;
+        }
+        Util.deleteFile(cluster, "table");
+    }
 
 }

Modified: hadoop/pig/trunk/test/org/apache/pig/test/TestPackage.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/test/org/apache/pig/test/TestPackage.java?rev=739492&r1=739491&r2=739492&view=diff
==============================================================================
--- hadoop/pig/trunk/test/org/apache/pig/test/TestPackage.java (original)
+++ hadoop/pig/trunk/test/org/apache/pig/test/TestPackage.java Sat Jan 31 02:14:10 2009
@@ -125,6 +125,14 @@
         case DataType.BYTEARRAY:
             runTest(GenRandomData.genRandDBA(r),inner);
             break;
+        case DataType.BIGCHARARRAY: {
+			String s = GenRandomData.genRandString(r);			
+			for(;s.length() < 65535;) {
+				s += GenRandomData.genRandString(r);
+			}
+			runTest(s,inner);
+        	break;
+        }        	
         case DataType.CHARARRAY:
             runTest(GenRandomData.genRandString(r),inner);
             break;

Modified: hadoop/pig/trunk/test/org/apache/pig/test/utils/GenRandomData.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/test/org/apache/pig/test/utils/GenRandomData.java?rev=739492&r1=739491&r2=739492&view=diff
==============================================================================
--- hadoop/pig/trunk/test/org/apache/pig/test/utils/GenRandomData.java (original)
+++ hadoop/pig/trunk/test/org/apache/pig/test/utils/GenRandomData.java Sat Jan 31 02:14:10 2009
@@ -61,6 +61,16 @@
             chars[i] = (char)(r.nextInt(26)+65);
         }
         return new String(chars);
+    }    
+
+    public static String genRandLargeString(Random r, int size){
+        if(r==null) return "RANDOM";
+        if(size <= 10) return genRandString(r);
+        char[] chars = new char[size];
+        for(int i=0;i<size;i++){
+            chars[i] = (char)(r.nextInt(26)+65);
+        }
+        return new String(chars);
     }
     
     public static DataByteArray genRandDBA(Random r){