You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by sm...@apache.org on 2009/01/31 03:14:10 UTC
svn commit: r739492 - in /hadoop/pig/trunk: ./ src/org/apache/pig/data/
test/org/apache/pig/test/ test/org/apache/pig/test/utils/
Author: sms
Date: Sat Jan 31 02:14:10 2009
New Revision: 739492
URL: http://svn.apache.org/viewvc?rev=739492&view=rev
Log:
PIG-560: UTFDataFormatException (encoded string too long) is thrown when storing strings > 65536 bytes (in UTF8 form) using BinStorage()
Modified:
hadoop/pig/trunk/CHANGES.txt
hadoop/pig/trunk/src/org/apache/pig/data/DataReaderWriter.java
hadoop/pig/trunk/src/org/apache/pig/data/DataType.java
hadoop/pig/trunk/test/org/apache/pig/test/TestEvalPipeline.java
hadoop/pig/trunk/test/org/apache/pig/test/TestPackage.java
hadoop/pig/trunk/test/org/apache/pig/test/utils/GenRandomData.java
Modified: hadoop/pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/CHANGES.txt?rev=739492&r1=739491&r2=739492&view=diff
==============================================================================
--- hadoop/pig/trunk/CHANGES.txt (original)
+++ hadoop/pig/trunk/CHANGES.txt Sat Jan 31 02:14:10 2009
@@ -401,3 +401,6 @@
PIG-649: RandomSampleLoader does not handle skipping correctly in
getNext() (pradeepkth)
+
+ PIG-560: UTFDataFormatException (encoded string too long) is thrown when
+ storing strings > 65536 bytes (in UTF8 form) using BinStorage() (sms)
Modified: hadoop/pig/trunk/src/org/apache/pig/data/DataReaderWriter.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/data/DataReaderWriter.java?rev=739492&r1=739491&r2=739492&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/data/DataReaderWriter.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/data/DataReaderWriter.java Sat Jan 31 02:14:10 2009
@@ -35,6 +35,8 @@
public class DataReaderWriter {
private static TupleFactory mTupleFactory = TupleFactory.getInstance();
private static BagFactory mBagFactory = BagFactory.getInstance();
+ // Largest length that fits in an unsigned 16-bit prefix; the CHARARRAY write
+ // path compares the UTF-8 byte length of a string against this limit.
+ static final int UNSIGNED_SHORT_MAX = 65535;
+ // Charset name used when encoding and decoding chararray payloads.
+ static final String UTF8 = "UTF-8";
public static Object readDatum(DataInput in) throws IOException, ExecException {
// Read the data type
@@ -102,8 +104,20 @@
return new DataByteArray(ba);
}
- case DataType.CHARARRAY:
- return in.readUTF();
+ case DataType.BIGCHARARRAY: {
+ int size = in.readInt();
+ byte[] ba = new byte[size];
+ in.readFully(ba);
+ return new String(ba, DataReaderWriter.UTF8);
+ }
+
+ case DataType.CHARARRAY: {
+ int size = in.readUnsignedShort();
+ byte[] ba = new byte[size];
+ in.readFully(ba);
+ return new String(ba, DataReaderWriter.UTF8);
+ }
+
case DataType.NULL:
return null;
@@ -186,8 +200,19 @@
}
case DataType.CHARARRAY: {
- out.writeByte(DataType.CHARARRAY);
- out.writeUTF((String)val);
+ String s = (String)val;
+ byte[] utfBytes = s.getBytes(DataReaderWriter.UTF8);
+ int length = utfBytes.length;
+
+ if(length < DataReaderWriter.UNSIGNED_SHORT_MAX) {
+ out.writeByte(DataType.CHARARRAY);
+ out.writeShort(length);
+ out.write(utfBytes);
+ } else {
+ out.writeByte(DataType.BIGCHARARRAY);
+ out.writeInt(length);
+ out.write(utfBytes);
+ }
break;
}
Modified: hadoop/pig/trunk/src/org/apache/pig/data/DataType.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/data/DataType.java?rev=739492&r1=739491&r2=739492&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/data/DataType.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/data/DataType.java Sat Jan 31 02:14:10 2009
@@ -62,6 +62,7 @@
public static final byte DOUBLE = 25;
public static final byte BYTEARRAY = 50;
public static final byte CHARARRAY = 55;
+ public static final byte BIGCHARARRAY = 60; //internal use only; for storing/loading chararray whose UTF-8 encoding is 65535 bytes or longer in BinStorage
public static final byte MAP = 100;
public static final byte TUPLE = 110;
public static final byte BAG = 120;
@@ -143,13 +144,13 @@
return types.length;
}
public static byte[] genAllTypes(){
- byte[] types = { DataType.BAG, DataType.BOOLEAN, DataType.BYTE, DataType.BYTEARRAY, DataType.CHARARRAY,
+ byte[] types = { DataType.BAG, DataType.BIGCHARARRAY, DataType.BOOLEAN, DataType.BYTE, DataType.BYTEARRAY, DataType.CHARARRAY,
DataType.DOUBLE, DataType.FLOAT, DataType.INTEGER, DataType.LONG, DataType.MAP, DataType.TUPLE};
return types;
}
private static String[] genAllTypeNames(){
- String[] names = { "BAG", "BOOLEAN", "BYTE", "BYTEARRAY", "CHARARRAY", "DOUBLE", "FLOAT", "INTEGER", "LONG",
+ String[] names = { "BAG", "BIGCHARARRAY", "BOOLEAN", "BYTE", "BYTEARRAY", "CHARARRAY", "DOUBLE", "FLOAT", "INTEGER", "LONG",
"MAP", "TUPLE" };
return names;
}
@@ -198,6 +199,7 @@
case FLOAT: return "float";
case DOUBLE: return "double";
case BYTEARRAY: return "bytearray";
+ case BIGCHARARRAY: return "bigchararray";
case CHARARRAY: return "chararray";
case MAP: return "map";
case TUPLE: return "tuple";
@@ -228,12 +230,13 @@
/**
* Determine whether this data type is atomic.
* @param dataType Data type code to test.
- * @return true if dataType is bytearray, chararray, integer, long,
+ * @return true if dataType is bytearray, bigchararray, chararray, integer, long,
* float, or boolean.
*/
public static boolean isAtomic(byte dataType) {
return ((dataType == BYTEARRAY) ||
(dataType == CHARARRAY) ||
+ (dataType == BIGCHARARRAY) ||
(dataType == INTEGER) ||
(dataType == LONG) ||
(dataType == FLOAT) ||
Modified: hadoop/pig/trunk/test/org/apache/pig/test/TestEvalPipeline.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/test/org/apache/pig/test/TestEvalPipeline.java?rev=739492&r1=739491&r2=739492&view=diff
==============================================================================
--- hadoop/pig/trunk/test/org/apache/pig/test/TestEvalPipeline.java (original)
+++ hadoop/pig/trunk/test/org/apache/pig/test/TestEvalPipeline.java Sat Jan 31 02:14:10 2009
@@ -51,6 +51,7 @@
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.util.Pair;
+import org.apache.pig.test.utils.GenRandomData;
import org.apache.pig.test.utils.Identity;
import junit.framework.TestCase;
@@ -1088,5 +1089,45 @@
Util.deleteFile(cluster, "table");
}
+ /**
+ * Round-trips chararrays through BinStorage, including values just below
+ * and above the 65535-byte unsigned-short length limit.
+ */
+ @Test
+ public void testBinStorageWithLargeStrings() throws Exception {
+ int testSize = 100;
+ String[] stringArray = new String[testSize];
+ Random random = new Random();
+ stringArray[0] = GenRandomData.genRandLargeString(random, 65534);
+ for(int i = 1; i < stringArray.length; ++i) {
+ if((i % 25) == 0) {
+ stringArray[i] = GenRandomData.genRandLargeString(random, 65535 + i);
+ } else {
+ stringArray[i] = GenRandomData.genRandString(random);
+ }
+ }
+
+ Util.createInputFile(cluster, "table", stringArray);
+ pigServer.registerQuery("a = load 'table' using PigStorage() " +
+ "as (c: chararray);");
+ String output = "/pig/out/TestEvalPipeline-testBinStorageLargeStrings";
+ pigServer.deleteFile(output);
+ pigServer.store("a", output, BinStorage.class.getName());
+
+ pigServer.registerQuery("b = load '" + output +"' using BinStorage() " +
+ "as (c:chararray);");
+ pigServer.registerQuery("c = foreach b generate c;");
+
+ Iterator<Tuple> it = pigServer.openIterator("c");
+ int counter = 0;
+ while(it.hasNext()) {
+ String resultString = (String)it.next().get(0);
+ assertTrue(stringArray[counter].equals(resultString));
+ ++counter;
+ }
+ // Without this check an empty or truncated result would pass vacuously.
+ assertTrue("expected " + testSize + " records but read " + counter, counter == testSize);
+ Util.deleteFile(cluster, "table");
+ }
}
Modified: hadoop/pig/trunk/test/org/apache/pig/test/TestPackage.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/test/org/apache/pig/test/TestPackage.java?rev=739492&r1=739491&r2=739492&view=diff
==============================================================================
--- hadoop/pig/trunk/test/org/apache/pig/test/TestPackage.java (original)
+++ hadoop/pig/trunk/test/org/apache/pig/test/TestPackage.java Sat Jan 31 02:14:10 2009
@@ -125,6 +125,14 @@
case DataType.BYTEARRAY:
runTest(GenRandomData.genRandDBA(r),inner);
break;
+ case DataType.BIGCHARARRAY: {
+ String s = GenRandomData.genRandString(r);
+ for(;s.length() < 65535;) {
+ s += GenRandomData.genRandString(r);
+ }
+ runTest(s,inner);
+ break;
+ }
case DataType.CHARARRAY:
runTest(GenRandomData.genRandString(r),inner);
break;
Modified: hadoop/pig/trunk/test/org/apache/pig/test/utils/GenRandomData.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/test/org/apache/pig/test/utils/GenRandomData.java?rev=739492&r1=739491&r2=739492&view=diff
==============================================================================
--- hadoop/pig/trunk/test/org/apache/pig/test/utils/GenRandomData.java (original)
+++ hadoop/pig/trunk/test/org/apache/pig/test/utils/GenRandomData.java Sat Jan 31 02:14:10 2009
@@ -61,6 +61,16 @@
chars[i] = (char)(r.nextInt(26)+65);
}
return new String(chars);
+ }
+
+ /**
+ * Builds a random string of {@code size} upper-case ASCII letters.
+ * Returns "RANDOM" when no generator is supplied, and delegates to
+ * genRandString(r) when size is 10 or less.
+ */
+ public static String genRandLargeString(Random r, int size){
+ if(r==null) {
+ return "RANDOM";
+ }
+ if(size <= 10) {
+ return genRandString(r);
+ }
+ StringBuilder letters = new StringBuilder(size);
+ for(int remaining = size; remaining > 0; remaining--){
+ letters.append((char)('A' + r.nextInt(26)));
+ }
+ return letters.toString();
}
public static DataByteArray genRandDBA(Random r){