Posted to commits@hive.apache.org by zs...@apache.org on 2010/01/06 02:29:42 UTC

svn commit: r896296 - in /hadoop/hive/trunk: ./ contrib/src/java/org/apache/hadoop/hive/contrib/serde2/ contrib/src/test/queries/clientpositive/ contrib/src/test/results/clientpositive/

Author: zshao
Date: Wed Jan  6 01:29:42 2010
New Revision: 896296

URL: http://svn.apache.org/viewvc?rev=896296&view=rev
Log:
HIVE-1028. Fix tinyint in TypedBytes. (Namit Jain via zshao)

Added:
    hadoop/hive/trunk/contrib/src/test/queries/clientpositive/serde_typedbytes4.q
    hadoop/hive/trunk/contrib/src/test/results/clientpositive/serde_typedbytes4.q.out
Modified:
    hadoop/hive/trunk/CHANGES.txt
    hadoop/hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/TypedBytesSerDe.java

Modified: hadoop/hive/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/CHANGES.txt?rev=896296&r1=896295&r2=896296&view=diff
==============================================================================
--- hadoop/hive/trunk/CHANGES.txt (original)
+++ hadoop/hive/trunk/CHANGES.txt Wed Jan  6 01:29:42 2010
@@ -396,6 +396,8 @@
     HIVE-1026 Fix test sample6.q to make it deterministic
     (John Sichi via namit)
 
+    HIVE-1028. Fix tinyint in TypedBytes. (Namit Jain via zshao)
+
 Release 0.4.0 -  Unreleased
 
   INCOMPATIBLE CHANGES

Modified: hadoop/hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/TypedBytesSerDe.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/TypedBytesSerDe.java?rev=896296&r1=896295&r2=896296&view=diff
==============================================================================
--- hadoop/hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/TypedBytesSerDe.java (original)
+++ hadoop/hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/TypedBytesSerDe.java Wed Jan  6 01:29:42 2010
@@ -65,30 +65,30 @@
 
 /**
  * TypedBytesSerDe uses typed bytes to serialize/deserialize.
- * 
+ *
  * More info on the typedbytes stuff that Dumbo uses.
- * http://issues.apache.org/jira/browse/HADOOP-1722 
+ * http://issues.apache.org/jira/browse/HADOOP-1722
  * A fast python decoder for this, which is apparently 25% faster than the python version is available at
- * http://github.com/klbostee/ctypedbytes/tree/master 
+ * http://github.com/klbostee/ctypedbytes/tree/master
  */
 public class TypedBytesSerDe implements SerDe {
 
   public static final Log LOG = LogFactory.getLog(TypedBytesSerDe.class.getName());
-  
+
   int numColumns;
   StructObjectInspector rowOI;
   ArrayList<Object> row;
- 
+
   BytesWritable serializeBytesWritable;
   NonSyncDataOutputBuffer barrStr;
   TypedBytesWritableOutput tbOut;
-  
+
   NonSyncDataInputBuffer inBarrStr;
   TypedBytesWritableInput tbIn;
-  
+
   List<String>   columnNames;
   List<TypeInfo> columnTypes;
-  
+
   @Override
   public void initialize(Configuration conf, Properties tbl)
       throws SerDeException {
@@ -97,10 +97,10 @@
     serializeBytesWritable = new BytesWritable();
     barrStr = new NonSyncDataOutputBuffer();
     tbOut = new TypedBytesWritableOutput(barrStr);
-    
+
     inBarrStr = new NonSyncDataInputBuffer();
     tbIn = new TypedBytesWritableInput(inBarrStr);
-    
+
     // Read the configuration parameters
     String columnNameProperty = tbl.getProperty(Constants.LIST_COLUMNS);
     String columnTypeProperty = tbl.getProperty(Constants.LIST_COLUMN_TYPES);
@@ -115,29 +115,29 @@
     }
 
     assert columnNames.size() == columnTypes.size();
-    numColumns = columnNames.size(); 
-    
+    numColumns = columnNames.size();
+
     // All columns have to be primitive.
     for (int c = 0; c < numColumns; c++) {
       if (columnTypes.get(c).getCategory() != Category.PRIMITIVE) {
-        throw new SerDeException(getClass().getName() 
-            + " only accepts primitive columns, but column[" + c 
+        throw new SerDeException(getClass().getName()
+            + " only accepts primitive columns, but column[" + c
             + "] named " + columnNames.get(c) + " has category "
             + columnTypes.get(c).getCategory());
       }
     }
-    
+
     // Constructing the row ObjectInspector:
-    // The row consists of some string columns, each column will be a java 
+    // The row consists of some string columns, each column will be a java
     // String object.
     List<ObjectInspector> columnOIs = new ArrayList<ObjectInspector>(columnNames.size());
     for (int c = 0; c < numColumns; c++) {
       columnOIs.add(TypeInfoUtils.getStandardWritableObjectInspectorFromTypeInfo(columnTypes.get(c)));
     }
-    
-    // StandardStruct uses ArrayList to store the row. 
+
+    // StandardStruct uses ArrayList to store the row.
     rowOI = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, columnOIs);
-    
+
     // Constructing the row object, etc, which will be reused for all rows.
     row = new ArrayList<Object>(numColumns);
     for (int c = 0; c < numColumns; c++) {
@@ -154,12 +154,12 @@
   public Class<? extends Writable> getSerializedClass() {
     return BytesWritable.class;
   }
-  
+
   @Override
   public Object deserialize(Writable blob) throws SerDeException {
 
     BytesWritable data = (BytesWritable)blob;
-    inBarrStr.reset(data.get(), 0, data.getSize());   
+    inBarrStr.reset(data.get(), 0, data.getSize());
 
     try {
 
@@ -169,11 +169,11 @@
 
       // The next byte should be the marker
       assert tbIn.readTypeCode() == Type.ENDOFRECORD;
-      
+
     } catch (IOException e) {
       throw new SerDeException(e);
     }
-    
+
     return row;
   }
 
@@ -237,9 +237,9 @@
         }
       }
       // Currently, deserialization of complex types is not supported
-      case LIST: 
+      case LIST:
       case MAP:
-      case STRUCT: 
+      case STRUCT:
       default: {
         throw new RuntimeException("Unsupported category: " + type.getCategory());
       }
@@ -247,7 +247,7 @@
   }
 
 
-  
+
   @Override
   public Writable serialize(Object obj, ObjectInspector objInspector)
       throws SerDeException {
@@ -255,25 +255,23 @@
       barrStr.reset();
       StructObjectInspector soi = (StructObjectInspector)objInspector;
       List<? extends StructField> fields = soi.getAllStructFieldRefs();
-    
+
       for (int i = 0; i < numColumns; i++) {
         Object o = soi.getStructFieldData(obj, fields.get(i));
-        ObjectInspector oi = fields.get(i).getFieldObjectInspector(); 
+        ObjectInspector oi = fields.get(i).getFieldObjectInspector();
         serializeField(o, oi, row.get(i));
       }
-    
+
       // End of the record is part of the data
       tbOut.writeEndOfRecord();
-      
+
       serializeBytesWritable.set(barrStr.getData(), 0, barrStr.getLength());
     } catch (IOException e) {
       throw new SerDeException(e.getMessage());
     }
     return serializeBytesWritable;
   }
-   
-  private byte[] tmpByteArr = new byte[1];
-  
+
   private void serializeField(Object o, ObjectInspector oi, Object reuse) throws IOException {
     switch (oi.getCategory()) {
       case PRIMITIVE: {
@@ -291,9 +289,8 @@
           }
           case BYTE: {
             ByteObjectInspector boi = (ByteObjectInspector)poi;
-            BytesWritable r = reuse == null ? new BytesWritable() : (BytesWritable)reuse;
-            tmpByteArr[0] = boi.get(o);
-            r.set(tmpByteArr, 0, 1);
+            ByteWritable r = reuse == null ? new ByteWritable() : (ByteWritable)reuse;
+            r.set(boi.get(o));
             tbOut.write(r);
             return;
           }
@@ -343,13 +340,13 @@
           }
         }
       }
-      case LIST: 
+      case LIST:
       case MAP:
       case STRUCT: {
         // For complex object, serialize to JSON format
         String s = SerDeUtils.getJSONString(o, oi);
         Text t = reuse == null ? new Text() : (Text)reuse;
-        
+
         // convert to Text and write it
         t.set(s);
         tbOut.write(t);
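
The substantive change above is in serializeField(): a tinyint (BYTE) column is now
written through a ByteWritable instead of being packed into a one-element
BytesWritable, so the TypedBytes stream carries the tinyint type code rather than a
raw byte-array record. Below is a minimal sketch of the fixed path, assuming Hive's
org.apache.hadoop.hive.serde2.io.ByteWritable and the contrib
TypedBytesWritableOutput exactly as they are used in the diff; the class and method
names in the sketch are illustrative only and not part of the commit:

    import java.io.IOException;

    import org.apache.hadoop.hive.contrib.util.typedbytes.TypedBytesWritableOutput;
    import org.apache.hadoop.hive.serde2.io.ByteWritable;

    public class TinyintSketch {
      // Write one tinyint value through TypedBytes. Reusing a ByteWritable
      // (instead of a one-element BytesWritable, as before this commit) lets
      // TypedBytesWritableOutput emit the BYTE type code, so the value
      // round-trips as a tinyint rather than as opaque binary data.
      static void writeTinyint(TypedBytesWritableOutput tbOut, byte value,
          Object reuse) throws IOException {
        ByteWritable r = reuse == null ? new ByteWritable() : (ByteWritable) reuse;
        r.set(value);
        tbOut.write(r);
      }
    }

The new serde_typedbytes4.q test added below exercises exactly this path by casting
src.key to tinyint before streaming it through /bin/cat with TypedBytesSerDe.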

Added: hadoop/hive/trunk/contrib/src/test/queries/clientpositive/serde_typedbytes4.q
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/contrib/src/test/queries/clientpositive/serde_typedbytes4.q?rev=896296&view=auto
==============================================================================
--- hadoop/hive/trunk/contrib/src/test/queries/clientpositive/serde_typedbytes4.q (added)
+++ hadoop/hive/trunk/contrib/src/test/queries/clientpositive/serde_typedbytes4.q Wed Jan  6 01:29:42 2010
@@ -0,0 +1,31 @@
+add jar ../build/contrib/hive_contrib.jar;
+
+drop table dest1;
+CREATE TABLE dest1(key STRING, value STRING) STORED AS TEXTFILE;
+
+EXPLAIN
+FROM (
+  FROM src
+  SELECT TRANSFORM(cast(src.key as tinyint), src.value) ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.TypedBytesSerDe'
+  RECORDWRITER 'org.apache.hadoop.hive.contrib.util.typedbytes.TypedBytesRecordWriter'
+  USING '/bin/cat'
+  AS (tkey, tvalue) ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.TypedBytesSerDe'
+  RECORDREADER 'org.apache.hadoop.hive.contrib.util.typedbytes.TypedBytesRecordReader'
+  WHERE key < 100
+) tmap
+INSERT OVERWRITE TABLE dest1 SELECT tkey, tvalue ORDER by tkey, tvalue;
+
+FROM (
+  FROM src
+  SELECT TRANSFORM(cast(src.key as tinyint), src.value) ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.TypedBytesSerDe'
+  RECORDWRITER 'org.apache.hadoop.hive.contrib.util.typedbytes.TypedBytesRecordWriter'
+  USING '/bin/cat'
+  AS (tkey, tvalue) ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.TypedBytesSerDe'
+  RECORDREADER 'org.apache.hadoop.hive.contrib.util.typedbytes.TypedBytesRecordReader'
+  WHERE key < 100
+) tmap
+INSERT OVERWRITE TABLE dest1 SELECT tkey, tvalue ORDER by tkey, tvalue;
+
+SELECT dest1.* FROM dest1;
+
+drop table dest1;

Added: hadoop/hive/trunk/contrib/src/test/results/clientpositive/serde_typedbytes4.q.out
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/contrib/src/test/results/clientpositive/serde_typedbytes4.q.out?rev=896296&view=auto
==============================================================================
--- hadoop/hive/trunk/contrib/src/test/results/clientpositive/serde_typedbytes4.q.out (added)
+++ hadoop/hive/trunk/contrib/src/test/results/clientpositive/serde_typedbytes4.q.out Wed Jan  6 01:29:42 2010
@@ -0,0 +1,232 @@
+PREHOOK: query: drop table dest1
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table dest1
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE dest1(key STRING, value STRING) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE dest1(key STRING, value STRING) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@dest1
+PREHOOK: query: EXPLAIN
+FROM (
+  FROM src
+  SELECT TRANSFORM(cast(src.key as tinyint), src.value) ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.TypedBytesSerDe'
+  RECORDWRITER 'org.apache.hadoop.hive.contrib.util.typedbytes.TypedBytesRecordWriter'
+  USING '/bin/cat'
+  AS (tkey, tvalue) ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.TypedBytesSerDe'
+  RECORDREADER 'org.apache.hadoop.hive.contrib.util.typedbytes.TypedBytesRecordReader'
+  WHERE key < 100
+) tmap
+INSERT OVERWRITE TABLE dest1 SELECT tkey, tvalue ORDER by tkey, tvalue
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN
+FROM (
+  FROM src
+  SELECT TRANSFORM(cast(src.key as tinyint), src.value) ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.TypedBytesSerDe'
+  RECORDWRITER 'org.apache.hadoop.hive.contrib.util.typedbytes.TypedBytesRecordWriter'
+  USING '/bin/cat'
+  AS (tkey, tvalue) ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.TypedBytesSerDe'
+  RECORDREADER 'org.apache.hadoop.hive.contrib.util.typedbytes.TypedBytesRecordReader'
+  WHERE key < 100
+) tmap
+INSERT OVERWRITE TABLE dest1 SELECT tkey, tvalue ORDER by tkey, tvalue
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF src)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TRANSFORM (TOK_EXPLIST (TOK_FUNCTION TOK_TINYINT (. (TOK_TABLE_OR_COL src) key)) (. (TOK_TABLE_OR_COL src) value)) (TOK_SERDE (TOK_SERDENAME 'org.apache.hadoop.hive.contrib.serde2.TypedBytesSerDe')) (TOK_RECORDWRITER 'org.apache.hadoop.hive.contrib.util.typedbytes.TypedBytesRecordWriter') '/bin/cat' (TOK_SERDE (TOK_SERDENAME 'org.apache.hadoop.hive.contrib.serde2.TypedBytesSerDe')) (TOK_RECORDREADER 'org.apache.hadoop.hive.contrib.util.typedbytes.TypedBytesRecordReader') (TOK_ALIASLIST tkey tvalue)))) (TOK_WHERE (< (TOK_TABLE_OR_COL key) 100)))) tmap)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB dest1)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL tkey)) (TOK_SELEXPR (TOK_TABLE_OR_COL tvalue))) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL tkey)) (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL tvalue)))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        tmap:src 
+          TableScan
+            alias: src
+            Filter Operator
+              predicate:
+                  expr: (key < 100)
+                  type: boolean
+              Filter Operator
+                predicate:
+                    expr: (key < 100)
+                    type: boolean
+                Select Operator
+                  expressions:
+                        expr: UDFToByte(key)
+                        type: tinyint
+                        expr: value
+                        type: string
+                  outputColumnNames: _col0, _col1
+                  Transform Operator
+                    command: /bin/cat
+                    output info:
+                        input format: org.apache.hadoop.mapred.TextInputFormat
+                        output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                    Select Operator
+                      expressions:
+                            expr: _col0
+                            type: string
+                            expr: _col1
+                            type: string
+                      outputColumnNames: _col0, _col1
+                      Reduce Output Operator
+                        key expressions:
+                              expr: _col0
+                              type: string
+                              expr: _col1
+                              type: string
+                        sort order: ++
+                        tag: -1
+                        value expressions:
+                              expr: _col0
+                              type: string
+                              expr: _col1
+                              type: string
+      Reduce Operator Tree:
+        Extract
+          File Output Operator
+            compressed: false
+            GlobalTableId: 1
+            table:
+                input format: org.apache.hadoop.mapred.TextInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                name: dest1
+
+  Stage: Stage-0
+    Move Operator
+      tables:
+          replace: true
+          table:
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: dest1
+
+
+PREHOOK: query: FROM (
+  FROM src
+  SELECT TRANSFORM(cast(src.key as tinyint), src.value) ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.TypedBytesSerDe'
+  RECORDWRITER 'org.apache.hadoop.hive.contrib.util.typedbytes.TypedBytesRecordWriter'
+  USING '/bin/cat'
+  AS (tkey, tvalue) ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.TypedBytesSerDe'
+  RECORDREADER 'org.apache.hadoop.hive.contrib.util.typedbytes.TypedBytesRecordReader'
+  WHERE key < 100
+) tmap
+INSERT OVERWRITE TABLE dest1 SELECT tkey, tvalue ORDER by tkey, tvalue
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@dest1
+POSTHOOK: query: FROM (
+  FROM src
+  SELECT TRANSFORM(cast(src.key as tinyint), src.value) ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.TypedBytesSerDe'
+  RECORDWRITER 'org.apache.hadoop.hive.contrib.util.typedbytes.TypedBytesRecordWriter'
+  USING '/bin/cat'
+  AS (tkey, tvalue) ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.TypedBytesSerDe'
+  RECORDREADER 'org.apache.hadoop.hive.contrib.util.typedbytes.TypedBytesRecordReader'
+  WHERE key < 100
+) tmap
+INSERT OVERWRITE TABLE dest1 SELECT tkey, tvalue ORDER by tkey, tvalue
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@dest1
+PREHOOK: query: SELECT dest1.* FROM dest1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dest1
+PREHOOK: Output: file:/data/users/njain/hive1/hive1/build/ql/tmp/1776839815/10000
+POSTHOOK: query: SELECT dest1.* FROM dest1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dest1
+POSTHOOK: Output: file:/data/users/njain/hive1/hive1/build/ql/tmp/1776839815/10000
+0	val_0
+0	val_0
+0	val_0
+10	val_10
+11	val_11
+12	val_12
+12	val_12
+15	val_15
+15	val_15
+17	val_17
+18	val_18
+18	val_18
+19	val_19
+2	val_2
+20	val_20
+24	val_24
+24	val_24
+26	val_26
+26	val_26
+27	val_27
+28	val_28
+30	val_30
+33	val_33
+34	val_34
+35	val_35
+35	val_35
+35	val_35
+37	val_37
+37	val_37
+4	val_4
+41	val_41
+42	val_42
+42	val_42
+43	val_43
+44	val_44
+47	val_47
+5	val_5
+5	val_5
+5	val_5
+51	val_51
+51	val_51
+53	val_53
+54	val_54
+57	val_57
+58	val_58
+58	val_58
+64	val_64
+65	val_65
+66	val_66
+67	val_67
+67	val_67
+69	val_69
+70	val_70
+70	val_70
+70	val_70
+72	val_72
+72	val_72
+74	val_74
+76	val_76
+76	val_76
+77	val_77
+78	val_78
+8	val_8
+80	val_80
+82	val_82
+83	val_83
+83	val_83
+84	val_84
+84	val_84
+85	val_85
+86	val_86
+87	val_87
+9	val_9
+90	val_90
+90	val_90
+90	val_90
+92	val_92
+95	val_95
+95	val_95
+96	val_96
+97	val_97
+97	val_97
+98	val_98
+98	val_98
+PREHOOK: query: drop table dest1
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table dest1
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Output: default@dest1