Posted to commits@hive.apache.org by na...@apache.org on 2009/11/04 02:38:14 UTC

svn commit: r832645 - in /hadoop/hive/trunk: ./ ql/src/java/org/apache/hadoop/hive/ql/io/ ql/src/test/org/apache/hadoop/hive/ql/io/ ql/src/test/queries/clientpositive/ ql/src/test/results/clientpositive/ serde/src/java/org/apache/hadoop/hive/serde2/ se...

Author: namit
Date: Wed Nov  4 01:38:12 2009
New Revision: 832645

URL: http://svn.apache.org/viewvc?rev=832645&view=rev
Log:
HIVE-910. NULL value is not correctly handled by ColumnarStruct
(He Yongqiang via namit)

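For readers scanning this commit: the column-projection helpers that move into the serde2 package below are all driven by a single conf key, hive.io.file.readcolumn.ids. A minimal usage sketch follows — the ColumnProjectionUtils calls are the ones added in this commit, while the wrapper class and driver code are hypothetical, for illustration only:

    import java.util.ArrayList;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;

    public class ProjectionExample {
      public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Ask readers (e.g. RCFile.Reader) to materialize only columns 0 and 2;
        // every other column may be skipped by the reader.
        ArrayList<Integer> ids = new ArrayList<Integer>();
        ids.add(0);
        ids.add(2);
        ColumnProjectionUtils.setReadColumnIDs(conf, ids);

        // Readers recover the projection from the same conf key
        // ("hive.io.file.readcolumn.ids").
        ArrayList<Integer> projected = ColumnProjectionUtils.getReadColumnIDs(conf);
        System.out.println(projected);   // prints [0, 2]

        // Clearing the key means "read every column".
        ColumnProjectionUtils.setFullyReadColumns(conf);
      }
    }
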

Added:
    hadoop/hive/trunk/ql/src/test/queries/clientpositive/rcfile_null_value.q
    hadoop/hive/trunk/ql/src/test/results/clientpositive/rcfile_null_value.q.out
    hadoop/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/ColumnProjectionUtils.java
Modified:
    hadoop/hive/trunk/CHANGES.txt
    hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java
    hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java
    hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/RCFile.java
    hadoop/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/PerformTestRCFileAndSeqFile.java
    hadoop/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/TestRCFile.java
    hadoop/hive/trunk/ql/src/test/results/clientpositive/columnarserde_create_shortcut.q.out
    hadoop/hive/trunk/ql/src/test/results/clientpositive/input_columnarserde.q.out
    hadoop/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarSerDe.java
    hadoop/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarStruct.java

Modified: hadoop/hive/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/CHANGES.txt?rev=832645&r1=832644&r2=832645&view=diff
==============================================================================
--- hadoop/hive/trunk/CHANGES.txt (original)
+++ hadoop/hive/trunk/CHANGES.txt Wed Nov  4 01:38:12 2009
@@ -242,6 +242,9 @@
     HIVE-907. NullPointerException in ErrorMsg.findSQLState
     (Bill Graham via namit)
 
+    HIVE-910. NULL value is not correctly handled by ColumnarStruct
+    (He Yongqiang via namit)
+
 Release 0.4.0 -  Unreleased
 
   INCOMPATIBLE CHANGES

Modified: hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java?rev=832645&r1=832644&r2=832645&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java (original)
+++ hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java Wed Nov  4 01:38:12 2009
@@ -187,76 +187,4 @@
     }
     return true;
   }
-  
-  
-  public static String READ_COLUMN_IDS_CONF_STR = "hive.io.file.readcolumn.ids";
-
-  /**
-   * Sets read columns' ids(start from zero) for RCFile's Reader. Once a column
-   * is included in the list, RCFile's reader will not skip its value.
-   * 
-   */
-  public static void setReadColumnIDs(Configuration conf, ArrayList<Integer> ids) {
-    String id = toReadColumnIDString(ids);
-    setReadColumnIDConf(conf, id);
-  }
-
-  /**
-   * Sets read columns' ids(start from zero) for RCFile's Reader. Once a column
-   * is included in the list, RCFile's reader will not skip its value.
-   * 
-   */
-  public static void appendReadColumnIDs(Configuration conf, ArrayList<Integer> ids) {
-    String id = toReadColumnIDString(ids);
-    String old = conf.get(READ_COLUMN_IDS_CONF_STR, null);
-    String newConfStr = id;
-    if(old !=null )
-      newConfStr = newConfStr + StringUtils.COMMA_STR + old;
-    
-    setReadColumnIDConf(conf, newConfStr);
-  }
-  
-  private static void setReadColumnIDConf(Configuration conf, String id) {
-    if (id == null || id.length() <= 0) {
-      conf.set(READ_COLUMN_IDS_CONF_STR, "");
-      return;
-    }
-
-    conf.set(READ_COLUMN_IDS_CONF_STR, id);
-  }
-
-  private static String toReadColumnIDString(ArrayList<Integer> ids) {
-    String id = null;
-    if (ids != null) {
-      for (int i = 0; i < ids.size(); i++) {
-        if (i == 0) {
-          id = "" + ids.get(i);
-        } else {
-          id = id + StringUtils.COMMA_STR + ids.get(i);
-        }
-      }
-    }
-    return id;
-  }
-
-  /**
-   * Returns an array of column ids(start from zero) which is set in the given
-   * parameter <tt>conf</tt>.
-   */
-  public static ArrayList<Integer> getReadColumnIDs(Configuration conf) {
-    String skips = conf.get(READ_COLUMN_IDS_CONF_STR, "");
-    String[] list = StringUtils.split(skips);
-    ArrayList<Integer> result = new ArrayList<Integer>(list.length);
-    for (int i = 0; i < list.length; i++) {
-      result.add(Integer.parseInt(list[i]));
-    }
-    return result;
-  }
-
-  /**
-   * Clears the read column ids set in the conf, and will read all columns.
-   */
-  public static void setFullyReadColumns(Configuration conf) {
-    conf.set(READ_COLUMN_IDS_CONF_STR, "");
-  }
 }

Modified: hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java?rev=832645&r1=832644&r2=832645&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java (original)
+++ hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java Wed Nov  4 01:38:12 2009
@@ -55,6 +55,8 @@
 import org.apache.hadoop.mapred.FileSplit;
 import org.apache.hadoop.util.ReflectionUtils;
 
+import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
+
 /**
  * HiveInputFormat is a parameterized InputFormat which looks at the path name and determine
  * the correct InputFormat for that path name from mapredPlan.pathToPartitionInfo().
@@ -201,12 +203,14 @@
       throw new IOException("cannot find class " + inputFormatClassName);
     }
 
-    initColumnsNeeded(job, inputFormatClass, hsplit.getPath().toString(), 
+    //clone a jobConf for setting needed columns for reading
+    JobConf cloneJobConf = new JobConf(job);
+    initColumnsNeeded(cloneJobConf, inputFormatClass, hsplit.getPath().toString(), 
                       hsplit.getPath().toUri().getPath());
 
-    InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
+    InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, cloneJobConf);
     return new HiveRecordReader(inputFormat.getRecordReader(inputSplit,
-        job, reporter));
+    		cloneJobConf, reporter));
   }
 
   private Map<String, partitionDesc> pathToPartitionInfo;
@@ -311,9 +315,9 @@
         TableScanOperator tableScan = (TableScanOperator) op;
         ArrayList<Integer> list = tableScan.getNeededColumnIDs();
         if (list != null)
-          HiveFileFormatUtils.appendReadColumnIDs(jobConf, list);
+        	ColumnProjectionUtils.appendReadColumnIDs(jobConf, list);
         else
-          HiveFileFormatUtils.setFullyReadColumns(jobConf);
+        	ColumnProjectionUtils.setFullyReadColumns(jobConf);
       }
     }
   }

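The change above applies the projection to a clone of the incoming JobConf rather than to the shared one, so column settings made while opening one split cannot leak into readers of other splits. A small, self-contained sketch of that isolation property (the driver class is hypothetical; JobConf and ColumnProjectionUtils behave as in the code above):

    import java.util.ArrayList;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;

    public class CloneConfExample {
      public static void main(String[] args) {
        JobConf job = new JobConf();

        // Per-split copy: the projection is applied to the clone only.
        JobConf cloneJobConf = new JobConf(job);
        ArrayList<Integer> needed = new ArrayList<Integer>();
        needed.add(1);
        ColumnProjectionUtils.appendReadColumnIDs(cloneJobConf, needed);

        // The original job conf is untouched, so a split handled by a
        // different input format is not affected by this projection.
        System.out.println(job.get("hive.io.file.readcolumn.ids"));          // null
        System.out.println(cloneJobConf.get("hive.io.file.readcolumn.ids")); // "1"
      }
    }
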
Modified: hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/RCFile.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/RCFile.java?rev=832645&r1=832644&r2=832645&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/RCFile.java (original)
+++ hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/RCFile.java Wed Nov  4 01:38:12 2009
@@ -38,6 +38,7 @@
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.ql.io.CodecPool;
+import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
 import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
 import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
 import org.apache.hadoop.hive.serde2.columnar.LazyDecompressionCallback;
@@ -922,7 +923,7 @@
       columnNumber = Integer.parseInt(metadata.get(
           new Text(COLUMN_NUMBER_METADATA_STR)).toString());
 
-      java.util.ArrayList<Integer> notSkipIDs = HiveFileFormatUtils.getReadColumnIDs(conf);
+      java.util.ArrayList<Integer> notSkipIDs = ColumnProjectionUtils.getReadColumnIDs(conf);
       skippedColIDs = new boolean[columnNumber];
       if (notSkipIDs.size() > 0) {
         for (int i = 0; i < skippedColIDs.length; i++) {
@@ -1303,7 +1304,9 @@
 
       if (!currentValue.inited) {
         currentValueBuffer();
-        ret.resetValid(columnNumber); // do this only when not intialized 
+				// do this only when not initialized, but we may need to find a way to
+				// tell the caller how to initialize the valid size
+        ret.resetValid(columnNumber); 
       }
 
       // we do not use BytesWritable here to avoid the byte-copy from

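RCFile.Reader now derives its per-column skip mask from ColumnProjectionUtils.getReadColumnIDs(conf). The hunk above only shows the start of that loop, so the following is a hedged sketch of the mechanics rather than the actual reader code (class and method names are hypothetical):

    import java.util.ArrayList;
    import java.util.Arrays;

    public class SkipMaskSketch {
      // Sketch: turn the projected column ids into a per-column skip mask,
      // in the spirit of the RCFile.Reader hunk above (the real loop body is
      // not fully visible in this diff, so the details here are assumptions).
      static boolean[] toSkipMask(int columnNumber, ArrayList<Integer> notSkipIDs) {
        boolean[] skipped = new boolean[columnNumber];
        if (notSkipIDs.size() > 0) {
          for (int i = 0; i < skipped.length; i++) {
            skipped[i] = true;                 // skip by default ...
          }
          for (Integer id : notSkipIDs) {
            if (id >= 0 && id < columnNumber) {
              skipped[id] = false;             // ... except projected columns
            }
          }
        }                                      // empty list => read everything
        return skipped;
      }

      public static void main(String[] args) {
        ArrayList<Integer> ids = new ArrayList<Integer>();
        ids.add(0);
        ids.add(2);
        System.out.println(Arrays.toString(toSkipMask(4, ids)));
        // prints [false, true, false, true]
      }
    }
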
Modified: hadoop/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/PerformTestRCFileAndSeqFile.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/PerformTestRCFileAndSeqFile.java?rev=832645&r1=832644&r2=832645&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/PerformTestRCFileAndSeqFile.java (original)
+++ hadoop/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/PerformTestRCFileAndSeqFile.java Wed Nov  4 01:38:12 2009
@@ -8,6 +8,7 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
 import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
 import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
 import org.apache.hadoop.hive.serde2.io.ByteWritable;
@@ -276,7 +277,7 @@
 
     java.util.ArrayList<Integer> readCols = new java.util.ArrayList<Integer>();
     readCols.add(Integer.valueOf(0));
-    HiveFileFormatUtils.setReadColumnIDs(conf, readCols);
+    ColumnProjectionUtils.setReadColumnIDs(conf, readCols);
     RCFile.Reader reader = new RCFile.Reader(fs, file, conf);
 
     LongWritable rowID = new LongWritable();
@@ -310,7 +311,7 @@
     java.util.ArrayList<Integer> readCols = new java.util.ArrayList<Integer>();
     readCols.add(Integer.valueOf(0));
     readCols.add(Integer.valueOf(allColumnsNumber - 1));
-    HiveFileFormatUtils.setReadColumnIDs(conf, readCols);
+    ColumnProjectionUtils.setReadColumnIDs(conf, readCols);
     RCFile.Reader reader = new RCFile.Reader(fs, file, conf);
 
     LongWritable rowID = new LongWritable();
@@ -344,7 +345,7 @@
 
     int actualReadCount = 0;
 
-    HiveFileFormatUtils.setFullyReadColumns(conf);
+    ColumnProjectionUtils.setFullyReadColumns(conf);
     RCFile.Reader reader = new RCFile.Reader(fs, file, conf);
 
     LongWritable rowID = new LongWritable();

Modified: hadoop/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/TestRCFile.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/TestRCFile.java?rev=832645&r1=832644&r2=832645&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/TestRCFile.java (original)
+++ hadoop/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/TestRCFile.java Wed Nov  4 01:38:12 2009
@@ -32,6 +32,7 @@
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.serde.Constants;
 import org.apache.hadoop.hive.serde2.SerDeException;
+import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
 import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
 import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
 import org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe;
@@ -96,7 +97,7 @@
       bytesArray = new byte[][] { "123".getBytes("UTF-8"),
           "456".getBytes("UTF-8"), "789".getBytes("UTF-8"),
           "1000".getBytes("UTF-8"), "5.3".getBytes("UTF-8"),
-          "hive and hadoop".getBytes("UTF-8"), new byte[0], new byte[0] };
+          "hive and hadoop".getBytes("UTF-8"), new byte[0], "NULL".getBytes("UTF-8") };
       s = new BytesRefArrayWritable(bytesArray.length);
       s.set(0, new BytesRefWritable("123".getBytes("UTF-8")));
       s.set(1, new BytesRefWritable("456".getBytes("UTF-8")));
@@ -127,11 +128,11 @@
     byte[][] record_1 = { "123".getBytes("UTF-8"), "456".getBytes("UTF-8"),
         "789".getBytes("UTF-8"), "1000".getBytes("UTF-8"),
         "5.3".getBytes("UTF-8"), "hive and hadoop".getBytes("UTF-8"),
-        new byte[0], new byte[0] };
+        new byte[0], "NULL".getBytes("UTF-8") };
     byte[][] record_2 = { "100".getBytes("UTF-8"), "200".getBytes("UTF-8"),
         "123".getBytes("UTF-8"), "1000".getBytes("UTF-8"),
         "5.3".getBytes("UTF-8"), "hive and hadoop".getBytes("UTF-8"),
-        new byte[0], new byte[0] };
+        new byte[0], "NULL".getBytes("UTF-8") };
 
     RCFileOutputFormat.setColumnNumber(conf, expectedFieldsData.length);
     RCFile.Writer writer = new RCFile.Writer(fs, conf, file, null,
@@ -170,6 +171,7 @@
       reader.next(rowID);
       BytesRefArrayWritable cols = new BytesRefArrayWritable();
       reader.getCurrentRow(cols);
+      cols.resetValid(8);
       Object row = serDe.deserialize(cols);
 
       StructObjectInspector oi = (StructObjectInspector) serDe
@@ -288,7 +290,7 @@
       throws IOException, SerDeException {
     LOG.debug("reading " + count + " records");
     long start = System.currentTimeMillis();
-    HiveFileFormatUtils.setFullyReadColumns(conf);
+    ColumnProjectionUtils.setFullyReadColumns(conf);
     RCFile.Reader reader = new RCFile.Reader(fs, file, conf);
 
     LongWritable rowID = new LongWritable();
@@ -296,6 +298,7 @@
     BytesRefArrayWritable cols = new BytesRefArrayWritable();
     while (reader.next(rowID)) {
       reader.getCurrentRow(cols);
+      cols.resetValid(8);
       Object row = serDe.deserialize(cols);
 
       StructObjectInspector oi = (StructObjectInspector) serDe
@@ -329,7 +332,7 @@
     java.util.ArrayList<Integer> readCols = new java.util.ArrayList<Integer>();
     readCols.add(Integer.valueOf(2));
     readCols.add(Integer.valueOf(3));
-    HiveFileFormatUtils.setReadColumnIDs(conf, readCols);
+    ColumnProjectionUtils.setReadColumnIDs(conf, readCols);
     RCFile.Reader reader = new RCFile.Reader(fs, file, conf);
 
     LongWritable rowID = new LongWritable();
@@ -337,6 +340,7 @@
     
     while (reader.next(rowID)) {
       reader.getCurrentRow(cols);
+      cols.resetValid(8);
       Object row = serDe.deserialize(cols);
 
       StructObjectInspector oi = (StructObjectInspector) serDe

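The tests above now call cols.resetValid(8) between getCurrentRow() and deserialize(), so the SerDe sees all eight columns of the test schema even when the reader only materializes a projected subset (the RCFile hunk notes that the reader itself resets the valid size only on the first row). A hedged sketch of that read pattern — the wrapper class and method are hypothetical; the reader and SerDe calls are the ones used in the tests:

    import java.io.IOException;
    import org.apache.hadoop.hive.ql.io.RCFile;
    import org.apache.hadoop.hive.serde2.SerDe;
    import org.apache.hadoop.hive.serde2.SerDeException;
    import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
    import org.apache.hadoop.io.LongWritable;

    public class ReadLoopSketch {
      static void readAll(RCFile.Reader reader, SerDe serDe)
          throws IOException, SerDeException {
        LongWritable rowID = new LongWritable();
        BytesRefArrayWritable cols = new BytesRefArrayWritable();
        while (reader.next(rowID)) {
          reader.getCurrentRow(cols);
          cols.resetValid(8);            // column count of the file being read
          Object row = serDe.deserialize(cols);
          // ... inspect 'row' via serDe.getObjectInspector(), as the tests do ...
        }
      }
    }
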
Added: hadoop/hive/trunk/ql/src/test/queries/clientpositive/rcfile_null_value.q
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/queries/clientpositive/rcfile_null_value.q?rev=832645&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/queries/clientpositive/rcfile_null_value.q (added)
+++ hadoop/hive/trunk/ql/src/test/queries/clientpositive/rcfile_null_value.q Wed Nov  4 01:38:12 2009
@@ -0,0 +1,40 @@
+CREATE TABLE src1_rc(key STRING, value STRING) STORED AS RCFILE;
+INSERT OVERWRITE TABLE src1_rc SELECT * FROM src1;
+SELECT * FROM src1_rc;
+DROP TABLE src1_rc;
+
+CREATE TABLE dest1_rc(c1 INT, c2 STRING, c3 INT, c4 STRING) STORED AS RCFILE;
+
+EXPLAIN
+FROM (
+ FROM 
+  (
+  FROM src src1 SELECT src1.key AS c1, src1.value AS c2 WHERE src1.key > 10 and src1.key < 20
+  ) a
+ RIGHT OUTER JOIN 
+ (
+  FROM src src2 SELECT src2.key AS c3, src2.value AS c4 WHERE src2.key > 15 and src2.key < 25
+ ) b 
+ ON (a.c1 = b.c3)
+ SELECT a.c1 AS c1, a.c2 AS c2, b.c3 AS c3, b.c4 AS c4
+) c
+INSERT OVERWRITE TABLE dest1_rc SELECT c.c1, c.c2, c.c3, c.c4;
+
+FROM (
+ FROM 
+  (
+  FROM src src1 SELECT src1.key AS c1, src1.value AS c2 WHERE src1.key > 10 and src1.key < 20
+  ) a
+ RIGHT OUTER JOIN 
+ (
+  FROM src src2 SELECT src2.key AS c3, src2.value AS c4 WHERE src2.key > 15 and src2.key < 25
+ ) b 
+ ON (a.c1 = b.c3)
+ SELECT a.c1 AS c1, a.c2 AS c2, b.c3 AS c3, b.c4 AS c4
+) c
+INSERT OVERWRITE TABLE dest1_rc SELECT c.c1, c.c2, c.c3, c.c4;
+
+SELECT dest1_rc.* FROM dest1_rc;
+
+DROP TABLE dest1_rc;
+

Modified: hadoop/hive/trunk/ql/src/test/results/clientpositive/columnarserde_create_shortcut.q.out
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/results/clientpositive/columnarserde_create_shortcut.q.out?rev=832645&r1=832644&r2=832645&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/test/results/clientpositive/columnarserde_create_shortcut.q.out (original)
+++ hadoop/hive/trunk/ql/src/test/results/clientpositive/columnarserde_create_shortcut.q.out Wed Nov  4 01:38:12 2009
@@ -94,11 +94,11 @@
 PREHOOK: query: SELECT columnarserde_create_shortcut.* FROM columnarserde_create_shortcut DISTRIBUTE BY 1
 PREHOOK: type: QUERY
 PREHOOK: Input: default@columnarserde_create_shortcut
-PREHOOK: Output: file:/data/users/njain/hive5/hive5/build/ql/tmp/1010466676/10000
+PREHOOK: Output: file:/data/users/heyongqiang/trunk/VENDOR.hive/trunk/.ptest_1/build/ql/tmp/1405876765/10000
 POSTHOOK: query: SELECT columnarserde_create_shortcut.* FROM columnarserde_create_shortcut DISTRIBUTE BY 1
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@columnarserde_create_shortcut
-POSTHOOK: Output: file:/data/users/njain/hive5/hive5/build/ql/tmp/1010466676/10000
+POSTHOOK: Output: file:/data/users/heyongqiang/trunk/VENDOR.hive/trunk/.ptest_1/build/ql/tmp/1405876765/10000
 [0,0,0]	["0","0","0"]	{"key_0":"value_0"}	1712634731	record_0
 [1,2,3]	["10","100","1000"]	{"key_1":"value_1"}	465985200	record_1
 [2,4,6]	["20","200","2000"]	{"key_2":"value_2"}	-751827638	record_2
@@ -109,15 +109,15 @@
 [7,14,21]	["70","700","7000"]	{"key_7":"value_7"}	-1461153973	record_7
 [8,16,24]	["80","800","8000"]	{"key_8":"value_8"}	1638581578	record_8
 [9,18,27]	["90","900","9000"]	{"key_9":"value_9"}	336964413	record_9
-null	null	{}	0	NULL
+null	null	null	0	NULL
 PREHOOK: query: SELECT columnarserde_create_shortcut.a[0], columnarserde_create_shortcut.b[0], columnarserde_create_shortcut.c['key2'], columnarserde_create_shortcut.d, columnarserde_create_shortcut.e FROM columnarserde_create_shortcut DISTRIBUTE BY 1
 PREHOOK: type: QUERY
 PREHOOK: Input: default@columnarserde_create_shortcut
-PREHOOK: Output: file:/data/users/njain/hive5/hive5/build/ql/tmp/236647428/10000
+PREHOOK: Output: file:/data/users/heyongqiang/trunk/VENDOR.hive/trunk/.ptest_1/build/ql/tmp/108106790/10000
 POSTHOOK: query: SELECT columnarserde_create_shortcut.a[0], columnarserde_create_shortcut.b[0], columnarserde_create_shortcut.c['key2'], columnarserde_create_shortcut.d, columnarserde_create_shortcut.e FROM columnarserde_create_shortcut DISTRIBUTE BY 1
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@columnarserde_create_shortcut
-POSTHOOK: Output: file:/data/users/njain/hive5/hive5/build/ql/tmp/236647428/10000
+POSTHOOK: Output: file:/data/users/heyongqiang/trunk/VENDOR.hive/trunk/.ptest_1/build/ql/tmp/108106790/10000
 0	0	NULL	1712634731	record_0
 1	10	NULL	465985200	record_1
 2	20	NULL	-751827638	record_2
@@ -162,11 +162,11 @@
 PREHOOK: query: SELECT columnShortcutTable.* FROM columnShortcutTable
 PREHOOK: type: QUERY
 PREHOOK: Input: default@columnshortcuttable
-PREHOOK: Output: file:/data/users/njain/hive5/hive5/build/ql/tmp/531162572/10000
+PREHOOK: Output: file:/data/users/heyongqiang/trunk/VENDOR.hive/trunk/.ptest_1/build/ql/tmp/33712646/10000
 POSTHOOK: query: SELECT columnShortcutTable.* FROM columnShortcutTable
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@columnshortcuttable
-POSTHOOK: Output: file:/data/users/njain/hive5/hive5/build/ql/tmp/531162572/10000
+POSTHOOK: Output: file:/data/users/heyongqiang/trunk/VENDOR.hive/trunk/.ptest_1/build/ql/tmp/33712646/10000
 238	val_238
 86	val_86
 311	val_311
@@ -186,11 +186,11 @@
 PREHOOK: query: SELECT columnShortcutTable.* FROM columnShortcutTable
 PREHOOK: type: QUERY
 PREHOOK: Input: default@columnshortcuttable
-PREHOOK: Output: file:/data/users/njain/hive5/hive5/build/ql/tmp/1510560377/10000
+PREHOOK: Output: file:/data/users/heyongqiang/trunk/VENDOR.hive/trunk/.ptest_1/build/ql/tmp/137255855/10000
 POSTHOOK: query: SELECT columnShortcutTable.* FROM columnShortcutTable
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@columnshortcuttable
-POSTHOOK: Output: file:/data/users/njain/hive5/hive5/build/ql/tmp/1510560377/10000
+POSTHOOK: Output: file:/data/users/heyongqiang/trunk/VENDOR.hive/trunk/.ptest_1/build/ql/tmp/137255855/10000
 238	val_238	NULL
 86	val_86	NULL
 311	val_311	NULL
@@ -210,11 +210,11 @@
 PREHOOK: query: SELECT columnShortcutTable.* FROM columnShortcutTable
 PREHOOK: type: QUERY
 PREHOOK: Input: default@columnshortcuttable
-PREHOOK: Output: file:/data/users/njain/hive5/hive5/build/ql/tmp/1833240424/10000
+PREHOOK: Output: file:/data/users/heyongqiang/trunk/VENDOR.hive/trunk/.ptest_1/build/ql/tmp/1910249486/10000
 POSTHOOK: query: SELECT columnShortcutTable.* FROM columnShortcutTable
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@columnshortcuttable
-POSTHOOK: Output: file:/data/users/njain/hive5/hive5/build/ql/tmp/1833240424/10000
+POSTHOOK: Output: file:/data/users/heyongqiang/trunk/VENDOR.hive/trunk/.ptest_1/build/ql/tmp/1910249486/10000
 238
 86
 311

Modified: hadoop/hive/trunk/ql/src/test/results/clientpositive/input_columnarserde.q.out
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/results/clientpositive/input_columnarserde.q.out?rev=832645&r1=832644&r2=832645&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/test/results/clientpositive/input_columnarserde.q.out (original)
+++ hadoop/hive/trunk/ql/src/test/results/clientpositive/input_columnarserde.q.out Wed Nov  4 01:38:12 2009
@@ -104,11 +104,11 @@
 PREHOOK: query: SELECT input_columnarserde.* FROM input_columnarserde DISTRIBUTE BY 1
 PREHOOK: type: QUERY
 PREHOOK: Input: default@input_columnarserde
-PREHOOK: Output: file:/data/users/njain/hive5/hive5/build/ql/tmp/851213096/10000
+PREHOOK: Output: file:/data/users/heyongqiang/trunk/VENDOR.hive/trunk/.ptest_0/build/ql/tmp/1940999347/10000
 POSTHOOK: query: SELECT input_columnarserde.* FROM input_columnarserde DISTRIBUTE BY 1
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@input_columnarserde
-POSTHOOK: Output: file:/data/users/njain/hive5/hive5/build/ql/tmp/851213096/10000
+POSTHOOK: Output: file:/data/users/heyongqiang/trunk/VENDOR.hive/trunk/.ptest_0/build/ql/tmp/1940999347/10000
 [0,0,0]	["0","0","0"]	{"key_0":"value_0"}	1712634731	record_0
 [1,2,3]	["10","100","1000"]	{"key_1":"value_1"}	465985200	record_1
 [2,4,6]	["20","200","2000"]	{"key_2":"value_2"}	-751827638	record_2
@@ -119,15 +119,15 @@
 [7,14,21]	["70","700","7000"]	{"key_7":"value_7"}	-1461153973	record_7
 [8,16,24]	["80","800","8000"]	{"key_8":"value_8"}	1638581578	record_8
 [9,18,27]	["90","900","9000"]	{"key_9":"value_9"}	336964413	record_9
-null	null	{}	0	NULL
+null	null	null	0	NULL
 PREHOOK: query: SELECT input_columnarserde.a[0], input_columnarserde.b[0], input_columnarserde.c['key2'], input_columnarserde.d, input_columnarserde.e FROM input_columnarserde DISTRIBUTE BY 1
 PREHOOK: type: QUERY
 PREHOOK: Input: default@input_columnarserde
-PREHOOK: Output: file:/data/users/njain/hive5/hive5/build/ql/tmp/722662209/10000
+PREHOOK: Output: file:/data/users/heyongqiang/trunk/VENDOR.hive/trunk/.ptest_0/build/ql/tmp/534165813/10000
 POSTHOOK: query: SELECT input_columnarserde.a[0], input_columnarserde.b[0], input_columnarserde.c['key2'], input_columnarserde.d, input_columnarserde.e FROM input_columnarserde DISTRIBUTE BY 1
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@input_columnarserde
-POSTHOOK: Output: file:/data/users/njain/hive5/hive5/build/ql/tmp/722662209/10000
+POSTHOOK: Output: file:/data/users/heyongqiang/trunk/VENDOR.hive/trunk/.ptest_0/build/ql/tmp/534165813/10000
 0	0	NULL	1712634731	record_0
 1	10	NULL	465985200	record_1
 2	20	NULL	-751827638	record_2

Added: hadoop/hive/trunk/ql/src/test/results/clientpositive/rcfile_null_value.q.out
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/results/clientpositive/rcfile_null_value.q.out?rev=832645&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/results/clientpositive/rcfile_null_value.q.out (added)
+++ hadoop/hive/trunk/ql/src/test/results/clientpositive/rcfile_null_value.q.out Wed Nov  4 01:38:12 2009
@@ -0,0 +1,276 @@
+PREHOOK: query: CREATE TABLE src1_rc(key STRING, value STRING) STORED AS RCFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE src1_rc(key STRING, value STRING) STORED AS RCFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@src1_rc
+PREHOOK: query: INSERT OVERWRITE TABLE src1_rc SELECT * FROM src1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src1
+PREHOOK: Output: default@src1_rc
+POSTHOOK: query: INSERT OVERWRITE TABLE src1_rc SELECT * FROM src1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src1
+POSTHOOK: Output: default@src1_rc
+PREHOOK: query: SELECT * FROM src1_rc
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src1_rc
+PREHOOK: Output: file:/Users/heyongqiang/Documents/workspace/Hive_RCFile/build/ql/tmp/1289916622/10000
+POSTHOOK: query: SELECT * FROM src1_rc
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src1_rc
+POSTHOOK: Output: file:/Users/heyongqiang/Documents/workspace/Hive_RCFile/build/ql/tmp/1289916622/10000
+238	val_238
+	
+311	val_311
+	val_27
+	val_165
+	val_409
+255	val_255
+278	val_278
+98	val_98
+	val_484
+	val_265
+	val_193
+401	val_401
+150	val_150
+273	val_273
+224	
+369	
+66	val_66
+128	
+213	val_213
+146	val_146
+406	val_406
+	
+	
+	
+PREHOOK: query: DROP TABLE src1_rc
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE src1_rc
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Output: default@src1_rc
+PREHOOK: query: CREATE TABLE dest1_rc(c1 INT, c2 STRING, c3 INT, c4 STRING) STORED AS RCFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE dest1_rc(c1 INT, c2 STRING, c3 INT, c4 STRING) STORED AS RCFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@dest1_rc
+PREHOOK: query: EXPLAIN
+FROM (
+ FROM 
+  (
+  FROM src src1 SELECT src1.key AS c1, src1.value AS c2 WHERE src1.key > 10 and src1.key < 20
+  ) a
+ RIGHT OUTER JOIN 
+ (
+  FROM src src2 SELECT src2.key AS c3, src2.value AS c4 WHERE src2.key > 15 and src2.key < 25
+ ) b 
+ ON (a.c1 = b.c3)
+ SELECT a.c1 AS c1, a.c2 AS c2, b.c3 AS c3, b.c4 AS c4
+) c
+INSERT OVERWRITE TABLE dest1_rc SELECT c.c1, c.c2, c.c3, c.c4
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN
+FROM (
+ FROM 
+  (
+  FROM src src1 SELECT src1.key AS c1, src1.value AS c2 WHERE src1.key > 10 and src1.key < 20
+  ) a
+ RIGHT OUTER JOIN 
+ (
+  FROM src src2 SELECT src2.key AS c3, src2.value AS c4 WHERE src2.key > 15 and src2.key < 25
+ ) b 
+ ON (a.c1 = b.c3)
+ SELECT a.c1 AS c1, a.c2 AS c2, b.c3 AS c3, b.c4 AS c4
+) c
+INSERT OVERWRITE TABLE dest1_rc SELECT c.c1, c.c2, c.c3, c.c4
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_RIGHTOUTERJOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF src src1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL src1) key) c1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL src1) value) c2)) (TOK_WHERE (and (> (. (TOK_TABLE_OR_COL src1) key) 10) (< (. (TOK_TABLE_OR_COL src1) key) 20))))) a) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF src src2)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL src2) key) c3) (TOK_SELEXPR (. (TOK_TABLE_OR_COL src2) value) c4)) (TOK_WHERE (and (> (. (TOK_TABLE_OR_COL src2) key) 15) (< (. (TOK_TABLE_OR_COL src2) key) 25))))) b) (= (. (TOK_TABLE_OR_COL a) c1) (. (TOK_TABLE_OR_COL b) c3)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) c1) c1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) c2) c2) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) c3) c3)
  (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) c4) c4)))) c)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB dest1_rc)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL c) c1)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL c) c2)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL c) c3)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL c) c4)))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        c:a:src1 
+          TableScan
+            alias: src1
+            Filter Operator
+              predicate:
+                  expr: ((key > 10) and (key < 20))
+                  type: boolean
+              Filter Operator
+                predicate:
+                    expr: ((key > 10) and (key < 20))
+                    type: boolean
+                Select Operator
+                  expressions:
+                        expr: key
+                        type: string
+                        expr: value
+                        type: string
+                  outputColumnNames: _col0, _col1
+                  Reduce Output Operator
+                    key expressions:
+                          expr: _col0
+                          type: string
+                    sort order: +
+                    Map-reduce partition columns:
+                          expr: _col0
+                          type: string
+                    tag: 0
+                    value expressions:
+                          expr: _col0
+                          type: string
+                          expr: _col1
+                          type: string
+        c:b:src2 
+          TableScan
+            alias: src2
+            Filter Operator
+              predicate:
+                  expr: ((key > 15) and (key < 25))
+                  type: boolean
+              Filter Operator
+                predicate:
+                    expr: ((key > 15) and (key < 25))
+                    type: boolean
+                Select Operator
+                  expressions:
+                        expr: key
+                        type: string
+                        expr: value
+                        type: string
+                  outputColumnNames: _col0, _col1
+                  Reduce Output Operator
+                    key expressions:
+                          expr: _col0
+                          type: string
+                    sort order: +
+                    Map-reduce partition columns:
+                          expr: _col0
+                          type: string
+                    tag: 1
+                    value expressions:
+                          expr: _col0
+                          type: string
+                          expr: _col1
+                          type: string
+      Reduce Operator Tree:
+        Join Operator
+          condition map:
+               Right Outer Join0 to 1
+          condition expressions:
+            0 {VALUE._col0} {VALUE._col1}
+            1 {VALUE._col0} {VALUE._col1}
+          outputColumnNames: _col0, _col1, _col2, _col3
+          Select Operator
+            expressions:
+                  expr: _col0
+                  type: string
+                  expr: _col1
+                  type: string
+                  expr: _col2
+                  type: string
+                  expr: _col3
+                  type: string
+            outputColumnNames: _col0, _col1, _col2, _col3
+            Select Operator
+              expressions:
+                    expr: _col0
+                    type: string
+                    expr: _col1
+                    type: string
+                    expr: _col2
+                    type: string
+                    expr: _col3
+                    type: string
+              outputColumnNames: _col0, _col1, _col2, _col3
+              Select Operator
+                expressions:
+                      expr: UDFToInteger(_col0)
+                      type: int
+                      expr: _col1
+                      type: string
+                      expr: UDFToInteger(_col2)
+                      type: int
+                      expr: _col3
+                      type: string
+                outputColumnNames: _col0, _col1, _col2, _col3
+                File Output Operator
+                  compressed: false
+                  GlobalTableId: 1
+                  table:
+                      input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+                      name: dest1_rc
+
+  Stage: Stage-0
+    Move Operator
+      tables:
+          replace: true
+          table:
+              input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+              output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+              serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+              name: dest1_rc
+
+
+PREHOOK: query: FROM (
+ FROM 
+  (
+  FROM src src1 SELECT src1.key AS c1, src1.value AS c2 WHERE src1.key > 10 and src1.key < 20
+  ) a
+ RIGHT OUTER JOIN 
+ (
+  FROM src src2 SELECT src2.key AS c3, src2.value AS c4 WHERE src2.key > 15 and src2.key < 25
+ ) b 
+ ON (a.c1 = b.c3)
+ SELECT a.c1 AS c1, a.c2 AS c2, b.c3 AS c3, b.c4 AS c4
+) c
+INSERT OVERWRITE TABLE dest1_rc SELECT c.c1, c.c2, c.c3, c.c4
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@dest1_rc
+POSTHOOK: query: FROM (
+ FROM 
+  (
+  FROM src src1 SELECT src1.key AS c1, src1.value AS c2 WHERE src1.key > 10 and src1.key < 20
+  ) a
+ RIGHT OUTER JOIN 
+ (
+  FROM src src2 SELECT src2.key AS c3, src2.value AS c4 WHERE src2.key > 15 and src2.key < 25
+ ) b 
+ ON (a.c1 = b.c3)
+ SELECT a.c1 AS c1, a.c2 AS c2, b.c3 AS c3, b.c4 AS c4
+) c
+INSERT OVERWRITE TABLE dest1_rc SELECT c.c1, c.c2, c.c3, c.c4
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@dest1_rc
+PREHOOK: query: SELECT dest1_rc.* FROM dest1_rc
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dest1_rc
+PREHOOK: Output: file:/Users/heyongqiang/Documents/workspace/Hive_RCFile/build/ql/tmp/1827783552/10000
+POSTHOOK: query: SELECT dest1_rc.* FROM dest1_rc
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dest1_rc
+POSTHOOK: Output: file:/Users/heyongqiang/Documents/workspace/Hive_RCFile/build/ql/tmp/1827783552/10000
+17	val_17	17	val_17
+18	val_18	18	val_18
+18	val_18	18	val_18
+18	val_18	18	val_18
+18	val_18	18	val_18
+19	val_19	19	val_19
+NULL	NULL	20	val_20
+NULL	NULL	24	val_24
+NULL	NULL	24	val_24
+PREHOOK: query: DROP TABLE dest1_rc
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE dest1_rc
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Output: default@dest1_rc

Added: hadoop/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/ColumnProjectionUtils.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/ColumnProjectionUtils.java?rev=832645&view=auto
==============================================================================
--- hadoop/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/ColumnProjectionUtils.java (added)
+++ hadoop/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/ColumnProjectionUtils.java Wed Nov  4 01:38:12 2009
@@ -0,0 +1,107 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.serde2;
+
+import java.util.ArrayList;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.StringUtils;
+
+public class ColumnProjectionUtils {
+
+	public static String READ_COLUMN_IDS_CONF_STR = "hive.io.file.readcolumn.ids";
+
+	/**
+	 * Sets read columns' ids(start from zero) for RCFile's Reader. Once a column
+	 * is included in the list, RCFile's reader will not skip its value.
+	 * 
+	 */
+	public static void setReadColumnIDs(Configuration conf, ArrayList<Integer> ids) {
+		String id = toReadColumnIDString(ids);
+		setReadColumnIDConf(conf, id);
+	}
+
+	/**
+	 * Sets read columns' ids(start from zero) for RCFile's Reader. Once a column
+	 * is included in the list, RCFile's reader will not skip its value.
+	 * 
+	 */
+	public static void appendReadColumnIDs(Configuration conf,
+	    ArrayList<Integer> ids) {
+		String id = toReadColumnIDString(ids);
+		if(id != null) {
+			String old = conf.get(READ_COLUMN_IDS_CONF_STR, null);
+			String newConfStr = id;
+			if (old != null)
+				newConfStr = newConfStr + StringUtils.COMMA_STR + old;
+
+			setReadColumnIDConf(conf, newConfStr);
+		}
+	}
+
+	private static void setReadColumnIDConf(Configuration conf, String id) {
+		if (id == null || id.length() <= 0) {
+			conf.set(READ_COLUMN_IDS_CONF_STR, "");
+			return;
+		}
+
+		conf.set(READ_COLUMN_IDS_CONF_STR, id);
+	}
+
+	private static String toReadColumnIDString(ArrayList<Integer> ids) {
+		String id = null;
+		if (ids != null) {
+			for (int i = 0; i < ids.size(); i++) {
+				if (i == 0) {
+					id = "" + ids.get(i);
+				} else {
+					id = id + StringUtils.COMMA_STR + ids.get(i);
+				}
+			}
+		}
+		return id;
+	}
+
+	/**
+	 * Returns an array of column ids(start from zero) which is set in the given
+	 * parameter <tt>conf</tt>.
+	 */
+	public static ArrayList<Integer> getReadColumnIDs(Configuration conf) {
+		if( conf == null )
+			return new ArrayList<Integer>(0);
+		String skips = conf.get(READ_COLUMN_IDS_CONF_STR, "");
+		String[] list = StringUtils.split(skips);
+		ArrayList<Integer> result = new ArrayList<Integer>(list.length);
+		for (int i = 0; i < list.length; i++) {
+			//it may contain duplicates, remove duplicates
+			Integer toAdd = Integer.parseInt(list[i]);
+			if (!result.contains(toAdd))
+				result.add(toAdd);
+		}
+		return result;
+	}
+
+	/**
+	 * Clears the read column ids set in the conf, and will read all columns.
+	 */
+	public static void setFullyReadColumns(Configuration conf) {
+		conf.set(READ_COLUMN_IDS_CONF_STR, "");
+	}
+
+}

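Two behavioral details of the class above are easy to miss: appendReadColumnIDs prepends the new ids onto any value already in the conf, and getReadColumnIDs drops duplicates on the way out. An illustrative sketch (the driver class is hypothetical; the utility calls are the ones defined above):

    import java.util.ArrayList;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;

    public class AppendExample {
      public static void main(String[] args) {
        Configuration conf = new Configuration();

        ArrayList<Integer> first = new ArrayList<Integer>();
        first.add(3);
        first.add(1);
        ColumnProjectionUtils.appendReadColumnIDs(conf, first);

        ArrayList<Integer> second = new ArrayList<Integer>();
        second.add(1);   // already present; dropped when read back
        second.add(0);
        ColumnProjectionUtils.appendReadColumnIDs(conf, second);

        // The raw conf value keeps the duplicate ("1,0,3,1"), but the accessor
        // de-duplicates, so readers see each column id once.
        System.out.println(conf.get("hive.io.file.readcolumn.ids"));         // 1,0,3,1
        System.out.println(ColumnProjectionUtils.getReadColumnIDs(conf));    // [1, 0, 3]
      }
    }
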
Modified: hadoop/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarSerDe.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarSerDe.java?rev=832645&r1=832644&r2=832645&view=diff
==============================================================================
--- hadoop/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarSerDe.java (original)
+++ hadoop/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarSerDe.java Wed Nov  4 01:38:12 2009
@@ -26,6 +26,7 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
 import org.apache.hadoop.hive.serde2.ByteStream;
 import org.apache.hadoop.hive.serde2.SerDe;
 import org.apache.hadoop.hive.serde2.SerDeException;
@@ -91,7 +92,10 @@
             .getSeparators(), serdeParams.getNullSequence(), serdeParams.isEscaped(),
             serdeParams.getEscapeChar());
 
-    cachedLazyStruct = new ColumnarStruct(cachedObjectInspector);
+    
+    java.util.ArrayList<Integer> notSkipIDs = ColumnProjectionUtils.getReadColumnIDs(job);
+    
+    cachedLazyStruct = new ColumnarStruct(cachedObjectInspector, notSkipIDs);
     
     int size = serdeParams.getColumnTypes().size();
     field = new BytesRefWritable[size];

Modified: hadoop/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarStruct.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarStruct.java?rev=832645&r1=832644&r2=832645&view=diff
==============================================================================
--- hadoop/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarStruct.java (original)
+++ hadoop/hive/trunk/serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarStruct.java Wed Nov  4 01:38:12 2009
@@ -20,6 +20,7 @@
 
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
 
 import org.apache.hadoop.hive.serde2.lazy.ByteArrayRef;
@@ -49,7 +50,6 @@
   
   private static final Log LOG = LogFactory.getLog(ColumnarStruct.class);
   
-  boolean initialized = false;  // init() function is called?
   int[]   prjColIDs   = null;   // list of projected column IDs
   
   /**
@@ -60,19 +60,54 @@
    *          the ObjectInspector representing the type of this LazyStruct.
    */
   public ColumnarStruct(ObjectInspector oi) {
+    this(oi, null);
+  }
+  
+  /**
+   * Construct a ColumnarStruct object with the TypeInfo. It creates the first
+   * level object at the first place
+   * 
+   * @param oi
+   *          the ObjectInspector representing the type of this LazyStruct.
+   * @param notSkippedColumnIDs
+   * 		  the column ids that should not be skipped       
+   */
+  public ColumnarStruct(ObjectInspector oi, ArrayList<Integer> notSkippedColumnIDs) {
     List<? extends StructField> fieldRefs = ((StructObjectInspector) oi).getAllStructFieldRefs();
     int num = fieldRefs.size();
     fields = new LazyObject[num];
     cachedByteArrayRef = new ByteArrayRef[num];
     rawBytesField = new BytesRefWritable[num];
-    fieldIsNull = new boolean[num];
+    fieldSkipped = new boolean[num];
     inited = new boolean[num];
+    
+    //if no columns is set to be skipped, add all columns in 'notSkippedColumnIDs'
+		if (notSkippedColumnIDs == null || notSkippedColumnIDs.size() == 0) {
+			for (int i = 0; i < num; i++)
+				notSkippedColumnIDs.add(i);
+		}
+    
     for (int i = 0; i < num; i++) {
       fields[i] = LazyFactory.createLazyObject(fieldRefs.get(i).getFieldObjectInspector());
       cachedByteArrayRef[i] = new ByteArrayRef();
-      fieldIsNull[i] = false;
-      inited[i] = false;
+      if(!notSkippedColumnIDs.contains(i)){
+      	fieldSkipped[i] = true;
+      	inited[i] = true;
+      } else
+      	inited[i] = false;
     }
+    
+		// maintain a list of non-NULL column IDs
+		int min = notSkippedColumnIDs.size() > num ? num : notSkippedColumnIDs
+		    .size();
+		prjColIDs = new int[min];
+		for (int i = 0, index = 0; i < notSkippedColumnIDs.size(); ++i) {
+			int readCol = notSkippedColumnIDs.get(i).intValue();
+			if (readCol < num) {
+				prjColIDs[index] = readCol;
+				index++;
+			}
+		}
   }
 
   /**
@@ -104,7 +139,7 @@
   ByteArrayRef[] cachedByteArrayRef = null;
   BytesRefWritable[] rawBytesField = null;
   boolean[] inited = null;
-  boolean[] fieldIsNull = null;
+  boolean[] fieldSkipped = null;
 
   /**
    * Get the field out of the row without checking parsed. This is called by
@@ -117,7 +152,7 @@
    * @return The value of the field
    */
   protected Object uncheckedGetField(int fieldID, Text nullSequence) {
-    if (fieldIsNull[fieldID])
+    if (fieldSkipped[fieldID])
       return null;
     if (!inited[fieldID]) {
       BytesRefWritable passedInField = rawBytesField[fieldID];
@@ -132,9 +167,10 @@
     }
     
     byte[] data = cachedByteArrayRef[fieldID].getData();
-    int fieldLen = data.length;
+    int fieldLen = rawBytesField[fieldID].length;
+    
     if (fieldLen == nullSequence.getLength()
-        && LazyUtils.compare(data, 0,
+        && LazyUtils.compare(data, rawBytesField[fieldID].getStart(),
             fieldLen, nullSequence.getBytes(), 0, nullSequence.getLength()) == 0) {
       return null;
     }
@@ -150,45 +186,15 @@
    *  =========================================================================
    */
   public void init(BytesRefArrayWritable cols) {
-    if (initialized) { // short cut for non-first calls
-      for (int i = 0; i < prjColIDs.length; ++i ) {
-        int fieldIndex = prjColIDs[i];
-        rawBytesField[fieldIndex] = cols.unCheckedGet(fieldIndex);
-        inited[fieldIndex] = false;
-      }
-    } else { // first time call init()
-      int fieldIndex = 0;
-      int min = cols.size() < fields.length ? cols.size() : fields.length;
-      
-      ArrayList<Integer> tmp_sel_cols = new ArrayList<Integer>();
-
-      for (; fieldIndex < min; fieldIndex++) {
-        
-        // call the faster unCheckedGet() 
-        // alsert: min <= cols.size()
-        BytesRefWritable passedInField = cols.unCheckedGet(fieldIndex);
-        
-        if (passedInField.length > 0) {
-          // if (fields[fieldIndex] == null)
-          // fields[fieldIndex] = LazyFactory.createLazyObject(fieldTypeInfos
-          // .get(fieldIndex));
-          tmp_sel_cols.add(fieldIndex);
-          rawBytesField[fieldIndex] = passedInField;
-          fieldIsNull[fieldIndex] = false;
-        } else
-          fieldIsNull[fieldIndex] = true;
-        
-        inited[fieldIndex] = false;
-      }
-      for (; fieldIndex < fields.length; fieldIndex++)
-        fieldIsNull[fieldIndex] = true;
-      
-      // maintain a list of non-NULL column IDs
-      prjColIDs = new int[tmp_sel_cols.size()];
-      for (int i = 0; i < prjColIDs.length; ++i ) {
-        prjColIDs[i] = tmp_sel_cols.get(i).intValue();
+    for (int i = 0; i < prjColIDs.length; ++i ) {
+      int fieldIndex = prjColIDs[i];
+      if(fieldIndex < cols.size()){
+      	rawBytesField[fieldIndex] = cols.unCheckedGet(fieldIndex);
+        inited[fieldIndex] = false;      	
+      } else {
+      	// select columns that actually do not exist in the file.
+      	fieldSkipped[fieldIndex] = true;
       }
-      initialized = true;
     }
   }
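
The last hunk is the core of the NULL fix: the field length and offset now come from the field's own BytesRefWritable slice instead of from the whole shared column buffer, so the null-sequence comparison looks at the right bytes. A hedged restatement in isolation — the helper class is hypothetical, and getStart()/getLength() are assumed to be the accessor forms of the fields used in the patch:

    import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
    import org.apache.hadoop.hive.serde2.lazy.LazyUtils;
    import org.apache.hadoop.io.Text;

    public class NullCheckSketch {
      // Compare only the field's own slice (start/length) of the column buffer
      // against the null sequence, instead of the whole backing array.
      static boolean isNullField(byte[] data, BytesRefWritable field, Text nullSequence) {
        int fieldLen = field.getLength();
        return fieldLen == nullSequence.getLength()
            && LazyUtils.compare(data, field.getStart(), fieldLen,
                                 nullSequence.getBytes(), 0, nullSequence.getLength()) == 0;
      }

      public static void main(String[] args) throws Exception {
        byte[] nullBytes = "\\N".getBytes("UTF-8");        // default null sequence
        BytesRefWritable field = new BytesRefWritable(nullBytes);  // start 0, length 2
        System.out.println(isNullField(nullBytes, field, new Text("\\N")));  // true
      }
    }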