You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by am...@apache.org on 2013/06/07 14:41:21 UTC

svn commit: r1490614 - in /hive/branches/HIVE-4115: ./ data/files/ hbase-handler/src/test/templates/ hcatalog/bin/ hcatalog/src/test/e2e/hcatalog/drivers/ hcatalog/src/test/e2e/hcatalog/tests/ ql/src/java/org/apache/hadoop/hive/ql/parse/ ql/src/java/or...

Author: amareshwari
Date: Fri Jun  7 12:41:20 2013
New Revision: 1490614

URL: http://svn.apache.org/r1490614
Log:
Merging r1489797 through r1490612 into HIVE-4115

Added:
    hive/branches/HIVE-4115/data/files/person age.txt
      - copied unchanged from r1490612, hive/trunk/data/files/person age.txt
    hive/branches/HIVE-4115/ql/src/test/queries/clientpositive/load_file_with_space_in_the_name.q
      - copied unchanged from r1490612, hive/trunk/ql/src/test/queries/clientpositive/load_file_with_space_in_the_name.q
    hive/branches/HIVE-4115/ql/src/test/queries/clientpositive/load_hdfs_file_with_space_in_the_name.q
      - copied unchanged from r1490612, hive/trunk/ql/src/test/queries/clientpositive/load_hdfs_file_with_space_in_the_name.q
    hive/branches/HIVE-4115/ql/src/test/results/clientpositive/load_file_with_space_in_the_name.q.out
      - copied unchanged from r1490612, hive/trunk/ql/src/test/results/clientpositive/load_file_with_space_in_the_name.q.out
    hive/branches/HIVE-4115/ql/src/test/results/clientpositive/load_hdfs_file_with_space_in_the_name.q.out
      - copied unchanged from r1490612, hive/trunk/ql/src/test/results/clientpositive/load_hdfs_file_with_space_in_the_name.q.out
Modified:
    hive/branches/HIVE-4115/   (props changed)
    hive/branches/HIVE-4115/build-common.xml
    hive/branches/HIVE-4115/hbase-handler/src/test/templates/TestHBaseCliDriver.vm
    hive/branches/HIVE-4115/hbase-handler/src/test/templates/TestHBaseNegativeCliDriver.vm
    hive/branches/HIVE-4115/hcatalog/bin/hcat
    hive/branches/HIVE-4115/hcatalog/src/test/e2e/hcatalog/drivers/TestDriverHiveCmdLine.pm
    hive/branches/HIVE-4115/hcatalog/src/test/e2e/hcatalog/tests/hive_cmdline.conf
    hive/branches/HIVE-4115/hcatalog/src/test/e2e/hcatalog/tests/hive_nightly.conf
    hive/branches/HIVE-4115/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java
    hive/branches/HIVE-4115/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NumDistinctValueEstimator.java
    hive/branches/HIVE-4115/ql/src/test/queries/clientpositive/combine2_win.q
    hive/branches/HIVE-4115/ql/src/test/results/clientpositive/combine2_win.q.out
    hive/branches/HIVE-4115/ql/src/test/results/clientpositive/compute_stats_double.q.out
    hive/branches/HIVE-4115/ql/src/test/results/clientpositive/compute_stats_long.q.out
    hive/branches/HIVE-4115/ql/src/test/results/clientpositive/compute_stats_string.q.out
    hive/branches/HIVE-4115/ql/src/test/results/clientpositive/input_part10_win.q.out
    hive/branches/HIVE-4115/ql/src/test/results/clientpositive/load_dyn_part14_win.q.out

Propchange: hive/branches/HIVE-4115/
------------------------------------------------------------------------------
  Merged /hive/trunk:r1489797-1490612

Modified: hive/branches/HIVE-4115/build-common.xml
URL: http://svn.apache.org/viewvc/hive/branches/HIVE-4115/build-common.xml?rev=1490614&r1=1490613&r2=1490614&view=diff
==============================================================================
--- hive/branches/HIVE-4115/build-common.xml (original)
+++ hive/branches/HIVE-4115/build-common.xml Fri Jun  7 12:41:20 2013
@@ -59,7 +59,7 @@
   <property name="test.output" value="true"/>
   <property name="test.junit.output.format" value="xml"/>
   <property name="test.junit.output.usefile" value="true"/>
-  <property name="minimr.query.files" value="list_bucket_dml_10.q,input16_cc.q,scriptfile1.q,bucket4.q,bucketmapjoin6.q,disable_merge_for_bucketing.q,reduce_deduplicate.q,smb_mapjoin_8.q,join1.q,groupby2.q,bucketizedhiveinputformat.q,bucketmapjoin7.q,optrstat_groupby.q,bucket_num_reducers.q,bucket5.q,load_fs2.q,bucket_num_reducers2.q,infer_bucket_sort_merge.q,infer_bucket_sort_reducers_power_two.q,infer_bucket_sort_dyn_part.q,infer_bucket_sort_bucketed_table.q,infer_bucket_sort_map_operators.q,infer_bucket_sort_num_buckets.q,schemeAuthority.q,truncate_column_buckets.q,remote_script.q"/>
+  <property name="minimr.query.files" value="list_bucket_dml_10.q,input16_cc.q,scriptfile1.q,bucket4.q,bucketmapjoin6.q,disable_merge_for_bucketing.q,reduce_deduplicate.q,smb_mapjoin_8.q,join1.q,groupby2.q,bucketizedhiveinputformat.q,bucketmapjoin7.q,optrstat_groupby.q,bucket_num_reducers.q,bucket5.q,load_fs2.q,bucket_num_reducers2.q,infer_bucket_sort_merge.q,infer_bucket_sort_reducers_power_two.q,infer_bucket_sort_dyn_part.q,infer_bucket_sort_bucketed_table.q,infer_bucket_sort_map_operators.q,infer_bucket_sort_num_buckets.q,schemeAuthority.q,truncate_column_buckets.q,remote_script.q,load_hdfs_file_with_space_in_the_name.q"/>
   <property name="minimr.query.negative.files" value="cluster_tasklog_retrieval.q,minimr_broken_pipe.q,mapreduce_stack_trace.q,mapreduce_stack_trace_turnoff.q,mapreduce_stack_trace_hadoop20.q,mapreduce_stack_trace_turnoff_hadoop20.q" />
   <property name="test.silent" value="true"/>
   <property name="hadoopVersion" value="${hadoop.version.ant-internal}"/>

Modified: hive/branches/HIVE-4115/hbase-handler/src/test/templates/TestHBaseCliDriver.vm
URL: http://svn.apache.org/viewvc/hive/branches/HIVE-4115/hbase-handler/src/test/templates/TestHBaseCliDriver.vm?rev=1490614&r1=1490613&r2=1490614&view=diff
==============================================================================
--- hive/branches/HIVE-4115/hbase-handler/src/test/templates/TestHBaseCliDriver.vm (original)
+++ hive/branches/HIVE-4115/hbase-handler/src/test/templates/TestHBaseCliDriver.vm Fri Jun  7 12:41:20 2013
@@ -89,7 +89,7 @@ public class $className extends TestCase
   #set ($fname = $qf.getName())
   #set ($eidx = $fname.indexOf('.'))
   #set ($tname = $fname.substring(0, $eidx))
-  #set ($fpath = $qf.getCanonicalPath())
+  #set ($fpath = $qf.getCanonicalPath().replaceAll("\\","\\\\"))
   public void testCliDriver_$tname() throws Exception {
     runTest("$tname", "$fname", "$fpath");
   }

Modified: hive/branches/HIVE-4115/hbase-handler/src/test/templates/TestHBaseNegativeCliDriver.vm
URL: http://svn.apache.org/viewvc/hive/branches/HIVE-4115/hbase-handler/src/test/templates/TestHBaseNegativeCliDriver.vm?rev=1490614&r1=1490613&r2=1490614&view=diff
==============================================================================
--- hive/branches/HIVE-4115/hbase-handler/src/test/templates/TestHBaseNegativeCliDriver.vm (original)
+++ hive/branches/HIVE-4115/hbase-handler/src/test/templates/TestHBaseNegativeCliDriver.vm Fri Jun  7 12:41:20 2013
@@ -68,7 +68,7 @@ public class $className extends TestCase
   #set ($fname = $qf.getName())
   #set ($eidx = $fname.indexOf('.'))
   #set ($tname = $fname.substring(0, $eidx))
-  #set ($fpath = $qf.getCanonicalPath())
+  #set ($fpath = $qf.getCanonicalPath().replaceAll("\\","\\\\"))
   public void testCliDriver_$tname() throws Exception {
     runTest("$tname", "$fname", "$fpath");
   }

Modified: hive/branches/HIVE-4115/hcatalog/bin/hcat
URL: http://svn.apache.org/viewvc/hive/branches/HIVE-4115/hcatalog/bin/hcat?rev=1490614&r1=1490613&r2=1490614&view=diff
==============================================================================
--- hive/branches/HIVE-4115/hcatalog/bin/hcat (original)
+++ hive/branches/HIVE-4115/hcatalog/bin/hcat Fri Jun  7 12:41:20 2013
@@ -91,7 +91,7 @@ if [ ! -d "$HIVE_LIB_DIR" ]; then
   exit 4;
 fi
 
-HIVE_CONF_DIR=${HIVE_HOME}/conf
+HIVE_CONF_DIR=${HIVE_CONF_DIR:-$HIVE_HOME/conf}
 if [ ! -d "$HIVE_CONF_DIR" ]; then
   echo "Cannot find conf dir within HIVE_HOME : $HIVE_CONF_DIR";
   exit 4;

Modified: hive/branches/HIVE-4115/hcatalog/src/test/e2e/hcatalog/drivers/TestDriverHiveCmdLine.pm
URL: http://svn.apache.org/viewvc/hive/branches/HIVE-4115/hcatalog/src/test/e2e/hcatalog/drivers/TestDriverHiveCmdLine.pm?rev=1490614&r1=1490613&r2=1490614&view=diff
==============================================================================
--- hive/branches/HIVE-4115/hcatalog/src/test/e2e/hcatalog/drivers/TestDriverHiveCmdLine.pm (original)
+++ hive/branches/HIVE-4115/hcatalog/src/test/e2e/hcatalog/drivers/TestDriverHiveCmdLine.pm Fri Jun  7 12:41:20 2013
@@ -1,26 +1,30 @@
-package TestDriverHiveCmdLine;
+#!/usr/bin/env perl
 
-############################################################################
-#  Licensed to the Apache Software Foundation (ASF) under one or more
-#  contributor license agreements.  See the NOTICE file distributed with
-#  this work for additional information regarding copyright ownership.
-#  The ASF licenses this file to You under the Apache License, Version 2.0
-#  (the "License"); you may not use this file except in compliance with
-#  the License.  You may obtain a copy of the License at
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
 #
-#      http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+################################################################################
+
 
-###############################################################################
 # Test driver for hive nightly tests.
 # 
 #
 
+package TestDriverHiveCmdLine;
 use TestDriverHive;
 use IPC::Run; # don't do qw(run), it screws up TestDriver which also has a run method
 use Util;

Modified: hive/branches/HIVE-4115/hcatalog/src/test/e2e/hcatalog/tests/hive_cmdline.conf
URL: http://svn.apache.org/viewvc/hive/branches/HIVE-4115/hcatalog/src/test/e2e/hcatalog/tests/hive_cmdline.conf?rev=1490614&r1=1490613&r2=1490614&view=diff
==============================================================================
--- hive/branches/HIVE-4115/hcatalog/src/test/e2e/hcatalog/tests/hive_cmdline.conf (original)
+++ hive/branches/HIVE-4115/hcatalog/src/test/e2e/hcatalog/tests/hive_cmdline.conf Fri Jun  7 12:41:20 2013
@@ -1,25 +1,24 @@
 #!/usr/bin/env perl
 
-############################################################################
-#  Licensed to the Apache Software Foundation (ASF) under one or more
-#  contributor license agreements.  See the NOTICE file distributed with
-#  this work for additional information regarding copyright ownership.
-#  The ASF licenses this file to You under the Apache License, Version 2.0
-#  (the "License"); you may not use this file except in compliance with
-#  the License.  You may obtain a copy of the License at
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
 #
-#      http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+################################################################################
 
-###############################################################################
-# Nightly tests for hive.
-#
-#
 
 $cfg = {
   'driver' => 'HiveCmdLine',

Modified: hive/branches/HIVE-4115/hcatalog/src/test/e2e/hcatalog/tests/hive_nightly.conf
URL: http://svn.apache.org/viewvc/hive/branches/HIVE-4115/hcatalog/src/test/e2e/hcatalog/tests/hive_nightly.conf?rev=1490614&r1=1490613&r2=1490614&view=diff
==============================================================================
--- hive/branches/HIVE-4115/hcatalog/src/test/e2e/hcatalog/tests/hive_nightly.conf (original)
+++ hive/branches/HIVE-4115/hcatalog/src/test/e2e/hcatalog/tests/hive_nightly.conf Fri Jun  7 12:41:20 2013
@@ -1,25 +1,23 @@
 #!/usr/bin/env perl
 
-############################################################################
-#  Licensed to the Apache Software Foundation (ASF) under one or more
-#  contributor license agreements.  See the NOTICE file distributed with
-#  this work for additional information regarding copyright ownership.
-#  The ASF licenses this file to You under the Apache License, Version 2.0
-#  (the "License"); you may not use this file except in compliance with
-#  the License.  You may obtain a copy of the License at
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
 #
-#      http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
 
-###############################################################################
-# Nightly tests for hive.
-#
-#
+################################################################################
 
 $cfg = {
   'driver' => 'Hive',

Modified: hive/branches/HIVE-4115/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java
URL: http://svn.apache.org/viewvc/hive/branches/HIVE-4115/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java?rev=1490614&r1=1490613&r2=1490614&view=diff
==============================================================================
--- hive/branches/HIVE-4115/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java (original)
+++ hive/branches/HIVE-4115/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java Fri Jun  7 12:41:20 2013
@@ -27,6 +27,8 @@ import java.util.List;
 import java.util.Map;
 
 import org.antlr.runtime.tree.Tree;
+import org.apache.commons.httpclient.URIException;
+import org.apache.commons.httpclient.util.URIUtil;
 import org.apache.commons.lang.StringUtils;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
@@ -82,7 +84,7 @@ public class LoadSemanticAnalyzer extend
     // directory
     if (!path.startsWith("/")) {
       if (isLocal) {
-        path = new Path(System.getProperty("user.dir"), path).toUri().toString();
+        path = URIUtil.decode( new Path(System.getProperty("user.dir"), path).toUri().toString() );
       } else {
         path = new Path(new Path("/user/" + System.getProperty("user.name")),
           path).toString();
@@ -231,8 +233,13 @@ public class LoadSemanticAnalyzer extend
       // that's just a test case.
       String copyURIStr = ctx.getExternalTmpFileURI(toURI);
       URI copyURI = URI.create(copyURIStr);
-      rTask = TaskFactory.get(new CopyWork(fromURI.toString(), copyURIStr),
-          conf);
+      try {
+        rTask = TaskFactory.get(new CopyWork(URIUtil.decode(fromURI.toString()), copyURIStr),
+            conf);
+      } catch (URIException e) {
+        throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(fromTree, e
+            .getMessage()), e);
+      }
       fromURI = copyURI;
     }
 
@@ -261,8 +268,14 @@ public class LoadSemanticAnalyzer extend
     }
 
 
-    LoadTableDesc loadTableWork = new LoadTableDesc(fromURI.toString(),
-        loadTmpPath, Utilities.getTableDesc(ts.tableHandle), partSpec, isOverWrite);
+    LoadTableDesc loadTableWork;
+    try {
+      loadTableWork = new LoadTableDesc(URIUtil.decode(fromURI.toString()),
+          loadTmpPath, Utilities.getTableDesc(ts.tableHandle), partSpec, isOverWrite);
+    } catch (URIException e1) {
+      throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(fromTree, e1
+          .getMessage()), e1);
+    }
 
     Task<? extends Serializable> childTask = TaskFactory.get(new MoveWork(getInputs(),
         getOutputs(), loadTableWork, null, true), conf);

Modified: hive/branches/HIVE-4115/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NumDistinctValueEstimator.java
URL: http://svn.apache.org/viewvc/hive/branches/HIVE-4115/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NumDistinctValueEstimator.java?rev=1490614&r1=1490613&r2=1490614&view=diff
==============================================================================
--- hive/branches/HIVE-4115/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NumDistinctValueEstimator.java (original)
+++ hive/branches/HIVE-4115/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NumDistinctValueEstimator.java Fri Jun  7 12:41:20 2013
@@ -28,7 +28,13 @@ public class NumDistinctValueEstimator {
 
   static final Log LOG = LogFactory.getLog(NumDistinctValueEstimator.class.getName());
 
-  private final int bitVectorSize = 32;
+  /* We want a,b,x to come from a finite field of size 0 to k, where k is a prime number.
+   * 2^p - 1 is prime for p = 31. Hence bitVectorSize has to be 31. Pick k to be 2^p -1.
+   * If a,b,x didn't come from a finite field, ax1 + b mod k and ax2 + b mod k will not be pairwise
+   * independent. As a consequence, the hash values will not distribute uniformly from 0 to 2^p-1
+   * thus introducing errors in the estimates.
+   */
+  private static final int bitVectorSize = 31;
   private int numBitVectors;
 
   // Refer to Flajolet-Martin'86 for the value of phi
@@ -53,8 +59,23 @@ public class NumDistinctValueEstimator {
     a = new int[numBitVectors];
     b = new int[numBitVectors];
 
-    aValue = new Random(79798);
-    bValue = new Random(34115);
+    /* Use a large prime number as a seed to the random number generator.
+     * Java's random number generator uses the Linear Congruential Generator to generate random
+     * numbers using the following recurrence relation,
+     *
+     * X(n+1) = (a X(n) + c ) mod m
+     *
+     *  where X0 is the seed. Java implementation uses m = 2^48. This is problematic because 2^48
+     *  is not a prime number and hence the set of numbers from 0 to m don't form a finite field.
+     *  If these numbers don't come from a finite field, any given X(n) and X(n+1) may not be
+     *  pairwise independent.
+     *
+     *  However, empirically passing in prime numbers as seeds seems to work better than when passing
+     *  composite numbers as seeds. Ideally Java's Random should pick m such that m is prime.
+     *
+     */
+    aValue = new Random(99397);
+    bValue = new Random(9876413);
 
     for (int i = 0; i < numBitVectors; i++) {
       int randVal;
@@ -76,11 +97,11 @@ public class NumDistinctValueEstimator {
       b[i] = randVal;
 
       if (a[i] < 0) {
-        a[i] = a[i] + (1 << (bitVectorSize -1));
+        a[i] = a[i] + (1 << bitVectorSize - 1);
       }
 
       if (b[i] < 0) {
-        b[i] = b[i] + (1 << (bitVectorSize -1));
+        b[i] = b[i] + (1 << bitVectorSize - 1);
       }
     }
   }
@@ -197,8 +218,8 @@ public class NumDistinctValueEstimator {
   }
 
   private int generateHash(long v, int hashNum) {
-    int mod = 1 << (bitVectorSize - 1) - 1;
-    long tempHash = a[hashNum] * v + b[hashNum];
+    int mod = (1<<bitVectorSize) - 1;
+    long tempHash = a[hashNum] * v  + b[hashNum];
     tempHash %= mod;
     int hash = (int) tempHash;
 
@@ -206,7 +227,7 @@ public class NumDistinctValueEstimator {
      * Hence hash value has to be non-negative.
      */
     if (hash < 0) {
-      hash = hash + mod + 1;
+      hash = hash + mod;
     }
     return hash;
   }
@@ -266,6 +287,7 @@ public class NumDistinctValueEstimator {
     bitVector[hash%numBitVectors].set(index);
   }
 
+
   public void mergeEstimators(NumDistinctValueEstimator o) {
     // Bitwise OR the bitvector with the bitvector in the agg buffer
     for (int i=0; i<numBitVectors; i++) {
@@ -289,36 +311,22 @@ public class NumDistinctValueEstimator {
     return ((long)numDistinctValues);
   }
 
-  /* We use two estimators - one due to Flajolet-Martin and a modification due to
-   * Alon-Matias-Szegedy. FM uses the location of the least significant zero as an estimate of
-   * log2(phi*ndvs).
-   * AMS uses the location of the most significant one as an estimate of the log2(ndvs).
-   * We average the two estimators with suitable modifications to obtain an estimate of ndvs.
+  /* We use the Flajolet-Martin estimator to estimate the number of distinct values. FM uses the
+   * location of the least significant zero as an estimate of log2(phi*ndvs).
    */
   public long estimateNumDistinctValues() {
     int sumLeastSigZero = 0;
-    int sumMostSigOne = 0;
     double avgLeastSigZero;
-    double avgMostSigOne;
     double numDistinctValues;
 
     for (int i=0; i< numBitVectors; i++) {
       int leastSigZero = bitVector[i].nextClearBit(0);
       sumLeastSigZero += leastSigZero;
-      int mostSigOne = bitVectorSize;
-
-      for (int j=0; j< bitVectorSize; j++) {
-        if (bitVector[i].get(j)) {
-          mostSigOne = j;
-        }
-      }
-      sumMostSigOne += mostSigOne;
     }
 
     avgLeastSigZero =
         (double)(sumLeastSigZero/(numBitVectors * 1.0)) - (Math.log(phi)/Math.log(2.0));
-    avgMostSigOne = (double)(sumMostSigOne/(numBitVectors * 1.0));
-    numDistinctValues = Math.pow(2.0, (avgMostSigOne + avgLeastSigZero)/2.0);
+    numDistinctValues = Math.pow(2.0, avgLeastSigZero);
     return ((long)(numDistinctValues));
   }
 }

Modified: hive/branches/HIVE-4115/ql/src/test/queries/clientpositive/combine2_win.q
URL: http://svn.apache.org/viewvc/hive/branches/HIVE-4115/ql/src/test/queries/clientpositive/combine2_win.q?rev=1490614&r1=1490613&r2=1490614&view=diff
==============================================================================
--- hive/branches/HIVE-4115/ql/src/test/queries/clientpositive/combine2_win.q (original)
+++ hive/branches/HIVE-4115/ql/src/test/queries/clientpositive/combine2_win.q Fri Jun  7 12:41:20 2013
@@ -11,6 +11,8 @@ set hive.merge.smallfiles.avgsize=0;
 -- INCLUDE_OS_WINDOWS
 -- included only on  windows because of difference in file name encoding logic
 
+-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.20, 0.20S)
+
 create table combine2(key string) partitioned by (value string);
 
 insert overwrite table combine2 partition(value) 

Modified: hive/branches/HIVE-4115/ql/src/test/results/clientpositive/combine2_win.q.out
URL: http://svn.apache.org/viewvc/hive/branches/HIVE-4115/ql/src/test/results/clientpositive/combine2_win.q.out?rev=1490614&r1=1490613&r2=1490614&view=diff
==============================================================================
--- hive/branches/HIVE-4115/ql/src/test/results/clientpositive/combine2_win.q.out (original)
+++ hive/branches/HIVE-4115/ql/src/test/results/clientpositive/combine2_win.q.out Fri Jun  7 12:41:20 2013
@@ -1,8 +1,14 @@
 PREHOOK: query: -- INCLUDE_OS_WINDOWS
+-- included only on  windows because of difference in file name encoding logic
+
+-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.20, 0.20S)
 
 create table combine2(key string) partitioned by (value string)
 PREHOOK: type: CREATETABLE
 POSTHOOK: query: -- INCLUDE_OS_WINDOWS
+-- included only on  windows because of difference in file name encoding logic
+
+-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.20, 0.20S)
 
 create table combine2(key string) partitioned by (value string)
 POSTHOOK: type: CREATETABLE
@@ -124,6 +130,7 @@ STAGE PLANS:
 
 PREHOOK: query: select key, value from combine2 where value is not null order by key
 PREHOOK: type: QUERY
+PREHOOK: Input: default@combine2
 PREHOOK: Input: default@combine2@value=%7C
 PREHOOK: Input: default@combine2@value=2010-04-21%2009%3A45%3A00
 PREHOOK: Input: default@combine2@value=val_0
@@ -135,6 +142,7 @@ PREHOOK: Input: default@combine2@value=v
 #### A masked pattern was here ####
 POSTHOOK: query: select key, value from combine2 where value is not null order by key
 POSTHOOK: type: QUERY
+POSTHOOK: Input: default@combine2
 POSTHOOK: Input: default@combine2@value=%7C
 POSTHOOK: Input: default@combine2@value=2010-04-21%2009%3A45%3A00
 POSTHOOK: Input: default@combine2@value=val_0
@@ -224,7 +232,6 @@ STAGE PLANS:
 #### A masked pattern was here ####
               name default.combine2
               numFiles 1
-              numPartitions 8
               numRows 1
               partition_columns value
               rawDataSize 2
@@ -270,7 +277,6 @@ STAGE PLANS:
 #### A masked pattern was here ####
               name default.combine2
               numFiles 1
-              numPartitions 8
               numRows 1
               partition_columns value
               rawDataSize 2
@@ -316,7 +322,6 @@ STAGE PLANS:
 #### A masked pattern was here ####
               name default.combine2
               numFiles 1
-              numPartitions 8
               numRows 3
               partition_columns value
               rawDataSize 3
@@ -362,7 +367,6 @@ STAGE PLANS:
 #### A masked pattern was here ####
               name default.combine2
               numFiles 1
-              numPartitions 8
               numRows 1
               partition_columns value
               rawDataSize 1
@@ -408,7 +412,6 @@ STAGE PLANS:
 #### A masked pattern was here ####
               name default.combine2
               numFiles 1
-              numPartitions 8
               numRows 1
               partition_columns value
               rawDataSize 1
@@ -454,7 +457,6 @@ STAGE PLANS:
 #### A masked pattern was here ####
               name default.combine2
               numFiles 1
-              numPartitions 8
               numRows 3
               partition_columns value
               rawDataSize 3
@@ -500,7 +502,6 @@ STAGE PLANS:
 #### A masked pattern was here ####
               name default.combine2
               numFiles 1
-              numPartitions 8
               numRows 1
               partition_columns value
               rawDataSize 1
@@ -546,7 +547,6 @@ STAGE PLANS:
 #### A masked pattern was here ####
               name default.combine2
               numFiles 1
-              numPartitions 8
               numRows 1
               partition_columns value
               rawDataSize 1
@@ -607,6 +607,15 @@ STAGE PLANS:
               TotalFiles: 1
               GatherStats: false
               MultiFileSpray: false
+      Truncated Path -> Alias:
+        /combine2/value=%7C [combine2]
+        /combine2/value=2010-04-21%2009%3A45%3A00 [combine2]
+        /combine2/value=val_0 [combine2]
+        /combine2/value=val_2 [combine2]
+        /combine2/value=val_4 [combine2]
+        /combine2/value=val_5 [combine2]
+        /combine2/value=val_8 [combine2]
+        /combine2/value=val_9 [combine2]
 
   Stage: Stage-0
     Fetch Operator
@@ -615,6 +624,7 @@ STAGE PLANS:
 
 PREHOOK: query: select count(1) from combine2 where value is not null
 PREHOOK: type: QUERY
+PREHOOK: Input: default@combine2
 PREHOOK: Input: default@combine2@value=%7C
 PREHOOK: Input: default@combine2@value=2010-04-21%2009%3A45%3A00
 PREHOOK: Input: default@combine2@value=val_0
@@ -626,6 +636,7 @@ PREHOOK: Input: default@combine2@value=v
 #### A masked pattern was here ####
 POSTHOOK: query: select count(1) from combine2 where value is not null
 POSTHOOK: type: QUERY
+POSTHOOK: Input: default@combine2
 POSTHOOK: Input: default@combine2@value=%7C
 POSTHOOK: Input: default@combine2@value=2010-04-21%2009%3A45%3A00
 POSTHOOK: Input: default@combine2@value=val_0
@@ -729,6 +740,7 @@ STAGE PLANS:
 
 PREHOOK: query: select ds, count(1) from srcpart where ds is not null group by ds
 PREHOOK: type: QUERY
+PREHOOK: Input: default@srcpart
 PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=11
 PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=12
 PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=11
@@ -736,6 +748,7 @@ PREHOOK: Input: default@srcpart@ds=2008-
 #### A masked pattern was here ####
 POSTHOOK: query: select ds, count(1) from srcpart where ds is not null group by ds
 POSTHOOK: type: QUERY
+POSTHOOK: Input: default@srcpart
 POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=11
 POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=12
 POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=11

Modified: hive/branches/HIVE-4115/ql/src/test/results/clientpositive/compute_stats_double.q.out
URL: http://svn.apache.org/viewvc/hive/branches/HIVE-4115/ql/src/test/results/clientpositive/compute_stats_double.q.out?rev=1490614&r1=1490613&r2=1490614&view=diff
==============================================================================
--- hive/branches/HIVE-4115/ql/src/test/results/clientpositive/compute_stats_double.q.out (original)
+++ hive/branches/HIVE-4115/ql/src/test/results/clientpositive/compute_stats_double.q.out Fri Jun  7 12:41:20 2013
@@ -30,4 +30,4 @@ select compute_stats(a, 16) from tab_dou
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@tab_double
 #### A masked pattern was here ####
-{"columntype":"Double","min":-87.2,"max":435.33,"countnulls":2,"numdistinctvalues":8}
+{"columntype":"Double","min":-87.2,"max":435.33,"countnulls":2,"numdistinctvalues":11}

Modified: hive/branches/HIVE-4115/ql/src/test/results/clientpositive/compute_stats_long.q.out
URL: http://svn.apache.org/viewvc/hive/branches/HIVE-4115/ql/src/test/results/clientpositive/compute_stats_long.q.out?rev=1490614&r1=1490613&r2=1490614&view=diff
==============================================================================
--- hive/branches/HIVE-4115/ql/src/test/results/clientpositive/compute_stats_long.q.out (original)
+++ hive/branches/HIVE-4115/ql/src/test/results/clientpositive/compute_stats_long.q.out Fri Jun  7 12:41:20 2013
@@ -30,4 +30,4 @@ select compute_stats(a, 16) from tab_int
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@tab_int
 #### A masked pattern was here ####
-{"columntype":"Long","min":0,"max":344,"countnulls":1,"numdistinctvalues":16}
+{"columntype":"Long","min":0,"max":344,"countnulls":1,"numdistinctvalues":11}

Modified: hive/branches/HIVE-4115/ql/src/test/results/clientpositive/compute_stats_string.q.out
URL: http://svn.apache.org/viewvc/hive/branches/HIVE-4115/ql/src/test/results/clientpositive/compute_stats_string.q.out?rev=1490614&r1=1490613&r2=1490614&view=diff
==============================================================================
--- hive/branches/HIVE-4115/ql/src/test/results/clientpositive/compute_stats_string.q.out (original)
+++ hive/branches/HIVE-4115/ql/src/test/results/clientpositive/compute_stats_string.q.out Fri Jun  7 12:41:20 2013
@@ -30,4 +30,4 @@ select compute_stats(a, 16) from tab_str
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@tab_string
 #### A masked pattern was here ####
-{"columntype":"String","maxlength":11,"avglength":3.9,"countnulls":0,"numdistinctvalues":5}
+{"columntype":"String","maxlength":11,"avglength":3.9,"countnulls":0,"numdistinctvalues":7}

Modified: hive/branches/HIVE-4115/ql/src/test/results/clientpositive/input_part10_win.q.out
URL: http://svn.apache.org/viewvc/hive/branches/HIVE-4115/ql/src/test/results/clientpositive/input_part10_win.q.out?rev=1490614&r1=1490613&r2=1490614&view=diff
==============================================================================
--- hive/branches/HIVE-4115/ql/src/test/results/clientpositive/input_part10_win.q.out (original)
+++ hive/branches/HIVE-4115/ql/src/test/results/clientpositive/input_part10_win.q.out Fri Jun  7 12:41:20 2013
@@ -1,4 +1,5 @@
 PREHOOK: query: -- INCLUDE_OS_WINDOWS
+-- included only on  windows because of difference in file name encoding logic
 
 CREATE TABLE part_special (
   a STRING,
@@ -9,6 +10,7 @@ CREATE TABLE part_special (
 )
 PREHOOK: type: CREATETABLE
 POSTHOOK: query: -- INCLUDE_OS_WINDOWS
+-- included only on  windows because of difference in file name encoding logic
 
 CREATE TABLE part_special (
   a STRING,
@@ -105,18 +107,26 @@ POSTHOOK: query: DESCRIBE EXTENDED part_
 POSTHOOK: type: DESCTABLE
 POSTHOOK: Lineage: part_special PARTITION(ds=2008 04 08,ts=10:11:12=455).a SIMPLE []
 POSTHOOK: Lineage: part_special PARTITION(ds=2008 04 08,ts=10:11:12=455).b SIMPLE []
-a	string	
-b	string	
-ds	string	
-ts	string	
+a                   	string              	None                
+b                   	string              	None                
+ds                  	string              	None                
+ts                  	string              	None                
+	 	 
+# Partition Information	 	 
+# col_name            	data_type           	comment             
+	 	 
+ds                  	string              	None                
+ts                  	string              	None                
 	 	 
 #### A masked pattern was here ####
 PREHOOK: query: SELECT * FROM part_special WHERE ds='2008 04 08' AND ts = '10:11:12=455'
 PREHOOK: type: QUERY
+PREHOOK: Input: default@part_special
 PREHOOK: Input: default@part_special@ds=2008%2004%2008/ts=10%3A11%3A12%3D455
 #### A masked pattern was here ####
 POSTHOOK: query: SELECT * FROM part_special WHERE ds='2008 04 08' AND ts = '10:11:12=455'
 POSTHOOK: type: QUERY
+POSTHOOK: Input: default@part_special
 POSTHOOK: Input: default@part_special@ds=2008%2004%2008/ts=10%3A11%3A12%3D455
 #### A masked pattern was here ####
 POSTHOOK: Lineage: part_special PARTITION(ds=2008 04 08,ts=10:11:12=455).a SIMPLE []

Modified: hive/branches/HIVE-4115/ql/src/test/results/clientpositive/load_dyn_part14_win.q.out
URL: http://svn.apache.org/viewvc/hive/branches/HIVE-4115/ql/src/test/results/clientpositive/load_dyn_part14_win.q.out?rev=1490614&r1=1490613&r2=1490614&view=diff
==============================================================================
--- hive/branches/HIVE-4115/ql/src/test/results/clientpositive/load_dyn_part14_win.q.out (original)
+++ hive/branches/HIVE-4115/ql/src/test/results/clientpositive/load_dyn_part14_win.q.out Fri Jun  7 12:41:20 2013
@@ -1,9 +1,13 @@
 PREHOOK: query: -- INCLUDE_OS_WINDOWS
+-- included only on  windows because of difference in file name encoding logic
+
 
 create table if not exists nzhang_part14 (key string) 
   partitioned by (value string)
 PREHOOK: type: CREATETABLE
 POSTHOOK: query: -- INCLUDE_OS_WINDOWS
+-- included only on  windows because of difference in file name encoding logic
+
 
 create table if not exists nzhang_part14 (key string) 
   partitioned by (value string)
@@ -13,8 +17,13 @@ PREHOOK: query: describe extended nzhang
 PREHOOK: type: DESCTABLE
 POSTHOOK: query: describe extended nzhang_part14
 POSTHOOK: type: DESCTABLE
-key	string	
-value	string	
+key                 	string              	None                
+value               	string              	None                
+	 	 
+# Partition Information	 	 
+# col_name            	data_type           	comment             
+	 	 
+value               	string              	None                
 	 	 
 #### A masked pattern was here ####
 PREHOOK: query: explain
@@ -42,14 +51,16 @@ ABSTRACT SYNTAX TREE:
 
 STAGE DEPENDENCIES:
   Stage-1 is a root stage
-  Stage-2 depends on stages: Stage-1, Stage-7, Stage-8
-  Stage-6 depends on stages: Stage-2 , consists of Stage-5, Stage-4
+  Stage-2 depends on stages: Stage-1, Stage-9, Stage-10
+  Stage-8 depends on stages: Stage-2 , consists of Stage-5, Stage-4, Stage-6
   Stage-5
-  Stage-0 depends on stages: Stage-5, Stage-4
+  Stage-0 depends on stages: Stage-5, Stage-4, Stage-7
   Stage-3 depends on stages: Stage-0
   Stage-4
-  Stage-7 is a root stage
-  Stage-8 is a root stage
+  Stage-6
+  Stage-7 depends on stages: Stage-6
+  Stage-9 is a root stage
+  Stage-10 is a root stage
 
 STAGE PLANS:
   Stage: Stage-1
@@ -142,7 +153,7 @@ STAGE PLANS:
                       serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
                       name: default.nzhang_part14
 
-  Stage: Stage-6
+  Stage: Stage-8
     Conditional Operator
 
   Stage: Stage-5
@@ -179,7 +190,26 @@ STAGE PLANS:
                   serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
                   name: default.nzhang_part14
 
+  Stage: Stage-6
+    Map Reduce
+      Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                  name: default.nzhang_part14
+
   Stage: Stage-7
+    Move Operator
+      files:
+          hdfs directory: true
+#### A masked pattern was here ####
+
+  Stage: Stage-9
     Map Reduce
       Alias -> Map Operator Tree:
         null-subquery2:t-subquery2:src 
@@ -211,7 +241,7 @@ STAGE PLANS:
                   input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                   output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
 
-  Stage: Stage-8
+  Stage: Stage-10
     Map Reduce
       Alias -> Map Operator Tree:
         null-subquery1-subquery1:t-subquery1-subquery1:src 
@@ -280,12 +310,14 @@ value=__HIVE_DEFAULT_PARTITION__
 PREHOOK: query: select * from nzhang_part14 where value <> 'a'
 order by key, value
 PREHOOK: type: QUERY
+PREHOOK: Input: default@nzhang_part14
 PREHOOK: Input: default@nzhang_part14@value=%20
 PREHOOK: Input: default@nzhang_part14@value=__HIVE_DEFAULT_PARTITION__
 #### A masked pattern was here ####
 POSTHOOK: query: select * from nzhang_part14 where value <> 'a'
 order by key, value
 POSTHOOK: type: QUERY
+POSTHOOK: Input: default@nzhang_part14
 POSTHOOK: Input: default@nzhang_part14@value=%20
 POSTHOOK: Input: default@nzhang_part14@value=__HIVE_DEFAULT_PARTITION__
 #### A masked pattern was here ####