You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by px...@apache.org on 2015/09/05 02:57:22 UTC

hive git commit: HIVE-5277: HBase handler skips rows with null valued first cells when only row key is selected (Swarnim Kulkarni, reviewed by Xuefu Zhang)

Repository: hive
Updated Branches:
  refs/heads/branch-1.2 1c80f2e5a -> 4aa432f8b


HIVE-5277: HBase handler skips rows with null valued first cells when only row key is selected (Swarnim Kulkarni, reviewed by Xuefu Zhang)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/4aa432f8
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/4aa432f8
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/4aa432f8

Branch: refs/heads/branch-1.2
Commit: 4aa432f8b27601bca04dd7a9222711590e405eac
Parents: 1c80f2e
Author: Swarnim Kulkarni <ku...@gmail.com>
Authored: Fri Aug 21 11:21:08 2015 -0700
Committer: Pengcheng Xiong <px...@apache.org>
Committed: Fri Sep 4 17:48:13 2015 -0700

----------------------------------------------------------------------
 .../hive/hbase/HiveHBaseInputFormatUtil.java    |  50 +++++----
 .../queries/positive/hbase_null_first_col.q     |  22 ++++
 .../results/positive/hbase_null_first_col.q.out | 109 +++++++++++++++++++
 3 files changed, 158 insertions(+), 23 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/4aa432f8/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HiveHBaseInputFormatUtil.java
----------------------------------------------------------------------
diff --git a/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HiveHBaseInputFormatUtil.java b/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HiveHBaseInputFormatUtil.java
index 0524572..c002070 100644
--- a/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HiveHBaseInputFormatUtil.java
+++ b/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HiveHBaseInputFormatUtil.java
@@ -18,9 +18,18 @@
 
 package org.apache.hadoop.hive.hbase;
 
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
 import org.apache.hadoop.hbase.HBaseConfiguration;
 import org.apache.hadoop.hbase.client.HTable;
 import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.filter.FilterList;
+import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter;
+import org.apache.hadoop.hbase.filter.KeyOnlyFilter;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hive.hbase.ColumnMappings.ColumnMapping;
 import org.apache.hadoop.hive.ql.index.IndexSearchCondition;
@@ -28,12 +37,6 @@ import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
 import org.apache.hadoop.hive.serde2.SerDeException;
 import org.apache.hadoop.mapred.JobConf;
 
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
 /**
  * Util code common between HiveHBaseTableInputFormat and HiveHBaseTableSnapshotInputFormat.
  */
@@ -95,26 +98,27 @@ class HiveHBaseInputFormatUtil {
       }
     }
 
-    // The HBase table's row key maps to a Hive table column. In the corner case when only the
-    // row key column is selected in Hive, the HBase Scan will be empty i.e. no column family/
-    // column qualifier will have been added to the scan. We arbitrarily add at least one column
-    // to the HBase scan so that we can retrieve all of the row keys and return them as the Hive
-    // tables column projection.
+    // For queries such as count(key) or count(*), readColIDs is either empty (for count(*))
+    // or contains only the key column. In either case, nothing gets added to the scan. So if
+    // readAllColumns is true, we add all of the mapped columns to the scan. Otherwise, we add
+    // a key-only filter so that the scan returns every row key, allowing the count to run on
+    // the keys alone.
     if (empty) {
-      for (ColumnMapping colMap: columnMappings) {
-        if (colMap.hbaseRowKey || colMap.hbaseTimestamp) {
-          continue;
-        }
-
-        if (colMap.qualifierName == null) {
-          scan.addFamily(colMap.familyNameBytes);
-        } else {
-          scan.addColumn(colMap.familyNameBytes, colMap.qualifierNameBytes);
-        }
+      if (readAllColumns) {
+        for (ColumnMapping colMap: columnMappings) {
+          if (colMap.hbaseRowKey || colMap.hbaseTimestamp) {
+            continue;
+          }
 
-        if (!readAllColumns) {
-          break;
+          if (colMap.qualifierName == null) {
+            scan.addFamily(colMap.familyNameBytes);
+          } else {
+            scan.addColumn(colMap.familyNameBytes, colMap.qualifierNameBytes);
+          }
         }
+      } else {
+        // Add a filter to just do a scan on the keys so that we pick up everything
+        scan.setFilter(new FilterList(new FirstKeyOnlyFilter(), new KeyOnlyFilter()));
       }
     }
 

http://git-wip-us.apache.org/repos/asf/hive/blob/4aa432f8/hbase-handler/src/test/queries/positive/hbase_null_first_col.q
----------------------------------------------------------------------
diff --git a/hbase-handler/src/test/queries/positive/hbase_null_first_col.q b/hbase-handler/src/test/queries/positive/hbase_null_first_col.q
new file mode 100644
index 0000000..0d9ff56
--- /dev/null
+++ b/hbase-handler/src/test/queries/positive/hbase_null_first_col.q
@@ -0,0 +1,22 @@
+DROP TABLE src_null;
+DROP TABLE hbase_null;
+
+CREATE TABLE src_null(a STRING, b STRING, c STRING, d STRING) STORED AS TEXTFILE;
+LOAD DATA LOCAL INPATH '../../data/files/null.txt' INTO TABLE src_null;
+
+CREATE TABLE hbase_null(key string, col1 string, col2 string)
+STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
+WITH SERDEPROPERTIES (
+"hbase.columns.mapping" = ":key,cf1:c1,cf1:c2"
+);
+
+SELECT d, a, c FROM src_null;
+
+INSERT INTO TABLE hbase_null SELECT d, a, c FROM src_null;
+
+SELECT COUNT(d) FROM src_null;
+SELECT COUNT(key) FROM hbase_null;
+SELECT COUNT(*) FROM hbase_null;
+
+DROP TABLE src_null;
+DROP TABLE hbase_null;

http://git-wip-us.apache.org/repos/asf/hive/blob/4aa432f8/hbase-handler/src/test/results/positive/hbase_null_first_col.q.out
----------------------------------------------------------------------
diff --git a/hbase-handler/src/test/results/positive/hbase_null_first_col.q.out b/hbase-handler/src/test/results/positive/hbase_null_first_col.q.out
new file mode 100644
index 0000000..bb4491b
--- /dev/null
+++ b/hbase-handler/src/test/results/positive/hbase_null_first_col.q.out
@@ -0,0 +1,109 @@
+PREHOOK: query: DROP TABLE src_null
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE src_null
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: DROP TABLE hbase_null
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE hbase_null
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE src_null(a STRING, b STRING, c STRING, d STRING) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@src_null
+POSTHOOK: query: CREATE TABLE src_null(a STRING, b STRING, c STRING, d STRING) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@src_null
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/null.txt' INTO TABLE src_null
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@src_null
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/null.txt' INTO TABLE src_null
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@src_null
+PREHOOK: query: CREATE TABLE hbase_null(key string, col1 string, col2 string)
+STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
+WITH SERDEPROPERTIES (
+"hbase.columns.mapping" = ":key,cf1:c1,cf1:c2"
+)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@hbase_null
+POSTHOOK: query: CREATE TABLE hbase_null(key string, col1 string, col2 string)
+STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
+WITH SERDEPROPERTIES (
+"hbase.columns.mapping" = ":key,cf1:c1,cf1:c2"
+)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@hbase_null
+PREHOOK: query: SELECT d, a, c FROM src_null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src_null
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT d, a, c FROM src_null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src_null
+#### A masked pattern was here ####
+0	1.0	same
+1	1.0	same
+2	1.0	same
+3	1.0	same
+4	1.0	same
+5	NULL	same
+6	NULL	same
+7	1.0	same
+8	1.0	same
+9	1.0	same
+PREHOOK: query: INSERT INTO TABLE hbase_null SELECT d, a, c FROM src_null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src_null
+PREHOOK: Output: default@hbase_null
+POSTHOOK: query: INSERT INTO TABLE hbase_null SELECT d, a, c FROM src_null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src_null
+POSTHOOK: Output: default@hbase_null
+PREHOOK: query: SELECT COUNT(d) FROM src_null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src_null
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT COUNT(d) FROM src_null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src_null
+#### A masked pattern was here ####
+10
+PREHOOK: query: SELECT COUNT(key) FROM hbase_null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@hbase_null
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT COUNT(key) FROM hbase_null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@hbase_null
+#### A masked pattern was here ####
+10
+PREHOOK: query: SELECT COUNT(*) FROM hbase_null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@hbase_null
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT COUNT(*) FROM hbase_null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@hbase_null
+#### A masked pattern was here ####
+10
+PREHOOK: query: DROP TABLE src_null
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@src_null
+PREHOOK: Output: default@src_null
+POSTHOOK: query: DROP TABLE src_null
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@src_null
+POSTHOOK: Output: default@src_null
+PREHOOK: query: DROP TABLE hbase_null
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@hbase_null
+PREHOOK: Output: default@hbase_null
+POSTHOOK: query: DROP TABLE hbase_null
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@hbase_null
+POSTHOOK: Output: default@hbase_null