You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by px...@apache.org on 2015/09/05 02:57:22 UTC
hive git commit: HIVE-5277: HBase handler skips rows with null valued
first cells when only row key is selected (Swarnim Kulkarni,
reviewed by Xuefu Zhang)
Repository: hive
Updated Branches:
refs/heads/branch-1.2 1c80f2e5a -> 4aa432f8b
HIVE-5277: HBase handler skips rows with null valued first cells when only row key is selected (Swarnim Kulkarni, reviewed by Xuefu Zhang)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/4aa432f8
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/4aa432f8
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/4aa432f8
Branch: refs/heads/branch-1.2
Commit: 4aa432f8b27601bca04dd7a9222711590e405eac
Parents: 1c80f2e
Author: Swarnim Kulkarni <ku...@gmail.com>
Authored: Fri Aug 21 11:21:08 2015 -0700
Committer: Pengcheng Xiong <px...@apache.org>
Committed: Fri Sep 4 17:48:13 2015 -0700
----------------------------------------------------------------------
.../hive/hbase/HiveHBaseInputFormatUtil.java | 50 +++++----
.../queries/positive/hbase_null_first_col.q | 22 ++++
.../results/positive/hbase_null_first_col.q.out | 109 +++++++++++++++++++
3 files changed, 158 insertions(+), 23 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/4aa432f8/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HiveHBaseInputFormatUtil.java
----------------------------------------------------------------------
diff --git a/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HiveHBaseInputFormatUtil.java b/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HiveHBaseInputFormatUtil.java
index 0524572..c002070 100644
--- a/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HiveHBaseInputFormatUtil.java
+++ b/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HiveHBaseInputFormatUtil.java
@@ -18,9 +18,18 @@
package org.apache.hadoop.hive.hbase;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.filter.FilterList;
+import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter;
+import org.apache.hadoop.hbase.filter.KeyOnlyFilter;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hive.hbase.ColumnMappings.ColumnMapping;
import org.apache.hadoop.hive.ql.index.IndexSearchCondition;
@@ -28,12 +37,6 @@ import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.mapred.JobConf;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
/**
* Util code common between HiveHBaseTableInputFormat and HiveHBaseTableSnapshotInputFormat.
*/
@@ -95,26 +98,27 @@ class HiveHBaseInputFormatUtil {
}
}
- // The HBase table's row key maps to a Hive table column. In the corner case when only the
- // row key column is selected in Hive, the HBase Scan will be empty i.e. no column family/
- // column qualifier will have been added to the scan. We arbitrarily add at least one column
- // to the HBase scan so that we can retrieve all of the row keys and return them as the Hive
- // tables column projection.
+ // For queries such as count(key) or count(*), readColIDs is either empty (for count(*))
+ // or contains only the key column. In either case, nothing gets added to the scan.
+ // So if readAllColumns is true, we add all columns to the scan. Otherwise, we add
+ // a key-only filter so that the scan still visits every row and the count can be
+ // computed from the row keys alone.
if (empty) {
- for (ColumnMapping colMap: columnMappings) {
- if (colMap.hbaseRowKey || colMap.hbaseTimestamp) {
- continue;
- }
-
- if (colMap.qualifierName == null) {
- scan.addFamily(colMap.familyNameBytes);
- } else {
- scan.addColumn(colMap.familyNameBytes, colMap.qualifierNameBytes);
- }
+ if (readAllColumns) {
+ for (ColumnMapping colMap: columnMappings) {
+ if (colMap.hbaseRowKey || colMap.hbaseTimestamp) {
+ continue;
+ }
- if (!readAllColumns) {
- break;
+ if (colMap.qualifierName == null) {
+ scan.addFamily(colMap.familyNameBytes);
+ } else {
+ scan.addColumn(colMap.familyNameBytes, colMap.qualifierNameBytes);
+ }
}
+ } else {
+ // Add a filter that scans only the keys, so that every row key is picked up
+ scan.setFilter(new FilterList(new FirstKeyOnlyFilter(), new KeyOnlyFilter()));
}
}
http://git-wip-us.apache.org/repos/asf/hive/blob/4aa432f8/hbase-handler/src/test/queries/positive/hbase_null_first_col.q
----------------------------------------------------------------------
diff --git a/hbase-handler/src/test/queries/positive/hbase_null_first_col.q b/hbase-handler/src/test/queries/positive/hbase_null_first_col.q
new file mode 100644
index 0000000..0d9ff56
--- /dev/null
+++ b/hbase-handler/src/test/queries/positive/hbase_null_first_col.q
@@ -0,0 +1,22 @@
+DROP TABLE src_null;
+DROP TABLE hbase_null;
+
+CREATE TABLE src_null(a STRING, b STRING, c STRING, d STRING) STORED AS TEXTFILE;
+LOAD DATA LOCAL INPATH '../../data/files/null.txt' INTO TABLE src_null;
+
+CREATE TABLE hbase_null(key string, col1 string, col2 string)
+STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
+WITH SERDEPROPERTIES (
+"hbase.columns.mapping" = ":key,cf1:c1,cf1:c2"
+);
+
+SELECT d, a, c FROM src_null;
+
+INSERT INTO TABLE hbase_null SELECT d, a, c FROM src_null;
+
+SELECT COUNT(d) FROM src_null;
+SELECT COUNT(key) FROM hbase_null;
+SELECT COUNT(*) FROM hbase_null;
+
+DROP TABLE src_null;
+DROP TABLE hbase_null;
http://git-wip-us.apache.org/repos/asf/hive/blob/4aa432f8/hbase-handler/src/test/results/positive/hbase_null_first_col.q.out
----------------------------------------------------------------------
diff --git a/hbase-handler/src/test/results/positive/hbase_null_first_col.q.out b/hbase-handler/src/test/results/positive/hbase_null_first_col.q.out
new file mode 100644
index 0000000..bb4491b
--- /dev/null
+++ b/hbase-handler/src/test/results/positive/hbase_null_first_col.q.out
@@ -0,0 +1,109 @@
+PREHOOK: query: DROP TABLE src_null
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE src_null
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: DROP TABLE hbase_null
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE hbase_null
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE src_null(a STRING, b STRING, c STRING, d STRING) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@src_null
+POSTHOOK: query: CREATE TABLE src_null(a STRING, b STRING, c STRING, d STRING) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@src_null
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/null.txt' INTO TABLE src_null
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@src_null
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/null.txt' INTO TABLE src_null
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@src_null
+PREHOOK: query: CREATE TABLE hbase_null(key string, col1 string, col2 string)
+STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
+WITH SERDEPROPERTIES (
+"hbase.columns.mapping" = ":key,cf1:c1,cf1:c2"
+)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@hbase_null
+POSTHOOK: query: CREATE TABLE hbase_null(key string, col1 string, col2 string)
+STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
+WITH SERDEPROPERTIES (
+"hbase.columns.mapping" = ":key,cf1:c1,cf1:c2"
+)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@hbase_null
+PREHOOK: query: SELECT d, a, c FROM src_null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src_null
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT d, a, c FROM src_null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src_null
+#### A masked pattern was here ####
+0 1.0 same
+1 1.0 same
+2 1.0 same
+3 1.0 same
+4 1.0 same
+5 NULL same
+6 NULL same
+7 1.0 same
+8 1.0 same
+9 1.0 same
+PREHOOK: query: INSERT INTO TABLE hbase_null SELECT d, a, c FROM src_null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src_null
+PREHOOK: Output: default@hbase_null
+POSTHOOK: query: INSERT INTO TABLE hbase_null SELECT d, a, c FROM src_null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src_null
+POSTHOOK: Output: default@hbase_null
+PREHOOK: query: SELECT COUNT(d) FROM src_null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src_null
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT COUNT(d) FROM src_null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src_null
+#### A masked pattern was here ####
+10
+PREHOOK: query: SELECT COUNT(key) FROM hbase_null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@hbase_null
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT COUNT(key) FROM hbase_null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@hbase_null
+#### A masked pattern was here ####
+10
+PREHOOK: query: SELECT COUNT(*) FROM hbase_null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@hbase_null
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT COUNT(*) FROM hbase_null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@hbase_null
+#### A masked pattern was here ####
+10
+PREHOOK: query: DROP TABLE src_null
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@src_null
+PREHOOK: Output: default@src_null
+POSTHOOK: query: DROP TABLE src_null
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@src_null
+POSTHOOK: Output: default@src_null
+PREHOOK: query: DROP TABLE hbase_null
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@hbase_null
+PREHOOK: Output: default@hbase_null
+POSTHOOK: query: DROP TABLE hbase_null
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@hbase_null
+POSTHOOK: Output: default@hbase_null