You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ai...@apache.org on 2015/12/22 15:38:49 UTC
hive git commit: HIVE-12541: SymbolicTextInputFormat should support
the path with regex (Xiaowei Wang, reviewed by Aihua Xu)
Repository: hive
Updated Branches:
refs/heads/master cdcc35e61 -> 4df9b4d20
HIVE-12541: SymbolicTextInputFormat should support the path with regex (Xiaowei Wang, reviewed by Aihua Xu)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/4df9b4d2
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/4df9b4d2
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/4df9b4d2
Branch: refs/heads/master
Commit: 4df9b4d208087dd659309cda7d4627d000b2f6ba
Parents: cdcc35e
Author: Aihua Xu <ai...@apache.org>
Authored: Tue Dec 22 09:37:41 2015 -0500
Committer: Aihua Xu <ai...@apache.org>
Committed: Tue Dec 22 09:37:41 2015 -0500
----------------------------------------------------------------------
data/files/regex-path-2015-12-10_03.txt | 1 +
data/files/regex-path-201512-10_03.txt | 1 +
data/files/regex-path-2015121003.txt | 1 +
data/files/symlink-with-regex.txt | 2 +
.../hadoop/hive/ql/io/SymbolicInputFormat.java | 7 +-
.../clientpositive/symlink_text_input_format.q | 26 +++
.../symlink_text_input_format.q.out | 218 +++++++++++++++++++
7 files changed, 254 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/4df9b4d2/data/files/regex-path-2015-12-10_03.txt
----------------------------------------------------------------------
diff --git a/data/files/regex-path-2015-12-10_03.txt b/data/files/regex-path-2015-12-10_03.txt
new file mode 100644
index 0000000..315e406
--- /dev/null
+++ b/data/files/regex-path-2015-12-10_03.txt
@@ -0,0 +1 @@
+101101
http://git-wip-us.apache.org/repos/asf/hive/blob/4df9b4d2/data/files/regex-path-201512-10_03.txt
----------------------------------------------------------------------
diff --git a/data/files/regex-path-201512-10_03.txt b/data/files/regex-path-201512-10_03.txt
new file mode 100644
index 0000000..e2bdf39
--- /dev/null
+++ b/data/files/regex-path-201512-10_03.txt
@@ -0,0 +1 @@
+102102
http://git-wip-us.apache.org/repos/asf/hive/blob/4df9b4d2/data/files/regex-path-2015121003.txt
----------------------------------------------------------------------
diff --git a/data/files/regex-path-2015121003.txt b/data/files/regex-path-2015121003.txt
new file mode 100644
index 0000000..74a4ca1
--- /dev/null
+++ b/data/files/regex-path-2015121003.txt
@@ -0,0 +1 @@
+103103
http://git-wip-us.apache.org/repos/asf/hive/blob/4df9b4d2/data/files/symlink-with-regex.txt
----------------------------------------------------------------------
diff --git a/data/files/symlink-with-regex.txt b/data/files/symlink-with-regex.txt
new file mode 100644
index 0000000..21e119e
--- /dev/null
+++ b/data/files/symlink-with-regex.txt
@@ -0,0 +1,2 @@
+../../data/files/*2015{-,}12{-,}10{_03,03}*.txt
+../../data/files/T{1,3}.txt
http://git-wip-us.apache.org/repos/asf/hive/blob/4df9b4d2/ql/src/java/org/apache/hadoop/hive/ql/io/SymbolicInputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/SymbolicInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/SymbolicInputFormat.java
index feef854..8b49204 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/SymbolicInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/SymbolicInputFormat.java
@@ -75,8 +75,11 @@ public class SymbolicInputFormat implements ReworkMapredInputFormat {
while ((line = reader.readLine()) != null) {
// no check for the line? How to check?
// if the line is invalid for any reason, the job will fail.
- toAddPathToPart.put(line, partDesc);
- pathToAliases.put(line, aliases);
+ FileStatus[] matches = fileSystem.globStatus(new Path(line));
+ for(FileStatus fileStatus :matches) {
+ toAddPathToPart.put(fileStatus.getPath().toUri().getPath(), partDesc);
+ pathToAliases.put(fileStatus.getPath().toUri().getPath(), aliases);
+ }
}
} finally {
org.apache.hadoop.io.IOUtils.closeStream(reader);
http://git-wip-us.apache.org/repos/asf/hive/blob/4df9b4d2/ql/src/test/queries/clientpositive/symlink_text_input_format.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/symlink_text_input_format.q b/ql/src/test/queries/clientpositive/symlink_text_input_format.q
index 521a617..d89aad4 100644
--- a/ql/src/test/queries/clientpositive/symlink_text_input_format.q
+++ b/ql/src/test/queries/clientpositive/symlink_text_input_format.q
@@ -22,3 +22,29 @@ EXPLAIN SELECT count(1) FROM symlink_text_input_format;
SELECT count(1) FROM symlink_text_input_format;
DROP TABLE symlink_text_input_format;
+
+CREATE TABLE symlink_text_input_format (key STRING, value STRING) STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat';
+
+dfs -cp ../../data/files/symlink-with-regex.txt ${system:test.warehouse.dir}/symlink_text_input_format/symlink-with-regex.txt;
+dfs -cp ../../data/files/symlink2.txt ${system:test.warehouse.dir}/symlink_text_input_format/symlink2.txt;
+
+EXPLAIN SELECT * FROM symlink_text_input_format order by key, value;
+
+SELECT * FROM symlink_text_input_format order by key, value;
+
+EXPLAIN SELECT value FROM symlink_text_input_format order by value;
+
+SELECT value FROM symlink_text_input_format order by value;
+
+EXPLAIN SELECT count(1) FROM symlink_text_input_format;
+
+SELECT count(1) FROM symlink_text_input_format;
+
+SET hive.rework.mapredwork = true ;
+SET mapred.max.split.size= 0 ;
+SET mapred.min.split.size.per.node= 0 ;
+SET mapred.min.split.size.per.rack= 0 ;
+
+SELECT count(1) FROM symlink_text_input_format;
+
+DROP TABLE symlink_text_input_format;
http://git-wip-us.apache.org/repos/asf/hive/blob/4df9b4d2/ql/src/test/results/clientpositive/symlink_text_input_format.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/symlink_text_input_format.q.out b/ql/src/test/results/clientpositive/symlink_text_input_format.q.out
index 6c2e2e6..6a091e2 100644
--- a/ql/src/test/results/clientpositive/symlink_text_input_format.q.out
+++ b/ql/src/test/results/clientpositive/symlink_text_input_format.q.out
@@ -223,3 +223,221 @@ POSTHOOK: query: DROP TABLE symlink_text_input_format
POSTHOOK: type: DROPTABLE
POSTHOOK: Input: default@symlink_text_input_format
POSTHOOK: Output: default@symlink_text_input_format
+PREHOOK: query: CREATE TABLE symlink_text_input_format (key STRING, value STRING) STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat'
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@symlink_text_input_format
+POSTHOOK: query: CREATE TABLE symlink_text_input_format (key STRING, value STRING) STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat'
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@symlink_text_input_format
+PREHOOK: query: EXPLAIN SELECT * FROM symlink_text_input_format order by key, value
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT * FROM symlink_text_input_format order by key, value
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: symlink_text_input_format
+ Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: key (type: string), value (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col0 (type: string), _col1 (type: string)
+ sort order: ++
+ Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ Reduce Operator Tree:
+ Select Operator
+ expressions: KEY.reducesinkkey0 (type: string), KEY.reducesinkkey1 (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: SELECT * FROM symlink_text_input_format order by key, value
+PREHOOK: type: QUERY
+PREHOOK: Input: default@symlink_text_input_format
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM symlink_text_input_format order by key, value
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@symlink_text_input_format
+#### A masked pattern was here ####
+1 11
+101 101
+102 102
+103 103
+2 12
+2 12
+2 22
+3 13
+3 13
+4 14
+4 14
+5 15
+6 16
+7 17
+7 17
+8 18
+8 18
+8 18
+8 28
+PREHOOK: query: EXPLAIN SELECT value FROM symlink_text_input_format order by value
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT value FROM symlink_text_input_format order by value
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: symlink_text_input_format
+ Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: value (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ sort order: +
+ Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ Reduce Operator Tree:
+ Select Operator
+ expressions: KEY.reducesinkkey0 (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: SELECT value FROM symlink_text_input_format order by value
+PREHOOK: type: QUERY
+PREHOOK: Input: default@symlink_text_input_format
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT value FROM symlink_text_input_format order by value
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@symlink_text_input_format
+#### A masked pattern was here ####
+101
+102
+103
+11
+12
+12
+13
+13
+14
+14
+15
+16
+17
+17
+18
+18
+18
+22
+28
+PREHOOK: query: EXPLAIN SELECT count(1) FROM symlink_text_input_format
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT count(1) FROM symlink_text_input_format
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: symlink_text_input_format
+ Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ Statistics: Num rows: 1 Data size: 100 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: count(1)
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: bigint)
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: count(VALUE._col0)
+ mode: mergepartial
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: SELECT count(1) FROM symlink_text_input_format
+PREHOOK: type: QUERY
+PREHOOK: Input: default@symlink_text_input_format
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT count(1) FROM symlink_text_input_format
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@symlink_text_input_format
+#### A masked pattern was here ####
+19
+PREHOOK: query: SELECT count(1) FROM symlink_text_input_format
+PREHOOK: type: QUERY
+PREHOOK: Input: default@symlink_text_input_format
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT count(1) FROM symlink_text_input_format
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@symlink_text_input_format
+#### A masked pattern was here ####
+19
+PREHOOK: query: DROP TABLE symlink_text_input_format
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@symlink_text_input_format
+PREHOOK: Output: default@symlink_text_input_format
+POSTHOOK: query: DROP TABLE symlink_text_input_format
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@symlink_text_input_format
+POSTHOOK: Output: default@symlink_text_input_format