You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by vg...@apache.org on 2018/05/15 22:40:41 UTC

[21/50] [abbrv] hive git commit: HIVE-19453 : Extend Load Data statement to take Input file format and Serde as parameters (Deepak Jaiswal, reviewed by Jason Dere)

HIVE-19453 : Extend Load Data statement to take Input file format and Serde as parameters (Deepak Jaiswal, reviewed by Jason Dere)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/32e29cc6
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/32e29cc6
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/32e29cc6

Branch: refs/heads/branch-3.0.0
Commit: 32e29cc63c41d722a4b2f8ffae4b9c3a660b8db4
Parents: b331338
Author: Deepak Jaiswal <dj...@apache.org>
Authored: Wed May 9 11:06:34 2018 -0700
Committer: Deepak Jaiswal <dj...@apache.org>
Committed: Fri May 11 10:55:14 2018 -0700

----------------------------------------------------------------------
 .../apache/hadoop/hive/ql/parse/HiveParser.g    | 12 +++++--
 .../hive/ql/parse/LoadSemanticAnalyzer.java     | 33 ++++++++++++++++++--
 .../clientpositive/load_data_using_job.q        |  8 +++--
 .../llap/load_data_using_job.q.out              |  8 +++++
 4 files changed, 54 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/32e29cc6/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g b/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g
index a837d67..3712a53 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g
@@ -422,6 +422,7 @@ TOK_ADD_TRIGGER;
 TOK_REPLACE;
 TOK_LIKERP;
 TOK_UNMANAGED;
+TOK_INPUTFORMAT;
 }
 
 
@@ -835,8 +836,8 @@ execStatement
 loadStatement
 @init { pushMsg("load statement", state); }
 @after { popMsg(state); }
-    : KW_LOAD KW_DATA (islocal=KW_LOCAL)? KW_INPATH (path=StringLiteral) (isoverwrite=KW_OVERWRITE)? KW_INTO KW_TABLE (tab=tableOrPartition)
-    -> ^(TOK_LOAD $path $tab $islocal? $isoverwrite?)
+    : KW_LOAD KW_DATA (islocal=KW_LOCAL)? KW_INPATH (path=StringLiteral) (isoverwrite=KW_OVERWRITE)? KW_INTO KW_TABLE (tab=tableOrPartition) inputFileFormat?
+    -> ^(TOK_LOAD $path $tab $islocal? $isoverwrite? inputFileFormat?)
     ;
 
 replicationClause
@@ -1489,6 +1490,13 @@ fileFormat
     | genericSpec=identifier -> ^(TOK_FILEFORMAT_GENERIC $genericSpec)
     ;
 
+inputFileFormat
+@init { pushMsg("Load Data input file format specification", state); }
+@after { popMsg(state); }
+    : KW_INPUTFORMAT inFmt=StringLiteral KW_SERDE serdeCls=StringLiteral
+      -> ^(TOK_INPUTFORMAT $inFmt $serdeCls)
+    ;
+
 tabTypeExpr
 @init { pushMsg("specifying table types", state); }
 @after { popMsg(state); }

http://git-wip-us.apache.org/repos/asf/hive/blob/32e29cc6/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java
index 2b88ea6..866f43d 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/LoadSemanticAnalyzer.java
@@ -79,6 +79,8 @@ public class LoadSemanticAnalyzer extends SemanticAnalyzer {
   // AST specific data
   private Tree fromTree, tableTree;
   private boolean isLocal = false, isOverWrite = false;
+  private String inputFormatClassName = null;
+  private String serDeClassName = null;
 
   public LoadSemanticAnalyzer(QueryState queryState) throws SemanticException {
     super(queryState);
@@ -257,12 +259,30 @@ public class LoadSemanticAnalyzer extends SemanticAnalyzer {
     fromTree = ast.getChild(0);
     tableTree = ast.getChild(1);
 
-    if (ast.getChildCount() == 4) {
+    boolean inputInfo = false;
+    // Check the last node
+    ASTNode child = (ASTNode)ast.getChild(ast.getChildCount() - 1);
+    if (child.getToken().getType() == HiveParser.TOK_INPUTFORMAT) {
+      if (child.getChildCount() != 2) {
+        throw new SemanticException("FileFormat should contain both input format and Serde");
+      }
+      try {
+        inputFormatClassName = stripQuotes(child.getChild(0).getText());
+        serDeClassName = stripQuotes(child.getChild(1).getText());
+        inputInfo = true;
+      } catch (Exception e) {
+        throw new SemanticException("FileFormat inputFormatClassName or serDeClassName is incorrect");
+      }
+    }
+
+    if ((!inputInfo && ast.getChildCount() == 4) ||
+        (inputInfo && ast.getChildCount() == 5)) {
       isLocal = true;
       isOverWrite = true;
     }
 
-    if (ast.getChildCount() == 3) {
+    if ((!inputInfo && ast.getChildCount() == 3) ||
+        (inputInfo && ast.getChildCount() == 4)) {
       if (ast.getChild(2).getText().toLowerCase().equals("local")) {
         isLocal = true;
       } else {
@@ -450,7 +470,14 @@ public class LoadSemanticAnalyzer extends SemanticAnalyzer {
 
     // Set data location and input format, it must be text
     tempTableObj.setDataLocation(new Path(fromURI));
-    tempTableObj.setInputFormatClass(TextInputFormat.class);
+    if (inputFormatClassName != null && serDeClassName != null) {
+      try {
+        tempTableObj.setInputFormatClass(inputFormatClassName);
+        tempTableObj.setSerializationLib(serDeClassName);
+      } catch (HiveException e) {
+        throw new SemanticException("Load Data: Failed to set inputFormat or SerDe");
+      }
+    }
 
     // Step 2 : create the Insert query
     StringBuilder rewrittenQueryStr = new StringBuilder();

http://git-wip-us.apache.org/repos/asf/hive/blob/32e29cc6/ql/src/test/queries/clientpositive/load_data_using_job.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/load_data_using_job.q b/ql/src/test/queries/clientpositive/load_data_using_job.q
index 3928f1f..3659b6e 100644
--- a/ql/src/test/queries/clientpositive/load_data_using_job.q
+++ b/ql/src/test/queries/clientpositive/load_data_using_job.q
@@ -84,7 +84,11 @@ drop table srcbucket_mapjoin;
 
 -- Load into ORC table using text files
 CREATE TABLE srcbucket_mapjoin(key int, value string) partitioned by (ds string) STORED AS ORC;
-explain load data local inpath '../../data/files/load_data_job/load_data_1_partition.txt' INTO TABLE srcbucket_mapjoin;
-load data local inpath '../../data/files/load_data_job/load_data_1_partition.txt' INTO TABLE srcbucket_mapjoin;
+explain load data local inpath '../../data/files/load_data_job/load_data_1_partition.txt' INTO TABLE srcbucket_mapjoin
+INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
+SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe';
+load data local inpath '../../data/files/load_data_job/load_data_1_partition.txt' INTO TABLE srcbucket_mapjoin
+INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
+SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe';
 select * from srcbucket_mapjoin;
 drop table srcbucket_mapjoin;
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/hive/blob/32e29cc6/ql/src/test/results/clientpositive/llap/load_data_using_job.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/load_data_using_job.q.out b/ql/src/test/results/clientpositive/llap/load_data_using_job.q.out
index 116630c..c3b70a3 100644
--- a/ql/src/test/results/clientpositive/llap/load_data_using_job.q.out
+++ b/ql/src/test/results/clientpositive/llap/load_data_using_job.q.out
@@ -2776,8 +2776,12 @@ POSTHOOK: type: CREATETABLE
 POSTHOOK: Output: database:default
 POSTHOOK: Output: default@srcbucket_mapjoin
 PREHOOK: query: explain load data local inpath '../../data/files/load_data_job/load_data_1_partition.txt' INTO TABLE srcbucket_mapjoin
+INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
+SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
 PREHOOK: type: QUERY
 POSTHOOK: query: explain load data local inpath '../../data/files/load_data_job/load_data_1_partition.txt' INTO TABLE srcbucket_mapjoin
+INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
+SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
 POSTHOOK: type: QUERY
 STAGE DEPENDENCIES:
   Stage-1 is a root stage
@@ -2830,10 +2834,14 @@ STAGE PLANS:
       Basic Stats Work:
 
 PREHOOK: query: load data local inpath '../../data/files/load_data_job/load_data_1_partition.txt' INTO TABLE srcbucket_mapjoin
+INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
+SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
 PREHOOK: type: QUERY
 PREHOOK: Input: default@srcbucket_mapjoin__TEMP_TABLE_FOR_LOAD_DATA__
 PREHOOK: Output: default@srcbucket_mapjoin
 POSTHOOK: query: load data local inpath '../../data/files/load_data_job/load_data_1_partition.txt' INTO TABLE srcbucket_mapjoin
+INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
+SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@srcbucket_mapjoin__TEMP_TABLE_FOR_LOAD_DATA__
 POSTHOOK: Output: default@srcbucket_mapjoin@ds=2008-04-08