You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by kr...@apache.org on 2022/07/19 04:48:07 UTC
[hive] branch master updated: HIVE-26395: Add support for CREATE TABLE LIKE FILE PARQUET (John Sherman, reviewed by Krisztian Kasa, Aman Sinha)
This is an automated email from the ASF dual-hosted git repository.
krisztiankasa pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 855c0642342 HIVE-26395: Add support for CREATE TABLE LIKE FILE PARQUET (John Sherman, reviewed by Krisztian Kasa, Aman Sinha)
855c0642342 is described below
commit 855c0642342f83a80f30ecf41f1f7f08048d6f80
Author: John Sherman <jf...@cloudera.com>
AuthorDate: Mon Jul 18 21:47:57 2022 -0700
HIVE-26395: Add support for CREATE TABLE LIKE FILE PARQUET (John Sherman, reviewed by Krisztian Kasa, Aman Sinha)
- Add support for CREATE TABLE LIKE FILE PARQUET
- Attempts to derive the schema for a new table from an existing parquet file
- Example:
CREATE TABLE ctlf_table LIKE FILE PARQUET 's3a://testbucket/files/schema.parq';
- Add hive.parquet.infer.binary.as configuration option
- Determines what the unannotated Parquet binary type gets interpreted as
- either binary or string
- default is binary
- This configuration option is helpful since some systems expect binary to be
interpreted as string.
- This patch also modifies the HCatalog code path and removes a section of code that
seemed incorrect.
- It seemed to attempt to force a STORED AS clause but never worked correctly
- The check would never fail due to the fact that every CREATE TABLE AST included
TOK_LIKETABLE.
- The code also may have been a remnant of a time when "STORED AS" was required
for CREATE TABLE statements (before there was a default value)
---
.../java/org/apache/hadoop/hive/conf/HiveConf.java | 3 +
.../java/org/apache/hadoop/hive/ql/ErrorMsg.java | 8 +-
.../cli/SemanticAnalysis/CreateTableHook.java | 16 +-
.../apache/hadoop/hive/ql/parse/CreateDDLParser.g | 21 +-
.../org/apache/hadoop/hive/ql/parse/HiveParser.g | 1 +
.../hive/ql/ddl/table/create/CreateTableDesc.java | 30 +-
.../ql/ddl/table/create/CreateTableOperation.java | 17 +
.../hadoop/hive/ql/io/SchemaInferenceUtils.java | 74 +++
.../hive/ql/io/parquet/serde/ParquetHiveSerDe.java | 202 ++++++-
.../hadoop/hive/ql/parse/SemanticAnalyzer.java | 34 +-
.../clientnegative/create_table_like_invalid.q | 1 +
.../clientpositive/create_table_like_file.q | 90 +++
.../clientnegative/create_table_like_invalid.q.out | 1 +
.../llap/create_table_like_file.q.out | 611 +++++++++++++++++++++
.../apache/hadoop/hive/serde2/SchemaInference.java | 35 ++
15 files changed, 1108 insertions(+), 36 deletions(-)
diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index 50cfb85ba99..044040f8f11 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -2237,6 +2237,9 @@ public class HiveConf extends Configuration {
"Whether to use former Java date/time APIs to convert between timezones when writing timestamps in " +
"Parquet files. Once data are written to the file the effect is permanent (also reflected in the metadata)." +
"Changing the value of this property affects only new data written to the file."),
+ HIVE_PARQUET_INFER_BINARY_AS("hive.parquet.infer.binary.as", "binary", new StringSet("binary", "string"),
+ "This setting controls what the parquet binary type gets inferred as by CREATE TABLE LIKE FILE. This is helpful " +
+ "since some systems specify the parquet schema for strings as binary."),
HIVE_AVRO_TIMESTAMP_SKIP_CONVERSION("hive.avro.timestamp.skip.conversion", false,
"Some older Hive implementations (pre-3.1) wrote Avro timestamps in a UTC-normalized" +
"manner, while from version 3.1 until now Hive wrote time zone agnostic timestamps. " +
diff --git a/common/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java b/common/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
index d22cc7288de..8f7887d73a9 100644
--- a/common/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
+++ b/common/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
@@ -482,7 +482,7 @@ public enum ErrorMsg {
COMPACTION_REFUSED(10432, "Compaction request for {0}.{1}{2} is refused, details: {3}.", true),
CBO_IS_REQUIRED(10433,
"The following functionality requires CBO (" + HiveConf.ConfVars.HIVE_CBO_ENABLED.varname + "): {0}", true),
-
+ CTLF_UNSUPPORTED_FORMAT(10434, "CREATE TABLE LIKE FILE is not supported by the ''{0}'' file format", true),
//========================== 20000 range starts here ========================//
@@ -517,6 +517,12 @@ public enum ErrorMsg {
REPL_EXTERNAL_SERVICE_CONNECTION_ERROR(20017, "Failed to connect to {0} service. Error code {1}.",true),
CLIENT_POLLING_OPSTATUS_INTERRUPTED(20018, "Interrupted while polling on the operation status", "70100"),
+ CTLF_FAILED_INFERENCE(20019, "Failed to infer schema:"),
+ CTLF_CLASS_NOT_FOUND(20020, "Failed to find SerDe class ({0}) for ''{1}''", true),
+ CTLF_MISSING_STORAGE_FORMAT_DESCRIPTOR(20021, "Failed to find StorageFormatDescriptor for file format ''{0}''", true),
+ PARQUET_FOOTER_ERROR(20022, "Failed to read parquet footer:"),
+ PARQUET_UNHANDLED_TYPE(20023, "Unhandled type {0}", true),
+
// An exception from runtime that will show the full stack to client
UNRESOLVED_RT_EXCEPTION(29999, "Runtime Error: {0}", "58004", true),
diff --git a/hcatalog/core/src/main/java/org/apache/hive/hcatalog/cli/SemanticAnalysis/CreateTableHook.java b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/cli/SemanticAnalysis/CreateTableHook.java
index 9b66e6be74a..041a4d48458 100644
--- a/hcatalog/core/src/main/java/org/apache/hive/hcatalog/cli/SemanticAnalysis/CreateTableHook.java
+++ b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/cli/SemanticAnalysis/CreateTableHook.java
@@ -64,16 +64,13 @@ final class CreateTableHook extends HCatSemanticAnalyzerBase {
// Analyze and create tbl properties object
int numCh = ast.getChildCount();
- tableName = BaseSemanticAnalyzer.getUnescapedName((ASTNode) ast
- .getChild(0));
- boolean likeTable = false;
+ tableName = BaseSemanticAnalyzer.getUnescapedName((ASTNode) ast.getChild(0));
StorageFormat format = new StorageFormat(context.getConf());
for (int num = 1; num < numCh; num++) {
ASTNode child = (ASTNode) ast.getChild(num);
if (format.fillStorageFormat(child)) {
- if (org.apache.commons.lang3.StringUtils
- .isNotEmpty(format.getStorageHandler())) {
+ if (StringUtils.isNotEmpty(format.getStorageHandler())) {
return ast;
}
continue;
@@ -88,10 +85,6 @@ final class CreateTableHook extends HCatSemanticAnalyzerBase {
case HiveParser.TOK_ALTERTABLE_BUCKETS:
break;
- case HiveParser.TOK_LIKETABLE:
- likeTable = true;
- break;
-
case HiveParser.TOK_IFNOTEXISTS:
try {
List<String> tables = db.getTablesByPattern(tableName);
@@ -121,11 +114,6 @@ final class CreateTableHook extends HCatSemanticAnalyzerBase {
}
}
- if (!likeTable && (format.getInputFormat() == null || format.getOutputFormat() == null)) {
- throw new SemanticException(
- "STORED AS specification is either incomplete or incorrect.");
- }
-
return ast;
}
diff --git a/parser/src/java/org/apache/hadoop/hive/ql/parse/CreateDDLParser.g b/parser/src/java/org/apache/hadoop/hive/ql/parse/CreateDDLParser.g
index 69da7c78ae7..97f04f8dc1f 100644
--- a/parser/src/java/org/apache/hadoop/hive/ql/parse/CreateDDLParser.g
+++ b/parser/src/java/org/apache/hadoop/hive/ql/parse/CreateDDLParser.g
@@ -1,9 +1,9 @@
/**
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
+ (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
@@ -43,12 +43,19 @@ catch (RecognitionException e) {
}
}
+likeTableOrFile
+ : (KW_LIKE KW_FILE) => KW_LIKE KW_FILE
+ | (KW_LIKE KW_FILE format=identifier uri=StringLiteral) -> ^(TOK_LIKEFILE $format $uri)
+ | (KW_LIKE likeName=tableName) -> ^(TOK_LIKETABLE $likeName)
+ ;
+
//----------------------- Rules for parsing createtable -----------------------------
createTableStatement
@init { gParent.pushMsg("create table statement", state); }
@after { gParent.popMsg(state); }
: KW_CREATE (temp=KW_TEMPORARY)? (trans=KW_TRANSACTIONAL)? (ext=KW_EXTERNAL)? KW_TABLE ifNotExists? name=tableName
- ( like=KW_LIKE likeName=tableName
+ ( likeTableOrFile
+ createTablePartitionSpec?
tableRowFormat?
tableFileFormat?
tableLocation?
@@ -65,7 +72,7 @@ createTableStatement
(KW_AS selectStatementWithCTE)?
)
-> ^(TOK_CREATETABLE $name $temp? $trans? $ext? ifNotExists?
- ^(TOK_LIKETABLE $likeName?)
+ likeTableOrFile?
columnNameTypeOrConstraintList?
tableComment?
createTablePartitionSpec?
@@ -78,7 +85,7 @@ createTableStatement
selectStatementWithCTE?
)
| KW_CREATE mgd=KW_MANAGED KW_TABLE ifNotExists? name=tableName
- ( like=KW_LIKE likeName=tableName
+ ( likeTableOrFile
tableRowFormat?
tableFileFormat?
tableLocation?
@@ -95,7 +102,7 @@ createTableStatement
(KW_AS selectStatementWithCTE)?
)
-> ^(TOK_CREATETABLE $name $mgd ifNotExists?
- ^(TOK_LIKETABLE $likeName?)
+ likeTableOrFile?
columnNameTypeOrConstraintList?
tableComment?
createTablePartitionSpec?
diff --git a/parser/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g b/parser/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g
index 25bd5a259f4..3efadee97df 100644
--- a/parser/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g
+++ b/parser/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g
@@ -161,6 +161,7 @@ TOK_CREATEDATABASE;
TOK_CREATEDATACONNECTOR;
TOK_CREATETABLE;
TOK_TRUNCATETABLE;
+TOK_LIKEFILE;
TOK_LIKETABLE;
TOK_DATACONNECTOR;
TOK_DATACONNECTORCOMMENT;
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/ddl/table/create/CreateTableDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/ddl/table/create/CreateTableDesc.java
index 297b0857a0b..b484428cc07 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/ddl/table/create/CreateTableDesc.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/ddl/table/create/CreateTableDesc.java
@@ -127,6 +127,8 @@ public class CreateTableDesc implements DDLDesc, Serializable {
private transient FileSinkDesc writer;
private Long replWriteId; // to be used by repl task to get the txn and valid write id list
private String ownerName = null;
+ private String likeFile = null;
+ private String likeFileFormat = null;
public CreateTableDesc() {
}
@@ -230,6 +232,22 @@ public class CreateTableDesc implements DDLDesc, Serializable {
return copy == null ? null : new ArrayList<T>(copy);
}
+ public void setLikeFile(String likeFile) {
+ this.likeFile = likeFile;
+ }
+
+ public void setLikeFileFormat(String likeFileFormat) {
+ this.likeFileFormat = likeFileFormat;
+ }
+
+ public String getLikeFile() {
+ return likeFile;
+ }
+
+ public String getLikeFileFormat() {
+ return likeFileFormat;
+ }
+
@Explain(displayName = "columns")
public List<String> getColsString() {
return Utilities.getFieldSchemaString(getCols());
@@ -268,7 +286,7 @@ public class CreateTableDesc implements DDLDesc, Serializable {
return cols;
}
- public void setCols(ArrayList<FieldSchema> cols) {
+ public void setCols(List<FieldSchema> cols) {
this.cols = cols;
}
@@ -544,13 +562,13 @@ public class CreateTableDesc implements DDLDesc, Serializable {
this.skewedColValues = skewedColValues;
}
- public void validate(HiveConf conf)
- throws SemanticException {
+ public void validate(HiveConf conf) throws SemanticException {
if ((this.getCols() == null) || (this.getCols().size() == 0)) {
- // for now make sure that serde exists
- if (Table.hasMetastoreBasedSchema(conf, serName) &&
- StringUtils.isEmpty(getStorageHandler())) {
+ // if the table has no columns and is a HMS backed SerDe - it should have a storage handler OR
+ // is a CREATE TABLE LIKE FILE statement.
+ if (Table.hasMetastoreBasedSchema(conf, serName) && StringUtils.isEmpty(getStorageHandler())
+ && this.getLikeFile() == null) {
throw new SemanticException(ErrorMsg.INVALID_TBL_DDL_SERDE.getMsg());
}
return;
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/ddl/table/create/CreateTableOperation.java b/ql/src/java/org/apache/hadoop/hive/ql/ddl/table/create/CreateTableOperation.java
index 6dacbb0b7ec..c3d0a320942 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/ddl/table/create/CreateTableOperation.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/ddl/table/create/CreateTableOperation.java
@@ -27,6 +27,7 @@ import org.apache.hadoop.hive.conf.Constants;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.TableType;
import org.apache.hadoop.hive.metastore.api.EnvironmentContext;
+import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.ql.ddl.DDLOperation;
import org.apache.hadoop.hive.ql.ddl.DDLOperationContext;
@@ -37,6 +38,7 @@ import org.apache.hadoop.hive.ql.hooks.WriteEntity;
import org.apache.hadoop.hive.ql.hooks.LineageInfo.DataContainer;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.io.HdfsUtils;
+import org.apache.hadoop.hive.ql.io.SchemaInferenceUtils;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.parse.ReplicationSpec;
@@ -53,8 +55,23 @@ public class CreateTableOperation extends DDLOperation<CreateTableDesc> {
super(context, desc);
}
+ // Sets the tables columns using the FieldSchema inferred from the SerDe's SchemaInference
+ // implementation. This is used by CREATE TABLE LIKE FILE.
+ private void readSchemaFromFile() throws HiveException {
+ String fileFormat = desc.getLikeFileFormat();
+ String filePath = desc.getLikeFile();
+ List<FieldSchema> fieldSchema = SchemaInferenceUtils.readSchemaFromFile(context.getConf(), fileFormat, filePath);
+ LOG.debug("Inferred field schema for {} file {} was {}", fileFormat, filePath, fieldSchema);
+ desc.setCols(fieldSchema);
+ }
+
@Override
public int execute() throws HiveException {
+ // check if schema is being inferred via LIKE FILE
+ if (desc.getLikeFile() != null) {
+ readSchemaFromFile();
+ }
+
// create the table
Table tbl = desc.toTable(context.getConf());
LOG.debug("creating table {} on {}", tbl.getFullyQualifiedName(), tbl.getDataLocation());
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/SchemaInferenceUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/io/SchemaInferenceUtils.java
new file mode 100644
index 00000000000..5f555b7f63c
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/SchemaInferenceUtils.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.io;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hive.metastore.api.FieldSchema;
+import org.apache.hadoop.hive.ql.ErrorMsg;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.serde2.AbstractSerDe;
+import org.apache.hadoop.hive.serde2.SchemaInference;
+import org.apache.hadoop.hive.serde2.SerDeException;
+import org.apache.hadoop.util.ReflectionUtils;
+
+import java.util.List;
+
+public class SchemaInferenceUtils {
+ private static Class<AbstractSerDe> getSerde(Configuration conf, String fileFormat) throws HiveException {
+ StorageFormatFactory storageFormatFactory = new StorageFormatFactory();
+ StorageFormatDescriptor descriptor = storageFormatFactory.get(fileFormat);
+ if (descriptor == null) {
+ throw new HiveException(ErrorMsg.CTLF_MISSING_STORAGE_FORMAT_DESCRIPTOR.getErrorCodedMsg(fileFormat));
+ }
+ String serde = descriptor.getSerde();
+ try {
+ return (Class<AbstractSerDe>) conf.getClassByName(serde);
+ } catch (ClassNotFoundException e) {
+ throw new HiveException(ErrorMsg.CTLF_CLASS_NOT_FOUND.getErrorCodedMsg(serde, fileFormat), e);
+ }
+ }
+
+ /**
+ * Determines if a supplied fileFormat supports Schema Inference for CREATE TABLE LIKE FILE.
+ *
+ * @param conf Configuration object used to get class.
+ * @param fileFormat File format to check for Schema Inference support.
+ * @throws HiveException if unable to get SerDe class for fileFormat
+ */
+ public static boolean doesSupportSchemaInference(Configuration conf, String fileFormat) throws HiveException {
+ return SchemaInference.class.isAssignableFrom(getSerde(conf, fileFormat));
+ }
+
+ /**
+ * Returns a List containing FieldSchema as determined by the readSchema method of the provided file format.
+ *
+ * @param conf Hadoop Configuration object used to look up class and provided to the readSchema method.
+ * @param fileFormat File format in which to use SerDe from.
+ * @param filePath Path to the file to read.
+ * @throws HiveException if unable to read the schema
+ */
+ public static List<FieldSchema> readSchemaFromFile(Configuration conf, String fileFormat, String filePath)
+ throws HiveException {
+ Class<AbstractSerDe> asClass = getSerde(conf, fileFormat);
+ SchemaInference sd = (SchemaInference) ReflectionUtils.newInstance(asClass, conf);
+ try {
+ return sd.readSchema(conf, filePath);
+ } catch (SerDeException e) {
+ throw new HiveException(ErrorMsg.CTLF_FAILED_INFERENCE.getErrorCodedMsg(), e);
+ }
+ }
+}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java
index ce35b885cc5..881d56cd31f 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.java
@@ -22,11 +22,16 @@ import java.util.Properties;
import com.google.common.base.Preconditions;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.metastore.api.FieldSchema;
+import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.optimizer.FieldNode;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
+import org.apache.hadoop.hive.serde2.SchemaInference;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeSpec;
import org.apache.hadoop.hive.serde2.io.ParquetHiveRecord;
@@ -39,7 +44,23 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
+import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.ParquetOutputFormat;
+import org.apache.parquet.hadoop.metadata.FileMetaData;
+import org.apache.parquet.hadoop.util.HadoopInputFile;
+import org.apache.parquet.schema.GroupType;
+import org.apache.parquet.schema.LogicalTypeAnnotation;
+import org.apache.parquet.schema.LogicalTypeAnnotation.DateLogicalTypeAnnotation;
+import org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation;
+import org.apache.parquet.schema.LogicalTypeAnnotation.IntLogicalTypeAnnotation;
+import org.apache.parquet.schema.LogicalTypeAnnotation.ListLogicalTypeAnnotation;
+import org.apache.parquet.schema.LogicalTypeAnnotation.MapLogicalTypeAnnotation;
+import org.apache.parquet.schema.LogicalTypeAnnotation.StringLogicalTypeAnnotation;
+import org.apache.parquet.schema.MessageType;
+import org.apache.parquet.schema.PrimitiveType;
+import org.apache.parquet.schema.Type;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
* A ParquetHiveSerDe for Hive (with the deprecated package mapred). Parquet
@@ -48,7 +69,9 @@ import org.apache.parquet.hadoop.ParquetOutputFormat;
*/
@SerDeSpec(schemaProps = {serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES,
ParquetOutputFormat.COMPRESSION})
-public class ParquetHiveSerDe extends AbstractSerDe {
+public class ParquetHiveSerDe extends AbstractSerDe implements SchemaInference {
+ private static final Logger LOG = LoggerFactory.getLogger(ParquetHiveSerDe.class);
+
public static final Text MAP_KEY = new Text("key");
public static final Text MAP_VALUE = new Text("value");
public static final Text MAP = new Text("map");
@@ -57,6 +80,7 @@ public class ParquetHiveSerDe extends AbstractSerDe {
// Map precision to the number bytes needed for binary conversion.
public static final int PRECISION_TO_BYTE_COUNT[] = new int[38];
+
static {
for (int prec = 1; prec <= 38; prec++) {
// Estimated number of bytes needed.
@@ -82,8 +106,8 @@ public class ParquetHiveSerDe extends AbstractSerDe {
(StructTypeInfo) TypeInfoFactory.getStructTypeInfo(getColumnNames(), getColumnTypes());
StructTypeInfo prunedTypeInfo = null;
if (this.configuration.isPresent()) {
- String rawPrunedColumnPaths =
- this.configuration.get().get(ColumnProjectionUtils.READ_NESTED_COLUMN_PATH_CONF_STR);
+ Configuration conf = this.configuration.get();
+ String rawPrunedColumnPaths = conf.get(ColumnProjectionUtils.READ_NESTED_COLUMN_PATH_CONF_STR);
if (rawPrunedColumnPaths != null) {
List<String> prunedColumnPaths = processRawPrunedPaths(rawPrunedColumnPaths);
prunedTypeInfo = pruneFromPaths(completeTypeInfo, prunedColumnPaths);
@@ -234,4 +258,176 @@ public class ParquetHiveSerDe extends AbstractSerDe {
return (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(newNames, newTypes);
}
}
+
+ // ReadSchema interface implementation
+ private String convertGroupType(GroupType group, boolean inferBinaryAsString) throws SerDeException {
+ boolean first = true;
+ StringBuilder sb = new StringBuilder(serdeConstants.STRUCT_TYPE_NAME + "<");
+ for (Type field: group.getFields()) {
+ if (first) {
+ first = false;
+ } else {
+ sb.append(",");
+ }
+ // fieldName:typeName
+ sb.append(field.getName()).append(":").append(convertParquetTypeToFieldType(field, inferBinaryAsString));
+ }
+ sb.append(">");
+ // e.g. struct<fieldName1:int,fieldName2:map<string,int>,...>
+ return sb.toString();
+ }
+
+ private String convertPrimitiveType(PrimitiveType primitive, boolean inferBinaryAsString) throws SerDeException {
+ switch (primitive.getPrimitiveTypeName()) {
+ case INT96:
+ return serdeConstants.TIMESTAMP_TYPE_NAME;
+ case INT32:
+ return serdeConstants.INT_TYPE_NAME;
+ case INT64:
+ return serdeConstants.BIGINT_TYPE_NAME;
+ case BOOLEAN:
+ return serdeConstants.BOOLEAN_TYPE_NAME;
+ case FLOAT:
+ return serdeConstants.FLOAT_TYPE_NAME;
+ case DOUBLE:
+ return serdeConstants.DOUBLE_TYPE_NAME;
+ case BINARY:
+ if (inferBinaryAsString) {
+ return serdeConstants.STRING_TYPE_NAME;
+ } else {
+ return serdeConstants.BINARY_TYPE_NAME;
+ }
+ default:
+ throw new SerDeException(ErrorMsg.PARQUET_UNHANDLED_TYPE.getErrorCodedMsg(primitive.getPrimitiveTypeName().name()));
+ }
+ }
+
+ private String convertParquetIntLogicalType(Type parquetType) throws SerDeException {
+ IntLogicalTypeAnnotation intLogicalType = (IntLogicalTypeAnnotation) parquetType.getLogicalTypeAnnotation();
+ PrimitiveType primitiveType = parquetType.asPrimitiveType();
+ // check to see if primitive type handling is implemented
+ switch (primitiveType.getPrimitiveTypeName()) {
+ case INT32:
+ case INT64:
+ break;
+ default:
+ throw new SerDeException(ErrorMsg.PARQUET_UNHANDLED_TYPE.getErrorCodedMsg(intLogicalType.toString()));
+ }
+
+ if (!intLogicalType.isSigned()) {
+ // unsigned types are not supported
+ throw new SerDeException(ErrorMsg.PARQUET_UNHANDLED_TYPE.getErrorCodedMsg(intLogicalType.toString()));
+ }
+
+ switch (intLogicalType.getBitWidth()) {
+ case 8: return serdeConstants.TINYINT_TYPE_NAME;
+ case 16: return serdeConstants.SMALLINT_TYPE_NAME;
+ case 32: return serdeConstants.INT_TYPE_NAME;
+ case 64: return serdeConstants.BIGINT_TYPE_NAME;
+ }
+
+ throw new SerDeException(ErrorMsg.PARQUET_UNHANDLED_TYPE.getErrorCodedMsg(intLogicalType.toString()));
+ }
+
+ private String createMapType(String keyType, String valueType) {
+ // examples: map<string,int>, map<string,struct<i:int>>
+ return serdeConstants.MAP_TYPE_NAME + "<" + keyType + "," + valueType + ">";
+ }
+
+ private String convertParquetMapLogicalTypeAnnotation(Type parquetType, boolean inferBinaryAsString)
+ throws SerDeException {
+ MapLogicalTypeAnnotation mType = (MapLogicalTypeAnnotation) parquetType.getLogicalTypeAnnotation();
+ GroupType gType = parquetType.asGroupType();
+ Type innerField = gType.getType(0);
+ GroupType innerGroup = innerField.asGroupType();
+ Type key = innerGroup.getType(0);
+ Type value = innerGroup.getType(1);
+ return createMapType(convertParquetTypeToFieldType(key, inferBinaryAsString),
+ convertParquetTypeToFieldType(value, inferBinaryAsString));
+ }
+
+ private String createArrayType(String fieldType) {
+ // examples: array<int>, array<struct<i:int>>, array<map<string,int>>
+ return serdeConstants.LIST_TYPE_NAME + "<" + fieldType + ">";
+ }
+
+ private String convertParquetListLogicalTypeAnnotation(Type parquetType, boolean inferBinaryAsString)
+ throws SerDeException {
+ ListLogicalTypeAnnotation mType = (ListLogicalTypeAnnotation) parquetType.getLogicalTypeAnnotation();
+ GroupType gType = parquetType.asGroupType();
+ Type innerField = gType.getType(0);
+ if (innerField.isPrimitive() || innerField.getOriginalType() != null) {
+ return createArrayType(convertParquetTypeToFieldType(innerField, inferBinaryAsString));
+ }
+
+ GroupType innerGroup = innerField.asGroupType();
+ if (innerGroup.getFieldCount() != 1) {
+ return createArrayType(convertGroupType(innerGroup, inferBinaryAsString));
+ }
+
+ return createArrayType(convertParquetTypeToFieldType(innerGroup.getType(0), inferBinaryAsString));
+ }
+
+ private String createDecimalType(int precision, int scale) {
+ // example: decimal(10, 4)
+ return serdeConstants.DECIMAL_TYPE_NAME + "(" + precision + "," + scale + ")";
+ }
+
+ private String convertLogicalType(Type type, boolean inferBinaryAsString) throws SerDeException {
+ LogicalTypeAnnotation lType = type.getLogicalTypeAnnotation();
+ if (lType instanceof IntLogicalTypeAnnotation) {
+ return convertParquetIntLogicalType(type);
+ } else if (lType instanceof StringLogicalTypeAnnotation) {
+ return serdeConstants.STRING_TYPE_NAME;
+ } else if (lType instanceof DecimalLogicalTypeAnnotation) {
+ DecimalLogicalTypeAnnotation dType = (DecimalLogicalTypeAnnotation) lType;
+ return createDecimalType(dType.getPrecision(), dType.getScale());
+ } else if (lType instanceof MapLogicalTypeAnnotation) {
+ return convertParquetMapLogicalTypeAnnotation(type, inferBinaryAsString);
+ } else if (lType instanceof ListLogicalTypeAnnotation) {
+ return convertParquetListLogicalTypeAnnotation(type, inferBinaryAsString);
+ } else if (lType instanceof DateLogicalTypeAnnotation) {
+ // assuming 32 bit int
+ return serdeConstants.DATE_TYPE_NAME;
+ }
+ throw new SerDeException(ErrorMsg.PARQUET_UNHANDLED_TYPE.getErrorCodedMsg(lType.toString()));
+ }
+
+ private String convertParquetTypeToFieldType(Type type, boolean inferBinaryAsString) throws SerDeException {
+ if (type.getLogicalTypeAnnotation() != null) {
+ return convertLogicalType(type, inferBinaryAsString);
+ } else if (type.isPrimitive()) {
+ return convertPrimitiveType(type.asPrimitiveType(), inferBinaryAsString);
+ }
+ return convertGroupType(type.asGroupType(), inferBinaryAsString);
+ }
+
+ private FieldSchema convertParquetTypeToFieldSchema(Type type, boolean inferBinaryAsString) throws SerDeException {
+ String columnName = type.getName();
+ String typeName = convertParquetTypeToFieldType(type, inferBinaryAsString);
+ return new FieldSchema(columnName, typeName, "Inferred from Parquet file.");
+ }
+
+ public List<FieldSchema> readSchema(Configuration conf, String file) throws SerDeException {
+ FileMetaData metadata;
+ try {
+ HadoopInputFile inputFile = HadoopInputFile.fromPath(new Path(file), conf);
+ ParquetFileReader reader = ParquetFileReader.open(inputFile);
+ metadata = reader.getFileMetaData();
+ } catch (Exception e) {
+ throw new SerDeException(ErrorMsg.PARQUET_FOOTER_ERROR.getErrorCodedMsg(), e);
+ }
+
+ MessageType msg = metadata.getSchema();
+ List<FieldSchema> schema = new ArrayList<>();
+ String inferBinaryAsStringValue = conf.get(HiveConf.ConfVars.HIVE_PARQUET_INFER_BINARY_AS.varname);
+ boolean inferBinaryAsString = "string".equalsIgnoreCase(inferBinaryAsStringValue);
+
+ for (Type field: msg.getFields()) {
+ FieldSchema fieldSchema = convertParquetTypeToFieldSchema(field, inferBinaryAsString);
+ schema.add(fieldSchema);
+ LOG.debug("Inferred field schema {}", fieldSchema);
+ }
+ return schema;
+ }
}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
index 48e0cf9b2ed..485089e4ad3 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
@@ -164,6 +164,7 @@ import org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat;
import org.apache.hadoop.hive.ql.io.HiveOutputFormat;
import org.apache.hadoop.hive.ql.io.NullRowsInputFormat;
+import org.apache.hadoop.hive.ql.io.SchemaInferenceUtils;
import org.apache.hadoop.hive.ql.io.arrow.ArrowColumnarBatchSerDe;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
@@ -13477,6 +13478,7 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer {
}
return false;
}
+
/**
* Analyze the create table command. If it is a regular create-table or
* create-table-like statements, we create a DDLWork and return true. If it is
@@ -13516,7 +13518,8 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer {
final int CREATE_TABLE = 0; // regular CREATE TABLE
final int CTLT = 1; // CREATE TABLE LIKE ... (CTLT)
final int CTAS = 2; // CREATE TABLE AS SELECT ... (CTAS)
- final int ctt = 3; // CREATE TRANSACTIONAL TABLE
+ final int CTT = 3; // CREATE TRANSACTIONAL TABLE
+ final int CTLF = 4; // CREATE TABLE LIKE FILE
int command_type = CREATE_TABLE;
List<String> skewedColNames = new ArrayList<String>();
List<List<String>> skewedValues = new ArrayList<List<String>>();
@@ -13524,6 +13527,8 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer {
boolean storedAsDirs = false;
boolean isUserStorageFormat = false;
boolean partitionTransformSpecExists = false;
+ String likeFile = null;
+ String likeFileFormat = null;
RowFormatParams rowFormatParams = new RowFormatParams();
StorageFormat storageFormat = new StorageFormat(conf);
@@ -13569,7 +13574,16 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer {
break;
case HiveParser.KW_TRANSACTIONAL:
isTransactional = true;
- command_type = ctt;
+ command_type = CTT;
+ break;
+ case HiveParser.TOK_LIKEFILE:
+ if (cols.size() != 0) {
+ throw new SemanticException(ErrorMsg.CTLT_COLLST_COEXISTENCE
+ .getMsg());
+ }
+ likeFileFormat = getUnescapedName((ASTNode) child.getChild(0));
+ likeFile = getUnescapedName((ASTNode) child.getChild(1));
+ command_type = CTLF;
break;
case HiveParser.TOK_LIKETABLE:
if (child.getChildCount() > 0) {
@@ -13719,7 +13733,7 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer {
}
}
- if (command_type == CREATE_TABLE || command_type == CTLT || command_type == ctt) {
+ if (command_type == CREATE_TABLE || command_type == CTLT || command_type == CTT || command_type == CTLF) {
queryState.setCommandType(HiveOperation.CREATETABLE);
} else if (command_type == CTAS) {
queryState.setCommandType(HiveOperation.CREATETABLE_AS_SELECT);
@@ -13785,7 +13799,15 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer {
}
}
switch (command_type) {
-
+ case CTLF:
+ try {
+ if (!SchemaInferenceUtils.doesSupportSchemaInference(conf, likeFileFormat)) {
+ throw new SemanticException(ErrorMsg.CTLF_UNSUPPORTED_FORMAT.getErrorCodedMsg(likeFileFormat));
+ }
+ } catch (HiveException e) {
+ throw new SemanticException(e.getMessage(), e);
+ }
+ // fall through
case CREATE_TABLE: // REGULAR CREATE TABLE DDL
if (!CollectionUtils.isEmpty(partColNames)) {
throw new SemanticException(
@@ -13810,6 +13832,8 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer {
checkConstraints);
crtTblDesc.setStoredAsSubDirectories(storedAsDirs);
crtTblDesc.setNullFormat(rowFormatParams.nullFormat);
+ crtTblDesc.setLikeFile(likeFile);
+ crtTblDesc.setLikeFileFormat(likeFileFormat);
crtTblDesc.validate(conf);
// outputs is empty, which means this create table happens in the current
@@ -13832,7 +13856,7 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer {
"Query state attached to Session state must be not null. Table location cannot be saved.");
}
break;
- case ctt: // CREATE TRANSACTIONAL TABLE
+ case CTT: // CREATE TRANSACTIONAL TABLE
if (isExt && !isDefaultTableTypeChanged) {
throw new SemanticException(
qualifiedTabName.getTable() + " cannot be declared transactional because it's an external table");
diff --git a/ql/src/test/queries/clientnegative/create_table_like_invalid.q b/ql/src/test/queries/clientnegative/create_table_like_invalid.q
new file mode 100644
index 00000000000..ac91a8cbaaf
--- /dev/null
+++ b/ql/src/test/queries/clientnegative/create_table_like_invalid.q
@@ -0,0 +1 @@
+CREATE TABLE test LIKE FILE AVRO 'hdfs://madeuppath';
diff --git a/ql/src/test/queries/clientpositive/create_table_like_file.q b/ql/src/test/queries/clientpositive/create_table_like_file.q
new file mode 100644
index 00000000000..980c7bb2a2a
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/create_table_like_file.q
@@ -0,0 +1,90 @@
+-- all primitive types
+-- timestamp_w_tz TIMESTAMP WITH LOCAL TIME ZONE is not supported by hive's parquet implementation
+CREATE EXTERNAL TABLE test_all_types(tinyint_type TINYINT, smallint_type SMALLINT, bigint_type BIGINT, int_type INT, float_type FLOAT, double_type double, decimal_type DECIMAL(4,2), timestamp_type TIMESTAMP, date_type DATE, string_type STRING, varchar_type VARCHAR(100), char_type CHAR(34), boolean_type BOOLEAN, binary_type BINARY) STORED AS PARQUET LOCATION '${system:test.tmp.dir}/test_all_types';
+-- insert two rows (the other tables only have 1 row)
+INSERT INTO test_all_types VALUES (1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe'),
+ (1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe');
+SELECT * FROM test_all_types;
+DESCRIBE test_all_types;
+-- CREATE A LIKE table
+CREATE TABLE like_test_all_types LIKE FILE PARQUET '${system:test.tmp.dir}/test_all_types/000000_0';
+INSERT INTO like_test_all_types VALUES (1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe'),
+ (1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe');
+SELECT * FROM like_test_all_types;
+DESCRIBE like_test_all_types;
+DROP TABLE test_all_types;
+DROP TABLE like_test_all_types;
+
+-- test hive.parquet.infer.binary.as string
+SET hive.parquet.infer.binary.as = String;
+CREATE TABLE like_test_all_types LIKE FILE PARQUET '${system:test.tmp.dir}/test_all_types/000000_0';
+INSERT INTO like_test_all_types VALUES (1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe'),
+ (1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe');
+SELECT * FROM like_test_all_types;
+DESCRIBE like_test_all_types;
+DROP TABLE test_all_types;
+DROP TABLE like_test_all_types;
+SET hive.parquet.infer.binary.as = binary;
+
+-- complex types (struct, array, map, union)
+-- union type is not supported by PARQUET in hive
+-- array
+CREATE EXTERNAL TABLE test_array(str_array array<string>) STORED AS PARQUET LOCATION '${system:test.tmp.dir}/test_array';
+DESCRIBE test_array;
+INSERT INTO test_array SELECT array("bob", "sue");
+SELECT * FROM test_array;
+CREATE TABLE like_test_array LIKE FILE PARQUET '${system:test.tmp.dir}/test_array/000000_0';
+DESCRIBE like_test_array;
+INSERT INTO like_test_array SELECT array("bob", "sue");
+SELECT * FROM like_test_array;
+DROP TABLE like_test_array;
+
+-- map
+CREATE EXTERNAL TABLE test_map(simple_map map<int, string>, map_to_struct map<string, struct<i : int>>, map_to_map map<date,map<int, string>>, map_to_array map<binary, array<array<int>>>) STORED AS PARQUET LOCATION '${system:test.tmp.dir}/test_map';
+DESCRIBE test_map;
+INSERT INTO test_map SELECT map(10, "foo"), map("bar", named_struct("i", 99)), map(cast('1984-01-01' as date), map(10, "goodbye")), map(cast("binary" as binary), array(array(1,2,3)));
+SELECT * FROM test_map;
+CREATE TABLE like_test_map LIKE FILE PARQUET '${system:test.tmp.dir}/test_map/000000_0';
+DESCRIBE like_test_map;
+INSERT INTO like_test_map SELECT map(10, "foo"), map("bar", named_struct("i", 99)), map(cast('1984-01-01' as date), map(10, "goodbye")), map(cast("binary" as binary), array(array(1,2,3)));
+SELECT * FROM like_test_map;
+DROP TABLE like_test_map;
+
+-- struct
+CREATE EXTERNAL TABLE test_complex_struct(struct_type struct<tinyint_type : tinyint, smallint_type : smallint, bigint_type : bigint, int_type : int, float_type : float, double_type : double, decimal_type : DECIMAL(4,2), timestamp_type : TIMESTAMP, date_type : DATE, string_type : STRING, varchar_type : VARCHAR(100), char_type : CHAR(34), boolean_type : boolean, binary_type : binary>) STORED AS PARQUET LOCATION '${system:test.tmp.dir}/test_complex_struct';
+DESCRIBE test_complex_struct;
+-- disable CBO due to the fact that type conversion causes CBO failure which causes the test to fail
+-- non-CBO path works (HIVE-26398)
+SET hive.cbo.enable=false;
+INSERT INTO test_complex_struct SELECT named_struct("tinyint_type", cast(1 as tinyint), "smallint_type", cast(2 as smallint), "bigint_type", cast(3 as bigint), "int_type", 4, "float_type", cast(2.2 as float), "double_type", cast(2.2 as double), "decimal_type", cast(20.22 as decimal(4,2)), "timestamp_type", cast('2022-06-30 10:20:30' as timestamp), "date_type", cast('2020-04-23' as date), "string_type", 'str1', "varchar_type", cast('varchar1' as varchar(100)), "char_type", cast('char' as [...]
+SET hive.cbo.enable=true;
+SELECT * FROM test_complex_struct;
+-- varchar/char get created as string due to the fact that the parquet file has no information to derive this types and they are stored as string
+CREATE TABLE like_test_complex_struct LIKE FILE PARQUET '${system:test.tmp.dir}/test_complex_struct/000000_0';
+DESCRIBE like_test_complex_struct;
+-- disable CBO due to the fact that type conversion causes CBO failure which causes the test to fail
+-- non-CBO path works (HIVE-26398)
+SET hive.cbo.enable=false;
+INSERT INTO like_test_complex_struct SELECT named_struct("tinyint_type", cast(1 as tinyint), "smallint_type", cast(2 as smallint), "bigint_type", cast(3 as bigint), "int_type", 4, "float_type", cast(2.2 as float), "double_type", cast(2.2 as double), "decimal_type", cast(20.22 as decimal(4,2)), "timestamp_type", cast('2022-06-30 10:20:30' as timestamp), "date_type", cast('2020-04-23' as date), "string_type", 'str1', "varchar_type", 'varchar1', "char_type", 'char', "boolean_type", true, "b [...]
+SET hive.cbo.enable=true;
+SELECT * FROM like_test_complex_struct;
+DROP TABLE like_test_complex_struct;
+
+-- test complex types that contain other complex types
+CREATE EXTERNAL TABLE test_complex_complex(struct_type struct<i : int, s : string, m : map<string, array<int>>, struct_i : struct<str : string>>) STORED AS PARQUET LOCATION '${system:test.tmp.dir}/test_complex_complex';
+DESCRIBE test_complex_complex;
+INSERT INTO test_complex_complex SELECT named_struct("i", 10, "s", "hello, world", "m", map("arr", array(1,2,3,4)), "struct_i", named_struct("str", "test_str"));
+SELECT * FROM test_complex_complex;
+CREATE TABLE like_test_complex_complex LIKE FILE PARQUET '${system:test.tmp.dir}/test_complex_complex/000000_0';
+DESCRIBE like_test_complex_complex;
+INSERT INTO like_test_complex_complex SELECT named_struct("i", 10, "s", "hello, world", "m", map("arr", array(1,2,3,4)), "struct_i", named_struct("str", "test_str"));
+SELECT * FROM like_test_complex_complex;
+DROP TABLE like_test_complex_complex;
+
+-- test adding partitioning to the destination table
+CREATE TABLE like_test_partitioning LIKE FILE PARQUET '${system:test.tmp.dir}/test_all_types/000000_0' PARTITIONED BY (year STRING, month STRING);
+DESCRIBE like_test_partitioning;
+INSERT INTO like_test_partitioning PARTITION (year='1984', month='1') VALUES (1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe'),
+ (1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe');
+SELECT * FROM like_test_partitioning;
+DROP TABLE like_test_partitioning;
diff --git a/ql/src/test/results/clientnegative/create_table_like_invalid.q.out b/ql/src/test/results/clientnegative/create_table_like_invalid.q.out
new file mode 100644
index 00000000000..987da391e7c
--- /dev/null
+++ b/ql/src/test/results/clientnegative/create_table_like_invalid.q.out
@@ -0,0 +1 @@
+FAILED: SemanticException [Error 10434]: CREATE TABLE LIKE FILE is not supported by the 'AVRO' file format
diff --git a/ql/src/test/results/clientpositive/llap/create_table_like_file.q.out b/ql/src/test/results/clientpositive/llap/create_table_like_file.q.out
new file mode 100644
index 00000000000..5899d8fda4b
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/create_table_like_file.q.out
@@ -0,0 +1,611 @@
+#### A masked pattern was here ####
+PREHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+PREHOOK: Output: database:default
+PREHOOK: Output: default@test_all_types
+#### A masked pattern was here ####
+POSTHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@test_all_types
+PREHOOK: query: INSERT INTO test_all_types VALUES (1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe'),
+ (1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@test_all_types
+POSTHOOK: query: INSERT INTO test_all_types VALUES (1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe'),
+ (1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@test_all_types
+POSTHOOK: Lineage: test_all_types.bigint_type SCRIPT []
+POSTHOOK: Lineage: test_all_types.binary_type SCRIPT []
+POSTHOOK: Lineage: test_all_types.boolean_type SCRIPT []
+POSTHOOK: Lineage: test_all_types.char_type SCRIPT []
+POSTHOOK: Lineage: test_all_types.date_type SCRIPT []
+POSTHOOK: Lineage: test_all_types.decimal_type SCRIPT []
+POSTHOOK: Lineage: test_all_types.double_type SCRIPT []
+POSTHOOK: Lineage: test_all_types.float_type SCRIPT []
+POSTHOOK: Lineage: test_all_types.int_type SCRIPT []
+POSTHOOK: Lineage: test_all_types.smallint_type SCRIPT []
+POSTHOOK: Lineage: test_all_types.string_type SCRIPT []
+POSTHOOK: Lineage: test_all_types.timestamp_type SCRIPT []
+POSTHOOK: Lineage: test_all_types.tinyint_type SCRIPT []
+POSTHOOK: Lineage: test_all_types.varchar_type SCRIPT []
+PREHOOK: query: SELECT * FROM test_all_types
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_all_types
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM test_all_types
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_all_types
+#### A masked pattern was here ####
+1 2 3 4 2.2 2.2 20.20 2022-06-30 10:20:30 2020-04-23 str1 varchar1 char true binary_maybe
+1 2 3 4 2.2 2.2 20.20 2022-06-30 10:20:30 2020-04-23 str1 varchar1 char true binary_maybe
+PREHOOK: query: DESCRIBE test_all_types
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@test_all_types
+POSTHOOK: query: DESCRIBE test_all_types
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@test_all_types
+tinyint_type tinyint
+smallint_type smallint
+bigint_type bigint
+int_type int
+float_type float
+double_type double
+decimal_type decimal(4,2)
+timestamp_type timestamp
+date_type date
+string_type string
+varchar_type varchar(100)
+char_type char(34)
+boolean_type boolean
+binary_type binary
+#### A masked pattern was here ####
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@like_test_all_types
+#### A masked pattern was here ####
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@like_test_all_types
+PREHOOK: query: INSERT INTO like_test_all_types VALUES (1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe'),
+ (1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@like_test_all_types
+POSTHOOK: query: INSERT INTO like_test_all_types VALUES (1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe'),
+ (1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@like_test_all_types
+POSTHOOK: Lineage: like_test_all_types.bigint_type SCRIPT []
+POSTHOOK: Lineage: like_test_all_types.binary_type SCRIPT []
+POSTHOOK: Lineage: like_test_all_types.boolean_type SCRIPT []
+POSTHOOK: Lineage: like_test_all_types.char_type SCRIPT []
+POSTHOOK: Lineage: like_test_all_types.date_type SCRIPT []
+POSTHOOK: Lineage: like_test_all_types.decimal_type SCRIPT []
+POSTHOOK: Lineage: like_test_all_types.double_type SCRIPT []
+POSTHOOK: Lineage: like_test_all_types.float_type SCRIPT []
+POSTHOOK: Lineage: like_test_all_types.int_type SCRIPT []
+POSTHOOK: Lineage: like_test_all_types.smallint_type SCRIPT []
+POSTHOOK: Lineage: like_test_all_types.string_type SCRIPT []
+POSTHOOK: Lineage: like_test_all_types.timestamp_type SCRIPT []
+POSTHOOK: Lineage: like_test_all_types.tinyint_type SCRIPT []
+POSTHOOK: Lineage: like_test_all_types.varchar_type SCRIPT []
+PREHOOK: query: SELECT * FROM like_test_all_types
+PREHOOK: type: QUERY
+PREHOOK: Input: default@like_test_all_types
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM like_test_all_types
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@like_test_all_types
+#### A masked pattern was here ####
+1 2 3 4 2.2 2.2 20.20 2022-06-30 10:20:30 2020-04-23 str1 varchar1 char true binary_maybe
+1 2 3 4 2.2 2.2 20.20 2022-06-30 10:20:30 2020-04-23 str1 varchar1 char true binary_maybe
+PREHOOK: query: DESCRIBE like_test_all_types
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@like_test_all_types
+POSTHOOK: query: DESCRIBE like_test_all_types
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@like_test_all_types
+tinyint_type tinyint Inferred from Parquet file.
+smallint_type smallint Inferred from Parquet file.
+bigint_type bigint Inferred from Parquet file.
+int_type int Inferred from Parquet file.
+float_type float Inferred from Parquet file.
+double_type double Inferred from Parquet file.
+decimal_type decimal(4,2) Inferred from Parquet file.
+timestamp_type timestamp Inferred from Parquet file.
+date_type date Inferred from Parquet file.
+string_type string Inferred from Parquet file.
+varchar_type string Inferred from Parquet file.
+char_type string Inferred from Parquet file.
+boolean_type boolean Inferred from Parquet file.
+binary_type binary Inferred from Parquet file.
+PREHOOK: query: DROP TABLE test_all_types
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@test_all_types
+PREHOOK: Output: default@test_all_types
+POSTHOOK: query: DROP TABLE test_all_types
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@test_all_types
+POSTHOOK: Output: default@test_all_types
+PREHOOK: query: DROP TABLE like_test_all_types
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@like_test_all_types
+PREHOOK: Output: default@like_test_all_types
+POSTHOOK: query: DROP TABLE like_test_all_types
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@like_test_all_types
+POSTHOOK: Output: default@like_test_all_types
+#### A masked pattern was here ####
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@like_test_all_types
+#### A masked pattern was here ####
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@like_test_all_types
+PREHOOK: query: INSERT INTO like_test_all_types VALUES (1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe'),
+ (1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@like_test_all_types
+POSTHOOK: query: INSERT INTO like_test_all_types VALUES (1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe'),
+ (1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@like_test_all_types
+POSTHOOK: Lineage: like_test_all_types.bigint_type SCRIPT []
+POSTHOOK: Lineage: like_test_all_types.binary_type SCRIPT []
+POSTHOOK: Lineage: like_test_all_types.boolean_type SCRIPT []
+POSTHOOK: Lineage: like_test_all_types.char_type SCRIPT []
+POSTHOOK: Lineage: like_test_all_types.date_type SCRIPT []
+POSTHOOK: Lineage: like_test_all_types.decimal_type SCRIPT []
+POSTHOOK: Lineage: like_test_all_types.double_type SCRIPT []
+POSTHOOK: Lineage: like_test_all_types.float_type SCRIPT []
+POSTHOOK: Lineage: like_test_all_types.int_type SCRIPT []
+POSTHOOK: Lineage: like_test_all_types.smallint_type SCRIPT []
+POSTHOOK: Lineage: like_test_all_types.string_type SCRIPT []
+POSTHOOK: Lineage: like_test_all_types.timestamp_type SCRIPT []
+POSTHOOK: Lineage: like_test_all_types.tinyint_type SCRIPT []
+POSTHOOK: Lineage: like_test_all_types.varchar_type SCRIPT []
+PREHOOK: query: SELECT * FROM like_test_all_types
+PREHOOK: type: QUERY
+PREHOOK: Input: default@like_test_all_types
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM like_test_all_types
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@like_test_all_types
+#### A masked pattern was here ####
+1 2 3 4 2.2 2.2 20.20 2022-06-30 10:20:30 2020-04-23 str1 varchar1 char true binary_maybe
+1 2 3 4 2.2 2.2 20.20 2022-06-30 10:20:30 2020-04-23 str1 varchar1 char true binary_maybe
+PREHOOK: query: DESCRIBE like_test_all_types
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@like_test_all_types
+POSTHOOK: query: DESCRIBE like_test_all_types
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@like_test_all_types
+tinyint_type tinyint Inferred from Parquet file.
+smallint_type smallint Inferred from Parquet file.
+bigint_type bigint Inferred from Parquet file.
+int_type int Inferred from Parquet file.
+float_type float Inferred from Parquet file.
+double_type double Inferred from Parquet file.
+decimal_type decimal(4,2) Inferred from Parquet file.
+timestamp_type timestamp Inferred from Parquet file.
+date_type date Inferred from Parquet file.
+string_type string Inferred from Parquet file.
+varchar_type string Inferred from Parquet file.
+char_type string Inferred from Parquet file.
+boolean_type boolean Inferred from Parquet file.
+binary_type string Inferred from Parquet file.
+PREHOOK: query: DROP TABLE test_all_types
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE test_all_types
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: DROP TABLE like_test_all_types
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@like_test_all_types
+PREHOOK: Output: default@like_test_all_types
+POSTHOOK: query: DROP TABLE like_test_all_types
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@like_test_all_types
+POSTHOOK: Output: default@like_test_all_types
+#### A masked pattern was here ####
+PREHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+PREHOOK: Output: database:default
+PREHOOK: Output: default@test_array
+#### A masked pattern was here ####
+POSTHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@test_array
+PREHOOK: query: DESCRIBE test_array
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@test_array
+POSTHOOK: query: DESCRIBE test_array
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@test_array
+str_array array<string>
+PREHOOK: query: INSERT INTO test_array SELECT array("bob", "sue")
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@test_array
+POSTHOOK: query: INSERT INTO test_array SELECT array("bob", "sue")
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@test_array
+POSTHOOK: Lineage: test_array.str_array EXPRESSION []
+PREHOOK: query: SELECT * FROM test_array
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_array
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM test_array
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_array
+#### A masked pattern was here ####
+["bob","sue"]
+#### A masked pattern was here ####
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@like_test_array
+#### A masked pattern was here ####
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@like_test_array
+PREHOOK: query: DESCRIBE like_test_array
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@like_test_array
+POSTHOOK: query: DESCRIBE like_test_array
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@like_test_array
+str_array array<string> Inferred from Parquet file.
+PREHOOK: query: INSERT INTO like_test_array SELECT array("bob", "sue")
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@like_test_array
+POSTHOOK: query: INSERT INTO like_test_array SELECT array("bob", "sue")
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@like_test_array
+POSTHOOK: Lineage: like_test_array.str_array EXPRESSION []
+PREHOOK: query: SELECT * FROM like_test_array
+PREHOOK: type: QUERY
+PREHOOK: Input: default@like_test_array
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM like_test_array
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@like_test_array
+#### A masked pattern was here ####
+["bob","sue"]
+PREHOOK: query: DROP TABLE like_test_array
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@like_test_array
+PREHOOK: Output: default@like_test_array
+POSTHOOK: query: DROP TABLE like_test_array
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@like_test_array
+POSTHOOK: Output: default@like_test_array
+#### A masked pattern was here ####
+PREHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+PREHOOK: Output: database:default
+PREHOOK: Output: default@test_map
+#### A masked pattern was here ####
+POSTHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@test_map
+PREHOOK: query: DESCRIBE test_map
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@test_map
+POSTHOOK: query: DESCRIBE test_map
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@test_map
+simple_map map<int,string>
+map_to_struct map<string,struct<i:int>>
+map_to_map map<date,map<int,string>>
+map_to_array map<binary,array<array<int>>>
+PREHOOK: query: INSERT INTO test_map SELECT map(10, "foo"), map("bar", named_struct("i", 99)), map(cast('1984-01-01' as date), map(10, "goodbye")), map(cast("binary" as binary), array(array(1,2,3)))
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@test_map
+POSTHOOK: query: INSERT INTO test_map SELECT map(10, "foo"), map("bar", named_struct("i", 99)), map(cast('1984-01-01' as date), map(10, "goodbye")), map(cast("binary" as binary), array(array(1,2,3)))
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@test_map
+POSTHOOK: Lineage: test_map.map_to_array EXPRESSION []
+POSTHOOK: Lineage: test_map.map_to_map EXPRESSION []
+POSTHOOK: Lineage: test_map.map_to_struct EXPRESSION []
+POSTHOOK: Lineage: test_map.simple_map EXPRESSION []
+PREHOOK: query: SELECT * FROM test_map
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_map
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM test_map
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_map
+#### A masked pattern was here ####
+{10:"foo"} {"bar":{"i":99}} {"1984-01-01":{10:"goodbye"}} {binary:[[1,2,3]]}
+#### A masked pattern was here ####
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@like_test_map
+#### A masked pattern was here ####
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@like_test_map
+PREHOOK: query: DESCRIBE like_test_map
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@like_test_map
+POSTHOOK: query: DESCRIBE like_test_map
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@like_test_map
+simple_map map<int,string> Inferred from Parquet file.
+map_to_struct map<string,struct<i:int>> Inferred from Parquet file.
+map_to_map map<date,map<int,string>> Inferred from Parquet file.
+map_to_array map<binary,array<array<int>>> Inferred from Parquet file.
+PREHOOK: query: INSERT INTO like_test_map SELECT map(10, "foo"), map("bar", named_struct("i", 99)), map(cast('1984-01-01' as date), map(10, "goodbye")), map(cast("binary" as binary), array(array(1,2,3)))
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@like_test_map
+POSTHOOK: query: INSERT INTO like_test_map SELECT map(10, "foo"), map("bar", named_struct("i", 99)), map(cast('1984-01-01' as date), map(10, "goodbye")), map(cast("binary" as binary), array(array(1,2,3)))
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@like_test_map
+POSTHOOK: Lineage: like_test_map.map_to_array EXPRESSION []
+POSTHOOK: Lineage: like_test_map.map_to_map EXPRESSION []
+POSTHOOK: Lineage: like_test_map.map_to_struct EXPRESSION []
+POSTHOOK: Lineage: like_test_map.simple_map EXPRESSION []
+PREHOOK: query: SELECT * FROM like_test_map
+PREHOOK: type: QUERY
+PREHOOK: Input: default@like_test_map
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM like_test_map
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@like_test_map
+#### A masked pattern was here ####
+{10:"foo"} {"bar":{"i":99}} {"1984-01-01":{10:"goodbye"}} {binary:[[1,2,3]]}
+PREHOOK: query: DROP TABLE like_test_map
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@like_test_map
+PREHOOK: Output: default@like_test_map
+POSTHOOK: query: DROP TABLE like_test_map
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@like_test_map
+POSTHOOK: Output: default@like_test_map
+#### A masked pattern was here ####
+PREHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+PREHOOK: Output: database:default
+PREHOOK: Output: default@test_complex_struct
+#### A masked pattern was here ####
+POSTHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@test_complex_struct
+PREHOOK: query: DESCRIBE test_complex_struct
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@test_complex_struct
+POSTHOOK: query: DESCRIBE test_complex_struct
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@test_complex_struct
+struct_type struct<tinyint_type:tinyint,smallint_type:smallint,bigint_type:bigint,int_type:int,float_type:float,double_type:double,decimal_type:decimal(4,2),timestamp_type:timestamp,date_type:date,string_type:string,varchar_type:varchar(100),char_type:char(34),boolean_type:boolean,binary_type:binary>
+PREHOOK: query: INSERT INTO test_complex_struct SELECT named_struct("tinyint_type", cast(1 as tinyint), "smallint_type", cast(2 as smallint), "bigint_type", cast(3 as bigint), "int_type", 4, "float_type", cast(2.2 as float), "double_type", cast(2.2 as double), "decimal_type", cast(20.22 as decimal(4,2)), "timestamp_type", cast('2022-06-30 10:20:30' as timestamp), "date_type", cast('2020-04-23' as date), "string_type", 'str1', "varchar_type", cast('varchar1' as varchar(100)), "char_type", [...]
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@test_complex_struct
+POSTHOOK: query: INSERT INTO test_complex_struct SELECT named_struct("tinyint_type", cast(1 as tinyint), "smallint_type", cast(2 as smallint), "bigint_type", cast(3 as bigint), "int_type", 4, "float_type", cast(2.2 as float), "double_type", cast(2.2 as double), "decimal_type", cast(20.22 as decimal(4,2)), "timestamp_type", cast('2022-06-30 10:20:30' as timestamp), "date_type", cast('2020-04-23' as date), "string_type", 'str1', "varchar_type", cast('varchar1' as varchar(100)), "char_type" [...]
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@test_complex_struct
+POSTHOOK: Lineage: test_complex_struct.struct_type EXPRESSION []
+PREHOOK: query: SELECT * FROM test_complex_struct
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_complex_struct
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM test_complex_struct
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_complex_struct
+#### A masked pattern was here ####
+{"tinyint_type":1,"smallint_type":2,"bigint_type":3,"int_type":4,"float_type":2.2,"double_type":2.2,"decimal_type":20.22,"timestamp_type":"2022-06-30 10:20:30","date_type":"2020-04-23","string_type":"str1","varchar_type":"varchar1","char_type":"char ","boolean_type":true,"binary_type":binary_maybe}
+#### A masked pattern was here ####
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@like_test_complex_struct
+#### A masked pattern was here ####
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@like_test_complex_struct
+PREHOOK: query: DESCRIBE like_test_complex_struct
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@like_test_complex_struct
+POSTHOOK: query: DESCRIBE like_test_complex_struct
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@like_test_complex_struct
+struct_type struct<tinyint_type:tinyint,smallint_type:smallint,bigint_type:bigint,int_type:int,float_type:float,double_type:double,decimal_type:decimal(4,2),timestamp_type:timestamp,date_type:date,string_type:string,varchar_type:string,char_type:string,boolean_type:boolean,binary_type:binary> Inferred from Parquet file.
+PREHOOK: query: INSERT INTO like_test_complex_struct SELECT named_struct("tinyint_type", cast(1 as tinyint), "smallint_type", cast(2 as smallint), "bigint_type", cast(3 as bigint), "int_type", 4, "float_type", cast(2.2 as float), "double_type", cast(2.2 as double), "decimal_type", cast(20.22 as decimal(4,2)), "timestamp_type", cast('2022-06-30 10:20:30' as timestamp), "date_type", cast('2020-04-23' as date), "string_type", 'str1', "varchar_type", 'varchar1', "char_type", 'char', "boolean [...]
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@like_test_complex_struct
+POSTHOOK: query: INSERT INTO like_test_complex_struct SELECT named_struct("tinyint_type", cast(1 as tinyint), "smallint_type", cast(2 as smallint), "bigint_type", cast(3 as bigint), "int_type", 4, "float_type", cast(2.2 as float), "double_type", cast(2.2 as double), "decimal_type", cast(20.22 as decimal(4,2)), "timestamp_type", cast('2022-06-30 10:20:30' as timestamp), "date_type", cast('2020-04-23' as date), "string_type", 'str1', "varchar_type", 'varchar1', "char_type", 'char', "boolea [...]
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@like_test_complex_struct
+POSTHOOK: Lineage: like_test_complex_struct.struct_type EXPRESSION []
+PREHOOK: query: SELECT * FROM like_test_complex_struct
+PREHOOK: type: QUERY
+PREHOOK: Input: default@like_test_complex_struct
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM like_test_complex_struct
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@like_test_complex_struct
+#### A masked pattern was here ####
+{"tinyint_type":1,"smallint_type":2,"bigint_type":3,"int_type":4,"float_type":2.2,"double_type":2.2,"decimal_type":20.22,"timestamp_type":"2022-06-30 10:20:30","date_type":"2020-04-23","string_type":"str1","varchar_type":"varchar1","char_type":"char","boolean_type":true,"binary_type":binary_maybe}
+PREHOOK: query: DROP TABLE like_test_complex_struct
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@like_test_complex_struct
+PREHOOK: Output: default@like_test_complex_struct
+POSTHOOK: query: DROP TABLE like_test_complex_struct
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@like_test_complex_struct
+POSTHOOK: Output: default@like_test_complex_struct
+#### A masked pattern was here ####
+PREHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+PREHOOK: Output: database:default
+PREHOOK: Output: default@test_complex_complex
+#### A masked pattern was here ####
+POSTHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@test_complex_complex
+PREHOOK: query: DESCRIBE test_complex_complex
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@test_complex_complex
+POSTHOOK: query: DESCRIBE test_complex_complex
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@test_complex_complex
+struct_type struct<i:int,s:string,m:map<string,array<int>>,struct_i:struct<str:string>>
+PREHOOK: query: INSERT INTO test_complex_complex SELECT named_struct("i", 10, "s", "hello, world", "m", map("arr", array(1,2,3,4)), "struct_i", named_struct("str", "test_str"))
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@test_complex_complex
+POSTHOOK: query: INSERT INTO test_complex_complex SELECT named_struct("i", 10, "s", "hello, world", "m", map("arr", array(1,2,3,4)), "struct_i", named_struct("str", "test_str"))
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@test_complex_complex
+POSTHOOK: Lineage: test_complex_complex.struct_type EXPRESSION []
+PREHOOK: query: SELECT * FROM test_complex_complex
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_complex_complex
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM test_complex_complex
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_complex_complex
+#### A masked pattern was here ####
+{"i":10,"s":"hello, world","m":{"arr":[1,2,3,4]},"struct_i":{"str":"test_str"}}
+#### A masked pattern was here ####
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@like_test_complex_complex
+#### A masked pattern was here ####
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@like_test_complex_complex
+PREHOOK: query: DESCRIBE like_test_complex_complex
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@like_test_complex_complex
+POSTHOOK: query: DESCRIBE like_test_complex_complex
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@like_test_complex_complex
+struct_type struct<i:int,s:string,m:map<string,array<int>>,struct_i:struct<str:string>> Inferred from Parquet file.
+PREHOOK: query: INSERT INTO like_test_complex_complex SELECT named_struct("i", 10, "s", "hello, world", "m", map("arr", array(1,2,3,4)), "struct_i", named_struct("str", "test_str"))
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@like_test_complex_complex
+POSTHOOK: query: INSERT INTO like_test_complex_complex SELECT named_struct("i", 10, "s", "hello, world", "m", map("arr", array(1,2,3,4)), "struct_i", named_struct("str", "test_str"))
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@like_test_complex_complex
+POSTHOOK: Lineage: like_test_complex_complex.struct_type EXPRESSION []
+PREHOOK: query: SELECT * FROM like_test_complex_complex
+PREHOOK: type: QUERY
+PREHOOK: Input: default@like_test_complex_complex
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM like_test_complex_complex
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@like_test_complex_complex
+#### A masked pattern was here ####
+{"i":10,"s":"hello, world","m":{"arr":[1,2,3,4]},"struct_i":{"str":"test_str"}}
+PREHOOK: query: DROP TABLE like_test_complex_complex
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@like_test_complex_complex
+PREHOOK: Output: default@like_test_complex_complex
+POSTHOOK: query: DROP TABLE like_test_complex_complex
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@like_test_complex_complex
+POSTHOOK: Output: default@like_test_complex_complex
+#### A masked pattern was here ####
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@like_test_partitioning
+#### A masked pattern was here ####
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@like_test_partitioning
+PREHOOK: query: DESCRIBE like_test_partitioning
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@like_test_partitioning
+POSTHOOK: query: DESCRIBE like_test_partitioning
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@like_test_partitioning
+tinyint_type tinyint Inferred from Parquet file.
+smallint_type smallint Inferred from Parquet file.
+bigint_type bigint Inferred from Parquet file.
+int_type int Inferred from Parquet file.
+float_type float Inferred from Parquet file.
+double_type double Inferred from Parquet file.
+decimal_type decimal(4,2) Inferred from Parquet file.
+timestamp_type timestamp Inferred from Parquet file.
+date_type date Inferred from Parquet file.
+string_type string Inferred from Parquet file.
+varchar_type string Inferred from Parquet file.
+char_type string Inferred from Parquet file.
+boolean_type boolean Inferred from Parquet file.
+binary_type binary Inferred from Parquet file.
+year string
+month string
+
+# Partition Information
+# col_name data_type comment
+year string
+month string
+PREHOOK: query: INSERT INTO like_test_partitioning PARTITION (year='1984', month='1') VALUES (1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe'),
+ (1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@like_test_partitioning@year=1984/month=1
+POSTHOOK: query: INSERT INTO like_test_partitioning PARTITION (year='1984', month='1') VALUES (1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe'),
+ (1, 2, 3, 4, 2.2, 2.2, 20.20, '2022-06-30 10:20:30', '2020-04-23', 'str1', 'varchar1', 'char', true, 'binary_maybe')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@like_test_partitioning@year=1984/month=1
+POSTHOOK: Lineage: like_test_partitioning PARTITION(year=1984,month=1).bigint_type SCRIPT []
+POSTHOOK: Lineage: like_test_partitioning PARTITION(year=1984,month=1).binary_type SCRIPT []
+POSTHOOK: Lineage: like_test_partitioning PARTITION(year=1984,month=1).boolean_type SCRIPT []
+POSTHOOK: Lineage: like_test_partitioning PARTITION(year=1984,month=1).char_type SCRIPT []
+POSTHOOK: Lineage: like_test_partitioning PARTITION(year=1984,month=1).date_type SCRIPT []
+POSTHOOK: Lineage: like_test_partitioning PARTITION(year=1984,month=1).decimal_type SCRIPT []
+POSTHOOK: Lineage: like_test_partitioning PARTITION(year=1984,month=1).double_type SCRIPT []
+POSTHOOK: Lineage: like_test_partitioning PARTITION(year=1984,month=1).float_type SCRIPT []
+POSTHOOK: Lineage: like_test_partitioning PARTITION(year=1984,month=1).int_type SCRIPT []
+POSTHOOK: Lineage: like_test_partitioning PARTITION(year=1984,month=1).smallint_type SCRIPT []
+POSTHOOK: Lineage: like_test_partitioning PARTITION(year=1984,month=1).string_type SCRIPT []
+POSTHOOK: Lineage: like_test_partitioning PARTITION(year=1984,month=1).timestamp_type SCRIPT []
+POSTHOOK: Lineage: like_test_partitioning PARTITION(year=1984,month=1).tinyint_type SCRIPT []
+POSTHOOK: Lineage: like_test_partitioning PARTITION(year=1984,month=1).varchar_type SCRIPT []
+PREHOOK: query: SELECT * FROM like_test_partitioning
+PREHOOK: type: QUERY
+PREHOOK: Input: default@like_test_partitioning
+PREHOOK: Input: default@like_test_partitioning@year=1984/month=1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM like_test_partitioning
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@like_test_partitioning
+POSTHOOK: Input: default@like_test_partitioning@year=1984/month=1
+#### A masked pattern was here ####
+1 2 3 4 2.2 2.2 20.20 2022-06-30 10:20:30 2020-04-23 str1 varchar1 char true binary_maybe 1984 1
+1 2 3 4 2.2 2.2 20.20 2022-06-30 10:20:30 2020-04-23 str1 varchar1 char true binary_maybe 1984 1
+PREHOOK: query: DROP TABLE like_test_partitioning
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@like_test_partitioning
+PREHOOK: Output: default@like_test_partitioning
+POSTHOOK: query: DROP TABLE like_test_partitioning
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@like_test_partitioning
+POSTHOOK: Output: default@like_test_partitioning
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/SchemaInference.java b/serde/src/java/org/apache/hadoop/hive/serde2/SchemaInference.java
new file mode 100644
index 00000000000..4b41cc1204e
--- /dev/null
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/SchemaInference.java
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.serde2;
+
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hive.metastore.api.FieldSchema;
+
+public interface SchemaInference {
+  /**
+   * Infer Hive compatible schema from provided file. The purpose of this method is to optionally
+   * allow SerDes to implement schema inference for CREATE TABLE LIKE FILE support.
+   *
+   * @param conf Hadoop Configuration consulted while reading the file (e.g. filesystem settings
+   *             and SerDe-specific inference options)
+   * @param file Fully qualified path to file to infer schema from (hadoop compatible URI + filename)
+   * @return List of FieldSchema that was derived from the provided file
+   * @throws SerDeException if the file cannot be read or its schema cannot be mapped to Hive types
+   */
+  List<FieldSchema> readSchema(Configuration conf, String file) throws SerDeException;
+}