You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@paimon.apache.org by zj...@apache.org on 2023/03/31 00:48:51 UTC
[incubator-paimon] branch master updated: [Paimon-682][Improvement]Support schema validation when invoke create schema (#762)
This is an automated email from the ASF dual-hosted git repository.
zjureel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-paimon.git
The following commit(s) were added to refs/heads/master by this push:
new 087c06a11 [Paimon-682][Improvement]Support schema validation when invoke create schema (#762)
087c06a11 is described below
commit 087c06a118d62cc333a62ce455bff7d7fc9da577
Author: hk__lrzy <hk...@163.com>
AuthorDate: Fri Mar 31 08:48:46 2023 +0800
[Paimon-682][Improvement]Support schema validation when invoke create schema (#762)
* [Paimon][Improvement]Support schema validation when invoke create schema.
* [Paimon][Improvement]fix checkstyle.
* [Paimon][Improvement]add unit test.
* [Paimon][Improvement]fix checkstyle 2.
---------
Co-authored-by: haoke <ha...@bytedance.com>
---
.../java/org/apache/paimon/format/FileFormat.java | 3 +
.../org/apache/paimon/schema/SchemaValidation.java | 8 +-
.../format/FileStatsExtractingAvroFormat.java | 5 ++
.../apache/paimon/format/FlushingFileFormat.java | 5 ++
.../apache/paimon/format/avro/AvroFileFormat.java | 8 ++
.../apache/paimon/format/orc/OrcFileFormat.java | 6 ++
.../paimon/format/parquet/ParquetFileFormat.java | 5 ++
.../paimon/format/avro/AvroFileFormatTest.java | 95 ++++++++++++++++++++++
.../paimon/format/orc/OrcFileFormatTest.java | 49 +++++++++++
.../format/parquet/ParquetFileFormatTest.java | 30 +++++++
10 files changed, 212 insertions(+), 2 deletions(-)
diff --git a/paimon-common/src/main/java/org/apache/paimon/format/FileFormat.java b/paimon-common/src/main/java/org/apache/paimon/format/FileFormat.java
index c4ea21ddc..a8164d4c2 100644
--- a/paimon-common/src/main/java/org/apache/paimon/format/FileFormat.java
+++ b/paimon-common/src/main/java/org/apache/paimon/format/FileFormat.java
@@ -60,6 +60,9 @@ public abstract class FileFormat {
/** Create a {@link FormatWriterFactory} from the type. */
public abstract FormatWriterFactory createWriterFactory(RowType type);
+ /**
+ * Validates whether the field types of the given row type are supported by this format.
+ *
+ * <p>NOTE(review): this declaration does not state how an unsupported type is reported; the
+ * Avro and ORC tests added together with this method expect an unchecked exception to be
+ * thrown. Confirm before relying on a specific exception type.
+ *
+ * @param rowType row type whose field types are validated
+ */
+ public abstract void validateDataFields(RowType rowType);
+
public FormatReaderFactory createReaderFactory(RowType rowType) {
int[][] projection = new int[rowType.getFieldCount()][];
for (int i = 0; i < projection.length; i++) {
diff --git a/paimon-core/src/main/java/org/apache/paimon/schema/SchemaValidation.java b/paimon-core/src/main/java/org/apache/paimon/schema/SchemaValidation.java
index f531d3993..74302a3dc 100644
--- a/paimon-core/src/main/java/org/apache/paimon/schema/SchemaValidation.java
+++ b/paimon-core/src/main/java/org/apache/paimon/schema/SchemaValidation.java
@@ -20,7 +20,9 @@ package org.apache.paimon.schema;
import org.apache.paimon.CoreOptions;
import org.apache.paimon.WriteMode;
+import org.apache.paimon.format.FileFormat;
import org.apache.paimon.options.ConfigOption;
+import org.apache.paimon.options.Options;
import org.apache.paimon.types.ArrayType;
import org.apache.paimon.types.DataField;
import org.apache.paimon.types.DataType;
@@ -104,8 +106,10 @@ public class SchemaValidation {
// Get the format type here, which will try to convert the string value to
// {@code FileFormatType}. If the string value is illegal, an exception will be thrown.
- // TODO Check fields type according to the format type
- options.formatType();
+ CoreOptions.FileFormatType fileFormatType = options.formatType();
+ FileFormat fileFormat =
+ FileFormat.fromIdentifier(fileFormatType.name(), new Options(schema.options()));
+ fileFormat.validateDataFields(new RowType(schema.fields()));
// Check column names in schema
schema.fieldNames()
diff --git a/paimon-core/src/test/java/org/apache/paimon/format/FileStatsExtractingAvroFormat.java b/paimon-core/src/test/java/org/apache/paimon/format/FileStatsExtractingAvroFormat.java
index 6d219990c..9aaccd56c 100644
--- a/paimon-core/src/test/java/org/apache/paimon/format/FileStatsExtractingAvroFormat.java
+++ b/paimon-core/src/test/java/org/apache/paimon/format/FileStatsExtractingAvroFormat.java
@@ -49,6 +49,11 @@ public class FileStatsExtractingAvroFormat extends FileFormat {
return avro.createWriterFactory(type);
}
+    /**
+     * Accepts every row type without inspecting it.
+     *
+     * @param rowType ignored
+     */
+    @Override
+    public void validateDataFields(RowType rowType) {
+        // Intentionally empty: this test-only format performs no field type validation.
+    }
+
@Override
public Optional<FileStatsExtractor> createStatsExtractor(RowType type) {
return Optional.of(new TestFileStatsExtractor(this, type));
diff --git a/paimon-core/src/test/java/org/apache/paimon/format/FlushingFileFormat.java b/paimon-core/src/test/java/org/apache/paimon/format/FlushingFileFormat.java
index b63396cac..548bd8b51 100644
--- a/paimon-core/src/test/java/org/apache/paimon/format/FlushingFileFormat.java
+++ b/paimon-core/src/test/java/org/apache/paimon/format/FlushingFileFormat.java
@@ -67,4 +67,9 @@ public class FlushingFileFormat extends FileFormat {
};
};
}
+
+    /**
+     * Accepts every row type without inspecting it.
+     *
+     * @param rowType ignored
+     */
+    @Override
+    public void validateDataFields(RowType rowType) {
+        // Intentionally empty: this test-only format performs no field type validation.
+    }
}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/avro/AvroFileFormat.java b/paimon-format/src/main/java/org/apache/paimon/format/avro/AvroFileFormat.java
index ee4cff70f..faf8a794b 100644
--- a/paimon-format/src/main/java/org/apache/paimon/format/avro/AvroFileFormat.java
+++ b/paimon-format/src/main/java/org/apache/paimon/format/avro/AvroFileFormat.java
@@ -83,6 +83,14 @@ public class AvroFileFormat extends FileFormat {
return new RowDataAvroWriterFactory(type, formatOptions.get(AVRO_OUTPUT_CODEC));
}
+    /**
+     * Checks each field type by attempting to convert it to an Avro schema; the converted
+     * schema itself is discarded.
+     *
+     * @param rowType row type whose field types are checked
+     */
+    @Override
+    public void validateDataFields(RowType rowType) {
+        for (DataType fieldType : rowType.getFieldTypes()) {
+            AvroSchemaConverter.convertToSchema(fieldType);
+        }
+    }
+
private static class AvroGenericRecordBulkFormat extends AbstractAvroBulkFormat<GenericRecord> {
private static final long serialVersionUID = 1L;
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcFileFormat.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcFileFormat.java
index 8480e2aa3..0fc7113ac 100644
--- a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcFileFormat.java
+++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcFileFormat.java
@@ -111,6 +111,12 @@ public class OrcFileFormat extends FileFormat {
formatContext.readBatchSize());
}
+    /**
+     * Checks the row type by attempting to convert its refined form to an ORC type; the
+     * converted type itself is discarded.
+     *
+     * @param rowType row type to check
+     */
+    @Override
+    public void validateDataFields(RowType rowType) {
+        OrcSplitReaderUtil.toOrcType(refineDataType(rowType));
+    }
+
/**
* The {@link OrcWriterFactory} will create {@link ThreadLocalClassLoaderConfiguration} from the
* input writer config to avoid classloader leaks.
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/parquet/ParquetFileFormat.java b/paimon-format/src/main/java/org/apache/paimon/format/parquet/ParquetFileFormat.java
index f63fe940e..ed2553a2b 100644
--- a/paimon-format/src/main/java/org/apache/paimon/format/parquet/ParquetFileFormat.java
+++ b/paimon-format/src/main/java/org/apache/paimon/format/parquet/ParquetFileFormat.java
@@ -66,6 +66,11 @@ public class ParquetFileFormat extends FileFormat {
type, getParquetConfiguration(formatContext.formatOptions())));
}
+    /**
+     * Checks the row type by attempting to convert it to a Parquet message type.
+     *
+     * @param rowType row type to check
+     */
+    @Override
+    public void validateDataFields(RowType rowType) {
+        // The converted message type is discarded; only the conversion attempt matters.
+        final String messageName = "paimon_schema";
+        ParquetSchemaConverter.convertToParquetMessageType(messageName, rowType);
+    }
+
@Override
public Optional<FileStatsExtractor> createStatsExtractor(RowType type) {
return Optional.of(new ParquetFileStatsExtractor(type));
diff --git a/paimon-format/src/test/java/org/apache/paimon/format/avro/AvroFileFormatTest.java b/paimon-format/src/test/java/org/apache/paimon/format/avro/AvroFileFormatTest.java
new file mode 100644
index 000000000..525f2778b
--- /dev/null
+++ b/paimon-format/src/test/java/org/apache/paimon/format/avro/AvroFileFormatTest.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.avro;
+
+import org.apache.paimon.options.Options;
+import org.apache.paimon.types.DataField;
+import org.apache.paimon.types.DataTypes;
+import org.apache.paimon.types.RowType;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+
+import java.util.ArrayList;
+
+/** Tests for the data field validation of {@link AvroFileFormat}. */
+public class AvroFileFormatTest {
+
+    private static AvroFileFormat fileFormat;
+
+    /** Creates the format under test once, from empty options. */
+    @BeforeAll
+    public static void before() {
+        fileFormat = new AvroFileFormat(new Options());
+    }
+
+    /** The atomic types listed here, including {@code TIMESTAMP(3)}, must pass validation. */
+    @Test
+    public void testSupportedDataTypes() {
+        ArrayList<DataField> dataFields = new ArrayList<>();
+        int index = 0;
+        dataFields.add(new DataField(index++, "boolean_type", DataTypes.BOOLEAN()));
+        dataFields.add(new DataField(index++, "tinyint_type", DataTypes.TINYINT()));
+        dataFields.add(new DataField(index++, "smallint_type", DataTypes.SMALLINT()));
+        dataFields.add(new DataField(index++, "int_type", DataTypes.INT()));
+        dataFields.add(new DataField(index++, "bigint_type", DataTypes.BIGINT()));
+        dataFields.add(new DataField(index++, "float_type", DataTypes.FLOAT()));
+        dataFields.add(new DataField(index++, "double_type", DataTypes.DOUBLE()));
+        dataFields.add(new DataField(index++, "char_type", DataTypes.CHAR(10)));
+        dataFields.add(new DataField(index++, "varchar_type", DataTypes.VARCHAR(20)));
+        dataFields.add(new DataField(index++, "binary_type", DataTypes.BINARY(20)));
+        dataFields.add(new DataField(index++, "varbinary_type", DataTypes.VARBINARY(20)));
+        dataFields.add(new DataField(index++, "timestamp_type", DataTypes.TIMESTAMP(3)));
+        dataFields.add(new DataField(index++, "date_type", DataTypes.DATE()));
+        dataFields.add(new DataField(index++, "decimal_type", DataTypes.DECIMAL(10, 3)));
+
+        RowType rowType = new RowType(dataFields);
+        // State the expectation explicitly instead of relying on no exception escaping.
+        Assertions.assertDoesNotThrow(() -> fileFormat.validateDataFields(rowType));
+    }
+
+    /** Map, array and row types must pass validation. */
+    @Test
+    public void testSupportedComplexDataTypes() {
+        ArrayList<DataField> dataFields = new ArrayList<>();
+        int index = 0;
+        dataFields.add(
+                new DataField(
+                        index++,
+                        "map_type",
+                        DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT())));
+        dataFields.add(new DataField(index++, "array_type", DataTypes.ARRAY(DataTypes.STRING())));
+        dataFields.add(
+                new DataField(
+                        index++,
+                        "row_type",
+                        DataTypes.ROW(DataTypes.STRING(), DataTypes.BIGINT())));
+
+        RowType rowType = new RowType(dataFields);
+        // State the expectation explicitly instead of relying on no exception escaping.
+        Assertions.assertDoesNotThrow(() -> fileFormat.validateDataFields(rowType));
+    }
+
+    /** {@code TIMESTAMP(6)} must be rejected with an {@link IllegalArgumentException}. */
+    @Test
+    public void testUnsupportedDataTypes() {
+        ArrayList<DataField> dataFields = new ArrayList<>();
+        // A single field needs no running id counter.
+        dataFields.add(new DataField(0, "timestamp_type", DataTypes.TIMESTAMP(6)));
+
+        RowType rowType = new RowType(dataFields);
+        Assertions.assertThrows(
+                IllegalArgumentException.class, () -> fileFormat.validateDataFields(rowType));
+    }
+}
diff --git a/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcFileFormatTest.java b/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcFileFormatTest.java
index fd5699894..ae7fe3a30 100644
--- a/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcFileFormatTest.java
+++ b/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcFileFormatTest.java
@@ -20,9 +20,16 @@ package org.apache.paimon.format.orc;
import org.apache.paimon.format.FileFormatFactory.FormatContext;
import org.apache.paimon.options.Options;
+import org.apache.paimon.types.DataField;
+import org.apache.paimon.types.DataTypes;
+import org.apache.paimon.types.RowType;
+import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
+import java.util.ArrayList;
+import java.util.List;
+
import static org.apache.paimon.format.orc.OrcFileFormatFactory.IDENTIFIER;
import static org.assertj.core.api.Assertions.assertThat;
@@ -47,4 +54,46 @@ public class OrcFileFormatTest {
assertThat(orc.orcProperties().getProperty(IDENTIFIER + ".haha", "")).isEqualTo("1");
assertThat(orc.orcProperties().getProperty(IDENTIFIER + ".compress", "")).isEqualTo("zlib");
}
+
+    /** The atomic types listed here, including {@code TIMESTAMP(3)}, must pass validation. */
+    @Test
+    public void testSupportedDataTypes() {
+        OrcFileFormat orc =
+                new OrcFileFormatFactory().create(new FormatContext(new Options(), 1024));
+
+        int index = 0;
+        List<DataField> dataFields = new ArrayList<>();
+        dataFields.add(new DataField(index++, "boolean_type", DataTypes.BOOLEAN()));
+        dataFields.add(new DataField(index++, "tinyint_type", DataTypes.TINYINT()));
+        dataFields.add(new DataField(index++, "smallint_type", DataTypes.SMALLINT()));
+        dataFields.add(new DataField(index++, "int_type", DataTypes.INT()));
+        dataFields.add(new DataField(index++, "bigint_type", DataTypes.BIGINT()));
+        dataFields.add(new DataField(index++, "float_type", DataTypes.FLOAT()));
+        dataFields.add(new DataField(index++, "double_type", DataTypes.DOUBLE()));
+        dataFields.add(new DataField(index++, "char_type", DataTypes.CHAR(10)));
+        dataFields.add(new DataField(index++, "varchar_type", DataTypes.VARCHAR(20)));
+        dataFields.add(new DataField(index++, "binary_type", DataTypes.BINARY(20)));
+        dataFields.add(new DataField(index++, "varbinary_type", DataTypes.VARBINARY(20)));
+        dataFields.add(new DataField(index++, "timestamp_type", DataTypes.TIMESTAMP(3)));
+        dataFields.add(new DataField(index++, "date_type", DataTypes.DATE()));
+        dataFields.add(new DataField(index++, "decimal_type", DataTypes.DECIMAL(10, 3)));
+
+        // State the expectation explicitly instead of relying on no exception escaping.
+        Assertions.assertDoesNotThrow(() -> orc.validateDataFields(new RowType(dataFields)));
+    }
+
+    /**
+     * {@code TIMESTAMP_WITH_LOCAL_TIME_ZONE} must be rejected with an
+     * {@link UnsupportedOperationException}.
+     */
+    @Test
+    public void testUnSupportedDataTypes() {
+        OrcFileFormat orc =
+                new OrcFileFormatFactory().create(new FormatContext(new Options(), 1024));
+
+        // A single field needs no running id counter, and the list needs no cleanup afterwards
+        // because it is local to this test.
+        List<DataField> dataFields = new ArrayList<>();
+        dataFields.add(
+                new DataField(
+                        0,
+                        "timestamp_with_timezone",
+                        DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE()));
+        Assertions.assertThrows(
+                UnsupportedOperationException.class,
+                () -> orc.validateDataFields(new RowType(dataFields)));
+    }
}
diff --git a/paimon-format/src/test/java/org/apache/paimon/format/parquet/ParquetFileFormatTest.java b/paimon-format/src/test/java/org/apache/paimon/format/parquet/ParquetFileFormatTest.java
index 404965782..da53a7781 100644
--- a/paimon-format/src/test/java/org/apache/paimon/format/parquet/ParquetFileFormatTest.java
+++ b/paimon-format/src/test/java/org/apache/paimon/format/parquet/ParquetFileFormatTest.java
@@ -22,11 +22,17 @@ import org.apache.paimon.format.FileFormatFactory.FormatContext;
import org.apache.paimon.options.ConfigOption;
import org.apache.paimon.options.ConfigOptions;
import org.apache.paimon.options.Options;
+import org.apache.paimon.types.DataField;
+import org.apache.paimon.types.DataTypes;
+import org.apache.paimon.types.RowType;
import org.apache.parquet.format.CompressionCodec;
import org.apache.parquet.hadoop.ParquetOutputFormat;
import org.junit.jupiter.api.Test;
+import java.util.ArrayList;
+import java.util.List;
+
import static org.apache.paimon.format.parquet.ParquetFileFormat.getParquetConfiguration;
import static org.apache.paimon.format.parquet.ParquetFileFormatFactory.IDENTIFIER;
import static org.assertj.core.api.Assertions.assertThat;
@@ -74,4 +80,28 @@ public class ParquetFileFormatTest {
return getParquetConfiguration(parquet.formatOptions())
.getString(ParquetOutputFormat.COMPRESSION, null);
}
+
+    /** Runs validation over a row type built from the atomic types listed below. */
+    @Test
+    public void testSupportedDataFields() {
+        FormatContext context = new FormatContext(new Options(), 1024);
+        ParquetFileFormat format = new ParquetFileFormatFactory().create(context);
+
+        List<DataField> fields = new ArrayList<>();
+        int nextId = 0;
+        fields.add(new DataField(nextId++, "boolean_type", DataTypes.BOOLEAN()));
+        fields.add(new DataField(nextId++, "tinyint_type", DataTypes.TINYINT()));
+        fields.add(new DataField(nextId++, "smallint_type", DataTypes.SMALLINT()));
+        fields.add(new DataField(nextId++, "int_type", DataTypes.INT()));
+        fields.add(new DataField(nextId++, "bigint_type", DataTypes.BIGINT()));
+        fields.add(new DataField(nextId++, "float_type", DataTypes.FLOAT()));
+        fields.add(new DataField(nextId++, "double_type", DataTypes.DOUBLE()));
+        fields.add(new DataField(nextId++, "char_type", DataTypes.CHAR(10)));
+        fields.add(new DataField(nextId++, "varchar_type", DataTypes.VARCHAR(20)));
+        fields.add(new DataField(nextId++, "binary_type", DataTypes.BINARY(20)));
+        fields.add(new DataField(nextId++, "varbinary_type", DataTypes.VARBINARY(20)));
+        fields.add(new DataField(nextId++, "timestamp_type", DataTypes.TIMESTAMP(3)));
+        fields.add(new DataField(nextId++, "date_type", DataTypes.DATE()));
+        fields.add(new DataField(nextId++, "decimal_type", DataTypes.DECIMAL(10, 3)));
+
+        // The test passes when validation returns without throwing.
+        RowType rowType = new RowType(fields);
+        format.validateDataFields(rowType);
+    }
}