You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@paimon.apache.org by zj...@apache.org on 2023/03/31 00:48:51 UTC

[incubator-paimon] branch master updated: [Paimon-682][Improvement]Support schema validation when invoke create schema (#762)

This is an automated email from the ASF dual-hosted git repository.

zjureel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-paimon.git


The following commit(s) were added to refs/heads/master by this push:
     new 087c06a11 [Paimon-682][Improvement]Support schema validation when invoke create schema (#762)
087c06a11 is described below

commit 087c06a118d62cc333a62ce455bff7d7fc9da577
Author: hk__lrzy <hk...@163.com>
AuthorDate: Fri Mar 31 08:48:46 2023 +0800

    [Paimon-682][Improvement]Support schema validation when invoke create schema (#762)
    
    * [Paimon][Improvement]Support schema validation when invoke create schema.
    
    * [Paimon][Improvement]fix checkstyle.
    
    * [Paimon][Improvement]add unit test.
    
    * [Paimon][Improvement]fix checkstyle 2.
    
    ---------
    
    Co-authored-by: haoke <ha...@bytedance.com>
---
 .../java/org/apache/paimon/format/FileFormat.java  |  3 +
 .../org/apache/paimon/schema/SchemaValidation.java |  8 +-
 .../format/FileStatsExtractingAvroFormat.java      |  5 ++
 .../apache/paimon/format/FlushingFileFormat.java   |  5 ++
 .../apache/paimon/format/avro/AvroFileFormat.java  |  8 ++
 .../apache/paimon/format/orc/OrcFileFormat.java    |  6 ++
 .../paimon/format/parquet/ParquetFileFormat.java   |  5 ++
 .../paimon/format/avro/AvroFileFormatTest.java     | 95 ++++++++++++++++++++++
 .../paimon/format/orc/OrcFileFormatTest.java       | 49 +++++++++++
 .../format/parquet/ParquetFileFormatTest.java      | 30 +++++++
 10 files changed, 212 insertions(+), 2 deletions(-)

diff --git a/paimon-common/src/main/java/org/apache/paimon/format/FileFormat.java b/paimon-common/src/main/java/org/apache/paimon/format/FileFormat.java
index c4ea21ddc..a8164d4c2 100644
--- a/paimon-common/src/main/java/org/apache/paimon/format/FileFormat.java
+++ b/paimon-common/src/main/java/org/apache/paimon/format/FileFormat.java
@@ -60,6 +60,9 @@ public abstract class FileFormat {
     /** Create a {@link FormatWriterFactory} from the type. */
     public abstract FormatWriterFactory createWriterFactory(RowType type);
 
+    /** Validates that this format supports every field type in {@code rowType}. */
+    public abstract void validateDataFields(RowType rowType);
+
     public FormatReaderFactory createReaderFactory(RowType rowType) {
         int[][] projection = new int[rowType.getFieldCount()][];
         for (int i = 0; i < projection.length; i++) {
diff --git a/paimon-core/src/main/java/org/apache/paimon/schema/SchemaValidation.java b/paimon-core/src/main/java/org/apache/paimon/schema/SchemaValidation.java
index f531d3993..74302a3dc 100644
--- a/paimon-core/src/main/java/org/apache/paimon/schema/SchemaValidation.java
+++ b/paimon-core/src/main/java/org/apache/paimon/schema/SchemaValidation.java
@@ -20,7 +20,9 @@ package org.apache.paimon.schema;
 
 import org.apache.paimon.CoreOptions;
 import org.apache.paimon.WriteMode;
+import org.apache.paimon.format.FileFormat;
 import org.apache.paimon.options.ConfigOption;
+import org.apache.paimon.options.Options;
 import org.apache.paimon.types.ArrayType;
 import org.apache.paimon.types.DataField;
 import org.apache.paimon.types.DataType;
@@ -104,8 +106,10 @@ public class SchemaValidation {
 
         // Get the format type here which will try to convert string value to {@Code
         // FileFormatType}. If the string value is illegal, an exception will be thrown.
-        // TODO Check fields type according to the format type
-        options.formatType();
+        CoreOptions.FileFormatType fileFormatType = options.formatType();
+        FileFormat fileFormat =
+                FileFormat.fromIdentifier(fileFormatType.name(), new Options(schema.options()));
+        fileFormat.validateDataFields(new RowType(schema.fields()));
 
         // Check column names in schema
         schema.fieldNames()
diff --git a/paimon-core/src/test/java/org/apache/paimon/format/FileStatsExtractingAvroFormat.java b/paimon-core/src/test/java/org/apache/paimon/format/FileStatsExtractingAvroFormat.java
index 6d219990c..9aaccd56c 100644
--- a/paimon-core/src/test/java/org/apache/paimon/format/FileStatsExtractingAvroFormat.java
+++ b/paimon-core/src/test/java/org/apache/paimon/format/FileStatsExtractingAvroFormat.java
@@ -49,6 +49,11 @@ public class FileStatsExtractingAvroFormat extends FileFormat {
         return avro.createWriterFactory(type);
     }
 
+    @Override
+    public void validateDataFields(RowType rowType) {
+        // Test-only format: every field type is accepted, so there is nothing to validate.
+    }
+
     @Override
     public Optional<FileStatsExtractor> createStatsExtractor(RowType type) {
         return Optional.of(new TestFileStatsExtractor(this, type));
diff --git a/paimon-core/src/test/java/org/apache/paimon/format/FlushingFileFormat.java b/paimon-core/src/test/java/org/apache/paimon/format/FlushingFileFormat.java
index b63396cac..548bd8b51 100644
--- a/paimon-core/src/test/java/org/apache/paimon/format/FlushingFileFormat.java
+++ b/paimon-core/src/test/java/org/apache/paimon/format/FlushingFileFormat.java
@@ -67,4 +67,9 @@ public class FlushingFileFormat extends FileFormat {
             };
         };
     }
+
+    @Override
+    public void validateDataFields(RowType rowType) {
+        // Test-only format: every field type is accepted, so there is nothing to validate.
+    }
 }
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/avro/AvroFileFormat.java b/paimon-format/src/main/java/org/apache/paimon/format/avro/AvroFileFormat.java
index ee4cff70f..faf8a794b 100644
--- a/paimon-format/src/main/java/org/apache/paimon/format/avro/AvroFileFormat.java
+++ b/paimon-format/src/main/java/org/apache/paimon/format/avro/AvroFileFormat.java
@@ -83,6 +83,14 @@ public class AvroFileFormat extends FileFormat {
         return new RowDataAvroWriterFactory(type, formatOptions.get(AVRO_OUTPUT_CODEC));
     }
 
+    @Override
+    public void validateDataFields(RowType rowType) {
+        // The converter throws for unsupported types; the resulting schemas are discarded.
+        for (DataType fieldType : rowType.getFieldTypes()) {
+            AvroSchemaConverter.convertToSchema(fieldType);
+        }
+    }
+
     private static class AvroGenericRecordBulkFormat extends AbstractAvroBulkFormat<GenericRecord> {
 
         private static final long serialVersionUID = 1L;
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcFileFormat.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcFileFormat.java
index 8480e2aa3..0fc7113ac 100644
--- a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcFileFormat.java
+++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcFileFormat.java
@@ -111,6 +111,12 @@ public class OrcFileFormat extends FileFormat {
                 formatContext.readBatchSize());
     }
 
+    @Override
+    public void validateDataFields(RowType rowType) {
+        // Conversion fails for field types ORC cannot represent; its result is discarded.
+        OrcSplitReaderUtil.toOrcType(refineDataType(rowType));
+    }
+
     /**
      * The {@link OrcWriterFactory} will create {@link ThreadLocalClassLoaderConfiguration} from the
      * input writer config to avoid classloader leaks.
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/parquet/ParquetFileFormat.java b/paimon-format/src/main/java/org/apache/paimon/format/parquet/ParquetFileFormat.java
index f63fe940e..ed2553a2b 100644
--- a/paimon-format/src/main/java/org/apache/paimon/format/parquet/ParquetFileFormat.java
+++ b/paimon-format/src/main/java/org/apache/paimon/format/parquet/ParquetFileFormat.java
@@ -66,6 +66,11 @@ public class ParquetFileFormat extends FileFormat {
                         type, getParquetConfiguration(formatContext.formatOptions())));
     }
 
+    @Override
+    public void validateDataFields(RowType rowType) {
+        ParquetSchemaConverter.convertToParquetMessageType("paimon_schema", rowType);
+    }
+
     @Override
     public Optional<FileStatsExtractor> createStatsExtractor(RowType type) {
         return Optional.of(new ParquetFileStatsExtractor(type));
diff --git a/paimon-format/src/test/java/org/apache/paimon/format/avro/AvroFileFormatTest.java b/paimon-format/src/test/java/org/apache/paimon/format/avro/AvroFileFormatTest.java
new file mode 100644
index 000000000..525f2778b
--- /dev/null
+++ b/paimon-format/src/test/java/org/apache/paimon/format/avro/AvroFileFormatTest.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.avro;
+
+import org.apache.paimon.options.Options;
+import org.apache.paimon.types.DataField;
+import org.apache.paimon.types.DataTypes;
+import org.apache.paimon.types.RowType;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+
+import java.util.ArrayList;
+
+/** Tests which field types {@link AvroFileFormat#validateDataFields} accepts and rejects. */
+public class AvroFileFormatTest {
+
+    private static AvroFileFormat fileFormat;
+
+    @BeforeAll
+    public static void before() {
+        fileFormat = new AvroFileFormat(new Options());
+    }
+
+    @Test
+    public void testSupportedDataTypes() {
+        ArrayList<DataField> dataFields = new ArrayList<>();
+        int index = 0;
+        dataFields.add(new DataField(index++, "boolean_type", DataTypes.BOOLEAN()));
+        dataFields.add(new DataField(index++, "tinyint_type", DataTypes.TINYINT()));
+        dataFields.add(new DataField(index++, "smallint_type", DataTypes.SMALLINT()));
+        dataFields.add(new DataField(index++, "int_type", DataTypes.INT()));
+        dataFields.add(new DataField(index++, "bigint_type", DataTypes.BIGINT()));
+        dataFields.add(new DataField(index++, "float_type", DataTypes.FLOAT()));
+        dataFields.add(new DataField(index++, "double_type", DataTypes.DOUBLE()));
+        dataFields.add(new DataField(index++, "char_type", DataTypes.CHAR(10)));
+        dataFields.add(new DataField(index++, "varchar_type", DataTypes.VARCHAR(20)));
+        dataFields.add(new DataField(index++, "binary_type", DataTypes.BINARY(20)));
+        dataFields.add(new DataField(index++, "varbinary_type", DataTypes.VARBINARY(20)));
+        dataFields.add(new DataField(index++, "timestamp_type", DataTypes.TIMESTAMP(3)));
+        dataFields.add(new DataField(index++, "date_type", DataTypes.DATE()));
+        dataFields.add(new DataField(index++, "decimal_type", DataTypes.DECIMAL(10, 3)));
+        // None of the types above may be rejected by validation.
+        RowType rowType = new RowType(dataFields);
+        Assertions.assertDoesNotThrow(() -> fileFormat.validateDataFields(rowType));
+    }
+
+    @Test
+    public void testSupportedComplexDataTypes() {
+        ArrayList<DataField> dataFields = new ArrayList<>();
+        int index = 0;
+        dataFields.add(
+                new DataField(
+                        index++,
+                        "map_type",
+                        DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT())));
+        dataFields.add(new DataField(index++, "array_type", DataTypes.ARRAY(DataTypes.STRING())));
+        dataFields.add(
+                new DataField(
+                        index++,
+                        "row_type",
+                        DataTypes.ROW(DataTypes.STRING(), DataTypes.BIGINT())));
+        // Nested map, array and row types must be accepted as well.
+        RowType rowType = new RowType(dataFields);
+        Assertions.assertDoesNotThrow(() -> fileFormat.validateDataFields(rowType));
+    }
+
+    @Test
+    public void testUnsupportedDataTypes() {
+        ArrayList<DataField> dataFields = new ArrayList<>();
+        int index = 0;
+        dataFields.add(new DataField(index++, "timestamp_type", DataTypes.TIMESTAMP(6)));
+        // Unlike TIMESTAMP(3), precision 6 is expected to be rejected.
+        RowType rowType = new RowType(dataFields);
+        Assertions.assertThrows(
+                IllegalArgumentException.class, () -> fileFormat.validateDataFields(rowType));
+    }
+}
diff --git a/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcFileFormatTest.java b/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcFileFormatTest.java
index fd5699894..ae7fe3a30 100644
--- a/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcFileFormatTest.java
+++ b/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcFileFormatTest.java
@@ -20,9 +20,16 @@ package org.apache.paimon.format.orc;
 
 import org.apache.paimon.format.FileFormatFactory.FormatContext;
 import org.apache.paimon.options.Options;
+import org.apache.paimon.types.DataField;
+import org.apache.paimon.types.DataTypes;
+import org.apache.paimon.types.RowType;
 
+import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
 
+import java.util.ArrayList;
+import java.util.List;
+
 import static org.apache.paimon.format.orc.OrcFileFormatFactory.IDENTIFIER;
 import static org.assertj.core.api.Assertions.assertThat;
 
@@ -47,4 +54,46 @@ public class OrcFileFormatTest {
         assertThat(orc.orcProperties().getProperty(IDENTIFIER + ".haha", "")).isEqualTo("1");
         assertThat(orc.orcProperties().getProperty(IDENTIFIER + ".compress", "")).isEqualTo("zlib");
     }
+
+    @Test
+    public void testSupportedDataTypes() {
+        OrcFileFormat orc =
+                new OrcFileFormatFactory().create(new FormatContext(new Options(), 1024));
+        // All of the following field types are expected to pass ORC validation.
+        int index = 0;
+        List<DataField> dataFields = new ArrayList<>();
+        dataFields.add(new DataField(index++, "boolean_type", DataTypes.BOOLEAN()));
+        dataFields.add(new DataField(index++, "tinyint_type", DataTypes.TINYINT()));
+        dataFields.add(new DataField(index++, "smallint_type", DataTypes.SMALLINT()));
+        dataFields.add(new DataField(index++, "int_type", DataTypes.INT()));
+        dataFields.add(new DataField(index++, "bigint_type", DataTypes.BIGINT()));
+        dataFields.add(new DataField(index++, "float_type", DataTypes.FLOAT()));
+        dataFields.add(new DataField(index++, "double_type", DataTypes.DOUBLE()));
+        dataFields.add(new DataField(index++, "char_type", DataTypes.CHAR(10)));
+        dataFields.add(new DataField(index++, "varchar_type", DataTypes.VARCHAR(20)));
+        dataFields.add(new DataField(index++, "binary_type", DataTypes.BINARY(20)));
+        dataFields.add(new DataField(index++, "varbinary_type", DataTypes.VARBINARY(20)));
+        dataFields.add(new DataField(index++, "timestamp_type", DataTypes.TIMESTAMP(3)));
+        dataFields.add(new DataField(index++, "date_type", DataTypes.DATE()));
+        dataFields.add(new DataField(index++, "decimal_type", DataTypes.DECIMAL(10, 3)));
+        Assertions.assertDoesNotThrow(() -> orc.validateDataFields(new RowType(dataFields)));
+    }
+
+    @Test
+    public void testUnSupportedDataTypes() {
+        OrcFileFormat orc =
+                new OrcFileFormatFactory().create(new FormatContext(new Options(), 1024));
+
+        int index = 0;
+        List<DataField> dataFields = new ArrayList<>();
+        dataFields.add(
+                new DataField(
+                        index++,
+                        "timestamp_with_timezone",
+                        DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE()));
+        // Validation is expected to reject this field type with an exception.
+        Assertions.assertThrows(
+                UnsupportedOperationException.class,
+                () -> orc.validateDataFields(new RowType(dataFields)));
+    }
 }
diff --git a/paimon-format/src/test/java/org/apache/paimon/format/parquet/ParquetFileFormatTest.java b/paimon-format/src/test/java/org/apache/paimon/format/parquet/ParquetFileFormatTest.java
index 404965782..da53a7781 100644
--- a/paimon-format/src/test/java/org/apache/paimon/format/parquet/ParquetFileFormatTest.java
+++ b/paimon-format/src/test/java/org/apache/paimon/format/parquet/ParquetFileFormatTest.java
@@ -22,11 +22,17 @@ import org.apache.paimon.format.FileFormatFactory.FormatContext;
 import org.apache.paimon.options.ConfigOption;
 import org.apache.paimon.options.ConfigOptions;
 import org.apache.paimon.options.Options;
+import org.apache.paimon.types.DataField;
+import org.apache.paimon.types.DataTypes;
+import org.apache.paimon.types.RowType;
 
 import org.apache.parquet.format.CompressionCodec;
 import org.apache.parquet.hadoop.ParquetOutputFormat;
 import org.junit.jupiter.api.Test;
 
+import java.util.ArrayList;
+import java.util.List;
+
 import static org.apache.paimon.format.parquet.ParquetFileFormat.getParquetConfiguration;
 import static org.apache.paimon.format.parquet.ParquetFileFormatFactory.IDENTIFIER;
 import static org.assertj.core.api.Assertions.assertThat;
@@ -74,4 +80,28 @@ public class ParquetFileFormatTest {
         return getParquetConfiguration(parquet.formatOptions())
                 .getString(ParquetOutputFormat.COMPRESSION, null);
     }
+
+    @Test
+    public void testSupportedDataFields() {
+        // The field types below are expected to pass Parquet validation without an exception.
+        List<DataField> fields = new ArrayList<>();
+        int fieldId = 0;
+        fields.add(new DataField(fieldId++, "boolean_type", DataTypes.BOOLEAN()));
+        fields.add(new DataField(fieldId++, "tinyint_type", DataTypes.TINYINT()));
+        fields.add(new DataField(fieldId++, "smallint_type", DataTypes.SMALLINT()));
+        fields.add(new DataField(fieldId++, "int_type", DataTypes.INT()));
+        fields.add(new DataField(fieldId++, "bigint_type", DataTypes.BIGINT()));
+        fields.add(new DataField(fieldId++, "float_type", DataTypes.FLOAT()));
+        fields.add(new DataField(fieldId++, "double_type", DataTypes.DOUBLE()));
+        fields.add(new DataField(fieldId++, "char_type", DataTypes.CHAR(10)));
+        fields.add(new DataField(fieldId++, "varchar_type", DataTypes.VARCHAR(20)));
+        fields.add(new DataField(fieldId++, "binary_type", DataTypes.BINARY(20)));
+        fields.add(new DataField(fieldId++, "varbinary_type", DataTypes.VARBINARY(20)));
+        fields.add(new DataField(fieldId++, "timestamp_type", DataTypes.TIMESTAMP(3)));
+        fields.add(new DataField(fieldId++, "date_type", DataTypes.DATE()));
+        fields.add(new DataField(fieldId++, "decimal_type", DataTypes.DECIMAL(10, 3)));
+        ParquetFileFormat format =
+                new ParquetFileFormatFactory().create(new FormatContext(new Options(), 1024));
+        format.validateDataFields(new RowType(fields));
+    }
 }