You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@iceberg.apache.org by GitBox <gi...@apache.org> on 2020/07/08 18:48:50 UTC

[GitHub] [iceberg] fbocse commented on a change in pull request #1177: Add schema upsert API for adding new fields and updating type (where applicable) using an input schema

fbocse commented on a change in pull request #1177:
URL: https://github.com/apache/iceberg/pull/1177#discussion_r451755790



##########
File path: core/src/test/java/org/apache/iceberg/TestSchemaUpdateSync.java
##########
@@ -0,0 +1,300 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg;
+
+import static org.apache.iceberg.types.Types.NestedField.optional;
+import static org.apache.iceberg.types.Types.NestedField.required;
+
+import com.google.common.collect.Lists;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
+import org.apache.iceberg.types.Type.PrimitiveType;
+import org.apache.iceberg.types.Types;
+import org.apache.iceberg.types.Types.BinaryType;
+import org.apache.iceberg.types.Types.BooleanType;
+import org.apache.iceberg.types.Types.DateType;
+import org.apache.iceberg.types.Types.DecimalType;
+import org.apache.iceberg.types.Types.DoubleType;
+import org.apache.iceberg.types.Types.FixedType;
+import org.apache.iceberg.types.Types.FloatType;
+import org.apache.iceberg.types.Types.IntegerType;
+import org.apache.iceberg.types.Types.LongType;
+import org.apache.iceberg.types.Types.NestedField;
+import org.apache.iceberg.types.Types.StringType;
+import org.apache.iceberg.types.Types.TimeType;
+import org.apache.iceberg.types.Types.TimestampType;
+import org.apache.iceberg.types.Types.UUIDType;
+import org.junit.Assert;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+
+public class TestSchemaUpdateSync {
+
+  @Rule
+  public ExpectedException thrown = ExpectedException.none();
+
+  private static List<? extends PrimitiveType> primitiveTypes() {
+    return Lists.newArrayList(StringType.get(),
+        TimeType.get(),
+        TimestampType.withoutZone(),
+        TimestampType.withZone(),
+        UUIDType.get(),
+        DateType.get(),
+        BooleanType.get(),
+        BinaryType.get(),
+        DoubleType.get(),
+        IntegerType.get(),
+        FixedType.ofLength(10),
+        DecimalType.of(10, 2),
+        LongType.get(),
+        FloatType.get()
+    );
+  }
+
+  private static NestedField[] primitiveFields(Integer initialValue, List<? extends PrimitiveType> primitiveTypes) {
+    AtomicInteger i = new AtomicInteger(initialValue);
+    return primitiveTypes.stream()
+        .map(type -> optional(i.incrementAndGet(), type.toString(),
+            Types.fromPrimitiveString(type.toString()))).toArray(NestedField[]::new);
+  }
+
+  @Test
+  public void testAddTopLevelPrimitives() {
+    Schema newSchema = new Schema(primitiveFields(0, primitiveTypes()));
+    Schema applied = new SchemaUpdate(new Schema(), 0).upsertSchema(newSchema).apply();
+    Assert.assertEquals(newSchema.asStruct(), applied.asStruct());
+  }
+
+  @Test
+  public void testAddTopLevelListOfPrimitives() {
+    for(PrimitiveType primitiveType : primitiveTypes()) {
+      Schema newSchema = new Schema(optional(1, "aList", Types.ListType.ofOptional(2, primitiveType)));
+      Schema applied = new SchemaUpdate(new Schema(), 0).upsertSchema(newSchema).apply();
+      Assert.assertEquals(newSchema.asStruct(), applied.asStruct());
+    }
+  }
+
+  @Test
+  public void testAddTopLevelMapOfPrimitives() {
+    for(PrimitiveType primitiveType : primitiveTypes()) {
+      Schema newSchema = new Schema(optional(1, "aMap", Types.MapType.ofOptional(2, 3, primitiveType, primitiveType)));
+      Schema applied = new SchemaUpdate(new Schema(), 0).upsertSchema(newSchema).apply();
+      Assert.assertEquals(newSchema.asStruct(), applied.asStruct());
+    }
+  }
+
+  @Test
+  public void testAddTopLevelStructOfPrimitives() {
+    for(PrimitiveType primitiveType : primitiveTypes()) {
+      Schema currentSchema = new Schema(optional(1, "aStruct", Types.StructType.of(
+          optional(2, "primitive", primitiveType))));
+      Schema applied = new SchemaUpdate(new Schema(), 0).upsertSchema(currentSchema).apply();
+      Assert.assertEquals(currentSchema.asStruct(), applied.asStruct());
+    }
+  }
+
+  @Test
+  public void testAddNestedPrimitive() {
+    for(PrimitiveType primitiveType : primitiveTypes()) {
+      Schema currentSchema = new Schema(optional(1, "aStruct", Types.StructType.of()));
+      Schema newSchema = new Schema(optional(1, "aStruct", Types.StructType.of(
+          optional(2, "primitive", primitiveType))));
+      Schema applied = new SchemaUpdate(currentSchema, 1).upsertSchema(newSchema).apply();
+      Assert.assertEquals(newSchema.asStruct(), applied.asStruct());
+    }
+  }
+
+  @Test
+  public void testAddNestedPrimitives() {
+    Schema currentSchema = new Schema(optional(1, "aStruct", Types.StructType.of()));
+    Schema newSchema = new Schema(optional(1, "aStruct", Types.StructType.of(
+        primitiveFields(1, primitiveTypes()))));
+    Schema applied = new SchemaUpdate(currentSchema, 1).upsertSchema(newSchema).apply();
+    Assert.assertEquals(newSchema.asStruct(), applied.asStruct());
+  }
+
+  @Test
+  public void testAddNestedLists() {
+    Schema newSchema = new Schema(optional(1, "aList",
+        Types.ListType.ofOptional(2,
+            Types.ListType.ofOptional(3,
+                Types.ListType.ofOptional(4,
+                    Types.ListType.ofOptional(5,
+                        Types.ListType.ofOptional(6,
+                            Types.ListType.ofOptional(7,
+                                Types.ListType.ofOptional(8,
+                                    Types.ListType.ofOptional(9,
+                                        Types.ListType.ofOptional(10,
+                                            DecimalType.of(11, 20))))))))))));
+    Schema applied = new SchemaUpdate(new Schema(), 0).upsertSchema(newSchema).apply();
+    Assert.assertEquals(newSchema.asStruct(), applied.asStruct());
+  }
+
+  @Test
+  public void testAddNestedStruct() {
+    Schema newSchema = new Schema(optional(1, "struct1", Types.StructType.of(
+            optional(2, "struct2", Types.StructType.of(
+                optional(3, "struct3", Types.StructType.of(
+                    optional(4, "struct4", Types.StructType.of(
+                        optional(5, "struct5", Types.StructType.of(
+                            optional(6, "struct6", Types.StructType.of(
+                                optional(7, "aString", StringType.get()))))))))))))));
+    Schema applied = new SchemaUpdate(new Schema(), 0).upsertSchema(newSchema).apply();
+    Assert.assertEquals(newSchema.asStruct(), applied.asStruct());
+  }
+
+  @Test
+  public void testAddNestedMaps() {
+    Schema newSchema = new Schema(optional(1, "struct", Types.MapType.ofOptional(
+        2, 3, StringType.get(), Types.MapType.ofOptional(
+            4, 5, StringType.get(), Types.MapType.ofOptional(
+                6, 7, StringType.get(), Types.MapType.ofOptional(
+                    8 ,9, StringType.get(), Types.MapType.ofOptional(
+                        10 ,11, StringType.get(), Types.MapType.ofOptional(
+                            12, 13, StringType.get(), StringType.get()))))))));
+    Schema applied = new SchemaUpdate(new Schema(), 0).upsertSchema(newSchema).apply();
+    Assert.assertEquals(newSchema.asStruct(), applied.asStruct());
+  }
+
+  @Test
+  public void testDetectInvalidTopLevelList() {
+    thrown.expect(IllegalArgumentException.class);
+    thrown.expectMessage("Cannot change column type: aList.element: string -> long");
+
+    Schema currentSchema = new Schema(optional(1, "aList",
+        Types.ListType.ofOptional(2, StringType.get())));
+    Schema newSchema = new Schema(optional(1, "aList",
+        Types.ListType.ofOptional(2, LongType.get())));
+    new SchemaUpdate(currentSchema, 2).upsertSchema(newSchema).apply();
+  }
+
+  @Test
+  public void testDetectInvalidTopLevelMapValue() {
+    thrown.expect(IllegalArgumentException.class);
+    thrown.expectMessage("Cannot change column type: aMap.value: string -> long");
+
+    Schema currentSchema = new Schema(optional(1, "aMap",
+        Types.MapType.ofOptional(2,3, StringType.get(), StringType.get())));
+    Schema newSchema = new Schema(optional(1, "aMap",
+        Types.MapType.ofOptional(2,3, StringType.get(), LongType.get())));
+    Schema apply = new SchemaUpdate(currentSchema, 3).upsertSchema(newSchema).apply();
+    System.out.println(apply.toString());
+  }
+
+  @Test
+  public void testDetectInvalidTopLevelMapKey() {
+    thrown.expect(IllegalArgumentException.class);
+    thrown.expectMessage("Cannot change column type: aMap.key: string -> uuid");
+
+    Schema currentSchema = new Schema(optional(1, "aMap",
+        Types.MapType.ofOptional(2,3 , StringType.get(), StringType.get())));
+    Schema newSchema = new Schema(optional(1, "aMap",
+        Types.MapType.ofOptional(2,3 , UUIDType.get(), StringType.get())));
+    new SchemaUpdate(currentSchema, 3).upsertSchema(newSchema).apply();
+  }
+
+  @Test
+  // int	32-bit signed integers -> Can promote to long
+  public void testTypePromoteIntegerToLong() {
+    Schema currentSchema = new Schema(required(1, "aCol", IntegerType.get()));
+    Schema newSchema = new Schema(required(1, "aCol", LongType.get()));
+
+    Schema applied = new SchemaUpdate(currentSchema, 1).upsertSchema(newSchema).apply();
+    Assert.assertEquals(1, applied.asStruct().fields().size());
+    Assert.assertEquals(LongType.get(), applied.asStruct().fields().get(0).type());
+  }
+
+  @Test
+  // float	32-bit IEEE 754 floating point -> Can promote to double
+  public void testTypePromoteFloatToDouble() {
+    Schema currentSchema = new Schema(required(1, "aCol", FloatType.get()));
+    Schema newSchema = new Schema(required(1, "aCol", DoubleType.get()));
+
+    Schema applied = new SchemaUpdate(currentSchema, 1).upsertSchema(newSchema).apply();
+    Assert.assertEquals(1, applied.asStruct().fields().size());
+    Assert.assertEquals(DoubleType.get(), applied.asStruct().fields().get(0).type());
+    // Doesn't work Assert.assertEquals(newSchema.asStruct(), applied.asStruct());
+    // java.lang.AssertionError:
+    // Expected :struct<1: aCol: required double>
+    // Actual   :struct<1: aCol: required double ()>
+  }
+
+  @Test
+  public void testInvalidTypePromoteDoubleToFloat() {
+    thrown.expect(IllegalArgumentException.class);
+    thrown.expectMessage("Cannot change column type: aCol: double -> float");
+
+    Schema currentSchema = new Schema(required(1, "aCol", DoubleType.get()));
+    Schema newSchema = new Schema(required(1, "aCol", FloatType.get()));
+
+    new SchemaUpdate(currentSchema, 1).upsertSchema(newSchema).apply();
+  }
+
+  @Test
+  // decimal(P,S)	Fixed-point decimal; precision P, scale S	-> Scale is fixed [1], precision must be 38 or less
+  public void testTypePromoteDecimalToFixedScaleWithWiderPrecision() {
+    Schema currentSchema = new Schema(required(1, "aCol", DecimalType.of(20, 1)));
+    Schema newSchema = new Schema(required(1, "aCol", DecimalType.of(22, 1)));
+
+    Schema applied = new SchemaUpdate(currentSchema, 1).upsertSchema(newSchema).apply();
+    Assert.assertEquals(newSchema.asStruct(), applied.asStruct());
+  }
+
+  @Test
+  public void testAddPrimitiveToNestedStruct() {
+    Schema schema = new Schema(
+        required(1, "struct1", Types.StructType.of(
+          optional(2, "struct2", Types.StructType.of(
+                  optional(3, "list", Types.ListType.ofOptional(
+                      4, Types.StructType.of(
+                          optional(5, "value", StringType.get())))))))));
+
+    Schema newSchema = new Schema(
+        required(1, "struct1", Types.StructType.of(
+            optional(2, "struct2", Types.StructType.of(
+                optional(3, "list", Types.ListType.ofOptional(
+                    4, Types.StructType.of(
+                        optional(5, "time", TimeType.get())))))))));

Review comment:
       Right, nor should it, the main use-case for this API is to accommodate schema unions as @rdblue's summed it up during our chat  in the Iceberg sync-up today.
   The use-case is as follows:
   
   1.  think a dataset with a large schema - call it `effective schema` 
   2. we get data that only contains particular columns - those columns make up only for a subset of the large schema - call it `observed schema` 
   3. as these chunks of data and their respective observed schemas are being landed in the iceberg table we want to keep the table schema on-par with the union of these observed schemas
   
   So in this particular `expected` is the union of `schema` and `newSchema`. 
   Does this make sense? 
   Does this resemble your use-case as well?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@iceberg.apache.org
For additional commands, e-mail: issues-help@iceberg.apache.org