You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by ju...@apache.org on 2016/02/06 20:57:27 UTC

parquet-mr git commit: PARQUET-385 PARQUET-379: Fixes strict schema merging

Repository: parquet-mr
Updated Branches:
  refs/heads/master a4acf5333 -> c26fa7881


PARQUET-385 PARQUET-379: Fixes strict schema merging

This PR fixes strict mode schema merging. To merge two `PrimitiveType` `t1` and `t2`, they must satisfy the following conditions:

1. `t1` and `t2` have the same primitive type name
1. `t1` and `t2` either
   - don't have original type, or
   - have the same original type
1. If `t1` and `t2` are both `FIXED_LEN_BYTE_ARRAY`, they should have the same length

Also, merged schema now preserves original name if there's any.

Author: Cheng Lian <li...@databricks.com>

Closes #315 from liancheng/fix-strict-schema-merge and squashes the following commits:

a29138c [Cheng Lian] Addresses PR comment
1ac804e [Cheng Lian] Fixes strict schema merging


Project: http://git-wip-us.apache.org/repos/asf/parquet-mr/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-mr/commit/c26fa788
Tree: http://git-wip-us.apache.org/repos/asf/parquet-mr/tree/c26fa788
Diff: http://git-wip-us.apache.org/repos/asf/parquet-mr/diff/c26fa788

Branch: refs/heads/master
Commit: c26fa78817f30cc3eb91165b783e07fb80d80f59
Parents: a4acf53
Author: Cheng Lian <li...@databricks.com>
Authored: Sat Feb 6 11:57:19 2016 -0800
Committer: Julien Le Dem <ju...@dremio.com>
Committed: Sat Feb 6 11:57:19 2016 -0800

----------------------------------------------------------------------
 .../apache/parquet/schema/PrimitiveType.java    | 36 +++++++++++++++-----
 .../apache/parquet/schema/TestMessageType.java  | 35 ++++++++++++++++---
 2 files changed, 59 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/c26fa788/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java
----------------------------------------------------------------------
diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java
index 5c6e460..1cdc6c3 100644
--- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java
+++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java
@@ -1,4 +1,4 @@
-/* 
+/*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -6,9 +6,9 @@
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
- * 
+ *
  *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -514,16 +514,36 @@ public final class PrimitiveType extends Type {
     return union(toMerge, true);
   }
 
+  private void reportSchemaMergeError(Type toMerge) {
+    throw new IncompatibleSchemaModificationException("can not merge type " + toMerge + " into " + this);
+  }
+
   @Override
   protected Type union(Type toMerge, boolean strict) {
-    if (!toMerge.isPrimitive() || (strict && !primitive.equals(toMerge.asPrimitiveType().getPrimitiveTypeName()))) {
-      throw new IncompatibleSchemaModificationException("can not merge type " + toMerge + " into " + this);
+    if (!toMerge.isPrimitive()) {
+      reportSchemaMergeError(toMerge);
+    }
+
+    if (strict) {
+      // Can't merge primitive fields of different type names or different original types
+      if (!primitive.equals(toMerge.asPrimitiveType().getPrimitiveTypeName()) ||
+          getOriginalType() != toMerge.getOriginalType()) {
+        reportSchemaMergeError(toMerge);
+      }
+
+      // Can't merge FIXED_LEN_BYTE_ARRAY fields of different lengths
+      int toMergeLength = toMerge.asPrimitiveType().getTypeLength();
+      if (primitive == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY && length != toMergeLength) {
+        reportSchemaMergeError(toMerge);
+      }
     }
-    Types.PrimitiveBuilder<PrimitiveType> builder = Types.primitive(
-        primitive, toMerge.getRepetition());
+
+    Types.PrimitiveBuilder<PrimitiveType> builder = Types.primitive(primitive, toMerge.getRepetition());
+
     if (PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY == primitive) {
       builder.length(length);
     }
-    return builder.named(getName());
+
+    return builder.as(getOriginalType()).named(getName());
   }
 }

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/c26fa788/parquet-column/src/test/java/org/apache/parquet/schema/TestMessageType.java
----------------------------------------------------------------------
diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/TestMessageType.java b/parquet-column/src/test/java/org/apache/parquet/schema/TestMessageType.java
index ca5d939..438fae9 100644
--- a/parquet-column/src/test/java/org/apache/parquet/schema/TestMessageType.java
+++ b/parquet-column/src/test/java/org/apache/parquet/schema/TestMessageType.java
@@ -1,4 +1,4 @@
-/* 
+/*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -6,9 +6,9 @@
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
- * 
+ *
  *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -18,6 +18,7 @@
  */
 package org.apache.parquet.schema;
 
+import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.fail;
 import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
@@ -131,6 +132,33 @@ public class TestMessageType {
     } catch (IncompatibleSchemaModificationException e) {
       assertEquals("can not merge type optional int32 a into optional binary a", e.getMessage());
     }
+
+    MessageType t9 = Types.buildMessage()
+        .addField(Types.optional(BINARY).as(OriginalType.UTF8).named("a"))
+        .named("root1");
+    MessageType t10 = Types.buildMessage()
+        .addField(Types.optional(BINARY).named("a"))
+        .named("root1");
+    assertEquals(t9.union(t9), t9);
+    try {
+      t9.union(t10);
+      fail("moving from BINARY (UTF8) to BINARY");
+    } catch (IncompatibleSchemaModificationException e) {
+      assertEquals("can not merge type optional binary a into optional binary a (UTF8)", e.getMessage());
+    }
+
+    MessageType t11 = Types.buildMessage()
+        .addField(Types.optional(FIXED_LEN_BYTE_ARRAY).length(10).named("a"))
+        .named("root1");
+    MessageType t12 = Types.buildMessage()
+        .addField(Types.optional(FIXED_LEN_BYTE_ARRAY).length(20).named("a"))
+        .named("root2");
+    try {
+      t11.union(t12);
+      fail("moving from FIXED_LEN_BYTE_ARRAY(10) to FIXED_LEN_BYTE_ARRAY(20)");
+    } catch (IncompatibleSchemaModificationException e) {
+      assertEquals("can not merge type optional fixed_len_byte_array(20) a into optional fixed_len_byte_array(10) a", e.getMessage());
+    }
   }
 
   @Test
@@ -145,5 +173,4 @@ public class TestMessageType {
     assertEquals(schema, schema2);
     assertEquals(schema.toString(), schema2.toString());
   }
-
 }