You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by bl...@apache.org on 2017/10/10 19:37:19 UTC

parquet-format git commit: PARQUET-906: Add LogicalType annotation.

Repository: parquet-format
Updated Branches:
  refs/heads/master f59258a05 -> 863875e0b


PARQUET-906: Add LogicalType annotation.

This commit adds a `LogicalType` union and a field for this logical type to `SchemaElement`. Adding a new structure for logical types is needed for a few reasons:

1. Adding to the ConvertedType enum is not forward-compatible. Adding new types to the `LogicalType` union is forward-compatible.
2. Using a struct for each type allows additional metadata, like `isAdjustedToUTC`, without adding more fields to `SchemaElement` that don't apply to all types.
3. Types without additional metadata can be updated later. For example, an `encoding` field can be added to `StringType` when it is needed.

Author: Ryan Blue <bl...@apache.org>

Closes #51 from rdblue/PARQUET-906-add-timestamp-adjustment-metadata and squashes the following commits:

ad8e91d [Ryan Blue] PARQUET-906: Clarify the use of NullType.
7cc29f7 [Ryan Blue] PARQUET-906: Rename NULL to UNKNOWN.
02f3868 [Ryan Blue] PARQUET-906: Update from comments on the PR.
c0386e9 [Ryan Blue] PARQUET-906: Remove NULL ConvertedType.
190bd8a [Ryan Blue] PARQUET-906: Update for review comments.
8203b21 [Ryan Blue] PARQUET-906: Add copyright header to LogicalTypes.
993102e [Ryan Blue] PARQUET-906: Remove the unreleased NULL ConvertedType.
86a22b4 [Ryan Blue] PARQUET-906: Add LogicalType annotation.


Project: http://git-wip-us.apache.org/repos/asf/parquet-format/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-format/commit/863875e0
Tree: http://git-wip-us.apache.org/repos/asf/parquet-format/tree/863875e0
Diff: http://git-wip-us.apache.org/repos/asf/parquet-format/diff/863875e0

Branch: refs/heads/master
Commit: 863875e0be3237c6aa4ed71733d54c91a51deabe
Parents: f59258a
Author: Ryan Blue <bl...@apache.org>
Authored: Tue Oct 10 12:37:15 2017 -0700
Committer: Ryan Blue <bl...@apache.org>
Committed: Tue Oct 10 12:37:15 2017 -0700

----------------------------------------------------------------------
 .../org/apache/parquet/format/LogicalTypes.java |  55 +++++++++
 src/main/thrift/parquet.thrift                  | 122 +++++++++++++++++--
 2 files changed, 170 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-format/blob/863875e0/src/main/java/org/apache/parquet/format/LogicalTypes.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/parquet/format/LogicalTypes.java b/src/main/java/org/apache/parquet/format/LogicalTypes.java
new file mode 100644
index 0000000..7c63e41
--- /dev/null
+++ b/src/main/java/org/apache/parquet/format/LogicalTypes.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.parquet.format;
+
+/**
+ * Convenience instances of, and factory methods for, {@code LogicalType} union values wrapping the logical type structs.
+ */
+public class LogicalTypes {
+  public static class TimeUnits { // TimeUnit union values reused by the TIME_* and TIMESTAMP_* constants below
+    public static final TimeUnit MILLIS = TimeUnit.MILLIS(new MilliSeconds());
+    public static final TimeUnit MICROS = TimeUnit.MICROS(new MicroSeconds());
+  }
+
+  public static LogicalType DECIMAL(int scale, int precision) { // argument order is scale first, then precision
+    return LogicalType.DECIMAL(new DecimalType(scale, precision));
+  }
+
+  public static final LogicalType UTF8 = LogicalType.STRING(new StringType()); // constant is named UTF8 but wraps the STRING member
+  public static final LogicalType MAP  = LogicalType.MAP(new MapType());
+  public static final LogicalType LIST = LogicalType.LIST(new ListType());
+  public static final LogicalType ENUM = LogicalType.ENUM(new EnumType());
+  public static final LogicalType DATE = LogicalType.DATE(new DateType());
+  public static final LogicalType TIME_MILLIS = LogicalType.TIME(new TimeType(true, TimeUnits.MILLIS)); // true = isAdjustedToUTC
+  public static final LogicalType TIME_MICROS = LogicalType.TIME(new TimeType(true, TimeUnits.MICROS)); // true = isAdjustedToUTC
+  public static final LogicalType TIMESTAMP_MILLIS = LogicalType.TIMESTAMP(new TimestampType(true, TimeUnits.MILLIS)); // true = isAdjustedToUTC
+  public static final LogicalType TIMESTAMP_MICROS = LogicalType.TIMESTAMP(new TimestampType(true, TimeUnits.MICROS)); // true = isAdjustedToUTC
+  public static final LogicalType INT_8 = LogicalType.INTEGER(new IntType((byte) 8, true)); // IntType(bitWidth, isSigned); signed variants
+  public static final LogicalType INT_16 = LogicalType.INTEGER(new IntType((byte) 16, true));
+  public static final LogicalType INT_32 = LogicalType.INTEGER(new IntType((byte) 32, true));
+  public static final LogicalType INT_64 = LogicalType.INTEGER(new IntType((byte) 64, true));
+  public static final LogicalType UINT_8 = LogicalType.INTEGER(new IntType((byte) 8, false)); // isSigned = false for the UINT_* variants
+  public static final LogicalType UINT_16 = LogicalType.INTEGER(new IntType((byte) 16, false));
+  public static final LogicalType UINT_32 = LogicalType.INTEGER(new IntType((byte) 32, false));
+  public static final LogicalType UINT_64 = LogicalType.INTEGER(new IntType((byte) 64, false));
+  public static final LogicalType UNKNOWN = LogicalType.UNKNOWN(new NullType()); // the UNKNOWN member wraps NullType
+  public static final LogicalType JSON = LogicalType.JSON(new JsonType());
+  public static final LogicalType BSON = LogicalType.BSON(new BsonType());
+}

http://git-wip-us.apache.org/repos/asf/parquet-format/blob/863875e0/src/main/thrift/parquet.thrift
----------------------------------------------------------------------
diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift
index d881c74..4c76cbd 100644
--- a/src/main/thrift/parquet.thrift
+++ b/src/main/thrift/parquet.thrift
@@ -174,13 +174,6 @@ enum ConvertedType {
    * particular timezone or date.
    */
   INTERVAL = 21;
-
-  /**
-   * Annotates a column that is always null
-   * Sometimes when discovering the schema of existing data
-   * values are always null
-   */
-  NULL = 25;
 }
 
 /**
@@ -231,6 +224,114 @@ struct Statistics {
    6: optional binary min_value;
 }
 
+/** Empty structs to use as logical type annotations; being structs, they can gain metadata fields later if needed */
+struct StringType {}  // allowed for BINARY, must be encoded with UTF-8
+struct MapType {}     // see LogicalTypes.md
+struct ListType {}    // see LogicalTypes.md
+struct EnumType {}    // allowed for BINARY, must be encoded with UTF-8
+struct DateType {}    // allowed for INT32
+
+/**
+ * Logical type to annotate a column that is always null (the UNKNOWN member of LogicalType).
+ *
+ * Sometimes when discovering the schema of existing data, values are always
+ * null and the physical type can't be determined. This annotation signals
+ * the case where the physical type was guessed from all null values.
+ */
+struct NullType {}    // allowed for any physical type, only null values stored
+
+/**
+ * Decimal logical type annotation
+ *
+ * To maintain forward-compatibility in v1, implementations using this logical
+ * type must also set scale and precision on the annotated SchemaElement.
+ *
+ * Allowed for physical types: INT32, INT64, FIXED, and BINARY
+ */
+struct DecimalType {
+  1: required i32 scale      // must also be set on the annotated SchemaElement (see above)
+  2: required i32 precision  // must also be set on the annotated SchemaElement (see above)
+}
+
+/** Time units for logical types: empty marker structs, selected through the TimeUnit union */
+struct MilliSeconds {}
+struct MicroSeconds {}
+union TimeUnit {
+  1: MilliSeconds MILLIS
+  2: MicroSeconds MICROS
+}
+
+/**
+ * Timestamp logical type annotation (ConvertedType TIMESTAMP_MILLIS or TIMESTAMP_MICROS, depending on unit)
+ *
+ * Allowed for physical types: INT64
+ */
+struct TimestampType {
+  1: required bool isAdjustedToUTC
+  2: required TimeUnit unit  // MILLIS or MICROS
+}
+
+/**
+ * Time logical type annotation (ConvertedType TIME_MILLIS or TIME_MICROS, depending on unit)
+ *
+ * Allowed for physical types: INT32 (millis), INT64 (micros)
+ */
+struct TimeType {
+  1: required bool isAdjustedToUTC
+  2: required TimeUnit unit  // MILLIS or MICROS; determines the allowed physical type (see above)
+}
+
+/**
+ * Integer logical type annotation (ConvertedType INT_* when signed, UINT_* when unsigned)
+ *
+ * bitWidth must be 8, 16, 32, or 64.
+ *
+ * Allowed for physical types: INT32, INT64
+ */
+struct IntType {
+  1: required byte bitWidth  // one of 8, 16, 32, or 64 (see above)
+  2: required bool isSigned  // true for the INT_* annotations, false for UINT_*
+}
+
+/**
+ * Embedded JSON logical type annotation (ConvertedType JSON); currently carries no fields
+ *
+ * Allowed for physical types: BINARY
+ */
+struct JsonType {
+}
+
+/**
+ * Embedded BSON logical type annotation (ConvertedType BSON); currently carries no fields
+ *
+ * Allowed for physical types: BINARY
+ */
+struct BsonType {
+}
+
+/**
+ * LogicalType annotations to replace ConvertedType.
+ *
+ * To maintain compatibility, implementations using LogicalType for a
+ * SchemaElement must also set the corresponding ConvertedType from the
+ * table below (the trailing comment on each member names it).
+ */
+union LogicalType {
+  1:  StringType STRING       // use ConvertedType UTF8 if encoding is UTF-8
+  2:  MapType MAP             // use ConvertedType MAP
+  3:  ListType LIST           // use ConvertedType LIST
+  4:  EnumType ENUM           // use ConvertedType ENUM
+  5:  DecimalType DECIMAL     // use ConvertedType DECIMAL
+  6:  DateType DATE           // use ConvertedType DATE
+  7:  TimeType TIME           // use ConvertedType TIME_MICROS or TIME_MILLIS
+  8:  TimestampType TIMESTAMP // use ConvertedType TIMESTAMP_MICROS or TIMESTAMP_MILLIS
+  // 9: reserved for INTERVAL; field id 9 is intentionally left unused here
+  10: IntType INTEGER         // use ConvertedType INT_* or UINT_*
+  11: NullType UNKNOWN        // no compatible ConvertedType; leave ConvertedType unset
+  12: JsonType JSON           // use ConvertedType JSON
+  13: BsonType BSON           // use ConvertedType BSON
+}
+
 /**
  * Represents an element inside a schema definition.
  *  - if it is a group (inner node) then type is undefined and num_children is defined
@@ -278,6 +379,13 @@ struct SchemaElement {
    */
   9: optional i32 field_id;
 
+  /**
+   * The logical type of this SchemaElement.
+   *
+   * LogicalType replaces ConvertedType, but ConvertedType is still required
+   * for some logical types to ensure forward-compatibility in format v1.
+   */
+  10: optional LogicalType logicalType
 }
 
 /**