You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by bl...@apache.org on 2017/10/10 19:37:19 UTC
parquet-format git commit: PARQUET-906: Add LogicalType annotation.
Repository: parquet-format
Updated Branches:
refs/heads/master f59258a05 -> 863875e0b
PARQUET-906: Add LogicalType annotation.
This commit adds a `LogicalType` union and a field for this logical type to `SchemaElement`. Adding a new structure for logical types is needed for a few reasons:
1. Adding to the ConvertedType enum is not forward-compatible. Adding new types to the `LogicalType` union is forward-compatible.
2. Using a struct for each type allows additional metadata, like `isAdjustedToUTC`, without adding more fields to `SchemaElement` that don't apply to all types.
3. Types without additional metadata can be extended later; for example, an `encoding` field can be added to `StringType` when it is needed.
Author: Ryan Blue <bl...@apache.org>
Closes #51 from rdblue/PARQUET-906-add-timestamp-adjustment-metadata and squashes the following commits:
ad8e91d [Ryan Blue] PARQUET-906: Clarify the use of NullType.
7cc29f7 [Ryan Blue] PARQUET-906: Rename NULL to UNKNOWN.
02f3868 [Ryan Blue] PARQUET-906: Update from comments on the PR.
c0386e9 [Ryan Blue] PARQUET-906: Remove NULL ConvertedType.
190bd8a [Ryan Blue] PARQUET-906: Update for review comments.
8203b21 [Ryan Blue] PARQUET-906: Add copyright header to LogicalTypes.
993102e [Ryan Blue] PARQUET-906: Remove the unreleased NULL ConvertedType.
86a22b4 [Ryan Blue] PARQUET-906: Add LogicalType annotation.
Project: http://git-wip-us.apache.org/repos/asf/parquet-format/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-format/commit/863875e0
Tree: http://git-wip-us.apache.org/repos/asf/parquet-format/tree/863875e0
Diff: http://git-wip-us.apache.org/repos/asf/parquet-format/diff/863875e0
Branch: refs/heads/master
Commit: 863875e0be3237c6aa4ed71733d54c91a51deabe
Parents: f59258a
Author: Ryan Blue <bl...@apache.org>
Authored: Tue Oct 10 12:37:15 2017 -0700
Committer: Ryan Blue <bl...@apache.org>
Committed: Tue Oct 10 12:37:15 2017 -0700
----------------------------------------------------------------------
.../org/apache/parquet/format/LogicalTypes.java | 55 +++++++++
src/main/thrift/parquet.thrift | 122 +++++++++++++++++--
2 files changed, 170 insertions(+), 7 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/parquet-format/blob/863875e0/src/main/java/org/apache/parquet/format/LogicalTypes.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/parquet/format/LogicalTypes.java b/src/main/java/org/apache/parquet/format/LogicalTypes.java
new file mode 100644
index 0000000..7c63e41
--- /dev/null
+++ b/src/main/java/org/apache/parquet/format/LogicalTypes.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.parquet.format;
+
+/**
+ * Convenience factory method and pre-built constants for the LogicalType union declared in parquet.thrift.
+ */
+public class LogicalTypes { // NOTE(review): all members are static; the constants are shared instances, so confirm callers never mutate them
+ public static class TimeUnits { // pre-built TimeUnit union values reused by the TIME_* and TIMESTAMP_* constants below
+ public static final TimeUnit MILLIS = TimeUnit.MILLIS(new MilliSeconds());
+ public static final TimeUnit MICROS = TimeUnit.MICROS(new MicroSeconds());
+ }
+
+ public static LogicalType DECIMAL(int scale, int precision) { // a method rather than a constant because DECIMAL carries per-column scale and precision
+ return LogicalType.DECIMAL(new DecimalType(scale, precision));
+ }
+
+ public static final LogicalType UTF8 = LogicalType.STRING(new StringType()); // constant is named UTF8 but wraps the STRING union member
+ public static final LogicalType MAP = LogicalType.MAP(new MapType());
+ public static final LogicalType LIST = LogicalType.LIST(new ListType());
+ public static final LogicalType ENUM = LogicalType.ENUM(new EnumType());
+ public static final LogicalType DATE = LogicalType.DATE(new DateType());
+ public static final LogicalType TIME_MILLIS = LogicalType.TIME(new TimeType(true, TimeUnits.MILLIS)); // boolean argument is isAdjustedToUTC; all four time constants pass true
+ public static final LogicalType TIME_MICROS = LogicalType.TIME(new TimeType(true, TimeUnits.MICROS));
+ public static final LogicalType TIMESTAMP_MILLIS = LogicalType.TIMESTAMP(new TimestampType(true, TimeUnits.MILLIS));
+ public static final LogicalType TIMESTAMP_MICROS = LogicalType.TIMESTAMP(new TimestampType(true, TimeUnits.MICROS));
+ public static final LogicalType INT_8 = LogicalType.INTEGER(new IntType((byte) 8, true)); // INT_* constants: bitWidth, isSigned = true
+ public static final LogicalType INT_16 = LogicalType.INTEGER(new IntType((byte) 16, true));
+ public static final LogicalType INT_32 = LogicalType.INTEGER(new IntType((byte) 32, true));
+ public static final LogicalType INT_64 = LogicalType.INTEGER(new IntType((byte) 64, true));
+ public static final LogicalType UINT_8 = LogicalType.INTEGER(new IntType((byte) 8, false)); // UINT_* constants: same bit widths, isSigned = false
+ public static final LogicalType UINT_16 = LogicalType.INTEGER(new IntType((byte) 16, false));
+ public static final LogicalType UINT_32 = LogicalType.INTEGER(new IntType((byte) 32, false));
+ public static final LogicalType UINT_64 = LogicalType.INTEGER(new IntType((byte) 64, false));
+ public static final LogicalType UNKNOWN = LogicalType.UNKNOWN(new NullType()); // the UNKNOWN union member is backed by the NullType struct
+ public static final LogicalType JSON = LogicalType.JSON(new JsonType());
+ public static final LogicalType BSON = LogicalType.BSON(new BsonType());
+}
http://git-wip-us.apache.org/repos/asf/parquet-format/blob/863875e0/src/main/thrift/parquet.thrift
----------------------------------------------------------------------
diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift
index d881c74..4c76cbd 100644
--- a/src/main/thrift/parquet.thrift
+++ b/src/main/thrift/parquet.thrift
@@ -174,13 +174,6 @@ enum ConvertedType {
* particular timezone or date.
*/
INTERVAL = 21;
-
- /**
- * Annotates a column that is always null
- * Sometimes when discovering the schema of existing data
- * values are always null
- */
- NULL = 25;
}
/**
@@ -231,6 +224,114 @@ struct Statistics {
6: optional binary min_value;
}
+/** Empty structs to use as logical type annotations; each is wrapped by a member of the LogicalType union */
+struct StringType {} // allowed for BINARY, must be encoded with UTF-8; exposed as STRING in the LogicalType union
+struct MapType {} // see LogicalTypes.md
+struct ListType {} // see LogicalTypes.md
+struct EnumType {} // allowed for BINARY, must be encoded with UTF-8
+struct DateType {} // allowed for INT32
+
+/**
+ * Logical type to annotate a column that is always null.
+ *
+ * Sometimes when discovering the schema of existing data, values are always
+ * null and the physical type can't be determined. This annotation signals
+ * the case where the physical type was guessed from all null values.
+ */
+struct NullType {} // allowed for any physical type, only null values stored; exposed as UNKNOWN (field 11) in the LogicalType union
+
+/**
+ * Decimal logical type annotation
+ *
+ * To maintain forward-compatibility in v1, implementations using this logical
+ * type must also set scale and precision on the annotated SchemaElement.
+ *
+ * Allowed for physical types: INT32, INT64, FIXED_LEN_BYTE_ARRAY, and BINARY
+ */
+struct DecimalType {
+ 1: required i32 scale
+ 2: required i32 precision
+}
+
+/** Time units for the TIME and TIMESTAMP logical types; the union member that is set identifies the unit */
+struct MilliSeconds {} // empty marker struct: carries no data of its own
+struct MicroSeconds {} // empty marker struct: carries no data of its own
+union TimeUnit {
+ 1: MilliSeconds MILLIS
+ 2: MicroSeconds MICROS
+}
+
+/**
+ * Timestamp logical type annotation
+ *
+ * Allowed for physical types: INT64
+ */
+struct TimestampType {
+ 1: required bool isAdjustedToUTC // NOTE(review): exact semantics are not defined in this file; presumably specified in LogicalTypes.md - confirm
+ 2: required TimeUnit unit // MILLIS or MICROS
+}
+
+/**
+ * Time logical type annotation
+ *
+ * Allowed for physical types: INT32 (millis), INT64 (micros)
+ */
+struct TimeType {
+ 1: required bool isAdjustedToUTC // NOTE(review): exact semantics are not defined in this file; presumably specified in LogicalTypes.md - confirm
+ 2: required TimeUnit unit // MILLIS pairs with INT32 storage, MICROS with INT64
+}
+
+/**
+ * Integer logical type annotation
+ *
+ * bitWidth must be 8, 16, 32, or 64.
+ *
+ * Allowed for physical types: INT32, INT64
+ */
+struct IntType {
+ 1: required byte bitWidth // width in bits: 8, 16, 32, or 64
+ 2: required bool isSigned // true for the signed INT_* annotations, false for the unsigned UINT_* ones
+}
+
+/**
+ * Embedded JSON logical type annotation
+ *
+ * Allowed for physical types: BINARY
+ */
+struct JsonType { // intentionally empty: declares no fields
+}
+
+/**
+ * Embedded BSON logical type annotation
+ *
+ * Allowed for physical types: BINARY
+ */
+struct BsonType { // intentionally empty: declares no fields
+}
+
+/**
+ * LogicalType annotations to replace ConvertedType.
+ *
+ * To maintain compatibility, implementations using LogicalType for a
+ * SchemaElement must also set the corresponding ConvertedType, where one
+ * exists, as named in the comment on each member below.
+ */
+union LogicalType {
+ 1: StringType STRING // use ConvertedType UTF8 if encoding is UTF-8
+ 2: MapType MAP // use ConvertedType MAP
+ 3: ListType LIST // use ConvertedType LIST
+ 4: EnumType ENUM // use ConvertedType ENUM
+ 5: DecimalType DECIMAL // use ConvertedType DECIMAL
+ 6: DateType DATE // use ConvertedType DATE
+ 7: TimeType TIME // use ConvertedType TIME_MICROS or TIME_MILLIS
+ 8: TimestampType TIMESTAMP // use ConvertedType TIMESTAMP_MICROS or TIMESTAMP_MILLIS
+ // 9: reserved for INTERVAL
+ 10: IntType INTEGER // use ConvertedType INT_* or UINT_*
+ 11: NullType UNKNOWN // no compatible ConvertedType
+ 12: JsonType JSON // use ConvertedType JSON
+ 13: BsonType BSON // use ConvertedType BSON
+}
+
/**
* Represents an element inside a schema definition.
* - if it is a group (inner node) then type is undefined and num_children is defined
@@ -278,6 +379,13 @@ struct SchemaElement {
*/
9: optional i32 field_id;
+ /**
+ * The logical type of this SchemaElement; only valid for primitives.
+ *
+ * LogicalType replaces ConvertedType, but ConvertedType is still required
+ * for some logical types to ensure forward-compatibility in format v1.
+ */
+ 10: optional LogicalType logicalType
}
/**