You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by uw...@apache.org on 2017/06/17 17:33:49 UTC

parquet-cpp git commit: PARQUET-1007: Update parquet.thrift

Repository: parquet-cpp
Updated Branches:
  refs/heads/master 52c878919 -> fc6021414


PARQUET-1007: Update parquet.thrift

Author: Deepak Majeti <de...@hpe.com>

Closes #353 from majetideepak/PARQUET-1007 and squashes the following commits:

5bb1d57 [Deepak Majeti] use NA instead of NULL, use parquet.format
155d930 [Deepak Majeti] PARQUET-1007: Update parquet.thrift


Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/fc602141
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/fc602141
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/fc602141

Branch: refs/heads/master
Commit: fc602141410084ec66b618607b81f58bf11c79fb
Parents: 52c8789
Author: Deepak Majeti <de...@hpe.com>
Authored: Sat Jun 17 19:33:44 2017 +0200
Committer: Uwe L. Korn <uw...@apache.org>
Committed: Sat Jun 17 19:33:44 2017 +0200

----------------------------------------------------------------------
 src/parquet/parquet.thrift | 68 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 66 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/fc602141/src/parquet/parquet.thrift
----------------------------------------------------------------------
diff --git a/src/parquet/parquet.thrift b/src/parquet/parquet.thrift
index 2e840d8..f774d49 100644
--- a/src/parquet/parquet.thrift
+++ b/src/parquet/parquet.thrift
@@ -23,13 +23,24 @@ cpp_include "parquet/util/windows_compatibility.h"
  * File format description for the parquet file format
  */
 namespace cpp parquet.format
-namespace java parquet.format
+namespace java org.apache.parquet.format
 
 /**
  * Types supported by Parquet.  These types are intended to be used in combination
  * with the encodings to control the on disk storage format.
  * For example INT16 is not included as a type since a good encoding of INT32
  * would handle this.
+ *
+ * When a logical type is not present, the type-defined sort orders of these
+ * physical types are:
+ * * BOOLEAN - false, true
+ * * INT32 - signed comparison
+ * * INT64 - signed comparison
+ * * INT96 - signed comparison
+ * * FLOAT - signed comparison
+ * * DOUBLE - signed comparison
+ * * BYTE_ARRAY - unsigned byte-wise comparison
+ * * FIXED_LEN_BYTE_ARRAY - unsigned byte-wise comparison
  */
 enum Type {
   BOOLEAN = 0;
@@ -177,6 +188,13 @@ enum ConvertedType {
    */
   INTERVAL = 21;
 
+  /**
+   * Annotates a column that is always null.
+   * Sometimes, when discovering the schema of existing data,
+   * values are always null.
+   * This is called NULL in parquet-format.
+   */
+  NA = 25;
 }
 
 /**
@@ -198,13 +216,33 @@ enum FieldRepetitionType {
  * All fields are optional.
  */
 struct Statistics {
-   /** min and max value of the column, encoded in PLAIN encoding */
+   /**
+    * DEPRECATED: min and max value of the column. Use min_value and max_value.
+    *
+    * Values are encoded using PLAIN encoding, except that variable-length byte
+    * arrays do not include a length prefix.
+    *
+    * These fields encode min and max values determined by SIGNED comparison
+    * only. New files should use the correct order for a column's logical type
+    * and store the values in the min_value and max_value fields.
+    *
+    * To support older readers, these may be set when the column order is
+    * SIGNED.
+    */
    1: optional binary max;
    2: optional binary min;
    /** count of null value in the column */
    3: optional i64 null_count;
    /** count of distinct values occurring */
    4: optional i64 distinct_count;
+   /**
+    * Min and max values for the column, determined by its ColumnOrder.
+    *
+    * Values are encoded using PLAIN encoding, except that variable-length byte
+    * arrays do not include a length prefix.
+    */
+   5: optional binary max_value;
+   6: optional binary min_value;
 }
 
 /**
@@ -543,6 +581,23 @@ struct RowGroup {
   4: optional list<SortingColumn> sorting_columns
 }
 
+/** Empty struct to signal the order defined by the physical or logical type */
+struct TypeDefinedOrder {}
+
+/**
+ * Union to specify the order used for min, max, and sorting values in a column.
+ *
+ * Possible values are:
+ * * TypeDefinedOrder - the column uses the order defined by its logical or
+ *                      physical type (if there is no logical type).
+ *
+ * If the reader does not support the value of this union, min and max stats
+ * for this column should be ignored.
+ */
+union ColumnOrder {
+  1: TypeDefinedOrder TYPE_ORDER;
+}
+
 /**
  * Description for file metadata
  */
@@ -572,5 +627,14 @@ struct FileMetaData {
    * e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55)
    **/
   6: optional string created_by
+
+  /**
+   * Sort order used for each column in this file.
+   *
+   * If this list is not present, then the order for each column is assumed to
+   * be signed. In addition, min and max values for INTERVAL or DECIMAL stored
+   * as FIXED_LEN_BYTE_ARRAY or BYTE_ARRAY should be ignored.
+   */
+  7: optional list<ColumnOrder> column_orders;
 }