You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@parquet.apache.org by "ASF GitHub Bot (JIRA)" <ji...@apache.org> on 2018/11/05 21:55:00 UTC

[jira] [Commented] (PARQUET-1322) Statistics is not available for DECIMAL types

    [ https://issues.apache.org/jira/browse/PARQUET-1322?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16675797#comment-16675797 ] 

ASF GitHub Bot commented on PARQUET-1322:
-----------------------------------------

vrozov closed pull request #494: PARQUET-1322: Statistics is not available for DECIMAL types
URL: https://github.com/apache/parquet-mr/pull/494
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
index 2baad15cd..46524b7cd 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
@@ -702,6 +702,7 @@ private static SortOrder sortOrder(PrimitiveType primitive) {
         case TIME_MILLIS:
         case TIMESTAMP_MICROS:
         case TIMESTAMP_MILLIS:
+        case DECIMAL:
           return SortOrder.SIGNED;
         case UINT_8:
         case UINT_16:
@@ -712,7 +713,6 @@ private static SortOrder sortOrder(PrimitiveType primitive) {
         case BSON:
         case JSON:
           return SortOrder.UNSIGNED;
-        case DECIMAL:
         case LIST:
         case MAP:
         case MAP_KEY_VALUE:
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
index b3eebd6ae..50d4d9598 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
@@ -758,21 +758,56 @@ public void testV2OnlyStats() {
     testV2OnlyStats(Types.optional(PrimitiveTypeName.INT64).as(OriginalType.UINT_64).named(""),
         0x7FFFFFFFFFFFFFFFL,
         0x8000000000000000L);
-    testV2OnlyStats(Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.DECIMAL).precision(6).named(""),
+  }
+
+  private void testV2OnlyStats(PrimitiveType type, Object min, Object max) {
+    Statistics<?> stats = createStats(type, min, max);
+    org.apache.parquet.format.Statistics statistics = ParquetMetadataConverter.toParquetStatistics(stats);
+    assertFalse(statistics.isSetMin());
+    assertFalse(statistics.isSetMax());
+    assertEquals(ByteBuffer.wrap(stats.getMinBytes()), statistics.min_value);
+    assertEquals(ByteBuffer.wrap(stats.getMaxBytes()), statistics.max_value);
+  }
+
+  @Test
+  public void testDecimalStats() {
+    testDecimalStats(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.DECIMAL).precision(9).named(""),
+        Integer.MIN_VALUE,
+        Integer.MAX_VALUE);
+    testDecimalStats(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.DECIMAL).precision(9).named(""),
+        0,
+        255);
+    testDecimalStats(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.DECIMAL).precision(9).named(""),
+        -765875,
+        876856);
+    testDecimalStats(Types.optional(PrimitiveTypeName.INT64).as(OriginalType.DECIMAL).precision(18).named(""),
+        Long.MIN_VALUE,
+        Long.MAX_VALUE);
+    testDecimalStats(Types.optional(PrimitiveTypeName.INT64).as(OriginalType.DECIMAL).precision(18).named(""),
+        0L,
+        255L);
+    testDecimalStats(Types.optional(PrimitiveTypeName.INT64).as(OriginalType.DECIMAL).precision(9).named(""),
+        -765875L,
+        876856L);
+    testDecimalStats(Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(18).as(OriginalType.DECIMAL).precision(7)
+            .named(""),
+        BigInteger.valueOf(Long.MIN_VALUE),
+        BigInteger.valueOf(Long.MAX_VALUE));
+    testDecimalStats(Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.DECIMAL).precision(6).named(""),
         new BigInteger("-765875"),
         new BigInteger("876856"));
-    testV2OnlyStats(
+    testDecimalStats(
         Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(14).as(OriginalType.DECIMAL).precision(7)
             .named(""),
         new BigInteger("-6769643"),
         new BigInteger("9864675"));
   }
 
-  private void testV2OnlyStats(PrimitiveType type, Object min, Object max) {
+  private void testDecimalStats(PrimitiveType type, Object min, Object max) {
     Statistics<?> stats = createStats(type, min, max);
     org.apache.parquet.format.Statistics statistics = ParquetMetadataConverter.toParquetStatistics(stats);
-    assertFalse(statistics.isSetMin());
-    assertFalse(statistics.isSetMax());
+    assertTrue(statistics.isSetMin());
+    assertTrue(statistics.isSetMax());
     assertEquals(ByteBuffer.wrap(stats.getMinBytes()), statistics.min_value);
     assertEquals(ByteBuffer.wrap(stats.getMaxBytes()), statistics.max_value);
   }


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> Statistics is not available for DECIMAL types
> ---------------------------------------------
>
>                 Key: PARQUET-1322
>                 URL: https://issues.apache.org/jira/browse/PARQUET-1322
>             Project: Parquet
>          Issue Type: Bug
>    Affects Versions: 1.9.0, 1.10.0
>            Reporter: Vlad Rozov
>            Assignee: Vlad Rozov
>            Priority: Minor
>              Labels: pull-request-available
>
> According to the Parquet format specification, columns annotated as DECIMAL should use the SIGNED comparator, and statistics should be available for them. However, the sort order returned by {{org.apache.parquet.format.converter.ParquetMetadataConverter}} for DECIMAL is {{SortOrder.UNKNOWN}}, which contradicts the specification and makes statistics for DECIMAL types unavailable.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)