You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@parquet.apache.org by "ASF GitHub Bot (JIRA)" <ji...@apache.org> on 2018/09/28 15:30:00 UTC

[jira] [Commented] (PARQUET-1369) [Python] Unavailable Parquet column statistics from Spark-generated file

    [ https://issues.apache.org/jira/browse/PARQUET-1369?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16632014#comment-16632014 ] 

ASF GitHub Bot commented on PARQUET-1369:
-----------------------------------------

rgruener closed pull request #491: PARQUET-1369: Disregard column sort order if statistics max/min are equal
URL: https://github.com/apache/parquet-cpp/pull/491
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/src/parquet/metadata-test.cc b/src/parquet/metadata-test.cc
index 53653bd7..25bab380 100644
--- a/src/parquet/metadata-test.cc
+++ b/src/parquet/metadata-test.cc
@@ -20,6 +20,7 @@
 #include "parquet/schema.h"
 #include "parquet/statistics.h"
 #include "parquet/types.h"
+#include "parquet/parquet_types.h"
 
 namespace parquet {
 
@@ -219,12 +220,36 @@ TEST(ApplicationVersion, Basics) {
 
   ASSERT_EQ(true, version.VersionLt(version1));
 
-  ASSERT_FALSE(version1.HasCorrectStatistics(Type::INT96, SortOrder::UNKNOWN));
-  ASSERT_TRUE(version.HasCorrectStatistics(Type::INT32, SortOrder::SIGNED));
-  ASSERT_FALSE(version.HasCorrectStatistics(Type::BYTE_ARRAY, SortOrder::SIGNED));
-  ASSERT_TRUE(version1.HasCorrectStatistics(Type::BYTE_ARRAY, SortOrder::SIGNED));
-  ASSERT_TRUE(
-      version3.HasCorrectStatistics(Type::FIXED_LEN_BYTE_ARRAY, SortOrder::SIGNED));
+  EncodedStatistics stats;
+  ASSERT_FALSE(version1.HasCorrectStatistics(Type::INT96, stats, SortOrder::UNKNOWN));
+  ASSERT_TRUE(version.HasCorrectStatistics(Type::INT32, stats, SortOrder::SIGNED));
+  ASSERT_FALSE(version.HasCorrectStatistics(Type::BYTE_ARRAY, stats, SortOrder::SIGNED));
+  ASSERT_TRUE(version1.HasCorrectStatistics(Type::BYTE_ARRAY, stats, SortOrder::SIGNED));
+  ASSERT_FALSE(version1.HasCorrectStatistics(Type::BYTE_ARRAY, stats,
+                                             SortOrder::UNSIGNED));
+  ASSERT_TRUE(version3.HasCorrectStatistics(Type::FIXED_LEN_BYTE_ARRAY,
+                                            stats, SortOrder::SIGNED));
+
+  // Check that the old stats are correct if min and max are the same
+  // regardless of sort order
+  EncodedStatistics stats_str;
+  stats_str.set_min("a").set_max("b");
+  ASSERT_FALSE(version1.HasCorrectStatistics(Type::BYTE_ARRAY,
+                                             stats_str, SortOrder::UNSIGNED));
+  stats_str.set_max("a");
+  ASSERT_TRUE(version1.HasCorrectStatistics(Type::BYTE_ARRAY,
+                                            stats_str, SortOrder::UNSIGNED));
+
+  // Check that the same holds true for ints
+  int32_t int_min = 100, int_max = 200;
+  EncodedStatistics stats_int;
+  stats_int.set_min(std::string(reinterpret_cast<const char*>(&int_min), 4))
+          .set_max(std::string(reinterpret_cast<const char*>(&int_max), 4));
+  ASSERT_FALSE(version1.HasCorrectStatistics(Type::BYTE_ARRAY,
+                                             stats_int, SortOrder::UNSIGNED));
+  stats_int.set_max(std::string(reinterpret_cast<const char*>(&int_min), 4));
+  ASSERT_TRUE(version1.HasCorrectStatistics(Type::BYTE_ARRAY,
+                                            stats_int, SortOrder::UNSIGNED));
 }
 
 }  // namespace metadata
diff --git a/src/parquet/metadata.cc b/src/parquet/metadata.cc
index 1cab51f0..3414d258 100644
--- a/src/parquet/metadata.cc
+++ b/src/parquet/metadata.cc
@@ -99,7 +99,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
     for (auto encoding : meta_data.encodings) {
       encodings_.push_back(FromThrift(encoding));
     }
-    stats_ = nullptr;
+    possible_stats_ = nullptr;
   }
   ~ColumnChunkMetaDataImpl() {}
 
@@ -124,15 +124,19 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
   //    Eg: UTF8
   inline bool is_stats_set() const {
     DCHECK(writer_version_ != nullptr);
-    return column_->meta_data.__isset.statistics &&
-           writer_version_->HasCorrectStatistics(type(), descr_->sort_order());
+    if (!column_->meta_data.__isset.statistics) {
+      return false;
+    }
+    if (possible_stats_ == nullptr) {
+      possible_stats_ = MakeColumnStats(column_->meta_data, descr_);
+    }
+    EncodedStatistics encodedStatistics = possible_stats_->Encode();
+    return writer_version_->HasCorrectStatistics(type(), encodedStatistics,
+                                                 descr_->sort_order());
   }
 
   inline std::shared_ptr<RowGroupStatistics> statistics() const {
-    if (stats_ == nullptr && is_stats_set()) {
-      stats_ = MakeColumnStats(column_->meta_data, descr_);
-    }
-    return stats_;
+    return is_stats_set() ? possible_stats_ : nullptr;
   }
 
   inline Compression::type compression() const {
@@ -168,7 +172,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
   }
 
  private:
-  mutable std::shared_ptr<RowGroupStatistics> stats_;
+  mutable std::shared_ptr<RowGroupStatistics> possible_stats_;
   std::vector<Encoding::type> encodings_;
   const format::ColumnChunk* column_;
   const ColumnDescriptor* descr_;
@@ -528,12 +532,19 @@ bool ApplicationVersion::VersionEq(const ApplicationVersion& other_version) cons
 // Reference:
 // parquet-mr/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java
 // PARQUET-686 has more disussion on statistics
-bool ApplicationVersion::HasCorrectStatistics(Type::type col_type,
-                                              SortOrder::type sort_order) const {
+bool ApplicationVersion::HasCorrectStatistics(
+        Type::type col_type,
+        EncodedStatistics &statistics,
+        SortOrder::type sort_order) const {
   // Parquet cpp version 1.3.0 onwards stats are computed correctly for all types
-  if ((application_ != "parquet-cpp") || (VersionLt(PARQUET_CPP_FIXED_STATS_VERSION()))) {
-    // Only SIGNED are valid
-    if (SortOrder::SIGNED != sort_order) {
+  if ((application_ != "parquet-cpp")
+      || (VersionLt(PARQUET_CPP_FIXED_STATS_VERSION()))) {
+    // Only SIGNED are valid unless max and min are the same
+    // (in which case the sort order does not matter)
+    bool max_equals_min = statistics.has_min && statistics.has_max ?
+                          statistics.min() == statistics.max() :
+                          false;
+    if (SortOrder::SIGNED != sort_order && !max_equals_min) {
       return false;
     }
 
diff --git a/src/parquet/metadata.h b/src/parquet/metadata.h
index 5d51e3d2..7e426dea 100644
--- a/src/parquet/metadata.h
+++ b/src/parquet/metadata.h
@@ -84,6 +84,7 @@ class ApplicationVersion {
 
   // Checks if the Version has the correct statistics for a given column
   bool HasCorrectStatistics(Type::type primitive,
+                            EncodedStatistics &statistics,
                             SortOrder::type sort_order = SortOrder::SIGNED) const;
 };
 


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> [Python] Unavailable Parquet column statistics from Spark-generated file
> ------------------------------------------------------------------------
>
>                 Key: PARQUET-1369
>                 URL: https://issues.apache.org/jira/browse/PARQUET-1369
>             Project: Parquet
>          Issue Type: Bug
>          Components: parquet-cpp
>    Affects Versions: cpp-1.4.0
>            Reporter: Robert Gruener
>            Assignee: Robert Gruener
>            Priority: Major
>              Labels: parquet, pull-request-available
>             Fix For: cpp-1.5.0
>
>          Time Spent: 10m
>  Remaining Estimate: 0h
>
> I have a dataset generated by spark which shows it has statistics for the string column when using the java parquet-mr code (shown by using `parquet-tools meta`) however reading from pyarrow shows that the statistics for that column are not set.  I should note the column only has a single value, though it still seems like a problem that pyarrow can't recognize it (it can recognize statistics set for the long and double types).
> See https://github.com/apache/arrow/files/2161147/metadata.zip for file example.
> Pyarrow Code To Check Statistics:
> {code}
> from pyarrow import parquet as pq
> meta = pq.read_metadata('/tmp/metadata.parquet')
> # No Statistics For String Column, prints false and statistics object is None
> print(meta.row_group(0).column(1).is_stats_set)
> {code}
> Example parquet-meta output:
> {code}
> file schema: spark_schema 
> --------------------------------------------------------------------------------
> int:         REQUIRED INT64 R:0 D:0
> string:      OPTIONAL BINARY O:UTF8 R:0 D:1
> float:       REQUIRED DOUBLE R:0 D:0
> row group 1: RC:8333 TS:76031 OFFSET:4 
> --------------------------------------------------------------------------------
> int:          INT64 SNAPPY DO:0 FPO:4 SZ:7793/8181/1.05 VC:8333 ENC:PLAIN_DICTIONARY,BIT_PACKED ST:[min: 0, max: 100, num_nulls: 0]
> string:       BINARY SNAPPY DO:0 FPO:7797 SZ:1146/1139/0.99 VC:8333 ENC:PLAIN_DICTIONARY,BIT_PACKED,RLE ST:[min: hello, max: hello, num_nulls: 4192]
> float:        DOUBLE SNAPPY DO:0 FPO:8943 SZ:66720/66711/1.00 VC:8333 ENC:PLAIN,BIT_PACKED ST:[min: 0.0057611096964338415, max: 99.99811053829232, num_nulls: 0]
> {code}
> I realize the column only has a single value though it still seems like pyarrow should be able to read the statistics set. I made this here and not a JIRA since I wanted to be sure this is actually an issue and there wasn't a ticket already made there (I couldn't find one but I wanted to be sure). Either way I would like to understand why this is.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)