You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "pitrou (via GitHub)" <gi...@apache.org> on 2023/07/06 15:31:31 UTC

[GitHub] [arrow] pitrou commented on a diff in pull request #35989: GH-34351: [C++][Parquet] Statistics: add detail documentation and tiny optimization

pitrou commented on code in PR #35989:
URL: https://github.com/apache/arrow/pull/35989#discussion_r1254600944


##########
cpp/src/parquet/statistics.cc:
##########
@@ -609,9 +618,10 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
     }
     if (HasNullCount()) {
       s.set_null_count(this->null_count());
+      // num_values_ is reliable and it means number of non-null values.
+      s.all_null_value = num_values_ == 0;
     }
-    // num_values_ is reliable and it means number of non-null values.
-    s.all_null_value = num_values_ == 0;
+    // FIXME(mwish): distinct count is not encoded for now.

Review Comment:
   Can you open an issue for this? FIXMEs in the code will be forgotten...



##########
cpp/src/parquet/statistics_test.cc:
##########
@@ -590,6 +593,178 @@ TYPED_TEST(TestNumericStatistics, Equals) {
   ASSERT_NO_FATAL_FAILURE(this->TestEquals());
 }
 
+template <typename TestType>
+class TestStatisticsHasFlag : public TestStatistics<TestType> {
+ public:
+  void SetUp() override {
+    TestStatistics<TestType>::SetUp();
+    this->SetUpSchema(Repetition::OPTIONAL);
+  }
+
+  std::shared_ptr<TypedStatistics<TestType>> MergedStatistics(
+      const TypedStatistics<TestType>& stats1, const TypedStatistics<TestType>& stats2) {
+    auto chunk_statistics = MakeStatistics<TestType>(this->schema_.Column(0));
+    chunk_statistics->Merge(stats1);
+    chunk_statistics->Merge(stats2);
+    return chunk_statistics;
+  }
+
+  void VerifyMergedStatistics(
+      const TypedStatistics<TestType>& stats1, const TypedStatistics<TestType>& stats2,
+      const std::function<void(TypedStatistics<TestType>*)>& test_fn) {
+    ASSERT_NO_FATAL_FAILURE(test_fn(MergedStatistics(stats1, stats2).get()));
+    ASSERT_NO_FATAL_FAILURE(test_fn(MergedStatistics(stats2, stats1).get()));
+  }
+
+  // Distinct count should set to false when Merge is called.
+  void TestMergeDistinctCount() {
+    // Create a statistics object with distinct count.
+    std::shared_ptr<TypedStatistics<TestType>> statistics1;
+    {
+      EncodedStatistics encoded_statistics1;
+      statistics1 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+          Statistics::Make(this->schema_.Column(0), &encoded_statistics1,
+                           /*num_values=*/1000));
+      EXPECT_FALSE(statistics1->HasDistinctCount());
+    }
+
+    // Create a statistics object with distinct count.
+    std::shared_ptr<TypedStatistics<TestType>> statistics2;
+    {
+      EncodedStatistics encoded_statistics2;
+      encoded_statistics2.has_distinct_count = true;
+      encoded_statistics2.distinct_count = 500;
+      statistics2 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+          Statistics::Make(this->schema_.Column(0), &encoded_statistics2,
+                           /*num_values=*/1000));
+      EXPECT_TRUE(statistics2->HasDistinctCount());
+    }
+
+    VerifyMergedStatistics(*statistics1, *statistics2,
+                           [](TypedStatistics<TestType>* merged_statistics) {
+                             EXPECT_FALSE(merged_statistics->HasDistinctCount());
+                             EXPECT_FALSE(merged_statistics->Encode().has_distinct_count);
+                           });
+  }
+
+  // If all values in a page are null or nan, its stats should not set min-max.
+  // Merging its stats with another page having good min-max stats should not
+  // drop the valid min-max from the latter page.
+  void TestMergeMinMax() {
+    this->GenerateData(1000);
+    // Create a statistics object without min-max.
+    std::shared_ptr<TypedStatistics<TestType>> statistics1;
+    {
+      statistics1 = MakeStatistics<TestType>(this->schema_.Column(0));
+      statistics1->Update(this->values_ptr_, /*num_values=*/0,
+                          /*null_count=*/this->values_.size());
+      auto encoded_stats1 = statistics1->Encode();
+      EXPECT_FALSE(statistics1->HasMinMax());
+      EXPECT_FALSE(encoded_stats1.has_min);
+      EXPECT_FALSE(encoded_stats1.has_max);
+    }
+    // Create a statistics object with min-max.
+    std::shared_ptr<TypedStatistics<TestType>> statistics2;
+    {
+      statistics2 = MakeStatistics<TestType>(this->schema_.Column(0));
+      statistics2->Update(this->values_ptr_, this->values_.size(), 0);
+      auto encoded_stats2 = statistics2->Encode();
+      EXPECT_TRUE(statistics2->HasMinMax());
+      EXPECT_TRUE(encoded_stats2.has_min);
+      EXPECT_TRUE(encoded_stats2.has_max);
+    }
+    VerifyMergedStatistics(*statistics1, *statistics2,
+                           [](TypedStatistics<TestType>* merged_statistics) {
+                             EXPECT_TRUE(merged_statistics->HasMinMax());
+                             EXPECT_TRUE(merged_statistics->Encode().has_min);
+                             EXPECT_TRUE(merged_statistics->Encode().has_max);
+                           });
+  }
+
+  // Default statistics should have null_count even if no nulls is written.
+  // However, if statistics is created from thrift message, it might not
+  // have null_count. Merging statistics from such page will result in an
+  // invalid null_count as well.
+  void TestMergeNullCount() {
+    this->GenerateData(/*num_values=*/1000);
+
+    // Page should have null-count even if no nulls
+    std::shared_ptr<TypedStatistics<TestType>> statistics1;
+    {
+      statistics1 = MakeStatistics<TestType>(this->schema_.Column(0));
+      statistics1->Update(this->values_ptr_, /*num_values=*/this->values_.size(),
+                          /*null_count=*/0);
+      auto encoded_stats1 = statistics1->Encode();
+      EXPECT_TRUE(statistics1->HasNullCount());
+      EXPECT_EQ(0, statistics1->null_count());
+      EXPECT_TRUE(statistics1->Encode().has_null_count);
+    }
+    // Merge with null-count should also have null count
+    VerifyMergedStatistics(*statistics1, *statistics1,
+                           [](TypedStatistics<TestType>* merged_statistics) {
+                             EXPECT_TRUE(merged_statistics->HasNullCount());
+                             EXPECT_EQ(0, merged_statistics->null_count());
+                             auto encoded = merged_statistics->Encode();
+                             EXPECT_TRUE(encoded.has_null_count);
+                             EXPECT_EQ(0, encoded.null_count);
+                           });
+
+    // When loaded from thrift, might not have null count.
+    std::shared_ptr<TypedStatistics<TestType>> statistics2;
+    {
+      EncodedStatistics encoded_statistics2;
+      encoded_statistics2.has_null_count = false;
+      statistics2 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+          Statistics::Make(this->schema_.Column(0), &encoded_statistics2,
+                           /*num_values=*/1000));
+      EXPECT_FALSE(statistics2->Encode().has_null_count);
+      EXPECT_FALSE(statistics2->HasNullCount());
+    }
+
+    // Merge without null-count should not have null count
+    VerifyMergedStatistics(*statistics1, *statistics2,
+                           [](TypedStatistics<TestType>* merged_statistics) {
+                             EXPECT_FALSE(merged_statistics->HasNullCount());
+                             EXPECT_FALSE(merged_statistics->Encode().has_null_count);
+                           });
+  }
+
+  // statistics.all_null_value is used to build the page index.
+  // If statistics doesn't have null count, all_null_value should be false.
+  void TestMissingNullCount() {
+    EncodedStatistics encoded_statistics;
+    encoded_statistics.has_null_count = false;
+    auto statistics = Statistics::Make(this->schema_.Column(0), &encoded_statistics,
+                                       /*num_values=*/1000);
+    auto typed_stats = std::dynamic_pointer_cast<TypedStatistics<TestType>>(statistics);
+    EXPECT_FALSE(typed_stats->HasNullCount());
+    auto encoded = typed_stats->Encode();
+    EXPECT_FALSE(encoded.all_null_value);
+    EXPECT_FALSE(encoded.has_null_count);
+    EXPECT_FALSE(encoded.has_distinct_count);
+    EXPECT_FALSE(encoded.has_min);
+    EXPECT_FALSE(encoded.has_max);
+  }
+};
+
+TYPED_TEST_SUITE(TestStatisticsHasFlag, Types);
+
+TYPED_TEST(TestStatisticsHasFlag, MergeDistinctCount) {
+  ASSERT_NO_FATAL_FAILURE(this->TestMergeDistinctCount());
+}
+
+TYPED_TEST(TestStatisticsHasFlag, MergeNullCount) {
+  ASSERT_NO_FATAL_FAILURE(this->TestMergeNullCount());
+}
+
+TYPED_TEST(TestStatisticsHasFlag, MergeMinMax) {
+  ASSERT_NO_FATAL_FAILURE(this->TestMergeMinMax());
+}
+
+TYPED_TEST(TestStatisticsHasFlag, MissingNullValues) {

Review Comment:
   Naming consistency
   ```suggestion
   TYPED_TEST(TestStatisticsHasFlag, MissingNullCount) {
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org