You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "pitrou (via GitHub)" <gi...@apache.org> on 2023/06/14 16:02:17 UTC

[GitHub] [arrow] pitrou commented on a diff in pull request #35989: GH-34351: [C++][Parquet] Statistic: tiny optimization and specification

pitrou commented on code in PR #35989:
URL: https://github.com/apache/arrow/pull/35989#discussion_r1229842212


##########
cpp/src/parquet/statistics_test.cc:
##########
@@ -377,6 +377,35 @@ class TestStatistics : public PrimitiveTypedTest<TestType> {
     ASSERT_EQ(total->max(), std::max(statistics1->max(), statistics2->max()));
   }
 
+  void TestMergeEmpty() {
+    EncodedStatistics encoded_statistics1;
+    encoded_statistics1.has_distinct_count = false;
+    // NOTE: currently, has_null_count_ in Statistics is always true.
+    auto statistics1 =
+        Statistics::Make(this->schema_.Column(0), &encoded_statistics1, 1000);

Review Comment:
   Can you add the parameter name for readability?
   ```suggestion
           Statistics::Make(this->schema_.Column(0), &encoded_statistics1, /*xxx=*/ 1000);
   ```



##########
cpp/src/parquet/statistics_test.cc:
##########
@@ -377,6 +377,35 @@ class TestStatistics : public PrimitiveTypedTest<TestType> {
     ASSERT_EQ(total->max(), std::max(statistics1->max(), statistics2->max()));
   }
 
+  void TestMergeEmpty() {
+    EncodedStatistics encoded_statistics1;
+    encoded_statistics1.has_distinct_count = false;
+    // NOTE: currently, has_null_count_ in Statistics is always true.
+    auto statistics1 =
+        Statistics::Make(this->schema_.Column(0), &encoded_statistics1, 1000);
+    auto s1 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(statistics1);
+
+    EXPECT_FALSE(statistics1->HasMinMax());
+    EXPECT_FALSE(statistics1->HasDistinctCount());
+
+    EncodedStatistics encoded_statistics2;
+    encoded_statistics2.has_distinct_count = true;
+    encoded_statistics2.distinct_count = 500;
+    auto statistics2 =
+        Statistics::Make(this->schema_.Column(0), &encoded_statistics2, 1000);
+
+    EXPECT_FALSE(statistics2->HasMinMax());
+    EXPECT_TRUE(statistics2->HasDistinctCount());
+    auto s2 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(statistics2);
+
+    auto total = MakeStatistics<TestType>(this->schema_.Column(0));
+    total->Merge(*s1);
+    total->Merge(*s2);
+
+    EXPECT_FALSE(total->HasDistinctCount());

Review Comment:
   Also perhaps
   ```suggestion
       EXPECT_FALSE(total->HasDistinctCount());
       EXPECT_FALSE(total->HasMinMax());
   ```



##########
cpp/src/parquet/statistics.cc:
##########
@@ -463,6 +463,8 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
  public:
   using T = typename DType::c_type;
 
+  // Currently, this function will be called by ColumnWriter to create the
+  // statistics collector during write.

Review Comment:
   These constructors can be invoked indirectly by third-party code through `MakeStatistics`, so the added comments seem misleading.



##########
cpp/src/parquet/statistics.cc:
##########
@@ -552,12 +560,19 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
 
   void Merge(const TypedStatistics<DType>& other) override {
     this->num_values_ += other.num_values();
-    if (other.HasNullCount()) {
+    // Merge always runs when Merge builder's page statistics
+    // into column chunk statistics, so it tent to have null count.
+    if (ARROW_PREDICT_TRUE(other.HasNullCount())) {

Review Comment:
   This is really a pointless micro-optimization, IMHO. We are not in a critical loop.



##########
cpp/src/parquet/statistics.cc:
##########
@@ -602,6 +617,7 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
   }
 
   EncodedStatistics Encode() override {
+    // Currently, distinct_count will never be set in EncodedStatistics.

Review Comment:
   Why? Should we fix this? It doesn't seem right.



##########
cpp/src/parquet/statistics.cc:
##########
@@ -472,34 +474,40 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
     comparator_ = std::static_pointer_cast<TypedComparator<DType>>(comp);
     TypedStatisticsImpl::Reset();
     has_null_count_ = true;
-    has_distinct_count_ = true;
+    has_distinct_count_ = false;

Review Comment:
   It doesn't seem consistent to set `has_null_count_` and `has_distinct_count_` to different values by default. Why this change?



##########
cpp/src/parquet/statistics_test.cc:
##########
@@ -377,6 +377,35 @@ class TestStatistics : public PrimitiveTypedTest<TestType> {
     ASSERT_EQ(total->max(), std::max(statistics1->max(), statistics2->max()));
   }
 
+  void TestMergeEmpty() {
+    EncodedStatistics encoded_statistics1;
+    encoded_statistics1.has_distinct_count = false;

Review Comment:
   This doesn't test the semantic change you're making in this PR, is it?



##########
cpp/src/parquet/statistics.cc:
##########
@@ -552,12 +560,19 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
 
   void Merge(const TypedStatistics<DType>& other) override {
     this->num_values_ += other.num_values();
-    if (other.HasNullCount()) {
+    // Merge always runs when Merge builder's page statistics
+    // into column chunk statistics, so it tent to have null count.
+    if (ARROW_PREDICT_TRUE(other.HasNullCount())) {
       this->statistics_.null_count += other.null_count();
+    } else {
+      this->has_null_count_ = false;
     }
-    if (other.HasDistinctCount()) {
-      this->statistics_.distinct_count += other.distinct_count();
+    // Distinct count cannot be merged.

Review Comment:
   I understand the reasoning, but this might break existing usage. Perhaps we should instead document that the distinct count can be an upper bound?
   
   Can you use `git log` or `git blame` to find out who touched this code previously, and ping them to get their opinion?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org