You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "wgtmac (via GitHub)" <gi...@apache.org> on 2023/06/08 08:34:03 UTC

[GitHub] [arrow] wgtmac commented on a diff in pull request #35989: GH-34351: [C++][Parquet] Statistic: tiny optimization and specification

wgtmac commented on code in PR #35989:
URL: https://github.com/apache/arrow/pull/35989#discussion_r1222625483


##########
cpp/src/parquet/statistics.h:
##########
@@ -149,10 +148,10 @@ class PARQUET_EXPORT EncodedStatistics {
   // the true minimum for aggregations and there is no way to mark that a
   // value has been truncated and is a lower bound and not in the page.
   void ApplyStatSizeLimits(size_t length) {
-    if (max_->length() > length) {
+    if (max_.length() > length) {

Review Comment:
   Should we clear `max_` as well?



##########
cpp/src/parquet/statistics.cc:
##########
@@ -463,6 +463,8 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
  public:
   using T = typename DType::c_type;
 
+  // Currently, this function will call by ColumnWriter to create the

Review Comment:
   ```suggestion
     // Currently, this function will be called by ColumnWriter to create the
   ```



##########
cpp/src/parquet/statistics.cc:
##########
@@ -472,34 +474,40 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
     comparator_ = std::static_pointer_cast<TypedComparator<DType>>(comp);
     TypedStatisticsImpl::Reset();
     has_null_count_ = true;
-    has_distinct_count_ = true;
+    has_distinct_count_ = false;

Review Comment:
   nit: has_distinct_count_ is set to false in `TypedStatisticsImpl::Reset();`



##########
cpp/src/parquet/statistics.cc:
##########
@@ -552,12 +560,19 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
 
   void Merge(const TypedStatistics<DType>& other) override {
     this->num_values_ += other.num_values();
-    if (other.HasNullCount()) {
+    // Merge always runs when Merge builder's page statistics
+    // into column chunk statistics, so it tent to have null count.
+    if (ARROW_PREDICT_TRUE(other.HasNullCount())) {

Review Comment:
   Should we check this->has_null_count_ as well?



##########
cpp/src/parquet/statistics.cc:
##########
@@ -472,34 +474,40 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
     comparator_ = std::static_pointer_cast<TypedComparator<DType>>(comp);
     TypedStatisticsImpl::Reset();
     has_null_count_ = true;
-    has_distinct_count_ = true;
+    has_distinct_count_ = false;
   }
 
+  // Currently, this function will call by testing.
   TypedStatisticsImpl(const T& min, const T& max, int64_t num_values, int64_t null_count,
                       int64_t distinct_count)
       : pool_(default_memory_pool()),
         min_buffer_(AllocateBuffer(pool_, 0)),
         max_buffer_(AllocateBuffer(pool_, 0)) {
     TypedStatisticsImpl::IncrementNumValues(num_values);
     TypedStatisticsImpl::IncrementNullCount(null_count);
-    IncrementDistinctCount(distinct_count);
+    SetDistinctCount(distinct_count);
 
     Copy(min, &min_, min_buffer_.get());
     Copy(max, &max_, max_buffer_.get());
     has_min_max_ = true;
   }
 
+  // Currently, this function will call by ColumnChunkMetaData during read.

Review Comment:
   ```suggestion
     // Currently, this function will only be called by ColumnChunkMetaData during read.
   ```



##########
cpp/src/parquet/statistics.cc:
##########
@@ -552,12 +560,19 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
 
   void Merge(const TypedStatistics<DType>& other) override {
     this->num_values_ += other.num_values();
-    if (other.HasNullCount()) {
+    // Merge always runs when Merge builder's page statistics
+    // into column chunk statistics, so it tent to have null count.
+    if (ARROW_PREDICT_TRUE(other.HasNullCount())) {
       this->statistics_.null_count += other.null_count();
+    } else {
+      this->has_null_count_ = false;
     }
-    if (other.HasDistinctCount()) {
-      this->statistics_.distinct_count += other.distinct_count();
+    // Distinct count cannot be merged.
+    if (has_distinct_count_) {
+      has_distinct_count_ = false;
     }
+    // If !other.HasMinMax, might be all-nulls or nulls and nan,

Review Comment:
   We cannot differentiate at least two states:
   - (1) the stats has not seen any value except null or nan, so it does not have min/max
   - (2) the stats has seen some valid values but min/max are truncated due to size limit
   
   So if currently the stats is in the state (2), we should not merge min/max from the incoming stats



##########
cpp/src/parquet/statistics.cc:
##########
@@ -472,34 +474,40 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
     comparator_ = std::static_pointer_cast<TypedComparator<DType>>(comp);
     TypedStatisticsImpl::Reset();
     has_null_count_ = true;
-    has_distinct_count_ = true;
+    has_distinct_count_ = false;
   }
 
+  // Currently, this function will call by testing.

Review Comment:
   ```suggestion
     // Currently, this function will only be called by testing.
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org