You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by yi...@apache.org on 2022/05/18 01:55:11 UTC

[arrow] branch master updated: MINOR: [C++] cpp/parquet/Statistics: clarify that num_values() is the number of non-null values

This is an automated email from the ASF dual-hosted git repository.

yibocai pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new ed084f6bcd MINOR: [C++] cpp/parquet/Statistics: clarify that num_values() is the number of non-null values
ed084f6bcd is described below

commit ed084f6bcd3336bf08d326546883854244768f80
Author: Even Rouault <ev...@spatialys.com>
AuthorDate: Wed May 18 01:54:48 2022 +0000

    MINOR: [C++] cpp/parquet/Statistics: clarify that num_values() is the number of non-null values
    
    The current documentation of Statistics::num_values() is a bit
    ambiguous as it mentions the 'total number of values' and my initial
    understanding is that it also included null values. But experimentation
    and documentation of https://arrow.apache.org/docs/python/generated/pyarrow.parquet.Statistics.html
    shows that it is the number of non-null values.
    
    Closes #13164 from rouault/statistics_num_values
    
    Authored-by: Even Rouault <ev...@spatialys.com>
    Signed-off-by: Yibo Cai <yi...@arm.com>
---
 cpp/src/parquet/statistics.cc | 34 +++++++++++++++++-----------------
 cpp/src/parquet/statistics.h  | 12 ++++++------
 2 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc
index 231c211a26..33b6c6dfbe 100644
--- a/cpp/src/parquet/statistics.cc
+++ b/cpp/src/parquet/statistics.cc
@@ -562,10 +562,10 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
     }
   }
 
-  void Update(const T* values, int64_t num_not_null, int64_t num_null) override;
+  void Update(const T* values, int64_t num_values, int64_t null_count) override;
   void UpdateSpaced(const T* values, const uint8_t* valid_bits, int64_t valid_bits_offset,
-                    int64_t num_spaced_values, int64_t num_not_null,
-                    int64_t num_null) override;
+                    int64_t num_spaced_values, int64_t num_values,
+                    int64_t null_count) override;
 
   void Update(const ::arrow::Array& values, bool update_counts) override {
     if (update_counts) {
@@ -698,30 +698,30 @@ inline void TypedStatisticsImpl<ByteArrayType>::Copy(const ByteArray& src, ByteA
 }
 
 template <typename DType>
-void TypedStatisticsImpl<DType>::Update(const T* values, int64_t num_not_null,
-                                        int64_t num_null) {
-  DCHECK_GE(num_not_null, 0);
-  DCHECK_GE(num_null, 0);
+void TypedStatisticsImpl<DType>::Update(const T* values, int64_t num_values,
+                                        int64_t null_count) {
+  DCHECK_GE(num_values, 0);
+  DCHECK_GE(null_count, 0);
 
-  IncrementNullCount(num_null);
-  IncrementNumValues(num_not_null);
+  IncrementNullCount(null_count);
+  IncrementNumValues(num_values);
 
-  if (num_not_null == 0) return;
-  SetMinMaxPair(comparator_->GetMinMax(values, num_not_null));
+  if (num_values == 0) return;
+  SetMinMaxPair(comparator_->GetMinMax(values, num_values));
 }
 
 template <typename DType>
 void TypedStatisticsImpl<DType>::UpdateSpaced(const T* values, const uint8_t* valid_bits,
                                               int64_t valid_bits_offset,
                                               int64_t num_spaced_values,
-                                              int64_t num_not_null, int64_t num_null) {
-  DCHECK_GE(num_not_null, 0);
-  DCHECK_GE(num_null, 0);
+                                              int64_t num_values, int64_t null_count) {
+  DCHECK_GE(num_values, 0);
+  DCHECK_GE(null_count, 0);
 
-  IncrementNullCount(num_null);
-  IncrementNumValues(num_not_null);
+  IncrementNullCount(null_count);
+  IncrementNumValues(num_values);
 
-  if (num_not_null == 0) return;
+  if (num_values == 0) return;
   SetMinMaxPair(comparator_->GetMinMaxSpaced(values, num_spaced_values, valid_bits,
                                              valid_bits_offset));
 }
diff --git a/cpp/src/parquet/statistics.h b/cpp/src/parquet/statistics.h
index ac7abda902..b4e9fb382a 100644
--- a/cpp/src/parquet/statistics.h
+++ b/cpp/src/parquet/statistics.h
@@ -228,7 +228,7 @@ class PARQUET_EXPORT Statistics {
   /// \brief The number of distinct values, may not be set
   virtual int64_t distinct_count() const = 0;
 
-  /// \brief The total number of values in the column
+  /// \brief The number of non-null values in the column
   virtual int64_t num_values() const = 0;
 
   /// \brief Return true if the min and max statistics are set. Obtain
@@ -278,7 +278,7 @@ class TypedStatistics : public Statistics {
   virtual void Merge(const TypedStatistics<DType>& other) = 0;
 
   /// \brief Batch statistics update
-  virtual void Update(const T* values, int64_t num_not_null, int64_t num_null) = 0;
+  virtual void Update(const T* values, int64_t num_values, int64_t null_count) = 0;
 
   /// \brief Batch statistics update with supplied validity bitmap
   /// \param[in] values pointer to column values
@@ -287,13 +287,13 @@ class TypedStatistics : public Statistics {
   ///                              data begins.
   /// \param[in] num_spaced_values The length of values in values/valid_bits to inspect
   ///                              when calculating statistics. This can be smaller than
-  ///                              num_not_null+num_null as num_null can include nulls
+  ///                              num_values+null_count as null_count can include nulls
   ///                              from parents while num_spaced_values does not.
-  /// \param[in] num_not_null Number of values that are not null.
-  /// \param[in] num_null Number of values that are null.
+  /// \param[in] num_values Number of values that are not null.
+  /// \param[in] null_count Number of values that are null.
   virtual void UpdateSpaced(const T* values, const uint8_t* valid_bits,
                             int64_t valid_bits_offset, int64_t num_spaced_values,
-                            int64_t num_not_null, int64_t num_null) = 0;
+                            int64_t num_values, int64_t null_count) = 0;
 
   /// \brief EXPERIMENTAL: Update statistics with an Arrow array without
   /// conversion to a primitive Parquet C type. Only implemented for certain