You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by yi...@apache.org on 2022/05/18 01:55:11 UTC
[arrow] branch master updated: MINOR: [C++] cpp/parquet/Statistics: clarify that num_values() is the number of non-null values
This is an automated email from the ASF dual-hosted git repository.
yibocai pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new ed084f6bcd MINOR: [C++] cpp/parquet/Statistics: clarify that num_values() is the number of non-null values
ed084f6bcd is described below
commit ed084f6bcd3336bf08d326546883854244768f80
Author: Even Rouault <ev...@spatialys.com>
AuthorDate: Wed May 18 01:54:48 2022 +0000
MINOR: [C++] cpp/parquet/Statistics: clarify that num_values() is the number of non-null values
The current documentation of Statistics::num_values() is a bit
ambiguous as it mentions the 'total number of values' and my initial
understanding is that it also included null values. But experimentation
and documentation of https://arrow.apache.org/docs/python/generated/pyarrow.parquet.Statistics.html
shows that it is the number of non-null values.
Closes #13164 from rouault/statistics_num_values
Authored-by: Even Rouault <ev...@spatialys.com>
Signed-off-by: Yibo Cai <yi...@arm.com>
---
cpp/src/parquet/statistics.cc | 34 +++++++++++++++++-----------------
cpp/src/parquet/statistics.h | 12 ++++++------
2 files changed, 23 insertions(+), 23 deletions(-)
diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc
index 231c211a26..33b6c6dfbe 100644
--- a/cpp/src/parquet/statistics.cc
+++ b/cpp/src/parquet/statistics.cc
@@ -562,10 +562,10 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
}
}
- void Update(const T* values, int64_t num_not_null, int64_t num_null) override;
+ void Update(const T* values, int64_t num_values, int64_t null_count) override;
void UpdateSpaced(const T* values, const uint8_t* valid_bits, int64_t valid_bits_offset,
- int64_t num_spaced_values, int64_t num_not_null,
- int64_t num_null) override;
+ int64_t num_spaced_values, int64_t num_values,
+ int64_t null_count) override;
void Update(const ::arrow::Array& values, bool update_counts) override {
if (update_counts) {
@@ -698,30 +698,30 @@ inline void TypedStatisticsImpl<ByteArrayType>::Copy(const ByteArray& src, ByteA
}
template <typename DType>
-void TypedStatisticsImpl<DType>::Update(const T* values, int64_t num_not_null,
- int64_t num_null) {
- DCHECK_GE(num_not_null, 0);
- DCHECK_GE(num_null, 0);
+void TypedStatisticsImpl<DType>::Update(const T* values, int64_t num_values,
+ int64_t null_count) {
+ DCHECK_GE(num_values, 0);
+ DCHECK_GE(null_count, 0);
- IncrementNullCount(num_null);
- IncrementNumValues(num_not_null);
+ IncrementNullCount(null_count);
+ IncrementNumValues(num_values);
- if (num_not_null == 0) return;
- SetMinMaxPair(comparator_->GetMinMax(values, num_not_null));
+ if (num_values == 0) return;
+ SetMinMaxPair(comparator_->GetMinMax(values, num_values));
}
template <typename DType>
void TypedStatisticsImpl<DType>::UpdateSpaced(const T* values, const uint8_t* valid_bits,
int64_t valid_bits_offset,
int64_t num_spaced_values,
- int64_t num_not_null, int64_t num_null) {
- DCHECK_GE(num_not_null, 0);
- DCHECK_GE(num_null, 0);
+ int64_t num_values, int64_t null_count) {
+ DCHECK_GE(num_values, 0);
+ DCHECK_GE(null_count, 0);
- IncrementNullCount(num_null);
- IncrementNumValues(num_not_null);
+ IncrementNullCount(null_count);
+ IncrementNumValues(num_values);
- if (num_not_null == 0) return;
+ if (num_values == 0) return;
SetMinMaxPair(comparator_->GetMinMaxSpaced(values, num_spaced_values, valid_bits,
valid_bits_offset));
}
diff --git a/cpp/src/parquet/statistics.h b/cpp/src/parquet/statistics.h
index ac7abda902..b4e9fb382a 100644
--- a/cpp/src/parquet/statistics.h
+++ b/cpp/src/parquet/statistics.h
@@ -228,7 +228,7 @@ class PARQUET_EXPORT Statistics {
/// \brief The number of distinct values, may not be set
virtual int64_t distinct_count() const = 0;
- /// \brief The total number of values in the column
+ /// \brief The number of non-null values in the column
virtual int64_t num_values() const = 0;
/// \brief Return true if the min and max statistics are set. Obtain
@@ -278,7 +278,7 @@ class TypedStatistics : public Statistics {
virtual void Merge(const TypedStatistics<DType>& other) = 0;
/// \brief Batch statistics update
- virtual void Update(const T* values, int64_t num_not_null, int64_t num_null) = 0;
+ virtual void Update(const T* values, int64_t num_values, int64_t null_count) = 0;
/// \brief Batch statistics update with supplied validity bitmap
/// \param[in] values pointer to column values
@@ -287,13 +287,13 @@ class TypedStatistics : public Statistics {
/// data begins.
/// \param[in] num_spaced_values The length of values in values/valid_bits to inspect
/// when calculating statistics. This can be smaller than
- /// num_not_null+num_null as num_null can include nulls
+ /// num_values+null_count as null_count can include nulls
/// from parents while num_spaced_values does not.
- /// \param[in] num_not_null Number of values that are not null.
- /// \param[in] num_null Number of values that are null.
+ /// \param[in] num_values Number of values that are not null.
+ /// \param[in] null_count Number of values that are null.
virtual void UpdateSpaced(const T* values, const uint8_t* valid_bits,
int64_t valid_bits_offset, int64_t num_spaced_values,
- int64_t num_not_null, int64_t num_null) = 0;
+ int64_t num_values, int64_t null_count) = 0;
/// \brief EXPERIMENTAL: Update statistics with an Arrow array without
/// conversion to a primitive Parquet C type. Only implemented for certain