You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2018/08/01 20:32:58 UTC

[arrow] branch master updated: ARROW-2911: [Python] Parquet binary statistics that end in '\0' truncate last byte

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new cf79192  ARROW-2911: [Python] Parquet binary statistics that end in '\0' truncate last byte
cf79192 is described below

commit cf79192ebc2bf54c68b7b56dc938d91f72441b96
Author: Korn, Uwe <Uw...@blue-yonder.com>
AuthorDate: Wed Aug 1 22:32:45 2018 +0200

    ARROW-2911: [Python] Parquet binary statistics that end in '\0' truncate last byte
    
    Depends on https://github.com/apache/parquet-cpp/pull/479
    
    Merge once parquet-cpp 1.5.0 is released.
    
    Author: Korn, Uwe <Uw...@blue-yonder.com>
    
    Closes #2326 from xhochy/ARROW-2911 and squashes the following commits:
    
    50fa61d <Korn, Uwe> Remove deprecated version of FormatStatValue
    bb2a4fa <Korn, Uwe> ARROW-2911: Parquet binary statistics that end in '\0' truncate last byte
---
 python/pyarrow/_parquet.pxd          | 2 +-
 python/pyarrow/_parquet.pyx          | 4 ++--
 python/pyarrow/tests/test_parquet.py | 4 ++++
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd
index ca20ce2..564391e 100644
--- a/python/pyarrow/_parquet.pxd
+++ b/python/pyarrow/_parquet.pxd
@@ -134,7 +134,7 @@ cdef extern from "parquet/api/schema.h" namespace "parquet" nogil:
         c_bool Equals(const SchemaDescriptor& other)
         int num_columns()
 
-    cdef c_string FormatStatValue(ParquetType parquet_type, const char* val)
+    cdef c_string FormatStatValue(ParquetType parquet_type, c_string val)
 
 
 cdef extern from "parquet/api/reader.h" namespace "parquet" nogil:
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 1aa2124..7b97d06 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -99,7 +99,7 @@ cdef class RowGroupStatistics:
         raw_physical_type = self.statistics.get().physical_type()
         encode_min = self.statistics.get().EncodeMin()
 
-        min_value = FormatStatValue(raw_physical_type, encode_min.c_str())
+        min_value = FormatStatValue(raw_physical_type, encode_min)
         return self._cast_statistic(min_value)
 
     @property
@@ -107,7 +107,7 @@ cdef class RowGroupStatistics:
         raw_physical_type = self.statistics.get().physical_type()
         encode_max = self.statistics.get().EncodeMax()
 
-        max_value = FormatStatValue(raw_physical_type, encode_max.c_str())
+        max_value = FormatStatValue(raw_physical_type, encode_max)
         return self._cast_statistic(max_value)
 
     @property
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 1d3a6c1..324ae16 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -630,6 +630,10 @@ def test_parquet_metadata_api():
             [True, False, False, True, True], pa.bool_(),
             'BOOLEAN', False, True, 0, 5, 0
         ),
+        (
+            [b'\x00', b'b', b'12', None, b'aaa'], pa.binary(),
+            'BYTE_ARRAY', b'\x00', b'b', 1, 4, 0
+        ),
     ]
 )
 def test_parquet_column_statistics_api(data, type, physical_type, min_value,