You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2020/05/05 21:40:38 UTC

[arrow] branch master updated: ARROW-8694: [C++][Parquet] Relax string size limit when deserializing Thrift messages

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new bc283e3  ARROW-8694: [C++][Parquet] Relax string size limit when deserializing Thrift messages
bc283e3 is described below

commit bc283e33232a382e1f016d57c172c718b06b38ab
Author: Wes McKinney <we...@apache.org>
AuthorDate: Tue May 5 16:40:14 2020 -0500

    ARROW-8694: [C++][Parquet] Relax string size limit when deserializing Thrift messages
    
    While it's not an ideal use case for Parquet, the 10MB limit for strings was causing a Thrift deserialization failure due to the large "pandas metadata" JSON blob written with the Schema when there are many columns. A 100MB limit should still catch "memory bombs" caused by nefarious input while allowing pretty wide data frames to be stored successfully.
    
    Closes #7103 from wesm/ARROW-8694
    
    Authored-by: Wes McKinney <we...@apache.org>
    Signed-off-by: Wes McKinney <we...@apache.org>
---
 cpp/src/parquet/thrift_internal.h    | 2 +-
 python/pyarrow/tests/test_parquet.py | 9 +++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/cpp/src/parquet/thrift_internal.h b/cpp/src/parquet/thrift_internal.h
index 5a988c7..3e823c7 100644
--- a/cpp/src/parquet/thrift_internal.h
+++ b/cpp/src/parquet/thrift_internal.h
@@ -362,7 +362,7 @@ inline void DeserializeThriftUnencryptedMsg(const uint8_t* buf, uint32_t* len,
       new ThriftBuffer(const_cast<uint8_t*>(buf), *len));
   apache::thrift::protocol::TCompactProtocolFactoryT<ThriftBuffer> tproto_factory;
   // Protect against CPU and memory bombs
-  tproto_factory.setStringSizeLimit(10 * 1000 * 1000);
+  tproto_factory.setStringSizeLimit(100 * 1000 * 1000);
   tproto_factory.setContainerSizeLimit(10 * 1000 * 1000);
   shared_ptr<apache::thrift::protocol::TProtocol> tproto =  //
       tproto_factory.getProtocol(tmem_transport);
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index f76496d..31de9fe 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -3955,6 +3955,15 @@ def test_fastparquet_cross_compatibility(tempdir):
     tm.assert_frame_equal(table_fp.to_pandas(), df)
 
 
+def test_table_large_metadata():
+    # ARROW-8694: schema metadata with a 10,000,000-character value
+    my_schema = pa.schema([pa.field('f0', 'double')],
+                          metadata={'large': 'x' * 10000000})
+
+    table = pa.table([np.arange(10)], schema=my_schema)
+    _check_roundtrip(table)
+
+
 @parametrize_legacy_dataset_skip_buffer
 @pytest.mark.parametrize('array_factory', [
     lambda: pa.array([0, None] * 10),