You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ks...@apache.org on 2020/05/13 01:15:25 UTC

[arrow] 09/17: ARROW-8694: [C++][Parquet] Relax string size limit when deserializing Thrift messages

This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch maint-0.17.x
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit b90321bc031773de67bf0da5c0c765925a833768
Author: Wes McKinney <we...@apache.org>
AuthorDate: Tue May 5 16:40:14 2020 -0500

    ARROW-8694: [C++][Parquet] Relax string size limit when deserializing Thrift messages
    
    While it's not an ideal use case for Parquet, the 10MB limit for strings was causing a Thrift deserialization failure due to the large "pandas metadata" JSON blob written with the Schema when there are many columns. A 100MB limit should still catch "memory bombs" caused by nefarious input while allowing pretty wide data frames to be stored successfully
    
    Closes #7103 from wesm/ARROW-8694
    
    Authored-by: Wes McKinney <we...@apache.org>
    Signed-off-by: Wes McKinney <we...@apache.org>
---
 cpp/src/parquet/thrift_internal.h    | 2 +-
 python/pyarrow/tests/test_parquet.py | 9 +++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/cpp/src/parquet/thrift_internal.h b/cpp/src/parquet/thrift_internal.h
index 5a988c7..3e823c7 100644
--- a/cpp/src/parquet/thrift_internal.h
+++ b/cpp/src/parquet/thrift_internal.h
@@ -362,7 +362,7 @@ inline void DeserializeThriftUnencryptedMsg(const uint8_t* buf, uint32_t* len,
       new ThriftBuffer(const_cast<uint8_t*>(buf), *len));
   apache::thrift::protocol::TCompactProtocolFactoryT<ThriftBuffer> tproto_factory;
   // Protect against CPU and memory bombs
-  tproto_factory.setStringSizeLimit(10 * 1000 * 1000);
+  tproto_factory.setStringSizeLimit(100 * 1000 * 1000);
   tproto_factory.setContainerSizeLimit(10 * 1000 * 1000);
   shared_ptr<apache::thrift::protocol::TProtocol> tproto =  //
       tproto_factory.getProtocol(tmem_transport);
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index c7d04c4..86eb964 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -3823,6 +3823,15 @@ def test_fastparquet_cross_compatibility(tempdir):
     tm.assert_frame_equal(table_fp.to_pandas(), df)
 
 
+def test_table_large_metadata():
+    # ARROW-8694
+    my_schema = pa.schema([pa.field('f0', 'double')],
+                          metadata={'large': 'x' * 10000000})
+
+    table = pa.table([np.arange(10)], schema=my_schema)
+    _check_roundtrip(table)
+
+
 @parametrize_legacy_dataset_skip_buffer
 @pytest.mark.parametrize('array_factory', [
     lambda: pa.array([0, None] * 10),