Posted to commits@arrow.apache.org by ks...@apache.org on 2020/05/13 01:15:16 UTC

[arrow] branch maint-0.17.x created (now 1342f25)

This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a change to branch maint-0.17.x
in repository https://gitbox.apache.org/repos/asf/arrow.git.


      at 1342f25  ARROW-8741: [Python][Packaging] Keep VS2015 with for the windows wheels

This branch includes the following new commits:

     new b43067b  ARROW-8501: [Packaging][RPM] Upgrade devtoolset to 8 on CentOS 6
     new 263a2c8  ARROW-8503: [Packaging][deb] Fix building apache-arrow-archive-keyring for RC
     new 558dae4  ARROW-8549: [R] Assorted post-0.17 release cleanups
     new d6210c8  ARROW-8584: [C++] Fix ORC link order
     new 2f30021  ARROW-8609: [C++] Fix ORC Java JNI crash
     new 136506c  ARROW-8608: [C++] Update vendored 'variant.hpp' to fix CUDA 10.2
     new ddfd4de  ARROW-8699: [R] Fix automatic r_to_py conversion
     new c85a2fc  ARROW-8704: [C++] Fix Parquet undefined behaviour on invalid input
     new b90321b  ARROW-8694: [C++][Parquet] Relax string size limit when deserializing Thrift messages
     new 5fb626d  PARQUET-1857: [C++] Do not fail to read unencrypted files with over 32767 row groups. Change some DCHECKs causing segfaults to throw exceptions
     new 3a61e9c  ARROW-8657: [C++][Python] Add separate configuration for data pages
     new 57da2c1  ARROW-8728: [C++] Fix bitmap operation buffer overflow
     new 057cbe4  ARROW-8641: [C++][Python] Sort included indices in IpcReader - Respect column selection in FeatherReader
     new 116ed88  ARROW-8758: [R] Updates for compatibility with dplyr 1.0
     new bda2b5b  ARROW-8750: [Python] Correctly default to lz4 compression for Feather V2 in Python
     new dd3e2ff  ARROW-8684: [Python] Workaround Cython type initialization bug
     new 1342f25  ARROW-8741: [Python][Packaging] Keep VS2015 with for the windows wheels

The 17 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[arrow] 02/17: ARROW-8503: [Packaging][deb] Fix building apache-arrow-archive-keyring for RC

Posted by ks...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch maint-0.17.x
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 263a2c8a275cbf0b93c9e113e4982fa6a2ff6000
Author: Sutou Kouhei <ko...@clear-code.com>
AuthorDate: Sat Apr 18 13:20:11 2020 +0900

    ARROW-8503: [Packaging][deb] Fix building apache-arrow-archive-keyring for RC
    
    Closes #6974 from kou/packaging-linux-rc
    
    Authored-by: Sutou Kouhei <ko...@clear-code.com>
    Signed-off-by: Sutou Kouhei <ko...@clear-code.com>
---
 .../linux-packages/apache-arrow-archive-keyring/Rakefile  |  8 +++++++-
 dev/tasks/linux-packages/package-task.rb                  | 15 +++++++++++----
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/dev/tasks/linux-packages/apache-arrow-archive-keyring/Rakefile b/dev/tasks/linux-packages/apache-arrow-archive-keyring/Rakefile
index 35518c7..9c1ac97 100644
--- a/dev/tasks/linux-packages/apache-arrow-archive-keyring/Rakefile
+++ b/dev/tasks/linux-packages/apache-arrow-archive-keyring/Rakefile
@@ -44,7 +44,13 @@ class ApacheArrowArchiveKeyringPackageTask < PackageTask
 
     if deb_archive_name != @archive_name
       file deb_archive_name => @archive_name do
-        cp(@archive_name, deb_archive_name)
+        if @archive_base_name == deb_archive_base_name
+          cp(@archive_name, deb_archive_name)
+        else
+          sh("tar", "xf", @archive_name)
+          mv(@archive_base_name, deb_archive_base_name)
+          sh("tar", "czf", deb_archive_name, deb_archive_base_name)
+        end
       end
     end
   end
diff --git a/dev/tasks/linux-packages/package-task.rb b/dev/tasks/linux-packages/package-task.rb
index 1197b1b..ebe906e 100644
--- a/dev/tasks/linux-packages/package-task.rb
+++ b/dev/tasks/linux-packages/package-task.rb
@@ -39,15 +39,18 @@ class PackageTask
       type = $2
       if type == "rc" and options[:rc_build_type] == :release
         @deb_upstream_version = base_version
+        @deb_archive_base_name_version = base_version
         @rpm_version = base_version
         @rpm_release = "1"
       else
         @deb_upstream_version = "#{base_version}~#{sub_version}"
+        @deb_archive_base_name_version = @version
         @rpm_version = base_version
         @rpm_release = "0.#{sub_version}"
       end
     else
       @deb_upstream_version = @version
+      @deb_archive_base_name_version = @version
       @rpm_version = @version
       @rpm_release = "1"
     end
@@ -196,6 +199,10 @@ class PackageTask
     ]
   end
 
+  def deb_archive_base_name
+    "#{@package}-#{@deb_archive_base_name_version}"
+  end
+
   def deb_archive_name
     "#{@package}-#{@deb_upstream_version}.tar.gz"
   end
@@ -291,14 +298,14 @@ VERSION=#{@deb_upstream_version}
     ]
   end
 
-  def rpm_archive_name
-    "#{rpm_archive_base_name}.tar.gz"
-  end
-
   def rpm_archive_base_name
     "#{@package}-#{@rpm_version}"
   end
 
+  def rpm_archive_name
+    "#{rpm_archive_base_name}.tar.gz"
+  end
+
   def yum_dir
     "yum"
   end


[arrow] 16/17: ARROW-8684: [Python] Workaround Cython type initialization bug

Posted by ks...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch maint-0.17.x
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit dd3e2ffd8d5527cef89c63b702dbefa6404182d7
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Wed May 13 02:16:24 2020 +0200

    ARROW-8684: [Python] Workaround Cython type initialization bug
    
    This should fix a crash on some macOS builds.
    
    Closes #7160 from pitrou/ARROW-8684-workaround-cy-type-initialization-bug
    
    Authored-by: Antoine Pitrou <an...@python.org>
    Signed-off-by: Krisztián Szűcs <sz...@gmail.com>
---
 python/pyarrow/__init__.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 1cf8cf6..9ae0d9c 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -29,6 +29,7 @@ streaming messaging and interprocess communication.
 For more information see the official page at https://arrow.apache.org
 """
 
+import gc as _gc
 import os as _os
 import sys as _sys
 
@@ -56,6 +57,14 @@ except ImportError:
 
 import pyarrow.compat as compat
 
+# ARROW-8684: Disable GC while initializing Cython extension module,
+# to workaround Cython bug in https://github.com/cython/cython/issues/3603
+_gc_enabled = _gc.isenabled()
+_gc.disable()
+import pyarrow.lib as _lib
+if _gc_enabled:
+    _gc.enable()
+
 from pyarrow.lib import cpu_count, set_cpu_count
 from pyarrow.lib import (null, bool_,
                          int8, int16, int32, int64,
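
The workaround above follows a general pattern: disable the garbage
collector around the import of the affected Cython extension module, then
restore its previous state. Below is a minimal sketch of that pattern as a
reusable helper; the helper name and the module name passed by the caller
are hypothetical, and this code is not part of pyarrow:

    import contextlib
    import gc
    import importlib

    @contextlib.contextmanager
    def gc_disabled():
        # Temporarily disable the garbage collector, restoring its prior
        # state on exit (mirrors the guard added to pyarrow/__init__.py).
        was_enabled = gc.isenabled()
        gc.disable()
        try:
            yield
        finally:
            if was_enabled:
                gc.enable()

    def import_with_gc_disabled(name):
        # Import a compiled extension module while GC is off, as a
        # workaround for https://github.com/cython/cython/issues/3603.
        with gc_disabled():
            return importlib.import_module(name)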


[arrow] 11/17: ARROW-8657: [C++][Python] Add separate configuration for data pages

Posted by ks...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch maint-0.17.x
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 3a61e9c4018860d893e5d7429c951b96adb81381
Author: Micah Kornfield <em...@gmail.com>
AuthorDate: Tue May 5 17:14:42 2020 -0500

    ARROW-8657: [C++][Python] Add separate configuration for data pages
    
    - Adds a separate write config to determine which version of
      data page to use.
    - Plumbs this through to Python.
    - At the moment, version and data page version are completely
      independent.
    
    Closes #7089 from emkornfield/ARROW-8657
    
    Lead-authored-by: Micah Kornfield <em...@gmail.com>
    Co-authored-by: Wes McKinney <we...@apache.org>
    Signed-off-by: Wes McKinney <we...@apache.org>
---
 cpp/src/parquet/column_writer.cc     | 16 +++++++-------
 cpp/src/parquet/properties.h         | 42 +++++++++++++++++++++++++++++++-----
 cpp/src/parquet/properties_test.cc   |  5 ++++-
 python/pyarrow/_parquet.pxd          |  9 ++++++++
 python/pyarrow/_parquet.pyx          | 17 ++++++++++++++-
 python/pyarrow/parquet.py            | 18 +++++++++++++++-
 python/pyarrow/tests/test_parquet.py | 27 ++++++++++++++---------
 7 files changed, 108 insertions(+), 26 deletions(-)

diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index dbb7df2..e37beba 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -729,24 +729,24 @@ void ColumnWriterImpl::AddDataPage() {
   int64_t repetition_levels_rle_size = 0;
 
   std::shared_ptr<Buffer> values = GetValuesBuffer();
-  bool is_v1 = properties_->version() == ParquetVersion::PARQUET_1_0;
+  bool is_v1_data_page = properties_->data_page_version() == ParquetDataPageVersion::V1;
 
   if (descr_->max_definition_level() > 0) {
-    definition_levels_rle_size =
-        RleEncodeLevels(definition_levels_sink_.data(), definition_levels_rle_.get(),
-                        descr_->max_definition_level(), /*include_length_prefix=*/is_v1);
+    definition_levels_rle_size = RleEncodeLevels(
+        definition_levels_sink_.data(), definition_levels_rle_.get(),
+        descr_->max_definition_level(), /*include_length_prefix=*/is_v1_data_page);
   }
 
   if (descr_->max_repetition_level() > 0) {
-    repetition_levels_rle_size =
-        RleEncodeLevels(repetition_levels_sink_.data(), repetition_levels_rle_.get(),
-                        descr_->max_repetition_level(), /*include_length_prefix=*/is_v1);
+    repetition_levels_rle_size = RleEncodeLevels(
+        repetition_levels_sink_.data(), repetition_levels_rle_.get(),
+        descr_->max_repetition_level(), /*include_length_prefix=*/is_v1_data_page);
   }
 
   int64_t uncompressed_size =
       definition_levels_rle_size + repetition_levels_rle_size + values->size();
 
-  if (is_v1) {
+  if (is_v1_data_page) {
     BuildDataPageV1(definition_levels_rle_size, repetition_levels_rle_size,
                     uncompressed_size, values);
   } else {
diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h
index 9c2ec1d..df4fb41 100644
--- a/cpp/src/parquet/properties.h
+++ b/cpp/src/parquet/properties.h
@@ -34,10 +34,30 @@
 
 namespace parquet {
 
+/// Determines use of Parquet Format version >= 2.0.0 logical types. For
+/// example, when writing from Arrow data structures, PARQUET_2_0 will enable
+/// use of INT_* and UINT_* converted types as well as nanosecond timestamps
+/// stored physically as INT64. Since some Parquet implementations do not
+/// support the logical types added in the 2.0.0 format version, if you want to
+/// maximize compatibility of your files you may want to use PARQUET_1_0.
+///
+/// Note that the 2.x format version series also introduced new serialized
+/// data page metadata and on disk data page layout. To enable this, use
+/// ParquetDataPageVersion.
 struct ParquetVersion {
   enum type { PARQUET_1_0, PARQUET_2_0 };
 };
 
+/// Controls serialization format of data pages.  parquet-format v2.0.0
+/// introduced a new data page metadata type DataPageV2 and serialized page
+/// structure (for example, encoded levels are no longer compressed). Prior to
+/// the completion of PARQUET-457 in 2020, this library did not implement
+/// DataPageV2 correctly, so if you use the V2 data page format, you may have
+/// forward compatibility issues (older versions of the library will be unable
+/// to read the files). Note that some Parquet implementations do not implement
+/// DataPageV2 at all.
+enum class ParquetDataPageVersion { V1, V2 };
+
 static int64_t DEFAULT_BUFFER_SIZE = 1024;
 static bool DEFAULT_USE_BUFFERED_STREAM = false;
 
@@ -89,8 +109,6 @@ static constexpr int64_t DEFAULT_MAX_ROW_GROUP_LENGTH = 64 * 1024 * 1024;
 static constexpr bool DEFAULT_ARE_STATISTICS_ENABLED = true;
 static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096;
 static constexpr Encoding::type DEFAULT_ENCODING = Encoding::PLAIN;
-static constexpr ParquetVersion::type DEFAULT_WRITER_VERSION =
-    ParquetVersion::PARQUET_1_0;
 static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION;
 static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED;
 
@@ -159,7 +177,8 @@ class PARQUET_EXPORT WriterProperties {
           write_batch_size_(DEFAULT_WRITE_BATCH_SIZE),
           max_row_group_length_(DEFAULT_MAX_ROW_GROUP_LENGTH),
           pagesize_(kDefaultDataPageSize),
-          version_(DEFAULT_WRITER_VERSION),
+          version_(ParquetVersion::PARQUET_1_0),
+          data_page_version_(ParquetDataPageVersion::V1),
           created_by_(DEFAULT_CREATED_BY) {}
     virtual ~Builder() {}
 
@@ -216,6 +235,11 @@ class PARQUET_EXPORT WriterProperties {
       return this;
     }
 
+    Builder* data_page_version(ParquetDataPageVersion data_page_version) {
+      data_page_version_ = data_page_version;
+      return this;
+    }
+
     Builder* version(ParquetVersion::type version) {
       version_ = version;
       return this;
@@ -394,7 +418,7 @@ class PARQUET_EXPORT WriterProperties {
       return std::shared_ptr<WriterProperties>(new WriterProperties(
           pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_,
           pagesize_, version_, created_by_, std::move(file_encryption_properties_),
-          default_column_properties_, column_properties));
+          default_column_properties_, column_properties, data_page_version_));
     }
 
    private:
@@ -404,6 +428,7 @@ class PARQUET_EXPORT WriterProperties {
     int64_t max_row_group_length_;
     int64_t pagesize_;
     ParquetVersion::type version_;
+    ParquetDataPageVersion data_page_version_;
     std::string created_by_;
 
     std::shared_ptr<FileEncryptionProperties> file_encryption_properties_;
@@ -427,6 +452,10 @@ class PARQUET_EXPORT WriterProperties {
 
   inline int64_t data_pagesize() const { return pagesize_; }
 
+  inline ParquetDataPageVersion data_page_version() const {
+    return parquet_data_page_version_;
+  }
+
   inline ParquetVersion::type version() const { return parquet_version_; }
 
   inline std::string created_by() const { return parquet_created_by_; }
@@ -498,12 +527,14 @@ class PARQUET_EXPORT WriterProperties {
       const std::string& created_by,
       std::shared_ptr<FileEncryptionProperties> file_encryption_properties,
       const ColumnProperties& default_column_properties,
-      const std::unordered_map<std::string, ColumnProperties>& column_properties)
+      const std::unordered_map<std::string, ColumnProperties>& column_properties,
+      ParquetDataPageVersion data_page_version)
       : pool_(pool),
         dictionary_pagesize_limit_(dictionary_pagesize_limit),
         write_batch_size_(write_batch_size),
         max_row_group_length_(max_row_group_length),
         pagesize_(pagesize),
+        parquet_data_page_version_(data_page_version),
         parquet_version_(version),
         parquet_created_by_(created_by),
         file_encryption_properties_(file_encryption_properties),
@@ -515,6 +546,7 @@ class PARQUET_EXPORT WriterProperties {
   int64_t write_batch_size_;
   int64_t max_row_group_length_;
   int64_t pagesize_;
+  ParquetDataPageVersion parquet_data_page_version_;
   ParquetVersion::type parquet_version_;
   std::string parquet_created_by_;
 
diff --git a/cpp/src/parquet/properties_test.cc b/cpp/src/parquet/properties_test.cc
index 94ff79a..aef563b 100644
--- a/cpp/src/parquet/properties_test.cc
+++ b/cpp/src/parquet/properties_test.cc
@@ -43,7 +43,8 @@ TEST(TestWriterProperties, Basics) {
 
   ASSERT_EQ(kDefaultDataPageSize, props->data_pagesize());
   ASSERT_EQ(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT, props->dictionary_pagesize_limit());
-  ASSERT_EQ(DEFAULT_WRITER_VERSION, props->version());
+  ASSERT_EQ(ParquetVersion::PARQUET_1_0, props->version());
+  ASSERT_EQ(ParquetDataPageVersion::V1, props->data_page_version());
 }
 
 TEST(TestWriterProperties, AdvancedHandling) {
@@ -53,6 +54,7 @@ TEST(TestWriterProperties, AdvancedHandling) {
   builder.compression(Compression::SNAPPY);
   builder.encoding(Encoding::DELTA_BINARY_PACKED);
   builder.encoding("delta-length", Encoding::DELTA_LENGTH_BYTE_ARRAY);
+  builder.data_page_version(ParquetDataPageVersion::V2);
   std::shared_ptr<WriterProperties> props = builder.build();
 
   ASSERT_EQ(Compression::GZIP, props->compression(ColumnPath::FromDotString("gzip")));
@@ -63,6 +65,7 @@ TEST(TestWriterProperties, AdvancedHandling) {
             props->encoding(ColumnPath::FromDotString("gzip")));
   ASSERT_EQ(Encoding::DELTA_LENGTH_BYTE_ARRAY,
             props->encoding(ColumnPath::FromDotString("delta-length")));
+  ASSERT_EQ(ParquetDataPageVersion::V2, props->data_page_version());
 }
 
 TEST(TestReaderProperties, GetStreamInsufficientData) {
diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd
index e737113..2b370b3 100644
--- a/python/pyarrow/_parquet.pxd
+++ b/python/pyarrow/_parquet.pxd
@@ -347,6 +347,7 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil:
 cdef extern from "parquet/api/writer.h" namespace "parquet" nogil:
     cdef cppclass WriterProperties:
         cppclass Builder:
+            Builder* data_page_version(ParquetDataPageVersion version)
             Builder* version(ParquetVersion version)
             Builder* compression(ParquetCompression codec)
             Builder* compression(const c_string& path,
@@ -443,6 +444,14 @@ cdef extern from "parquet/properties.h" namespace "parquet" nogil:
         V1 "parquet::ArrowWriterProperties::V1",
         V2 "parquet::ArrowWriterProperties::V2"
 
+    cdef cppclass ParquetDataPageVersion:
+        pass
+
+    cdef ParquetDataPageVersion ParquetDataPageVersion_V1 \
+        " parquet::ParquetDataPageVersion::V1"
+    cdef ParquetDataPageVersion ParquetDataPageVersion_V2 \
+        " parquet::ParquetDataPageVersion::V2"
+
 cdef extern from "parquet/arrow/writer.h" namespace "parquet::arrow" nogil:
     cdef cppclass FileWriter:
 
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index a8dbc0e..de9b23a 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -1207,6 +1207,7 @@ cdef class ParquetWriter:
         object allow_truncated_timestamps
         object compression
         object compression_level
+        object data_page_version
         object version
         object write_statistics
         object writer_engine_version
@@ -1223,7 +1224,8 @@ cdef class ParquetWriter:
                   allow_truncated_timestamps=False,
                   compression_level=None,
                   use_byte_stream_split=False,
-                  writer_engine_version=None):
+                  writer_engine_version=None,
+                  data_page_version=None):
         cdef:
             shared_ptr[WriterProperties] properties
             c_string c_where
@@ -1250,8 +1252,10 @@ cdef class ParquetWriter:
         self.allow_truncated_timestamps = allow_truncated_timestamps
         self.use_byte_stream_split = use_byte_stream_split
         self.writer_engine_version = writer_engine_version
+        self.data_page_version = data_page_version
 
         cdef WriterProperties.Builder properties_builder
+        self._set_data_page_version(&properties_builder)
         self._set_version(&properties_builder)
         self._set_compression_props(&properties_builder)
         self._set_dictionary_props(&properties_builder)
@@ -1324,6 +1328,17 @@ cdef class ParquetWriter:
                 raise ValueError("Unsupported Parquet format version: {0}"
                                  .format(self.version))
 
+    cdef int _set_data_page_version(self, WriterProperties.Builder* props) \
+            except -1:
+        if self.data_page_version is not None:
+            if self.data_page_version == "1.0":
+                props.data_page_version(ParquetDataPageVersion_V1)
+            elif self.data_page_version == "2.0":
+                props.data_page_version(ParquetDataPageVersion_V2)
+            else:
+                raise ValueError("Unsupported Parquet data page version: {0}"
+                                 .format(self.data_page_version))
+
     cdef void _set_compression_props(self, WriterProperties.Builder* props) \
             except *:
         if isinstance(self.compression, basestring):
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index fcd7454..3158e8f 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -432,7 +432,15 @@ def _sanitize_table(table, new_schema, flavor):
 
 
 _parquet_writer_arg_docs = """version : {"1.0", "2.0"}, default "1.0"
-    The Parquet format version, defaults to 1.0.
+    Determine which Parquet logical types are available for use, whether the
+    reduced set from the Parquet 1.x.x format or the expanded logical types
+    added in format version 2.0.0 and after. Note that files written with
+    version='2.0' may not be readable in all Parquet implementations, so
+    version='1.0' is likely the choice that maximizes file compatibility. Some
+    features, such as lossless storage of nanosecond timestamps as INT64
+    physical storage, are only available with version='2.0'. The Parquet 2.0.0
+    format version also introduced a new serialized data page format; this can
+    be enabled separately using the data_page_version option.
 use_dictionary : bool or list
     Specify if we should use dictionary encoding in general or only for
     some columns.
@@ -481,6 +489,10 @@ writer_engine_version: str, default "V2"
     all nested types. V1 is legacy and will be removed in a future release.
     Setting the environment variable ARROW_PARQUET_WRITER_ENGINE will
     override the default.
+data_page_version : {"1.0", "2.0"}, default "1.0"
+    The serialized Parquet data page format version to write, defaults to
+    1.0. This does not impact the file schema logical types and Arrow to
+    Parquet type casting behavior; for that use the "version" option.
 """
 
 
@@ -511,6 +523,7 @@ schema : arrow Schema
                  compression_level=None,
                  use_byte_stream_split=False,
                  writer_engine_version=None,
+                 data_page_version='1.0',
                  **options):
         if use_deprecated_int96_timestamps is None:
             # Use int96 timestamps for Spark
@@ -549,6 +562,7 @@ schema : arrow Schema
             compression_level=compression_level,
             use_byte_stream_split=use_byte_stream_split,
             writer_engine_version=engine_version,
+            data_page_version=data_page_version,
             **options)
         self.is_open = True
 
@@ -1584,6 +1598,7 @@ def write_table(table, where, row_group_size=None, version='1.0',
                 filesystem=None,
                 compression_level=None,
                 use_byte_stream_split=False,
+                data_page_version='1.0',
                 **kwargs):
     row_group_size = kwargs.pop('chunk_size', row_group_size)
     use_int96 = use_deprecated_int96_timestamps
@@ -1602,6 +1617,7 @@ def write_table(table, where, row_group_size=None, version='1.0',
                 use_deprecated_int96_timestamps=use_int96,
                 compression_level=compression_level,
                 use_byte_stream_split=use_byte_stream_split,
+                data_page_version=data_page_version,
                 **kwargs) as writer:
             writer.write_table(table, row_group_size=row_group_size)
     except Exception:
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index f215eaa..5e6e227 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -210,6 +210,10 @@ def test_parquet_invalid_version(tempdir):
     table = pa.table({'a': [1, 2, 3]})
     with pytest.raises(ValueError, match="Unsupported Parquet format version"):
         _write_table(table, tempdir / 'test_version.parquet', version="2.2")
+    with pytest.raises(ValueError, match="Unsupported Parquet data page " +
+                       "version"):
+        _write_table(table, tempdir / 'test_version.parquet',
+                     data_page_version="2.2")
 
 
 @parametrize_legacy_dataset
@@ -230,16 +234,19 @@ def test_chunked_table_write(use_legacy_dataset):
     # ARROW-232
     df = alltypes_sample(size=10)
 
-    batch = pa.RecordBatch.from_pandas(df)
-    table = pa.Table.from_batches([batch] * 3)
-    _check_roundtrip(
-        table, version='2.0', use_legacy_dataset=use_legacy_dataset)
+    for data_page_version in ['1.0', '2.0']:
+        batch = pa.RecordBatch.from_pandas(df)
+        table = pa.Table.from_batches([batch] * 3)
+        _check_roundtrip(
+            table, version='2.0', use_legacy_dataset=use_legacy_dataset,
+            data_page_version=data_page_version)
 
-    df, _ = dataframe_with_lists()
-    batch = pa.RecordBatch.from_pandas(df)
-    table = pa.Table.from_batches([batch] * 3)
-    _check_roundtrip(
-        table, version='2.0', use_legacy_dataset=use_legacy_dataset)
+        df, _ = dataframe_with_lists()
+        batch = pa.RecordBatch.from_pandas(df)
+        table = pa.Table.from_batches([batch] * 3)
+        _check_roundtrip(
+            table, version='2.0', use_legacy_dataset=use_legacy_dataset,
+            data_page_version=data_page_version)
 
 
 @pytest.mark.pandas
@@ -3738,7 +3745,7 @@ def test_multi_dataset_metadata(tempdir):
         'one': [1, 2, 3],
         'two': [-1, -2, -3],
         'three': [[1, 2], [2, 3], [3, 4]],
-        })
+    })
     table = pa.Table.from_pandas(df)
 
     # write dataset twice and collect/merge metadata
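
The new option is independent from the existing "version" option, as the
docstring changes above explain. A minimal usage sketch from Python; the
file name and table contents are arbitrary examples, not part of the
change:

    import pyarrow as pa
    import pyarrow.parquet as pq

    table = pa.table({'x': [1, 2, 3]})

    # "version" selects which Parquet logical types may be used, while
    # "data_page_version" independently selects the serialized data page
    # layout. V2 data pages may not be readable by older readers.
    pq.write_table(table, 'example_v2_pages.parquet',
                   version='1.0', data_page_version='2.0')

Any value other than '1.0' or '2.0' for data_page_version raises
ValueError, as exercised by the test_parquet_invalid_version change above.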


[arrow] 12/17: ARROW-8728: [C++] Fix bitmap operation buffer overflow

Posted by ks...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch maint-0.17.x
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 57da2c186769f42ace236c60611706daf5c74adb
Author: Yibo Cai <yi...@arm.com>
AuthorDate: Thu May 7 14:27:12 2020 +0200

    ARROW-8728: [C++] Fix bitmap operation buffer overflow
    
    Closes #7123 from cyb70289/bitmap-bug
    
    Authored-by: Yibo Cai <yi...@arm.com>
    Signed-off-by: Antoine Pitrou <an...@python.org>
---
 cpp/src/arrow/util/bit_util.cc      | 2 +-
 cpp/src/arrow/util/bit_util_test.cc | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/src/arrow/util/bit_util.cc b/cpp/src/arrow/util/bit_util.cc
index 15bc561..5f17838 100644
--- a/cpp/src/arrow/util/bit_util.cc
+++ b/cpp/src/arrow/util/bit_util.cc
@@ -264,7 +264,7 @@ void AlignedBitmapOp(const uint8_t* left, int64_t left_offset, const uint8_t* ri
   DCHECK_EQ(left_offset % 8, right_offset % 8);
   DCHECK_EQ(left_offset % 8, out_offset % 8);
 
-  const int64_t nbytes = BitUtil::BytesForBits(length + left_offset);
+  const int64_t nbytes = BitUtil::BytesForBits(length + left_offset % 8);
   left += left_offset / 8;
   right += right_offset / 8;
   out += out_offset / 8;
diff --git a/cpp/src/arrow/util/bit_util_test.cc b/cpp/src/arrow/util/bit_util_test.cc
index a8e6279..e597b63 100644
--- a/cpp/src/arrow/util/bit_util_test.cc
+++ b/cpp/src/arrow/util/bit_util_test.cc
@@ -577,7 +577,7 @@ class BitmapOp : public TestBase {
     std::shared_ptr<Buffer> left, right, out;
     int64_t length;
 
-    for (int64_t left_offset : {0, 1, 3, 5, 7, 8, 13, 21, 38, 75, 120}) {
+    for (int64_t left_offset : {0, 1, 3, 5, 7, 8, 13, 21, 38, 75, 120, 65536}) {
       BitmapFromVector(left_bits, left_offset, &left, &length);
       for (int64_t right_offset : {left_offset, left_offset + 8, left_offset + 40}) {
         BitmapFromVector(right_bits, right_offset, &right, &length);
@@ -604,7 +604,7 @@ class BitmapOp : public TestBase {
                      const std::vector<int>& result_bits) {
     std::shared_ptr<Buffer> left, right, out;
     int64_t length;
-    auto offset_values = {0, 1, 3, 5, 7, 8, 13, 21, 38, 75, 120};
+    auto offset_values = {0, 1, 3, 5, 7, 8, 13, 21, 38, 75, 120, 65536};
 
     for (int64_t left_offset : offset_values) {
       BitmapFromVector(left_bits, left_offset, &left, &length);
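
The one-character fix above matters because the input pointers have
already been advanced by left_offset / 8 bytes, so only the remaining
within-byte part of the offset still precedes the data when computing how
many bytes to process. A small Python sketch of the arithmetic (an
illustration only, not Arrow code), using the 65536-bit offset added to
the tests:

    def bytes_for_bits(bits):
        # Bytes needed to hold `bits` bits, like BitUtil::BytesForBits.
        return (bits + 7) // 8

    length, left_offset = 8, 65536

    # Buggy sizing: counts the whole bit offset even though the pointer
    # was already advanced by left_offset // 8 bytes, overrunning the
    # buffer by thousands of bytes.
    buggy_nbytes = bytes_for_bits(length + left_offset)      # 8193
    # Fixed sizing: only the sub-byte remainder is added.
    fixed_nbytes = bytes_for_bits(length + left_offset % 8)  # 1

    print(buggy_nbytes, fixed_nbytes)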


[arrow] 14/17: ARROW-8758: [R] Updates for compatibility with dplyr 1.0

Posted by ks...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch maint-0.17.x
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 116ed8832834d102583597be6031e9cb28e5dfb2
Author: Neal Richardson <ne...@gmail.com>
AuthorDate: Mon May 11 10:15:27 2020 -0700

    ARROW-8758: [R] Updates for compatibility with dplyr 1.0
    
    I tested this locally with the current version of `dplyr` on CRAN and the dev version scheduled to be released to CRAN on May 15. Our tests now pass with both versions.
    
    Changes addressed:
    
    * `group_by` now requires a character vector of grouping variable names, so now we use `group_vars()` instead of `groups()`. `group_vars()` works in the current `dplyr` release, so this is a simple change.
    * The argument name in `group_by()` changed from `add` to `.add`, and calling it with the name that works in the current version raises a deprecation warning in dplyr 1.0. The fix here supports both spellings of the argument, and it avoids the warning by determining which version of the internal dplyr function exists and calling the appropriate one.
    * `dplyr::transmute()` no longer calls `dplyr::mutate()` internally, so it doesn't just work on Arrow objects anymore. I skipped the one test that called it and left a TODO to add a transmute method.
    
    Closes #7147 from nealrichardson/dplyr-1.0
    
    Authored-by: Neal Richardson <ne...@gmail.com>
    Signed-off-by: Neal Richardson <ne...@gmail.com>
---
 r/NEWS.md                     |  4 ++++
 r/R/dplyr.R                   | 13 +++++++++----
 r/tests/testthat/test-dplyr.R |  1 +
 3 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/r/NEWS.md b/r/NEWS.md
index 7cf8ef9..231b3d3 100644
--- a/r/NEWS.md
+++ b/r/NEWS.md
@@ -17,6 +17,10 @@
   under the License.
 -->
 
+# arrow 0.17.0.9000
+
+* Updates for compatibility with `dplyr` 1.0
+
 # arrow 0.17.0
 
 ## Feather v2
diff --git a/r/R/dplyr.R b/r/R/dplyr.R
index 1d3b5ee..5ea0a7a 100644
--- a/r/R/dplyr.R
+++ b/r/R/dplyr.R
@@ -264,7 +264,7 @@ restore_dplyr_features <- function(df, query) {
   }
   # Preserve groupings, if present
   if (length(query$group_by_vars)) {
-    df <- dplyr::grouped_df(df, dplyr::groups(query))
+    df <- dplyr::grouped_df(df, dplyr::group_vars(query))
   }
   df
 }
@@ -294,9 +294,15 @@ summarise.arrow_dplyr_query <- function(.data, ...) {
 }
 summarise.Dataset <- summarise.Table <- summarise.RecordBatch <- summarise.arrow_dplyr_query
 
-group_by.arrow_dplyr_query <- function(.data, ..., add = FALSE) {
+group_by.arrow_dplyr_query <- function(.data, ..., .add = FALSE, add = .add) {
   .data <- arrow_dplyr_query(.data)
-  .data$group_by_vars <- dplyr::group_by_prepare(.data, ..., add = add)$group_names
+  if (".add" %in% names(formals(dplyr::group_by))) {
+    # dplyr >= 1.0
+    gv <- dplyr::group_by_prepare(.data, ..., .add = .add)$group_names
+  } else {
+    gv <- dplyr::group_by_prepare(.data, ..., add = add)$group_names
+  }
+  .data$group_by_vars <- gv
   .data
 }
 group_by.Dataset <- group_by.Table <- group_by.RecordBatch <- group_by.arrow_dplyr_query
@@ -324,7 +330,6 @@ mutate.arrow_dplyr_query <- function(.data, ...) {
   dplyr::mutate(dplyr::collect(.data), ...)
 }
 mutate.Dataset <- mutate.Table <- mutate.RecordBatch <- mutate.arrow_dplyr_query
-# transmute() "just works" because it calls mutate() internally
 # TODO: add transmute() that does what summarise() does (select only the vars we need)
 
 arrange.arrow_dplyr_query <- function(.data, ...) {
diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R
index e531fb3..3ae915e 100644
--- a/r/tests/testthat/test-dplyr.R
+++ b/r/tests/testthat/test-dplyr.R
@@ -228,6 +228,7 @@ test_that("mutate", {
 })
 
 test_that("transmute", {
+  skip("TODO: reimplement transmute (with dplyr 1.0, it no longer just works via mutate)")
   expect_dplyr_equal(
     input %>%
       select(int, chr) %>%


[arrow] 15/17: ARROW-8750: [Python] Correctly default to lz4 compression for Feather V2 in Python

Posted by ks...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch maint-0.17.x
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit bda2b5b35d9fcab93444243ee6fa5498b5059b0d
Author: Wes McKinney <we...@apache.org>
AuthorDate: Tue May 12 12:01:37 2020 -0500

    ARROW-8750: [Python] Correctly default to lz4 compression for Feather V2 in Python
    
    This was the intention, but I had not implemented it correctly. It is now tested to be so.
    
    Closes #7150 from wesm/ARROW-8750
    
    Authored-by: Wes McKinney <we...@apache.org>
    Signed-off-by: Wes McKinney <we...@apache.org>
---
 python/pyarrow/feather.py            | 20 +++++++++++++-------
 python/pyarrow/io.pxi                | 13 ++++++++-----
 python/pyarrow/tests/test_feather.py | 20 ++++++++++++++++++++
 3 files changed, 41 insertions(+), 12 deletions(-)

diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py
index a599e15..6071b5e 100644
--- a/python/pyarrow/feather.py
+++ b/python/pyarrow/feather.py
@@ -19,8 +19,8 @@
 import os
 
 from pyarrow.pandas_compat import _pandas_api  # noqa
-from pyarrow.lib import FeatherError  # noqa
-from pyarrow.lib import Table, concat_tables, schema
+from pyarrow.lib import (Codec, FeatherError, Table,  # noqa
+                         concat_tables, schema)
 import pyarrow.lib as ext
 
 
@@ -112,6 +112,9 @@ def check_chunked_overflow(name, col):
                          "Feather format".format(name, str(col.type)))
 
 
+_FEATHER_SUPPORTED_CODECS = {'lz4', 'zstd', 'uncompressed'}
+
+
 def write_feather(df, dest, compression=None, compression_level=None,
                   chunksize=None, version=2):
     """
@@ -165,11 +168,14 @@ def write_feather(df, dest, compression=None, compression_level=None,
         if chunksize is not None:
             raise ValueError("Feather V1 files do not support chunksize "
                              "option")
-
-    supported_compression_options = (None, 'lz4', 'zstd', 'uncompressed')
-    if compression not in supported_compression_options:
-        raise ValueError('compression="{}" not supported, must be one of {}'
-                         .format(compression, supported_compression_options))
+    else:
+        if compression is None and Codec.is_available('lz4_frame'):
+            compression = 'lz4'
+        elif (compression is not None and
+              compression not in _FEATHER_SUPPORTED_CODECS):
+            raise ValueError('compression="{}" not supported, must be '
+                             'one of {}'.format(compression,
+                                                _FEATHER_SUPPORTED_CODECS))
 
     try:
         ext.write_feather(table, dest, compression=compression,
diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi
index c4538db..d86f80c 100644
--- a/python/pyarrow/io.pxi
+++ b/python/pyarrow/io.pxi
@@ -1472,8 +1472,10 @@ cdef CCompressionType _ensure_compression(str name) except *:
         return CCompressionType_BZ2
     elif uppercase == 'BROTLI':
         return CCompressionType_BROTLI
-    elif uppercase == 'LZ4':
+    elif uppercase == 'LZ4' or uppercase == 'LZ4_FRAME':
         return CCompressionType_LZ4_FRAME
+    elif uppercase == 'LZ4_RAW':
+        return CCompressionType_LZ4
     elif uppercase == 'ZSTD':
         return CCompressionType_ZSTD
     elif uppercase == 'SNAPPY':
@@ -1489,8 +1491,9 @@ cdef class Codec:
     Parameters
     ----------
     compression : str
-        Type of compression codec to initialize, valid values are: gzip, bz2,
-        brotli, lz4, zstd and snappy.
+        Type of compression codec to initialize, valid values are: 'gzip',
+        'bz2', 'brotli', 'lz4' (or 'lz4_frame'), 'lz4_raw', 'zstd' and
+        'snappy'.
 
     Raises
     ------
@@ -1674,7 +1677,7 @@ def compress(object buf, codec='lz4', asbytes=False, memory_pool=None):
     buf : pyarrow.Buffer, bytes, or other object supporting buffer protocol
     codec : str, default 'lz4'
         Compression codec.
-        Supported types: {'brotli, 'gzip', 'lz4', 'snappy', 'zstd'}
+        Supported types: {'brotli, 'gzip', 'lz4', 'lz4_raw', 'snappy', 'zstd'}
     asbytes : bool, default False
         Return result as Python bytes object, otherwise Buffer.
     memory_pool : MemoryPool, default None
@@ -1702,7 +1705,7 @@ def decompress(object buf, decompressed_size=None, codec='lz4',
         the uncompressed buffer size.
     codec : str, default 'lz4'
         Compression codec.
-        Supported types: {'brotli, 'gzip', 'lz4', 'snappy', 'zstd'}
+        Supported types: {'brotli, 'gzip', 'lz4', 'lz4_raw', 'snappy', 'zstd'}
     asbytes : bool, default False
         Return result as Python bytes object, otherwise Buffer.
     memory_pool : MemoryPool, default None
diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py
index fbfcade..06dfac9 100644
--- a/python/pyarrow/tests/test_feather.py
+++ b/python/pyarrow/tests/test_feather.py
@@ -637,6 +637,26 @@ def test_v2_compression_options():
         write_feather(df, buf, compression='snappy')
 
 
+def test_v2_lz4_default_compression():
+    # ARROW-8750: Make sure that the compression=None option selects lz4 if
+    # it's available
+    if not pa.Codec.is_available('lz4_frame'):
+        pytest.skip("LZ4 compression support is not built in C++")
+
+    # some highly compressible data
+    t = pa.table([np.repeat(0, 100000)], names=['f0'])
+
+    buf = io.BytesIO()
+    write_feather(t, buf)
+    default_result = buf.getvalue()
+
+    buf = io.BytesIO()
+    write_feather(t, buf, compression='uncompressed')
+    uncompressed_result = buf.getvalue()
+
+    assert len(default_result) < len(uncompressed_result)
+
+
 def test_v1_unsupported_types():
     table = pa.table([pa.array([[1, 2, 3], [], None])], names=['f0'])
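
A short usage sketch of the behaviour fixed above: with no compression
argument, Feather V2 files now default to LZ4 frame compression whenever
the codec is available in the C++ build. The file names are arbitrary
examples:

    import pyarrow as pa
    import pyarrow.feather as feather

    table = pa.table({'f0': [0] * 100000})

    # With compression=None (the default) and LZ4 support compiled in,
    # write_feather now selects 'lz4' for Feather V2 files.
    if pa.Codec.is_available('lz4_frame'):
        feather.write_feather(table, 'data_default.feather')

    # Compression can still be chosen or disabled explicitly.
    feather.write_feather(table, 'data_uncompressed.feather',
                          compression='uncompressed')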
 


[arrow] 04/17: ARROW-8584: [C++] Fix ORC link order

Posted by ks...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch maint-0.17.x
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit d6210c8d12424133fb3ccb3282574ada0daa9e10
Author: Sutou Kouhei <ko...@clear-code.com>
AuthorDate: Sun Apr 26 17:01:42 2020 +0900

    ARROW-8584: [C++] Fix ORC link order
    
    Closes #7041 from kou/cpp-fix-orc-link-order
    
    Authored-by: Sutou Kouhei <ko...@clear-code.com>
    Signed-off-by: Sutou Kouhei <ko...@clear-code.com>
---
 cpp/CMakeLists.txt | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 45a6f2e..e28ae16 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -676,10 +676,10 @@ if(ARROW_WITH_ZSTD)
 endif()
 
 if(ARROW_ORC)
-  list(APPEND ARROW_LINK_LIBS ${ARROW_PROTOBUF_LIBPROTOBUF} orc::liborc)
-  list(APPEND ARROW_STATIC_LINK_LIBS ${ARROW_PROTOBUF_LIBPROTOBUF} orc::liborc)
-  list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS ${ARROW_PROTOBUF_LIBPROTOBUF}
-              orc::liborc)
+  list(APPEND ARROW_LINK_LIBS orc::liborc ${ARROW_PROTOBUF_LIBPROTOBUF})
+  list(APPEND ARROW_STATIC_LINK_LIBS orc::liborc ${ARROW_PROTOBUF_LIBPROTOBUF})
+  list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS orc::liborc
+              ${ARROW_PROTOBUF_LIBPROTOBUF})
 endif()
 
 if(ARROW_USE_GLOG)


[arrow] 10/17: PARQUET-1857: [C++] Do not fail to read unencrypted files with over 32767 row groups. Change some DCHECKs causing segfaults to throw exceptions

Posted by ks...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch maint-0.17.x
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 5fb626dcf7c6cf0ab347f32c157e053c5c99fbc8
Author: Wes McKinney <we...@apache.org>
AuthorDate: Wed May 6 11:28:35 2020 -0500

    PARQUET-1857: [C++] Do not fail to read unencrypted files with over 32767 row groups. Change some DCHECKs causing segfaults to throw exceptions
    
    While writing files with so many row groups is a bad idea, people will still do it, and the C++ library currently segfaults in release builds when trying to read such a file. This removes those potential segfaults and enables reading the many-row-group files again. Files with encrypted row group metadata and that many row groups cannot be read, because the Parquet metadata uses an int16 row group ordinal key.
    
    Closes #7108 from wesm/PARQUET-1857
    
    Authored-by: Wes McKinney <we...@apache.org>
    Signed-off-by: Wes McKinney <we...@apache.org>
---
 cpp/src/parquet/file_reader.cc       | 40 +++++++++++++++++++++++-------------
 cpp/src/parquet/reader_test.cc       | 11 ++++++++++
 python/pyarrow/tests/test_parquet.py | 11 ++++++++++
 3 files changed, 48 insertions(+), 14 deletions(-)

diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc
index 167321c..46c28a5 100644
--- a/cpp/src/parquet/file_reader.cc
+++ b/cpp/src/parquet/file_reader.cc
@@ -57,9 +57,12 @@ RowGroupReader::RowGroupReader(std::unique_ptr<Contents> contents)
     : contents_(std::move(contents)) {}
 
 std::shared_ptr<ColumnReader> RowGroupReader::Column(int i) {
-  DCHECK(i < metadata()->num_columns())
-      << "The RowGroup only has " << metadata()->num_columns()
-      << "columns, requested column: " << i;
+  if (i >= metadata()->num_columns()) {
+    std::stringstream ss;
+    ss << "Trying to read column index " << i << " but row group metadata has only "
+       << metadata()->num_columns() << " columns";
+    throw ParquetException(ss.str());
+  }
   const ColumnDescriptor* descr = metadata()->schema()->Column(i);
 
   std::unique_ptr<PageReader> page_reader = contents_->GetColumnPageReader(i);
@@ -69,9 +72,12 @@ std::shared_ptr<ColumnReader> RowGroupReader::Column(int i) {
 }
 
 std::unique_ptr<PageReader> RowGroupReader::GetColumnPageReader(int i) {
-  DCHECK(i < metadata()->num_columns())
-      << "The RowGroup only has " << metadata()->num_columns()
-      << "columns, requested column: " << i;
+  if (i >= metadata()->num_columns()) {
+    std::stringstream ss;
+    ss << "Trying to read column index " << i << " but row group metadata has only "
+       << metadata()->num_columns() << " columns";
+    throw ParquetException(ss.str());
+  }
   return contents_->GetColumnPageReader(i);
 }
 
@@ -136,6 +142,11 @@ class SerializedRowGroup : public RowGroupReader::Contents {
       throw ParquetException("RowGroup is noted as encrypted but no file decryptor");
     }
 
+    constexpr auto kEncryptedRowGroupsLimit = 32767;
+    if (i > kEncryptedRowGroupsLimit) {
+      throw ParquetException("Encrypted files cannot contain more than 32767 row groups");
+    }
+
     // The column is encrypted
     std::shared_ptr<Decryptor> meta_decryptor;
     std::shared_ptr<Decryptor> data_decryptor;
@@ -170,7 +181,7 @@ class SerializedRowGroup : public RowGroupReader::Contents {
   FileMetaData* file_metadata_;
   std::unique_ptr<RowGroupMetaData> row_group_metadata_;
   ReaderProperties properties_;
-  int16_t row_group_ordinal_;
+  int row_group_ordinal_;
   std::shared_ptr<InternalFileDecryptor> file_decryptor_;
 };
 
@@ -200,9 +211,8 @@ class SerializedFile : public ParquetFileReader::Contents {
   }
 
   std::shared_ptr<RowGroupReader> GetRowGroup(int i) override {
-    std::unique_ptr<SerializedRowGroup> contents(
-        new SerializedRowGroup(source_, source_size_, file_metadata_.get(),
-                               static_cast<int16_t>(i), properties_, file_decryptor_));
+    std::unique_ptr<SerializedRowGroup> contents(new SerializedRowGroup(
+        source_, source_size_, file_metadata_.get(), i, properties_, file_decryptor_));
     return std::make_shared<RowGroupReader>(std::move(contents));
   }
 
@@ -529,10 +539,12 @@ std::shared_ptr<FileMetaData> ParquetFileReader::metadata() const {
 }
 
 std::shared_ptr<RowGroupReader> ParquetFileReader::RowGroup(int i) {
-  DCHECK(i < metadata()->num_row_groups())
-      << "The file only has " << metadata()->num_row_groups()
-      << "row groups, requested reader for: " << i;
-
+  if (i >= metadata()->num_row_groups()) {
+    std::stringstream ss;
+    ss << "Trying to read row group " << i << " but file only has "
+       << metadata()->num_row_groups() << " row groups";
+    throw ParquetException(ss.str());
+  }
   return contents_->GetRowGroup(i);
 }
 
diff --git a/cpp/src/parquet/reader_test.cc b/cpp/src/parquet/reader_test.cc
index f28b226..a271075 100644
--- a/cpp/src/parquet/reader_test.cc
+++ b/cpp/src/parquet/reader_test.cc
@@ -105,6 +105,17 @@ TEST_F(TestAllTypesPlain, TestBatchRead) {
   ASSERT_FALSE(col->HasNext());
 }
 
+TEST_F(TestAllTypesPlain, RowGroupColumnBoundchecking) {
+  // Part of PARQUET-1857
+  ASSERT_THROW(reader_->RowGroup(reader_->metadata()->num_row_groups()),
+               ParquetException);
+
+  auto row_group = reader_->RowGroup(0);
+  ASSERT_THROW(row_group->Column(row_group->metadata()->num_columns()), ParquetException);
+  ASSERT_THROW(row_group->GetColumnPageReader(row_group->metadata()->num_columns()),
+               ParquetException);
+}
+
 TEST_F(TestAllTypesPlain, TestFlatScannerInt32) {
   std::shared_ptr<RowGroupReader> group = reader_->RowGroup(0);
 
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 86eb964..f215eaa 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -288,6 +288,17 @@ def test_special_chars_filename(tempdir, use_legacy_dataset):
     assert table_read.equals(table)
 
 
+@pytest.mark.slow
+def test_file_with_over_int16_max_row_groups():
+    # PARQUET-1857: Parquet encryption support introduced a INT16_MAX upper
+    # limit on the number of row groups, but this limit only impacts files with
+    # encrypted row group metadata because of the int16 row group ordinal used
+    # in the Parquet Thrift metadata. Unencrypted files are not impacted, so
+    # this test checks that it works (even if it isn't a good idea)
+    t = pa.table([list(range(40000))], names=['f0'])
+    _check_roundtrip(t, row_group_size=1)
+
+
 @pytest.mark.pandas
 @parametrize_legacy_dataset
 def test_empty_table_roundtrip(use_legacy_dataset):
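
From Python, this change means that unencrypted files with more than 32767
row groups read correctly again, and that out-of-range row group or column
requests surface as exceptions rather than crashing a release build. A
sketch mirroring the new test above (slow, since every row becomes its own
row group); the file name is an arbitrary example:

    import pyarrow as pa
    import pyarrow.parquet as pq

    # PARQUET-1857: write a file with 40000 row groups and read it back.
    table = pa.table({'f0': list(range(40000))})
    pq.write_table(table, 'many_row_groups.parquet', row_group_size=1)

    result = pq.read_table('many_row_groups.parquet')
    assert result.equals(table)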


[arrow] 05/17: ARROW-8609: [C++] Fix ORC Java JNI crash

Posted by ks...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch maint-0.17.x
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 2f3002148ea234a34e569eab1b97bf78bee0fcab
Author: Yuan Zhou <yu...@intel.com>
AuthorDate: Wed Apr 29 09:01:48 2020 +0200

    ARROW-8609: [C++] Fix ORC Java JNI crash
    
    Check if the Arrow buffer is null before passing it to the constructor.
    
    Signed-off-by: Yuan Zhou <yu...@intel.com>
    
    Closes #7048 from zhouyuan/fix_orc_jni
    
    Authored-by: Yuan Zhou <yu...@intel.com>
    Signed-off-by: Krisztián Szűcs <sz...@gmail.com>
---
 ci/scripts/java_test.sh                                       |  2 +-
 cpp/src/jni/orc/jni_wrapper.cpp                               | 11 +++++++++--
 .../test/java/org/apache/arrow/adapter/orc/OrcReaderTest.java |  2 --
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/ci/scripts/java_test.sh b/ci/scripts/java_test.sh
index 1383388..a30fb00 100755
--- a/ci/scripts/java_test.sh
+++ b/ci/scripts/java_test.sh
@@ -33,7 +33,7 @@ pushd ${source_dir}
 ${mvn} test
 
 if [ "${ARROW_GANDIVA_JAVA}" = "ON" ]; then
-  ${mvn} test -Parrow-jni -pl gandiva -Darrow.cpp.build.dir=${cpp_build_dir}
+  ${mvn} test -Parrow-jni -pl adapter/orc,gandiva -Darrow.cpp.build.dir=${cpp_build_dir}
 fi
 
 if [ "${ARROW_PLASMA}" = "ON" ]; then
diff --git a/cpp/src/jni/orc/jni_wrapper.cpp b/cpp/src/jni/orc/jni_wrapper.cpp
index ce05467..a341928 100644
--- a/cpp/src/jni/orc/jni_wrapper.cpp
+++ b/cpp/src/jni/orc/jni_wrapper.cpp
@@ -276,9 +276,16 @@ Java_org_apache_arrow_adapter_orc_OrcStripeReaderJniWrapper_next(JNIEnv* env,
 
   for (size_t j = 0; j < buffers.size(); ++j) {
     auto buffer = buffers[j];
+    uint8_t* data = nullptr;
+    int size = 0;
+    int64_t capacity = 0;
+    if (buffer != nullptr) {
+      data = (uint8_t*)buffer->data();
+      size = (int)buffer->size();
+      capacity = buffer->capacity();
+    }
     jobject memory = env->NewObject(orc_memory_class, orc_memory_constructor,
-                                    buffer_holder_.Insert(buffer), buffer->data(),
-                                    buffer->size(), buffer->capacity());
+                                    buffer_holder_.Insert(buffer), data, size, capacity);
     env->SetObjectArrayElement(memory_array, j, memory);
   }
 
diff --git a/java/adapter/orc/src/test/java/org/apache/arrow/adapter/orc/OrcReaderTest.java b/java/adapter/orc/src/test/java/org/apache/arrow/adapter/orc/OrcReaderTest.java
index cc95b82..00f47ee 100644
--- a/java/adapter/orc/src/test/java/org/apache/arrow/adapter/orc/OrcReaderTest.java
+++ b/java/adapter/orc/src/test/java/org/apache/arrow/adapter/orc/OrcReaderTest.java
@@ -41,13 +41,11 @@ import org.apache.orc.OrcFile;
 import org.apache.orc.TypeDescription;
 import org.apache.orc.Writer;
 import org.junit.BeforeClass;
-import org.junit.Ignore;
 import org.junit.Rule;
 import org.junit.Test;
 import org.junit.rules.TemporaryFolder;
 
 
-@Ignore
 public class OrcReaderTest {
 
   @Rule


[arrow] 08/17: ARROW-8704: [C++] Fix Parquet undefined behaviour on invalid input

Posted by ks...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch maint-0.17.x
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit c85a2fc68bf3f129c43008180ed27fc038c45c12
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Tue May 5 08:49:15 2020 -0400

    ARROW-8704: [C++] Fix Parquet undefined behaviour on invalid input
    
    Should fix https://oss-fuzz.com/issue/5673014655188992 .
    
    Closes #7105 from pitrou/ARROW-8704-parquet-oss-fuzz
    
    Authored-by: Antoine Pitrou <an...@python.org>
    Signed-off-by: François Saint-Jacques <fs...@gmail.com>
---
 cpp/src/parquet/column_reader.cc | 6 +++++-
 testing                          | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc
index f746eb7..ee497ab 100644
--- a/cpp/src/parquet/column_reader.cc
+++ b/cpp/src/parquet/column_reader.cc
@@ -35,6 +35,7 @@
 #include "arrow/util/bit_stream_utils.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/compression.h"
+#include "arrow/util/int_util.h"
 #include "arrow/util/logging.h"
 #include "arrow/util/rle_encoding.h"
 #include "parquet/column_page.h"
@@ -1024,7 +1025,10 @@ class TypedRecordReader : public ColumnReaderImplBase<DType>,
 
   // Compute the values capacity in bytes for the given number of elements
   int64_t bytes_for_values(int64_t nitems) const {
-    int type_size = GetTypeByteSize(this->descr_->physical_type());
+    int64_t type_size = GetTypeByteSize(this->descr_->physical_type());
+    if (::arrow::internal::HasMultiplyOverflow(nitems, type_size)) {
+      throw ParquetException("Total size of items too large");
+    }
     return nitems * type_size;
   }
 
diff --git a/testing b/testing
index 3772a1b..7660b5f 160000
--- a/testing
+++ b/testing
@@ -1 +1 @@
-Subproject commit 3772a1b0ba8ae163b4ff4f17a0b8bd89a46a7660
+Subproject commit 7660b5fb3dca9f21996ac239d9f520e631f4d3e5


[arrow] 01/17: ARROW-8501: [Packaging][RPM] Upgrade devtoolset to 8 on CentOS 6

Posted by ks...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch maint-0.17.x
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit b43067b2cc90b1ead205d24ef592742821b09c48
Author: Sutou Kouhei <ko...@clear-code.com>
AuthorDate: Sat Apr 18 07:21:31 2020 +0900

    ARROW-8501: [Packaging][RPM] Upgrade devtoolset to 8 on CentOS 6
    
    It seems that devtoolset-6 has been removed.
    
    Closes #6971 from kou/packaging-centos-6-upgrade-devtoolset
    
    Authored-by: Sutou Kouhei <ko...@clear-code.com>
    Signed-off-by: Sutou Kouhei <ko...@clear-code.com>
---
 dev/tasks/linux-packages/apache-arrow/yum/centos-6/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dev/tasks/linux-packages/apache-arrow/yum/centos-6/Dockerfile b/dev/tasks/linux-packages/apache-arrow/yum/centos-6/Dockerfile
index 08e6f15..60c312a 100644
--- a/dev/tasks/linux-packages/apache-arrow/yum/centos-6/Dockerfile
+++ b/dev/tasks/linux-packages/apache-arrow/yum/centos-6/Dockerfile
@@ -20,7 +20,7 @@ FROM centos:6
 ARG DEBUG
 
 ENV \
-  DEVTOOLSET_VERSION=6 \
+  DEVTOOLSET_VERSION=8 \
   LIBARCHIVE_SRPM_BASE=libarchive-3.1.2-10.el7_2.src.rpm \
   SRPM_DOWNLOAD_URL=http://vault.centos.org/7.6.1810/os/Source/SPackages
 


[arrow] 09/17: ARROW-8694: [C++][Parquet] Relax string size limit when deserializing Thrift messages

Posted by ks...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch maint-0.17.x
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit b90321bc031773de67bf0da5c0c765925a833768
Author: Wes McKinney <we...@apache.org>
AuthorDate: Tue May 5 16:40:14 2020 -0500

    ARROW-8694: [C++][Parquet] Relax string size limit when deserializing Thrift messages
    
    While it's not an ideal use case for Parquet, the 10MB limit for strings was causing a Thrift deserialization failure due to the large "pandas metadata" JSON blob written with the Schema when there are many columns. A 100MB limit should still catch "memory bombs" caused by nefarious input while allowing fairly wide data frames to be stored successfully.
    
    Closes #7103 from wesm/ARROW-8694
    
    Authored-by: Wes McKinney <we...@apache.org>
    Signed-off-by: Wes McKinney <we...@apache.org>
---
 cpp/src/parquet/thrift_internal.h    | 2 +-
 python/pyarrow/tests/test_parquet.py | 9 +++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/cpp/src/parquet/thrift_internal.h b/cpp/src/parquet/thrift_internal.h
index 5a988c7..3e823c7 100644
--- a/cpp/src/parquet/thrift_internal.h
+++ b/cpp/src/parquet/thrift_internal.h
@@ -362,7 +362,7 @@ inline void DeserializeThriftUnencryptedMsg(const uint8_t* buf, uint32_t* len,
       new ThriftBuffer(const_cast<uint8_t*>(buf), *len));
   apache::thrift::protocol::TCompactProtocolFactoryT<ThriftBuffer> tproto_factory;
   // Protect against CPU and memory bombs
-  tproto_factory.setStringSizeLimit(10 * 1000 * 1000);
+  tproto_factory.setStringSizeLimit(100 * 1000 * 1000);
   tproto_factory.setContainerSizeLimit(10 * 1000 * 1000);
   shared_ptr<apache::thrift::protocol::TProtocol> tproto =  //
       tproto_factory.getProtocol(tmem_transport);
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index c7d04c4..86eb964 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -3823,6 +3823,15 @@ def test_fastparquet_cross_compatibility(tempdir):
     tm.assert_frame_equal(table_fp.to_pandas(), df)
 
 
+def test_table_large_metadata():
+    # ARROW-8694
+    my_schema = pa.schema([pa.field('f0', 'double')],
+                          metadata={'large': 'x' * 10000000})
+
+    table = pa.table([np.arange(10)], schema=my_schema)
+    _check_roundtrip(table)
+
+
 @parametrize_legacy_dataset_skip_buffer
 @pytest.mark.parametrize('array_factory', [
     lambda: pa.array([0, None] * 10),
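
The scenario described above is easy to reproduce from Python: a single
large schema-level metadata value, such as the JSON "pandas metadata" blob
written for very wide data frames, could previously trip the 10 MB Thrift
string limit on read. A sketch mirroring the new test, with an arbitrary
file name:

    import numpy as np
    import pyarrow as pa
    import pyarrow.parquet as pq

    # Roughly 10 MB of schema-level metadata, comparable to a large
    # "pandas metadata" JSON blob; reading this back previously could
    # fail against the old Thrift string size limit.
    schema = pa.schema([pa.field('f0', 'double')],
                       metadata={'large': 'x' * (10 * 1000 * 1000)})
    table = pa.table([np.arange(10.0)], schema=schema)

    pq.write_table(table, 'large_metadata.parquet')
    restored = pq.read_table('large_metadata.parquet')
    assert len(restored.schema.metadata[b'large']) == 10 * 1000 * 1000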


[arrow] 06/17: ARROW-8608: [C++] Update vendored 'variant.hpp' to fix CUDA 10.2

Posted by ks...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch maint-0.17.x
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 136506c54d971c783ee690afe671bd8bf133b811
Author: Paul Taylor <pa...@me.com>
AuthorDate: Fri May 1 13:59:28 2020 -0400

    ARROW-8608: [C++] Update vendored 'variant.hpp' to fix CUDA 10.2
    
    Updates vendored `variant.hpp` to [latest](https://github.com/mpark/variant/commit/d1cdfdd3f2ed80710ba4d671fe6bffaa3e28201a) with fixes for CUDA 10.2 nvcc.
    
    Fixes https://issues.apache.org/jira/browse/ARROW-8608.
    
    Closes #7053 from trxcllnt/ARROW-8608
    
    Lead-authored-by: Paul Taylor <pa...@me.com>
    Co-authored-by: ptaylor <pa...@me.com>
    Signed-off-by: François Saint-Jacques <fs...@gmail.com>
---
 cpp/src/arrow/vendored/variant.hpp | 142 +++++++++++++++++++++----------------
 1 file changed, 81 insertions(+), 61 deletions(-)

diff --git a/cpp/src/arrow/vendored/variant.hpp b/cpp/src/arrow/vendored/variant.hpp
index 8af0bfe..d558803 100644
--- a/cpp/src/arrow/vendored/variant.hpp
+++ b/cpp/src/arrow/vendored/variant.hpp
@@ -1,4 +1,4 @@
-// Vendored from v1.4.0, from single-header branch at https://github.com/mpark/variant
+// Vendored from v1.4.0, from single-header branch at https://github.com/mpark/variant/commit/d1cdfdd3f2ed80710ba4d671fe6bffaa3e28201a
 
 // MPark.Variant
 //
@@ -200,6 +200,7 @@ namespace std {
 #include <exception>
 #include <functional>
 #include <initializer_list>
+#include <limits>
 #include <new>
 #include <type_traits>
 #include <utility>
@@ -282,7 +283,7 @@ namespace std {
 #define MPARK_INTEGER_SEQUENCE
 #endif
 
-#if defined(__cpp_return_type_deduction) || defined(_MSC_VER)
+#if (defined(__cpp_decltype_auto) && defined(__cpp_return_type_deduction)) || defined(_MSC_VER)
 #define MPARK_RETURN_TYPE_DEDUCTION
 #endif
 
@@ -389,6 +390,10 @@ namespace mpark {
       using remove_reference_t = typename std::remove_reference<T>::type;
 
       template <typename T>
+      using remove_cvref_t =
+          typename std::remove_cv<remove_reference_t<T>>::type;
+
+      template <typename T>
       inline constexpr T &&forward(remove_reference_t<T> &t) noexcept {
         return static_cast<T &&>(t);
       }
@@ -693,44 +698,6 @@ namespace mpark {
       template <typename R, typename F, typename... Args>
       using is_invocable_r = detail::is_invocable_r<void, R, F, Args...>;
 
-      namespace detail {
-
-        template <bool Invocable, typename F, typename... Args>
-        struct is_nothrow_invocable {
-          static constexpr bool value =
-              noexcept(lib::invoke(std::declval<F>(), std::declval<Args>()...));
-        };
-
-        template <typename F, typename... Args>
-        struct is_nothrow_invocable<false, F, Args...> : std::false_type {};
-
-        template <bool Invocable, typename R, typename F, typename... Args>
-        struct is_nothrow_invocable_r {
-          private:
-          inline static R impl() {
-            return lib::invoke(std::declval<F>(), std::declval<Args>()...);
-          }
-
-          public:
-          static constexpr bool value = noexcept(impl());
-        };
-
-        template <typename R, typename F, typename... Args>
-        struct is_nothrow_invocable_r<false, R, F, Args...> : std::false_type {};
-
-      }  // namespace detail
-
-      template <typename F, typename... Args>
-      using is_nothrow_invocable = detail::
-          is_nothrow_invocable<is_invocable<F, Args...>::value, F, Args...>;
-
-      template <typename R, typename F, typename... Args>
-      using is_nothrow_invocable_r =
-          detail::is_nothrow_invocable_r<is_invocable_r<R, F, Args...>::value,
-                                         R,
-                                         F,
-                                         Args...>;
-
       // <memory>
 #ifdef MPARK_BUILTIN_ADDRESSOF
       template <typename T>
@@ -1697,13 +1664,21 @@ namespace mpark {
 
 #undef MPARK_VARIANT_RECURSIVE_UNION
 
-    using index_t = unsigned int;
+    template <typename... Ts>
+    using index_t = typename std::conditional<
+            sizeof...(Ts) < (std::numeric_limits<unsigned char>::max)(),
+            unsigned char,
+            typename std::conditional<
+                sizeof...(Ts) < (std::numeric_limits<unsigned short>::max)(),
+                unsigned short,
+                unsigned int>::type
+            >::type;
 
     template <Trait DestructibleTrait, typename... Ts>
     class base {
       public:
       inline explicit constexpr base(valueless_t tag) noexcept
-          : data_(tag), index_(static_cast<index_t>(-1)) {}
+          : data_(tag), index_(static_cast<index_t<Ts...>>(-1)) {}
 
       template <std::size_t I, typename... Args>
       inline explicit constexpr base(in_place_index_t<I>, Args &&... args)
@@ -1711,7 +1686,7 @@ namespace mpark {
             index_(I) {}
 
       inline constexpr bool valueless_by_exception() const noexcept {
-        return index_ == static_cast<index_t>(-1);
+        return index_ == static_cast<index_t<Ts...>>(-1);
       }
 
       inline constexpr std::size_t index() const noexcept {
@@ -1734,7 +1709,7 @@ namespace mpark {
       inline static constexpr std::size_t size() { return sizeof...(Ts); }
 
       data_t data_;
-      index_t index_;
+      index_t<Ts...> index_;
 
       friend struct access::base;
       friend struct visitation::base;
@@ -1788,7 +1763,7 @@ namespace mpark {
         Trait::TriviallyAvailable,
         ~destructor() = default;,
         inline void destroy() noexcept {
-          this->index_ = static_cast<index_t>(-1);
+          this->index_ = static_cast<index_t<Ts...>>(-1);
         });
 
     MPARK_VARIANT_DESTRUCTOR(
@@ -1798,7 +1773,7 @@ namespace mpark {
           if (!this->valueless_by_exception()) {
             visitation::alt::visit_alt(dtor{}, *this);
           }
-          this->index_ = static_cast<index_t>(-1);
+          this->index_ = static_cast<index_t<Ts...>>(-1);
         });
 
     MPARK_VARIANT_DESTRUCTOR(
@@ -2099,6 +2074,12 @@ namespace mpark {
       MPARK_INHERITING_CTOR(impl, super)
       using super::operator=;
 
+      impl(const impl&) = default;
+      impl(impl&&) = default;
+      ~impl() = default;
+      impl &operator=(const impl &) = default;
+      impl &operator=(impl &&) = default;
+
       template <std::size_t I, typename Arg>
       inline void assign(Arg &&arg) {
         this->assign_alt(access::base::get_alt<I>(*this),
@@ -2169,30 +2150,69 @@ namespace mpark {
 
 #undef MPARK_INHERITING_CTOR
 
-    template <std::size_t I, typename T>
-    struct overload_leaf {
-      using F = lib::size_constant<I> (*)(T);
-      operator F() const { return nullptr; }
+    template <typename From, typename To>
+    struct is_non_narrowing_convertible {
+      template <typename T>
+      static std::true_type test(T(&&)[1]);
+
+      template <typename T>
+      static auto impl(int) -> decltype(test<T>({std::declval<From>()}));
+
+      template <typename>
+      static auto impl(...) -> std::false_type;
+
+      static constexpr bool value = decltype(impl<To>(0))::value;
     };
 
-    template <typename... Ts>
+    template <typename Arg,
+              std::size_t I,
+              typename T,
+              bool = std::is_arithmetic<T>::value,
+              typename = void>
+    struct overload_leaf {};
+
+    template <typename Arg, std::size_t I, typename T>
+    struct overload_leaf<Arg, I, T, false> {
+      using impl = lib::size_constant<I> (*)(T);
+      operator impl() const { return nullptr; };
+    };
+
+    template <typename Arg, std::size_t I, typename T>
+    struct overload_leaf<
+        Arg,
+        I,
+        T,
+        true
+#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ >= 5
+        ,
+        lib::enable_if_t<
+            std::is_same<lib::remove_cvref_t<T>, bool>::value
+                ? std::is_same<lib::remove_cvref_t<Arg>, bool>::value
+                : is_non_narrowing_convertible<Arg, T>::value>
+#endif
+        > {
+      using impl = lib::size_constant<I> (*)(T);
+      operator impl() const { return nullptr; };
+    };
+
+    template <typename Arg, typename... Ts>
     struct overload_impl {
       private:
       template <typename>
       struct impl;
 
       template <std::size_t... Is>
-      struct impl<lib::index_sequence<Is...>> : overload_leaf<Is, Ts>... {};
+      struct impl<lib::index_sequence<Is...>> : overload_leaf<Arg, Is, Ts>... {};
 
       public:
       using type = impl<lib::index_sequence_for<Ts...>>;
     };
 
-    template <typename... Ts>
-    using overload = typename overload_impl<Ts...>::type;
+    template <typename Arg, typename... Ts>
+    using overload = typename overload_impl<Arg, Ts...>::type;
 
-    template <typename T, typename... Ts>
-    using best_match = lib::invoke_result_t<overload<Ts...>, T &&>;
+    template <typename Arg, typename... Ts>
+    using best_match = lib::invoke_result_t<overload<Arg, Ts...>, Arg>;
 
     template <typename T>
     struct is_in_place_index : std::false_type {};
@@ -2660,20 +2680,20 @@ namespace mpark {
 #ifdef MPARK_CPP14_CONSTEXPR
   namespace detail {
 
-    inline constexpr bool all(std::initializer_list<bool> bs) {
+    inline constexpr bool any(std::initializer_list<bool> bs) {
       for (bool b : bs) {
-        if (!b) {
-          return false;
+        if (b) {
+          return true;
         }
       }
-      return true;
+      return false;
     }
 
   }  // namespace detail
 
   template <typename Visitor, typename... Vs>
   inline constexpr decltype(auto) visit(Visitor &&visitor, Vs &&... vs) {
-    return (detail::all({!vs.valueless_by_exception()...})
+    return (!detail::any({vs.valueless_by_exception()...})
                 ? (void)0
                 : throw_bad_variant_access()),
            detail::visitation::variant::visit_value(


[arrow] 17/17: ARROW-8741: [Python][Packaging] Keep VS2015 for the windows wheels

Posted by ks...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch maint-0.17.x
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 1342f25754eb5580791fbc6778826d26f6509e0c
Author: Krisztián Szűcs <sz...@gmail.com>
AuthorDate: Wed May 13 02:27:01 2020 +0200

    ARROW-8741: [Python][Packaging] Keep VS2015 for the windows wheels
    
    Closes #7129 from kszucs/win-wheel-bundled-deps
    
    Authored-by: Krisztián Szűcs <sz...@gmail.com>
    Signed-off-by: Krisztián Szűcs <sz...@gmail.com>
---
 dev/tasks/python-wheels/appveyor.yml      |  5 +-
 dev/tasks/python-wheels/win-build-3.5.bat | 90 -------------------------------
 dev/tasks/python-wheels/win-build.bat     | 29 +++++-----
 dev/tasks/tasks.yml                       |  6 ++-
 python/pyarrow/tests/conftest.py          | 19 +++----
 5 files changed, 35 insertions(+), 114 deletions(-)

diff --git a/dev/tasks/python-wheels/appveyor.yml b/dev/tasks/python-wheels/appveyor.yml
index 0782f2c..b2df013 100644
--- a/dev/tasks/python-wheels/appveyor.yml
+++ b/dev/tasks/python-wheels/appveyor.yml
@@ -21,9 +21,9 @@ environment:
   ARCH: "64"
   GENERATOR: Visual Studio 14 2015 Win64
   PYTHON_VERSION: "{{ python_version }}"
+  PYTHON_INTERPRETER: {{ python_interpreter }}
   ARROW_SRC: C:\apache-arrow
   PYARROW_VERSION: {{ arrow.no_rc_version }}
-  PYARROW_REF: {{ arrow.head }}
 
 init:
   - set MINICONDA=C:\Miniconda3-x64
@@ -47,6 +47,9 @@ build_script:
   - rd /s /q C:\OpenSSL-v111-Win32
   - rd /s /q C:\OpenSSL-v111-Win64
 
+  # Disable vcpkg to prevent unrelated noisy link warnings
+  - vcpkg integrate remove
+
   - call %ARROW_SRC%\dev\tasks\python-wheels\{{ script }}
 
 after_build:
diff --git a/dev/tasks/python-wheels/win-build-3.5.bat b/dev/tasks/python-wheels/win-build-3.5.bat
deleted file mode 100644
index 1e2d2ca..0000000
--- a/dev/tasks/python-wheels/win-build-3.5.bat
+++ /dev/null
@@ -1,90 +0,0 @@
-@rem Licensed to the Apache Software Foundation (ASF) under one
-@rem or more contributor license agreements.  See the NOTICE file
-@rem distributed with this work for additional information
-@rem regarding copyright ownership.  The ASF licenses this file
-@rem to you under the Apache License, Version 2.0 (the
-@rem "License"); you may not use this file except in compliance
-@rem with the License.  You may obtain a copy of the License at
-@rem
-@rem   http://www.apache.org/licenses/LICENSE-2.0
-@rem
-@rem Unless required by applicable law or agreed to in writing,
-@rem software distributed under the License is distributed on an
-@rem "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-@rem KIND, either express or implied.  See the License for the
-@rem specific language governing permissions and limitations
-@rem under the License.
-
-@echo on
-
-@rem create conda environment for compiling
-call conda update --yes --quiet conda
-
-call conda create -n wheel-build -q -y -c conda-forge ^
-    "boost-cpp>=1.68.0" ^
-    "python=3.5" ^
-    zlib || exit /B
-
-call conda.bat activate wheel-build
-
-@rem Cannot use conda_env_python.yml here because conda-forge has
-@rem ceased providing up-to-date packages for Python 3.5
-pip install -r %ARROW_SRC%\python\requirements-wheel-build.txt
-
-set ARROW_HOME=%CONDA_PREFIX%\Library
-set PARQUET_HOME=%CONDA_PREFIX%\Library
-echo %ARROW_HOME%
-
-@rem Build Arrow C++ libraries
-mkdir %ARROW_SRC%\cpp\build
-pushd %ARROW_SRC%\cpp\build
-
-cmake -G "%GENERATOR%" ^
-      -DCMAKE_INSTALL_PREFIX=%ARROW_HOME% ^
-      -DARROW_BOOST_USE_SHARED=OFF ^
-      -DCMAKE_BUILD_TYPE=Release ^
-      -DARROW_DEPENDENCY_SOURCE=BUNDLED ^
-      -DZLIB_SOURCE=SYSTEM ^
-      -DBOOST_SOURCE=SYSTEM ^
-      -DZLIB_ROOT=%CONDA_PREFIX%\Library ^
-      -DARROW_CXXFLAGS="/MP" ^
-      -DARROW_WITH_ZLIB=ON ^
-      -DARROW_WITH_ZSTD=ON ^
-      -DARROW_WITH_LZ4=ON ^
-      -DARROW_WITH_SNAPPY=ON ^
-      -DARROW_WITH_BROTLI=ON ^
-      -DARROW_PYTHON=ON ^
-      -DARROW_PARQUET=ON ^
-      .. || exit /B
-cmake --build . --target install --config Release || exit /B
-popd
-
-set PYARROW_BUILD_TYPE=Release
-set PYARROW_PARALLEL=8
-set PYARROW_WITH_PARQUET=1
-set PYARROW_WITH_STATIC_BOOST=1
-set PYARROW_BUNDLE_ARROW_CPP=1
-set SETUPTOOLS_SCM_PRETEND_VERSION=%PYARROW_VERSION%
-
-pushd %ARROW_SRC%\python
-python setup.py build_ext --extra-cmake-args="-DZLIB_ROOT=%CONDA_PREFIX%\Library" bdist_wheel || exit /B
-popd
-
-call conda.bat deactivate
-
-set ARROW_TEST_DATA=%ARROW_SRC%\testing\data
-
-@rem test the wheel
-@rem TODO For maximum reliability, we should test in a plain virtualenv instead.
-call conda create -n wheel-test -c conda-forge -q -y python=3.5 || exit /B
-call conda.bat activate wheel-test
-
-@rem install the built wheel
-pip install %ARROW_SRC%\python\dist\pyarrow-%PYARROW_VERSION%-cp35-cp35m-win_amd64.whl || exit /B
-pip install -r %ARROW_SRC%\python\requirements-wheel-test.txt || exit /B
-
-@rem test the imports
-python -c "import pyarrow; import pyarrow.parquet" || exit /B
-
-@rem run the python tests
-pytest -rs --pyargs pyarrow || exit /B
diff --git a/dev/tasks/python-wheels/win-build.bat b/dev/tasks/python-wheels/win-build.bat
index f26973e..b7bd25f 100644
--- a/dev/tasks/python-wheels/win-build.bat
+++ b/dev/tasks/python-wheels/win-build.bat
@@ -18,11 +18,10 @@
 @echo on
 
 @rem create conda environment for compiling
-call conda update --yes --quiet conda
-
 call conda create -n wheel-build -q -y -c conda-forge ^
     --file=%ARROW_SRC%\ci\conda_env_cpp.yml ^
     --file=%ARROW_SRC%\ci\conda_env_gandiva.yml ^
+    "vs2015_runtime<14.16" ^
     python=%PYTHON_VERSION% || exit /B
 
 call conda.bat activate wheel-build
@@ -63,6 +62,7 @@ cmake -G "%GENERATOR%" ^
       -DARROW_PYTHON=ON ^
       -DARROW_PARQUET=ON ^
       -DARROW_GANDIVA=ON ^
+      -DARROW_MIMAllOC=ON ^
       -DZSTD_SOURCE=BUNDLED ^
       .. || exit /B
 cmake --build . --target install --config Release || exit /B
@@ -86,17 +86,20 @@ call conda.bat deactivate
 
 set ARROW_TEST_DATA=%ARROW_SRC%\testing\data
 
-@rem test the wheel
-@rem TODO For maximum reliability, we should test in a plain virtualenv instead.
-call conda create -n wheel-test -c conda-forge -q -y python=%PYTHON_VERSION% || exit /B
-call conda.bat activate wheel-test
+@rem install the test dependencies
+%PYTHON_INTERPRETER% -m pip install -r %ARROW_SRC%\python\requirements-wheel-test.txt || exit /B
 
-@rem install the built wheel
-pip install -r %ARROW_SRC%\python\requirements-wheel-test.txt || exit /B
-pip install --no-index --find-links=%ARROW_SRC%\python\dist\ pyarrow || exit /B
+@rem install the produced wheel in a non-conda environment
+%PYTHON_INTERPRETER% -m pip install --no-index --find-links=%ARROW_SRC%\python\dist\ pyarrow || exit /B
 
 @rem test the imports
-python -c "import pyarrow; import pyarrow.parquet; import pyarrow.flight; import pyarrow.dataset; import pyarrow.gandiva;" || exit /B
-
-@rem run the python tests
-pytest -rs --pyargs pyarrow || exit /B
+%PYTHON_INTERPRETER% -c "import pyarrow" || exit /B
+%PYTHON_INTERPRETER% -c "import pyarrow.parquet" || exit /B
+%PYTHON_INTERPRETER% -c "import pyarrow.flight" || exit /B
+%PYTHON_INTERPRETER% -c "import pyarrow.gandiva" || exit /B
+%PYTHON_INTERPRETER% -c "import pyarrow.dataset" || exit /B
+
+@rem run the python tests, but disable the cython because there is a linking
+@rem issue on python 3.8
+set PYARROW_TEST_CYTHON=OFF
+%PYTHON_INTERPRETER% -m pytest -rs --pyargs pyarrow || exit /B
diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml
index a03a44d..8ff78ee 100644
--- a/dev/tasks/tasks.yml
+++ b/dev/tasks/tasks.yml
@@ -433,8 +433,9 @@ tasks:
     platform: win
     template: python-wheels/appveyor.yml
     params:
-      script: win-build-3.5.bat
+      script: win-build.bat
       python_version: 3.5
+      python_interpreter: C:\Python35-x64\python.exe
     artifacts:
       - pyarrow-{no_rc_version}-cp35-cp35m-win_amd64.whl
 
@@ -445,6 +446,7 @@ tasks:
     params:
       script: win-build.bat
       python_version: 3.6
+      python_interpreter: C:\Python36-x64\python.exe
     artifacts:
       - pyarrow-{no_rc_version}-cp36-cp36m-win_amd64.whl
 
@@ -455,6 +457,7 @@ tasks:
     params:
       script: win-build.bat
       python_version: 3.7
+      python_interpreter: C:\Python37-x64\python.exe
     artifacts:
       - pyarrow-{no_rc_version}-cp37-cp37m-win_amd64.whl
 
@@ -465,6 +468,7 @@ tasks:
     params:
       script: win-build.bat
       python_version: 3.8
+      python_interpreter: C:\Python38-x64\python.exe
     artifacts:
       - pyarrow-{no_rc_version}-cp38-cp38-win_amd64.whl
 
diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py
index 08271a0..ce72f2e 100644
--- a/python/pyarrow/tests/conftest.py
+++ b/python/pyarrow/tests/conftest.py
@@ -168,12 +168,13 @@ def pytest_addoption(parser):
                              .format(name.upper(), value))
 
     for group in groups:
-        for flag, envvar in [('--{}', 'PYARROW_TEST_{}'),
-                             ('--enable-{}', 'PYARROW_TEST_ENABLE_{}')]:
-            default = bool_env(envvar.format(group), defaults[group])
-            parser.addoption(flag.format(group),
-                             action='store_true', default=default,
-                             help=('Enable the {} test group'.format(group)))
+        default = bool_env('PYARROW_TEST_{}'.format(group), defaults[group])
+        parser.addoption('--enable-{}'.format(group),
+                         action='store_true', default=default,
+                         help=('Enable the {} test group'.format(group)))
+        parser.addoption('--disable-{}'.format(group),
+                         action='store_true', default=False,
+                         help=('Disable the {} test group'.format(group)))
 
 
 class PyArrowConfig:
@@ -199,11 +200,11 @@ def pytest_configure(config):
             "markers", mark,
         )
 
-        flag = '--{}'.format(mark)
         enable_flag = '--enable-{}'.format(mark)
+        disable_flag = '--disable-{}'.format(mark)
 
-        is_enabled = (config.getoption(flag) or
-                      config.getoption(enable_flag))
+        is_enabled = (config.getoption(enable_flag) and not
+                      config.getoption(disable_flag))
         config.pyarrow.is_enabled[mark] = is_enabled
 
 


[arrow] 03/17: ARROW-8549: [R] Assorted post-0.17 release cleanups

Posted by ks...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch maint-0.17.x
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 558dae47a6408fe41d241eafbdd60b8e4bcc495c
Author: Neal Richardson <ne...@gmail.com>
AuthorDate: Wed Apr 22 12:42:59 2020 -0700

    ARROW-8549: [R] Assorted post-0.17 release cleanups
    
    Functional changes in the R package installation:
    
    * Downloading dependencies is made completely silent by default
    * When downloading source, it now tries three Apache mirrors
    * Limit parallelization in the C++ build to 2 CPUs (unless MAKEFLAGS is set), per CRAN repository policy
    
    CI/test changes:
    
    * Adds a `try()` around a non-essential dependency download to fix a source of occasional build flakiness
    * Makes the R install logs always print out, not just on failure. It was useful to see exactly what the successful builds were doing and to confirm they were doing the right things.
    * Makes the `--as-cran` checks build and check the docs/vignettes. This adds a lot of time to the build, but it is only done nightly, and something needs to check this, at least before release.
    
    Closes #6995 from nealrichardson/0.17.0-rpkg
    
    Authored-by: Neal Richardson <ne...@gmail.com>
    Signed-off-by: Neal Richardson <ne...@gmail.com>
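    
    The mirror-fallback source download described above is implemented in R
    (r/tools/linuxlibs.R, shown in the diff below). As a rough sketch of the
    same pattern in Python (helper names are illustrative only, not part of any
    Arrow API): try the Apache mirror redirector a few times, then fall back to
    the archive host, stopping at the first URL that works.
    
        import urllib.request
    
        def try_download(url, to_file):
            # Return True if the file could be fetched, False otherwise
            try:
                urllib.request.urlretrieve(url, to_file)
                return True
            except OSError:
                return False
    
        def apache_download(version, destfile, n_mirrors=3):
            path = "arrow/arrow-%s/apache-arrow-%s.tar.gz" % (version, version)
            urls = (
                # The closer.lua redirector hands out a different mirror each time
                ["https://www.apache.org/dyn/closer.lua?action=download&filename="] * n_mirrors
                + ["https://downloads.apache.org/"]  # the backup host
            )
            # Short-circuits at the first successful download
            return any(try_download(base + path, destfile) for base in urls)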
---
 ci/scripts/r_deps.sh              |  3 +-
 ci/scripts/r_test.sh              | 11 ++---
 dev/tasks/r/azure.linux.yml       |  3 +-
 dev/tasks/r/github.linux.cran.yml |  3 +-
 docker-compose.yml                |  3 ++
 r/cran-comments.md                |  5 ++-
 r/tools/linuxlibs.R               | 93 +++++++++++++++++++++++++--------------
 r/tools/winlibs.R                 |  9 +++-
 8 files changed, 83 insertions(+), 47 deletions(-)

diff --git a/ci/scripts/r_deps.sh b/ci/scripts/r_deps.sh
index edb8cdf..7cf6dc5 100755
--- a/ci/scripts/r_deps.sh
+++ b/ci/scripts/r_deps.sh
@@ -27,7 +27,8 @@ pushd ${source_dir}
 # Install R package dependencies
 ${R_BIN} -e "install.packages('remotes'); remotes::install_cran(c('glue', 'rcmdcheck'))"
 ${R_BIN} -e "remotes::install_deps(dependencies = TRUE)"
-${R_BIN} -e "remotes::install_github('nealrichardson/decor')"
+# This isn't required for testing, only for if you're using this to build your dev environment
+${R_BIN} -e "try(remotes::install_github('nealrichardson/decor'))"
 
 popd
 
diff --git a/ci/scripts/r_test.sh b/ci/scripts/r_test.sh
index e79fac5..edfa32b 100755
--- a/ci/scripts/r_test.sh
+++ b/ci/scripts/r_test.sh
@@ -41,15 +41,16 @@ export TEST_R_WITH_ARROW=TRUE
 export _R_CHECK_TESTS_NLINES_=0
 export _R_CHECK_CRAN_INCOMING_REMOTE_=FALSE
 export _R_CHECK_LIMIT_CORES_=FALSE
-export VERSION=$(grep ^Version DESCRIPTION | sed s/Version:\ //)
 
 # Make sure we aren't writing to the home dir (CRAN _hates_ this but there is no official check)
 BEFORE=$(ls -alh ~/)
 
-# Conditionally run --as-cran because crossbow jobs aren't using _R_CHECK_COMPILATION_FLAGS_KNOWN_
-# (maybe an R version thing, needs 3.6.2?)
-# Also only --run-donttest if NOT_CRAN because Parquet example requires snappy (optional dependency)
-${R_BIN} -e "cran <- !identical(tolower(Sys.getenv('NOT_CRAN')), 'true'); rcmdcheck::rcmdcheck(build_args = '--no-build-vignettes', args = c('--no-manual', '--ignore-vignettes', ifelse(cran, '--as-cran', '--run-donttest')), error_on = 'warning', check_dir = 'check')"
+${R_BIN} -e "as_cran <- !identical(tolower(Sys.getenv('NOT_CRAN')), 'true')
+  if (as_cran) {
+    rcmdcheck::rcmdcheck(args = c('--as-cran', '--run-donttest'), error_on = 'warning', check_dir = 'check')
+  } else {
+    rcmdcheck::rcmdcheck(build_args = '--no-build-vignettes', args = c('--no-manual', '--ignore-vignettes', '--run-donttest'), error_on = 'warning', check_dir = 'check')
+  }"
 
 AFTER=$(ls -alh ~/)
 if [ "$BEFORE" != "$AFTER" ]; then
diff --git a/dev/tasks/r/azure.linux.yml b/dev/tasks/r/azure.linux.yml
index 59d5b02..4b4211c 100644
--- a/dev/tasks/r/azure.linux.yml
+++ b/dev/tasks/r/azure.linux.yml
@@ -55,5 +55,4 @@ jobs:
     - script: |
         set -ex
         cat arrow/r/check/arrow.Rcheck/00install.out
-      displayName: Dump install logs on failure
-      condition: failed()
+      displayName: Dump install logs
diff --git a/dev/tasks/r/github.linux.cran.yml b/dev/tasks/r/github.linux.cran.yml
index 0a5362b..2b2d884 100644
--- a/dev/tasks/r/github.linux.cran.yml
+++ b/dev/tasks/r/github.linux.cran.yml
@@ -62,6 +62,5 @@ jobs:
       - name: Docker Run
         shell: bash
         run: cd arrow && docker-compose run r
-      - name: Dump install logs on failure
-        if: failure()
+      - name: Dump install logs
         run: cat arrow/r/check/arrow.Rcheck/00install.out
diff --git a/docker-compose.yml b/docker-compose.yml
index 9c0bfdd..ec8d483 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -784,6 +784,7 @@ services:
     shm_size: *shm-size
     environment:
       <<: *ccache
+      NOT_CRAN: 'true'
     volumes: *conda-volumes
     command:
       ["/arrow/ci/scripts/cpp_build.sh /arrow /build &&
@@ -808,6 +809,7 @@ services:
       <<: *ccache
       ARROW_R_CXXFLAGS: '-Werror'
       LIBARROW_BUILD: 'false'
+      NOT_CRAN: 'true'
     volumes: *ubuntu-volumes
     command: >
       /bin/bash -c "
@@ -834,6 +836,7 @@ services:
     environment:
       LIBARROW_DOWNLOAD: "false"
       ARROW_HOME: "/arrow"
+      # To test for CRAN release, delete ^^ these two env vars so we download the Apache release
       ARROW_USE_PKG_CONFIG: "false"
     volumes:
       - .:/arrow:delegated
diff --git a/r/cran-comments.md b/r/cran-comments.md
index 59f80b6..07a184b 100644
--- a/r/cran-comments.md
+++ b/r/cran-comments.md
@@ -1,8 +1,9 @@
 ## Test environments
-* Debian Linux, R-devel, GCC ASAN/UBSAN
+* Debian Linux, GCC, R-devel/R-patched/R-release
+* Fedora Linux, GCC/clang, R-devel
 * Ubuntu Linux 16.04 LTS, R-release, GCC
 * win-builder (R-devel and R-release)
-* macOS (10.11, 10.14), R-release
+* macOS 10.14, R-release
 
 ## R CMD check results
 
diff --git a/r/tools/linuxlibs.R b/r/tools/linuxlibs.R
index b87465b..96fd8e1 100644
--- a/r/tools/linuxlibs.R
+++ b/r/tools/linuxlibs.R
@@ -20,10 +20,6 @@ VERSION <- args[1]
 dst_dir <- paste0("libarrow/arrow-", VERSION)
 
 arrow_repo <- "https://dl.bintray.com/ursalabs/arrow-r/libarrow/"
-apache_src_url <- paste0(
-  "https://archive.apache.org/dist/arrow/arrow-", VERSION,
-  "/apache-arrow-", VERSION, ".tar.gz"
-)
 
 options(.arrow.cleanup = character()) # To collect dirs to rm on exit
 on.exit(unlink(getOption(".arrow.cleanup")))
@@ -40,17 +36,23 @@ binary_ok <- !identical(tolower(Sys.getenv("LIBARROW_BINARY", "false")), "false"
 # For local debugging, set ARROW_R_DEV=TRUE to make this script print more
 quietly <- !env_is("ARROW_R_DEV", "true")
 
+try_download <- function(from_url, to_file) {
+  try(
+    suppressWarnings(
+      download.file(from_url, to_file, quiet = quietly)
+    ),
+    silent = quietly
+  )
+  file.exists(to_file)
+}
+
 download_binary <- function(os = identify_os()) {
   libfile <- tempfile()
   if (!is.null(os)) {
     # See if we can map this os-version to one we have binaries for
     os <- find_available_binary(os)
     binary_url <- paste0(arrow_repo, "bin/", os, "/arrow-", VERSION, ".zip")
-    try(
-      download.file(binary_url, libfile, quiet = quietly),
-      silent = quietly
-    )
-    if (file.exists(libfile)) {
+    if (try_download(binary_url, libfile)) {
       cat(sprintf("*** Successfully retrieved C++ binaries for %s\n", os))
     } else {
       cat(sprintf("*** No C++ binaries found for %s\n", os))
@@ -133,31 +135,54 @@ find_available_binary <- function(os) {
 
 download_source <- function() {
   tf1 <- tempfile()
-  src_dir <- NULL
-  source_url <- paste0(arrow_repo, "src/arrow-", VERSION, ".zip")
-  try(
-    download.file(source_url, tf1, quiet = quietly),
-    silent = quietly
-  )
-  if (!file.exists(tf1)) {
-    # Try for an official release
-    try(
-      download.file(apache_src_url, tf1, quiet = quietly),
-      silent = quietly
-    )
-  }
-  if (file.exists(tf1)) {
+  src_dir <- tempfile()
+  if (bintray_download(tf1)) {
+    # First try from bintray
     cat("*** Successfully retrieved C++ source\n")
-    src_dir <- tempfile()
     unzip(tf1, exdir = src_dir)
     unlink(tf1)
-    # These scripts need to be executable
-    system(sprintf("chmod 755 %s/cpp/build-support/*.sh", src_dir))
-    options(.arrow.cleanup = c(getOption(".arrow.cleanup"), src_dir))
-    # The actual src is in cpp
     src_dir <- paste0(src_dir, "/cpp")
+  } else if (apache_download(tf1)) {
+    # If that fails, try for an official release
+    cat("*** Successfully retrieved C++ source\n")
+    untar(tf1, exdir = src_dir)
+    unlink(tf1)
+    src_dir <- paste0(src_dir, "/apache-arrow-", VERSION, "/cpp")
   }
-  src_dir
+
+  if (dir.exists(src_dir)) {
+    options(.arrow.cleanup = c(getOption(".arrow.cleanup"), src_dir))
+    # These scripts need to be executable
+    system(
+      sprintf("chmod 755 %s/build-support/*.sh", src_dir),
+      ignore.stdout = quietly, ignore.stderr = quietly
+    )
+    return(src_dir)
+  } else {
+    return(NULL)
+  }
+}
+
+bintray_download <- function(destfile) {
+  source_url <- paste0(arrow_repo, "src/arrow-", VERSION, ".zip")
+  try_download(source_url, destfile)
+}
+
+apache_download <- function(destfile, n_mirrors = 3) {
+  apache_path <- paste0("arrow/arrow-", VERSION, "/apache-arrow-", VERSION, ".tar.gz")
+  apache_urls <- c(
+    # This returns a different mirror each time
+    rep("https://www.apache.org/dyn/closer.lua?action=download&filename=", n_mirrors),
+    "https://downloads.apache.org/" # The backup
+  )
+  downloaded <- FALSE
+  for (u in apache_urls) {
+    downloaded <- try_download(paste0(u, apache_path), destfile)
+    if (downloaded) {
+      break
+    }
+  }
+  downloaded
 }
 
 find_local_source <- function(arrow_home = Sys.getenv("ARROW_HOME", "..")) {
@@ -176,7 +201,10 @@ build_libarrow <- function(src_dir, dst_dir) {
   # Set up make for parallel building
   makeflags <- Sys.getenv("MAKEFLAGS")
   if (makeflags == "") {
-    makeflags <- sprintf("-j%s", parallel::detectCores())
+    # CRAN policy says not to use more than 2 cores during checks
+    # If you have more and want to use more, set MAKEFLAGS
+    ncores <- min(parallel::detectCores(), 2)
+    makeflags <- sprintf("-j%s", ncores)
     Sys.setenv(MAKEFLAGS = makeflags)
   }
   if (!quietly) {
@@ -212,10 +240,7 @@ ensure_cmake <- function() {
     )
     cmake_tar <- tempfile()
     cmake_dir <- tempfile()
-    try(
-      download.file(cmake_binary_url, cmake_tar, quiet = quietly),
-      silent = quietly
-    )
+    try_download(cmake_binary_url, cmake_tar)
     untar(cmake_tar, exdir = cmake_dir)
     unlink(cmake_tar)
     options(.arrow.cleanup = c(getOption(".arrow.cleanup"), cmake_dir))
diff --git a/r/tools/winlibs.R b/r/tools/winlibs.R
index 390e1f9..ea31d4f 100644
--- a/r/tools/winlibs.R
+++ b/r/tools/winlibs.R
@@ -29,11 +29,18 @@ if(!file.exists(sprintf("windows/arrow-%s/include/arrow/api.h", VERSION))){
   } else {
     # Download static arrow from rwinlib
     if (getRversion() < "3.3.0") setInternet2()
+    quietly <- !identical(tolower(Sys.getenv("ARROW_R_DEV")), "true")
     get_file <- function(template, version) {
-      try(download.file(sprintf(template, version), "lib.zip", quiet = TRUE), silent = TRUE)
+      try(
+        suppressWarnings(
+          download.file(sprintf(template, version), "lib.zip", quiet = quietly)
+        ),
+        silent = quietly
+      )
     }
     # URL templates
     # TODO: don't hard-code RTools 3.5? Can we detect which toolchain we have?
+    # ifelse(nzchar(Sys.getenv("RTOOLS40_HOME")), "40", "35")
     nightly <- "https://dl.bintray.com/ursalabs/arrow-r/libarrow/bin/windows-35/arrow-%s.zip"
     rwinlib <- "https://github.com/rwinlib/arrow/archive/v%s.zip"
     # First look for a nightly


[arrow] 13/17: ARROW-8641: [C++][Python] Sort included indices in IpcReader - Respect column selection in FeatherReader

Posted by ks...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch maint-0.17.x
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 057cbe4d304f8706c8d08d955027e38abc0d7783
Author: Joris Van den Bossche <jo...@gmail.com>
AuthorDate: Thu May 7 11:23:25 2020 -0500

    ARROW-8641: [C++][Python] Sort included indices in IpcReader - Respect column selection in FeatherReader
    
    Closes #7122 from jorisvandenbossche/ARROW-8641-feather-order
    
    Authored-by: Joris Van den Bossche <jo...@gmail.com>
    Signed-off-by: Wes McKinney <we...@apache.org>
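    
    As a usage illustration (a minimal sketch; the file name is made up), with
    this change pyarrow.feather.read_table honors the requested column order,
    including duplicated entries, when reading a Feather V2 file:
    
        import pyarrow as pa
        from pyarrow import feather
    
        table = pa.table([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                         names=['a', 'b', 'c'])
        feather.write_feather(table, 'data.feather')  # Feather V2 by default
    
        # An unordered selection comes back in the requested order: ['b', 'a']
        print(feather.read_table('data.feather', columns=['b', 'a']).column_names)
    
        # A duplicated selection is preserved as well: ['a', 'b', 'a']
        print(feather.read_table('data.feather',
                                 columns=['a', 'b', 'a']).column_names)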
---
 cpp/src/arrow/ipc/read_write_test.cc | 18 +++++++++
 cpp/src/arrow/ipc/reader.cc          |  6 ++-
 python/pyarrow/feather.pxi           |  4 ++
 python/pyarrow/feather.py            | 29 ++++++++++----
 python/pyarrow/tests/test_dataset.py |  6 +++
 python/pyarrow/tests/test_feather.py | 74 ++++++++++++++++++++++++++++++++++--
 6 files changed, 125 insertions(+), 12 deletions(-)

diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc
index 1b5073e..bc4e080 100644
--- a/cpp/src/arrow/ipc/read_write_test.cc
+++ b/cpp/src/arrow/ipc/read_write_test.cc
@@ -1043,6 +1043,24 @@ class ReaderWriterMixin {
       AssertBatchesEqual(*ex_batch, *out_batches[0], /*check_metadata=*/true);
     }
 
+    // Duplicated or unordered indices are normalized when reading
+    options.included_fields = {3, 1, 1};
+
+    {
+      WriterHelper writer_helper;
+      BatchVector out_batches;
+      std::shared_ptr<Schema> out_schema;
+      ASSERT_OK(RoundTripHelper(writer_helper, {batch}, IpcWriteOptions::Defaults(),
+                                options, &out_batches, &out_schema));
+
+      auto ex_schema = schema({field("a1", utf8()), field("a3", utf8())},
+                              key_value_metadata({"key1"}, {"value1"}));
+      AssertSchemaEqual(*ex_schema, *out_schema);
+
+      auto ex_batch = RecordBatch::Make(ex_schema, a0->length(), {a1, a3});
+      AssertBatchesEqual(*ex_batch, *out_batches[0], /*check_metadata=*/true);
+    }
+
     // Out of bounds cases
     options.included_fields = {1, 3, 5};
     {
diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc
index 95b1c5a..95c5cb5 100644
--- a/cpp/src/arrow/ipc/reader.cc
+++ b/cpp/src/arrow/ipc/reader.cc
@@ -17,6 +17,7 @@
 
 #include "arrow/ipc/reader.h"
 
+#include <algorithm>
 #include <climits>
 #include <cstdint>
 #include <cstring>
@@ -528,8 +529,11 @@ Status GetInclusionMaskAndOutSchema(const std::shared_ptr<Schema>& full_schema,
 
   inclusion_mask->resize(full_schema->num_fields(), false);
 
+  auto included_indices_sorted = included_indices;
+  std::sort(included_indices_sorted.begin(), included_indices_sorted.end());
+
   FieldVector included_fields;
-  for (int i : included_indices) {
+  for (int i : included_indices_sorted) {
     // Ignore out of bounds indices
     if (i < 0 || i >= full_schema->num_fields()) {
       return Status::Invalid("Out of bounds field index: ", i);
diff --git a/python/pyarrow/feather.pxi b/python/pyarrow/feather.pxi
index 96cb741..4da5435 100644
--- a/python/pyarrow/feather.pxi
+++ b/python/pyarrow/feather.pxi
@@ -66,6 +66,10 @@ cdef class FeatherReader:
         with nogil:
             self.reader = GetResultValue(CFeatherReader.Open(reader))
 
+    @property
+    def version(self):
+        return self.reader.get().version()
+
     def read(self):
         cdef shared_ptr[CTable] sp_table
         with nogil:
diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py
index 6d909ef..a599e15 100644
--- a/python/pyarrow/feather.py
+++ b/python/pyarrow/feather.py
@@ -20,7 +20,7 @@ import os
 
 from pyarrow.pandas_compat import _pandas_api  # noqa
 from pyarrow.lib import FeatherError  # noqa
-from pyarrow.lib import Table, concat_tables
+from pyarrow.lib import Table, concat_tables, schema
 import pyarrow.lib as ext
 
 
@@ -234,11 +234,24 @@ def read_table(source, columns=None, memory_map=True):
 
     column_types = [type(column) for column in columns]
     if all(map(lambda t: t == int, column_types)):
-        return reader.read_indices(columns)
+        table = reader.read_indices(columns)
     elif all(map(lambda t: t == str, column_types)):
-        return reader.read_names(columns)
-
-    column_type_names = [t.__name__ for t in column_types]
-    raise TypeError("Columns must be indices or names. "
-                    "Got columns {} of types {}"
-                    .format(columns, column_type_names))
+        table = reader.read_names(columns)
+    else:
+        column_type_names = [t.__name__ for t in column_types]
+        raise TypeError("Columns must be indices or names. "
+                        "Got columns {} of types {}"
+                        .format(columns, column_type_names))
+
+    # Feather v1 already respects the column selection
+    if reader.version < 3:
+        return table
+    # Feather v2 reads with sorted / deduplicated selection
+    elif sorted(set(columns)) == columns:
+        return table
+    else:
+        # follow exact order / selection of names
+        new_fields = [table.schema.field(c) for c in columns]
+        new_schema = schema(new_fields, metadata=table.schema.metadata)
+        new_columns = [table.column(c) for c in columns]
+        return Table.from_arrays(new_columns, schema=new_schema)
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index d67a6b5..efaa4b0 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -1485,6 +1485,12 @@ def test_feather_format(tempdir):
     result = dataset.to_table()
     assert result.equals(table)
 
+    # ARROW-8641 - column selection order
+    result = dataset.to_table(columns=["b", "a"])
+    assert result.column_names == ["b", "a"]
+    result = dataset.to_table(columns=["a", "a"])
+    assert result.column_names == ["a", "a"]
+
     # error with Feather v1 files
     write_feather(table, str(basedir / "data1.feather"), version=1)
     with pytest.raises(ValueError):
diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py
index 779649b..fbfcade 100644
--- a/python/pyarrow/tests/test_feather.py
+++ b/python/pyarrow/tests/test_feather.py
@@ -87,7 +87,20 @@ def _check_pandas_roundtrip(df, expected=None, path=None,
     assert_frame_equal(result, expected)
 
 
-def _assert_error_on_write(df, exc, path=None):
+def _check_arrow_roundtrip(table, path=None):
+    if path is None:
+        path = random_path()
+
+    TEST_FILES.append(path)
+    write_feather(table, path)
+    if not os.path.exists(path):
+        raise Exception('file not written')
+
+    result = read_table(path)
+    assert result.equals(table)
+
+
+def _assert_error_on_write(df, exc, path=None, version=2):
     # check that we are raising the exception
     # on writing
 
@@ -97,7 +110,7 @@ def _assert_error_on_write(df, exc, path=None):
     TEST_FILES.append(path)
 
     def f():
-        write_feather(df, path)
+        write_feather(df, path, version=version)
 
     pytest.raises(exc, f)
 
@@ -535,7 +548,7 @@ def test_sparse_dataframe(version):
 
 
 @pytest.mark.pandas
-def test_duplicate_columns():
+def test_duplicate_columns_pandas():
 
     # https://github.com/wesm/feather/issues/53
     # not currently able to handle duplicate columns
@@ -544,6 +557,13 @@ def test_duplicate_columns():
     _assert_error_on_write(df, ValueError)
 
 
+def test_duplicate_columns():
+    # only works for version 2
+    table = pa.table([[1, 2, 3], [4, 5, 6], [7, 8, 9]], names=['a', 'a', 'b'])
+    _check_arrow_roundtrip(table)
+    _assert_error_on_write(table, ValueError, version=1)
+
+
 @pytest.mark.pandas
 def test_unsupported():
     # https://github.com/wesm/feather/issues/240
@@ -665,3 +685,51 @@ def test_feather_without_pandas(tempdir, version):
     write_feather(table, str(tempdir / "data.feather"), version=version)
     result = read_table(str(tempdir / "data.feather"))
     assert result.equals(table)
+
+
+@pytest.mark.pandas
+def test_read_column_selection(version):
+    # ARROW-8641
+    df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=['a', 'b', 'c'])
+
+    # select columns as string names or integer indices
+    _check_pandas_roundtrip(
+        df, columns=['a', 'c'], expected=df[['a', 'c']], version=version)
+    _check_pandas_roundtrip(
+        df, columns=[0, 2], expected=df[['a', 'c']], version=version)
+
+    # different order is followed
+    _check_pandas_roundtrip(
+        df, columns=['b', 'a'], expected=df[['b', 'a']], version=version)
+    _check_pandas_roundtrip(
+        df, columns=[1, 0], expected=df[['b', 'a']], version=version)
+
+
+def test_read_column_duplicated_selection(tempdir, version):
+    # duplicated columns in the column selection
+    table = pa.table([[1, 2, 3], [4, 5, 6], [7, 8, 9]], names=['a', 'b', 'c'])
+    path = str(tempdir / "data.feather")
+    write_feather(table, path, version=version)
+
+    for col_selection in [['a', 'b', 'a'], [0, 1, 0]]:
+        result = read_table(path, columns=col_selection)
+        assert result.column_names == ['a', 'b', 'a']
+
+
+def test_read_column_duplicated_in_file(tempdir):
+    # duplicated columns in feather file (only works for feather v2)
+    table = pa.table([[1, 2, 3], [4, 5, 6], [7, 8, 9]], names=['a', 'b', 'a'])
+    path = str(tempdir / "data.feather")
+    write_feather(table, path, version=2)
+
+    # no selection works fine
+    result = read_table(path)
+    assert result.equals(table)
+
+    # selection with indices works
+    result = read_table(path, columns=[0, 2])
+    assert result.column_names == ['a', 'a']
+
+    # selection with column names errors
+    with pytest.raises(ValueError):
+        read_table(path, columns=['a', 'b'])


[arrow] 07/17: ARROW-8699: [R] Fix automatic r_to_py conversion

Posted by ks...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch maint-0.17.x
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit ddfd4de97bf9317cd3d0c47a48f10b278ed8bf1d
Author: Neal Richardson <ne...@gmail.com>
AuthorDate: Mon May 4 16:04:09 2020 -0700

    ARROW-8699: [R] Fix automatic r_to_py conversion
    
    This appears to be the fix for https://github.com/rstudio/reticulate/issues/748
    
    cc @kevinushey
    
    Closes #7102 from nealrichardson/r-to-py
    
    Authored-by: Neal Richardson <ne...@gmail.com>
    Signed-off-by: Neal Richardson <ne...@gmail.com>
---
 r/R/python.R           | 16 ++++++++++++----
 r/vignettes/python.Rmd |  2 +-
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/r/R/python.R b/r/R/python.R
index 4d84a40..3964076 100644
--- a/r/R/python.R
+++ b/r/R/python.R
@@ -35,9 +35,13 @@ r_to_py.Array <- function(x, convert = FALSE) {
     delete_arrow_array(array_ptr)
   })
 
-  pa <- reticulate::import("pyarrow", convert = convert)
+  # Import with convert = FALSE so that `_import_from_c` returns a Python object
+  pa <- reticulate::import("pyarrow", convert = FALSE)
   ExportArray(x, array_ptr, schema_ptr)
-  pa$Array$`_import_from_c`(array_ptr, schema_ptr)
+  out <- pa$Array$`_import_from_c`(array_ptr, schema_ptr)
+  # But set the convert attribute on the return object to the requested value
+  assign("convert", convert, out)
+  out
 }
 
 py_to_r.pyarrow.lib.RecordBatch <- function(x, ...) {
@@ -60,9 +64,13 @@ r_to_py.RecordBatch <- function(x, convert = FALSE) {
     delete_arrow_array(array_ptr)
   })
 
-  pa <- reticulate::import("pyarrow", convert = convert)
+  # Import with convert = FALSE so that `_import_from_c` returns a Python object
+  pa <- reticulate::import("pyarrow", convert = FALSE)
   ExportRecordBatch(x, array_ptr, schema_ptr)
-  pa$RecordBatch$`_import_from_c`(array_ptr, schema_ptr)
+  out <- pa$RecordBatch$`_import_from_c`(array_ptr, schema_ptr)
+  # But set the convert attribute on the return object to the requested value
+  assign("convert", convert, out)
+  out
 }
 
 #' Install pyarrow for use with reticulate
diff --git a/r/vignettes/python.Rmd b/r/vignettes/python.Rmd
index 5ee5654..c05ee7d 100644
--- a/r/vignettes/python.Rmd
+++ b/r/vignettes/python.Rmd
@@ -84,7 +84,7 @@ to use it efficiently.
 
 ```r
 b <- Array$create(c(5, 6, 7, 8, 9))
-a_and_b <- pa$concat_arrays(r_to_py(list(a, b)))
+a_and_b <- pa$concat_arrays(list(a, b))
 a_and_b
 
 ## Array