You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/12/09 13:27:49 UTC

[GitHub] [arrow] pitrou commented on a change in pull request #9702: ARROW-11297: [C++][Python] Add ORC writer options

pitrou commented on a change in pull request #9702:
URL: https://github.com/apache/arrow/pull/9702#discussion_r765780489



##########
File path: cpp/src/arrow/adapters/orc/adapter_common.h
##########
@@ -0,0 +1,111 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+namespace adapters {
+
+namespace orc {
+
+enum WriterId {
+  ORC_JAVA_WRITER = 0,
+  ORC_CPP_WRITER = 1,
+  PRESTO_WRITER = 2,
+  SCRITCHLEY_GO = 3,
+  TRINO_WRITER = 4,
+  UNKNOWN_WRITER = INT32_MAX
+};
+
+enum WriterVersion {
+  WriterVersion_ORIGINAL = 0,
+  WriterVersion_HIVE_8732 = 1,
+  WriterVersion_HIVE_4243 = 2,
+  WriterVersion_HIVE_12055 = 3,
+  WriterVersion_HIVE_13083 = 4,
+  WriterVersion_ORC_101 = 5,
+  WriterVersion_ORC_135 = 6,
+  WriterVersion_ORC_517 = 7,
+  WriterVersion_ORC_203 = 8,
+  WriterVersion_ORC_14 = 9,
+  WriterVersion_MAX = INT32_MAX
+};
+
+enum CompressionKind {
+  CompressionKind_NONE = 0,
+  CompressionKind_ZLIB = 1,
+  CompressionKind_SNAPPY = 2,
+  CompressionKind_LZO = 3,
+  CompressionKind_LZ4 = 4,
+  CompressionKind_ZSTD = 5,
+  CompressionKind_MAX = INT32_MAX
+};
+
+enum CompressionStrategy {
+  CompressionStrategy_SPEED = 0,
+  CompressionStrategy_COMPRESSION
+};
+
+enum RleVersion { RleVersion_1 = 0, RleVersion_2 = 1 };
+
+enum BloomFilterVersion {
+  // Include both the BLOOM_FILTER and BLOOM_FILTER_UTF8 streams to support
+  // both old and new readers.
+  ORIGINAL = 0,
+  // Only include the BLOOM_FILTER_UTF8 streams that consistently use UTF8.
+  // See ORC-101
+  UTF8 = 1,
+  FUTURE = INT32_MAX

Review comment:
       What's this? Would you like to add a comment?

##########
File path: cpp/src/arrow/adapters/orc/adapter_common.h
##########
@@ -0,0 +1,111 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+namespace adapters {
+
+namespace orc {
+
+enum WriterId {

Review comment:
       Can you make this a `enum class` and avoid adding `WRITER` to the names below? For example:
   ```
   enum class WriterId {
     kOrcJava = 0,
     /* etc. */
   };

##########
File path: cpp/src/arrow/adapters/orc/adapter_common.h
##########
@@ -0,0 +1,111 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+namespace adapters {
+
+namespace orc {
+
+enum WriterId {
+  ORC_JAVA_WRITER = 0,
+  ORC_CPP_WRITER = 1,
+  PRESTO_WRITER = 2,
+  SCRITCHLEY_GO = 3,
+  TRINO_WRITER = 4,
+  UNKNOWN_WRITER = INT32_MAX
+};
+
+enum WriterVersion {
+  WriterVersion_ORIGINAL = 0,
+  WriterVersion_HIVE_8732 = 1,
+  WriterVersion_HIVE_4243 = 2,
+  WriterVersion_HIVE_12055 = 3,
+  WriterVersion_HIVE_13083 = 4,
+  WriterVersion_ORC_101 = 5,
+  WriterVersion_ORC_135 = 6,
+  WriterVersion_ORC_517 = 7,
+  WriterVersion_ORC_203 = 8,
+  WriterVersion_ORC_14 = 9,
+  WriterVersion_MAX = INT32_MAX
+};
+
+enum CompressionKind {

Review comment:
       We already have such definitions in `arrow/util/compression.h`, is it actually useful to redefine this?

##########
File path: cpp/src/arrow/adapters/orc/CMakeLists.txt
##########
@@ -20,7 +20,8 @@
 #
 
 # Headers: top level
-install(FILES adapter.h DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/adapters/orc")
+install(FILES adapter.h adapter_options.h adapter_common.h

Review comment:
       Hmm, I know that we started this way, but I don't think it makes sense to have "adapter" in the filename, especially as it's already in the directory path. Can you name those `options.h` and `common.h`, simply?

##########
File path: cpp/src/arrow/adapters/orc/adapter_options.h
##########
@@ -0,0 +1,228 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <set>
+#include <sstream>
+
+#include "arrow/adapters/orc/adapter_common.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/util/visibility.h"
+#include "orc/OrcFile.hh"
+
+namespace liborc = orc;

Review comment:
       Is this actually used in this header?

##########
File path: cpp/src/arrow/testing/gtest_util.cc
##########
@@ -122,6 +122,7 @@ void AssertArraysEqualWith(const Array& expected, const Array& actual, bool verb
     if (expected.data()->null_count != actual.data()->null_count) {
       diff << "Null counts differ. Expected " << expected.data()->null_count
            << " but was " << actual.data()->null_count << "\n";
+      ARROW_LOG(FATAL) << diff.str();

Review comment:
       I suppose you'll remove this when ready.

##########
File path: cpp/src/arrow/adapters/orc/adapter_options.h
##########
@@ -0,0 +1,228 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <set>
+#include <sstream>
+
+#include "arrow/adapters/orc/adapter_common.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/util/visibility.h"
+#include "orc/OrcFile.hh"
+
+namespace liborc = orc;
+
+namespace arrow {
+
+namespace adapters {
+
+namespace orc {
+// Components of ORC Writer Options
+
+/**
+ * Options for creating a Writer.
+ */
+class ARROW_EXPORT WriteOptions {
+ public:
+  WriteOptions();
+  WriteOptions(const WriteOptions&);
+  WriteOptions(WriteOptions&);
+  WriteOptions& operator=(const WriteOptions&);
+  virtual ~WriteOptions();
+
+  /**
+   * Get the ORC writer options
+   * @return The ORC writer options this WriteOption encapsulates
+   */
+  std::shared_ptr<liborc::WriterOptions> GetOrcWriterOptions() const {
+    return orc_writer_options_;
+  }
+
+  /**
+   * Set the batch size.
+   */
+  WriteOptions& SetBatchSize(uint64_t size);
+
+  /**
+   * Get the batch size.
+   * @return if not set, return default value.
+   */
+  uint64_t GetBatchSize() const;

Review comment:
       Really, instead of writing and maintaining lots of trivial accessors, I think we should use another idiom where the options are a simple `struct` and the construction of `liborc::WriterOptions` becomes private.
   
   So this would be, for example:
   ```c++
   struct WriteOptions {
     int64_t batch_size = ...;
     int64_t stripe_size = ...;
   };
   
   ```
   
   and the `GetOrcWriterOptions` functionality wouldn't be exposed publicly.

##########
File path: cpp/src/arrow/adapters/orc/adapter_options.h
##########
@@ -0,0 +1,228 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <set>
+#include <sstream>
+
+#include "arrow/adapters/orc/adapter_common.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/util/visibility.h"
+#include "orc/OrcFile.hh"
+
+namespace liborc = orc;
+
+namespace arrow {
+
+namespace adapters {
+
+namespace orc {
+// Components of ORC Writer Options
+
+/**
+ * Options for creating a Writer.
+ */
+class ARROW_EXPORT WriteOptions {
+ public:
+  WriteOptions();
+  WriteOptions(const WriteOptions&);
+  WriteOptions(WriteOptions&);
+  WriteOptions& operator=(const WriteOptions&);
+  virtual ~WriteOptions();
+
+  /**
+   * Get the ORC writer options
+   * @return The ORC writer options this WriteOption encapsulates
+   */
+  std::shared_ptr<liborc::WriterOptions> GetOrcWriterOptions() const {
+    return orc_writer_options_;
+  }
+
+  /**
+   * Set the batch size.
+   */
+  WriteOptions& SetBatchSize(uint64_t size);
+
+  /**
+   * Get the batch size.
+   * @return if not set, return default value.
+   */
+  uint64_t GetBatchSize() const;
+
+  /**
+   * Set the stripe size.
+   */
+  WriteOptions& SetStripeSize(uint64_t size);
+
+  /**
+   * Get the stripe size.
+   * @return if not set, return default value.
+   */
+  uint64_t GetStripeSize() const;

Review comment:
       Unless really useful and/or necessary, we tend to use signed integers, e.g. `int64_t` here.

##########
File path: cpp/src/arrow/adapters/orc/adapter_options.h
##########
@@ -0,0 +1,228 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <set>
+#include <sstream>
+
+#include "arrow/adapters/orc/adapter_common.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/util/visibility.h"
+#include "orc/OrcFile.hh"

Review comment:
       We should try to avoid including third-party headers in public `.h` files.

##########
File path: cpp/src/arrow/adapters/orc/adapter_common.h
##########
@@ -0,0 +1,111 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+namespace adapters {
+
+namespace orc {
+
+enum WriterId {
+  ORC_JAVA_WRITER = 0,
+  ORC_CPP_WRITER = 1,
+  PRESTO_WRITER = 2,
+  SCRITCHLEY_GO = 3,
+  TRINO_WRITER = 4,
+  UNKNOWN_WRITER = INT32_MAX
+};
+
+enum WriterVersion {

Review comment:
       Same here.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org