You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/01/24 15:07:46 UTC

[GitHub] [arrow] jorisvandenbossche commented on a change in pull request #9702: ARROW-11297: [C++][Python] Add ORC writer options

jorisvandenbossche commented on a change in pull request #9702:
URL: https://github.com/apache/arrow/pull/9702#discussion_r790787342



##########
File path: python/pyarrow/orc.py
##########
@@ -118,21 +186,93 @@ def read(self, columns=None):
         return self.reader.read(columns=columns)
 
 
-class ORCWriter:
-    """
-    Writer interface for a single ORC file
+_orc_writer_args_docs = """file_version : {"0.11", "0.12"}, default "0.12"
+    Determine which ORC file version to use.
+    `Hive 0.11 / ORC v0 <https://orc.apache.org/specification/ORCv0/>`_
+    is the older version
+    while `Hive 0.12 / ORC v1 <https://orc.apache.org/specification/ORCv1/>`_
+    is the newer one.
+batch_size : int, default 1024
+    Number of rows the ORC writer writes at a time.
+stripe_size : int, default 64 * 1024 * 1024
+    Size of each ORC stripe.

Review comment:
       I think we should explicitly mention here that this is in bytes (given this has been a big source of confusion for ourselves :))

##########
File path: cpp/src/arrow/adapters/orc/options.h
##########
@@ -0,0 +1,119 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <vector>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/status.h"
+#include "arrow/util/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+namespace adapters {
+
+namespace orc {
+
+enum class WriterId : int32_t {
+  kOrcJava = 0,
+  kOrcCpp = 1,
+  kPresto = 2,
+  kScritchleyGo = 3,
+  kTrino = 4,
+  kUnknown = INT32_MAX
+};
+
+enum class WriterVersion : int32_t {
+  kOriginal = 0,
+  kHive8732 = 1,
+  kHive4243 = 2,
+  kHive12055 = 3,
+  kHive13083 = 4,
+  kOrc101 = 5,
+  kOrc135 = 6,
+  kOrc517 = 7,
+  kOrc203 = 8,
+  kOrc14 = 9,
+  kMax = INT32_MAX
+};
+
+enum class CompressionStrategy : int32_t { kSpeed = 0, kCompression };
+
+class ARROW_EXPORT FileVersion {
+ private:
+  int32_t major_version;
+  int32_t minor_version;
+
+ public:
+  static const FileVersion& v_0_11();
+  static const FileVersion& v_0_12();
+
+  FileVersion(int32_t major, int32_t minor)
+      : major_version(major), minor_version(minor) {}
+
+  /**
+   * Get major version
+   */
+  int32_t major() const { return this->major_version; }
+
+  /**
+   * Get minor version
+   */
+  int32_t minor() const { return this->minor_version; }
+
+  bool operator==(const FileVersion& right) const {
+    return this->major_version == right.major() && this->minor_version == right.minor();
+  }
+
+  bool operator!=(const FileVersion& right) const { return !(*this == right); }
+
+  std::string ToString() const;
+};
+
+/// Options for the ORC Writer
+struct ARROW_EXPORT WriteOptions {
+  /// Number of rows the ORC writer writes at a time, default 1024
+  int64_t batch_size = 1024;

Review comment:
       Previously, we used a `constexpr uint64_t kOrcWriterBatchSize = 128 * 1024;`, so the batch size is now considerably smaller. Was that intentional? Might this have consequences for performance?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org