You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/07/26 15:06:02 UTC

[GitHub] [arrow-cookbook] lidavidm commented on a diff in pull request #227: Adding recipe for custom compute functions

lidavidm commented on code in PR #227:
URL: https://github.com/apache/arrow-cookbook/pull/227#discussion_r930077003


##########
cpp/code/compute_fn.cc:
##########
@@ -0,0 +1,270 @@
+// ------------------------------
+// Dependencies
+
+// standard dependencies
+#include <stdint.h>
+#include <string>
+#include <iostream>
+
+// arrow dependencies
+#include <arrow/api.h>
+#include <arrow/compute/api.h>
+#include <arrow/compute/exec/key_hash.h>
+
+#include "common.h"
+
+
+// >> aliases for types in standard library
+using std::shared_ptr;
+using std::vector;
+
+// arrow util types
+using arrow::Result;
+using arrow::Status;
+using arrow::Datum;
+
+// arrow data types and helpers
+using arrow::UInt32Builder;
+using arrow::Int32Builder;
+
+using arrow::Array;
+using arrow::ArraySpan;
+
+
+// aliases for types used in `NamedScalarFn`
+//    |> kernel parameters
+using arrow::compute::KernelContext;
+using arrow::compute::ExecSpan;
+using arrow::compute::ExecResult;
+
+//    |> other context types
+using arrow::compute::ExecContext;
+using arrow::compute::LightContext;
+
+//    |> common types for compute functions
+using arrow::compute::FunctionRegistry;
+using arrow::compute::FunctionDoc;
+using arrow::compute::InputType;
+using arrow::compute::OutputType;
+using arrow::compute::Arity;
+
+//    |> the "kind" of function we want
+using arrow::compute::ScalarFunction;
+
+//    |> structs and classes for hashing
+using arrow::util::MiniBatch;
+using arrow::util::TempVectorStack;
+
+using arrow::compute::KeyColumnArray;
+using arrow::compute::Hashing32;
+
+//    |> functions used for hashing
+using arrow::compute::ColumnArrayFromArrayData;
+
+
+// ------------------------------
+// Structs and Classes
+
+// >> Documentation for a compute function
+/**
+ * Create a const instance of `FunctionDoc` that contains 3 attributes:
+ *  1. Short description
+ *  2. Long  description (limited to 78 characters)

Review Comment:
   78 characters wide, right?



##########
cpp/code/compute_fn.cc:
##########
@@ -0,0 +1,270 @@
+// ------------------------------
+// Dependencies
+
+// standard dependencies
+#include <stdint.h>
+#include <string>
+#include <iostream>
+
+// arrow dependencies
+#include <arrow/api.h>
+#include <arrow/compute/api.h>
+#include <arrow/compute/exec/key_hash.h>
+
+#include "common.h"
+
+
+// >> aliases for types in standard library
+using std::shared_ptr;
+using std::vector;
+
+// arrow util types
+using arrow::Result;
+using arrow::Status;
+using arrow::Datum;
+
+// arrow data types and helpers
+using arrow::UInt32Builder;
+using arrow::Int32Builder;
+
+using arrow::Array;
+using arrow::ArraySpan;
+
+
+// aliases for types used in `NamedScalarFn`
+//    |> kernel parameters
+using arrow::compute::KernelContext;
+using arrow::compute::ExecSpan;
+using arrow::compute::ExecResult;
+
+//    |> other context types
+using arrow::compute::ExecContext;
+using arrow::compute::LightContext;
+
+//    |> common types for compute functions
+using arrow::compute::FunctionRegistry;
+using arrow::compute::FunctionDoc;
+using arrow::compute::InputType;
+using arrow::compute::OutputType;
+using arrow::compute::Arity;
+
+//    |> the "kind" of function we want
+using arrow::compute::ScalarFunction;
+
+//    |> structs and classes for hashing
+using arrow::util::MiniBatch;
+using arrow::util::TempVectorStack;
+
+using arrow::compute::KeyColumnArray;
+using arrow::compute::Hashing32;
+
+//    |> functions used for hashing
+using arrow::compute::ColumnArrayFromArrayData;
+
+
+// ------------------------------
+// Structs and Classes
+
+// >> Documentation for a compute function
+/**
+ * Create a const instance of `FunctionDoc` that contains 3 attributes:
+ *  1. Short description
+ *  2. Long  description (limited to 78 characters)
+ *  3. Name of input arguments
+ */
+const FunctionDoc named_scalar_fn_doc {
+   "Unary function that calculates a hash for each row of the input"
+  ,"This function uses an xxHash-like algorithm which produces 32-bit hashes."
+  ,{ "input_array" }
+};
+
+
+// >> Kernel implementations for a compute function
+/**
+ * Create implementations that will be associated with our compute function. When a
+ * compute function is invoked, the compute API framework will delegate execution to an
+ * associated kernel that matches: (1) input argument types/shapes and (2) output argument
+ * types/shapes.
+ *
+ * Kernel implementations may be functions or may be methods (functions within a class or
+ * struct).
+ */
+struct NamedScalarFn {
+
+  /**
+   * A kernel implementation that expects a single array as input, and outputs an array of
+   * uint32 values. We write this implementation knowing what function we want to
+   * associate it with ("NamedScalarFn"), but that association is made later (see
+   * `RegisterScalarFnKernels()` below).
+   */
+  static Status
+  Exec(KernelContext *ctx, const ExecSpan &input_arg, ExecResult *out) {
+    StartRecipe("DefineAComputeKernel");
+
+    if (input_arg.num_values() != 1 or not input_arg[0].is_array()) {
+      return Status::Invalid("Unsupported argument types or shape");
+    }
+
+    // >> Initialize stack-based memory allocator with an allocator and memory size
+    TempVectorStack stack_memallocator;
+    auto            input_dtype_width = input_arg[0].type()->bit_width();
+    if (input_dtype_width > 0) {
+      ARROW_RETURN_NOT_OK(
+        stack_memallocator.Init(
+           ctx->exec_context()->memory_pool()
+          ,input_dtype_width * max_batchsize
+        )
+      );
+    }
+
+    // >> Prepare input data structure for propagation to hash function
+    // NOTE: "start row index" and "row count" can potentially be options in the future
+    ArraySpan hash_input    = input_arg[0].array;
+    int64_t   hash_startrow = 0;
+    int64_t   hash_rowcount = hash_input.length;
+    ARROW_ASSIGN_OR_RAISE(
+       KeyColumnArray input_keycol
+      ,ColumnArrayFromArrayData(hash_input.ToArrayData(), hash_startrow, hash_rowcount)
+    );

Review Comment:
   While an actual implementation would want to reuse Arrow's hashing code, I wonder if this example would be clearer and more focused if we implemented a hash function inline (even a fairly trivial one) and removed the use of semi-internal APIs.



##########
cpp/code/compute_fn.cc:
##########
@@ -0,0 +1,270 @@
+// ------------------------------
+// Dependencies
+
+// standard dependencies
+#include <stdint.h>
+#include <string>
+#include <iostream>
+
+// arrow dependencies
+#include <arrow/api.h>
+#include <arrow/compute/api.h>
+#include <arrow/compute/exec/key_hash.h>
+
+#include "common.h"
+
+
+// >> aliases for types in standard library
+using std::shared_ptr;
+using std::vector;
+
+// arrow util types
+using arrow::Result;
+using arrow::Status;
+using arrow::Datum;
+
+// arrow data types and helpers
+using arrow::UInt32Builder;
+using arrow::Int32Builder;
+
+using arrow::Array;
+using arrow::ArraySpan;
+
+
+// aliases for types used in `NamedScalarFn`
+//    |> kernel parameters
+using arrow::compute::KernelContext;
+using arrow::compute::ExecSpan;
+using arrow::compute::ExecResult;
+
+//    |> other context types
+using arrow::compute::ExecContext;
+using arrow::compute::LightContext;
+
+//    |> common types for compute functions
+using arrow::compute::FunctionRegistry;
+using arrow::compute::FunctionDoc;
+using arrow::compute::InputType;
+using arrow::compute::OutputType;
+using arrow::compute::Arity;
+
+//    |> the "kind" of function we want
+using arrow::compute::ScalarFunction;
+
+//    |> structs and classes for hashing
+using arrow::util::MiniBatch;
+using arrow::util::TempVectorStack;
+
+using arrow::compute::KeyColumnArray;
+using arrow::compute::Hashing32;
+
+//    |> functions used for hashing
+using arrow::compute::ColumnArrayFromArrayData;
+
+
+// ------------------------------
+// Structs and Classes
+
+// >> Documentation for a compute function
+/**
+ * Create a const instance of `FunctionDoc` that contains 3 attributes:
+ *  1. Short description
+ *  2. Long  description (limited to 78 characters)
+ *  3. Name of input arguments
+ */
+const FunctionDoc named_scalar_fn_doc {
+   "Unary function that calculates a hash for each row of the input"
+  ,"This function uses an xxHash-like algorithm which produces 32-bit hashes."
+  ,{ "input_array" }
+};
+
+
+// >> Kernel implementations for a compute function
+/**
+ * Create implementations that will be associated with our compute function. When a
+ * compute function is invoked, the compute API framework will delegate execution to an
+ * associated kernel that matches: (1) input argument types/shapes and (2) output argument
+ * types/shapes.
+ *
+ * Kernel implementations may be functions or may be methods (functions within a class or
+ * struct).
+ */
+struct NamedScalarFn {
+
+  /**
+   * A kernel implementation that expects a single array as input, and outputs an array of
+   * uint32 values. We write this implementation knowing what function we want to
+   * associate it with ("NamedScalarFn"), but that association is made later (see
+   * `RegisterScalarFnKernels()` below).
+   */
+  static Status
+  Exec(KernelContext *ctx, const ExecSpan &input_arg, ExecResult *out) {
+    StartRecipe("DefineAComputeKernel");
+
+    if (input_arg.num_values() != 1 or not input_arg[0].is_array()) {
+      return Status::Invalid("Unsupported argument types or shape");
+    }
+
+    // >> Initialize stack-based memory allocator with an allocator and memory size
+    TempVectorStack stack_memallocator;
+    auto            input_dtype_width = input_arg[0].type()->bit_width();
+    if (input_dtype_width > 0) {
+      ARROW_RETURN_NOT_OK(
+        stack_memallocator.Init(
+           ctx->exec_context()->memory_pool()
+          ,input_dtype_width * max_batchsize
+        )
+      );
+    }
+
+    // >> Prepare input data structure for propagation to hash function
+    // NOTE: "start row index" and "row count" can potentially be options in the future

Review Comment:
   You would just slice the input (zero-copy). Scalar functions have to provide a row of output per row of input, so such options wouldn't make sense.



##########
cpp/code/compute_fn.cc:
##########
@@ -0,0 +1,270 @@
+// ------------------------------
+// Dependencies
+
+// standard dependencies
+#include <stdint.h>
+#include <string>
+#include <iostream>
+
+// arrow dependencies
+#include <arrow/api.h>
+#include <arrow/compute/api.h>
+#include <arrow/compute/exec/key_hash.h>
+
+#include "common.h"
+
+
+// >> aliases for types in standard library
+using std::shared_ptr;
+using std::vector;
+
+// arrow util types
+using arrow::Result;
+using arrow::Status;
+using arrow::Datum;
+
+// arrow data types and helpers
+using arrow::UInt32Builder;
+using arrow::Int32Builder;
+
+using arrow::Array;
+using arrow::ArraySpan;
+
+
+// aliases for types used in `NamedScalarFn`
+//    |> kernel parameters
+using arrow::compute::KernelContext;
+using arrow::compute::ExecSpan;
+using arrow::compute::ExecResult;
+
+//    |> other context types
+using arrow::compute::ExecContext;
+using arrow::compute::LightContext;
+
+//    |> common types for compute functions
+using arrow::compute::FunctionRegistry;
+using arrow::compute::FunctionDoc;
+using arrow::compute::InputType;
+using arrow::compute::OutputType;
+using arrow::compute::Arity;
+
+//    |> the "kind" of function we want
+using arrow::compute::ScalarFunction;
+
+//    |> structs and classes for hashing
+using arrow::util::MiniBatch;
+using arrow::util::TempVectorStack;
+
+using arrow::compute::KeyColumnArray;
+using arrow::compute::Hashing32;
+
+//    |> functions used for hashing
+using arrow::compute::ColumnArrayFromArrayData;
+
+
+// ------------------------------
+// Structs and Classes
+
+// >> Documentation for a compute function
+/**
+ * Create a const instance of `FunctionDoc` that contains 3 attributes:
+ *  1. Short description
+ *  2. Long  description (limited to 78 characters)
+ *  3. Name of input arguments
+ */
+const FunctionDoc named_scalar_fn_doc {
+   "Unary function that calculates a hash for each row of the input"
+  ,"This function uses an xxHash-like algorithm which produces 32-bit hashes."
+  ,{ "input_array" }
+};
+
+
+// >> Kernel implementations for a compute function
+/**
+ * Create implementations that will be associated with our compute function. When a
+ * compute function is invoked, the compute API framework will delegate execution to an
+ * associated kernel that matches: (1) input argument types/shapes and (2) output argument
+ * types/shapes.
+ *
+ * Kernel implementations may be functions or may be methods (functions within a class or
+ * struct).
+ */
+struct NamedScalarFn {
+
+  /**
+   * A kernel implementation that expects a single array as input, and outputs an array of
+   * uint32 values. We write this implementation knowing what function we want to
+   * associate it with ("NamedScalarFn"), but that association is made later (see
+   * `RegisterScalarFnKernels()` below).
+   */
+  static Status
+  Exec(KernelContext *ctx, const ExecSpan &input_arg, ExecResult *out) {
+    StartRecipe("DefineAComputeKernel");
+
+    if (input_arg.num_values() != 1 or not input_arg[0].is_array()) {
+      return Status::Invalid("Unsupported argument types or shape");
+    }
+
+    // >> Initialize stack-based memory allocator with an allocator and memory size
+    TempVectorStack stack_memallocator;
+    auto            input_dtype_width = input_arg[0].type()->bit_width();
+    if (input_dtype_width > 0) {
+      ARROW_RETURN_NOT_OK(
+        stack_memallocator.Init(
+           ctx->exec_context()->memory_pool()
+          ,input_dtype_width * max_batchsize
+        )
+      );
+    }

Review Comment:
   Wouldn't this fail for 0-width types (if that's even a thing)? Or, more to the point, is the conditional necessary at all?



##########
cpp/code/compute_fn.cc:
##########
@@ -0,0 +1,270 @@
+// ------------------------------
+// Dependencies
+
+// standard dependencies
+#include <stdint.h>
+#include <string>
+#include <iostream>
+
+// arrow dependencies
+#include <arrow/api.h>
+#include <arrow/compute/api.h>
+#include <arrow/compute/exec/key_hash.h>
+
+#include "common.h"
+
+
+// >> aliases for types in standard library
+using std::shared_ptr;
+using std::vector;
+
+// arrow util types
+using arrow::Result;
+using arrow::Status;
+using arrow::Datum;
+
+// arrow data types and helpers
+using arrow::UInt32Builder;
+using arrow::Int32Builder;
+
+using arrow::Array;
+using arrow::ArraySpan;
+
+
+// aliases for types used in `NamedScalarFn`
+//    |> kernel parameters
+using arrow::compute::KernelContext;
+using arrow::compute::ExecSpan;
+using arrow::compute::ExecResult;
+
+//    |> other context types
+using arrow::compute::ExecContext;
+using arrow::compute::LightContext;
+
+//    |> common types for compute functions
+using arrow::compute::FunctionRegistry;
+using arrow::compute::FunctionDoc;
+using arrow::compute::InputType;
+using arrow::compute::OutputType;
+using arrow::compute::Arity;
+
+//    |> the "kind" of function we want
+using arrow::compute::ScalarFunction;
+
+//    |> structs and classes for hashing
+using arrow::util::MiniBatch;
+using arrow::util::TempVectorStack;
+
+using arrow::compute::KeyColumnArray;
+using arrow::compute::Hashing32;
+
+//    |> functions used for hashing
+using arrow::compute::ColumnArrayFromArrayData;
+
+
+// ------------------------------
+// Structs and Classes
+
+// >> Documentation for a compute function
+/**
+ * Create a const instance of `FunctionDoc` that contains 3 attributes:
+ *  1. Short description
+ *  2. Long  description (limited to 78 characters)
+ *  3. Name of input arguments
+ */
+const FunctionDoc named_scalar_fn_doc {
+   "Unary function that calculates a hash for each row of the input"
+  ,"This function uses an xxHash-like algorithm which produces 32-bit hashes."
+  ,{ "input_array" }
+};
+
+
+// >> Kernel implementations for a compute function
+/**
+ * Create implementations that will be associated with our compute function. When a
+ * compute function is invoked, the compute API framework will delegate execution to an
+ * associated kernel that matches: (1) input argument types/shapes and (2) output argument
+ * types/shapes.
+ *
+ * Kernel implementations may be functions or may be methods (functions within a class or
+ * struct).
+ */
+struct NamedScalarFn {
+
+  /**
+   * A kernel implementation that expects a single array as input, and outputs an array of
+   * uint32 values. We write this implementation knowing what function we want to
+   * associate it with ("NamedScalarFn"), but that association is made later (see
+   * `RegisterScalarFnKernels()` below).
+   */
+  static Status
+  Exec(KernelContext *ctx, const ExecSpan &input_arg, ExecResult *out) {
+    StartRecipe("DefineAComputeKernel");

Review Comment:
   The StartRecipe/EndRecipe might be a little too limiting in this case, since presumably we want to show the entire struct.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org