You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/01/26 21:36:03 UTC
[GitHub] [arrow] bkietz commented on a change in pull request #9323: ARROW-10438: [C++][Dataset] Partitioning::Format on nulls

bkietz commented on a change in pull request #9323:
URL: https://github.com/apache/arrow/pull/9323#discussion_r564831474



##########
File path: cpp/src/arrow/compute/kernels/vector_hash.cc
##########
@@ -451,8 +487,12 @@ struct HashKernelTraits<Type, Action, enable_if_has_string_view<Type>> {
 template <typename Type, typename Action>
 std::unique_ptr<HashKernel> HashInitImpl(KernelContext* ctx, const KernelInitArgs& args) {
   using HashKernelType = typename HashKernelTraits<Type, Action>::HashKernel;
-  auto result = ::arrow::internal::make_unique<HashKernelType>(args.inputs[0].type,
-                                                               ctx->memory_pool());
+  DictionaryEncodeOptions options;
+  if (auto options_ptr = static_cast<const DictionaryEncodeOptions*>(args.options)) {

Review comment:
       This seems fragile; if other functions which use `HashInitImpl` add options this will break. Would it be possible to branch on the Action instead?
   
   ```suggestion
     if (std::is_same<Action, DictEncodeAction::value) {
   ```
   
   (currently dictionary_encode uses ValueCountsAction in Init, which is confusing and would invalidate the above approach)

##########
File path: cpp/src/arrow/dataset/partition.cc
##########
@@ -625,9 +621,17 @@ class StructDictionary {
 
  private:
   Status AddOne(Datum column, std::shared_ptr<Int32Array>* fused_indices) {
-    if (column.type()->id() != Type::DICTIONARY) {
-      ARROW_ASSIGN_OR_RAISE(column, compute::DictionaryEncode(std::move(column)));
+    if (column.type()->id() == Type::DICTIONARY) {
+      // compute::DictionaryEncode doesn't support dictionary and, even if it did, it
+      // would be a null op and return a flat dictionary.  In order to group by dictionary
+      // we would need to be able to create a nested dictionary.
+      return Status::NotImplemented(
+          "Cannot use column of type dictionary as grouping criteria");

Review comment:
       Please revert this: in case of a dictionary array, dictionary encode is unnecessary as we already have the dictionary and the indices and they can be used unchanged below.

##########
File path: cpp/src/arrow/compute/api_vector.h
##########
@@ -296,7 +315,8 @@ Result<std::shared_ptr<StructArray>> ValueCounts(const Datum& value,
 /// \since 1.0.0
 /// \note API not yet finalized
 ARROW_EXPORT
-Result<Datum> DictionaryEncode(const Datum& data, ExecContext* ctx = NULLPTR);
+Result<Datum> DictionaryEncode(const Datum& data, const DictionaryEncodeOptions& options,

Review comment:
       You could provide the default dict encode options here:
   ```suggestion
   Result<Datum> DictionaryEncode(const Datum& data,
                                  const DictionaryEncodeOptions& options = DictionaryEncodeOptions::Defaults(),
   ```

##########
File path: cpp/src/arrow/compute/api_vector.h
##########
@@ -63,6 +63,25 @@ enum class SortOrder {
   Descending,
 };
 
+struct DictionaryEncodeOptions : public FunctionOptions {
+  /// Configure how null values will be encoded
+  enum NullEncodingBehavior {
+    /// the null value will be added to the dictionary with a proper index
+    ENCODE,
+    /// the null value will be masked in the indices array
+    MASK,
+    /// the null value will not be included in the dictionary
+    SKIP

Review comment:
       These are confusingly worded. It'd also be good to have inline examples of each behavior
   ```suggestion
       /// nulls result in an index referencing a null slot in the dictionary
       ENCODE,
       /// nulls result in a null index (does not correspond to any slot in the dictionary)
       MASK,
       /// nulls are ignored
       IGNORE
   ```

##########
File path: cpp/src/arrow/compute/kernels/vector_hash_test.cc
##########
@@ -649,13 +692,42 @@ TEST_F(TestHashKernel, ChunkedArrayInvoke) {
 TEST_F(TestHashKernel, ZeroLengthDictionaryEncode) {
   // ARROW-7008
   auto values = ArrayFromJSON(utf8(), "[]");
-  ASSERT_OK_AND_ASSIGN(Datum datum_result, DictionaryEncode(values));
+  ASSERT_OK_AND_ASSIGN(Datum datum_result,
+                       DictionaryEncode(values, DictionaryEncodeOptions::Defaults()));
 
   std::shared_ptr<Array> result = datum_result.make_array();
   const auto& dict_result = checked_cast<const DictionaryArray&>(*result);
   ASSERT_OK(dict_result.ValidateFull());
 }
 
+TEST_F(TestHashKernel, NullEncodingSchemes) {
+  auto values = ArrayFromJSON(uint8(), "[1, 1, null, 2, null]");
+
+  // Masking should put null in the indices array
+  auto expected_mask_indices = ArrayFromJSON(int32(), "[0, 0, null, 1, null]");
+  auto expected_mask_dictionary = ArrayFromJSON(uint8(), "[1, 2]");
+  auto dictionary_type = dictionary(int32(), uint8());
+  std::shared_ptr<Array> expected = std::make_shared<DictionaryArray>(
+      dictionary_type, expected_mask_indices, expected_mask_dictionary);
+
+  ASSERT_OK_AND_ASSIGN(Datum datum_result,
+                       DictionaryEncode(values, DictionaryEncodeOptions::Defaults()));
+  std::shared_ptr<Array> result = datum_result.make_array();
+  AssertArraysEqual(*expected, *result);
+
+  // Encoding should put null in the dictionary
+  auto expected_encoded_indices = ArrayFromJSON(int32(), "[0, 0, 1, 2, 1]");
+  auto expected_encoded_dict = ArrayFromJSON(uint8(), "[1, null, 2]");
+  expected = std::make_shared<DictionaryArray>(dictionary_type, expected_encoded_indices,
+                                               expected_encoded_dict);
+
+  auto options = DictionaryEncodeOptions::Defaults();
+  options.null_encoding_behavior = DictionaryEncodeOptions::ENCODE;
+  ASSERT_OK_AND_ASSIGN(datum_result, DictionaryEncode(values, options));
+  result = datum_result.make_array();
+  AssertArraysEqual(*expected, *result);
+}

Review comment:
       It'd be acceptable to just remove `DictionaryEncodeOptions::SKIP` for now; we can add it in a follow up if there's demand

##########
File path: cpp/src/arrow/pretty_print.cc
##########
@@ -670,4 +670,49 @@ Status PrettyPrint(const Schema& schema, const PrettyPrintOptions& options,
   return Status::OK();
 }
 
+void GdbPrintArray(const Array& arr, int indent) {

Review comment:
       Thanks for working on these, but please revert them here and add them in a dedicated PR




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org