You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/07/30 20:42:24 UTC

[GitHub] [arrow] westonpace commented on a change in pull request #10802: ARROW-1568: [C++] Implement Drop Null Kernel for Arrays

westonpace commented on a change in pull request #10802:
URL: https://github.com/apache/arrow/pull/10802#discussion_r680168763



##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -1988,7 +1989,7 @@ class FilterMetaFunction : public MetaFunction {
 
 Result<std::shared_ptr<Array>> TakeAA(const Array& values, const Array& indices,
                                       const TakeOptions& options, ExecContext* ctx) {
-  ARROW_ASSIGN_OR_RAISE(Datum result,
+    ARROW_ASSIGN_OR_RAISE(Datum result,

Review comment:
       What's going on here?  I've found it easiest to configure the IDE to run format-on-save.

##########
File path: python/pyarrow/tests/test_compute.py
##########
@@ -974,6 +974,51 @@ def test_take_null_type():
     assert len(table.take(indices).column(0)) == 4
 
 
+@pytest.mark.parametrize(('ty', 'values'), all_array_types)
+def test_dropnull(ty, values):
+    arr = pa.array(values, type=ty)
+    result = arr.dropnull()
+    result.validate()
+    indices = [i for i in range(len(arr)) if arr[i].is_valid]
+    expected = arr.take(pa.array(indices))
+    assert result.equals(expected)
+
+
+def test_dropnull_chunked_array():
+    arr = pa.chunked_array([["a", None], ["c", "d", None]])
+    expected_drop = pa.chunked_array([["a"], ["c", "d"]])
+    result = arr.dropnull()
+    assert result.equals(expected_drop)
+
+
+def test_dropnull_record_batch():
+    batch = pa.record_batch(
+        [pa.array(["a", None, "c", "d", None])], names=["a'"])

Review comment:
       Can you make some tests for record batches with multiple columns? (I think this will fail if they have different numbers of null elements).

##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -2146,6 +2147,144 @@ class TakeMetaFunction : public MetaFunction {
   }
 };
 
+// ----------------------------------------------------------------------
+// DropNull Implementation
+
+Result<std::shared_ptr<arrow::Array>> GetNotNullIndices(
+    const std::shared_ptr<Array>& column, MemoryPool* memory_pool) {
+  std::shared_ptr<arrow::Array> indices;
+  arrow::NumericBuilder<arrow::Int32Type> builder(memory_pool);
+  builder.Reserve(column->length() - column->null_count());
+
+  std::vector<int32_t> values;
+  for (int64_t i = 0; i < column->length(); i++) {
+    if (column->IsValid(i)) {
+      builder.UnsafeAppend(static_cast<int32_t>(i));
+    }
+  }
+  RETURN_NOT_OK(builder.Finish(&indices));
+  return indices;
+}
+
+Result<std::shared_ptr<arrow::Array>> GetNotNullIndices(
+    const std::shared_ptr<ChunkedArray>& chunks, MemoryPool* memory_pool) {
+  std::shared_ptr<arrow::Array> indices;
+  arrow::NumericBuilder<arrow::Int32Type> builder(memory_pool);
+  builder.Reserve(chunks->length() - chunks->null_count());
+  int64_t relative_index = 0;
+  for (int64_t chunk_index = 0; chunk_index < chunks->num_chunks(); ++chunk_index) {
+    auto column_chunk = chunks->chunk(chunk_index);
+    for (int64_t col_index = 0; col_index < column_chunk->length(); col_index++) {
+      if (column_chunk->IsValid(col_index)) {
+        builder.UnsafeAppend(static_cast<int32_t>(relative_index + col_index));
+      }
+    }
+    relative_index += column_chunk->length();
+  }
+  RETURN_NOT_OK(builder.Finish(&indices));
+  return indices;
+}
+
+Result<std::shared_ptr<RecordBatch>> DropNullRecordBatch(const RecordBatch& batch,
+                                                         ExecContext* ctx) {
+  int64_t length_count = 0;
+  std::vector<std::shared_ptr<Array>> columns(batch.num_columns());
+  for (int i = 0; i < batch.num_columns(); ++i) {
+    ARROW_ASSIGN_OR_RAISE(auto indices,
+                          GetNotNullIndices(batch.column(i), ctx->memory_pool()));
+    ARROW_ASSIGN_OR_RAISE(Datum out, Take(batch.column(i)->data(), Datum(indices),
+                                          TakeOptions::NoBoundsCheck(), ctx));
+    columns[i] = out.make_array();
+    length_count += columns[i]->length();
+  }
+  return RecordBatch::Make(batch.schema(), length_count, std::move(columns));
+}
+
+Result<std::shared_ptr<Table>> DropNullTable(const Table& table, ExecContext* ctx) {
+  if (table.num_rows() == 0) {
+    return Table::Make(table.schema(), table.columns(), 0);
+  }
+  const int num_columns = table.num_columns();
+  std::vector<ArrayVector> inputs(num_columns);
+
+  // Fetch table columns
+  for (int i = 0; i < num_columns; ++i) {
+    inputs[i] = table.column(i)->chunks();
+  }
+  std::set<int32_t> notnull_indices;
+  // Rechunk inputs to allow consistent iteration over their respective chunks
+  inputs = arrow::internal::RechunkArraysConsistently(inputs);
+
+  const int64_t num_chunks = static_cast<int64_t>(inputs.back().size());
+  for (int col = 0; col < num_columns; ++col) {
+    int64_t relative_index = 0;
+    for (int64_t chunk_index = 0; chunk_index < num_chunks; ++chunk_index) {
+      const auto& column_chunk = inputs[col][chunk_index];
+      for (int64_t i = 0; i < column_chunk->length(); ++i) {
+        if (!column_chunk->IsValid(i)) {
+          notnull_indices.insert(relative_index + i);
+        }
+      }
+      relative_index += column_chunk->length();
+    }
+  }
+  std::shared_ptr<arrow::Array> indices;

Review comment:
       Can you add a check here.  If the length of `notnull_indices` is equal to the length of the table then you can just return the table an save some time.

##########
File path: cpp/src/arrow/compute/kernels/vector_selection_test.cc
##########
@@ -1718,5 +1718,240 @@ TEST(TestTake, RandomFixedSizeBinary) {
   TakeRandomTest<FixedSizeBinaryType>::Test(fixed_size_binary(16));
 }
 
+// ----------------------------------------------------------------------
+// DropNull tests
+
+void AssertDropNullArrays(const std::shared_ptr<Array>& values,
+                          const std::shared_ptr<Array>& expected) {
+  ASSERT_OK_AND_ASSIGN(std::shared_ptr<Array> actual, DropNull(*values));
+  ValidateOutput(actual);
+  AssertArraysEqual(*expected, *actual, /*verbose=*/true);
+}
+
+Status DropNullJSON(const std::shared_ptr<DataType>& type, const std::string& values,
+                    std::shared_ptr<Array>* out) {
+  return DropNull(*ArrayFromJSON(type, values)).Value(out);
+}
+
+void CheckDropNull(const std::shared_ptr<DataType>& type, const std::string& values,
+                   const std::string& expected) {
+  std::shared_ptr<Array> actual;
+
+  ASSERT_OK(DropNullJSON(type, values, &actual));
+  ValidateOutput(actual);
+  AssertArraysEqual(*ArrayFromJSON(type, expected), *actual, /*verbose=*/true);
+}
+
+struct TestDropNullKernel : public ::testing::Test {
+  void TestNoValidityBitmapButUnknownNullCount(const std::shared_ptr<Array>& values) {
+    ASSERT_EQ(values->null_count(), 0);
+    auto expected = (*DropNull(values)).make_array();
+
+    auto new_values = MakeArray(values->data()->Copy());
+    new_values->data()->buffers[0].reset();
+    new_values->data()->null_count = kUnknownNullCount;
+    auto result = (*DropNull(new_values)).make_array();
+    AssertArraysEqual(*expected, *result);
+  }
+
+  void TestNoValidityBitmapButUnknownNullCount(const std::shared_ptr<DataType>& type,
+                                               const std::string& values) {
+    TestNoValidityBitmapButUnknownNullCount(ArrayFromJSON(type, values));
+  }
+};
+
+TEST_F(TestDropNullKernel, DropNull) {
+  CheckDropNull(null(), "[null, null, null]", "[]");
+  CheckDropNull(null(), "[null]", "[]");
+}
+
+TEST_F(TestDropNullKernel, DropNullBoolean) {
+  CheckDropNull(boolean(), "[true, false, true]", "[true, false, true]");
+  CheckDropNull(boolean(), "[null, false, true]", "[false, true]");
+
+  TestNoValidityBitmapButUnknownNullCount(boolean(), "[true, false, true]");
+}
+
+template <typename ArrowType>
+class TestDropNullKernelTyped : public TestDropNullKernel {};
+
+template <typename ArrowType>
+class TestDropNullKernelWithNumeric : public TestDropNullKernelTyped<ArrowType> {
+ protected:
+  void AssertDropNull(const std::string& values, const std::string& expected) {
+    CheckDropNull(type_singleton(), values, expected);
+  }
+
+  std::shared_ptr<DataType> type_singleton() {
+    return TypeTraits<ArrowType>::type_singleton();
+  }
+};
+
+TYPED_TEST_SUITE(TestDropNullKernelWithNumeric, NumericArrowTypes);
+TYPED_TEST(TestDropNullKernelWithNumeric, DropNullNumeric) {
+  this->AssertDropNull("[7, 8, 9]", "[7, 8, 9]");
+  this->AssertDropNull("[null, 8, 9]", "[8, 9]");
+  this->AssertDropNull("[null, null, null]", "[]");
+}
+
+template <typename TypeClass>
+class TestDropNullKernelWithString : public TestDropNullKernelTyped<TypeClass> {
+ public:
+  std::shared_ptr<DataType> value_type() {
+    return TypeTraits<TypeClass>::type_singleton();
+  }
+
+  void AssertDropNull(const std::string& values, const std::string& expected) {
+    CheckDropNull(value_type(), values, expected);
+  }
+
+  void AssertDropNullDictionary(const std::string& dictionary_values,
+                                const std::string& dictionary_indices,
+                                const std::string& expected_indices) {
+    auto dict = ArrayFromJSON(value_type(), dictionary_values);
+    auto type = dictionary(int8(), value_type());
+    ASSERT_OK_AND_ASSIGN(auto values,
+                         DictionaryArray::FromArrays(
+                             type, ArrayFromJSON(int8(), dictionary_indices), dict));
+    ASSERT_OK_AND_ASSIGN(
+        auto expected,
+        DictionaryArray::FromArrays(type, ArrayFromJSON(int8(), expected_indices), dict));
+    AssertDropNullArrays(values, expected);
+  }
+};
+
+TYPED_TEST_SUITE(TestDropNullKernelWithString, BinaryTypes);
+
+TYPED_TEST(TestDropNullKernelWithString, DropNullString) {
+  this->AssertDropNull(R"(["a", "b", "c"])", R"(["a", "b", "c"])");
+  this->AssertDropNull(R"([null, "b", "c"])", "[\"b\", \"c\"]");
+  this->AssertDropNull(R"(["a", "b", null])", R"(["a", "b"])");
+
+  this->TestNoValidityBitmapButUnknownNullCount(this->value_type(), R"(["a", "b", "c"])");
+}
+
+TYPED_TEST(TestDropNullKernelWithString, DropNullDictionary) {
+  auto dict = R"(["a", "b", "c", "d", "e"])";
+  this->AssertDropNullDictionary(dict, "[3, 4, 2]", "[3, 4, 2]");
+  this->AssertDropNullDictionary(dict, "[null, 4, 2]", "[4, 2]");
+}
+
+class TestDropNullKernelFSB : public TestDropNullKernelTyped<FixedSizeBinaryType> {
+ public:
+  std::shared_ptr<DataType> value_type() { return fixed_size_binary(3); }
+
+  void AssertDropNull(const std::string& values, const std::string& expected) {
+    CheckDropNull(value_type(), values, expected);
+  }
+};
+
+TEST_F(TestDropNullKernelFSB, DropNullFixedSizeBinary) {
+  this->AssertDropNull(R"(["aaa", "bbb", "ccc"])", R"(["aaa", "bbb", "ccc"])");
+  this->AssertDropNull(R"([null, "bbb", "ccc"])", "[\"bbb\", \"ccc\"]");
+
+  this->TestNoValidityBitmapButUnknownNullCount(this->value_type(),
+                                                R"(["aaa", "bbb", "ccc"])");
+}
+
+class TestDropNullKernelWithList : public TestDropNullKernelTyped<ListType> {};
+
+TEST_F(TestDropNullKernelWithList, DropNullListInt32) {
+  std::string list_json = "[[], [1,2], null, [3]]";
+  CheckDropNull(list(int32()), list_json, "[[], [1,2], [3]]");
+  this->TestNoValidityBitmapButUnknownNullCount(list(int32()), "[[], [1,2], [3]]");
+}
+
+TEST_F(TestDropNullKernelWithList, DropNullListListInt32) {
+  std::string list_json = R"([
+    [],
+    [[1], [2, null, 2], []],
+    null,
+    [[3, null], null]
+  ])";
+  auto type = list(list(int32()));
+  CheckDropNull(type, list_json, R"([
+    [],
+    [[1], [2, null, 2], []],
+    [[3, null], null]
+  ])");
+
+  this->TestNoValidityBitmapButUnknownNullCount(type,
+                                                "[[[1], [2, null, 2], []], [[3, null]]]");
+}
+
+class TestDropNullKernelWithLargeList : public TestDropNullKernelTyped<LargeListType> {};
+
+TEST_F(TestDropNullKernelWithLargeList, DropNullLargeListInt32) {
+  std::string list_json = "[[], [1,2], null, [3]]";
+  CheckDropNull(large_list(int32()), list_json, "[[], [1,2],  [3]]");
+
+  this->TestNoValidityBitmapButUnknownNullCount(
+      fixed_size_list(int32(), 3), "[[1, null, 3], [4, 5, 6], [7, 8, null]]");
+}
+
+class TestDropNullKernelWithFixedSizeList
+    : public TestDropNullKernelTyped<FixedSizeListType> {};
+
+TEST_F(TestDropNullKernelWithFixedSizeList, DropNullFixedSizeListInt32) {
+  std::string list_json = "[null, [1, null, 3], [4, 5, 6], [7, 8, null]]";
+  CheckDropNull(fixed_size_list(int32(), 3), list_json,
+                "[[1, null, 3], [4, 5, 6], [7, 8, null]]");
+
+  this->TestNoValidityBitmapButUnknownNullCount(
+      fixed_size_list(int32(), 3), "[[1, null, 3], [4, 5, 6], [7, 8, null]]");
+}
+
+class TestDropNullKernelWithMap : public TestDropNullKernelTyped<MapType> {};
+
+TEST_F(TestDropNullKernelWithMap, DropNullMapStringToInt32) {
+  std::string map_json = R"([
+    [["joe", 0], ["mark", null]],
+    null,
+    [["cap", 8]],
+    []
+  ])";
+  std::string expected_json = R"([
+    [["joe", 0], ["mark", null]],
+    [["cap", 8]],
+    []
+  ])";
+  CheckDropNull(map(utf8(), int32()), map_json, expected_json);
+}
+
+class TestDropNullKernelWithStruct : public TestDropNullKernelTyped<StructType> {};
+
+TEST_F(TestDropNullKernelWithStruct, DropNullStruct) {
+  auto struct_type = struct_({field("a", int32()), field("b", utf8())});
+  auto struct_json = R"([
+    null,
+    {"a": 1, "b": ""},
+    {"a": 2, "b": "hello"},
+    {"a": 4, "b": "eh"}
+  ])";
+  auto expected_struct_json = R"([
+    {"a": 1, "b": ""},
+    {"a": 2, "b": "hello"},
+    {"a": 4, "b": "eh"}
+  ])";
+  CheckDropNull(struct_type, struct_json, expected_struct_json);
+  this->TestNoValidityBitmapButUnknownNullCount(struct_type, expected_struct_json);
+}
+
+class TestDropNullKernelWithUnion : public TestDropNullKernelTyped<UnionType> {};
+
+TEST_F(TestDropNullKernelWithUnion, DropNullUnion) {
+  auto union_type = dense_union({field("a", int32()), field("b", utf8())}, {2, 5});
+  auto union_json = R"([
+      [2, null],
+      [2, 222],
+      [5, "hello"],
+      [5, "eh"],
+      [2, null],
+      [2, 111],
+      [5, null]
+    ])";
+  CheckDropNull(union_type, union_json, union_json);
+}
+

Review comment:
       I don't know what the rules are for what parts get tested in C++ and what parts get tested in JSON.  @nirandaperera / @bkietz maybe you can help me out here.  However, there are no C++ tests for the record batch, chunked array, or table versionsof drop null.  Though maybe this is ok.

##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -2146,6 +2147,144 @@ class TakeMetaFunction : public MetaFunction {
   }
 };
 
+// ----------------------------------------------------------------------
+// DropNull Implementation
+
+Result<std::shared_ptr<arrow::Array>> GetNotNullIndices(
+    const std::shared_ptr<Array>& column, MemoryPool* memory_pool) {
+  std::shared_ptr<arrow::Array> indices;
+  arrow::NumericBuilder<arrow::Int32Type> builder(memory_pool);
+  builder.Reserve(column->length() - column->null_count());
+
+  std::vector<int32_t> values;
+  for (int64_t i = 0; i < column->length(); i++) {
+    if (column->IsValid(i)) {
+      builder.UnsafeAppend(static_cast<int32_t>(i));
+    }
+  }
+  RETURN_NOT_OK(builder.Finish(&indices));
+  return indices;
+}
+
+Result<std::shared_ptr<arrow::Array>> GetNotNullIndices(
+    const std::shared_ptr<ChunkedArray>& chunks, MemoryPool* memory_pool) {
+  std::shared_ptr<arrow::Array> indices;
+  arrow::NumericBuilder<arrow::Int32Type> builder(memory_pool);
+  builder.Reserve(chunks->length() - chunks->null_count());
+  int64_t relative_index = 0;
+  for (int64_t chunk_index = 0; chunk_index < chunks->num_chunks(); ++chunk_index) {
+    auto column_chunk = chunks->chunk(chunk_index);
+    for (int64_t col_index = 0; col_index < column_chunk->length(); col_index++) {
+      if (column_chunk->IsValid(col_index)) {
+        builder.UnsafeAppend(static_cast<int32_t>(relative_index + col_index));
+      }
+    }
+    relative_index += column_chunk->length();
+  }
+  RETURN_NOT_OK(builder.Finish(&indices));
+  return indices;
+}
+
+Result<std::shared_ptr<RecordBatch>> DropNullRecordBatch(const RecordBatch& batch,
+                                                         ExecContext* ctx) {
+  int64_t length_count = 0;
+  std::vector<std::shared_ptr<Array>> columns(batch.num_columns());
+  for (int i = 0; i < batch.num_columns(); ++i) {
+    ARROW_ASSIGN_OR_RAISE(auto indices,
+                          GetNotNullIndices(batch.column(i), ctx->memory_pool()));
+    ARROW_ASSIGN_OR_RAISE(Datum out, Take(batch.column(i)->data(), Datum(indices),
+                                          TakeOptions::NoBoundsCheck(), ctx));
+    columns[i] = out.make_array();
+    length_count += columns[i]->length();
+  }
+  return RecordBatch::Make(batch.schema(), length_count, std::move(columns));
+}
+
+Result<std::shared_ptr<Table>> DropNullTable(const Table& table, ExecContext* ctx) {
+  if (table.num_rows() == 0) {
+    return Table::Make(table.schema(), table.columns(), 0);
+  }
+  const int num_columns = table.num_columns();
+  std::vector<ArrayVector> inputs(num_columns);
+
+  // Fetch table columns
+  for (int i = 0; i < num_columns; ++i) {
+    inputs[i] = table.column(i)->chunks();
+  }
+  std::set<int32_t> notnull_indices;
+  // Rechunk inputs to allow consistent iteration over their respective chunks
+  inputs = arrow::internal::RechunkArraysConsistently(inputs);
+
+  const int64_t num_chunks = static_cast<int64_t>(inputs.back().size());
+  for (int col = 0; col < num_columns; ++col) {
+    int64_t relative_index = 0;
+    for (int64_t chunk_index = 0; chunk_index < num_chunks; ++chunk_index) {
+      const auto& column_chunk = inputs[col][chunk_index];
+      for (int64_t i = 0; i < column_chunk->length(); ++i) {
+        if (!column_chunk->IsValid(i)) {
+          notnull_indices.insert(relative_index + i);
+        }
+      }
+      relative_index += column_chunk->length();
+    }
+  }
+  std::shared_ptr<arrow::Array> indices;
+  arrow::NumericBuilder<arrow::Int32Type> builder(ctx->memory_pool());
+  builder.Reserve(static_cast<int64_t>(table.num_rows() - notnull_indices.size()));
+  for (int64_t row_index = 0; row_index < table.num_rows(); ++row_index) {
+    if (notnull_indices.find(row_index) == notnull_indices.end()) {
+      builder.UnsafeAppend(static_cast<int32_t>(row_index));
+    }
+  }
+  RETURN_NOT_OK(builder.Finish(&indices));
+  return TakeTA(table, *indices, TakeOptions::Defaults(), ctx);
+}
+
+const FunctionDoc dropnull_doc(
+    "DropNull kernel",
+    ("The output is populated with values from the input without the null values"),
+    {"input"});
+
+class DropNullMetaFunction : public MetaFunction {
+ public:
+  DropNullMetaFunction() : MetaFunction("dropnull", Arity::Unary(), &dropnull_doc) {}
+
+  Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
+                            const FunctionOptions* options,
+                            ExecContext* ctx) const override {
+    switch (args[0].kind()) {
+      case Datum::ARRAY: {
+        const auto values = args[0].make_array();
+        ARROW_ASSIGN_OR_RAISE(auto indices,

Review comment:
       If `values.null_count` is known (and 0) can we skip some work here?

##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -2146,6 +2147,144 @@ class TakeMetaFunction : public MetaFunction {
   }
 };
 
+// ----------------------------------------------------------------------
+// DropNull Implementation
+
+Result<std::shared_ptr<arrow::Array>> GetNotNullIndices(
+    const std::shared_ptr<Array>& column, MemoryPool* memory_pool) {
+  std::shared_ptr<arrow::Array> indices;
+  arrow::NumericBuilder<arrow::Int32Type> builder(memory_pool);
+  builder.Reserve(column->length() - column->null_count());
+
+  std::vector<int32_t> values;
+  for (int64_t i = 0; i < column->length(); i++) {
+    if (column->IsValid(i)) {
+      builder.UnsafeAppend(static_cast<int32_t>(i));
+    }
+  }
+  RETURN_NOT_OK(builder.Finish(&indices));
+  return indices;
+}
+
+Result<std::shared_ptr<arrow::Array>> GetNotNullIndices(
+    const std::shared_ptr<ChunkedArray>& chunks, MemoryPool* memory_pool) {
+  std::shared_ptr<arrow::Array> indices;
+  arrow::NumericBuilder<arrow::Int32Type> builder(memory_pool);
+  builder.Reserve(chunks->length() - chunks->null_count());
+  int64_t relative_index = 0;
+  for (int64_t chunk_index = 0; chunk_index < chunks->num_chunks(); ++chunk_index) {
+    auto column_chunk = chunks->chunk(chunk_index);
+    for (int64_t col_index = 0; col_index < column_chunk->length(); col_index++) {
+      if (column_chunk->IsValid(col_index)) {
+        builder.UnsafeAppend(static_cast<int32_t>(relative_index + col_index));
+      }
+    }
+    relative_index += column_chunk->length();
+  }
+  RETURN_NOT_OK(builder.Finish(&indices));
+  return indices;
+}
+
+Result<std::shared_ptr<RecordBatch>> DropNullRecordBatch(const RecordBatch& batch,
+                                                         ExecContext* ctx) {
+  int64_t length_count = 0;
+  std::vector<std::shared_ptr<Array>> columns(batch.num_columns());
+  for (int i = 0; i < batch.num_columns(); ++i) {
+    ARROW_ASSIGN_OR_RAISE(auto indices,
+                          GetNotNullIndices(batch.column(i), ctx->memory_pool()));
+    ARROW_ASSIGN_OR_RAISE(Datum out, Take(batch.column(i)->data(), Datum(indices),
+                                          TakeOptions::NoBoundsCheck(), ctx));
+    columns[i] = out.make_array();
+    length_count += columns[i]->length();
+  }
+  return RecordBatch::Make(batch.schema(), length_count, std::move(columns));
+}
+
+Result<std::shared_ptr<Table>> DropNullTable(const Table& table, ExecContext* ctx) {
+  if (table.num_rows() == 0) {

Review comment:
       If this check makes sense here then should we have the same check for the record batch and array versions?

##########
File path: cpp/src/arrow/compute/kernels/vector_selection_test.cc
##########
@@ -1718,5 +1718,240 @@ TEST(TestTake, RandomFixedSizeBinary) {
   TakeRandomTest<FixedSizeBinaryType>::Test(fixed_size_binary(16));
 }
 
+// ----------------------------------------------------------------------
+// DropNull tests
+
+void AssertDropNullArrays(const std::shared_ptr<Array>& values,
+                          const std::shared_ptr<Array>& expected) {
+  ASSERT_OK_AND_ASSIGN(std::shared_ptr<Array> actual, DropNull(*values));
+  ValidateOutput(actual);
+  AssertArraysEqual(*expected, *actual, /*verbose=*/true);
+}
+
+Status DropNullJSON(const std::shared_ptr<DataType>& type, const std::string& values,
+                    std::shared_ptr<Array>* out) {
+  return DropNull(*ArrayFromJSON(type, values)).Value(out);
+}
+
+void CheckDropNull(const std::shared_ptr<DataType>& type, const std::string& values,
+                   const std::string& expected) {
+  std::shared_ptr<Array> actual;
+
+  ASSERT_OK(DropNullJSON(type, values, &actual));
+  ValidateOutput(actual);
+  AssertArraysEqual(*ArrayFromJSON(type, expected), *actual, /*verbose=*/true);
+}
+
+struct TestDropNullKernel : public ::testing::Test {
+  void TestNoValidityBitmapButUnknownNullCount(const std::shared_ptr<Array>& values) {
+    ASSERT_EQ(values->null_count(), 0);
+    auto expected = (*DropNull(values)).make_array();
+
+    auto new_values = MakeArray(values->data()->Copy());
+    new_values->data()->buffers[0].reset();
+    new_values->data()->null_count = kUnknownNullCount;
+    auto result = (*DropNull(new_values)).make_array();
+    AssertArraysEqual(*expected, *result);
+  }
+
+  void TestNoValidityBitmapButUnknownNullCount(const std::shared_ptr<DataType>& type,
+                                               const std::string& values) {
+    TestNoValidityBitmapButUnknownNullCount(ArrayFromJSON(type, values));
+  }
+};
+
+TEST_F(TestDropNullKernel, DropNull) {
+  CheckDropNull(null(), "[null, null, null]", "[]");
+  CheckDropNull(null(), "[null]", "[]");
+}
+
+TEST_F(TestDropNullKernel, DropNullBoolean) {
+  CheckDropNull(boolean(), "[true, false, true]", "[true, false, true]");
+  CheckDropNull(boolean(), "[null, false, true]", "[false, true]");
+
+  TestNoValidityBitmapButUnknownNullCount(boolean(), "[true, false, true]");
+}
+
+template <typename ArrowType>
+class TestDropNullKernelTyped : public TestDropNullKernel {};
+
+template <typename ArrowType>
+class TestDropNullKernelWithNumeric : public TestDropNullKernelTyped<ArrowType> {
+ protected:
+  void AssertDropNull(const std::string& values, const std::string& expected) {
+    CheckDropNull(type_singleton(), values, expected);
+  }
+
+  std::shared_ptr<DataType> type_singleton() {
+    return TypeTraits<ArrowType>::type_singleton();
+  }
+};
+
+TYPED_TEST_SUITE(TestDropNullKernelWithNumeric, NumericArrowTypes);

Review comment:
       Since the implementation of DropNull relies on the Take kernel and there aren't different versions of it for each data type I don't know how much we need to test drop null on all the different types.
   
   @nirandaperera / @bkietz I'd be interested in a second opinion here as I haven't spent a lot of time in kernels code.

##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -2146,6 +2147,144 @@ class TakeMetaFunction : public MetaFunction {
   }
 };
 
+// ----------------------------------------------------------------------
+// DropNull Implementation
+
+Result<std::shared_ptr<arrow::Array>> GetNotNullIndices(
+    const std::shared_ptr<Array>& column, MemoryPool* memory_pool) {
+  std::shared_ptr<arrow::Array> indices;
+  arrow::NumericBuilder<arrow::Int32Type> builder(memory_pool);

Review comment:
       If you had such a boolean array (and this may be future work) could it be more efficient to have a take that accepts a boolean array (of the same size as the input) and doesn't even construct the intermediate indices array?

##########
File path: python/pyarrow/tests/test_compute.py
##########
@@ -974,6 +974,51 @@ def test_take_null_type():
     assert len(table.take(indices).column(0)) == 4
 
 
+@pytest.mark.parametrize(('ty', 'values'), all_array_types)
+def test_dropnull(ty, values):
+    arr = pa.array(values, type=ty)
+    result = arr.dropnull()
+    result.validate()
+    indices = [i for i in range(len(arr)) if arr[i].is_valid]
+    expected = arr.take(pa.array(indices))
+    assert result.equals(expected)
+
+
+def test_dropnull_chunked_array():
+    arr = pa.chunked_array([["a", None], ["c", "d", None]])
+    expected_drop = pa.chunked_array([["a"], ["c", "d"]])
+    result = arr.dropnull()
+    assert result.equals(expected_drop)
+
+
+def test_dropnull_record_batch():
+    batch = pa.record_batch(
+        [pa.array(["a", None, "c", "d", None])], names=["a'"])
+
+    result = batch.dropnull()
+    expected = pa.record_batch([pa.array(["a", "c", "d"])], names=["a'"])
+    assert result.equals(expected)
+
+
+def test_dropnull_table():
+    table = pa.table([pa.array(["a", None, "c", "d", None])], names=["a"])

Review comment:
       Can you make some tests for tables with multiple columns?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org