You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/11/21 17:31:29 UTC
[GitHub] [arrow] westonpace commented on a diff in pull request #14681: ARROW-18367: [C++] Enable using InMemoryDataset to create substrait plans

westonpace commented on code in PR #14681:
URL: https://github.com/apache/arrow/pull/14681#discussion_r1028307635


##########
cpp/src/arrow/engine/substrait/expression_internal.cc:
##########
@@ -19,16 +19,44 @@
 
 #include "arrow/engine/substrait/expression_internal.h"
 
+#include <stdint.h>
+#include <algorithm>
+#include <array>
+#include <cstring>
+#include <functional>
 #include <memory>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <type_traits>
 #include <utility>
-
-#include "arrow/builder.h"

Review Comment:
   What tool are you using to generate these includes?  I'm not sure about exploding `arrow/builder.h` but overall it seems like you caught quite a few IWYU violations which is handy but this is also introducing a lot of forward declarations which are already in `type_fwd.h` files.



##########
cpp/src/arrow/engine/substrait/expression_internal.cc:
##########
@@ -19,16 +19,44 @@
 
 #include "arrow/engine/substrait/expression_internal.h"
 
+#include <stdint.h>

Review Comment:
   ```suggestion
   #include <cstdint>
   ```



##########
cpp/src/arrow/engine/substrait/relation_internal.cc:
##########
@@ -689,7 +695,68 @@ Result<std::unique_ptr<substrait::ReadRel>> ScanRelationConverter(
     read_rel_lfs->mutable_items()->AddAllocated(read_rel_lfs_ffs.release());
   }
   read_rel->set_allocated_local_files(read_rel_lfs.release());
-  return std::move(read_rel);
+
+  return Status::OK();
+}
+
+Status SetReadRelationNamedTable(const dataset::InMemoryDataset* dataset,
+                                 std::unique_ptr<substrait::ReadRel>& read_rel) {
+  const auto& metadata = dataset->schema()->metadata();
+  if (metadata == nullptr || metadata->size() == 0) {
+    return Status::Invalid("Table name not found in dataset schema metadata");
+  }
+
+  ARROW_ASSIGN_OR_RAISE(auto table_name, metadata->Get(Table::kTableNameKey));
+  if (table_name.empty()) {
+    return Status::Invalid("Table name cannot be empty");
+  }
+
+  auto read_rel_tn = std::make_unique<substrait::ReadRel::NamedTable>();
+
+  const auto names = SplitString(table_name, '.');
+  for (auto& name : names) {
+    read_rel_tn->add_names(name.data(), name.size());
+  }
+  read_rel->set_allocated_named_table(read_rel_tn.release());
+
+  return Status::OK();
+}
+
+Result<std::unique_ptr<substrait::ReadRel>> ScanRelationConverter(
+    const std::shared_ptr<Schema>& schema, const compute::Declaration& declaration,
+    ExtensionSet* ext_set, const ConversionOptions& conversion_options) {
+  auto read_rel = std::make_unique<substrait::ReadRel>();
+  const auto& scan_node_options =
+      checked_cast<const dataset::ScanNodeOptions&>(*declaration.options);
+  auto* dataset = scan_node_options.dataset.get();
+
+  // set schema
+  ARROW_ASSIGN_OR_RAISE(auto named_struct,
+                        ToProto(*dataset->schema(), ext_set, conversion_options));
+  read_rel->set_allocated_base_schema(named_struct.release());
+
+  auto* file_system_dataset = dynamic_cast<dataset::FileSystemDataset*>(dataset);
+  if (file_system_dataset != nullptr) {
+    ARROW_RETURN_NOT_OK(SetReadRelationLocalFiles(file_system_dataset, read_rel));
+    return std::move(read_rel);
+  }
+
+  auto* in_memory_dataset = dynamic_cast<dataset::InMemoryDataset*>(dataset);
+  if (in_memory_dataset != nullptr) {
+    ARROW_ASSIGN_OR_RAISE(auto scanner_builder, in_memory_dataset->NewScan());
+    ARROW_ASSIGN_OR_RAISE(auto scanner, scanner_builder->Finish());
+    ARROW_ASSIGN_OR_RAISE(auto num_rows, scanner->CountRows());

Review Comment:
   We could maybe just add a `NumRows` method to `InMemoryDataset` to avoid having to create and run a scan which might save a bit of complexity.



##########
cpp/src/arrow/table.h:
##########
@@ -41,6 +41,8 @@ class MemoryPool;
 /// \brief Logical table as sequence of chunked arrays
 class ARROW_EXPORT Table {
  public:
+  static constexpr char const kTableNameKey[] = "tableName";
+

Review Comment:
   Given that this key is specific to Substrait use I'm not sure this is the right place for this.  In fact the "table" here is not an Arrow table but more of a database table (which is more similar to an arrow dataset).  Can you move this constant into the substrait package?



##########
cpp/src/arrow/engine/substrait/serde_test.cc:
##########
@@ -2221,6 +2252,67 @@ TEST(Substrait, BasicPlanRoundTrippingEndToEnd) {
   EXPECT_TRUE(expected_table->Equals(*rnd_trp_table));
 }
 
+TEST(Substrait, FilterNamedTable) {
+  compute::ExecContext exec_context;
+  arrow::dataset::internal::Initialize();
+
+  const std::vector<std::string> table_name{"table", "1"};
+  const std::unordered_map<std::string, std::string> metadata = {
+      {Table::kTableNameKey, arrow::internal::JoinStrings(table_name, ".")}};
+  const auto dummy_schema =
+      schema({field("A", int32()), field("B", int32()), field("C", int32())},
+             key_value_metadata(metadata));
+  auto dataset =
+      std::make_shared<dataset::InMemoryDataset>(dummy_schema, RecordBatchVector{});
+
+  auto scan_options = std::make_shared<dataset::ScanOptions>();
+  scan_options->projection = compute::project({}, {});
+  auto filter = compute::equal(compute::field_ref("A"), compute::field_ref("B"));
+
+  auto declarations = compute::Declaration::Sequence(
+      {compute::Declaration(
+           {"scan", dataset::ScanNodeOptions{dataset, scan_options}, "s"}),
+       compute::Declaration({"filter", compute::FilterNodeOptions{filter}, "f"})});
+
+  std::shared_ptr<ExtensionIdRegistry> sp_ext_id_reg = MakeExtensionIdRegistry();
+  ExtensionIdRegistry* ext_id_reg = sp_ext_id_reg.get();
+  ExtensionSet ext_set(ext_id_reg);

Review Comment:
   You probably don't need to create an `ExtensionIdRegistry` (it will just use the default if you do not supply one).



##########
cpp/src/arrow/engine/substrait/relation_internal.cc:
##########
@@ -689,7 +695,68 @@ Result<std::unique_ptr<substrait::ReadRel>> ScanRelationConverter(
     read_rel_lfs->mutable_items()->AddAllocated(read_rel_lfs_ffs.release());
   }
   read_rel->set_allocated_local_files(read_rel_lfs.release());
-  return std::move(read_rel);
+
+  return Status::OK();
+}
+
+Status SetReadRelationNamedTable(const dataset::InMemoryDataset* dataset,

Review Comment:
   ```suggestion
   Status SetReadRelationNamedTable(const dataset::InMemoryDataset& dataset,
   ```
   The style guide prefers const reference over const pointer:
   
   > Non-optional input parameters should usually be values or const references



##########
cpp/src/arrow/engine/substrait/type_internal.cc:
##########
@@ -471,8 +482,14 @@ void ToProtoGetDepthFirstNames(const FieldVector& fields,
 Result<std::unique_ptr<::substrait::NamedStruct>> ToProto(
     const Schema& schema, ExtensionSet* ext_set,
     const ConversionOptions& conversion_options) {
-  if (schema.metadata()) {
-    return Status::Invalid("::substrait::NamedStruct does not support schema metadata");
+  const auto& metadata = schema.metadata();
+  if (metadata) {
+    std::unordered_map<std::string, std::string> metadata_map{};
+    metadata->ToUnorderedMap(&metadata_map);
+    metadata_map.erase(Table::kTableNameKey);
+    if (!metadata_map.empty()) {
+      return Status::Invalid("::substrait::NamedStruct does not support schema metadata");
+    }

Review Comment:
   Maybe only check this if the conversion strictness is `EXACT_ROUNDTRIP`?  Otherwise it seems ok to drop/ignore the metadata.



##########
cpp/src/arrow/engine/substrait/serde.h:
##########
@@ -25,19 +25,34 @@
 #include <string_view>
 #include <vector>
 
-#include "arrow/compute/type_fwd.h"
-#include "arrow/dataset/type_fwd.h"
 #include "arrow/engine/substrait/options.h"
-#include "arrow/engine/substrait/type_fwd.h"
 #include "arrow/engine/substrait/visibility.h"
 #include "arrow/result.h"
 #include "arrow/status.h"
-#include "arrow/type_fwd.h"
 #include "arrow/util/macros.h"
 
 namespace arrow {
+
+class Buffer;
+class DataType;
+class Schema;
+
+namespace compute {
+class ExecPlan;
+class Expression;
+class SinkNodeConsumer;
+struct Declaration;
+}  // namespace compute
+
+namespace dataset {
+class WriteNodeOptions;
+}
+
 namespace engine {
 
+class ExtensionIdRegistry;
+class ExtensionSet;

Review Comment:
   I'm not sure this is better than using the `type_fwd.h` files.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org