You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/06/08 19:14:51 UTC

[GitHub] [arrow] lidavidm opened a new pull request #10484: ARROW-12891: [C++] Move subtree pruning to compute

lidavidm opened a new pull request #10484:
URL: https://github.com/apache/arrow/pull/10484


   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] bkietz closed pull request #10484: ARROW-12891: [C++] Move subtree pruning to compute

Posted by GitBox <gi...@apache.org>.
bkietz closed pull request #10484:
URL: https://github.com/apache/arrow/pull/10484


   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] bkietz commented on a change in pull request #10484: ARROW-12891: [C++] Move subtree pruning to compute

Posted by GitBox <gi...@apache.org>.
bkietz commented on a change in pull request #10484:
URL: https://github.com/apache/arrow/pull/10484#discussion_r663126955



##########
File path: cpp/src/arrow/dataset/file_base.cc
##########
@@ -266,20 +268,21 @@ void FileSystemDataset::SetupSubtreePruning() {
     }
   }
 
-  subtrees_->forest = Forest(static_cast<int>(encoded.size()), [&](int l, int r) {
-    if (encoded[l].fragment_index) {
-      // Fragment: not an ancestor.
-      return false;
-    }
+  subtrees_->forest =
+      compute::Forest(static_cast<int>(encoded.size()), [&](int l, int r) {
+        if (encoded[l].fragment_index) {

Review comment:
       Same: move into Subtree or Forest

##########
File path: cpp/src/arrow/compute/exec/subtree_internal.h
##########
@@ -0,0 +1,148 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <stdint.h>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "arrow/compute/exec/expression.h"
+#include "arrow/util/optional.h"
+
+namespace arrow {
+namespace compute {
+// Helper class for efficiently detecting subtrees given fragment partition expressions.
+// Partition expressions are broken into conjunction members and each member dictionary
+// encoded to impose a sortable ordering. In addition, subtrees are generated which span
+// groups of fragments and nested subtrees. After encoding each fragment is guaranteed to
+// be a descendant of at least one subtree. For example, given fragments in a
+// HivePartitioning with paths:
+//
+//   /num=0/al=eh/dat.par
+//   /num=0/al=be/dat.par
+//   /num=1/al=eh/dat.par
+//   /num=1/al=be/dat.par
+//
+// The following subtrees will be introduced:
+//
+//   /num=0/
+//   /num=0/al=eh/
+//   /num=0/al=eh/dat.par
+//   /num=0/al=be/
+//   /num=0/al=be/dat.par
+//   /num=1/
+//   /num=1/al=eh/
+//   /num=1/al=eh/dat.par
+//   /num=1/al=be/
+//   /num=1/al=be/dat.par
+struct SubtreeImpl {
+  // Each unique conjunction member is mapped to an integer.
+  using expression_code = char32_t;
+  // Partition expressions are mapped to strings of codes; strings give us lexicographic
+  // ordering (and potentially useful optimizations).
+  using expression_codes = std::basic_string<expression_code>;
+  // An encoded fragment (if fragment_index is set) or subtree.
+  struct Encoded {
+    util::optional<int> fragment_index;
+    expression_codes partition_expression;
+  };
+
+  std::unordered_map<compute::Expression, expression_code, compute::Expression::Hash>
+      expr_to_code_;
+  std::vector<compute::Expression> code_to_expr_;
+  std::unordered_set<expression_codes> subtree_exprs_;
+
+  // Encode a subexpression (returning the existing code if possible).
+  expression_code GetOrInsert(const compute::Expression& expr) {
+    auto next_code = static_cast<int>(expr_to_code_.size());
+    auto it_success = expr_to_code_.emplace(expr, next_code);
+
+    if (it_success.second) {
+      code_to_expr_.push_back(expr);
+    }
+    return it_success.first->second;
+  }
+
+  // Encode an expression (recursively breaking up conjunction members if possible).
+  void EncodeConjunctionMembers(const compute::Expression& expr,
+                                expression_codes* codes) {
+    if (auto call = expr.call()) {
+      if (call->function_name == "and_kleene") {
+        // expr is a conjunction, encode its arguments
+        EncodeConjunctionMembers(call->arguments[0], codes);
+        EncodeConjunctionMembers(call->arguments[1], codes);
+        return;
+      }
+    }
+    // expr is not a conjunction, encode it whole
+    codes->push_back(GetOrInsert(expr));
+  }
+
+  // Convert an encoded subtree or fragment back into an expression.
+  compute::Expression GetSubtreeExpression(const Encoded& encoded_subtree) {
+    // Filters will already be simplified by all of a subtree's ancestors, so
+    // we only need to simplify the filter by the trailing conjunction member
+    // of each subtree.
+    return code_to_expr_[encoded_subtree.partition_expression.back()];
+  }
+
+  // Insert subtrees for each component of an encoded partition expression.
+  void GenerateSubtrees(expression_codes partition_expression,
+                        std::vector<Encoded>* encoded) {
+    while (!partition_expression.empty()) {
+      if (subtree_exprs_.insert(partition_expression).second) {
+        Encoded encoded_subtree{/*fragment_index=*/util::nullopt, partition_expression};
+        encoded->push_back(std::move(encoded_subtree));
+      }
+      partition_expression.resize(partition_expression.size() - 1);
+    }
+  }
+
+  // Encode the fragment's partition expression and generate subtrees for it as well.
+  void EncodeOneExpression(int fragment_index, const Expression& partition_expression,
+                           std::vector<Encoded>* encoded) {
+    Encoded encoded_fragment{fragment_index, {}};
+
+    EncodeConjunctionMembers(partition_expression,
+                             &encoded_fragment.partition_expression);
+
+    GenerateSubtrees(encoded_fragment.partition_expression, encoded);
+
+    encoded->push_back(std::move(encoded_fragment));
+  }
+
+  template <typename Fragments>
+  std::vector<Encoded> EncodeFragments(const Fragments& fragments) {
+    std::vector<Encoded> encoded;
+    for (size_t i = 0; i < fragments.size(); ++i) {
+      EncodeOneExpression(static_cast<int>(i), fragments[i]->partition_expression(),
+                          &encoded);
+    }
+    return encoded;
+  }

Review comment:
       Since this isn't in dataset:: anymore, let's remove references to `Fragment` and make it generically about a set of guarantees
   ```suggestion
     template <typename GetGuarantee>
     std::vector<Encoded> EncodeGuarantee(const GetGuarantee& get, int count) {
       std::vector<Encoded> encoded;
       for (int i = 0; i < count; ++i) {
         EncodeOneGuarantee(i, get(i), &encoded);
       }
       return encoded;
     }
   ```

##########
File path: cpp/src/arrow/compute/exec/forest_internal.h
##########
@@ -21,15 +21,16 @@
 #include <utility>
 #include <vector>
 
-#include "arrow/dataset/visibility.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
 
 namespace arrow {
-namespace dataset {
+namespace compute {
 
 /// A Forest is a view of a sorted range which carries an ancestry relation in addition
 /// to an ordering relation: each element's descendants appear directly after it.
 /// This can be used to efficiently skip subtrees when iterating through the range.
-class ARROW_DS_EXPORT Forest {
+class Forest {

Review comment:
       ```suggestion
   class ARROW_EXPORT Forest {
   ```

##########
File path: cpp/src/arrow/dataset/file_base.cc
##########
@@ -243,20 +244,21 @@ std::string FileSystemDataset::ToString() const {
 
 void FileSystemDataset::SetupSubtreePruning() {
   subtrees_ = std::make_shared<FragmentSubtrees>();
-  SubtreeImpl impl;
+  compute::SubtreeImpl impl;
 
   auto encoded = impl.EncodeFragments(fragments_);
 
-  std::sort(encoded.begin(), encoded.end(),
-            [](const SubtreeImpl::Encoded& l, const SubtreeImpl::Encoded& r) {
-              const auto cmp = l.partition_expression.compare(r.partition_expression);
-              if (cmp != 0) {
-                return cmp < 0;
-              }
-              // Equal partition expressions; sort encodings with fragment indices after
-              // encodings without
-              return (l.fragment_index ? 1 : 0) < (r.fragment_index ? 1 : 0);
-            });
+  std::sort(
+      encoded.begin(), encoded.end(),
+      [](const compute::SubtreeImpl::Encoded& l, const compute::SubtreeImpl::Encoded& r) {

Review comment:
       Since this sort doesn't refer to anything in this scope it should move into SubtreeImpl. Otherwise other consumers of SubtreeImpl would need to reiterate the sort




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] github-actions[bot] commented on pull request #10484: ARROW-12891: [C++] Move subtree pruning to compute

Posted by GitBox <gi...@apache.org>.
github-actions[bot] commented on pull request #10484:
URL: https://github.com/apache/arrow/pull/10484#issuecomment-857031758


   https://issues.apache.org/jira/browse/ARROW-12891


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] lidavidm commented on a change in pull request #10484: ARROW-12891: [C++] Move subtree pruning to compute

Posted by GitBox <gi...@apache.org>.
lidavidm commented on a change in pull request #10484:
URL: https://github.com/apache/arrow/pull/10484#discussion_r663907887



##########
File path: cpp/src/arrow/compute/exec/forest_internal.h
##########
@@ -21,15 +21,16 @@
 #include <utility>
 #include <vector>
 
-#include "arrow/dataset/visibility.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
 
 namespace arrow {
-namespace dataset {
+namespace compute {
 
 /// A Forest is a view of a sorted range which carries an ancestry relation in addition
 /// to an ordering relation: each element's descendants appear directly after it.
 /// This can be used to efficiently skip subtrees when iterating through the range.
-class ARROW_DS_EXPORT Forest {
+class Forest {

Review comment:
       The MSVC2015 linker gets really unhappy with this:
   
   ```
   unity_0_cxx.cxx.obj : error LNK2019: unresolved external symbol "__declspec(dllimport) public: __cdecl arrow::compute::Forest::Forest(int,class std::function<bool __cdecl(int,int)>)" (__imp_??0Forest@compute@arrow@@QEAA@HV?$function@$$A6A_NHH@Z@std@@@Z) referenced in function "class arrow::compute::Forest __cdecl arrow::compute::MakeForest(class std::vector<struct arrow::compute::FileInfo,class std::allocator<struct arrow::compute::FileInfo> > *)" (?MakeForest@compute@arrow@@YA?AVForest@12@PEAV?$vector@UFileInfo@compute@arrow@@V?$allocator@UFileInfo@compute@arrow@@@std@@@std@@@Z)
   unity_0_cxx.cxx.obj : error LNK2019: unresolved external symbol "__declspec(dllimport) public: int __cdecl arrow::compute::Forest::size(void)const " (__imp_?size@Forest@compute@arrow@@QEBAHXZ) referenced in function "private: virtual void __cdecl arrow::compute::Forest_Visit_Test::TestBody(void)" (?TestBody@Forest_Visit_Test@compute@arrow@@EEAAXXZ)
   unity_0_cxx.cxx.obj : error LNK2019: unresolved external symbol "__declspec(dllimport) public: struct arrow::compute::Forest::Ref __cdecl arrow::compute::Forest::operator[](int)const " (__imp_??AForest@compute@arrow@@QEBA?AURef@012@H@Z) referenced in function "public: __cdecl arrow::compute::TestPathTree::TestPathTree(struct arrow::compute::Forest::Ref,class std::vector<struct arrow::compute::FileInfo,class std::allocator<struct arrow::compute::FileInfo> > const &)" (??0TestPathTree@compute@arrow@@QEAA@URef@Forest@12@AEBV?$vector@UFileInfo@compute@arrow@@V?$allocator@UFileInfo@compute@arrow@@@std@@@std@@@Z)
   unity_0_cxx.cxx.obj : error LNK2019: unresolved external symbol "__declspec(dllimport) public: __cdecl arrow::compute::Forest::~Forest(void)" (__imp_??1Forest@compute@arrow@@QEAA@XZ) referenced in function "void __cdecl arrow::compute::ExpectForestIs(class std::vector<struct arrow::compute::FileInfo,class std::allocator<struct arrow::compute::FileInfo> >,class std::vector<struct arrow::compute::TestPathTree,class std::allocator<struct arrow::compute::TestPathTree> >)" (?ExpectForestIs@compute@arrow@@YAXV?$vector@UFileInfo@compute@arrow@@V?$allocator@UFileInfo@compute@arrow@@@std@@@std@@V?$vector@UTestPathTree@compute@arrow@@V?$allocator@UTestPathTree@compute@arrow@@@std@@@4@@Z)
   release\arrow-compute-expression-test.exe : fatal error LNK1120: 4 unresolved externals
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] lidavidm commented on a change in pull request #10484: ARROW-12891: [C++] Move subtree pruning to compute

Posted by GitBox <gi...@apache.org>.
lidavidm commented on a change in pull request #10484:
URL: https://github.com/apache/arrow/pull/10484#discussion_r663907887



##########
File path: cpp/src/arrow/compute/exec/forest_internal.h
##########
@@ -21,15 +21,16 @@
 #include <utility>
 #include <vector>
 
-#include "arrow/dataset/visibility.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
 
 namespace arrow {
-namespace dataset {
+namespace compute {
 
 /// A Forest is a view of a sorted range which carries an ancestry relation in addition
 /// to an ordering relation: each element's descendants appear directly after it.
 /// This can be used to efficiently skip subtrees when iterating through the range.
-class ARROW_DS_EXPORT Forest {
+class Forest {

Review comment:
       The MSVC2015 linker gets really unhappy with this:
   
   ```
   unity_0_cxx.cxx.obj : error LNK2019: unresolved external symbol "__declspec(dllimport) public: __cdecl arrow::compute::Forest::Forest(int,class std::function<bool __cdecl(int,int)>)" (__imp_??0Forest@compute@arrow@@QEAA@HV?$function@$$A6A_NHH@Z@std@@@Z) referenced in function "class arrow::compute::Forest __cdecl arrow::compute::MakeForest(class std::vector<struct arrow::compute::FileInfo,class std::allocator<struct arrow::compute::FileInfo> > *)" (?MakeForest@compute@arrow@@YA?AVForest@12@PEAV?$vector@UFileInfo@compute@arrow@@V?$allocator@UFileInfo@compute@arrow@@@std@@@std@@@Z)
   unity_0_cxx.cxx.obj : error LNK2019: unresolved external symbol "__declspec(dllimport) public: int __cdecl arrow::compute::Forest::size(void)const " (__imp_?size@Forest@compute@arrow@@QEBAHXZ) referenced in function "private: virtual void __cdecl arrow::compute::Forest_Visit_Test::TestBody(void)" (?TestBody@Forest_Visit_Test@compute@arrow@@EEAAXXZ)
   unity_0_cxx.cxx.obj : error LNK2019: unresolved external symbol "__declspec(dllimport) public: struct arrow::compute::Forest::Ref __cdecl arrow::compute::Forest::operator[](int)const " (__imp_??AForest@compute@arrow@@QEBA?AURef@012@H@Z) referenced in function "public: __cdecl arrow::compute::TestPathTree::TestPathTree(struct arrow::compute::Forest::Ref,class std::vector<struct arrow::compute::FileInfo,class std::allocator<struct arrow::compute::FileInfo> > const &)" (??0TestPathTree@compute@arrow@@QEAA@URef@Forest@12@AEBV?$vector@UFileInfo@compute@arrow@@V?$allocator@UFileInfo@compute@arrow@@@std@@@std@@@Z)
   unity_0_cxx.cxx.obj : error LNK2019: unresolved external symbol "__declspec(dllimport) public: __cdecl arrow::compute::Forest::~Forest(void)" (__imp_??1Forest@compute@arrow@@QEAA@XZ) referenced in function "void __cdecl arrow::compute::ExpectForestIs(class std::vector<struct arrow::compute::FileInfo,class std::allocator<struct arrow::compute::FileInfo> >,class std::vector<struct arrow::compute::TestPathTree,class std::allocator<struct arrow::compute::TestPathTree> >)" (?ExpectForestIs@compute@arrow@@YAXV?$vector@UFileInfo@compute@arrow@@V?$allocator@UFileInfo@compute@arrow@@@std@@@std@@V?$vector@UTestPathTree@compute@arrow@@V?$allocator@UTestPathTree@compute@arrow@@@std@@@4@@Z)
   release\arrow-compute-expression-test.exe : fatal error LNK1120: 4 unresolved externals
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org