You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/03/24 08:17:41 UTC
[GitHub] [arrow] westonpace commented on a change in pull request #12664: ARROW-15658: [C++] Parquet pushdown filtering fails if the filter expression uses numeric field references

westonpace commented on a change in pull request #12664:
URL: https://github.com/apache/arrow/pull/12664#discussion_r834022864



##########
File path: cpp/examples/arrow/dataset_parquet_scan_example.cc
##########
@@ -39,6 +39,17 @@ namespace ds = arrow::dataset;
 
 namespace cp = arrow::compute;
 
+/**
+ * @brief Run Example
+ * ./dataset-parquet-scan-example file:///sell_data.parquet
+ * sample data set
+ *           pickup_at dropoff_at  total_amount
+ *         0         A          B            10
+ *         1         B          A          1200
+ *         2         C          A          2400
+ *         3         A          C           500
+ */
+

Review comment:
       Can you change the commenting here to be `//`?

##########
File path: cpp/src/arrow/dataset/file_parquet.cc
##########
@@ -160,18 +160,35 @@ Status ResolveOneFieldRef(
     const std::unordered_map<std::string, const SchemaField*>& field_lookup,
     const std::unordered_set<std::string>& duplicate_fields,
     std::vector<int>* columns_selection) {
-  if (const std::string* name = field_ref.name()) {
+  auto resolve_field_ref = [&](const std::string* name) -> Status {
     auto it = field_lookup.find(*name);
     if (it != field_lookup.end()) {
       AddColumnIndices(*it->second, columns_selection);
     } else if (duplicate_fields.find(*name) != duplicate_fields.end()) {
-      // We shouldn't generally get here because SetProjection will reject such references
+      // We shouldn't generally get here because SetProjection will reject such
+      // references
       return Status::Invalid("Ambiguous reference to column '", *name,
                              "' which occurs more than once");
     }
     // "Virtual" column: field is not in file but is in the ScanOptions.
     // Ignore it here, as projection will pad the batch with a null column.
     return Status::OK();
+  };
+
+  if (const std::string* name = field_ref.name()) {
+    RETURN_NOT_OK(resolve_field_ref(name));
+    return Status::OK();

Review comment:
       ```suggestion
       return resolve_field_ref(name);
   ```

##########
File path: cpp/src/arrow/dataset/file_parquet.cc
##########
@@ -160,18 +160,35 @@ Status ResolveOneFieldRef(
     const std::unordered_map<std::string, const SchemaField*>& field_lookup,
     const std::unordered_set<std::string>& duplicate_fields,
     std::vector<int>* columns_selection) {
-  if (const std::string* name = field_ref.name()) {
+  auto resolve_field_ref = [&](const std::string* name) -> Status {
     auto it = field_lookup.find(*name);
     if (it != field_lookup.end()) {
       AddColumnIndices(*it->second, columns_selection);
     } else if (duplicate_fields.find(*name) != duplicate_fields.end()) {
-      // We shouldn't generally get here because SetProjection will reject such references
+      // We shouldn't generally get here because SetProjection will reject such
+      // references
       return Status::Invalid("Ambiguous reference to column '", *name,
                              "' which occurs more than once");
     }
     // "Virtual" column: field is not in file but is in the ScanOptions.
     // Ignore it here, as projection will pad the batch with a null column.
     return Status::OK();
+  };
+
+  if (const std::string* name = field_ref.name()) {
+    RETURN_NOT_OK(resolve_field_ref(name));
+    return Status::OK();
+  } else if (const FieldPath* path = field_ref.field_path()) {
+    auto indices = path->indices();
+    if (indices.size() > 1) {
+      return Status::NotImplemented("Provided FieldRef ", field_ref.ToString(),
+                                    ", Nested FieldPaths are not supported!");
+    }
+    int index = indices[0];
+    auto schema_field = manifest.schema_fields.at(index);
+    auto col_name = schema_field.field->name();
+    RETURN_NOT_OK(resolve_field_ref(&col_name));
+    return Status::OK();

Review comment:
       ```suggestion
       return resolve_field_ref(&col_name);
   ```

##########
File path: cpp/examples/arrow/dataset_parquet_scan_example.cc
##########
@@ -62,8 +73,11 @@ struct Configuration {
 
   // Indicates the filter by which rows will be filtered. This optimization can
   // make use of partition information and/or file metadata if possible.
-  cp::Expression filter =
-      cp::greater(cp::field_ref("total_amount"), cp::literal(1000.0f));
+  // Equivalent filter with field_name instead of field_index
+  // cp::Expression filter = cp::greater(cp::field_ref("total_amount"),
+  // cp::literal(1000.0f));
+  // 0: pickup_at, 1: dropoff_at, 2: total_amount
+  cp::Expression filter = cp::greater(cp::field_ref(2), cp::literal(1000.0f));

Review comment:
       I think this comment makes the example a little more complex than needed.  Maybe we could do an AND expression `field_ref(2) > 1000 & field_ref("total_amount") < 2000` to demonstrate both types of field refs?  Or we could just leave it how it was and not worry about an example field ref by id.

##########
File path: cpp/src/arrow/dataset/file_parquet.cc
##########
@@ -160,18 +160,35 @@ Status ResolveOneFieldRef(
     const std::unordered_map<std::string, const SchemaField*>& field_lookup,
     const std::unordered_set<std::string>& duplicate_fields,
     std::vector<int>* columns_selection) {
-  if (const std::string* name = field_ref.name()) {
+  auto resolve_field_ref = [&](const std::string* name) -> Status {
     auto it = field_lookup.find(*name);
     if (it != field_lookup.end()) {
       AddColumnIndices(*it->second, columns_selection);
     } else if (duplicate_fields.find(*name) != duplicate_fields.end()) {
-      // We shouldn't generally get here because SetProjection will reject such references
+      // We shouldn't generally get here because SetProjection will reject such
+      // references
       return Status::Invalid("Ambiguous reference to column '", *name,
                              "' which occurs more than once");
     }
     // "Virtual" column: field is not in file but is in the ScanOptions.
     // Ignore it here, as projection will pad the batch with a null column.
     return Status::OK();
+  };
+
+  if (const std::string* name = field_ref.name()) {
+    RETURN_NOT_OK(resolve_field_ref(name));
+    return Status::OK();
+  } else if (const FieldPath* path = field_ref.field_path()) {
+    auto indices = path->indices();
+    if (indices.size() > 1) {
+      return Status::NotImplemented("Provided FieldRef ", field_ref.ToString(),
+                                    ", Nested FieldPaths are not supported!");
+    }
+    int index = indices[0];
+    auto schema_field = manifest.schema_fields.at(index);

Review comment:
       Is `manifest.schema_fields` a vector?  If so can we use `[index]` instead of `.at(index)` for stylistic consistency?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org