You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@iceberg.apache.org by GitBox <gi...@apache.org> on 2022/12/16 09:26:09 UTC

[GitHub] [iceberg] Fokko commented on a diff in pull request #6437: Python: Projection by Field ID

Fokko commented on code in PR #6437:
URL: https://github.com/apache/iceberg/pull/6437#discussion_r1050533828


##########
python/pyiceberg/expressions/visitors.py:
##########
@@ -753,3 +757,68 @@ def inclusive_projection(
     schema: Schema, spec: PartitionSpec, case_sensitive: bool = True
 ) -> Callable[[BooleanExpression], BooleanExpression]:
     return InclusiveProjection(schema, spec, case_sensitive).project
+
+
+class _ExpressionProjector(BooleanExpressionVisitor[BooleanExpression]):
+    """Rewrites a boolean expression by replacing unbound references with references to fields in a struct schema
+
+    Args:
+      table_schema (Schema): The schema of the table
+      file_schema (Schema): The schema of the file
+      case_sensitive (bool): Whether to consider case when binding a reference to a field in a schema, defaults to True
+
+    Raises:
+        TypeError: In the case a predicate is already bound
+    """
+
+    table_schema: Schema
+    file_schema: Schema
+    case_sensitive: bool
+
+    def __init__(self, table_schema: Schema, file_schema: Schema, case_sensitive: bool) -> None:
+        self.table_schema = table_schema
+        self.file_schema = file_schema
+        self.case_sensitive = case_sensitive
+
+    def visit_true(self) -> BooleanExpression:
+        return AlwaysTrue()
+
+    def visit_false(self) -> BooleanExpression:
+        return AlwaysFalse()
+
+    def visit_not(self, child_result: BooleanExpression) -> BooleanExpression:
+        return Not(child=child_result)
+
+    def visit_and(self, left_result: BooleanExpression, right_result: BooleanExpression) -> BooleanExpression:
+        return And(left=left_result, right=right_result)
+
+    def visit_or(self, left_result: BooleanExpression, right_result: BooleanExpression) -> BooleanExpression:
+        return Or(left=left_result, right=right_result)
+
+    def visit_unbound_predicate(self, predicate: UnboundPredicate[L]) -> BooleanExpression:
+        if not isinstance(predicate.term, Reference):
+            raise ValueError(f"Exprected reference: {predicate.term}")
+
+        field = self.table_schema.find_field(predicate.term.name, case_sensitive=self.case_sensitive)
+        file_column_name = self.file_schema.find_column_name(field.field_id)
+
+        if not file_column_name:
+            raise ValueError(f"Not found in schema: {file_column_name}")
+
+        if isinstance(predicate, UnaryPredicate):
+            return predicate.__class__(Reference(file_column_name))

Review Comment:
   Ah, I see. I think it would be nicer to convert it back to an unbound expression and then bind that to the file schema. Let me update the code.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@iceberg.apache.org
For additional commands, e-mail: issues-help@iceberg.apache.org