You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "ozankabak (via GitHub)" <gi...@apache.org> on 2023/05/03 02:55:54 UTC

[GitHub] [arrow-datafusion] ozankabak commented on a diff in pull request #6160: Add Support for Ordering Equivalence

ozankabak commented on code in PR #6160:
URL: https://github.com/apache/arrow-datafusion/pull/6160#discussion_r1183199984


##########
datafusion/core/tests/sqllogictests/test_files/window.slt:
##########
@@ -2204,3 +2204,163 @@ SELECT SUM(c12) OVER(ORDER BY c1, c2 GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING)
 2.994840293343 NULL
 9.674390599321 NULL
 7.728066219895 NULL
+
+# test_c9_rn_ordering_alias
+# These tests check whether Datafusion is aware of the ordering generated by the ROW_NUMBER() window function.
+# Physical plan shouldn't have a SortExec after the BoundedWindowAggExec since the table after BoundedWindowAggExec is already ordered by rn1 ASC and c9 DESC.
+query TT
+EXPLAIN SELECT c9, rn1 FROM (SELECT c9,
+                   ROW_NUMBER() OVER(ORDER BY c9 ASC) as rn1
+                   FROM aggregate_test_100
+                   ORDER BY c9 ASC)
+   ORDER BY rn1
+   LIMIT 5
+----
+logical_plan
+Limit: skip=0, fetch=5
+  Sort: rn1 ASC NULLS LAST, fetch=5
+    Sort: aggregate_test_100.c9 ASC NULLS LAST
+      Projection: aggregate_test_100.c9, ROW_NUMBER() ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS rn1
+        WindowAggr: windowExpr=[[ROW_NUMBER() ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+          TableScan: aggregate_test_100 projection=[c9]
+physical_plan
+GlobalLimitExec: skip=0, fetch=5
+  ProjectionExec: expr=[c9@0 as c9, ROW_NUMBER() ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as rn1]
+    BoundedWindowAggExec: wdw=[ROW_NUMBER(): Ok(Field { name: "ROW_NUMBER()", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow }], mode=[Sorted]
+      SortExec: expr=[c9@0 ASC NULLS LAST]
+        CsvExec: files={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, has_header=true, limit=None, projection=[c9]
+
+query II
+SELECT c9, rn1 FROM (SELECT c9,
+                   ROW_NUMBER() OVER(ORDER BY c9 ASC) as rn1
+                   FROM aggregate_test_100
+                   ORDER BY c9 ASC)
+   ORDER BY rn1
+   LIMIT 5
+----
+28774375 1
+63044568 2
+141047417 3
+141680161 4
+145294611 5
+
+# test_c9_rn_ordering_alias_opposite_direction
+# These tests check whether Datafusion is aware of the ordering generated by the ROW_NUMBER() window function.
+# Physical plan shouldn't have a SortExec after the BoundedWindowAggExec since the table after BoundedWindowAggExec is already ordered by rn1 ASC and c9 DESC.
+query TT
+EXPLAIN SELECT c9, rn1 FROM (SELECT c9,
+                   ROW_NUMBER() OVER(ORDER BY c9 DESC) as rn1
+                   FROM aggregate_test_100
+                   ORDER BY c9 DESC)
+   ORDER BY rn1
+   LIMIT 5
+----
+logical_plan
+Limit: skip=0, fetch=5
+  Sort: rn1 ASC NULLS LAST, fetch=5
+    Sort: aggregate_test_100.c9 DESC NULLS FIRST
+      Projection: aggregate_test_100.c9, ROW_NUMBER() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS rn1
+        WindowAggr: windowExpr=[[ROW_NUMBER() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+          TableScan: aggregate_test_100 projection=[c9]
+physical_plan
+GlobalLimitExec: skip=0, fetch=5
+  ProjectionExec: expr=[c9@0 as c9, ROW_NUMBER() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as rn1]
+    BoundedWindowAggExec: wdw=[ROW_NUMBER(): Ok(Field { name: "ROW_NUMBER()", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow }], mode=[Sorted]
+      SortExec: expr=[c9@0 DESC]
+        CsvExec: files={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, has_header=true, limit=None, projection=[c9]
+
+query II
+SELECT c9, rn1 FROM (SELECT c9,
+                   ROW_NUMBER() OVER(ORDER BY c9 DESC) as rn1
+                   FROM aggregate_test_100
+                   ORDER BY c9 DESC)
+   ORDER BY rn1
+   LIMIT 5
+----
+4268716378 1
+4229654142 2
+4216440507 3
+4144173353 4
+4076864659 5
+
+# test_c9_rn_ordering_alias_opposite_direction2
+# These tests check whether Datafusion is aware of the ordering generated by the ROW_NUMBER() window function.
+# Physical plan _should_ have a SortExec after BoundedWindowAggExec since the table after BoundedWindowAggExec is ordered by rn1 ASC and c9 DESC, which is conflicting with the requirement rn1 DESC.
+query TT
+EXPLAIN SELECT c9, rn1 FROM (SELECT c9,
+                   ROW_NUMBER() OVER(ORDER BY c9 DESC) as rn1
+                   FROM aggregate_test_100
+                   ORDER BY c9 DESC)
+   ORDER BY rn1 DESC
+   LIMIT 5
+----
+logical_plan
+Limit: skip=0, fetch=5
+  Sort: rn1 DESC NULLS FIRST, fetch=5
+    Sort: aggregate_test_100.c9 DESC NULLS FIRST
+      Projection: aggregate_test_100.c9, ROW_NUMBER() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS rn1
+        WindowAggr: windowExpr=[[ROW_NUMBER() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
+          TableScan: aggregate_test_100 projection=[c9]
+physical_plan
+GlobalLimitExec: skip=0, fetch=5
+  SortExec: fetch=5, expr=[rn1@1 DESC]
+    ProjectionExec: expr=[c9@0 as c9, ROW_NUMBER() ORDER BY [aggregate_test_100.c9 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@1 as rn1]
+      BoundedWindowAggExec: wdw=[ROW_NUMBER(): Ok(Field { name: "ROW_NUMBER()", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow }], mode=[Sorted]
+        SortExec: expr=[c9@0 DESC]
+          CsvExec: files={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, has_header=true, limit=None, projection=[c9]
+
+query II
+SELECT c9, rn1 FROM (SELECT c9,
+               ROW_NUMBER() OVER(ORDER BY c9 DESC) as rn1
+               FROM aggregate_test_100
+               ORDER BY c9 DESC)
+   ORDER BY rn1 DESC
+   LIMIT 5
+----
+28774375 100
+63044568 99
+141047417 98
+141680161 97
+145294611 96
+
+# test_c9_rn_ordering_alias_opposite_direction3
+# These test check for whether datafusion is aware of the ordering of the column generated by ROW_NUMBER() window function.
+# Physical plan should have a SortExec after BoundedWindowAggExec.
+# Since table after BoundedWindowAggExec is ordered by rn1 ASC, and c9 DESC. And it is conflicting with requirement rn1 ASC, c9 DESC
+# (possibly violates global ordering of the c9 column).

Review Comment:
   This comment is unclear -- the requirements you mention (`rn1 ASC, c9 DESC`) is the same with the post-window ordering you mention. Let's improve the comment to accurately reflect what is going on here.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org