Posted to github@arrow.apache.org by "alamb (via GitHub)" <gi...@apache.org> on 2023/02/13 12:13:03 UTC

[GitHub] [arrow-datafusion] alamb commented on a diff in pull request #5199: Minor: Begin porting some window tests to sqllogictests

alamb commented on code in PR #5199:
URL: https://github.com/apache/arrow-datafusion/pull/5199#discussion_r1097496707


##########
datafusion/core/tests/sqllogictests/test_files/window.slt:
##########
@@ -0,0 +1,397 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+statement ok
+CREATE EXTERNAL TABLE aggregate_test_100 (
+  c1  VARCHAR NOT NULL,
+  c2  TINYINT NOT NULL,
+  c3  SMALLINT NOT NULL,
+  c4  SMALLINT,
+  c5  INT,
+  c6  BIGINT NOT NULL,
+  c7  SMALLINT NOT NULL,
+  c8  INT NOT NULL,
+  c9  BIGINT UNSIGNED NOT NULL,
+  c10 VARCHAR NOT NULL,
+  c11 FLOAT NOT NULL,
+  c12 DOUBLE NOT NULL,
+  c13 VARCHAR NOT NULL
+)
+STORED AS CSV
+WITH HEADER ROW
+LOCATION '../../testing/data/csv/aggregate_test_100.csv'
+
+### This is the same table as
+### execute_with_partition with 4 partitions
+statement ok
+CREATE EXTERNAL TABLE test (c1 int, c2 bigint, c3 boolean)
+STORED AS CSV LOCATION 'tests/data/partitioned_csv';
+
+
+# for window functions without order by the first, last, and nth function call does not make sense
+# csv_query_window_with_empty_over
+query IIII
+select
+c9,
+count(c5) over () as count1,
+max(c5) over () as max1,
+min(c5) over () as min1
+from aggregate_test_100
+order by c9
+limit 5
+----
+28774375 100 2143473091 -2141999138
+63044568 100 2143473091 -2141999138
+141047417 100 2143473091 -2141999138
+141680161 100 2143473091 -2141999138
+145294611 100 2143473091 -2141999138
+
+# for window functions without order by the first, last, and nth function call does not make sense
+# csv_query_window_with_partition_by
+query IIIIII
+select
+c9,
+sum(cast(c4 as Int)) over (partition by c3) as sum1,
+avg(cast(c4 as Int)) over (partition by c3) as avg1,
+count(cast(c4 as Int)) over (partition by c3) as count1,
+max(cast(c4 as Int)) over (partition by c3) as max1,
+min(cast(c4 as Int)) over (partition by c3) as min1
+from aggregate_test_100
+order by c9
+limit 5
+----
+28774375 -16110 -16110 1 -16110 -16110
+63044568 3917 3917 1 3917 3917
+141047417 -38455 -19227.5 2 -16974 -21481
+141680161 -1114 -1114 1 -1114 -1114
+145294611 15673 15673 1 15673 15673
+
+
+
+# async fn csv_query_window_with_order_by
+query IIIIII
+select
+c9,
+sum(c5) over (order by c9) as sum1,
+avg(c5) over (order by c9) as avg1,
+count(c5) over (order by c9) as count1,
+max(c5) over (order by c9) as max1,
+min(c5) over (order by c9) as min1,
+first_value(c5) over (order by c9) as fv1,
+last_value(c5) over (order by c9) as lv1,
+nth_value(c5, 2) over (order by c9) as nv1
+from aggregate_test_100
+order by c9
+limit 5
+----
+28774375 61035129 61035129 1 61035129 61035129 61035129 61035129 NULL
+63044568 -47938237 -23969118.5 2 61035129 -108973366 61035129 -108973366 -108973366
+141047417 575165281 191721760.33333334 3 623103518 -108973366 61035129 623103518 -108973366
+141680161 -1352462829 -338115707.25 4 623103518 -1927628110 61035129 -1927628110 -108973366
+145294611 -3251637940 -650327588 5 623103518 -1927628110 61035129 -1899175111 -108973366
+
+# csv_query_window_with_partition_by_order_by
+query IIIIII
+select
+ c9,
+ sum(c5) over (partition by c4 order by c9) as sum1,
+ avg(c5) over (partition by c4 order by c9) as avg1,
+ count(c5) over (partition by c4 order by c9) as count1,
+ max(c5) over (partition by c4 order by c9) as max1,
+ min(c5) over (partition by c4 order by c9) as min1,
+ first_value(c5) over (partition by c4 order by c9) as fv1,
+ last_value(c5) over (partition by c4 order by c9) as lv1,
+ nth_value(c5, 2) over (partition by c4 order by c9) as nv1
+from aggregate_test_100
+order by c9
+limit 5
+----
+28774375 61035129 61035129 1 61035129 61035129 61035129 61035129 NULL
+63044568 -108973366 -108973366 1 -108973366 -108973366 -108973366 -108973366 NULL
+141047417 623103518 623103518 1 623103518 623103518 623103518 623103518 NULL
+141680161 -1927628110 -1927628110 1 -1927628110 -1927628110 -1927628110 -1927628110 NULL
+145294611 -1899175111 -1899175111 1 -1899175111 -1899175111 -1899175111 -1899175111 NULL
+
+# window()
+query IIIIII
+SELECT
+c1,
+c2,
+SUM(c2) OVER () as sum1,
+COUNT(c2) OVER () as count1,
+MAX(c2) OVER () as max1,
+MIN(c2) OVER () as min1,
+AVG(c2) OVER () as avg1
+FROM test
+ORDER BY c1, c2
+LIMIT 5
+----
+0 0 220 44 10 0 5
+0 1 220 44 10 0 5
+0 2 220 44 10 0 5
+0 3 220 44 10 0 5
+0 4 220 44 10 0 5
+
+
+# window_order_by
+query IIIIII
+SELECT
+c1,
+c2,
+ROW_NUMBER() OVER (ORDER BY c1, c2) as rn1,
+FIRST_VALUE(c2) OVER (ORDER BY c1, c2) as fv1,
+LAST_VALUE(c2) OVER (ORDER BY c1, c2) as lv1,
+NTH_VALUE(c2, 2) OVER (ORDER BY c1, c2) as nv1,
+SUM(c2) OVER (ORDER BY c1, c2) as sum1,
+COUNT(c2) OVER (ORDER BY c1, c2) as count1,
+MAX(c2) OVER (ORDER BY c1, c2) as max1,
+MIN(c2) OVER (ORDER BY c1, c2) as min1,
+AVG(c2) OVER (ORDER BY c1, c2) as avg1
+FROM test
+ORDER BY c1, c2
+LIMIT 5
+----
+0 0 1 0 0 NULL 0 1 0 0 0
+0 1 2 0 1 1 1 2 1 0 0.5
+0 2 3 0 2 1 3 3 2 0 1
+0 3 4 0 3 1 6 4 3 0 1.5
+0 4 5 0 4 1 10 5 4 0 2
+
+# window_partition_by
+query IIIIII
+SELECT
+c1,
+c2,
+SUM(c2) OVER (PARTITION BY c2) as sum1,
+COUNT(c2) OVER (PARTITION BY c2) as count1,
+MAX(c2) OVER (PARTITION BY c2) as max1,
+MIN(c2) OVER (PARTITION BY c2) as min1,
+AVG(c2) OVER (PARTITION BY c2) as avg1
+FROM test
+ORDER BY c1, c2
+LIMIT 5
+----
+0 0 0 4 0 0 0
+0 1 4 4 1 1 1
+0 2 8 4 2 2 2
+0 3 12 4 3 3 3
+0 4 16 4 4 4 4
+
+query IIIIIIIIII
+SELECT
+c1,
+c2,
+ROW_NUMBER() OVER (PARTITION BY c2 ORDER BY c1) as rn1,
+FIRST_VALUE(c2 + c1) OVER (PARTITION BY c2 ORDER BY c1) as fv1,
+LAST_VALUE(c2 + c1) OVER (PARTITION BY c2 ORDER BY c1) as lv1,
+NTH_VALUE(c2 + c1, 1) OVER (PARTITION BY c2 ORDER BY c1) as nv1,
+SUM(c2) OVER (PARTITION BY c2 ORDER BY c1) as sum1,
+COUNT(c2) OVER (PARTITION BY c2 ORDER BY c1) as count1,
+MAX(c2) OVER (PARTITION BY c2 ORDER BY c1) as max1,
+MIN(c2) OVER (PARTITION BY c2 ORDER BY c1) as min1,
+AVG(c2) OVER (PARTITION BY c2 ORDER BY c1) as avg1
+FROM test
+ORDER BY c1, c2
+LIMIT 5
+----
+0 0 1 0 0 0 0 1 0 0 0

Review Comment:
   The careful reader will note that the original test does not have any data for c2 = 0 (I think because it uses a slightly different test setup).
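   
   For illustration (not part of this PR), a quick aggregate query is an easy way to see which c2 values a fixture actually contains; the window() and window_partition_by results quoted above (44 rows total, MIN(c2) = 0, MAX(c2) = 10, and 4 rows per c2 value) suggest c2 covers 0 through 10 here:
   
   ```sql
   -- Illustrative check only; the expected output depends on the fixture data.
   SELECT c2, count(*) AS rows_per_value
   FROM test
   GROUP BY c2
   ORDER BY c2;
   ```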



##########
datafusion/core/tests/sqllogictests/test_files/window.slt:
##########
@@ -0,0 +1,400 @@
+# window()
+query IIIIII
+SELECT
+c1,
+c2,
+SUM(c2) OVER () as sum1,
+COUNT(c2) OVER () as count1,
+MAX(c2) OVER () as max1,
+MIN(c2) OVER () as min1,
+AVG(c2) OVER () as avg1
+FROM test
+ORDER BY c1, c2
+LIMIT 5
+----
+0 0 220 44 10 0 5
+0 1 220 44 10 0 5
+0 2 220 44 10 0 5
+0 3 220 44 10 0 5
+0 4 220 44 10 0 5

Review Comment:
   > Maybe they were parsed as a header.
   
   Yes, I think this is exactly what was happening -- that is a good observation.
   
   I think it is ok that the results changed, as the point of the test seems to be to cover a broad range of window function results rather than any specific window function or case (skipping the header row seems like an oversight in the original test rather than an important behavior to replicate).
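   
   For reference, a minimal sketch (hypothetical, not part of this PR) of how the header behaviour shows up in the slt DDL: the file above registers `test` without `WITH HEADER ROW`, so every line of the partition files is read as data, whereas a variant like the following would drop the first line of each file and roughly reproduce the original test's row counts:
   
   ```sql
   -- Hypothetical table name, not part of the PR. WITH HEADER ROW makes the CSV
   -- reader treat the first line of each file as a header and skip it.
   CREATE EXTERNAL TABLE test_with_header (c1 INT, c2 BIGINT, c3 BOOLEAN)
   STORED AS CSV WITH HEADER ROW
   LOCATION 'tests/data/partitioned_csv';
   ```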
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org