Posted to commits@arrow.apache.org by al...@apache.org on 2023/04/24 15:15:35 UTC
[arrow-datafusion] branch main updated: refactor(sqllogictests): port group by test to sqllogic (#6088)
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 05762363f9 refactor(sqllogictests): port group by test to sqllogic (#6088)
05762363f9 is described below
commit 05762363f9ad76e2115d9ebf777aae6907da33f7
Author: Armin Primadi <ap...@gmail.com>
AuthorDate: Mon Apr 24 22:15:26 2023 +0700
refactor(sqllogictests): port group by test to sqllogic (#6088)
* refactor(sqllogictests): wip refactor group by test to sqllogic
* refactor(sqllogictest): porting more group by tests
* docs(core): revert accidental documentation styling update
* refactor(sqllogictest): cleanup group test already in sqllogictest
* refactor(sqllogictest): move group by time test
* refactor(sqllogictest): no longer need setup for group test
* refactor(sqllogictest): port more group by test
* refactor(sqllogictest): port more group by test
* style(sqllogictest): conform to cargo fmt
---
datafusion/core/tests/sql/group_by.rs | 745 ---------------------
.../core/tests/sqllogictests/test_files/group.slt | 389 +++++++++++
2 files changed, 389 insertions(+), 745 deletions(-)
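The port replaces each Rust test (SessionContext setup, execute_to_batches, and an assert_batches_sorted_eq! comparison against a hard-coded ASCII table) with a declarative sqllogictest block. For example, csv_query_group_with_aliased_aggregate becomes the following entry in the new group.slt (copied verbatim from the diff below): "query TI" declares a text column and an integer column, "rowsort" sorts result rows before comparison (playing the role of assert_batches_sorted_eq!), and the lines after "----" are the expected output.

query TI rowsort
SELECT c1, count(c12) AS count FROM aggregate_test_100 GROUP BY c1
----
a 21
b 19
c 21
d 18
e 21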
diff --git a/datafusion/core/tests/sql/group_by.rs b/datafusion/core/tests/sql/group_by.rs
index a92eaf0f4d..b4a92db3fc 100644
--- a/datafusion/core/tests/sql/group_by.rs
+++ b/datafusion/core/tests/sql/group_by.rs
@@ -17,633 +17,6 @@
use super::*;
-#[tokio::test]
-async fn csv_query_group_by_int_min_max() -> Result<()> {
- let ctx = SessionContext::new();
- register_aggregate_csv(&ctx).await?;
- let sql = "SELECT c2, MIN(c12), MAX(c12) FROM aggregate_test_100 GROUP BY c2";
- let actual = execute_to_batches(&ctx, sql).await;
- let expected = vec![
- "+----+-----------------------------+-----------------------------+",
- "| c2 | MIN(aggregate_test_100.c12) | MAX(aggregate_test_100.c12) |",
- "+----+-----------------------------+-----------------------------+",
- "| 1 | 0.05636955101974106 | 0.9965400387585364 |",
- "| 2 | 0.16301110515739792 | 0.991517828651004 |",
- "| 3 | 0.047343434291126085 | 0.9293883502480845 |",
- "| 4 | 0.02182578039211991 | 0.9237877978193884 |",
- "| 5 | 0.01479305307777301 | 0.9723580396501548 |",
- "+----+-----------------------------+-----------------------------+",
- ];
- assert_batches_sorted_eq!(expected, &actual);
- Ok(())
-}
-
-#[tokio::test]
-async fn csv_query_group_by_float32() -> Result<()> {
- let ctx = SessionContext::new();
- register_aggregate_simple_csv(&ctx).await?;
-
- let sql =
- "SELECT COUNT(*) as cnt, c1 FROM aggregate_simple GROUP BY c1 ORDER BY cnt DESC";
- let actual = execute_to_batches(&ctx, sql).await;
-
- let expected = vec![
- "+-----+---------+",
- "| cnt | c1 |",
- "+-----+---------+",
- "| 5 | 0.00005 |",
- "| 4 | 0.00004 |",
- "| 3 | 0.00003 |",
- "| 2 | 0.00002 |",
- "| 1 | 0.00001 |",
- "+-----+---------+",
- ];
- assert_batches_eq!(expected, &actual);
-
- Ok(())
-}
-
-#[tokio::test]
-async fn csv_query_group_by_float64() -> Result<()> {
- let ctx = SessionContext::new();
- register_aggregate_simple_csv(&ctx).await?;
-
- let sql =
- "SELECT COUNT(*) as cnt, c2 FROM aggregate_simple GROUP BY c2 ORDER BY cnt DESC";
- let actual = execute_to_batches(&ctx, sql).await;
-
- let expected = vec![
- "+-----+---------+",
- "| cnt | c2 |",
- "+-----+---------+",
- "| 5 | 5.0e-12 |",
- "| 4 | 4.0e-12 |",
- "| 3 | 3.0e-12 |",
- "| 2 | 2.0e-12 |",
- "| 1 | 1.0e-12 |",
- "+-----+---------+",
- ];
- assert_batches_eq!(expected, &actual);
-
- Ok(())
-}
-
-#[tokio::test]
-async fn csv_query_group_by_boolean() -> Result<()> {
- let ctx = SessionContext::new();
- register_aggregate_simple_csv(&ctx).await?;
-
- let sql =
- "SELECT COUNT(*) as cnt, c3 FROM aggregate_simple GROUP BY c3 ORDER BY cnt DESC";
- let actual = execute_to_batches(&ctx, sql).await;
-
- let expected = vec![
- "+-----+-------+",
- "| cnt | c3 |",
- "+-----+-------+",
- "| 9 | true |",
- "| 6 | false |",
- "+-----+-------+",
- ];
- assert_batches_eq!(expected, &actual);
-
- Ok(())
-}
-
-#[tokio::test]
-async fn csv_query_group_by_two_columns() -> Result<()> {
- let ctx = SessionContext::new();
- register_aggregate_csv(&ctx).await?;
- let sql = "SELECT c1, c2, MIN(c3) FROM aggregate_test_100 GROUP BY c1, c2";
- let actual = execute_to_batches(&ctx, sql).await;
- let expected = vec![
- "+----+----+----------------------------+",
- "| c1 | c2 | MIN(aggregate_test_100.c3) |",
- "+----+----+----------------------------+",
- "| a | 1 | -85 |",
- "| a | 2 | -48 |",
- "| a | 3 | -72 |",
- "| a | 4 | -101 |",
- "| a | 5 | -101 |",
- "| b | 1 | 12 |",
- "| b | 2 | -60 |",
- "| b | 3 | -101 |",
- "| b | 4 | -117 |",
- "| b | 5 | -82 |",
- "| c | 1 | -24 |",
- "| c | 2 | -117 |",
- "| c | 3 | -2 |",
- "| c | 4 | -90 |",
- "| c | 5 | -94 |",
- "| d | 1 | -99 |",
- "| d | 2 | 93 |",
- "| d | 3 | -76 |",
- "| d | 4 | 5 |",
- "| d | 5 | -59 |",
- "| e | 1 | 36 |",
- "| e | 2 | -61 |",
- "| e | 3 | -95 |",
- "| e | 4 | -56 |",
- "| e | 5 | -86 |",
- "+----+----+----------------------------+",
- ];
- assert_batches_sorted_eq!(expected, &actual);
- Ok(())
-}
-
-#[tokio::test]
-async fn csv_query_group_by_and_having() -> Result<()> {
- let ctx = SessionContext::new();
- register_aggregate_csv(&ctx).await?;
- let sql = "SELECT c1, MIN(c3) AS m FROM aggregate_test_100 GROUP BY c1 HAVING m < -100 AND MAX(c3) > 70";
- let actual = execute_to_batches(&ctx, sql).await;
- let expected = vec![
- "+----+------+",
- "| c1 | m |",
- "+----+------+",
- "| a | -101 |",
- "| c | -117 |",
- "+----+------+",
- ];
- assert_batches_sorted_eq!(expected, &actual);
- Ok(())
-}
-
-#[tokio::test]
-async fn csv_query_group_by_and_having_and_where() -> Result<()> {
- let ctx = SessionContext::new();
- register_aggregate_csv(&ctx).await?;
- let sql = "SELECT c1, MIN(c3) AS m
- FROM aggregate_test_100
- WHERE c1 IN ('a', 'b')
- GROUP BY c1
- HAVING m < -100 AND MAX(c3) > 70";
- let actual = execute_to_batches(&ctx, sql).await;
- let expected = vec![
- "+----+------+",
- "| c1 | m |",
- "+----+------+",
- "| a | -101 |",
- "+----+------+",
- ];
- assert_batches_eq!(expected, &actual);
- Ok(())
-}
-
-#[tokio::test]
-async fn csv_query_group_by_substr() -> Result<()> {
- let ctx = SessionContext::new();
- register_aggregate_csv(&ctx).await?;
- // there is an input column "c1" as well as a projection expression aliased as "c1"
- let sql = "SELECT substr(c1, 1, 1) c1 \
- FROM aggregate_test_100 \
- GROUP BY substr(c1, 1, 1) \
- ";
- let actual = execute_to_batches(&ctx, sql).await;
- #[rustfmt::skip]
- let expected = vec![
- "+----+",
- "| c1 |",
- "+----+",
- "| a |",
- "| b |",
- "| c |",
- "| d |",
- "| e |",
- "+----+",
- ];
- assert_batches_sorted_eq!(expected, &actual);
- Ok(())
-}
-
-#[tokio::test]
-async fn csv_query_group_by_avg() -> Result<()> {
- let ctx = SessionContext::new();
- register_aggregate_csv(&ctx).await?;
- let sql = "SELECT c1, avg(c12) FROM aggregate_test_100 GROUP BY c1";
- let actual = execute_to_batches(&ctx, sql).await;
- let expected = vec![
- "+----+-----------------------------+",
- "| c1 | AVG(aggregate_test_100.c12) |",
- "+----+-----------------------------+",
- "| a | 0.48754517466109415 |",
- "| b | 0.41040709263815384 |",
- "| c | 0.6600456536439784 |",
- "| d | 0.48855379387549824 |",
- "| e | 0.48600669271341534 |",
- "+----+-----------------------------+",
- ];
- assert_batches_sorted_eq!(expected, &actual);
- Ok(())
-}
-
-#[tokio::test]
-async fn csv_query_group_by_with_aliases() -> Result<()> {
- let ctx = SessionContext::new();
- register_aggregate_csv(&ctx).await?;
- let sql = "SELECT c1 AS c12, avg(c12) AS c1 FROM aggregate_test_100 GROUP BY c1";
- let actual = execute_to_batches(&ctx, sql).await;
- let expected = vec![
- "+-----+---------------------+",
- "| c12 | c1 |",
- "+-----+---------------------+",
- "| a | 0.48754517466109415 |",
- "| b | 0.41040709263815384 |",
- "| c | 0.6600456536439784 |",
- "| d | 0.48855379387549824 |",
- "| e | 0.48600669271341534 |",
- "+-----+---------------------+",
- ];
- assert_batches_sorted_eq!(expected, &actual);
- Ok(())
-}
-
-#[tokio::test]
-async fn csv_query_group_by_int_count() -> Result<()> {
- let ctx = SessionContext::new();
- register_aggregate_csv(&ctx).await?;
- let sql = "SELECT c1, count(c12) FROM aggregate_test_100 GROUP BY c1";
- let actual = execute_to_batches(&ctx, sql).await;
- let expected = vec![
- "+----+-------------------------------+",
- "| c1 | COUNT(aggregate_test_100.c12) |",
- "+----+-------------------------------+",
- "| a | 21 |",
- "| b | 19 |",
- "| c | 21 |",
- "| d | 18 |",
- "| e | 21 |",
- "+----+-------------------------------+",
- ];
- assert_batches_sorted_eq!(expected, &actual);
- Ok(())
-}
-
-#[tokio::test]
-async fn csv_query_group_with_aliased_aggregate() -> Result<()> {
- let ctx = SessionContext::new();
- register_aggregate_csv(&ctx).await?;
- let sql = "SELECT c1, count(c12) AS count FROM aggregate_test_100 GROUP BY c1";
- let actual = execute_to_batches(&ctx, sql).await;
- let expected = vec![
- "+----+-------+",
- "| c1 | count |",
- "+----+-------+",
- "| a | 21 |",
- "| b | 19 |",
- "| c | 21 |",
- "| d | 18 |",
- "| e | 21 |",
- "+----+-------+",
- ];
- assert_batches_sorted_eq!(expected, &actual);
- Ok(())
-}
-
-#[tokio::test]
-async fn csv_query_group_by_string_min_max() -> Result<()> {
- let ctx = SessionContext::new();
- register_aggregate_csv(&ctx).await?;
- let sql = "SELECT c1, MIN(c12), MAX(c12) FROM aggregate_test_100 GROUP BY c1";
- let actual = execute_to_batches(&ctx, sql).await;
- let expected = vec![
- "+----+-----------------------------+-----------------------------+",
- "| c1 | MIN(aggregate_test_100.c12) | MAX(aggregate_test_100.c12) |",
- "+----+-----------------------------+-----------------------------+",
- "| a | 0.02182578039211991 | 0.9800193410444061 |",
- "| b | 0.04893135681998029 | 0.9185813970744787 |",
- "| c | 0.0494924465469434 | 0.991517828651004 |",
- "| d | 0.061029375346466685 | 0.9748360509016578 |",
- "| e | 0.01479305307777301 | 0.9965400387585364 |",
- "+----+-----------------------------+-----------------------------+",
- ];
- assert_batches_sorted_eq!(expected, &actual);
- Ok(())
-}
-
-#[tokio::test]
-async fn query_group_on_null() -> Result<()> {
- let schema = Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, true)]));
-
- let data = RecordBatch::try_new(
- schema.clone(),
- vec![Arc::new(Int32Array::from(vec![
- Some(0),
- Some(3),
- None,
- Some(1),
- Some(3),
- ]))],
- )?;
-
- let ctx = SessionContext::new();
- ctx.register_batch("test", data)?;
- let sql = "SELECT COUNT(*), c1 FROM test GROUP BY c1";
-
- let actual = execute_to_batches(&ctx, sql).await;
-
- // Note that the results also
- // include a row for NULL (c1=NULL, count = 1)
- let expected = vec![
- "+-----------------+----+",
- "| COUNT(UInt8(1)) | c1 |",
- "+-----------------+----+",
- "| 1 | |",
- "| 1 | 0 |",
- "| 1 | 1 |",
- "| 2 | 3 |",
- "+-----------------+----+",
- ];
- assert_batches_sorted_eq!(expected, &actual);
- Ok(())
-}
-
-#[tokio::test]
-async fn query_group_on_null_multi_col() -> Result<()> {
- let schema = Arc::new(Schema::new(vec![
- Field::new("c1", DataType::Int32, true),
- Field::new("c2", DataType::Utf8, true),
- ]));
-
- let data = RecordBatch::try_new(
- schema.clone(),
- vec![
- Arc::new(Int32Array::from(vec![
- Some(0),
- Some(0),
- Some(3),
- None,
- None,
- Some(3),
- Some(0),
- None,
- Some(3),
- ])),
- Arc::new(StringArray::from(vec![
- None,
- None,
- Some("foo"),
- None,
- Some("bar"),
- Some("foo"),
- None,
- Some("bar"),
- Some("foo"),
- ])),
- ],
- )?;
-
- let ctx = SessionContext::new();
- ctx.register_batch("test", data)?;
- let sql = "SELECT COUNT(*), c1, c2 FROM test GROUP BY c1, c2";
-
- let actual = execute_to_batches(&ctx, sql).await;
-
- // Note that the results also include rows where the
- // group keys are NULL (e.g. c1=NULL, c2=NULL, count = 1)
- let expected = vec![
- "+-----------------+----+-----+",
- "| COUNT(UInt8(1)) | c1 | c2 |",
- "+-----------------+----+-----+",
- "| 1 | | |",
- "| 2 | | bar |",
- "| 3 | 0 | |",
- "| 3 | 3 | foo |",
- "+-----------------+----+-----+",
- ];
- assert_batches_sorted_eq!(expected, &actual);
-
- // Also run query with group columns reversed (results should be the same)
- let sql = "SELECT COUNT(*), c1, c2 FROM test GROUP BY c2, c1";
- let actual = execute_to_batches(&ctx, sql).await;
- assert_batches_sorted_eq!(expected, &actual);
- Ok(())
-}
-
-#[tokio::test]
-async fn csv_group_by_date() -> Result<()> {
- let ctx = SessionContext::new();
- let schema = Arc::new(Schema::new(vec![
- Field::new("date", DataType::Date32, false),
- Field::new("cnt", DataType::Int32, false),
- ]));
- let data = RecordBatch::try_new(
- schema.clone(),
- vec![
- Arc::new(Date32Array::from(vec![
- Some(100),
- Some(100),
- Some(100),
- Some(101),
- Some(101),
- Some(101),
- ])),
- Arc::new(Int32Array::from(vec![
- Some(1),
- Some(2),
- Some(3),
- Some(3),
- Some(3),
- Some(3),
- ])),
- ],
- )?;
-
- ctx.register_batch("dates", data)?;
- let sql = "SELECT SUM(cnt) FROM dates GROUP BY date";
- let actual = execute_to_batches(&ctx, sql).await;
- let expected = vec![
- "+----------------+",
- "| SUM(dates.cnt) |",
- "+----------------+",
- "| 6 |",
- "| 9 |",
- "+----------------+",
- ];
- assert_batches_sorted_eq!(expected, &actual);
- Ok(())
-}
-
-#[tokio::test]
-async fn csv_group_by_time32second() -> Result<()> {
- let ctx = SessionContext::new();
- let schema = Arc::new(Schema::new(vec![
- Field::new("time", DataType::Time32(TimeUnit::Second), false),
- Field::new("cnt", DataType::Int32, false),
- ]));
- let data = RecordBatch::try_new(
- schema.clone(),
- vec![
- Arc::new(Time32SecondArray::from(vec![
- Some(5_000),
- Some(5_000),
- Some(5_500),
- Some(5_500),
- Some(5_900),
- Some(5_900),
- ])),
- Arc::new(Int32Array::from(vec![
- Some(1),
- Some(1),
- Some(1),
- Some(2),
- Some(1),
- Some(3),
- ])),
- ],
- )?;
-
- ctx.register_batch("times", data)?;
- let sql = "SELECT SUM(cnt) FROM times GROUP BY time";
- let actual = execute_to_batches(&ctx, sql).await;
- let expected = vec![
- "+----------------+",
- "| SUM(times.cnt) |",
- "+----------------+",
- "| 2 |",
- "| 3 |",
- "| 4 |",
- "+----------------+",
- ];
- assert_batches_sorted_eq!(expected, &actual);
- Ok(())
-}
-
-#[tokio::test]
-async fn csv_group_by_time32millisecond() -> Result<()> {
- let ctx = SessionContext::new();
- let schema = Arc::new(Schema::new(vec![
- Field::new("time", DataType::Time32(TimeUnit::Millisecond), false),
- Field::new("cnt", DataType::Int32, false),
- ]));
- let data = RecordBatch::try_new(
- schema.clone(),
- vec![
- Arc::new(Time32MillisecondArray::from(vec![
- Some(5_000_000),
- Some(5_000_000),
- Some(5_500_000),
- Some(5_500_000),
- Some(5_900_000),
- Some(5_900_000),
- ])),
- Arc::new(Int32Array::from(vec![
- Some(1),
- Some(1),
- Some(1),
- Some(2),
- Some(1),
- Some(3),
- ])),
- ],
- )?;
-
- ctx.register_batch("times", data)?;
- let sql = "SELECT SUM(cnt) FROM times GROUP BY time";
- let actual = execute_to_batches(&ctx, sql).await;
- let expected = vec![
- "+----------------+",
- "| SUM(times.cnt) |",
- "+----------------+",
- "| 2 |",
- "| 3 |",
- "| 4 |",
- "+----------------+",
- ];
- assert_batches_sorted_eq!(expected, &actual);
- Ok(())
-}
-
-#[tokio::test]
-async fn csv_group_by_time64microsecond() -> Result<()> {
- let ctx = SessionContext::new();
- let schema = Arc::new(Schema::new(vec![
- Field::new("time", DataType::Time64(TimeUnit::Microsecond), false),
- Field::new("cnt", DataType::Int64, false),
- ]));
- let data = RecordBatch::try_new(
- schema.clone(),
- vec![
- Arc::new(Time64MicrosecondArray::from(vec![
- Some(5_000_000_000),
- Some(5_000_000_000),
- Some(5_500_000_000),
- Some(5_500_000_000),
- Some(5_900_000_000),
- Some(5_900_000_000),
- ])),
- Arc::new(Int64Array::from(vec![
- Some(1),
- Some(1),
- Some(1),
- Some(2),
- Some(1),
- Some(3),
- ])),
- ],
- )?;
-
- ctx.register_batch("times", data)?;
- let sql = "SELECT SUM(cnt) FROM times GROUP BY time";
- let actual = execute_to_batches(&ctx, sql).await;
- let expected = vec![
- "+----------------+",
- "| SUM(times.cnt) |",
- "+----------------+",
- "| 2 |",
- "| 3 |",
- "| 4 |",
- "+----------------+",
- ];
- assert_batches_sorted_eq!(expected, &actual);
- Ok(())
-}
-
-#[tokio::test]
-async fn csv_group_by_time64nanosecond() -> Result<()> {
- let ctx = SessionContext::new();
- let schema = Arc::new(Schema::new(vec![
- Field::new("time", DataType::Time64(TimeUnit::Nanosecond), false),
- Field::new("cnt", DataType::Int64, false),
- ]));
- let data = RecordBatch::try_new(
- schema.clone(),
- vec![
- Arc::new(Time64NanosecondArray::from(vec![
- Some(5_000_000_000_000),
- Some(5_000_000_000_000),
- Some(5_500_000_000_000),
- Some(5_500_000_000_000),
- Some(5_900_000_000_000),
- Some(5_900_000_000_000),
- ])),
- Arc::new(Int64Array::from(vec![
- Some(1),
- Some(1),
- Some(1),
- Some(2),
- Some(1),
- Some(3),
- ])),
- ],
- )?;
-
- ctx.register_batch("times", data)?;
- let sql = "SELECT SUM(cnt) FROM times GROUP BY time";
- let actual = execute_to_batches(&ctx, sql).await;
- let expected = vec![
- "+----------------+",
- "| SUM(times.cnt) |",
- "+----------------+",
- "| 2 |",
- "| 3 |",
- "| 4 |",
- "+----------------+",
- ];
- assert_batches_sorted_eq!(expected, &actual);
- Ok(())
-}
-
#[tokio::test]
async fn group_by_date_trunc() -> Result<()> {
let tmp_dir = TempDir::new()?;
@@ -695,52 +68,6 @@ async fn group_by_date_trunc() -> Result<()> {
Ok(())
}
-#[tokio::test]
-async fn group_by_largeutf8() {
- let ctx = SessionContext::new();
-
- // input data looks like:
- // A, 1
- // B, 2
- // A, 2
- // A, 4
- // C, 1
- // A, 1
-
- let str_array: LargeStringArray = vec!["A", "B", "A", "A", "C", "A"]
- .into_iter()
- .map(Some)
- .collect();
- let str_array = Arc::new(str_array);
-
- let val_array: Int64Array = vec![1, 2, 2, 4, 1, 1].into();
- let val_array = Arc::new(val_array);
-
- let schema = Arc::new(Schema::new(vec![
- Field::new("str", str_array.data_type().clone(), false),
- Field::new("val", val_array.data_type().clone(), false),
- ]));
-
- let batch = RecordBatch::try_new(schema.clone(), vec![str_array, val_array]).unwrap();
-
- ctx.register_batch("t", batch).unwrap();
-
- let results = plan_and_collect(&ctx, "SELECT str, count(val) FROM t GROUP BY str")
- .await
- .expect("ran plan correctly");
-
- let expected = vec![
- "+-----+--------------+",
- "| str | COUNT(t.val) |",
- "+-----+--------------+",
- "| A | 4 |",
- "| B | 1 |",
- "| C | 1 |",
- "+-----+--------------+",
- ];
- assert_batches_sorted_eq!(expected, &results);
-}
-
#[tokio::test]
async fn group_by_dictionary() {
async fn run_test_case<K: ArrowDictionaryKeyType>() {
@@ -833,75 +160,3 @@ async fn group_by_dictionary() {
run_test_case::<UInt32Type>().await;
run_test_case::<UInt64Type>().await;
}
-
-#[tokio::test]
-async fn csv_query_group_by_order_by_substr() -> Result<()> {
- let ctx = SessionContext::new();
- register_aggregate_csv(&ctx).await?;
- let sql = "SELECT substr(c1, 1, 1), avg(c12) \
- FROM aggregate_test_100 \
- GROUP BY substr(c1, 1, 1) \
- ORDER BY substr(c1, 1, 1)";
- let actual = execute_to_batches(&ctx, sql).await;
- let expected = vec![
- "+-------------------------------------------------+-----------------------------+",
- "| substr(aggregate_test_100.c1,Int64(1),Int64(1)) | AVG(aggregate_test_100.c12) |",
- "+-------------------------------------------------+-----------------------------+",
- "| a | 0.48754517466109415 |",
- "| b | 0.41040709263815384 |",
- "| c | 0.6600456536439784 |",
- "| d | 0.48855379387549824 |",
- "| e | 0.48600669271341534 |",
- "+-------------------------------------------------+-----------------------------+",
- ];
- assert_batches_sorted_eq!(expected, &actual);
- Ok(())
-}
-
-#[tokio::test]
-async fn csv_query_group_by_order_by_substr_aliased_projection() -> Result<()> {
- let ctx = SessionContext::new();
- register_aggregate_csv(&ctx).await?;
- let sql = "SELECT substr(c1, 1, 1) as name, avg(c12) as average \
- FROM aggregate_test_100 \
- GROUP BY substr(c1, 1, 1) \
- ORDER BY substr(c1, 1, 1)";
- let actual = execute_to_batches(&ctx, sql).await;
- let expected = vec![
- "+------+---------------------+",
- "| name | average |",
- "+------+---------------------+",
- "| a | 0.48754517466109415 |",
- "| b | 0.41040709263815384 |",
- "| c | 0.6600456536439784 |",
- "| d | 0.48855379387549824 |",
- "| e | 0.48600669271341534 |",
- "+------+---------------------+",
- ];
- assert_batches_sorted_eq!(expected, &actual);
- Ok(())
-}
-
-#[tokio::test]
-async fn csv_query_group_by_order_by_avg_group_by_substr() -> Result<()> {
- let ctx = SessionContext::new();
- register_aggregate_csv(&ctx).await?;
- let sql = "SELECT substr(c1, 1, 1) as name, avg(c12) as average \
- FROM aggregate_test_100 \
- GROUP BY substr(c1, 1, 1) \
- ORDER BY avg(c12)";
- let actual = execute_to_batches(&ctx, sql).await;
- let expected = vec![
- "+------+---------------------+",
- "| name | average |",
- "+------+---------------------+",
- "| b | 0.41040709263815384 |",
- "| e | 0.48600669271341534 |",
- "| a | 0.48754517466109415 |",
- "| d | 0.48855379387549824 |",
- "| c | 0.6600456536439784 |",
- "+------+---------------------+",
- ];
- assert_batches_sorted_eq!(expected, &actual);
- Ok(())
-}
diff --git a/datafusion/core/tests/sqllogictests/test_files/group.slt b/datafusion/core/tests/sqllogictests/test_files/group.slt
new file mode 100644
index 0000000000..a56451d7aa
--- /dev/null
+++ b/datafusion/core/tests/sqllogictests/test_files/group.slt
@@ -0,0 +1,389 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+statement ok
+CREATE EXTERNAL TABLE aggregate_test_100 (
+ c1 VARCHAR NOT NULL,
+ c2 TINYINT NOT NULL,
+ c3 SMALLINT NOT NULL,
+ c4 SMALLINT,
+ c5 INT,
+ c6 BIGINT NOT NULL,
+ c7 SMALLINT NOT NULL,
+ c8 INT NOT NULL,
+ c9 BIGINT UNSIGNED NOT NULL,
+ c10 VARCHAR NOT NULL,
+ c11 FLOAT NOT NULL,
+ c12 DOUBLE NOT NULL,
+ c13 VARCHAR NOT NULL
+)
+STORED AS CSV
+WITH HEADER ROW
+LOCATION '../../testing/data/csv/aggregate_test_100.csv'
+
+statement ok
+CREATE external table aggregate_simple(c1 real, c2 double, c3 boolean) STORED as CSV WITH HEADER ROW LOCATION 'tests/data/aggregate_simple.csv';
+
+
+# csv_query_group_by_int_min_max
+query IRR rowsort
+SELECT c2, MIN(c12), MAX(c12) FROM aggregate_test_100 GROUP BY c2
+----
+1 0.05636955102 0.996540038759
+2 0.163011105157 0.991517828651
+3 0.047343434291 0.929388350248
+4 0.021825780392 0.923787797819
+5 0.014793053078 0.97235803965
+
+# csv_query_group_by_float32
+query IR
+SELECT COUNT(*) as cnt, c1 FROM aggregate_simple GROUP BY c1 ORDER BY cnt DESC
+----
+5 0.00005
+4 0.00004
+3 0.00003
+2 0.00002
+1 0.00001
+
+# csv_query_group_by_float64
+query IR
+SELECT COUNT(*) as cnt, c2 FROM aggregate_simple GROUP BY c2 ORDER BY cnt DESC
+----
+5 0.000000000005
+4 0.000000000004
+3 0.000000000003
+2 0.000000000002
+1 0.000000000001
+
+# csv_query_group_by_boolean
+query IB
+SELECT COUNT(*) as cnt, c3 FROM aggregate_simple GROUP BY c3 ORDER BY cnt DESC
+----
+9 true
+6 false
+
+# csv_query_group_by_two_columns
+query TII rowsort
+SELECT c1, c2, MIN(c3) FROM aggregate_test_100 GROUP BY c1, c2
+----
+a 1 -85
+a 2 -48
+a 3 -72
+a 4 -101
+a 5 -101
+b 1 12
+b 2 -60
+b 3 -101
+b 4 -117
+b 5 -82
+c 1 -24
+c 2 -117
+c 3 -2
+c 4 -90
+c 5 -94
+d 1 -99
+d 2 93
+d 3 -76
+d 4 5
+d 5 -59
+e 1 36
+e 2 -61
+e 3 -95
+e 4 -56
+e 5 -86
+
+# csv_query_group_by_and_having
+query TI rowsort
+SELECT c1, MIN(c3) AS m FROM aggregate_test_100 GROUP BY c1 HAVING m < -100 AND MAX(c3) > 70
+----
+a -101
+c -117
+
+# csv_query_group_by_and_having_and_where
+query TI
+SELECT c1, MIN(c3) AS m
+FROM aggregate_test_100
+WHERE c1 IN ('a', 'b')
+GROUP BY c1
+HAVING m < -100 AND MAX(c3) > 70
+----
+a -101
+
+# csv_query_group_by_substr
+query T rowsort
+SELECT substr(c1, 1, 1) c1 FROM aggregate_test_100 GROUP BY substr(c1, 1, 1)
+----
+a
+b
+c
+d
+e
+
+# csv_query_group_by_avg
+query TR rowsort
+SELECT c1, avg(c12) FROM aggregate_test_100 GROUP BY c1
+----
+a 0.487545174661
+b 0.410407092638
+c 0.660045653644
+d 0.488553793875
+e 0.486006692713
+
+# csv_query_group_by_with_aliases
+query TR rowsort
+SELECT c1 AS c12, avg(c12) AS c1 FROM aggregate_test_100 GROUP BY c1
+----
+a 0.487545174661
+b 0.410407092638
+c 0.660045653644
+d 0.488553793875
+e 0.486006692713
+
+# csv_query_group_by_int_count
+query TI rowsort
+SELECT c1, count(c12) FROM aggregate_test_100 GROUP BY c1
+----
+a 21
+b 19
+c 21
+d 18
+e 21
+
+# csv_query_group_with_aliased_aggregate
+query TI rowsort
+SELECT c1, count(c12) AS count FROM aggregate_test_100 GROUP BY c1
+----
+a 21
+b 19
+c 21
+d 18
+e 21
+
+# csv_query_group_by_string_min_max
+query TRR rowsort
+SELECT c1, MIN(c12), MAX(c12) FROM aggregate_test_100 GROUP BY c1
+----
+a 0.021825780392 0.980019341044
+b 0.04893135682 0.918581397074
+c 0.049492446547 0.991517828651
+d 0.061029375346 0.974836050902
+e 0.014793053078 0.996540038759
+
+
+# Create a table containing null values
+
+statement ok
+create table null_data(c1 int) as values
+ (0),
+ (3),
+ (NULL),
+ (1),
+ (3);
+
+# query_group_on_null
+query II rowsort
+SELECT COUNT(*), c1 FROM null_data GROUP BY c1
+----
+1 0
+1 1
+1 NULL
+2 3
+
+
+# Create a table containing multiple column with null values
+
+statement ok
+create table multi_null_data(c1 int, c2 string) as values
+ (0, NULL),
+ (0, NULL),
+ (3, 'foo'),
+ (NULL, NULL),
+ (NULL, 'bar'),
+ (3, 'foo'),
+ (0, NULL),
+ (NULL, 'bar'),
+ (3, 'foo');
+
+# query_group_on_null_multi_col
+
+query IIT rowsort
+SELECT COUNT(*), c1, c2 FROM multi_null_data GROUP BY c1, c2
+----
+1 NULL NULL
+2 NULL bar
+3 0 NULL
+3 3 foo
+
+query IIT rowsort
+SELECT COUNT(*), c1, c2 FROM multi_null_data GROUP BY c2, c1
+----
+1 NULL NULL
+2 NULL bar
+3 0 NULL
+3 3 foo
+
+
+# Create a date table
+
+statement ok
+create table date_data(date date, cnt int) as values
+ (100, 1),
+ (100, 2),
+ (100, 3),
+ (101, 3),
+ (101, 3),
+ (101, 3);
+
+# csv_group_by_date
+query I rowsort
+SELECT SUM(cnt) FROM date_data GROUP BY date
+----
+6
+9
+
+# Create time tables with different precisions but the same logical values
+
+statement ok
+create table time_data(ts bigint, cnt int) as values
+ (5000000000000, 1),
+ (5000000000000, 1),
+ (5500000000000, 1),
+ (5500000000000, 2),
+ (5900000000000, 1),
+ (5900000000000, 3);
+
+statement ok
+create table time64_nanos as
+select arrow_cast(ts, 'Time64(Nanosecond)') as time, arrow_cast(cnt, 'Int64') as cnt
+from time_data;
+
+statement ok
+create table time64_micros as
+select arrow_cast(ts / 1000, 'Time64(Microsecond)') as time, arrow_cast(cnt, 'Int64') as cnt
+from time_data;
+
+statement ok
+create table time32_millis as
+select arrow_cast(arrow_cast(ts / 1000 / 1000, 'Int32'), 'Time32(Millisecond)') as time, cnt
+from time_data;
+
+statement ok
+create table time32_s as
+select arrow_cast(arrow_cast(ts / 1000 / 1000 / 1000, 'Int32'), 'Time32(Second)') as time, cnt
+from time_data;
+
+# csv_group_by_time32second
+query I rowsort
+SELECT SUM(cnt) FROM time32_s GROUP BY time
+----
+2
+3
+4
+
+# csv_group_by_time32millisecond
+query I rowsort
+SELECT SUM(cnt) FROM time32_millis GROUP BY time
+----
+2
+3
+4
+
+# csv_group_by_time64microsecond
+query I rowsort
+SELECT SUM(cnt) FROM time64_micros GROUP BY time
+----
+2
+3
+4
+
+# csv_group_by_time64nanosecond
+query I rowsort
+SELECT SUM(cnt) FROM time64_nanos GROUP BY time
+----
+2
+3
+4
+
+
+# TODO: group_by_date_trunc
+
+
+# Create table with LargeUtf8 data
+
+statement ok
+create table utf8_data(str string, val bigint) as values
+ ('A', 1),
+ ('B', 2),
+ ('A', 2),
+ ('A', 4),
+ ('C', 1),
+ ('A', 1);
+
+statement ok
+create table largeutf8_data as
+select arrow_cast(str, 'LargeUtf8') as str, val
+from utf8_data;
+
+# group_by_largeutf8
+query TI rowsort
+SELECT str, count(val) FROM largeutf8_data GROUP BY str
+----
+A 4
+B 1
+C 1
+
+
+# TODO: group_by_dictionary
+
+# csv_query_group_by_order_by_substr
+query TR
+SELECT substr(c1, 1, 1), avg(c12)
+FROM aggregate_test_100
+GROUP BY substr(c1, 1, 1)
+ORDER BY substr(c1, 1, 1)
+----
+a 0.487545174661
+b 0.410407092638
+c 0.660045653644
+d 0.488553793875
+e 0.486006692713
+
+# csv_query_group_by_order_by_substr_aliased_projection
+query TR
+SELECT substr(c1, 1, 1) as name, avg(c12) as average
+FROM aggregate_test_100
+GROUP BY substr(c1, 1, 1)
+ORDER BY substr(c1, 1, 1)
+----
+a 0.487545174661
+b 0.410407092638
+c 0.660045653644
+d 0.488553793875
+e 0.486006692713
+
+# csv_query_group_by_order_by_avg_group_by_substr
+query TR
+SELECT substr(c1, 1, 1) as name, avg(c12) as average
+FROM aggregate_test_100
+GROUP BY substr(c1, 1, 1)
+ORDER BY avg(c12)
+----
+b 0.410407092638
+e 0.486006692713
+a 0.487545174661
+d 0.488553793875
+c 0.660045653644