You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2023/04/24 15:15:35 UTC

[arrow-datafusion] branch main updated: refactor(sqllogictests): port group by test to sqllogic (#6088)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 05762363f9 refactor(sqllogictests): port group by test to sqllogic (#6088)
05762363f9 is described below

commit 05762363f9ad76e2115d9ebf777aae6907da33f7
Author: Armin Primadi <ap...@gmail.com>
AuthorDate: Mon Apr 24 22:15:26 2023 +0700

    refactor(sqllogictests): port group by test to sqllogic (#6088)
    
    * refactor(sqllogictests): wip refactor group by test to sqllogic
    
    * refactor(sqllogictest): porting more group by tests
    
    * docs(core): revert accidental documentation styling update
    
    * refactor(sqllogictest): cleanup group test already in sqllogictest
    
    * refactor(sqllogictest): move group by time test
    
    * refactor(sqllogictest): no longer need setup for group test
    
    * refactor(sqllogictest): port more group by test
    
    * refactor(sqllogictest): port more group by test
    
    * style(sqllogictest): conform to cargo fmt
---
 datafusion/core/tests/sql/group_by.rs              | 745 ---------------------
 .../core/tests/sqllogictests/test_files/group.slt  | 389 +++++++++++
 2 files changed, 389 insertions(+), 745 deletions(-)

diff --git a/datafusion/core/tests/sql/group_by.rs b/datafusion/core/tests/sql/group_by.rs
index a92eaf0f4d..b4a92db3fc 100644
--- a/datafusion/core/tests/sql/group_by.rs
+++ b/datafusion/core/tests/sql/group_by.rs
@@ -17,633 +17,6 @@
 
 use super::*;
 
-#[tokio::test]
-async fn csv_query_group_by_int_min_max() -> Result<()> {
-    let ctx = SessionContext::new();
-    register_aggregate_csv(&ctx).await?;
-    let sql = "SELECT c2, MIN(c12), MAX(c12) FROM aggregate_test_100 GROUP BY c2";
-    let actual = execute_to_batches(&ctx, sql).await;
-    let expected = vec![
-        "+----+-----------------------------+-----------------------------+",
-        "| c2 | MIN(aggregate_test_100.c12) | MAX(aggregate_test_100.c12) |",
-        "+----+-----------------------------+-----------------------------+",
-        "| 1  | 0.05636955101974106         | 0.9965400387585364          |",
-        "| 2  | 0.16301110515739792         | 0.991517828651004           |",
-        "| 3  | 0.047343434291126085        | 0.9293883502480845          |",
-        "| 4  | 0.02182578039211991         | 0.9237877978193884          |",
-        "| 5  | 0.01479305307777301         | 0.9723580396501548          |",
-        "+----+-----------------------------+-----------------------------+",
-    ];
-    assert_batches_sorted_eq!(expected, &actual);
-    Ok(())
-}
-
-#[tokio::test]
-async fn csv_query_group_by_float32() -> Result<()> {
-    let ctx = SessionContext::new();
-    register_aggregate_simple_csv(&ctx).await?;
-
-    let sql =
-        "SELECT COUNT(*) as cnt, c1 FROM aggregate_simple GROUP BY c1 ORDER BY cnt DESC";
-    let actual = execute_to_batches(&ctx, sql).await;
-
-    let expected = vec![
-        "+-----+---------+",
-        "| cnt | c1      |",
-        "+-----+---------+",
-        "| 5   | 0.00005 |",
-        "| 4   | 0.00004 |",
-        "| 3   | 0.00003 |",
-        "| 2   | 0.00002 |",
-        "| 1   | 0.00001 |",
-        "+-----+---------+",
-    ];
-    assert_batches_eq!(expected, &actual);
-
-    Ok(())
-}
-
-#[tokio::test]
-async fn csv_query_group_by_float64() -> Result<()> {
-    let ctx = SessionContext::new();
-    register_aggregate_simple_csv(&ctx).await?;
-
-    let sql =
-        "SELECT COUNT(*) as cnt, c2 FROM aggregate_simple GROUP BY c2 ORDER BY cnt DESC";
-    let actual = execute_to_batches(&ctx, sql).await;
-
-    let expected = vec![
-        "+-----+---------+",
-        "| cnt | c2      |",
-        "+-----+---------+",
-        "| 5   | 5.0e-12 |",
-        "| 4   | 4.0e-12 |",
-        "| 3   | 3.0e-12 |",
-        "| 2   | 2.0e-12 |",
-        "| 1   | 1.0e-12 |",
-        "+-----+---------+",
-    ];
-    assert_batches_eq!(expected, &actual);
-
-    Ok(())
-}
-
-#[tokio::test]
-async fn csv_query_group_by_boolean() -> Result<()> {
-    let ctx = SessionContext::new();
-    register_aggregate_simple_csv(&ctx).await?;
-
-    let sql =
-        "SELECT COUNT(*) as cnt, c3 FROM aggregate_simple GROUP BY c3 ORDER BY cnt DESC";
-    let actual = execute_to_batches(&ctx, sql).await;
-
-    let expected = vec![
-        "+-----+-------+",
-        "| cnt | c3    |",
-        "+-----+-------+",
-        "| 9   | true  |",
-        "| 6   | false |",
-        "+-----+-------+",
-    ];
-    assert_batches_eq!(expected, &actual);
-
-    Ok(())
-}
-
-#[tokio::test]
-async fn csv_query_group_by_two_columns() -> Result<()> {
-    let ctx = SessionContext::new();
-    register_aggregate_csv(&ctx).await?;
-    let sql = "SELECT c1, c2, MIN(c3) FROM aggregate_test_100 GROUP BY c1, c2";
-    let actual = execute_to_batches(&ctx, sql).await;
-    let expected = vec![
-        "+----+----+----------------------------+",
-        "| c1 | c2 | MIN(aggregate_test_100.c3) |",
-        "+----+----+----------------------------+",
-        "| a  | 1  | -85                        |",
-        "| a  | 2  | -48                        |",
-        "| a  | 3  | -72                        |",
-        "| a  | 4  | -101                       |",
-        "| a  | 5  | -101                       |",
-        "| b  | 1  | 12                         |",
-        "| b  | 2  | -60                        |",
-        "| b  | 3  | -101                       |",
-        "| b  | 4  | -117                       |",
-        "| b  | 5  | -82                        |",
-        "| c  | 1  | -24                        |",
-        "| c  | 2  | -117                       |",
-        "| c  | 3  | -2                         |",
-        "| c  | 4  | -90                        |",
-        "| c  | 5  | -94                        |",
-        "| d  | 1  | -99                        |",
-        "| d  | 2  | 93                         |",
-        "| d  | 3  | -76                        |",
-        "| d  | 4  | 5                          |",
-        "| d  | 5  | -59                        |",
-        "| e  | 1  | 36                         |",
-        "| e  | 2  | -61                        |",
-        "| e  | 3  | -95                        |",
-        "| e  | 4  | -56                        |",
-        "| e  | 5  | -86                        |",
-        "+----+----+----------------------------+",
-    ];
-    assert_batches_sorted_eq!(expected, &actual);
-    Ok(())
-}
-
-#[tokio::test]
-async fn csv_query_group_by_and_having() -> Result<()> {
-    let ctx = SessionContext::new();
-    register_aggregate_csv(&ctx).await?;
-    let sql = "SELECT c1, MIN(c3) AS m FROM aggregate_test_100 GROUP BY c1 HAVING m < -100 AND MAX(c3) > 70";
-    let actual = execute_to_batches(&ctx, sql).await;
-    let expected = vec![
-        "+----+------+",
-        "| c1 | m    |",
-        "+----+------+",
-        "| a  | -101 |",
-        "| c  | -117 |",
-        "+----+------+",
-    ];
-    assert_batches_sorted_eq!(expected, &actual);
-    Ok(())
-}
-
-#[tokio::test]
-async fn csv_query_group_by_and_having_and_where() -> Result<()> {
-    let ctx = SessionContext::new();
-    register_aggregate_csv(&ctx).await?;
-    let sql = "SELECT c1, MIN(c3) AS m
-               FROM aggregate_test_100
-               WHERE c1 IN ('a', 'b')
-               GROUP BY c1
-               HAVING m < -100 AND MAX(c3) > 70";
-    let actual = execute_to_batches(&ctx, sql).await;
-    let expected = vec![
-        "+----+------+",
-        "| c1 | m    |",
-        "+----+------+",
-        "| a  | -101 |",
-        "+----+------+",
-    ];
-    assert_batches_eq!(expected, &actual);
-    Ok(())
-}
-
-#[tokio::test]
-async fn csv_query_group_by_substr() -> Result<()> {
-    let ctx = SessionContext::new();
-    register_aggregate_csv(&ctx).await?;
-    // there is an input column "c1" as well a projection expression aliased as "c1"
-    let sql = "SELECT substr(c1, 1, 1) c1 \
-        FROM aggregate_test_100 \
-        GROUP BY substr(c1, 1, 1) \
-        ";
-    let actual = execute_to_batches(&ctx, sql).await;
-    #[rustfmt::skip]
-    let expected = vec![
-        "+----+",
-        "| c1 |",
-        "+----+",
-        "| a  |",
-        "| b  |",
-        "| c  |",
-        "| d  |",
-        "| e  |",
-        "+----+",
-    ];
-    assert_batches_sorted_eq!(expected, &actual);
-    Ok(())
-}
-
-#[tokio::test]
-async fn csv_query_group_by_avg() -> Result<()> {
-    let ctx = SessionContext::new();
-    register_aggregate_csv(&ctx).await?;
-    let sql = "SELECT c1, avg(c12) FROM aggregate_test_100 GROUP BY c1";
-    let actual = execute_to_batches(&ctx, sql).await;
-    let expected = vec![
-        "+----+-----------------------------+",
-        "| c1 | AVG(aggregate_test_100.c12) |",
-        "+----+-----------------------------+",
-        "| a  | 0.48754517466109415         |",
-        "| b  | 0.41040709263815384         |",
-        "| c  | 0.6600456536439784          |",
-        "| d  | 0.48855379387549824         |",
-        "| e  | 0.48600669271341534         |",
-        "+----+-----------------------------+",
-    ];
-    assert_batches_sorted_eq!(expected, &actual);
-    Ok(())
-}
-
-#[tokio::test]
-async fn csv_query_group_by_with_aliases() -> Result<()> {
-    let ctx = SessionContext::new();
-    register_aggregate_csv(&ctx).await?;
-    let sql = "SELECT c1 AS c12, avg(c12) AS c1 FROM aggregate_test_100 GROUP BY c1";
-    let actual = execute_to_batches(&ctx, sql).await;
-    let expected = vec![
-        "+-----+---------------------+",
-        "| c12 | c1                  |",
-        "+-----+---------------------+",
-        "| a   | 0.48754517466109415 |",
-        "| b   | 0.41040709263815384 |",
-        "| c   | 0.6600456536439784  |",
-        "| d   | 0.48855379387549824 |",
-        "| e   | 0.48600669271341534 |",
-        "+-----+---------------------+",
-    ];
-    assert_batches_sorted_eq!(expected, &actual);
-    Ok(())
-}
-
-#[tokio::test]
-async fn csv_query_group_by_int_count() -> Result<()> {
-    let ctx = SessionContext::new();
-    register_aggregate_csv(&ctx).await?;
-    let sql = "SELECT c1, count(c12) FROM aggregate_test_100 GROUP BY c1";
-    let actual = execute_to_batches(&ctx, sql).await;
-    let expected = vec![
-        "+----+-------------------------------+",
-        "| c1 | COUNT(aggregate_test_100.c12) |",
-        "+----+-------------------------------+",
-        "| a  | 21                            |",
-        "| b  | 19                            |",
-        "| c  | 21                            |",
-        "| d  | 18                            |",
-        "| e  | 21                            |",
-        "+----+-------------------------------+",
-    ];
-    assert_batches_sorted_eq!(expected, &actual);
-    Ok(())
-}
-
-#[tokio::test]
-async fn csv_query_group_with_aliased_aggregate() -> Result<()> {
-    let ctx = SessionContext::new();
-    register_aggregate_csv(&ctx).await?;
-    let sql = "SELECT c1, count(c12) AS count FROM aggregate_test_100 GROUP BY c1";
-    let actual = execute_to_batches(&ctx, sql).await;
-    let expected = vec![
-        "+----+-------+",
-        "| c1 | count |",
-        "+----+-------+",
-        "| a  | 21    |",
-        "| b  | 19    |",
-        "| c  | 21    |",
-        "| d  | 18    |",
-        "| e  | 21    |",
-        "+----+-------+",
-    ];
-    assert_batches_sorted_eq!(expected, &actual);
-    Ok(())
-}
-
-#[tokio::test]
-async fn csv_query_group_by_string_min_max() -> Result<()> {
-    let ctx = SessionContext::new();
-    register_aggregate_csv(&ctx).await?;
-    let sql = "SELECT c1, MIN(c12), MAX(c12) FROM aggregate_test_100 GROUP BY c1";
-    let actual = execute_to_batches(&ctx, sql).await;
-    let expected = vec![
-        "+----+-----------------------------+-----------------------------+",
-        "| c1 | MIN(aggregate_test_100.c12) | MAX(aggregate_test_100.c12) |",
-        "+----+-----------------------------+-----------------------------+",
-        "| a  | 0.02182578039211991         | 0.9800193410444061          |",
-        "| b  | 0.04893135681998029         | 0.9185813970744787          |",
-        "| c  | 0.0494924465469434          | 0.991517828651004           |",
-        "| d  | 0.061029375346466685        | 0.9748360509016578          |",
-        "| e  | 0.01479305307777301         | 0.9965400387585364          |",
-        "+----+-----------------------------+-----------------------------+",
-    ];
-    assert_batches_sorted_eq!(expected, &actual);
-    Ok(())
-}
-
-#[tokio::test]
-async fn query_group_on_null() -> Result<()> {
-    let schema = Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, true)]));
-
-    let data = RecordBatch::try_new(
-        schema.clone(),
-        vec![Arc::new(Int32Array::from(vec![
-            Some(0),
-            Some(3),
-            None,
-            Some(1),
-            Some(3),
-        ]))],
-    )?;
-
-    let ctx = SessionContext::new();
-    ctx.register_batch("test", data)?;
-    let sql = "SELECT COUNT(*), c1 FROM test GROUP BY c1";
-
-    let actual = execute_to_batches(&ctx, sql).await;
-
-    // Note that the results also
-    // include a row for NULL (c1=NULL, count = 1)
-    let expected = vec![
-        "+-----------------+----+",
-        "| COUNT(UInt8(1)) | c1 |",
-        "+-----------------+----+",
-        "| 1               |    |",
-        "| 1               | 0  |",
-        "| 1               | 1  |",
-        "| 2               | 3  |",
-        "+-----------------+----+",
-    ];
-    assert_batches_sorted_eq!(expected, &actual);
-    Ok(())
-}
-
-#[tokio::test]
-async fn query_group_on_null_multi_col() -> Result<()> {
-    let schema = Arc::new(Schema::new(vec![
-        Field::new("c1", DataType::Int32, true),
-        Field::new("c2", DataType::Utf8, true),
-    ]));
-
-    let data = RecordBatch::try_new(
-        schema.clone(),
-        vec![
-            Arc::new(Int32Array::from(vec![
-                Some(0),
-                Some(0),
-                Some(3),
-                None,
-                None,
-                Some(3),
-                Some(0),
-                None,
-                Some(3),
-            ])),
-            Arc::new(StringArray::from(vec![
-                None,
-                None,
-                Some("foo"),
-                None,
-                Some("bar"),
-                Some("foo"),
-                None,
-                Some("bar"),
-                Some("foo"),
-            ])),
-        ],
-    )?;
-
-    let ctx = SessionContext::new();
-    ctx.register_batch("test", data)?;
-    let sql = "SELECT COUNT(*), c1, c2 FROM test GROUP BY c1, c2";
-
-    let actual = execute_to_batches(&ctx, sql).await;
-
-    // Note that the results also include values for null
-    // include a row for NULL (c1=NULL, count = 1)
-    let expected = vec![
-        "+-----------------+----+-----+",
-        "| COUNT(UInt8(1)) | c1 | c2  |",
-        "+-----------------+----+-----+",
-        "| 1               |    |     |",
-        "| 2               |    | bar |",
-        "| 3               | 0  |     |",
-        "| 3               | 3  | foo |",
-        "+-----------------+----+-----+",
-    ];
-    assert_batches_sorted_eq!(expected, &actual);
-
-    // Also run query with group columns reversed (results should be the same)
-    let sql = "SELECT COUNT(*), c1, c2 FROM test GROUP BY c2, c1";
-    let actual = execute_to_batches(&ctx, sql).await;
-    assert_batches_sorted_eq!(expected, &actual);
-    Ok(())
-}
-
-#[tokio::test]
-async fn csv_group_by_date() -> Result<()> {
-    let ctx = SessionContext::new();
-    let schema = Arc::new(Schema::new(vec![
-        Field::new("date", DataType::Date32, false),
-        Field::new("cnt", DataType::Int32, false),
-    ]));
-    let data = RecordBatch::try_new(
-        schema.clone(),
-        vec![
-            Arc::new(Date32Array::from(vec![
-                Some(100),
-                Some(100),
-                Some(100),
-                Some(101),
-                Some(101),
-                Some(101),
-            ])),
-            Arc::new(Int32Array::from(vec![
-                Some(1),
-                Some(2),
-                Some(3),
-                Some(3),
-                Some(3),
-                Some(3),
-            ])),
-        ],
-    )?;
-
-    ctx.register_batch("dates", data)?;
-    let sql = "SELECT SUM(cnt) FROM dates GROUP BY date";
-    let actual = execute_to_batches(&ctx, sql).await;
-    let expected = vec![
-        "+----------------+",
-        "| SUM(dates.cnt) |",
-        "+----------------+",
-        "| 6              |",
-        "| 9              |",
-        "+----------------+",
-    ];
-    assert_batches_sorted_eq!(expected, &actual);
-    Ok(())
-}
-
-#[tokio::test]
-async fn csv_group_by_time32second() -> Result<()> {
-    let ctx = SessionContext::new();
-    let schema = Arc::new(Schema::new(vec![
-        Field::new("time", DataType::Time32(TimeUnit::Second), false),
-        Field::new("cnt", DataType::Int32, false),
-    ]));
-    let data = RecordBatch::try_new(
-        schema.clone(),
-        vec![
-            Arc::new(Time32SecondArray::from(vec![
-                Some(5_000),
-                Some(5_000),
-                Some(5_500),
-                Some(5_500),
-                Some(5_900),
-                Some(5_900),
-            ])),
-            Arc::new(Int32Array::from(vec![
-                Some(1),
-                Some(1),
-                Some(1),
-                Some(2),
-                Some(1),
-                Some(3),
-            ])),
-        ],
-    )?;
-
-    ctx.register_batch("times", data)?;
-    let sql = "SELECT SUM(cnt) FROM times GROUP BY time";
-    let actual = execute_to_batches(&ctx, sql).await;
-    let expected = vec![
-        "+----------------+",
-        "| SUM(times.cnt) |",
-        "+----------------+",
-        "| 2              |",
-        "| 3              |",
-        "| 4              |",
-        "+----------------+",
-    ];
-    assert_batches_sorted_eq!(expected, &actual);
-    Ok(())
-}
-
-#[tokio::test]
-async fn csv_group_by_time32millisecond() -> Result<()> {
-    let ctx = SessionContext::new();
-    let schema = Arc::new(Schema::new(vec![
-        Field::new("time", DataType::Time32(TimeUnit::Millisecond), false),
-        Field::new("cnt", DataType::Int32, false),
-    ]));
-    let data = RecordBatch::try_new(
-        schema.clone(),
-        vec![
-            Arc::new(Time32MillisecondArray::from(vec![
-                Some(5_000_000),
-                Some(5_000_000),
-                Some(5_500_000),
-                Some(5_500_000),
-                Some(5_900_000),
-                Some(5_900_000),
-            ])),
-            Arc::new(Int32Array::from(vec![
-                Some(1),
-                Some(1),
-                Some(1),
-                Some(2),
-                Some(1),
-                Some(3),
-            ])),
-        ],
-    )?;
-
-    ctx.register_batch("times", data)?;
-    let sql = "SELECT SUM(cnt) FROM times GROUP BY time";
-    let actual = execute_to_batches(&ctx, sql).await;
-    let expected = vec![
-        "+----------------+",
-        "| SUM(times.cnt) |",
-        "+----------------+",
-        "| 2              |",
-        "| 3              |",
-        "| 4              |",
-        "+----------------+",
-    ];
-    assert_batches_sorted_eq!(expected, &actual);
-    Ok(())
-}
-
-#[tokio::test]
-async fn csv_group_by_time64microsecond() -> Result<()> {
-    let ctx = SessionContext::new();
-    let schema = Arc::new(Schema::new(vec![
-        Field::new("time", DataType::Time64(TimeUnit::Microsecond), false),
-        Field::new("cnt", DataType::Int64, false),
-    ]));
-    let data = RecordBatch::try_new(
-        schema.clone(),
-        vec![
-            Arc::new(Time64MicrosecondArray::from(vec![
-                Some(5_000_000_000),
-                Some(5_000_000_000),
-                Some(5_500_000_000),
-                Some(5_500_000_000),
-                Some(5_900_000_000),
-                Some(5_900_000_000),
-            ])),
-            Arc::new(Int64Array::from(vec![
-                Some(1),
-                Some(1),
-                Some(1),
-                Some(2),
-                Some(1),
-                Some(3),
-            ])),
-        ],
-    )?;
-
-    ctx.register_batch("times", data)?;
-    let sql = "SELECT SUM(cnt) FROM times GROUP BY time";
-    let actual = execute_to_batches(&ctx, sql).await;
-    let expected = vec![
-        "+----------------+",
-        "| SUM(times.cnt) |",
-        "+----------------+",
-        "| 2              |",
-        "| 3              |",
-        "| 4              |",
-        "+----------------+",
-    ];
-    assert_batches_sorted_eq!(expected, &actual);
-    Ok(())
-}
-
-#[tokio::test]
-async fn csv_group_by_time64nanosecond() -> Result<()> {
-    let ctx = SessionContext::new();
-    let schema = Arc::new(Schema::new(vec![
-        Field::new("time", DataType::Time64(TimeUnit::Nanosecond), false),
-        Field::new("cnt", DataType::Int64, false),
-    ]));
-    let data = RecordBatch::try_new(
-        schema.clone(),
-        vec![
-            Arc::new(Time64NanosecondArray::from(vec![
-                Some(5_000_000_000_000),
-                Some(5_000_000_000_000),
-                Some(5_500_000_000_000),
-                Some(5_500_000_000_000),
-                Some(5_900_000_000_000),
-                Some(5_900_000_000_000),
-            ])),
-            Arc::new(Int64Array::from(vec![
-                Some(1),
-                Some(1),
-                Some(1),
-                Some(2),
-                Some(1),
-                Some(3),
-            ])),
-        ],
-    )?;
-
-    ctx.register_batch("times", data)?;
-    let sql = "SELECT SUM(cnt) FROM times GROUP BY time";
-    let actual = execute_to_batches(&ctx, sql).await;
-    let expected = vec![
-        "+----------------+",
-        "| SUM(times.cnt) |",
-        "+----------------+",
-        "| 2              |",
-        "| 3              |",
-        "| 4              |",
-        "+----------------+",
-    ];
-    assert_batches_sorted_eq!(expected, &actual);
-    Ok(())
-}
-
 #[tokio::test]
 async fn group_by_date_trunc() -> Result<()> {
     let tmp_dir = TempDir::new()?;
@@ -695,52 +68,6 @@ async fn group_by_date_trunc() -> Result<()> {
     Ok(())
 }
 
-#[tokio::test]
-async fn group_by_largeutf8() {
-    let ctx = SessionContext::new();
-
-    // input data looks like:
-    // A, 1
-    // B, 2
-    // A, 2
-    // A, 4
-    // C, 1
-    // A, 1
-
-    let str_array: LargeStringArray = vec!["A", "B", "A", "A", "C", "A"]
-        .into_iter()
-        .map(Some)
-        .collect();
-    let str_array = Arc::new(str_array);
-
-    let val_array: Int64Array = vec![1, 2, 2, 4, 1, 1].into();
-    let val_array = Arc::new(val_array);
-
-    let schema = Arc::new(Schema::new(vec![
-        Field::new("str", str_array.data_type().clone(), false),
-        Field::new("val", val_array.data_type().clone(), false),
-    ]));
-
-    let batch = RecordBatch::try_new(schema.clone(), vec![str_array, val_array]).unwrap();
-
-    ctx.register_batch("t", batch).unwrap();
-
-    let results = plan_and_collect(&ctx, "SELECT str, count(val) FROM t GROUP BY str")
-        .await
-        .expect("ran plan correctly");
-
-    let expected = vec![
-        "+-----+--------------+",
-        "| str | COUNT(t.val) |",
-        "+-----+--------------+",
-        "| A   | 4            |",
-        "| B   | 1            |",
-        "| C   | 1            |",
-        "+-----+--------------+",
-    ];
-    assert_batches_sorted_eq!(expected, &results);
-}
-
 #[tokio::test]
 async fn group_by_dictionary() {
     async fn run_test_case<K: ArrowDictionaryKeyType>() {
@@ -833,75 +160,3 @@ async fn group_by_dictionary() {
     run_test_case::<UInt32Type>().await;
     run_test_case::<UInt64Type>().await;
 }
-
-#[tokio::test]
-async fn csv_query_group_by_order_by_substr() -> Result<()> {
-    let ctx = SessionContext::new();
-    register_aggregate_csv(&ctx).await?;
-    let sql = "SELECT substr(c1, 1, 1), avg(c12) \
-        FROM aggregate_test_100 \
-        GROUP BY substr(c1, 1, 1) \
-        ORDER BY substr(c1, 1, 1)";
-    let actual = execute_to_batches(&ctx, sql).await;
-    let expected = vec![
-        "+-------------------------------------------------+-----------------------------+",
-        "| substr(aggregate_test_100.c1,Int64(1),Int64(1)) | AVG(aggregate_test_100.c12) |",
-        "+-------------------------------------------------+-----------------------------+",
-        "| a                                               | 0.48754517466109415         |",
-        "| b                                               | 0.41040709263815384         |",
-        "| c                                               | 0.6600456536439784          |",
-        "| d                                               | 0.48855379387549824         |",
-        "| e                                               | 0.48600669271341534         |",
-        "+-------------------------------------------------+-----------------------------+",
-    ];
-    assert_batches_sorted_eq!(expected, &actual);
-    Ok(())
-}
-
-#[tokio::test]
-async fn csv_query_group_by_order_by_substr_aliased_projection() -> Result<()> {
-    let ctx = SessionContext::new();
-    register_aggregate_csv(&ctx).await?;
-    let sql = "SELECT substr(c1, 1, 1) as name, avg(c12) as average \
-        FROM aggregate_test_100 \
-        GROUP BY substr(c1, 1, 1) \
-        ORDER BY substr(c1, 1, 1)";
-    let actual = execute_to_batches(&ctx, sql).await;
-    let expected = vec![
-        "+------+---------------------+",
-        "| name | average             |",
-        "+------+---------------------+",
-        "| a    | 0.48754517466109415 |",
-        "| b    | 0.41040709263815384 |",
-        "| c    | 0.6600456536439784  |",
-        "| d    | 0.48855379387549824 |",
-        "| e    | 0.48600669271341534 |",
-        "+------+---------------------+",
-    ];
-    assert_batches_sorted_eq!(expected, &actual);
-    Ok(())
-}
-
-#[tokio::test]
-async fn csv_query_group_by_order_by_avg_group_by_substr() -> Result<()> {
-    let ctx = SessionContext::new();
-    register_aggregate_csv(&ctx).await?;
-    let sql = "SELECT substr(c1, 1, 1) as name, avg(c12) as average \
-        FROM aggregate_test_100 \
-        GROUP BY substr(c1, 1, 1) \
-        ORDER BY avg(c12)";
-    let actual = execute_to_batches(&ctx, sql).await;
-    let expected = vec![
-        "+------+---------------------+",
-        "| name | average             |",
-        "+------+---------------------+",
-        "| b    | 0.41040709263815384 |",
-        "| e    | 0.48600669271341534 |",
-        "| a    | 0.48754517466109415 |",
-        "| d    | 0.48855379387549824 |",
-        "| c    | 0.6600456536439784  |",
-        "+------+---------------------+",
-    ];
-    assert_batches_sorted_eq!(expected, &actual);
-    Ok(())
-}
diff --git a/datafusion/core/tests/sqllogictests/test_files/group.slt b/datafusion/core/tests/sqllogictests/test_files/group.slt
new file mode 100644
index 0000000000..a56451d7aa
--- /dev/null
+++ b/datafusion/core/tests/sqllogictests/test_files/group.slt
@@ -0,0 +1,389 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+statement ok
+CREATE EXTERNAL TABLE aggregate_test_100 (
+  c1  VARCHAR NOT NULL,
+  c2  TINYINT NOT NULL,
+  c3  SMALLINT NOT NULL,
+  c4  SMALLINT,
+  c5  INT,
+  c6  BIGINT NOT NULL,
+  c7  SMALLINT NOT NULL,
+  c8  INT NOT NULL,
+  c9  BIGINT UNSIGNED NOT NULL,
+  c10 VARCHAR NOT NULL,
+  c11 FLOAT NOT NULL,
+  c12 DOUBLE NOT NULL,
+  c13 VARCHAR NOT NULL
+)
+STORED AS CSV
+WITH HEADER ROW
+LOCATION '../../testing/data/csv/aggregate_test_100.csv'
+
+statement ok
+CREATE external table aggregate_simple(c1 real, c2 double, c3 boolean) STORED as CSV WITH HEADER ROW LOCATION 'tests/data/aggregate_simple.csv';
+
+
+# csv_query_group_by_int_min_max
+query IRR rowsort
+SELECT c2, MIN(c12), MAX(c12) FROM aggregate_test_100 GROUP BY c2
+----
+1 0.05636955102 0.996540038759
+2 0.163011105157 0.991517828651
+3 0.047343434291 0.929388350248
+4 0.021825780392 0.923787797819
+5 0.014793053078 0.97235803965
+
+# csv_query_group_by_float32
+query IR
+SELECT COUNT(*) as cnt, c1 FROM aggregate_simple GROUP BY c1 ORDER BY cnt DESC
+----
+5 0.00005
+4 0.00004
+3 0.00003
+2 0.00002
+1 0.00001
+
+# csv_query_group_by_float64
+query IR
+SELECT COUNT(*) as cnt, c2 FROM aggregate_simple GROUP BY c2 ORDER BY cnt DESC
+----
+5 0.000000000005
+4 0.000000000004
+3 0.000000000003
+2 0.000000000002
+1 0.000000000001
+
+# csv_query_group_by_boolean
+query IB
+SELECT COUNT(*) as cnt, c3 FROM aggregate_simple GROUP BY c3 ORDER BY cnt DESC
+----
+9 true
+6 false
+
+# csv_query_group_by_two_columns
+query TII rowsort
+SELECT c1, c2, MIN(c3) FROM aggregate_test_100 GROUP BY c1, c2
+----
+a 1 -85
+a 2 -48
+a 3 -72
+a 4 -101
+a 5 -101
+b 1 12
+b 2 -60
+b 3 -101
+b 4 -117
+b 5 -82
+c 1 -24
+c 2 -117
+c 3 -2
+c 4 -90
+c 5 -94
+d 1 -99
+d 2 93
+d 3 -76
+d 4 5
+d 5 -59
+e 1 36
+e 2 -61
+e 3 -95
+e 4 -56
+e 5 -86
+
+# csv_query_group_by_and_having
+query TI rowsort
+SELECT c1, MIN(c3) AS m FROM aggregate_test_100 GROUP BY c1 HAVING m < -100 AND MAX(c3) > 70
+----
+a -101
+c -117
+
+# csv_query_group_by_and_having_and_where
+query TI
+SELECT c1, MIN(c3) AS m
+FROM aggregate_test_100
+WHERE c1 IN ('a', 'b')
+GROUP BY c1
+HAVING m < -100 AND MAX(c3) > 70
+----
+a -101
+
+# csv_query_group_by_substr
+query T rowsort
+SELECT substr(c1, 1, 1) c1 FROM aggregate_test_100 GROUP BY substr(c1, 1, 1)
+----
+a
+b
+c
+d
+e
+
+# csv_query_group_by_avg
+query TR rowsort
+SELECT c1, avg(c12) FROM aggregate_test_100 GROUP BY c1
+----
+a 0.487545174661
+b 0.410407092638
+c 0.660045653644
+d 0.488553793875
+e 0.486006692713
+
+# csv_query_group_by_with_aliases
+query TR rowsort
+SELECT c1 AS c12, avg(c12) AS c1 FROM aggregate_test_100 GROUP BY c1
+----
+a 0.487545174661
+b 0.410407092638
+c 0.660045653644
+d 0.488553793875
+e 0.486006692713
+
+# csv_query_group_by_int_count
+query TI rowsort
+SELECT c1, count(c12) FROM aggregate_test_100 GROUP BY c1
+----
+a 21
+b 19
+c 21
+d 18
+e 21
+
+# csv_query_group_with_aliased_aggregate
+query TI rowsort
+SELECT c1, count(c12) AS count FROM aggregate_test_100 GROUP BY c1
+----
+a 21
+b 19
+c 21
+d 18
+e 21
+
+# csv_query_group_by_string_min_max
+query TRR rowsort
+SELECT c1, MIN(c12), MAX(c12) FROM aggregate_test_100 GROUP BY c1
+----
+a 0.021825780392 0.980019341044
+b 0.04893135682 0.918581397074
+c 0.049492446547 0.991517828651
+d 0.061029375346 0.974836050902
+e 0.014793053078 0.996540038759
+
+
+# Create a table containing null values
+
+statement ok
+create table null_data(c1 int) as values
+  (0),
+  (3),
+  (NULL),
+  (1),
+  (3);
+
+# query_group_on_null
+query II rowsort
+SELECT COUNT(*), c1 FROM null_data GROUP BY c1
+----
+1 0
+1 1
+1 NULL
+2 3
+
+
+# Create a table containing multiple column with null values
+
+statement ok
+create table multi_null_data(c1 int, c2 string) as values
+  (0, NULL),
+  (0, NULL),
+  (3, 'foo'),
+  (NULL, NULL),
+  (NULL, 'bar'),
+  (3, 'foo'),
+  (0, NULL),
+  (NULL, 'bar'),
+  (3, 'foo');
+
+# query_group_on_null_multi_col
+
+query IIT rowsort
+SELECT COUNT(*), c1, c2 FROM multi_null_data GROUP BY c1, c2
+----
+1 NULL NULL
+2 NULL bar
+3 0 NULL
+3 3 foo
+
+query IIT rowsort
+SELECT COUNT(*), c1, c2 FROM multi_null_data GROUP BY c2, c1
+----
+1 NULL NULL
+2 NULL bar
+3 0 NULL
+3 3 foo
+
+
+# Create a date table
+
+statement ok
+create table date_data(date date, cnt int) as values
+  (100, 1),
+  (100, 2),
+  (100, 3),
+  (101, 3),
+  (101, 3),
+  (101, 3);
+
+# csv_group_by_date
+query I rowsort
+SELECT SUM(cnt) FROM date_data GROUP BY date
+----
+6
+9
+
+# Create time tables with different precisions but the same logical values
+
+statement ok
+create table time_data(ts bigint, cnt int) as values
+  (5000000000000, 1),
+  (5000000000000, 1),
+  (5500000000000, 1),
+  (5500000000000, 2),
+  (5900000000000, 1),
+  (5900000000000, 3);
+
+statement ok
+create table time64_nanos as
+select arrow_cast(ts, 'Time64(Nanosecond)') as time, arrow_cast(cnt, 'Int64') as cnt
+from time_data;
+
+statement ok
+create table time64_micros as
+select arrow_cast(ts / 1000, 'Time64(Microsecond)') as time, arrow_cast(cnt, 'Int64') as cnt
+from time_data;
+
+statement ok
+create table time32_millis as
+select arrow_cast(arrow_cast(ts / 1000 / 1000, 'Int32'), 'Time32(Millisecond)') as time, cnt
+from time_data;
+
+statement ok
+create table time32_s as
+select arrow_cast(arrow_cast(ts / 1000 / 1000 / 1000, 'Int32'), 'Time32(Second)') as time, cnt
+from time_data;
+
+# csv_group_by_time32second
+query I rowsort
+SELECT SUM(cnt) FROM time32_s GROUP BY time
+----
+2
+3
+4
+
+# csv_group_by_time32millisecond
+query I rowsort
+SELECT SUM(cnt) FROM time32_millis GROUP BY time
+----
+2
+3
+4
+
+# csv_group_by_time64microsecond
+query I rowsort
+SELECT SUM(cnt) FROM time64_micros GROUP BY time
+----
+2
+3
+4
+
+# csv_group_by_time64nanosecond
+query I rowsort
+SELECT SUM(cnt) FROM time64_nanos GROUP BY time
+----
+2
+3
+4
+
+
+# TODO: group_by_date_trunc
+
+
+# Create table with LargeUtf8 data
+
+statement ok
+create table utf8_data(str string, val bigint) as values
+  ('A', 1),
+  ('B', 2),
+  ('A', 2),
+  ('A', 4),
+  ('C', 1),
+  ('A', 1);
+
+statement ok
+create table largeutf8_data as
+select arrow_cast(str, 'LargeUtf8') as str, val
+from utf8_data;
+
+# group_by_largeutf8
+query TI rowsort
+SELECT str, count(val) FROM largeutf8_data GROUP BY str
+----
+A 4
+B 1
+C 1
+
+
+# TODO: group_by_dictionary
+
+# csv_query_group_by_order_by_substr
+query TR
+SELECT substr(c1, 1, 1), avg(c12)
+FROM aggregate_test_100
+GROUP BY substr(c1, 1, 1)
+ORDER BY substr(c1, 1, 1)
+----
+a 0.487545174661
+b 0.410407092638
+c 0.660045653644
+d 0.488553793875
+e 0.486006692713
+
+# csv_query_group_by_order_by_substr_aliased_projection
+query TR
+SELECT substr(c1, 1, 1) as name, avg(c12) as average
+FROM aggregate_test_100
+GROUP BY substr(c1, 1, 1)
+ORDER BY substr(c1, 1, 1)
+----
+a 0.487545174661
+b 0.410407092638
+c 0.660045653644
+d 0.488553793875
+e 0.486006692713
+
+# csv_query_group_by_order_by_avg_group_by_substr
+query TR
+SELECT substr(c1, 1, 1) as name, avg(c12) as average
+FROM aggregate_test_100
+GROUP BY substr(c1, 1, 1)
+ORDER BY avg(c12)
+----
+b 0.410407092638
+e 0.486006692713
+a 0.487545174661
+d 0.488553793875
+c 0.660045653644