You are viewing a plain text version of this content; the canonical HTML version (with its original hyperlink) is available from the Apache mailing-list archive.
Posted to commits@arrow.apache.org by al...@apache.org on 2023/06/13 12:37:55 UTC

[arrow-datafusion] branch main updated: minor: use sql to setup test data for joins.slt rather than rust (#6656)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 2ba067c8d2 minor: use sql to setup test data for joins.slt rather than rust (#6656)
2ba067c8d2 is described below

commit 2ba067c8d2d7c3029ae2d6e4998dd2e4cd9d504f
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Tue Jun 13 08:37:47 2023 -0400

    minor: use sql to setup test data for joins.slt rather than rust (#6656)
    
    * port register_right_semi_anti_join_table to sql
    
    * port register_left_semi_anti_join_table
    
    * port register_timestamps_table
    
    * Port register_hashjoin_datatype_table to sql
    
    * fix comment
---
 datafusion/core/tests/sqllogictests/src/main.rs    |   6 +-
 datafusion/core/tests/sqllogictests/src/setup.rs   | 279 ---------------------
 .../core/tests/sqllogictests/test_files/joins.slt  | 117 ++++++++-
 3 files changed, 117 insertions(+), 285 deletions(-)

diff --git a/datafusion/core/tests/sqllogictests/src/main.rs b/datafusion/core/tests/sqllogictests/src/main.rs
index 6f17010a39..adb3be9a6e 100644
--- a/datafusion/core/tests/sqllogictests/src/main.rs
+++ b/datafusion/core/tests/sqllogictests/src/main.rs
@@ -286,11 +286,7 @@ async fn context_for_test_file(relative_path: &Path) -> Option<TestContext> {
             }
         }
         "joins.slt" => {
-            info!("Registering timestamps tables");
-            setup::register_timestamps_table(test_ctx.session_ctx()).await;
-            setup::register_hashjoin_datatype_table(test_ctx.session_ctx()).await;
-            setup::register_left_semi_anti_join_table(test_ctx.session_ctx()).await;
-            setup::register_right_semi_anti_join_table(test_ctx.session_ctx()).await;
+            info!("Registering partition table tables");
 
             let mut test_ctx = test_ctx;
             setup::register_partition_table(&mut test_ctx).await;
diff --git a/datafusion/core/tests/sqllogictests/src/setup.rs b/datafusion/core/tests/sqllogictests/src/setup.rs
index 96090d676a..26f9d2501a 100644
--- a/datafusion/core/tests/sqllogictests/src/setup.rs
+++ b/datafusion/core/tests/sqllogictests/src/setup.rs
@@ -15,12 +15,6 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow_array::types::Int32Type;
-use arrow_array::{
-    Array, Date32Array, Date64Array, Decimal128Array, DictionaryArray, StringArray,
-    TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
-    TimestampSecondArray,
-};
 use datafusion::{
     arrow::{
         array::{
@@ -221,97 +215,6 @@ fn register_nan_table(ctx: &SessionContext) {
     ctx.register_batch("test_float", data).unwrap();
 }
 
-pub async fn register_timestamps_table(ctx: &SessionContext) {
-    let batch = make_timestamps();
-    let schema = batch.schema();
-    let partitions = vec![vec![batch]];
-
-    ctx.register_table(
-        "test_timestamps_table",
-        Arc::new(MemTable::try_new(schema, partitions).unwrap()),
-    )
-    .unwrap();
-}
-
-/// Return  record batch with all of the supported timestamp types
-/// values
-///
-/// Columns are named:
-/// "nanos" --> TimestampNanosecondArray
-/// "micros" --> TimestampMicrosecondArray
-/// "millis" --> TimestampMillisecondArray
-/// "secs" --> TimestampSecondArray
-/// "names" --> StringArray
-pub fn make_timestamps() -> RecordBatch {
-    let ts_strings = vec![
-        Some("2018-11-13T17:11:10.011375885995"),
-        Some("2011-12-13T11:13:10.12345"),
-        None,
-        Some("2021-1-1T05:11:10.432"),
-    ];
-
-    let ts_nanos = ts_strings
-        .into_iter()
-        .map(|t| {
-            t.map(|t| {
-                t.parse::<chrono::NaiveDateTime>()
-                    .unwrap()
-                    .timestamp_nanos()
-            })
-        })
-        .collect::<Vec<_>>();
-
-    let ts_micros = ts_nanos
-        .iter()
-        .map(|t| t.as_ref().map(|ts_nanos| ts_nanos / 1000))
-        .collect::<Vec<_>>();
-
-    let ts_millis = ts_nanos
-        .iter()
-        .map(|t| t.as_ref().map(|ts_nanos| ts_nanos / 1000000))
-        .collect::<Vec<_>>();
-
-    let ts_secs = ts_nanos
-        .iter()
-        .map(|t| t.as_ref().map(|ts_nanos| ts_nanos / 1000000000))
-        .collect::<Vec<_>>();
-
-    let names = ts_nanos
-        .iter()
-        .enumerate()
-        .map(|(i, _)| format!("Row {i}"))
-        .collect::<Vec<_>>();
-
-    let arr_nanos = TimestampNanosecondArray::from(ts_nanos);
-    let arr_micros = TimestampMicrosecondArray::from(ts_micros);
-    let arr_millis = TimestampMillisecondArray::from(ts_millis);
-    let arr_secs = TimestampSecondArray::from(ts_secs);
-
-    let names = names.iter().map(|s| s.as_str()).collect::<Vec<_>>();
-    let arr_names = StringArray::from(names);
-
-    let schema = Schema::new(vec![
-        Field::new("nanos", arr_nanos.data_type().clone(), true),
-        Field::new("micros", arr_micros.data_type().clone(), true),
-        Field::new("millis", arr_millis.data_type().clone(), true),
-        Field::new("secs", arr_secs.data_type().clone(), true),
-        Field::new("name", arr_names.data_type().clone(), true),
-    ]);
-    let schema = Arc::new(schema);
-
-    RecordBatch::try_new(
-        schema,
-        vec![
-            Arc::new(arr_nanos),
-            Arc::new(arr_micros),
-            Arc::new(arr_millis),
-            Arc::new(arr_secs),
-            Arc::new(arr_names),
-        ],
-    )
-    .unwrap()
-}
-
 /// Generate a partitioned CSV file and register it with an execution context
 pub async fn register_partition_table(test_ctx: &mut TestContext) {
     test_ctx.enable_testdir();
@@ -346,185 +249,3 @@ pub async fn register_partition_table(test_ctx: &mut TestContext) {
         .await
         .unwrap();
 }
-
-pub async fn register_hashjoin_datatype_table(ctx: &SessionContext) {
-    let t1_schema = Schema::new(vec![
-        Field::new("c1", DataType::Date32, true),
-        Field::new("c2", DataType::Date64, true),
-        Field::new("c3", DataType::Decimal128(5, 2), true),
-        Field::new(
-            "c4",
-            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
-            true,
-        ),
-    ]);
-    let dict1: DictionaryArray<Int32Type> =
-        vec!["abc", "def", "ghi", "jkl"].into_iter().collect();
-    let t1_data = RecordBatch::try_new(
-        Arc::new(t1_schema),
-        vec![
-            Arc::new(Date32Array::from(vec![Some(1), Some(2), None, Some(3)])),
-            Arc::new(Date64Array::from(vec![
-                Some(86400000),
-                Some(172800000),
-                Some(259200000),
-                None,
-            ])),
-            Arc::new(
-                Decimal128Array::from_iter_values([123, 45600, 78900, -12312])
-                    .with_precision_and_scale(5, 2)
-                    .unwrap(),
-            ),
-            Arc::new(dict1),
-        ],
-    )
-    .unwrap();
-    ctx.register_batch("hashjoin_datatype_table_t1", t1_data)
-        .unwrap();
-
-    let t2_schema = Schema::new(vec![
-        Field::new("c1", DataType::Date32, true),
-        Field::new("c2", DataType::Date64, true),
-        Field::new("c3", DataType::Decimal128(10, 2), true),
-        Field::new(
-            "c4",
-            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
-            true,
-        ),
-    ]);
-    let dict2: DictionaryArray<Int32Type> = vec!["abc", "abcdefg", "qwerty", "qwe"]
-        .into_iter()
-        .collect();
-    let t2_data = RecordBatch::try_new(
-        Arc::new(t2_schema),
-        vec![
-            Arc::new(Date32Array::from(vec![Some(1), None, None, Some(3)])),
-            Arc::new(Date64Array::from(vec![
-                Some(86400000),
-                None,
-                Some(259200000),
-                None,
-            ])),
-            Arc::new(
-                Decimal128Array::from_iter_values([-12312, 10000000, 0, 78900])
-                    .with_precision_and_scale(10, 2)
-                    .unwrap(),
-            ),
-            Arc::new(dict2),
-        ],
-    )
-    .unwrap();
-    ctx.register_batch("hashjoin_datatype_table_t2", t2_data)
-        .unwrap();
-}
-
-pub async fn register_left_semi_anti_join_table(ctx: &SessionContext) {
-    let t1_schema = Arc::new(Schema::new(vec![
-        Field::new("t1_id", DataType::UInt32, true),
-        Field::new("t1_name", DataType::Utf8, true),
-        Field::new("t1_int", DataType::UInt32, true),
-    ]));
-    let t1_data = RecordBatch::try_new(
-        t1_schema,
-        vec![
-            Arc::new(UInt32Array::from(vec![
-                Some(11),
-                Some(11),
-                Some(22),
-                Some(33),
-                Some(44),
-                None,
-            ])),
-            Arc::new(StringArray::from(vec![
-                Some("a"),
-                Some("a"),
-                Some("b"),
-                Some("c"),
-                Some("d"),
-                Some("e"),
-            ])),
-            Arc::new(UInt32Array::from(vec![1, 1, 2, 3, 4, 0])),
-        ],
-    )
-    .unwrap();
-    ctx.register_batch("left_semi_anti_join_table_t1", t1_data)
-        .unwrap();
-
-    let t2_schema = Arc::new(Schema::new(vec![
-        Field::new("t2_id", DataType::UInt32, true),
-        Field::new("t2_name", DataType::Utf8, true),
-        Field::new("t2_int", DataType::UInt32, true),
-    ]));
-    let t2_data = RecordBatch::try_new(
-        t2_schema,
-        vec![
-            Arc::new(UInt32Array::from(vec![
-                Some(11),
-                Some(11),
-                Some(22),
-                Some(44),
-                Some(55),
-                None,
-            ])),
-            Arc::new(StringArray::from(vec![
-                Some("z"),
-                Some("z"),
-                Some("y"),
-                Some("x"),
-                Some("w"),
-                Some("v"),
-            ])),
-            Arc::new(UInt32Array::from(vec![3, 3, 1, 3, 3, 0])),
-        ],
-    )
-    .unwrap();
-    ctx.register_batch("left_semi_anti_join_table_t2", t2_data)
-        .unwrap();
-}
-
-pub async fn register_right_semi_anti_join_table(ctx: &SessionContext) {
-    let t1_schema = Arc::new(Schema::new(vec![
-        Field::new("t1_id", DataType::UInt32, true),
-        Field::new("t1_name", DataType::Utf8, true),
-        Field::new("t1_int", DataType::UInt32, true),
-    ]));
-    let t1_data = RecordBatch::try_new(
-        t1_schema,
-        vec![
-            Arc::new(UInt32Array::from(vec![
-                Some(11),
-                Some(22),
-                Some(33),
-                Some(44),
-                None,
-            ])),
-            Arc::new(StringArray::from(vec![
-                Some("a"),
-                Some("b"),
-                Some("c"),
-                Some("d"),
-                Some("e"),
-            ])),
-            Arc::new(UInt32Array::from(vec![1, 2, 3, 4, 0])),
-        ],
-    )
-    .unwrap();
-    ctx.register_batch("right_semi_anti_join_table_t1", t1_data)
-        .unwrap();
-
-    let t2_schema = Arc::new(Schema::new(vec![
-        Field::new("t2_id", DataType::UInt32, true),
-        Field::new("t2_name", DataType::Utf8, true),
-    ]));
-    // t2 data size is smaller than t1
-    let t2_data = RecordBatch::try_new(
-        t2_schema,
-        vec![
-            Arc::new(UInt32Array::from(vec![Some(11), Some(11), None])),
-            Arc::new(StringArray::from(vec![Some("a"), Some("x"), None])),
-        ],
-    )
-    .unwrap();
-    ctx.register_batch("right_semi_anti_join_table_t2", t2_data)
-        .unwrap();
-}
diff --git a/datafusion/core/tests/sqllogictests/test_files/joins.slt b/datafusion/core/tests/sqllogictests/test_files/joins.slt
index 4486d7c47b..56c0c97701 100644
--- a/datafusion/core/tests/sqllogictests/test_files/joins.slt
+++ b/datafusion/core/tests/sqllogictests/test_files/joins.slt
@@ -75,6 +75,113 @@ AS VALUES
 (55, 'w', 3),
 (NULL, 'v', 0);
 
+statement ok
+CREATE TABLE left_semi_anti_join_table_t1(t1_id INT UNSIGNED, t1_name VARCHAR, t1_int INT UNSIGNED)
+AS VALUES
+(11, 'a', 1),
+(11, 'a', 1),
+(22, 'b', 2),
+(33, 'c', 3),
+(44, 'd', 4),
+(NULL, 'e', 0);
+
+statement ok
+CREATE TABLE left_semi_anti_join_table_t2(t2_id INT UNSIGNED, t2_name VARCHAR, t2_int INT UNSIGNED)
+AS VALUES
+(11, 'z', 3),
+(11, 'z', 3),
+(22, 'y', 1),
+(44, 'x', 3),
+(55, 'w', 3),
+(NULL, 'v', 0);
+
+
+statement ok
+CREATE TABLE right_semi_anti_join_table_t1(t1_id INT UNSIGNED, t1_name VARCHAR, t1_int INT UNSIGNED)
+AS VALUES
+(11, 'a', 1),
+(22, 'b', 2),
+(33, 'c', 3),
+(44, 'd', 4),
+(NULL, 'e', 0);
+
+statement ok
+CREATE TABLE right_semi_anti_join_table_t2(t2_id INT UNSIGNED, t2_name VARCHAR)
+AS VALUES
+(11, 'a'),
+(11, 'x'),
+(NULL, NULL);
+
+# Table with all of the supported timestamp types values
+#
+# Columns are named:
+# "nanos" --> TimestampNanosecondArray
+# "micros" --> TimestampMicrosecondArray
+# "millis" --> TimestampMillisecondArray
+# "secs" --> TimestampSecondArray
+# "names" --> StringArray
+statement ok
+CREATE TABLE test_timestamps_table_source(ts varchar, names varchar)
+AS VALUES
+('2018-11-13T17:11:10.011375885995', 'Row 0'),
+('2011-12-13T11:13:10.12345', 'Row 1'),
+(NULL, 'Row 2'),
+('2021-01-01T05:11:10.432', 'Row 3');
+
+
+statement ok
+CREATE TABLE test_timestamps_table as
+SELECT
+  arrow_cast(ts::timestamp::bigint, 'Timestamp(Nanosecond, None)') as nanos,
+  arrow_cast(ts::timestamp::bigint / 1000, 'Timestamp(Microsecond, None)') as micros,
+  arrow_cast(ts::timestamp::bigint / 1000000, 'Timestamp(Millisecond, None)') as millis,
+  arrow_cast(ts::timestamp::bigint / 1000000000, 'Timestamp(Second, None)') as secs,
+  names
+FROM
+  test_timestamps_table_source;
+
+
+
+statement ok
+CREATE TABLE hashjoin_datatype_table_t1_source(c1 INT, c2 BIGINT, c3 DECIMAL(5,2), c4 VARCHAR)
+AS VALUES
+(1,    86400000,  1.23,    'abc'),
+(2,    172800000, 456.00,  'def'),
+(null, 259200000, 789.000, 'ghi'),
+(3,    null,      -123.12, 'jkl')
+;
+
+statement ok
+CREATE TABLE hashjoin_datatype_table_t1
+AS SELECT
+  arrow_cast(c1, 'Date32') as c1,
+  arrow_cast(c2, 'Date64') as c2,
+  c3,
+  arrow_cast(c4, 'Dictionary(Int32, Utf8)') as c4
+FROM
+  hashjoin_datatype_table_t1_source
+
+statement ok
+CREATE TABLE hashjoin_datatype_table_t2_source(c1 INT, c2 BIGINT, c3 DECIMAL(10,2), c4 VARCHAR)
+AS VALUES
+(1,    86400000,  -123.12,   'abc'),
+(null, null,      100000.00, 'abcdefg'),
+(null, 259200000, 0.00,      'qwerty'),
+(3,   null,       789.000,   'qwe')
+;
+
+statement ok
+CREATE TABLE hashjoin_datatype_table_t2
+AS SELECT
+  arrow_cast(c1, 'Date32') as c1,
+  arrow_cast(c2, 'Date64') as c2,
+  c3,
+  arrow_cast(c4, 'Dictionary(Int32, Utf8)') as c4
+FROM
+  hashjoin_datatype_table_t2_source
+
+
+
 statement ok
 set datafusion.execution.target_partitions = 2;
 
@@ -2338,6 +2445,15 @@ WHERE NOT EXISTS(
 statement ok
 set datafusion.explain.logical_plan_only = false;
 
+# show the contents of the timestamp table
+query PPPPT
+select * from
+test_timestamps_table
+----
+2018-11-13T17:11:10.011375885 2018-11-13T17:11:10.011375 2018-11-13T17:11:10.011 2018-11-13T17:11:10 Row 0
+2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123 2011-12-13T11:13:10 Row 1
+NULL NULL NULL NULL Row 2
+2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10 Row 3
 
 # test timestamp join on nanos datatype
 query PPPPTPPPPT rowsort
@@ -2936,4 +3052,3 @@ set datafusion.execution.target_partitions = 2;
 
 statement ok
 set datafusion.execution.batch_size = 4096;
-