Posted to commits@arrow.apache.org by al...@apache.org on 2023/06/13 12:37:55 UTC
[arrow-datafusion] branch main updated: minor: use sql to setup test data for joins.slt rather than rust (#6656)
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 2ba067c8d2 minor: use sql to setup test data for joins.slt rather than rust (#6656)
2ba067c8d2 is described below
commit 2ba067c8d2d7c3029ae2d6e4998dd2e4cd9d504f
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Tue Jun 13 08:37:47 2023 -0400
minor: use sql to setup test data for joins.slt rather than rust (#6656)
* port register_right_semi_anti_join_table to sql
* port register_left_semi_anti_join_table
* port register_timestamps_table
* Port register_hashjoin_datatype_table to sql
* fix comment
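
As a rough sketch of the pattern this change applies (the table and column names below are illustrative, not part of the commit): each table previously built in Rust as a RecordBatch/MemTable and registered on the SessionContext in setup.rs is now declared directly in joins.slt, with typed columns recovered from a plain VALUES source table via arrow_cast, for example:

    statement ok
    CREATE TABLE example_source(ts VARCHAR, name VARCHAR)
    AS VALUES
    ('2018-11-13T17:11:10.123', 'Row 0'),
    (NULL, 'Row 1');

    statement ok
    CREATE TABLE example_typed
    AS SELECT
      arrow_cast(ts::timestamp::bigint, 'Timestamp(Nanosecond, None)') AS nanos,
      arrow_cast(ts::timestamp::bigint / 1000000000, 'Timestamp(Second, None)') AS secs,
      arrow_cast(name, 'Dictionary(Int32, Utf8)') AS name
    FROM example_source;

This mirrors the statements added to joins.slt below and removes the need for the setup::register_*_table helpers deleted from setup.rs.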
---
datafusion/core/tests/sqllogictests/src/main.rs | 6 +-
datafusion/core/tests/sqllogictests/src/setup.rs | 279 ---------------------
.../core/tests/sqllogictests/test_files/joins.slt | 117 ++++++++-
3 files changed, 117 insertions(+), 285 deletions(-)
diff --git a/datafusion/core/tests/sqllogictests/src/main.rs b/datafusion/core/tests/sqllogictests/src/main.rs
index 6f17010a39..adb3be9a6e 100644
--- a/datafusion/core/tests/sqllogictests/src/main.rs
+++ b/datafusion/core/tests/sqllogictests/src/main.rs
@@ -286,11 +286,7 @@ async fn context_for_test_file(relative_path: &Path) -> Option<TestContext> {
}
}
"joins.slt" => {
- info!("Registering timestamps tables");
- setup::register_timestamps_table(test_ctx.session_ctx()).await;
- setup::register_hashjoin_datatype_table(test_ctx.session_ctx()).await;
- setup::register_left_semi_anti_join_table(test_ctx.session_ctx()).await;
- setup::register_right_semi_anti_join_table(test_ctx.session_ctx()).await;
+ info!("Registering partition table tables");
let mut test_ctx = test_ctx;
setup::register_partition_table(&mut test_ctx).await;
diff --git a/datafusion/core/tests/sqllogictests/src/setup.rs b/datafusion/core/tests/sqllogictests/src/setup.rs
index 96090d676a..26f9d2501a 100644
--- a/datafusion/core/tests/sqllogictests/src/setup.rs
+++ b/datafusion/core/tests/sqllogictests/src/setup.rs
@@ -15,12 +15,6 @@
// specific language governing permissions and limitations
// under the License.
-use arrow_array::types::Int32Type;
-use arrow_array::{
- Array, Date32Array, Date64Array, Decimal128Array, DictionaryArray, StringArray,
- TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
- TimestampSecondArray,
-};
use datafusion::{
arrow::{
array::{
@@ -221,97 +215,6 @@ fn register_nan_table(ctx: &SessionContext) {
ctx.register_batch("test_float", data).unwrap();
}
-pub async fn register_timestamps_table(ctx: &SessionContext) {
- let batch = make_timestamps();
- let schema = batch.schema();
- let partitions = vec![vec![batch]];
-
- ctx.register_table(
- "test_timestamps_table",
- Arc::new(MemTable::try_new(schema, partitions).unwrap()),
- )
- .unwrap();
-}
-
-/// Return record batch with all of the supported timestamp types
-/// values
-///
-/// Columns are named:
-/// "nanos" --> TimestampNanosecondArray
-/// "micros" --> TimestampMicrosecondArray
-/// "millis" --> TimestampMillisecondArray
-/// "secs" --> TimestampSecondArray
-/// "names" --> StringArray
-pub fn make_timestamps() -> RecordBatch {
- let ts_strings = vec![
- Some("2018-11-13T17:11:10.011375885995"),
- Some("2011-12-13T11:13:10.12345"),
- None,
- Some("2021-1-1T05:11:10.432"),
- ];
-
- let ts_nanos = ts_strings
- .into_iter()
- .map(|t| {
- t.map(|t| {
- t.parse::<chrono::NaiveDateTime>()
- .unwrap()
- .timestamp_nanos()
- })
- })
- .collect::<Vec<_>>();
-
- let ts_micros = ts_nanos
- .iter()
- .map(|t| t.as_ref().map(|ts_nanos| ts_nanos / 1000))
- .collect::<Vec<_>>();
-
- let ts_millis = ts_nanos
- .iter()
- .map(|t| t.as_ref().map(|ts_nanos| ts_nanos / 1000000))
- .collect::<Vec<_>>();
-
- let ts_secs = ts_nanos
- .iter()
- .map(|t| t.as_ref().map(|ts_nanos| ts_nanos / 1000000000))
- .collect::<Vec<_>>();
-
- let names = ts_nanos
- .iter()
- .enumerate()
- .map(|(i, _)| format!("Row {i}"))
- .collect::<Vec<_>>();
-
- let arr_nanos = TimestampNanosecondArray::from(ts_nanos);
- let arr_micros = TimestampMicrosecondArray::from(ts_micros);
- let arr_millis = TimestampMillisecondArray::from(ts_millis);
- let arr_secs = TimestampSecondArray::from(ts_secs);
-
- let names = names.iter().map(|s| s.as_str()).collect::<Vec<_>>();
- let arr_names = StringArray::from(names);
-
- let schema = Schema::new(vec![
- Field::new("nanos", arr_nanos.data_type().clone(), true),
- Field::new("micros", arr_micros.data_type().clone(), true),
- Field::new("millis", arr_millis.data_type().clone(), true),
- Field::new("secs", arr_secs.data_type().clone(), true),
- Field::new("name", arr_names.data_type().clone(), true),
- ]);
- let schema = Arc::new(schema);
-
- RecordBatch::try_new(
- schema,
- vec![
- Arc::new(arr_nanos),
- Arc::new(arr_micros),
- Arc::new(arr_millis),
- Arc::new(arr_secs),
- Arc::new(arr_names),
- ],
- )
- .unwrap()
-}
-
/// Generate a partitioned CSV file and register it with an execution context
pub async fn register_partition_table(test_ctx: &mut TestContext) {
test_ctx.enable_testdir();
@@ -346,185 +249,3 @@ pub async fn register_partition_table(test_ctx: &mut TestContext) {
.await
.unwrap();
}
-
-pub async fn register_hashjoin_datatype_table(ctx: &SessionContext) {
- let t1_schema = Schema::new(vec![
- Field::new("c1", DataType::Date32, true),
- Field::new("c2", DataType::Date64, true),
- Field::new("c3", DataType::Decimal128(5, 2), true),
- Field::new(
- "c4",
- DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
- true,
- ),
- ]);
- let dict1: DictionaryArray<Int32Type> =
- vec!["abc", "def", "ghi", "jkl"].into_iter().collect();
- let t1_data = RecordBatch::try_new(
- Arc::new(t1_schema),
- vec![
- Arc::new(Date32Array::from(vec![Some(1), Some(2), None, Some(3)])),
- Arc::new(Date64Array::from(vec![
- Some(86400000),
- Some(172800000),
- Some(259200000),
- None,
- ])),
- Arc::new(
- Decimal128Array::from_iter_values([123, 45600, 78900, -12312])
- .with_precision_and_scale(5, 2)
- .unwrap(),
- ),
- Arc::new(dict1),
- ],
- )
- .unwrap();
- ctx.register_batch("hashjoin_datatype_table_t1", t1_data)
- .unwrap();
-
- let t2_schema = Schema::new(vec![
- Field::new("c1", DataType::Date32, true),
- Field::new("c2", DataType::Date64, true),
- Field::new("c3", DataType::Decimal128(10, 2), true),
- Field::new(
- "c4",
- DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
- true,
- ),
- ]);
- let dict2: DictionaryArray<Int32Type> = vec!["abc", "abcdefg", "qwerty", "qwe"]
- .into_iter()
- .collect();
- let t2_data = RecordBatch::try_new(
- Arc::new(t2_schema),
- vec![
- Arc::new(Date32Array::from(vec![Some(1), None, None, Some(3)])),
- Arc::new(Date64Array::from(vec![
- Some(86400000),
- None,
- Some(259200000),
- None,
- ])),
- Arc::new(
- Decimal128Array::from_iter_values([-12312, 10000000, 0, 78900])
- .with_precision_and_scale(10, 2)
- .unwrap(),
- ),
- Arc::new(dict2),
- ],
- )
- .unwrap();
- ctx.register_batch("hashjoin_datatype_table_t2", t2_data)
- .unwrap();
-}
-
-pub async fn register_left_semi_anti_join_table(ctx: &SessionContext) {
- let t1_schema = Arc::new(Schema::new(vec![
- Field::new("t1_id", DataType::UInt32, true),
- Field::new("t1_name", DataType::Utf8, true),
- Field::new("t1_int", DataType::UInt32, true),
- ]));
- let t1_data = RecordBatch::try_new(
- t1_schema,
- vec![
- Arc::new(UInt32Array::from(vec![
- Some(11),
- Some(11),
- Some(22),
- Some(33),
- Some(44),
- None,
- ])),
- Arc::new(StringArray::from(vec![
- Some("a"),
- Some("a"),
- Some("b"),
- Some("c"),
- Some("d"),
- Some("e"),
- ])),
- Arc::new(UInt32Array::from(vec![1, 1, 2, 3, 4, 0])),
- ],
- )
- .unwrap();
- ctx.register_batch("left_semi_anti_join_table_t1", t1_data)
- .unwrap();
-
- let t2_schema = Arc::new(Schema::new(vec![
- Field::new("t2_id", DataType::UInt32, true),
- Field::new("t2_name", DataType::Utf8, true),
- Field::new("t2_int", DataType::UInt32, true),
- ]));
- let t2_data = RecordBatch::try_new(
- t2_schema,
- vec![
- Arc::new(UInt32Array::from(vec![
- Some(11),
- Some(11),
- Some(22),
- Some(44),
- Some(55),
- None,
- ])),
- Arc::new(StringArray::from(vec![
- Some("z"),
- Some("z"),
- Some("y"),
- Some("x"),
- Some("w"),
- Some("v"),
- ])),
- Arc::new(UInt32Array::from(vec![3, 3, 1, 3, 3, 0])),
- ],
- )
- .unwrap();
- ctx.register_batch("left_semi_anti_join_table_t2", t2_data)
- .unwrap();
-}
-
-pub async fn register_right_semi_anti_join_table(ctx: &SessionContext) {
- let t1_schema = Arc::new(Schema::new(vec![
- Field::new("t1_id", DataType::UInt32, true),
- Field::new("t1_name", DataType::Utf8, true),
- Field::new("t1_int", DataType::UInt32, true),
- ]));
- let t1_data = RecordBatch::try_new(
- t1_schema,
- vec![
- Arc::new(UInt32Array::from(vec![
- Some(11),
- Some(22),
- Some(33),
- Some(44),
- None,
- ])),
- Arc::new(StringArray::from(vec![
- Some("a"),
- Some("b"),
- Some("c"),
- Some("d"),
- Some("e"),
- ])),
- Arc::new(UInt32Array::from(vec![1, 2, 3, 4, 0])),
- ],
- )
- .unwrap();
- ctx.register_batch("right_semi_anti_join_table_t1", t1_data)
- .unwrap();
-
- let t2_schema = Arc::new(Schema::new(vec![
- Field::new("t2_id", DataType::UInt32, true),
- Field::new("t2_name", DataType::Utf8, true),
- ]));
- // t2 data size is smaller than t1
- let t2_data = RecordBatch::try_new(
- t2_schema,
- vec![
- Arc::new(UInt32Array::from(vec![Some(11), Some(11), None])),
- Arc::new(StringArray::from(vec![Some("a"), Some("x"), None])),
- ],
- )
- .unwrap();
- ctx.register_batch("right_semi_anti_join_table_t2", t2_data)
- .unwrap();
-}
diff --git a/datafusion/core/tests/sqllogictests/test_files/joins.slt b/datafusion/core/tests/sqllogictests/test_files/joins.slt
index 4486d7c47b..56c0c97701 100644
--- a/datafusion/core/tests/sqllogictests/test_files/joins.slt
+++ b/datafusion/core/tests/sqllogictests/test_files/joins.slt
@@ -75,6 +75,113 @@ AS VALUES
(55, 'w', 3),
(NULL, 'v', 0);
+statement ok
+CREATE TABLE left_semi_anti_join_table_t1(t1_id INT UNSIGNED, t1_name VARCHAR, t1_int INT UNSIGNED)
+AS VALUES
+(11, 'a', 1),
+(11, 'a', 1),
+(22, 'b', 2),
+(33, 'c', 3),
+(44, 'd', 4),
+(NULL, 'e', 0);
+
+statement ok
+CREATE TABLE left_semi_anti_join_table_t2(t2_id INT UNSIGNED, t2_name VARCHAR, t2_int INT UNSIGNED)
+AS VALUES
+(11, 'z', 3),
+(11, 'z', 3),
+(22, 'y', 1),
+(44, 'x', 3),
+(55, 'w', 3),
+(NULL, 'v', 0);
+
+
+statement ok
+CREATE TABLE right_semi_anti_join_table_t1(t1_id INT UNSIGNED, t1_name VARCHAR, t1_int INT UNSIGNED)
+AS VALUES
+(11, 'a', 1),
+(22, 'b', 2),
+(33, 'c', 3),
+(44, 'd', 4),
+(NULL, 'e', 0);
+
+statement ok
+CREATE TABLE right_semi_anti_join_table_t2(t2_id INT UNSIGNED, t2_name VARCHAR)
+AS VALUES
+(11, 'a'),
+(11, 'x'),
+(NULL, NULL);
+
+# Table with all of the supported timestamp types values
+#
+# Columns are named:
+# "nanos" --> TimestampNanosecondArray
+# "micros" --> TimestampMicrosecondArray
+# "millis" --> TimestampMillisecondArray
+# "secs" --> TimestampSecondArray
+# "names" --> StringArray
+statement ok
+CREATE TABLE test_timestamps_table_source(ts varchar, names varchar)
+AS VALUES
+('2018-11-13T17:11:10.011375885995', 'Row 0'),
+('2011-12-13T11:13:10.12345', 'Row 1'),
+(NULL, 'Row 2'),
+('2021-01-01T05:11:10.432', 'Row 3');
+
+
+statement ok
+CREATE TABLE test_timestamps_table as
+SELECT
+ arrow_cast(ts::timestamp::bigint, 'Timestamp(Nanosecond, None)') as nanos,
+ arrow_cast(ts::timestamp::bigint / 1000, 'Timestamp(Microsecond, None)') as micros,
+ arrow_cast(ts::timestamp::bigint / 1000000, 'Timestamp(Millisecond, None)') as millis,
+ arrow_cast(ts::timestamp::bigint / 1000000000, 'Timestamp(Second, None)') as secs,
+ names
+FROM
+ test_timestamps_table_source;
+
+
+
+statement ok
+CREATE TABLE hashjoin_datatype_table_t1_source(c1 INT, c2 BIGINT, c3 DECIMAL(5,2), c4 VARCHAR)
+AS VALUES
+(1, 86400000, 1.23, 'abc'),
+(2, 172800000, 456.00, 'def'),
+(null, 259200000, 789.000, 'ghi'),
+(3, null, -123.12, 'jkl')
+;
+
+statement ok
+CREATE TABLE hashjoin_datatype_table_t1
+AS SELECT
+ arrow_cast(c1, 'Date32') as c1,
+ arrow_cast(c2, 'Date64') as c2,
+ c3,
+ arrow_cast(c4, 'Dictionary(Int32, Utf8)') as c4
+FROM
+ hashjoin_datatype_table_t1_source
+
+statement ok
+CREATE TABLE hashjoin_datatype_table_t2_source(c1 INT, c2 BIGINT, c3 DECIMAL(10,2), c4 VARCHAR)
+AS VALUES
+(1, 86400000, -123.12, 'abc'),
+(null, null, 100000.00, 'abcdefg'),
+(null, 259200000, 0.00, 'qwerty'),
+(3, null, 789.000, 'qwe')
+;
+
+statement ok
+CREATE TABLE hashjoin_datatype_table_t2
+AS SELECT
+ arrow_cast(c1, 'Date32') as c1,
+ arrow_cast(c2, 'Date64') as c2,
+ c3,
+ arrow_cast(c4, 'Dictionary(Int32, Utf8)') as c4
+FROM
+ hashjoin_datatype_table_t2_source
+
+
+
statement ok
set datafusion.execution.target_partitions = 2;
@@ -2338,6 +2445,15 @@ WHERE NOT EXISTS(
statement ok
set datafusion.explain.logical_plan_only = false;
+# show the contents of the timestamp table
+query PPPPT
+select * from
+test_timestamps_table
+----
+2018-11-13T17:11:10.011375885 2018-11-13T17:11:10.011375 2018-11-13T17:11:10.011 2018-11-13T17:11:10 Row 0
+2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123 2011-12-13T11:13:10 Row 1
+NULL NULL NULL NULL Row 2
+2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10 Row 3
# test timestamp join on nanos datatype
query PPPPTPPPPT rowsort
@@ -2936,4 +3052,3 @@ set datafusion.execution.target_partitions = 2;
statement ok
set datafusion.execution.batch_size = 4096;
-