You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ag...@apache.org on 2023/01/24 18:59:29 UTC
[arrow-datafusion-python] branch main updated: test: Expand tests for built-in functions (#129)
This is an automated email from the ASF dual-hosted git repository.
agrove pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion-python.git
The following commit(s) were added to refs/heads/main by this push:
new da3a072 test: Expand tests for built-in functions (#129)
da3a072 is described below
commit da3a07253e3fefc5a78b27f3fa17fcbbf93f0403
Author: Dejan Simic <10...@users.noreply.github.com>
AuthorDate: Tue Jan 24 19:59:25 2023 +0100
test: Expand tests for built-in functions (#129)
* Expand math function tests
* Expand string function tests
* Expand hash function tests
* Add temporal function tests
* Run formatter and fix failing tests after branch update
---
datafusion/tests/test_functions.py | 199 +++++++++++++++++++++++++++++++++++--
1 file changed, 190 insertions(+), 9 deletions(-)
diff --git a/datafusion/tests/test_functions.py b/datafusion/tests/test_functions.py
index daa2f19..76edfa2 100644
--- a/datafusion/tests/test_functions.py
+++ b/datafusion/tests/test_functions.py
@@ -18,6 +18,7 @@
import numpy as np
import pyarrow as pa
import pytest
+from datetime import datetime
from datafusion import SessionContext, column
from datafusion import functions as f
@@ -29,8 +30,19 @@ def df():
ctx = SessionContext()
# create a RecordBatch and a new DataFrame from it
batch = pa.RecordBatch.from_arrays(
- [pa.array(["Hello", "World", "!"]), pa.array([4, 5, 6])],
- names=["a", "b"],
+ [
+ pa.array(["Hello", "World", "!"]),
+ pa.array([4, 5, 6]),
+ pa.array(["hello ", " world ", " !"]),
+ pa.array(
+ [
+ datetime(2022, 12, 31),
+ datetime(2027, 6, 26),
+ datetime(2020, 7, 2),
+ ]
+ ),
+ ],
+ names=["a", "b", "c", "d"],
)
return ctx.create_dataframe([[batch]])
@@ -91,6 +103,16 @@ def test_math_functions():
f.log2(col_v + literal(pa.scalar(1))),
f.log10(col_v + literal(pa.scalar(1))),
f.random(),
+ f.atan(col_v),
+ f.atan2(col_v, literal(pa.scalar(1.1))),
+ f.ceil(col_v),
+ f.floor(col_v),
+ f.power(col_v, literal(pa.scalar(3))),
+ f.pow(col_v, literal(pa.scalar(4))),
+ f.round(col_v),
+ f.sqrt(col_v),
+ f.signum(col_v),
+ f.trunc(col_v),
)
batches = df.collect()
assert len(batches) == 1
@@ -113,29 +135,116 @@ def test_math_functions():
result.column(9), np.log10(values + 1.0)
)
np.testing.assert_array_less(result.column(10), np.ones_like(values))
+ np.testing.assert_array_almost_equal(result.column(11), np.arctan(values))
+ np.testing.assert_array_almost_equal(
+ result.column(12), np.arctan2(values, 1.1)
+ )
+ np.testing.assert_array_almost_equal(result.column(13), np.ceil(values))
+ np.testing.assert_array_almost_equal(result.column(14), np.floor(values))
+ np.testing.assert_array_almost_equal(
+ result.column(15), np.power(values, 3)
+ )
+ np.testing.assert_array_almost_equal(
+ result.column(16), np.power(values, 4)
+ )
+ np.testing.assert_array_almost_equal(result.column(17), np.round(values))
+ np.testing.assert_array_almost_equal(result.column(18), np.sqrt(values))
+ np.testing.assert_array_almost_equal(result.column(19), np.sign(values))
+ np.testing.assert_array_almost_equal(result.column(20), np.trunc(values))
def test_string_functions(df):
- df = df.select(f.md5(column("a")), f.lower(column("a")))
+ df = df.select(
+ f.ascii(column("a")),
+ f.bit_length(column("a")),
+ f.btrim(literal(" World ")),
+ f.character_length(column("a")),
+ f.chr(literal(68)),
+ f.concat_ws("-", column("a"), literal("test")),
+ f.concat(column("a"), literal("?")),
+ f.initcap(column("c")),
+ f.left(column("a"), literal(3)),
+ f.length(column("c")),
+ f.lower(column("a")),
+ f.lpad(column("a"), literal(7)),
+ f.ltrim(column("c")),
+ f.md5(column("a")),
+ f.octet_length(column("a")),
+ f.repeat(column("a"), literal(2)),
+ f.replace(column("a"), literal("l"), literal("?")),
+ f.reverse(column("a")),
+ f.right(column("a"), literal(4)),
+ f.rpad(column("a"), literal(8)),
+ f.rtrim(column("c")),
+ f.split_part(column("a"), literal("l"), literal(1)),
+ f.starts_with(column("a"), literal("Wor")),
+ f.strpos(column("a"), literal("o")),
+ f.substr(column("a"), literal(3)),
+ f.translate(column("a"), literal("or"), literal("ld")),
+ f.trim(column("c")),
+ f.upper(column("c")),
+ )
result = df.collect()
assert len(result) == 1
result = result[0]
assert result.column(0) == pa.array(
+ [72, 87, 33], type=pa.int32()
+ ) # H = 72; W = 87; ! = 33
+ assert result.column(1) == pa.array([40, 40, 8], type=pa.int32())
+ assert result.column(2) == pa.array(["World", "World", "World"])
+ assert result.column(3) == pa.array([5, 5, 1], type=pa.int32())
+ assert result.column(4) == pa.array(["D", "D", "D"])
+ assert result.column(5) == pa.array(["Hello-test", "World-test", "!-test"])
+ assert result.column(6) == pa.array(["Hello?", "World?", "!?"])
+ assert result.column(7) == pa.array(["Hello ", " World ", " !"])
+ assert result.column(8) == pa.array(["Hel", "Wor", "!"])
+ assert result.column(9) == pa.array([6, 7, 2], type=pa.int32())
+ assert result.column(10) == pa.array(["hello", "world", "!"])
+ assert result.column(11) == pa.array(["  Hello", "  World", "      !"])
+ assert result.column(12) == pa.array(["hello ", "world ", "!"])
+ assert result.column(13) == pa.array(
[
"8b1a9953c4611296a827abf8c47804d7",
"f5a7924e621e84c9280a9a27e1bcb7f6",
"9033e0e305f247c0c3c80d0c7848c8b3",
]
)
- assert result.column(1) == pa.array(["hello", "world", "!"])
+ assert result.column(14) == pa.array([5, 5, 1], type=pa.int32())
+ assert result.column(15) == pa.array(["HelloHello", "WorldWorld", "!!"])
+ assert result.column(16) == pa.array(["He??o", "Wor?d", "!"])
+ assert result.column(17) == pa.array(["olleH", "dlroW", "!"])
+ assert result.column(18) == pa.array(["ello", "orld", "!"])
+ assert result.column(19) == pa.array(["Hello   ", "World   ", "!       "])
+ assert result.column(20) == pa.array(["hello", " world", " !"])
+ assert result.column(21) == pa.array(["He", "Wor", "!"])
+ assert result.column(22) == pa.array([False, True, False])
+ assert result.column(23) == pa.array([5, 2, 0], type=pa.int32())
+ assert result.column(24) == pa.array(["llo", "rld", ""])
+ assert result.column(25) == pa.array(["Helll", "Wldld", "!"])
+ assert result.column(26) == pa.array(["hello", "world", "!"])
+ assert result.column(27) == pa.array(["HELLO ", " WORLD ", " !"])
def test_hash_functions(df):
exprs = [
f.digest(column("a"), literal(m))
- for m in ("md5", "sha256", "sha512", "blake2s", "blake3")
+ for m in (
+ "md5",
+ "sha224",
+ "sha256",
+ "sha384",
+ "sha512",
+ "blake2s",
+ "blake3",
+ )
]
- df = df.select(*exprs)
+ df = df.select(
+ *exprs,
+ f.sha224(column("a")),
+ f.sha256(column("a")),
+ f.sha384(column("a")),
+ f.sha512(column("a")),
+ )
result = df.collect()
assert len(result) == 1
result = result[0]
@@ -148,6 +257,13 @@ def test_hash_functions(df):
]
)
assert result.column(1) == pa.array(
+ [
+ b("4149DA18AA8BFC2B1E382C6C26556D01A92C261B6436DAD5E3BE3FCC"),
+ b("12972632B6D3B6AA52BD6434552F08C1303D56B817119406466E9236"),
+ b("6641A7E8278BCD49E476E7ACAE158F4105B2952D22AEB2E0B9A231A0"),
+ ]
+ )
+ assert result.column(2) == pa.array(
[
b(
"185F8DB32271FE25F561A6FC938B2E26"
@@ -163,7 +279,26 @@ def test_hash_functions(df):
),
]
)
- assert result.column(2) == pa.array(
+ assert result.column(3) == pa.array(
+ [
+ b(
+ "3519FE5AD2C596EFE3E276A6F351B8FC"
+ "0B03DB861782490D45F7598EBD0AB5FD"
+ "5520ED102F38C4A5EC834E98668035FC"
+ ),
+ b(
+ "ED7CED84875773603AF90402E42C65F3"
+ "B48A5E77F84ADC7A19E8F3E8D3101010"
+ "22F552AEC70E9E1087B225930C1D260A"
+ ),
+ b(
+ "1D0EC8C84EE9521E21F06774DE232367"
+ "B64DE628474CB5B2E372B699A1F55AE3"
+ "35CC37193EF823E33324DFD9A70738A6"
+ ),
+ ]
+ )
+ assert result.column(4) == pa.array(
[
b(
"3615F80C9D293ED7402687F94B22D58E"
@@ -185,7 +320,7 @@ def test_hash_functions(df):
),
]
)
- assert result.column(3) == pa.array(
+ assert result.column(5) == pa.array(
[
b(
"F73A5FBF881F89B814871F46E26AD3FA"
@@ -201,7 +336,7 @@ def test_hash_functions(df):
),
]
)
- assert result.column(4) == pa.array(
+ assert result.column(6) == pa.array(
[
b(
"FBC2B0516EE8744D293B980779178A35"
@@ -217,3 +352,49 @@ def test_hash_functions(df):
),
]
)
+ assert result.column(7) == result.column(1) # SHA-224
+ assert result.column(8) == result.column(2) # SHA-256
+ assert result.column(9) == result.column(3) # SHA-384
+ assert result.column(10) == result.column(4) # SHA-512
+
+
+def test_temporal_functions(df):
+ df = df.select(
+ f.date_part(literal("month"), column("d")),
+ f.datepart(literal("year"), column("d")),
+ f.date_trunc(literal("month"), column("d")),
+ f.datetrunc(literal("day"), column("d")),
+ f.from_unixtime(literal(1673383974)),
+ f.to_timestamp(literal("2023-09-07 05:06:14.523952")),
+ f.to_timestamp_seconds(literal("2023-09-07 05:06:14.523952")),
+ f.to_timestamp_millis(literal("2023-09-07 05:06:14.523952")),
+ f.to_timestamp_micros(literal("2023-09-07 05:06:14.523952")),
+ )
+ result = df.collect()
+ assert len(result) == 1
+ result = result[0]
+ assert result.column(0) == pa.array([12, 6, 7], type=pa.float64())
+ assert result.column(1) == pa.array([2022, 2027, 2020], type=pa.float64())
+ assert result.column(2) == pa.array(
+ [datetime(2022, 12, 1), datetime(2027, 6, 1), datetime(2020, 7, 1)],
+ type=pa.timestamp("ns"),
+ )
+ assert result.column(3) == pa.array(
+ [datetime(2022, 12, 31), datetime(2027, 6, 26), datetime(2020, 7, 2)],
+ type=pa.timestamp("ns"),
+ )
+ assert result.column(4) == pa.array(
+ [datetime(2023, 1, 10, 20, 52, 54)] * 3, type=pa.timestamp("s")
+ )
+ assert result.column(5) == pa.array(
+ [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("ns")
+ )
+ assert result.column(6) == pa.array(
+ [datetime(2023, 9, 7, 5, 6, 14)] * 3, type=pa.timestamp("s")
+ )
+ assert result.column(7) == pa.array(
+ [datetime(2023, 9, 7, 5, 6, 14, 523000)] * 3, type=pa.timestamp("ms")
+ )
+ assert result.column(8) == pa.array(
+ [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("us")
+ )