You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "jorisvandenbossche (via GitHub)" <gi...@apache.org> on 2023/06/20 08:23:13 UTC

[GitHub] [arrow] jorisvandenbossche commented on a diff in pull request #36162: GH-21761: [Python] accept pyarrow scalars in array constructor

jorisvandenbossche commented on code in PR #36162:
URL: https://github.com/apache/arrow/pull/36162#discussion_r1234913799


##########
python/pyarrow/tests/test_convert_builtin.py:
##########
@@ -2363,3 +2363,113 @@ def test_array_from_pylist_offset_overflow():
     assert isinstance(arr, pa.ChunkedArray)
     assert len(arr) == 2**31
     assert len(arr.chunks) > 1
+
+
+@parametrize_with_collections_types
+@pytest.mark.parametrize(('data', 'scalar_data'), [
+    ([True, False, None], [pa.scalar(True), pa.scalar(False), None]),
+    ([1, 2, None], [pa.scalar(1), pa.scalar(2), None]),
+    ([1, None, None], [pa.scalar(1), None, pa.scalar(None, type=pa.int64())]),
+    ([None, None], [pa.scalar(None), pa.scalar(None)]),
+    ([1., 2., None], [pa.scalar(1.), pa.scalar(2.), None]),
+    ([None, datetime.date.today()], [None, pa.scalar(datetime.date.today())]),
+    ([datetime.time(1, 1, 1), None], [pa.scalar(datetime.time(1, 1, 1)), None]),
+    ([datetime.timedelta(seconds=10)], [pa.scalar(datetime.timedelta(seconds=10))]),
+    ([None, datetime.datetime(2014, 1, 1)], [
+     None, pa.scalar(datetime.datetime(2014, 1, 1))]),
+    ([pa.MonthDayNano([1, -1, -10100])], [pa.scalar(pa.MonthDayNano([1, -1, -10100]))]),
+    (["a", "b"], [pa.scalar("a"), pa.scalar("b")]),
+    ([b"a", b"b"], [pa.scalar(b"a"), pa.scalar(b"b")]),
+    ([1, 2, 3], pa.scalar([1, 2, 3])),
+    (["a", "b"], pa.scalar(["a", "b"])),

Review Comment:
   ```suggestion
       ([[1, 2, 3]], [pa.scalar([1, 2, 3])]),
       ([["a", "b"]], [pa.scalar(["a", "b"])]),
   ```
   
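   Should these two cases be wrapped in a list, so that `scalar_data` is a sequence of scalars like the other cases?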



##########
python/pyarrow/tests/test_convert_builtin.py:
##########
@@ -2363,3 +2363,113 @@ def test_array_from_pylist_offset_overflow():
     assert isinstance(arr, pa.ChunkedArray)
     assert len(arr) == 2**31
     assert len(arr.chunks) > 1
+
+
+@parametrize_with_collections_types
+@pytest.mark.parametrize(('data', 'scalar_data'), [
+    ([True, False, None], [pa.scalar(True), pa.scalar(False), None]),
+    ([1, 2, None], [pa.scalar(1), pa.scalar(2), None]),
+    ([1, None, None], [pa.scalar(1), None, pa.scalar(None, type=pa.int64())]),
+    ([None, None], [pa.scalar(None), pa.scalar(None)]),
+    ([1., 2., None], [pa.scalar(1.), pa.scalar(2.), None]),
+    ([None, datetime.date.today()], [None, pa.scalar(datetime.date.today())]),
+    ([datetime.time(1, 1, 1), None], [pa.scalar(datetime.time(1, 1, 1)), None]),
+    ([datetime.timedelta(seconds=10)], [pa.scalar(datetime.timedelta(seconds=10))]),
+    ([None, datetime.datetime(2014, 1, 1)], [
+     None, pa.scalar(datetime.datetime(2014, 1, 1))]),
+    ([pa.MonthDayNano([1, -1, -10100])], [pa.scalar(pa.MonthDayNano([1, -1, -10100]))]),
+    (["a", "b"], [pa.scalar("a"), pa.scalar("b")]),
+    ([b"a", b"b"], [pa.scalar(b"a"), pa.scalar(b"b")]),
+    ([1, 2, 3], pa.scalar([1, 2, 3])),
+    (["a", "b"], pa.scalar(["a", "b"])),
+])
+def test_array_accepts_pyarrow_scalar(seq, data, scalar_data):
+    if type(seq(scalar_data)) == set:
+        pytest.skip("TODO: look at the reordering of the elements in the set")
+    expect = pa.array(data)
+    result = pa.array(seq(scalar_data))
+    assert expect.equals(result)
+
+
+@parametrize_with_collections_types
+@pytest.mark.parametrize(('data', 'scalar_data', 'value_type'), [
+    ([1, 2, None], [pa.scalar(1, type=pa.int8()),
+     pa.scalar(2, type=pa.int8()), None], pa.int8()),
+    ([1, None], [pa.scalar(1.0, type=pa.int32()), None], pa.int32()),
+    (["aaa", "bbb"], [pa.scalar("aaa", type=pa.binary(3)),
+     pa.scalar("bbb", type=pa.binary(3))], pa.binary(3)),
+    ([b"a"], [pa.scalar("a", type=pa.large_binary())], pa.large_binary()),
+    (["a"], [pa.scalar("a", type=pa.large_string())], pa.large_string()),
+    (
+        ["a"],
+        [pa.scalar("a", type=pa.dictionary(pa.int64(), pa.string()))],
+        pa.dictionary(pa.int64(), pa.string())
+    ),
+    (
+        ["a", "b"],
+        [pa.scalar("a", pa.dictionary(pa.int64(), pa.string())),
+         pa.scalar("b", pa.dictionary(pa.int64(), pa.string()))],
+        pa.dictionary(pa.int64(), pa.string())
+    ),
+    (
+        [1],
+        [pa.scalar(1, type=pa.dictionary(pa.int64(), pa.int32()))],
+        pa.dictionary(pa.int64(), pa.int32())
+    ),
+    (
+        [(1, 2)],
+        [pa.scalar([('a', 1), ('b', 2)], type=pa.struct(
+            [('a', pa.int8()), ('b', pa.int8())]))],
+        pa.struct([('a', pa.int8()), ('b', pa.int8())])
+    ),
+    (
+        [(1, 'bar')],
+        [pa.scalar([('a', 1), ('b', 'bar')], type=pa.struct(
+            [('a', pa.int8()), ('b', pa.string())]))],
+        pa.struct([('a', pa.int8()), ('b', pa.string())])
+    )
+])
+def test_array_accepts_pyarrow_scalar_with_type(seq, data, scalar_data, value_type):
+    if type(seq(scalar_data)) == set:
+        pytest.skip("TODO: look at the reordering of the elements in the set")
+    expect = pa.array(data, type=value_type)
+    result = pa.array(seq(scalar_data), type=value_type)
+    assert expect.equals(result)
+
+
+def test_array_accepts_pyarrow_scalar_something():
+    arr = pa.array([1, 2, 3])
+    result = pa.array([arr.sum()])
+    expect = pa.array([6])
+    assert expect.equals(result)
+
+
+@parametrize_with_collections_types
+def test_array_accepts_pyarrow_scalar_errors(seq):
+    sequence = seq([pa.scalar(1), pa.scalar("a"), pa.scalar(3.0)])
+    with pytest.raises(pa.ArrowInvalid,
+                       match="cannot mix scalars with different types"):
+        pa.array(sequence)
+
+    sequence = seq([1, pa.scalar("a"), None])
+    with pytest.raises(pa.ArrowInvalid,
+                       match="pyarrow scalars cannot be mixed with other "
+                             "Python scalar values currently"):
+        pa.array(sequence)
+
+    sequence = seq([np.float16("0.1"), pa.scalar("a"), None])
+    with pytest.raises(pa.ArrowInvalid,
+                       match="pyarrow scalars cannot be mixed with other "
+                             "Python scalar values currently"):
+        pa.array(sequence)
+
+    sequence = seq([pa.scalar("a"), np.float16("0.1"), None])
+    with pytest.raises(pa.ArrowInvalid,
+                       match="pyarrow scalars cannot be mixed with other "
+                             "Python scalar values currently"):
+        pa.array(sequence)
+
+    with pytest.raises(pa.ArrowInvalid,
+                       match="Cannot append scalar of type string "
+                             "to builder for type int32"):
+        pa.array([pa.scalar("a")], type=pa.int32())

Review Comment:
   > Also casting `int64` to `int8` should work but it currently doesn't.
   
   I suppose it is the builder that currently doesn't support that?
   (It might also be fine not to support this in this initial PR.)



##########
python/pyarrow/src/arrow/python/inference.cc:
##########
@@ -77,6 +79,8 @@ Status ImportPresentIntervalTypes(OwnedRefNoGIL* interval_types_tuple) {
 
 }  // namespace
 
+int import_pyarrow();

Review Comment:
   ```suggestion
   import_pyarrow();
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org