You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/06/25 13:33:59 UTC
[GitHub] [arrow] jorisvandenbossche commented on a change in pull request #7519: ARROW-9153: [C++][Python] Refactor scalar bindings

jorisvandenbossche commented on a change in pull request #7519:
URL: https://github.com/apache/arrow/pull/7519#discussion_r445523294



##########
File path: python/pyarrow/_dataset.pyx
##########
@@ -216,22 +216,18 @@ cdef class Expression:
     @staticmethod
     def _scalar(value):
         cdef:
-            shared_ptr[CScalar] scalar
-
-        if value is None:
-            scalar.reset(new CNullScalar())
-        elif isinstance(value, bool):
-            scalar = MakeScalar(<c_bool>value)
-        elif isinstance(value, float):
-            scalar = MakeScalar(<double>value)
-        elif isinstance(value, int):
-            scalar = MakeScalar(<int64_t>value)
-        elif isinstance(value, (bytes, str)):
-            scalar = MakeStringScalar(tobytes(value))

Review comment:
       I think the `MakeStringScalar` included in libarrow.pxd can then be removed (I don't see any other usage of it)

##########
File path: python/pyarrow/tests/test_parquet.py
##########
@@ -2028,7 +2028,7 @@ def test_filters_invalid_pred_op(tempdir, use_legacy_dataset):
                                     use_legacy_dataset=use_legacy_dataset)
         assert dataset.read().num_rows == 0
 
-    with pytest.raises(ValueError if use_legacy_dataset else TypeError):
+    with pytest.raises(ValueError if use_legacy_dataset else pa.ArrowInvalid):
         # dataset API returns TypeError when trying create invalid comparison

Review comment:
       ```suggestion
       with pytest.raises(ValueError):
   ```
   
   ArrowInvalid is a subclass of ValueError, I think, so if this changed, the suggestion above should be sufficient.

##########
File path: python/pyarrow/util.py
##########
@@ -41,6 +41,24 @@ def wrapper(*args, **kwargs):
     return wrapper
 
 
+def _deprecate_class(old_name, new_class, next_version,
+                     instancecheck=True):
+    """
+    Raise warning if a deprecated class is used in an isinstance check.

Review comment:
       Shouldn't it also raise the warning when the deprecated class is instantiated?

##########
File path: python/pyarrow/scalar.pxi
##########
@@ -16,1198 +16,704 @@
 # under the License.
 
 
-_NULL = NA = None
-
-
 cdef class Scalar:
     """
-    The base class for all array elements.
+    The base class for scalars.
     """
 
+    def __init__(self):
+        raise TypeError("Do not call {}'s constructor directly, use "
+                        "pa.scalar() instead.".format(self.__class__.__name__))
 
-cdef class NullType(Scalar):
-    """
-    Singleton for null array elements.
-    """
-    # TODO rename this NullValue?
+    cdef void init(self, const shared_ptr[CScalar]& wrapped):
+        self.wrapped = wrapped
 
-    def __cinit__(self):
-        global NA
-        if NA is not None:
-            raise Exception('Cannot create multiple NAType instances')
+    @staticmethod
+    cdef wrap(const shared_ptr[CScalar]& wrapped):
+        cdef:
+            Scalar self
+            Type type_id = wrapped.get().type.get().id()
+
+        if type_id == _Type_NA:
+            return _NULL
+
+        typ = _scalar_classes[type_id]
+        self = typ.__new__(typ)
+        self.init(wrapped)
+
+        return self
+
+    cdef inline shared_ptr[CScalar] unwrap(self) nogil:
+        return self.wrapped
 
-        self.type = null()
+    @property
+    def type(self):
+        return pyarrow_wrap_data_type(self.wrapped.get().type)
 
     def __repr__(self):
-        return 'NULL'
+        return '<pyarrow.{}: {!r}>'.format(
+            self.__class__.__name__, self.as_py()
+        )
 
-    def as_py(self):
-        """
-        Return None
-        """
-        return None
+    def __str__(self):
+        return str(self.as_py())
 
     def __eq__(self, other):
-        return NA
+        # TODO(kszucs): use c++ Equals
+        if isinstance(other, Scalar):
+            other = other.as_py()
+        return self.as_py() == other
 
+    def __hash__(self):
+        # TODO(kszucs): use C++ hash if implemented for the type
+        return hash(self.as_py())
+
+    def as_py(self):
+        raise NotImplementedError()
 
-_NULL = NA = NullType()
+
+_NULL = NA = None
 
 
-cdef class ArrayValue(Scalar):
+cdef class NullScalar(Scalar):
     """
-    The base class for non-null array elements.
+    Concrete class for null scalars.
     """
 
-    def __init__(self):
-        raise TypeError("Do not call {}'s constructor directly, use array "
-                        "subscription instead."
-                        .format(self.__class__.__name__))
-
-    cdef void init(self, DataType type, const shared_ptr[CArray]& sp_array,
-                   int64_t index):
-        self.type = type
-        self.index = index
-        self._set_array(sp_array)
+    def __cinit__(self):
+        global NA
+        if NA is not None:
+            raise Exception('Cannot create multiple NAType instances')
+        self.init(shared_ptr[CScalar](new CNullScalar()))
 
-    cdef void _set_array(self, const shared_ptr[CArray]& sp_array):
-        self.sp_array = sp_array
+    def __init__(self):
+        pass
 
-    def __repr__(self):
-        if hasattr(self, 'as_py'):
-            return repr(self.as_py())
-        else:
-            return super(Scalar, self).__repr__()
+    def __eq__(self, other):
+        return NA
 
-    def __str__(self):
-        if hasattr(self, 'as_py'):
-            return str(self.as_py())
-        else:
-            return super(Scalar, self).__str__()
+    def as_py(self):
+        """
+        Return this value as a Python None.
+        """
+        return None
 
-    def __eq__(self, other):
-        if hasattr(self, 'as_py'):
-            if isinstance(other, ArrayValue):
-                other = other.as_py()
-            return self.as_py() == other
-        else:
-            raise NotImplementedError(
-                "Cannot compare Arrow values that don't support as_py()")
 
-    def __hash__(self):
-        return hash(self.as_py())
+_NULL = NA = NullScalar()
 
 
-cdef class BooleanValue(ArrayValue):
+cdef class BooleanScalar(Scalar):
     """
-    Concrete class for boolean array elements.
+    Concrete class for boolean scalars.
     """
 
     def as_py(self):
         """
         Return this value as a Python bool.
         """
-        cdef CBooleanArray* ap = <CBooleanArray*> self.sp_array.get()
-        return ap.Value(self.index)
+        cdef CBooleanScalar* sp = <CBooleanScalar*> self.wrapped.get()
+        return sp.value if sp.is_valid else None
 
 
-cdef class Int8Value(ArrayValue):
+cdef class UInt8Scalar(Scalar):
     """
-    Concrete class for int8 array elements.
+    Concrete class for uint8 scalars.
     """
 
     def as_py(self):
         """
         Return this value as a Python int.
         """
-        cdef CInt8Array* ap = <CInt8Array*> self.sp_array.get()
-        return ap.Value(self.index)
+        cdef CUInt8Scalar* sp = <CUInt8Scalar*> self.wrapped.get()
+        return sp.value if sp.is_valid else None
 
 
-cdef class UInt8Value(ArrayValue):
+cdef class Int8Scalar(Scalar):
     """
-    Concrete class for uint8 array elements.
+    Concrete class for int8 scalars.
     """
 
     def as_py(self):
         """
         Return this value as a Python int.
         """
-        cdef CUInt8Array* ap = <CUInt8Array*> self.sp_array.get()
-        return ap.Value(self.index)
+        cdef CInt8Scalar* sp = <CInt8Scalar*> self.wrapped.get()
+        return sp.value if sp.is_valid else None
 
 
-cdef class Int16Value(ArrayValue):
+cdef class UInt16Scalar(Scalar):
     """
-    Concrete class for int16 array elements.
+    Concrete class for uint16 scalars.
     """
 
     def as_py(self):
         """
         Return this value as a Python int.
         """
-        cdef CInt16Array* ap = <CInt16Array*> self.sp_array.get()
-        return ap.Value(self.index)
+        cdef CUInt16Scalar* sp = <CUInt16Scalar*> self.wrapped.get()
+        return sp.value if sp.is_valid else None
 
 
-cdef class UInt16Value(ArrayValue):
+cdef class Int16Scalar(Scalar):
     """
-    Concrete class for uint16 array elements.
+    Concrete class for int16 scalars.
     """
 
     def as_py(self):
         """
         Return this value as a Python int.
         """
-        cdef CUInt16Array* ap = <CUInt16Array*> self.sp_array.get()
-        return ap.Value(self.index)
+        cdef CInt16Scalar* sp = <CInt16Scalar*> self.wrapped.get()
+        return sp.value if sp.is_valid else None
 
 
-cdef class Int32Value(ArrayValue):
+cdef class UInt32Scalar(Scalar):
     """
-    Concrete class for int32 array elements.
+    Concrete class for uint32 scalars.
     """
 
     def as_py(self):
         """
         Return this value as a Python int.
         """
-        cdef CInt32Array* ap = <CInt32Array*> self.sp_array.get()
-        return ap.Value(self.index)
+        cdef CUInt32Scalar* sp = <CUInt32Scalar*> self.wrapped.get()
+        return sp.value if sp.is_valid else None
 
 
-cdef class UInt32Value(ArrayValue):
+cdef class Int32Scalar(Scalar):
     """
-    Concrete class for uint32 array elements.
+    Concrete class for int32 scalars.
     """
 
     def as_py(self):
         """
         Return this value as a Python int.
         """
-        cdef CUInt32Array* ap = <CUInt32Array*> self.sp_array.get()
-        return ap.Value(self.index)
+        cdef CInt32Scalar* sp = <CInt32Scalar*> self.wrapped.get()
+        return sp.value if sp.is_valid else None
 
 
-cdef class Int64Value(ArrayValue):
+cdef class UInt64Scalar(Scalar):
     """
-    Concrete class for int64 array elements.
+    Concrete class for uint64 scalars.
     """
 
     def as_py(self):
         """
         Return this value as a Python int.
         """
-        cdef CInt64Array* ap = <CInt64Array*> self.sp_array.get()
-        return ap.Value(self.index)
+        cdef CUInt64Scalar* sp = <CUInt64Scalar*> self.wrapped.get()
+        return sp.value if sp.is_valid else None
 
 
-cdef class UInt64Value(ArrayValue):
+cdef class Int64Scalar(Scalar):
     """
-    Concrete class for uint64 array elements.
+    Concrete class for int64 scalars.
     """
 
     def as_py(self):
         """
         Return this value as a Python int.
         """
-        cdef CUInt64Array* ap = <CUInt64Array*> self.sp_array.get()
-        return ap.Value(self.index)
+        cdef CInt64Scalar* sp = <CInt64Scalar*> self.wrapped.get()
+        return sp.value if sp.is_valid else None
 
 
-cdef class Date32Value(ArrayValue):
+cdef class HalfFloatScalar(Scalar):
     """
-    Concrete class for date32 array elements.
+    Concrete class for float scalars.
     """
 
     def as_py(self):
         """
-        Return this value as a Python datetime.datetime instance.
+        Return this value as a Python float.
         """
-        cdef CDate32Array* ap = <CDate32Array*> self.sp_array.get()
+        cdef CHalfFloatScalar* sp = <CHalfFloatScalar*> self.wrapped.get()
+        return PyHalf_FromHalf(sp.value) if sp.is_valid else None
+
+
+cdef class FloatScalar(Scalar):
+    """
+    Concrete class for float scalars.
+    """
 
-        # Shift to seconds since epoch
-        return (datetime.date(1970, 1, 1) +
-                datetime.timedelta(days=ap.Value(self.index)))
+    def as_py(self):
+        """
+        Return this value as a Python float.
+        """
+        cdef CFloatScalar* sp = <CFloatScalar*> self.wrapped.get()
+        return sp.value if sp.is_valid else None
 
 
-cdef class Date64Value(ArrayValue):
+cdef class DoubleScalar(Scalar):
     """
-    Concrete class for date64 array elements.
+    Concrete class for double scalars.
     """
 
     def as_py(self):
         """
-        Return this value as a Python datetime.datetime instance.
+        Return this value as a Python float.
         """
-        cdef CDate64Array* ap = <CDate64Array*> self.sp_array.get()
-        return (datetime.date(1970, 1, 1) +
-                datetime.timedelta(
-                    days=ap.Value(self.index) / 86400000))
+        cdef CDoubleScalar* sp = <CDoubleScalar*> self.wrapped.get()
+        return sp.value if sp.is_valid else None
 
 
-cdef class Time32Value(ArrayValue):
+cdef class DecimalScalar(Scalar):
     """
-    Concrete class for time32 array elements.
+    Concrete class for decimal128 scalars.
     """
 
     def as_py(self):
         """
-        Return this value as a Python datetime.timedelta instance.
+        Return this value as a Python Decimal.
         """
         cdef:
-            CTime32Array* ap = <CTime32Array*> self.sp_array.get()
-            CTime32Type* dtype = <CTime32Type*> ap.type().get()
-
-        if dtype.unit() == TimeUnit_SECOND:
-            delta = datetime.timedelta(seconds=ap.Value(self.index))
-            return (datetime.datetime(1970, 1, 1) + delta).time()
+            CDecimal128Scalar* sp = <CDecimal128Scalar*> self.wrapped.get()
+            CDecimal128Type* dtype = <CDecimal128Type*> sp.type.get()
+        if sp.is_valid:
+            return _pydecimal.Decimal(
+                frombytes(sp.value.ToString(dtype.scale()))
+            )
         else:
-            return _box_time_milli(ap.Value(self.index))
+            return None
 
 
-cdef class Time64Value(ArrayValue):
+cdef class Date32Scalar(Scalar):
     """
-    Concrete class for time64 array elements.
+    Concrete class for date32 scalars.
     """
 
     def as_py(self):
         """
-        Return this value as a Python datetime.timedelta instance.
+        Return this value as a Python datetime.datetime instance.
         """
-        cdef:
-            CTime64Array* ap = <CTime64Array*> self.sp_array.get()
-            CTime64Type* dtype = <CTime64Type*> ap.type().get()
+        cdef CDate32Scalar* sp = <CDate32Scalar*> self.wrapped.get()
 
-        cdef int64_t val = ap.Value(self.index)
-        if dtype.unit() == TimeUnit_MICRO:
-            return _box_time_micro(val)
+        if sp.is_valid:
+            # shift to seconds since epoch
+            return (
+                datetime.date(1970, 1, 1) + datetime.timedelta(days=sp.value)
+            )
         else:
-            return (datetime.datetime(1970, 1, 1) +
-                    datetime.timedelta(microseconds=val / 1000)).time()
-
+            return None
 
-cpdef _box_time_milli(int64_t val):
-    delta = datetime.timedelta(milliseconds=val)
-    return (datetime.datetime(1970, 1, 1) + delta).time()
 
+cdef class Date64Scalar(Scalar):
+    """
+    Concrete class for date64 scalars.
+    """
 
-cpdef _box_time_micro(int64_t val):
-    return (datetime.datetime(1970, 1, 1) +
-            datetime.timedelta(microseconds=val)).time()
+    def as_py(self):
+        """
+        Return this value as a Python datetime.datetime instance.
+        """
+        cdef CDate64Scalar* sp = <CDate64Scalar*> self.wrapped.get()
 
+        if sp.is_valid:
+            return (
+                datetime.date(1970, 1, 1) +
+                datetime.timedelta(days=sp.value / 86400000)
+            )
+        else:
+            return None
 
-cdef dict _DATETIME_CONVERSION_FUNCTIONS = {}
-cdef c_bool _datetime_conversion_initialized = False
 
+def _datetime_from_int(int64_t value, TimeUnit unit, tzinfo=None):
+    if unit == TimeUnit_SECOND:
+        delta = datetime.timedelta(seconds=value)
+    elif unit == TimeUnit_MILLI:
+        delta = datetime.timedelta(milliseconds=value)
+    elif unit == TimeUnit_MICRO:
+        delta = datetime.timedelta(microseconds=value)
+    else:
+        # TimeUnit_NANO: prefer pandas timestamps if available
+        if _pandas_api.have_pandas:
+            return _pandas_api.pd.Timestamp(value, tz=tzinfo, unit='ns')
+        # otherwise safely truncate to microsecond resolution datetime
+        if value % 1000 != 0:
+            raise ValueError(
+                "Nanosecond resolution temporal type {} is not safely "
+                "convertible to microseconds to convert to datetime.datetime. "
+                "Install pandas to return as Timestamp with nanosecond "
+                "support or access the .value attribute.".format(value)
+            )
+        delta = datetime.timedelta(microseconds=value)
 
-cdef _add_micros_maybe_localize(dt, micros, tzinfo):
-    import pytz
-    dt = dt.replace(microsecond=micros)
+    dt = datetime.datetime(1970, 1, 1) + delta
+    # adjust timezone if set to the datatype
     if tzinfo is not None:
-        if not isinstance(tzinfo, datetime.tzinfo):
-            tzinfo = string_to_tzinfo(tzinfo)
         dt = tzinfo.fromutc(dt)
-    return dt
-
-
-cdef _datetime_from_seconds(int64_t v):
-    return datetime.datetime(1970, 1, 1) + datetime.timedelta(seconds=v)
-
-
-def _nanoseconds_to_datetime_safe(v, tzinfo):
-    if v % 1000 != 0:
-        raise ValueError("Nanosecond timestamp {} is not safely convertible "
-                         " to microseconds to convert to datetime.datetime."
-                         " Install pandas to return as Timestamp with "
-                         " nanosecond support or access the .value attribute.")
-    v = v // 1000
-    micros = v % 1_000_000
-
-    dt = _datetime_from_seconds(v // 1_000_000)
-    return _add_micros_maybe_localize(dt, micros, tzinfo)
-
-
-def _microseconds_to_datetime(v, tzinfo):
-    micros = v % 1_000_000
-    dt = _datetime_from_seconds(v // 1_000_000)
-    return _add_micros_maybe_localize(dt, micros, tzinfo)
 
+    return dt
 
-def _millis_to_datetime(v, tzinfo):
-    millis = v % 1_000
-    dt = _datetime_from_seconds(v // 1000)
-    return _add_micros_maybe_localize(dt, millis * 1000, tzinfo)
 
+cdef class Time32Scalar(Scalar):
+    """
+    Concrete class for time32 scalars.
+    """
 
-def _seconds_to_datetime(v, tzinfo):
-    dt = _datetime_from_seconds(v)
-    return _add_micros_maybe_localize(dt, 0, tzinfo)
+    def as_py(self):
+        """
+        Return this value as a Python datetime.timedelta instance.
+        """
+        cdef:
+            CTime32Scalar* sp = <CTime32Scalar*> self.wrapped.get()
+            CTime32Type* dtype = <CTime32Type*> sp.type.get()
 
+        if sp.is_valid:
+            return _datetime_from_int(sp.value, unit=dtype.unit()).time()
+        else:
+            return None
 
-def _datetime_conversion_functions():
-    global _datetime_conversion_initialized
-    if _datetime_conversion_initialized:
-        return _DATETIME_CONVERSION_FUNCTIONS
 
-    _DATETIME_CONVERSION_FUNCTIONS.update({
-        TimeUnit_SECOND: _seconds_to_datetime,
-        TimeUnit_MILLI: _millis_to_datetime,
-        TimeUnit_MICRO: _microseconds_to_datetime,
-        TimeUnit_NANO: _nanoseconds_to_datetime_safe
-    })
+cdef class Time64Scalar(Scalar):
+    """
+    Concrete class for time64 scalars.
+    """
 
-    try:
-        import pandas as pd
-        _DATETIME_CONVERSION_FUNCTIONS[TimeUnit_NANO] = (
-            lambda x, tzinfo: pd.Timestamp(
-                x, tz=tzinfo, unit='ns',
-            )
-        )
-    except ImportError:
-        pass
+    def as_py(self):
+        """
+        Return this value as a Python datetime.timedelta instance.
+        """
+        cdef:
+            CTime64Scalar* sp = <CTime64Scalar*> self.wrapped.get()
+            CTime64Type* dtype = <CTime64Type*> sp.type.get()
 
-    _datetime_conversion_initialized = True
-    return _DATETIME_CONVERSION_FUNCTIONS
+        if sp.is_valid:
+            return _datetime_from_int(sp.value, unit=dtype.unit()).time()
+        else:
+            return None
 
 
-cdef class TimestampValue(ArrayValue):
+cdef class TimestampScalar(Scalar):
     """
-    Concrete class for timestamp array elements.
+    Concrete class for timestamp scalars.
     """
 
     @property
     def value(self):
-        cdef CTimestampArray* ap = <CTimestampArray*> self.sp_array.get()
-        cdef CTimestampType* dtype = <CTimestampType*> ap.type().get()
-        return ap.Value(self.index)
+        cdef CTimestampScalar* sp = <CTimestampScalar*> self.wrapped.get()
+        return sp.value if sp.is_valid else None
 
     def as_py(self):
         """
         Return this value as a Pandas Timestamp instance (if available),
         otherwise as a Python datetime.timedelta instance.
         """
-        cdef CTimestampArray* ap = <CTimestampArray*> self.sp_array.get()
-        cdef CTimestampType* dtype = <CTimestampType*> ap.type().get()
+        cdef:
+            CTimestampScalar* sp = <CTimestampScalar*> self.wrapped.get()
+            CTimestampType* dtype = <CTimestampType*> sp.type.get()
 
-        value = self.value
+        if not sp.is_valid:
+            return None
 
         if not dtype.timezone().empty():
             tzinfo = string_to_tzinfo(frombytes(dtype.timezone()))
+            if not isinstance(tzinfo, datetime.tzinfo):
+                tzinfo = string_to_tzinfo(tzinfo)
         else:
             tzinfo = None
 
-        try:
-            converter = _datetime_conversion_functions()[dtype.unit()]
-        except KeyError:
-            raise ValueError(
-                'Cannot convert nanosecond timestamps without pandas'
-            )
-        return converter(value, tzinfo=tzinfo)
-
-
-cdef dict _TIMEDELTA_CONVERSION_FUNCTIONS = {}
-
-
-def _nanoseconds_to_timedelta_safe(v):
-    if v % 1000 != 0:
-        raise ValueError(
-            "Nanosecond duration {} is not safely convertible to microseconds "
-            "to convert to datetime.timedelta. Install pandas to return as "
-            "Timedelta with nanosecond support or access the .value "
-            "attribute.".format(v))
-    micros = v // 1000
-
-    return datetime.timedelta(microseconds=micros)
-
+        return _datetime_from_int(sp.value, unit=dtype.unit(), tzinfo=tzinfo)
 
-def _timedelta_conversion_functions():
-    if _TIMEDELTA_CONVERSION_FUNCTIONS:
-        return _TIMEDELTA_CONVERSION_FUNCTIONS
 
-    _TIMEDELTA_CONVERSION_FUNCTIONS.update({
-        TimeUnit_SECOND: lambda v: datetime.timedelta(seconds=v),
-        TimeUnit_MILLI: lambda v: datetime.timedelta(milliseconds=v),
-        TimeUnit_MICRO: lambda v: datetime.timedelta(microseconds=v),
-        TimeUnit_NANO: _nanoseconds_to_timedelta_safe
-    })
-
-    try:
-        import pandas as pd
-        _TIMEDELTA_CONVERSION_FUNCTIONS[TimeUnit_NANO] = (
-            lambda v: pd.Timedelta(v, unit='ns')
-        )
-    except ImportError:
-        pass
-
-    return _TIMEDELTA_CONVERSION_FUNCTIONS
-
-
-cdef class DurationValue(ArrayValue):
+cdef class DurationScalar(Scalar):
     """
-    Concrete class for duration array elements.
+    Concrete class for duration scalars.
     """
 
     @property
     def value(self):
-        cdef CDurationArray* ap = <CDurationArray*> self.sp_array.get()
-        return ap.Value(self.index)
+        cdef CDurationScalar* sp = <CDurationScalar*> self.wrapped.get()
+        return sp.value if sp.is_valid else None
 
     def as_py(self):
         """
         Return this value as a Pandas Timestamp instance (if available),
         otherwise as a Python datetime.timedelta instance.
         """
-        cdef CDurationArray* ap = <CDurationArray*> self.sp_array.get()
-        cdef CDurationType* dtype = <CDurationType*> ap.type().get()
-
-        cdef int64_t value = ap.Value(self.index)
-        converter = _timedelta_conversion_functions()[dtype.unit()]
-        return converter(value)
-
+        cdef:
+            CDurationScalar* sp = <CDurationScalar*> self.wrapped.get()
+            CDurationType* dtype = <CDurationType*> sp.type.get()
+            TimeUnit unit = dtype.unit()
 
-cdef class HalfFloatValue(ArrayValue):
-    """
-    Concrete class for float16 array elements.
-    """
+        if not sp.is_valid:
+            return None
 
-    def as_py(self):
-        """
-        Return this value as a Python float.
-        """
-        cdef CHalfFloatArray* ap = <CHalfFloatArray*> self.sp_array.get()
-        return PyHalf_FromHalf(ap.Value(self.index))
+        if unit == TimeUnit_SECOND:
+            return datetime.timedelta(seconds=sp.value)
+        elif unit == TimeUnit_MILLI:
+            return datetime.timedelta(milliseconds=sp.value)
+        elif unit == TimeUnit_MICRO:
+            return datetime.timedelta(microseconds=sp.value)
+        else:
+            # TimeUnit_NANO: prefer pandas timestamps if available
+            if _pandas_api.have_pandas:
+                return _pandas_api.pd.Timedelta(sp.value, unit='ns')
+            # otherwise safely truncate to microsecond resolution timedelta
+            if sp.value % 1000 != 0:
+                raise ValueError(
+                    "Nanosecond duration {} is not safely convertible to "
+                    "microseconds to convert to datetime.timedelta. Install "
+                    "pandas to return as Timedelta with nanosecond support or "
+                    "access the .value attribute.".format(sp.value)
+                )
+            return datetime.timedelta(microseconds=sp.value // 1000)
 
 
-cdef class FloatValue(ArrayValue):
+cdef class BinaryScalar(Scalar):
     """
-    Concrete class for float32 array elements.
+    Concrete class for binary-like scalars.
     """
 
-    def as_py(self):
+    def as_buffer(self):
         """
-        Return this value as a Python float.
+        Return a view over this value as a Buffer object.
         """
-        cdef CFloatArray* ap = <CFloatArray*> self.sp_array.get()
-        return ap.Value(self.index)
-
-
-cdef class DoubleValue(ArrayValue):
-    """
-    Concrete class for float64 array elements.
-    """
+        cdef CBinaryScalar* sp = <CBinaryScalar*> self.wrapped.get()
+        return pyarrow_wrap_buffer(sp.value) if sp.is_valid else None
 
     def as_py(self):
         """
-        Return this value as a Python float.
+        Return this value as a Python bytes.
         """
-        cdef CDoubleArray* ap = <CDoubleArray*> self.sp_array.get()
-        return ap.Value(self.index)
+        buffer = self.as_buffer()
+        if buffer is not None:
+            return self.as_buffer().to_pybytes()
+        else:
+            return None
 
 
-cdef class DecimalValue(ArrayValue):
-    """
-    Concrete class for decimal128 array elements.
-    """
+cdef class LargeBinaryScalar(BinaryScalar):
+    pass
 
-    def as_py(self):
-        """
-        Return this value as a Python Decimal.
-        """
-        cdef:
-            CDecimal128Array* ap = <CDecimal128Array*> self.sp_array.get()
-            c_string s = ap.FormatValue(self.index)
-        return _pydecimal.Decimal(s.decode('utf8'))
 
+cdef class FixedSizeBinaryScalar(BinaryScalar):
+    pass
 
-cdef class StringValue(ArrayValue):
+
+cdef class StringScalar(BinaryScalar):
     """
-    Concrete class for string (utf8) array elements.
+    Concrete class for string-like (utf8) scalars.
     """
 
     def as_py(self):
         """
-        Return this value as a Python unicode string.
+        Return this value as a Python string.
         """
-        cdef CStringArray* ap = <CStringArray*> self.sp_array.get()
-        return ap.GetString(self.index).decode('utf-8')
+        buffer = self.as_buffer()
+        if buffer is not None:
+            return frombytes(self.as_buffer().to_pybytes())
+        else:
+            return None
 
-    def as_buffer(self):
-        """
-        Return a view over this value as a Buffer object.
-        """
-        cdef:
-            CStringArray* ap = <CStringArray*> self.sp_array.get()
-            shared_ptr[CBuffer] buf
 
-        buf = SliceBuffer(ap.value_data(), ap.value_offset(self.index),
-                          ap.value_length(self.index))
-        return pyarrow_wrap_buffer(buf)
+cdef class LargeStringScalar(StringScalar):
+    pass
 
 
-cdef class LargeStringValue(ArrayValue):
+cdef class ListScalar(Scalar):
     """
-    Concrete class for large string (utf8) array elements.
+    Concrete class for list-like scalars.
     """
 
-    def as_py(self):
+    cdef array(self):

Review comment:
       do we want to expose this in some form publicly as well? (like you have the "raw" `value` for timestamps)

##########
File path: python/pyarrow/scalar.pxi
##########
@@ -16,1198 +16,704 @@
 # under the License.
 
 
-_NULL = NA = None
-
-
 cdef class Scalar:
     """
-    The base class for all array elements.
+    The base class for scalars.
     """
 
+    def __init__(self):
+        raise TypeError("Do not call {}'s constructor directly, use "
+                        "pa.scalar() instead.".format(self.__class__.__name__))
 
-cdef class NullType(Scalar):
-    """
-    Singleton for null array elements.
-    """
-    # TODO rename this NullValue?
+    cdef void init(self, const shared_ptr[CScalar]& wrapped):
+        self.wrapped = wrapped
 
-    def __cinit__(self):
-        global NA
-        if NA is not None:
-            raise Exception('Cannot create multiple NAType instances')
+    @staticmethod
+    cdef wrap(const shared_ptr[CScalar]& wrapped):
+        cdef:
+            Scalar self
+            Type type_id = wrapped.get().type.get().id()
+
+        if type_id == _Type_NA:
+            return _NULL
+
+        typ = _scalar_classes[type_id]
+        self = typ.__new__(typ)
+        self.init(wrapped)
+
+        return self
+
+    cdef inline shared_ptr[CScalar] unwrap(self) nogil:
+        return self.wrapped
 
-        self.type = null()
+    @property
+    def type(self):
+        return pyarrow_wrap_data_type(self.wrapped.get().type)
 
     def __repr__(self):
-        return 'NULL'
+        return '<pyarrow.{}: {!r}>'.format(

Review comment:
       Big +1 on this better repr! Just showing the repr of the as_py value was always very confusing :)

##########
File path: python/pyarrow/tests/test_scalars.py
##########
@@ -17,426 +17,395 @@
 
 import datetime
 import pytest
-import unittest
 
 import numpy as np
 
 import pyarrow as pa
 
 
-class TestScalars(unittest.TestCase):
-
-    def test_null_singleton(self):
-        with pytest.raises(Exception):
-            pa.NAType()
+@pytest.mark.parametrize(['value', 'ty', 'klass', 'deprecated'], [
+    (None, None, pa.NullScalar, pa.NullType),
+    (False, None, pa.BooleanScalar, pa.BooleanValue),
+    (True, None, pa.BooleanScalar, pa.BooleanValue),
+    (1, None, pa.Int64Scalar, pa.Int64Value),
+    (-1, None, pa.Int64Scalar, pa.Int64Value),
+    (1, pa.int8(), pa.Int8Scalar, pa.Int8Value),
+    (1, pa.uint8(), pa.UInt8Scalar, pa.UInt8Value),
+    (1, pa.int16(), pa.Int16Scalar, pa.Int16Value),
+    (1, pa.uint16(), pa.UInt16Scalar, pa.UInt16Value),
+    (1, pa.int32(), pa.Int32Scalar, pa.Int32Value),
+    (1, pa.uint32(), pa.UInt32Scalar, pa.UInt32Value),
+    (1, pa.int64(), pa.Int64Scalar, pa.Int64Value),
+    (1, pa.uint64(), pa.UInt64Scalar, pa.UInt64Value),
+    (1.0, None, pa.DoubleScalar, pa.DoubleValue),
+    (np.float16(1.0), pa.float16(), pa.HalfFloatScalar, pa.HalfFloatValue),
+    (1.0, pa.float32(), pa.FloatScalar, pa.FloatValue),
+    ("string", None, pa.StringScalar, pa.StringValue),
+    (b"bytes", None, pa.BinaryScalar, pa.BinaryValue),
+    ([1, 2, 3], None, pa.ListScalar, pa.ListValue),
+    ([1, 2, 3, 4], pa.large_list(pa.int8()), pa.LargeListScalar,
+     pa.LargeListValue),
+    # date
+    # time
+])
+def test_type_inference(value, ty, klass, deprecated):
+    s = pa.scalar(value, type=ty)
+    assert isinstance(s, klass)
+    assert s == value
+    with pytest.warns(FutureWarning):
+        isinstance(s, deprecated)
+
+
+def test_null_singleton():
+    with pytest.raises(Exception):
+        pa.NullScalar()
+
+
+def test_nulls():
+    arr = pa.array([None, None])
+    for v in arr:
+        assert v is pa.NA
+        assert v.as_py() is None
+
+
+def test_null_equality():
+    assert (pa.NA == pa.NA) is pa.NA
+    assert (pa.NA == 1) is pa.NA

Review comment:
       I don't know to what extent we want to fully work out the scalars in this PR (this can certainly be a follow-up), but the typed null scalars (not pa.NA) should probably behave the same as pa.NA, e.g. when it comes to equality: `pa.NA == 1` gives pa.NA, but `pa.scalar(None, pa.int64()) == 1` gives False.

##########
File path: python/pyarrow/scalar.pxi
##########
@@ -16,1198 +16,704 @@
 # under the License.
 
 
-_NULL = NA = None
-
-
 cdef class Scalar:
     """
-    The base class for all array elements.
+    The base class for scalars.
     """
 
+    def __init__(self):
+        raise TypeError("Do not call {}'s constructor directly, use "
+                        "pa.scalar() instead.".format(self.__class__.__name__))
 
-cdef class NullType(Scalar):
-    """
-    Singleton for null array elements.
-    """
-    # TODO rename this NullValue?
+    cdef void init(self, const shared_ptr[CScalar]& wrapped):
+        self.wrapped = wrapped
 
-    def __cinit__(self):
-        global NA
-        if NA is not None:
-            raise Exception('Cannot create multiple NAType instances')
+    @staticmethod
+    cdef wrap(const shared_ptr[CScalar]& wrapped):
+        cdef:
+            Scalar self
+            Type type_id = wrapped.get().type.get().id()
+
+        if type_id == _Type_NA:
+            return _NULL
+
+        typ = _scalar_classes[type_id]
+        self = typ.__new__(typ)
+        self.init(wrapped)
+
+        return self
+
+    cdef inline shared_ptr[CScalar] unwrap(self) nogil:
+        return self.wrapped
 
-        self.type = null()
+    @property
+    def type(self):
+        return pyarrow_wrap_data_type(self.wrapped.get().type)
 
     def __repr__(self):
-        return 'NULL'
+        return '<pyarrow.{}: {!r}>'.format(
+            self.__class__.__name__, self.as_py()
+        )
 
-    def as_py(self):
-        """
-        Return None
-        """
-        return None
+    def __str__(self):
+        return str(self.as_py())
 
     def __eq__(self, other):
-        return NA
+        # TODO(kszucs): use c++ Equals
+        if isinstance(other, Scalar):
+            other = other.as_py()
+        return self.as_py() == other
 
+    def __hash__(self):
+        # TODO(kszucs): use C++ hash if implemented for the type
+        return hash(self.as_py())
+
+    def as_py(self):
+        raise NotImplementedError()
 
-_NULL = NA = NullType()
+
+_NULL = NA = None
 
 
-cdef class ArrayValue(Scalar):
+cdef class NullScalar(Scalar):
     """
-    The base class for non-null array elements.
+    Concrete class for null scalars.
     """
 
-    def __init__(self):
-        raise TypeError("Do not call {}'s constructor directly, use array "
-                        "subscription instead."
-                        .format(self.__class__.__name__))
-
-    cdef void init(self, DataType type, const shared_ptr[CArray]& sp_array,
-                   int64_t index):
-        self.type = type
-        self.index = index
-        self._set_array(sp_array)
+    def __cinit__(self):
+        global NA
+        if NA is not None:
+            raise Exception('Cannot create multiple NAType instances')
+        self.init(shared_ptr[CScalar](new CNullScalar()))
 
-    cdef void _set_array(self, const shared_ptr[CArray]& sp_array):
-        self.sp_array = sp_array
+    def __init__(self):
+        pass
 
-    def __repr__(self):
-        if hasattr(self, 'as_py'):
-            return repr(self.as_py())
-        else:
-            return super(Scalar, self).__repr__()
+    def __eq__(self, other):
+        return NA
 
-    def __str__(self):
-        if hasattr(self, 'as_py'):
-            return str(self.as_py())
-        else:
-            return super(Scalar, self).__str__()
+    def as_py(self):
+        """
+        Return this value as a Python None.
+        """
+        return None
 
-    def __eq__(self, other):
-        if hasattr(self, 'as_py'):
-            if isinstance(other, ArrayValue):
-                other = other.as_py()
-            return self.as_py() == other
-        else:
-            raise NotImplementedError(
-                "Cannot compare Arrow values that don't support as_py()")
 
-    def __hash__(self):
-        return hash(self.as_py())
+_NULL = NA = NullScalar()
 
 
-cdef class BooleanValue(ArrayValue):
+cdef class BooleanScalar(Scalar):
     """
-    Concrete class for boolean array elements.
+    Concrete class for boolean scalars.
     """
 
     def as_py(self):
         """
         Return this value as a Python bool.
         """
-        cdef CBooleanArray* ap = <CBooleanArray*> self.sp_array.get()
-        return ap.Value(self.index)
+        cdef CBooleanScalar* sp = <CBooleanScalar*> self.wrapped.get()
+        return sp.value if sp.is_valid else None
 
 
-cdef class Int8Value(ArrayValue):
+cdef class UInt8Scalar(Scalar):
     """
-    Concrete class for int8 array elements.
+    Concrete class for uint8 scalars.
     """
 
     def as_py(self):
         """
         Return this value as a Python int.
         """
-        cdef CInt8Array* ap = <CInt8Array*> self.sp_array.get()
-        return ap.Value(self.index)
+        cdef CUInt8Scalar* sp = <CUInt8Scalar*> self.wrapped.get()
+        return sp.value if sp.is_valid else None
 
 
-cdef class UInt8Value(ArrayValue):
+cdef class Int8Scalar(Scalar):
     """
-    Concrete class for uint8 array elements.
+    Concrete class for int8 scalars.
     """
 
     def as_py(self):
         """
         Return this value as a Python int.
         """
-        cdef CUInt8Array* ap = <CUInt8Array*> self.sp_array.get()
-        return ap.Value(self.index)
+        cdef CInt8Scalar* sp = <CInt8Scalar*> self.wrapped.get()
+        return sp.value if sp.is_valid else None
 
 
-cdef class Int16Value(ArrayValue):
+cdef class UInt16Scalar(Scalar):
     """
-    Concrete class for int16 array elements.
+    Concrete class for uint16 scalars.
     """
 
     def as_py(self):
         """
         Return this value as a Python int.
         """
-        cdef CInt16Array* ap = <CInt16Array*> self.sp_array.get()
-        return ap.Value(self.index)
+        cdef CUInt16Scalar* sp = <CUInt16Scalar*> self.wrapped.get()
+        return sp.value if sp.is_valid else None
 
 
-cdef class UInt16Value(ArrayValue):
+cdef class Int16Scalar(Scalar):
     """
-    Concrete class for uint16 array elements.
+    Concrete class for int16 scalars.
     """
 
     def as_py(self):
         """
         Return this value as a Python int.
         """
-        cdef CUInt16Array* ap = <CUInt16Array*> self.sp_array.get()
-        return ap.Value(self.index)
+        cdef CInt16Scalar* sp = <CInt16Scalar*> self.wrapped.get()
+        return sp.value if sp.is_valid else None
 
 
-cdef class Int32Value(ArrayValue):
+cdef class UInt32Scalar(Scalar):
     """
-    Concrete class for int32 array elements.
+    Concrete class for uint32 scalars.
     """
 
     def as_py(self):
         """
         Return this value as a Python int.
         """
-        cdef CInt32Array* ap = <CInt32Array*> self.sp_array.get()
-        return ap.Value(self.index)
+        cdef CUInt32Scalar* sp = <CUInt32Scalar*> self.wrapped.get()
+        return sp.value if sp.is_valid else None
 
 
-cdef class UInt32Value(ArrayValue):
+cdef class Int32Scalar(Scalar):
     """
-    Concrete class for uint32 array elements.
+    Concrete class for int32 scalars.
     """
 
     def as_py(self):
         """
         Return this value as a Python int.
         """
-        cdef CUInt32Array* ap = <CUInt32Array*> self.sp_array.get()
-        return ap.Value(self.index)
+        cdef CInt32Scalar* sp = <CInt32Scalar*> self.wrapped.get()
+        return sp.value if sp.is_valid else None
 
 
-cdef class Int64Value(ArrayValue):
+cdef class UInt64Scalar(Scalar):
     """
-    Concrete class for int64 array elements.
+    Concrete class for uint64 scalars.
     """
 
     def as_py(self):
         """
         Return this value as a Python int.
         """
-        cdef CInt64Array* ap = <CInt64Array*> self.sp_array.get()
-        return ap.Value(self.index)
+        cdef CUInt64Scalar* sp = <CUInt64Scalar*> self.wrapped.get()
+        return sp.value if sp.is_valid else None
 
 
-cdef class UInt64Value(ArrayValue):
+cdef class Int64Scalar(Scalar):
     """
-    Concrete class for uint64 array elements.
+    Concrete class for int64 scalars.
     """
 
     def as_py(self):
         """
         Return this value as a Python int.
         """
-        cdef CUInt64Array* ap = <CUInt64Array*> self.sp_array.get()
-        return ap.Value(self.index)
+        cdef CInt64Scalar* sp = <CInt64Scalar*> self.wrapped.get()
+        return sp.value if sp.is_valid else None
 
 
-cdef class Date32Value(ArrayValue):
+cdef class HalfFloatScalar(Scalar):
     """
-    Concrete class for date32 array elements.
+    Concrete class for float scalars.
     """
 
     def as_py(self):
         """
-        Return this value as a Python datetime.datetime instance.
+        Return this value as a Python float.
         """
-        cdef CDate32Array* ap = <CDate32Array*> self.sp_array.get()
+        cdef CHalfFloatScalar* sp = <CHalfFloatScalar*> self.wrapped.get()
+        return PyHalf_FromHalf(sp.value) if sp.is_valid else None
+
+
+cdef class FloatScalar(Scalar):
+    """
+    Concrete class for float scalars.
+    """
 
-        # Shift to seconds since epoch
-        return (datetime.date(1970, 1, 1) +
-                datetime.timedelta(days=ap.Value(self.index)))
+    def as_py(self):
+        """
+        Return this value as a Python float.
+        """
+        cdef CFloatScalar* sp = <CFloatScalar*> self.wrapped.get()
+        return sp.value if sp.is_valid else None
 
 
-cdef class Date64Value(ArrayValue):
+cdef class DoubleScalar(Scalar):
     """
-    Concrete class for date64 array elements.
+    Concrete class for double scalars.
     """
 
     def as_py(self):
         """
-        Return this value as a Python datetime.datetime instance.
+        Return this value as a Python float.
         """
-        cdef CDate64Array* ap = <CDate64Array*> self.sp_array.get()
-        return (datetime.date(1970, 1, 1) +
-                datetime.timedelta(
-                    days=ap.Value(self.index) / 86400000))
+        cdef CDoubleScalar* sp = <CDoubleScalar*> self.wrapped.get()
+        return sp.value if sp.is_valid else None
 
 
-cdef class Time32Value(ArrayValue):
+cdef class DecimalScalar(Scalar):
     """
-    Concrete class for time32 array elements.
+    Concrete class for decimal128 scalars.
     """
 
     def as_py(self):
         """
-        Return this value as a Python datetime.timedelta instance.
+        Return this value as a Python Decimal.
         """
         cdef:
-            CTime32Array* ap = <CTime32Array*> self.sp_array.get()
-            CTime32Type* dtype = <CTime32Type*> ap.type().get()
-
-        if dtype.unit() == TimeUnit_SECOND:
-            delta = datetime.timedelta(seconds=ap.Value(self.index))
-            return (datetime.datetime(1970, 1, 1) + delta).time()
+            CDecimal128Scalar* sp = <CDecimal128Scalar*> self.wrapped.get()
+            CDecimal128Type* dtype = <CDecimal128Type*> sp.type.get()
+        if sp.is_valid:
+            return _pydecimal.Decimal(
+                frombytes(sp.value.ToString(dtype.scale()))
+            )
         else:
-            return _box_time_milli(ap.Value(self.index))
+            return None
 
 
-cdef class Time64Value(ArrayValue):
+cdef class Date32Scalar(Scalar):
     """
-    Concrete class for time64 array elements.
+    Concrete class for date32 scalars.
     """
 
     def as_py(self):
         """
-        Return this value as a Python datetime.timedelta instance.
+        Return this value as a Python datetime.datetime instance.
         """
-        cdef:
-            CTime64Array* ap = <CTime64Array*> self.sp_array.get()
-            CTime64Type* dtype = <CTime64Type*> ap.type().get()
+        cdef CDate32Scalar* sp = <CDate32Scalar*> self.wrapped.get()
 
-        cdef int64_t val = ap.Value(self.index)
-        if dtype.unit() == TimeUnit_MICRO:
-            return _box_time_micro(val)
+        if sp.is_valid:
+            # shift to seconds since epoch
+            return (
+                datetime.date(1970, 1, 1) + datetime.timedelta(days=sp.value)
+            )
         else:
-            return (datetime.datetime(1970, 1, 1) +
-                    datetime.timedelta(microseconds=val / 1000)).time()
-
+            return None
 
-cpdef _box_time_milli(int64_t val):
-    delta = datetime.timedelta(milliseconds=val)
-    return (datetime.datetime(1970, 1, 1) + delta).time()
 
+cdef class Date64Scalar(Scalar):
+    """
+    Concrete class for date64 scalars.
+    """
 
-cpdef _box_time_micro(int64_t val):
-    return (datetime.datetime(1970, 1, 1) +
-            datetime.timedelta(microseconds=val)).time()
+    def as_py(self):
+        """
+        Return this value as a Python datetime.datetime instance.
+        """
+        cdef CDate64Scalar* sp = <CDate64Scalar*> self.wrapped.get()
 
+        if sp.is_valid:
+            return (
+                datetime.date(1970, 1, 1) +
+                datetime.timedelta(days=sp.value / 86400000)
+            )
+        else:
+            return None
 
-cdef dict _DATETIME_CONVERSION_FUNCTIONS = {}
-cdef c_bool _datetime_conversion_initialized = False
 
+def _datetime_from_int(int64_t value, TimeUnit unit, tzinfo=None):
+    if unit == TimeUnit_SECOND:
+        delta = datetime.timedelta(seconds=value)
+    elif unit == TimeUnit_MILLI:
+        delta = datetime.timedelta(milliseconds=value)
+    elif unit == TimeUnit_MICRO:
+        delta = datetime.timedelta(microseconds=value)
+    else:
+        # TimeUnit_NANO: prefer pandas timestamps if available
+        if _pandas_api.have_pandas:
+            return _pandas_api.pd.Timestamp(value, tz=tzinfo, unit='ns')
+        # otherwise safely truncate to microsecond resolution datetime
+        if value % 1000 != 0:
+            raise ValueError(
+                "Nanosecond resolution temporal type {} is not safely "
+                "convertible to microseconds to convert to datetime.datetime. "
+                "Install pandas to return as Timestamp with nanosecond "
+                "support or access the .value attribute.".format(value)
+            )
+        delta = datetime.timedelta(microseconds=value)
 
-cdef _add_micros_maybe_localize(dt, micros, tzinfo):
-    import pytz
-    dt = dt.replace(microsecond=micros)
+    dt = datetime.datetime(1970, 1, 1) + delta
+    # adjust timezone if set to the datatype
     if tzinfo is not None:
-        if not isinstance(tzinfo, datetime.tzinfo):
-            tzinfo = string_to_tzinfo(tzinfo)
         dt = tzinfo.fromutc(dt)
-    return dt
-
-
-cdef _datetime_from_seconds(int64_t v):
-    return datetime.datetime(1970, 1, 1) + datetime.timedelta(seconds=v)
-
-
-def _nanoseconds_to_datetime_safe(v, tzinfo):
-    if v % 1000 != 0:
-        raise ValueError("Nanosecond timestamp {} is not safely convertible "
-                         " to microseconds to convert to datetime.datetime."
-                         " Install pandas to return as Timestamp with "
-                         " nanosecond support or access the .value attribute.")
-    v = v // 1000
-    micros = v % 1_000_000
-
-    dt = _datetime_from_seconds(v // 1_000_000)
-    return _add_micros_maybe_localize(dt, micros, tzinfo)
-
-
-def _microseconds_to_datetime(v, tzinfo):
-    micros = v % 1_000_000
-    dt = _datetime_from_seconds(v // 1_000_000)
-    return _add_micros_maybe_localize(dt, micros, tzinfo)
 
+    return dt
 
-def _millis_to_datetime(v, tzinfo):
-    millis = v % 1_000
-    dt = _datetime_from_seconds(v // 1000)
-    return _add_micros_maybe_localize(dt, millis * 1000, tzinfo)
 
+cdef class Time32Scalar(Scalar):
+    """
+    Concrete class for time32 scalars.
+    """
 
-def _seconds_to_datetime(v, tzinfo):
-    dt = _datetime_from_seconds(v)
-    return _add_micros_maybe_localize(dt, 0, tzinfo)
+    def as_py(self):
+        """
+        Return this value as a Python datetime.timedelta instance.
+        """
+        cdef:
+            CTime32Scalar* sp = <CTime32Scalar*> self.wrapped.get()
+            CTime32Type* dtype = <CTime32Type*> sp.type.get()
 
+        if sp.is_valid:
+            return _datetime_from_int(sp.value, unit=dtype.unit()).time()
+        else:
+            return None
 
-def _datetime_conversion_functions():
-    global _datetime_conversion_initialized
-    if _datetime_conversion_initialized:
-        return _DATETIME_CONVERSION_FUNCTIONS
 
-    _DATETIME_CONVERSION_FUNCTIONS.update({
-        TimeUnit_SECOND: _seconds_to_datetime,
-        TimeUnit_MILLI: _millis_to_datetime,
-        TimeUnit_MICRO: _microseconds_to_datetime,
-        TimeUnit_NANO: _nanoseconds_to_datetime_safe
-    })
+cdef class Time64Scalar(Scalar):
+    """
+    Concrete class for time64 scalars.
+    """
 
-    try:
-        import pandas as pd
-        _DATETIME_CONVERSION_FUNCTIONS[TimeUnit_NANO] = (
-            lambda x, tzinfo: pd.Timestamp(
-                x, tz=tzinfo, unit='ns',
-            )
-        )
-    except ImportError:
-        pass
+    def as_py(self):
+        """
+        Return this value as a Python datetime.timedelta instance.
+        """
+        cdef:
+            CTime64Scalar* sp = <CTime64Scalar*> self.wrapped.get()
+            CTime64Type* dtype = <CTime64Type*> sp.type.get()
 
-    _datetime_conversion_initialized = True
-    return _DATETIME_CONVERSION_FUNCTIONS
+        if sp.is_valid:
+            return _datetime_from_int(sp.value, unit=dtype.unit()).time()
+        else:
+            return None
 
 
-cdef class TimestampValue(ArrayValue):
+cdef class TimestampScalar(Scalar):
     """
-    Concrete class for timestamp array elements.
+    Concrete class for timestamp scalars.
     """
 
     @property
     def value(self):
-        cdef CTimestampArray* ap = <CTimestampArray*> self.sp_array.get()
-        cdef CTimestampType* dtype = <CTimestampType*> ap.type().get()
-        return ap.Value(self.index)
+        cdef CTimestampScalar* sp = <CTimestampScalar*> self.wrapped.get()
+        return sp.value if sp.is_valid else None
 
     def as_py(self):
         """
         Return this value as a Pandas Timestamp instance (if available),
         otherwise as a Python datetime.timedelta instance.
         """
-        cdef CTimestampArray* ap = <CTimestampArray*> self.sp_array.get()
-        cdef CTimestampType* dtype = <CTimestampType*> ap.type().get()
+        cdef:
+            CTimestampScalar* sp = <CTimestampScalar*> self.wrapped.get()
+            CTimestampType* dtype = <CTimestampType*> sp.type.get()
 
-        value = self.value
+        if not sp.is_valid:
+            return None
 
         if not dtype.timezone().empty():
             tzinfo = string_to_tzinfo(frombytes(dtype.timezone()))
+            if not isinstance(tzinfo, datetime.tzinfo):
+                tzinfo = string_to_tzinfo(tzinfo)
         else:
             tzinfo = None
 
-        try:
-            converter = _datetime_conversion_functions()[dtype.unit()]
-        except KeyError:
-            raise ValueError(
-                'Cannot convert nanosecond timestamps without pandas'
-            )
-        return converter(value, tzinfo=tzinfo)
-
-
-cdef dict _TIMEDELTA_CONVERSION_FUNCTIONS = {}
-
-
-def _nanoseconds_to_timedelta_safe(v):
-    if v % 1000 != 0:
-        raise ValueError(
-            "Nanosecond duration {} is not safely convertible to microseconds "
-            "to convert to datetime.timedelta. Install pandas to return as "
-            "Timedelta with nanosecond support or access the .value "
-            "attribute.".format(v))
-    micros = v // 1000
-
-    return datetime.timedelta(microseconds=micros)
-
+        return _datetime_from_int(sp.value, unit=dtype.unit(), tzinfo=tzinfo)
 
-def _timedelta_conversion_functions():
-    if _TIMEDELTA_CONVERSION_FUNCTIONS:
-        return _TIMEDELTA_CONVERSION_FUNCTIONS
 
-    _TIMEDELTA_CONVERSION_FUNCTIONS.update({
-        TimeUnit_SECOND: lambda v: datetime.timedelta(seconds=v),
-        TimeUnit_MILLI: lambda v: datetime.timedelta(milliseconds=v),
-        TimeUnit_MICRO: lambda v: datetime.timedelta(microseconds=v),
-        TimeUnit_NANO: _nanoseconds_to_timedelta_safe
-    })
-
-    try:
-        import pandas as pd
-        _TIMEDELTA_CONVERSION_FUNCTIONS[TimeUnit_NANO] = (
-            lambda v: pd.Timedelta(v, unit='ns')
-        )
-    except ImportError:
-        pass
-
-    return _TIMEDELTA_CONVERSION_FUNCTIONS
-
-
-cdef class DurationValue(ArrayValue):
+cdef class DurationScalar(Scalar):
     """
-    Concrete class for duration array elements.
+    Concrete class for duration scalars.
     """
 
     @property
     def value(self):
-        cdef CDurationArray* ap = <CDurationArray*> self.sp_array.get()
-        return ap.Value(self.index)
+        cdef CDurationScalar* sp = <CDurationScalar*> self.wrapped.get()
+        return sp.value if sp.is_valid else None
 
     def as_py(self):
         """
         Return this value as a Pandas Timestamp instance (if available),
         otherwise as a Python datetime.timedelta instance.
         """
-        cdef CDurationArray* ap = <CDurationArray*> self.sp_array.get()
-        cdef CDurationType* dtype = <CDurationType*> ap.type().get()
-
-        cdef int64_t value = ap.Value(self.index)
-        converter = _timedelta_conversion_functions()[dtype.unit()]
-        return converter(value)
-
+        cdef:
+            CDurationScalar* sp = <CDurationScalar*> self.wrapped.get()
+            CDurationType* dtype = <CDurationType*> sp.type.get()
+            TimeUnit unit = dtype.unit()
 
-cdef class HalfFloatValue(ArrayValue):
-    """
-    Concrete class for float16 array elements.
-    """
+        if not sp.is_valid:
+            return None
 
-    def as_py(self):
-        """
-        Return this value as a Python float.
-        """
-        cdef CHalfFloatArray* ap = <CHalfFloatArray*> self.sp_array.get()
-        return PyHalf_FromHalf(ap.Value(self.index))
+        if unit == TimeUnit_SECOND:
+            return datetime.timedelta(seconds=sp.value)
+        elif unit == TimeUnit_MILLI:
+            return datetime.timedelta(milliseconds=sp.value)
+        elif unit == TimeUnit_MICRO:
+            return datetime.timedelta(microseconds=sp.value)
+        else:
+            # TimeUnit_NANO: prefer pandas timestamps if available
+            if _pandas_api.have_pandas:
+                return _pandas_api.pd.Timedelta(sp.value, unit='ns')
+            # otherwise safely truncate to microsecond resolution timedelta
+            if sp.value % 1000 != 0:
+                raise ValueError(
+                    "Nanosecond duration {} is not safely convertible to "
+                    "microseconds to convert to datetime.timedelta. Install "
+                    "pandas to return as Timedelta with nanosecond support or "
+                    "access the .value attribute.".format(sp.value)
+                )
+            return datetime.timedelta(microseconds=sp.value // 1000)
 
 
-cdef class FloatValue(ArrayValue):
+cdef class BinaryScalar(Scalar):
     """
-    Concrete class for float32 array elements.
+    Concrete class for binary-like scalars.
     """
 
-    def as_py(self):
+    def as_buffer(self):
         """
-        Return this value as a Python float.
+        Return a view over this value as a Buffer object.
         """
-        cdef CFloatArray* ap = <CFloatArray*> self.sp_array.get()
-        return ap.Value(self.index)
-
-
-cdef class DoubleValue(ArrayValue):
-    """
-    Concrete class for float64 array elements.
-    """
+        cdef CBinaryScalar* sp = <CBinaryScalar*> self.wrapped.get()
+        return pyarrow_wrap_buffer(sp.value) if sp.is_valid else None
 
     def as_py(self):
         """
-        Return this value as a Python float.
+        Return this value as a Python bytes.
         """
-        cdef CDoubleArray* ap = <CDoubleArray*> self.sp_array.get()
-        return ap.Value(self.index)
+        buffer = self.as_buffer()
+        if buffer is not None:
+            return self.as_buffer().to_pybytes()
+        else:
+            return None
 
 
-cdef class DecimalValue(ArrayValue):
-    """
-    Concrete class for decimal128 array elements.
-    """
+cdef class LargeBinaryScalar(BinaryScalar):
+    pass
 
-    def as_py(self):
-        """
-        Return this value as a Python Decimal.
-        """
-        cdef:
-            CDecimal128Array* ap = <CDecimal128Array*> self.sp_array.get()
-            c_string s = ap.FormatValue(self.index)
-        return _pydecimal.Decimal(s.decode('utf8'))
 
+cdef class FixedSizeBinaryScalar(BinaryScalar):
+    pass
 
-cdef class StringValue(ArrayValue):
+
+cdef class StringScalar(BinaryScalar):
     """
-    Concrete class for string (utf8) array elements.
+    Concrete class for string-like (utf8) scalars.
     """
 
     def as_py(self):
         """
-        Return this value as a Python unicode string.
+        Return this value as a Python string.
         """
-        cdef CStringArray* ap = <CStringArray*> self.sp_array.get()
-        return ap.GetString(self.index).decode('utf-8')
+        buffer = self.as_buffer()
+        if buffer is not None:
+            return frombytes(self.as_buffer().to_pybytes())
+        else:
+            return None
 
-    def as_buffer(self):
-        """
-        Return a view over this value as a Buffer object.
-        """
-        cdef:
-            CStringArray* ap = <CStringArray*> self.sp_array.get()
-            shared_ptr[CBuffer] buf
 
-        buf = SliceBuffer(ap.value_data(), ap.value_offset(self.index),
-                          ap.value_length(self.index))
-        return pyarrow_wrap_buffer(buf)
+cdef class LargeStringScalar(StringScalar):
+    pass
 
 
-cdef class LargeStringValue(ArrayValue):
+cdef class ListScalar(Scalar):
     """
-    Concrete class for large string (utf8) array elements.
+    Concrete class for list-like scalars.
     """
 
-    def as_py(self):
+    cdef array(self):
+        cdef CListScalar* sp = <CListScalar*> self.wrapped.get()
+        if sp.is_valid:
+            return pyarrow_wrap_array(sp.value)
+        else:
+            return None
+
+    def __len__(self):
         """
-        Return this value as a Python unicode string.
+        Return the number of values.
         """
-        cdef CLargeStringArray* ap = <CLargeStringArray*> self.sp_array.get()
-        return ap.GetString(self.index).decode('utf-8')
+        return len(self.array())
 
-    def as_buffer(self):
+    def __getitem__(self, i):
         """
-        Return a view over this value as a Buffer object.
+        Return the value at the given index.
         """
-        cdef:
-            CLargeStringArray* ap = <CLargeStringArray*> self.sp_array.get()
-            shared_ptr[CBuffer] buf
-
-        buf = SliceBuffer(ap.value_data(), ap.value_offset(self.index),
-                          ap.value_length(self.index))
-        return pyarrow_wrap_buffer(buf)
+        return self.array()[_normalize_index(i, len(self))]
 
-
-cdef class BinaryValue(ArrayValue):
-    """
-    Concrete class for variable-sized binary array elements.
-    """
+    def __iter__(self):
+        """
+        Iterate over this element's values.
+        """
+        return iter(self.array())
 
     def as_py(self):
         """
-        Return this value as a Python bytes object.
+        Return this value as a Python list.
         """
-        cdef:
-            const uint8_t* ptr
-            int32_t length
-            CBinaryArray* ap = <CBinaryArray*> self.sp_array.get()
+        arr = self.array()
+        return None if arr is None else arr.to_pylist()
 
-        ptr = ap.GetValue(self.index, &length)
-        return cp.PyBytes_FromStringAndSize(<const char*>(ptr), length)
 
-    def as_buffer(self):
-        """
-        Return a view over this value as a Buffer object.
-        """
-        cdef:
-            CBinaryArray* ap = <CBinaryArray*> self.sp_array.get()
-            shared_ptr[CBuffer] buf
+cdef class FixedSizeListScalar(ListScalar):
+    pass
 
-        buf = SliceBuffer(ap.value_data(), ap.value_offset(self.index),
-                          ap.value_length(self.index))
-        return pyarrow_wrap_buffer(buf)
 
+cdef class LargeListScalar(ListScalar):
+    pass
 
-cdef class LargeBinaryValue(ArrayValue):
+
+cdef class StructScalar(Scalar):
     """
-    Concrete class for large variable-sized binary array elements.
+    Concrete class for struct scalars.
     """
 
-    def as_py(self):
+    def __len__(self):
+        cdef CStructScalar* sp = <CStructScalar*> self.wrapped.get()
+        return sp.value.size()
+
+    def __getitem__(self, key):
         """
-        Return this value as a Python bytes object.
+        Return the child value for the given field.
+
+        Parameters
+        ----------
+        index : Union[int, str]
+            Index / position or name of the field.
+
+        Returns
+        -------
+        result : Scalar
         """
         cdef:
-            const uint8_t* ptr
-            int64_t length
-            CLargeBinaryArray* ap = <CLargeBinaryArray*> self.sp_array.get()
+            CFieldRef ref
+            CStructScalar* sp = <CStructScalar*> self.wrapped.get()
+
+        if isinstance(key, (bytes, str)):
+            ref = CFieldRef(<c_string> tobytes(key))
+        elif isinstance(key, int):
+            ref = CFieldRef(<int> key)
+        else:
+            raise TypeError('Expected integer or string index')
 
-        ptr = ap.GetValue(self.index, &length)
-        return cp.PyBytes_FromStringAndSize(<const char*>(ptr), length)
+        try:
+            return Scalar.wrap(GetResultValue(sp.field(ref)))
+        except ArrowInvalid:
+            raise IndexError(key)
 
-    def as_buffer(self):
+    def as_py(self):
         """
-        Return a view over this value as a Buffer object.
+        Return this value as a Python dict.
         """
         cdef:
-            CLargeBinaryArray* ap = <CLargeBinaryArray*> self.sp_array.get()
-            shared_ptr[CBuffer] buf
+            CStructScalar* sp = <CStructScalar*> self.wrapped.get()
+            CStructType* dtype = <CStructType*> sp.type.get()
+            vector[shared_ptr[CField]] fields = dtype.fields()
 
-        buf = SliceBuffer(ap.value_data(), ap.value_offset(self.index),
-                          ap.value_length(self.index))
-        return pyarrow_wrap_buffer(buf)
+        if sp.is_valid:
+            return {frombytes(fields[i].get().name()): Scalar.wrap(sp.value[i])

Review comment:
       Should we also call `as_py()` directly on the values in the dict?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org