Posted to reviews@spark.apache.org by GitBox <gi...@apache.org> on 2021/10/12 23:25:58 UTC

[GitHub] [spark] ueshin commented on a change in pull request #34174: [SPARK-36910][PYTHON] Inline type hints for python/pyspark/sql/types.py

ueshin commented on a change in pull request #34174:
URL: https://github.com/apache/spark/pull/34174#discussion_r727570315



##########
File path: python/pyspark/sql/types.py
##########
@@ -95,12 +110,13 @@ def fromInternal(self, obj):
 class DataTypeSingleton(type):
     """Metaclass for DataType"""
 
-    _instances = {}
+    _instances: Dict = {}

Review comment:
       How about:
   ```py
   _instances: Dict[Type["DataTypeSingleton"], "DataTypeSingleton"] = {}
   ```

##########
File path: python/pyspark/pandas/frame.py
##########
@@ -6429,14 +6429,14 @@ def select_dtypes(
         include_spark_type = []
         for inc in include_list:
             try:
-                include_spark_type.append(_parse_datatype_string(inc))
+                include_spark_type.append(_parse_datatype_string(cast(str, inc)))

Review comment:
       Shall we rather specify the type of `include_list`?
   
   at line 6408-6411:
   ```py
           include_list: List[str]
           if not is_list_like(include):
               include_list = [cast(str, include)] if include is not None else []
           else:
               include_list = list(include)
   ```

##########
File path: python/pyspark/sql/types.py
##########
@@ -1446,25 +1517,25 @@ def verify_struct(obj):
         verify_value = verify_struct
 
     else:
-        def verify_default(obj):
+        def verify_default(obj: Any) -> None:
             assert_acceptable_types(obj)
             verify_acceptable_types(obj)
 
         verify_value = verify_default
 
-    def verify(obj):
+    def verify(obj: Any) -> None:
         if not verify_nullability(obj):
             verify_value(obj)
 
     return verify
 
 
 # This is used to unpickle a Row from JVM
-def _create_row_inbound_converter(dataType):
+def _create_row_inbound_converter(dataType: DataType) -> Callable:
     return lambda *a: dataType.fromInternal(a)
 
 
-def _create_row(fields, values):
+def _create_row(fields: Any, values: Any) -> "Row":

Review comment:
       I guess:
   
   ```py
   def _create_row(fields: List[str], values: List[Any]) -> "Row":
   ```

##########
File path: python/pyspark/sql/types.py
##########
@@ -1161,18 +1224,21 @@ def _merge_type(a, b, name=None):
         return StructType(fields)
 
     elif isinstance(a, ArrayType):
-        return ArrayType(_merge_type(a.elementType, b.elementType,
+        return ArrayType(_merge_type(a.elementType, b.elementType,  # type: ignore[attr-defined]
                                      name='element in array %s' % name), True)
 
     elif isinstance(a, MapType):
-        return MapType(_merge_type(a.keyType, b.keyType, name='key of map %s' % name),
-                       _merge_type(a.valueType, b.valueType, name='value of map %s' % name),
-                       True)
+        return MapType(
+            _merge_type(a.keyType, b.keyType,  # type: ignore[attr-defined]
+                        name='key of map %s' % name),
+            _merge_type(a.valueType, b.valueType,  # type: ignore[attr-defined]
+                        name='value of map %s' % name),

Review comment:
       I guess we can also avoid `ignore[attr-defined]` here.
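
   e.g., something along these lines, using `cast` as in the other branches (just a sketch; assumes `cast` is already imported from `typing` in this file):
   ```py
   elif isinstance(a, MapType):
       # narrow b with cast instead of ignore[attr-defined]
       return MapType(
           _merge_type(a.keyType, cast(MapType, b).keyType,
                       name='key of map %s' % name),
           _merge_type(a.valueType, cast(MapType, b).valueType,
                       name='value of map %s' % name),
           True)
   ```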

##########
File path: python/pyspark/sql/types.py
##########
@@ -1015,7 +1065,7 @@ def _int_size_to_type(size):
     size = ctypes.sizeof(_array_signed_int_typecode_ctype_mappings[_typecode]) * 8
     dt = _int_size_to_type(size)
     if dt is not None:
-        _array_type_mappings[_typecode] = dt
+        _array_type_mappings[_typecode] = dt  # type: ignore[assignment]

Review comment:
       Shall we rather specify the type of `_array_type_mappings`?
   
   ```py
   _array_type_mappings: Dict[str, Type[DataType]] = {
       ...
   }
   ```

##########
File path: python/pyspark/sql/types.py
##########
@@ -1150,7 +1213,7 @@ def _merge_type(a, b, name=None):
 
     # same type
     if isinstance(a, StructType):
-        nfs = dict((f.name, f.dataType) for f in b.fields)
+        nfs = dict((f.name, f.dataType) for f in b.fields)  # type: ignore[attr-defined]

Review comment:
       We should use `cast(StructType, b).fields`?
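
   i.e., something like this (a sketch; assumes `cast` is imported from `typing`):
   ```py
   # same type
   if isinstance(a, StructType):
       nfs = dict((f.name, f.dataType) for f in cast(StructType, b).fields)
   ```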

##########
File path: python/pyspark/sql/types.py
##########
@@ -522,7 +546,27 @@ def __init__(self, fields=None):
         self._needConversion = [f.needConversion() for f in self]
         self._needSerializeAnyField = any(self._needConversion)
 
-    def add(self, field, data_type=None, nullable=True, metadata=None):
+    @overload
+    def add(
+        self,
+        field: str,
+        data_type: Union[str, DataType],
+        nullable: bool = True,
+        metadata: Optional[Dict[str, Any]] = None,
+    ) -> "StructType":
+        ...
+
+    @overload
+    def add(self, field: StructField) -> "StructType":
+        ...
+
+    def add(
+        self,
+        field: Union[str, StructField],
+        data_type: Optional[Union[str, DataType]] = None,
+        nullable: Optional[bool] = True,

Review comment:
       `nullable: bool = True`?

##########
File path: python/pyspark/sql/types.py
##########
@@ -439,7 +457,13 @@ class StructField(DataType):
     False
     """
 
-    def __init__(self, name, dataType, nullable=True, metadata=None):
+    def __init__(
+        self,
+        name: str,
+        dataType: DataType,
+        nullable: Optional[bool] = True,

Review comment:
       `nullable: bool = True`?

##########
File path: python/pyspark/sql/types.py
##########
@@ -232,18 +248,18 @@ class DecimalType(FractionalType):
         the number of digits on right side of dot. (default: 0)
     """
 
-    def __init__(self, precision=10, scale=0):
+    def __init__(self, precision: int = 10, scale: int = 0) -> None:

Review comment:
       nit: I don't think we need `-> None` for the initializer.

##########
File path: python/pyspark/pandas/frame.py
##########
@@ -6429,14 +6429,14 @@ def select_dtypes(
         include_spark_type = []
         for inc in include_list:
             try:
-                include_spark_type.append(_parse_datatype_string(inc))
+                include_spark_type.append(_parse_datatype_string(cast(str, inc)))
             except:
                 pass
 
         exclude_spark_type = []
         for exc in exclude_list:
             try:
-                exclude_spark_type.append(_parse_datatype_string(exc))
+                exclude_spark_type.append(_parse_datatype_string(cast(str, exc)))

Review comment:
       ditto.
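
   i.e., annotate `exclude_list` the same way (a sketch; assumes the `exclude` handling mirrors the `include` handling above):
   ```py
           exclude_list: List[str]
           if not is_list_like(exclude):
               exclude_list = [cast(str, exclude)] if exclude is not None else []
           else:
               exclude_list = list(exclude)
   ```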

##########
File path: python/pyspark/sql/types.py
##########
@@ -1083,22 +1137,31 @@ def _infer_type(obj, infer_dict_as_struct=False, prefer_timestamp_ntz=False):
             raise TypeError("not supported type: %s" % type(obj))
 
 
-def _infer_schema(row, names=None, infer_dict_as_struct=False, prefer_timestamp_ntz=False):
+def _infer_schema(
+    row: Any,
+    names: Optional[str] = None,
+    infer_dict_as_struct: bool = False,
+    prefer_timestamp_ntz: bool = False,
+) -> StructType:
     """Infer the schema from dict/namedtuple/object"""
     if isinstance(row, dict):
         items = sorted(row.items())
 
     elif isinstance(row, (tuple, list)):
         if hasattr(row, "__fields__"):  # Row
-            items = zip(row.__fields__, tuple(row))
+            items: zip[Tuple[Any, Any]] = zip(  # type: ignore[no-redef]

Review comment:
       Could you explicitly specify a proper type of `items` beforehand and avoid using `ignore[no-redef]`?
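
   For example, something along these lines (just a sketch; the exact element type and the `Iterable` import from `typing` are assumptions on my side):
   ```py
   # declare items once up front so the branches don't redefine it
   items: Iterable[Tuple[str, Any]]
   if isinstance(row, dict):
       items = sorted(row.items())
   elif isinstance(row, (tuple, list)):
       if hasattr(row, "__fields__"):  # Row
           items = zip(row.__fields__, tuple(row))
   ```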

##########
File path: python/pyspark/sql/types.py
##########
@@ -677,7 +721,7 @@ def fromInternal(self, obj):
             values = [f.fromInternal(v) if c else v
                       for f, v, c in zip(self.fields, obj, self._needConversion)]
         else:
-            values = obj
+            values: Tuple = obj  # type: ignore[no-redef]

Review comment:
       ```py
   values: Tuple
   if self._needSerializeAnyField:
       ...
   else:
       ...
   ```

##########
File path: python/pyspark/sql/types.py
##########
@@ -1055,7 +1109,7 @@ def _infer_type(obj, infer_dict_as_struct=False, prefer_timestamp_ntz=False):
             struct = StructType()
             for key, value in obj.items():
                 if key is not None and value is not None:
-                    struct.add(
+                    struct.add(  # type: ignore[call-overload]

Review comment:
       Do we need this?

##########
File path: python/pyspark/sql/types.py
##########
@@ -1161,18 +1224,21 @@ def _merge_type(a, b, name=None):
         return StructType(fields)
 
     elif isinstance(a, ArrayType):
-        return ArrayType(_merge_type(a.elementType, b.elementType,
+        return ArrayType(_merge_type(a.elementType, b.elementType,  # type: ignore[attr-defined]

Review comment:
       Could you avoid using `ignore[attr-defined]`?
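
   e.g. with a `cast`, similar to the `StructType` suggestion above (a sketch; assumes `cast` is imported from `typing`):
   ```py
   elif isinstance(a, ArrayType):
       return ArrayType(_merge_type(a.elementType, cast(ArrayType, b).elementType,
                                    name='element in array %s' % name), True)
   ```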




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


