You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ze...@apache.org on 2022/03/17 23:15:15 UTC

[spark] branch master updated: [SPARK-37425][PYTHON] Inline type hints for python/pyspark/mllib/recommendation.py

This is an automated email from the ASF dual-hosted git repository.

zero323 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 2d1d18a  [SPARK-37425][PYTHON] Inline type hints for python/pyspark/mllib/recommendation.py
2d1d18a is described below

commit 2d1d18ab4268eb5ba2ff22f74a5b3b85988d65b4
Author: dch nguyen <dc...@gmail.com>
AuthorDate: Fri Mar 18 00:14:04 2022 +0100

    [SPARK-37425][PYTHON] Inline type hints for python/pyspark/mllib/recommendation.py
    
    ### What changes were proposed in this pull request?
    Inline the type hints for python/pyspark/mllib/recommendation.py and remove the now-redundant stub file python/pyspark/mllib/recommendation.pyi.
    
    ### Why are the changes needed?
    Inlining the type hints lets static type checking cover the function bodies themselves, rather than only the signatures declared in a separate stub file.
    
    ### Does this PR introduce _any_ user-facing change?
    No
    
    ### How was this patch tested?
    Existing tests
    
    Closes #35766 from dchvn/SPARK-37425.
    
    Authored-by: dch nguyen <dc...@gmail.com>
    Signed-off-by: zero323 <ms...@gmail.com>
---
 python/pyspark/mllib/recommendation.py  | 70 +++++++++++++++++++-------------
 python/pyspark/mllib/recommendation.pyi | 71 ---------------------------------
 2 files changed, 42 insertions(+), 99 deletions(-)

diff --git a/python/pyspark/mllib/recommendation.py b/python/pyspark/mllib/recommendation.py
index cb412fb..55eae10 100644
--- a/python/pyspark/mllib/recommendation.py
+++ b/python/pyspark/mllib/recommendation.py
@@ -17,7 +17,7 @@
 
 import array
 import sys
-from collections import namedtuple
+from typing import Any, List, NamedTuple, Optional, Tuple, Type, Union
 
 from pyspark import SparkContext, since
 from pyspark.rdd import RDD
@@ -28,7 +28,7 @@ from pyspark.sql import DataFrame
 __all__ = ["MatrixFactorizationModel", "ALS", "Rating"]
 
 
-class Rating(namedtuple("Rating", ["user", "product", "rating"])):
+class Rating(NamedTuple):
     """
     Represents a (user, product, rating) tuple.
 
@@ -43,12 +43,18 @@ class Rating(namedtuple("Rating", ["user", "product", "rating"])):
     (1, 2, 5.0)
     """
 
-    def __reduce__(self):
+    user: int
+    product: int
+    rating: float
+
+    def __reduce__(self) -> Tuple[Type["Rating"], Tuple[int, int, float]]:
         return Rating, (int(self.user), int(self.product), float(self.rating))
 
 
 @inherit_doc
-class MatrixFactorizationModel(JavaModelWrapper, JavaSaveable, JavaLoader):
+class MatrixFactorizationModel(
+    JavaModelWrapper, JavaSaveable, JavaLoader["MatrixFactorizationModel"]
+):
 
     """A matrix factorisation model trained by regularized alternating
     least-squares.
@@ -135,14 +141,14 @@ class MatrixFactorizationModel(JavaModelWrapper, JavaSaveable, JavaLoader):
     """
 
     @since("0.9.0")
-    def predict(self, user, product):
+    def predict(self, user: int, product: int) -> float:
         """
         Predicts rating for the given user and product.
         """
         return self._java_model.predict(int(user), int(product))
 
     @since("0.9.0")
-    def predictAll(self, user_product):
+    def predictAll(self, user_product: RDD[Tuple[int, int]]) -> RDD[Rating]:
         """
         Returns a list of predicted ratings for input user and product
         pairs.
@@ -154,7 +160,7 @@ class MatrixFactorizationModel(JavaModelWrapper, JavaSaveable, JavaLoader):
         return self.call("predict", user_product)
 
     @since("1.2.0")
-    def userFeatures(self):
+    def userFeatures(self) -> RDD[Tuple[int, array.array]]:
         """
         Returns a paired RDD, where the first element is the user and the
         second is an array of features corresponding to that user.
@@ -162,7 +168,7 @@ class MatrixFactorizationModel(JavaModelWrapper, JavaSaveable, JavaLoader):
         return self.call("getUserFeatures").mapValues(lambda v: array.array("d", v))
 
     @since("1.2.0")
-    def productFeatures(self):
+    def productFeatures(self) -> RDD[Tuple[int, array.array]]:
         """
         Returns a paired RDD, where the first element is the product and the
         second is an array of features corresponding to that product.
@@ -170,7 +176,7 @@ class MatrixFactorizationModel(JavaModelWrapper, JavaSaveable, JavaLoader):
         return self.call("getProductFeatures").mapValues(lambda v: array.array("d", v))
 
     @since("1.4.0")
-    def recommendUsers(self, product, num):
+    def recommendUsers(self, product: int, num: int) -> List[Rating]:
         """
         Recommends the top "num" number of users for a given product and
         returns a list of Rating objects sorted by the predicted rating in
@@ -179,7 +185,7 @@ class MatrixFactorizationModel(JavaModelWrapper, JavaSaveable, JavaLoader):
         return list(self.call("recommendUsers", product, num))
 
     @since("1.4.0")
-    def recommendProducts(self, user, num):
+    def recommendProducts(self, user: int, num: int) -> List[Rating]:
         """
         Recommends the top "num" number of products for a given user and
         returns a list of Rating objects sorted by the predicted rating in
@@ -187,14 +193,14 @@ class MatrixFactorizationModel(JavaModelWrapper, JavaSaveable, JavaLoader):
         """
         return list(self.call("recommendProducts", user, num))
 
-    def recommendProductsForUsers(self, num):
+    def recommendProductsForUsers(self, num: int) -> RDD[Tuple[int, Tuple[Rating, ...]]]:
         """
         Recommends the top "num" number of products for all users. The
         number of recommendations returned per user may be less than "num".
         """
         return self.call("wrappedRecommendProductsForUsers", num)
 
-    def recommendUsersForProducts(self, num):
+    def recommendUsersForProducts(self, num: int) -> RDD[Tuple[int, Tuple[Rating, ...]]]:
         """
         Recommends the top "num" number of users for all products. The
         number of recommendations returned per product may be less than
@@ -202,17 +208,18 @@ class MatrixFactorizationModel(JavaModelWrapper, JavaSaveable, JavaLoader):
         """
         return self.call("wrappedRecommendUsersForProducts", num)
 
-    @property
+    @property  # type: ignore[misc]
     @since("1.4.0")
-    def rank(self):
+    def rank(self) -> int:
         """Rank for the features in this model"""
         return self.call("rank")
 
     @classmethod
     @since("1.3.1")
-    def load(cls, sc, path):
+    def load(cls, sc: SparkContext, path: str) -> "MatrixFactorizationModel":
         """Load a model from the given path"""
         model = cls._load_java(sc, path)
+        assert sc._jvm is not None
         wrapper = sc._jvm.org.apache.spark.mllib.api.python.MatrixFactorizationModelWrapper(model)
         return MatrixFactorizationModel(wrapper)
 
@@ -224,7 +231,7 @@ class ALS:
     """
 
     @classmethod
-    def _prepare(cls, ratings):
+    def _prepare(cls, ratings: Any) -> RDD[Rating]:
         if isinstance(ratings, RDD):
             pass
         elif isinstance(ratings, DataFrame):
@@ -245,8 +252,15 @@ class ALS:
 
     @classmethod
     def train(
-        cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, nonnegative=False, seed=None
-    ):
+        cls,
+        ratings: Union[RDD[Rating], RDD[Tuple[int, int, float]]],
+        rank: int,
+        iterations: int = 5,
+        lambda_: float = 0.01,
+        blocks: int = -1,
+        nonnegative: bool = False,
+        seed: Optional[int] = None,
+    ) -> MatrixFactorizationModel:
         """
         Train a matrix factorization model given an RDD of ratings by users
         for a subset of products. The ratings matrix is approximated as the
@@ -296,15 +310,15 @@ class ALS:
     @classmethod
     def trainImplicit(
         cls,
-        ratings,
-        rank,
-        iterations=5,
-        lambda_=0.01,
-        blocks=-1,
-        alpha=0.01,
-        nonnegative=False,
-        seed=None,
-    ):
+        ratings: Union[RDD[Rating], RDD[Tuple[int, int, float]]],
+        rank: int,
+        iterations: int = 5,
+        lambda_: float = 0.01,
+        blocks: int = -1,
+        alpha: float = 0.01,
+        nonnegative: bool = False,
+        seed: Optional[int] = None,
+    ) -> MatrixFactorizationModel:
         """
         Train a matrix factorization model given an RDD of 'implicit
         preferences' of users for a subset of products. The ratings matrix
@@ -356,7 +370,7 @@ class ALS:
         return MatrixFactorizationModel(model)
 
 
-def _test():
+def _test() -> None:
     import doctest
     import pyspark.mllib.recommendation
     from pyspark.sql import SQLContext
diff --git a/python/pyspark/mllib/recommendation.pyi b/python/pyspark/mllib/recommendation.pyi
deleted file mode 100644
index 43d2872..0000000
--- a/python/pyspark/mllib/recommendation.pyi
+++ /dev/null
@@ -1,71 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from typing import List, Optional, Tuple, Type, Union
-
-import array
-from collections import namedtuple
-
-from pyspark.context import SparkContext
-from pyspark.rdd import RDD
-from pyspark.mllib.common import JavaModelWrapper
-from pyspark.mllib.util import JavaLoader, JavaSaveable
-
-class Rating(namedtuple("Rating", ["user", "product", "rating"])):
-    def __reduce__(self) -> Tuple[Type[Rating], Tuple[int, int, float]]: ...
-
-class MatrixFactorizationModel(
-    JavaModelWrapper, JavaSaveable, JavaLoader[MatrixFactorizationModel]
-):
-    def predict(self, user: int, product: int) -> float: ...
-    def predictAll(self, user_product: RDD[Tuple[int, int]]) -> RDD[Rating]: ...
-    def userFeatures(self) -> RDD[Tuple[int, array.array]]: ...
-    def productFeatures(self) -> RDD[Tuple[int, array.array]]: ...
-    def recommendUsers(self, product: int, num: int) -> List[Rating]: ...
-    def recommendProducts(self, user: int, num: int) -> List[Rating]: ...
-    def recommendProductsForUsers(self, num: int) -> RDD[Tuple[int, Tuple[Rating, ...]]]: ...
-    def recommendUsersForProducts(self, num: int) -> RDD[Tuple[int, Tuple[Rating, ...]]]: ...
-    @property
-    def rank(self) -> int: ...
-    @classmethod
-    def load(cls, sc: SparkContext, path: str) -> MatrixFactorizationModel: ...
-
-class ALS:
-    @classmethod
-    def train(
-        cls,
-        ratings: Union[RDD[Rating], RDD[Tuple[int, int, float]]],
-        rank: int,
-        iterations: int = ...,
-        lambda_: float = ...,
-        blocks: int = ...,
-        nonnegative: bool = ...,
-        seed: Optional[int] = ...,
-    ) -> MatrixFactorizationModel: ...
-    @classmethod
-    def trainImplicit(
-        cls,
-        ratings: Union[RDD[Rating], RDD[Tuple[int, int, float]]],
-        rank: int,
-        iterations: int = ...,
-        lambda_: float = ...,
-        blocks: int = ...,
-        alpha: float = ...,
-        nonnegative: bool = ...,
-        seed: Optional[int] = ...,
-    ) -> MatrixFactorizationModel: ...

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org