You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ze...@apache.org on 2022/03/17 23:15:15 UTC
[spark] branch master updated: [SPARK-37425][PYTHON] Inline type hints for python/pyspark/mllib/recommendation.py
This is an automated email from the ASF dual-hosted git repository.
zero323 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 2d1d18a [SPARK-37425][PYTHON] Inline type hints for python/pyspark/mllib/recommendation.py
2d1d18a is described below
commit 2d1d18ab4268eb5ba2ff22f74a5b3b85988d65b4
Author: dch nguyen <dc...@gmail.com>
AuthorDate: Fri Mar 18 00:14:04 2022 +0100
[SPARK-37425][PYTHON] Inline type hints for python/pyspark/mllib/recommendation.py
### What changes were proposed in this pull request?
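Inline the type hints for python/pyspark/mllib/recommendation.py directly into the module and remove the separate stub file python/pyspark/mllib/recommendation.pyi.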
### Why are the changes needed?
Inlining the type hints lets static type checkers verify the function bodies themselves, not only the declared signatures that a separate .pyi stub exposes.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
Existing tests
Closes #35766 from dchvn/SPARK-37425.
Authored-by: dch nguyen <dc...@gmail.com>
Signed-off-by: zero323 <ms...@gmail.com>
---
python/pyspark/mllib/recommendation.py | 70 +++++++++++++++++++-------------
python/pyspark/mllib/recommendation.pyi | 71 ---------------------------------
2 files changed, 42 insertions(+), 99 deletions(-)
diff --git a/python/pyspark/mllib/recommendation.py b/python/pyspark/mllib/recommendation.py
index cb412fb..55eae10 100644
--- a/python/pyspark/mllib/recommendation.py
+++ b/python/pyspark/mllib/recommendation.py
@@ -17,7 +17,7 @@
import array
import sys
-from collections import namedtuple
+from typing import Any, List, NamedTuple, Optional, Tuple, Type, Union
from pyspark import SparkContext, since
from pyspark.rdd import RDD
@@ -28,7 +28,7 @@ from pyspark.sql import DataFrame
__all__ = ["MatrixFactorizationModel", "ALS", "Rating"]
-class Rating(namedtuple("Rating", ["user", "product", "rating"])):
+class Rating(NamedTuple):
"""
Represents a (user, product, rating) tuple.
@@ -43,12 +43,18 @@ class Rating(namedtuple("Rating", ["user", "product", "rating"])):
(1, 2, 5.0)
"""
- def __reduce__(self):
+ user: int
+ product: int
+ rating: float
+
+ def __reduce__(self) -> Tuple[Type["Rating"], Tuple[int, int, float]]:
return Rating, (int(self.user), int(self.product), float(self.rating))
@inherit_doc
-class MatrixFactorizationModel(JavaModelWrapper, JavaSaveable, JavaLoader):
+class MatrixFactorizationModel(
+ JavaModelWrapper, JavaSaveable, JavaLoader["MatrixFactorizationModel"]
+):
"""A matrix factorisation model trained by regularized alternating
least-squares.
@@ -135,14 +141,14 @@ class MatrixFactorizationModel(JavaModelWrapper, JavaSaveable, JavaLoader):
"""
@since("0.9.0")
- def predict(self, user, product):
+ def predict(self, user: int, product: int) -> float:
"""
Predicts rating for the given user and product.
"""
return self._java_model.predict(int(user), int(product))
@since("0.9.0")
- def predictAll(self, user_product):
+ def predictAll(self, user_product: RDD[Tuple[int, int]]) -> RDD[Rating]:
"""
Returns a list of predicted ratings for input user and product
pairs.
@@ -154,7 +160,7 @@ class MatrixFactorizationModel(JavaModelWrapper, JavaSaveable, JavaLoader):
return self.call("predict", user_product)
@since("1.2.0")
- def userFeatures(self):
+ def userFeatures(self) -> RDD[Tuple[int, array.array]]:
"""
Returns a paired RDD, where the first element is the user and the
second is an array of features corresponding to that user.
@@ -162,7 +168,7 @@ class MatrixFactorizationModel(JavaModelWrapper, JavaSaveable, JavaLoader):
return self.call("getUserFeatures").mapValues(lambda v: array.array("d", v))
@since("1.2.0")
- def productFeatures(self):
+ def productFeatures(self) -> RDD[Tuple[int, array.array]]:
"""
Returns a paired RDD, where the first element is the product and the
second is an array of features corresponding to that product.
@@ -170,7 +176,7 @@ class MatrixFactorizationModel(JavaModelWrapper, JavaSaveable, JavaLoader):
return self.call("getProductFeatures").mapValues(lambda v: array.array("d", v))
@since("1.4.0")
- def recommendUsers(self, product, num):
+ def recommendUsers(self, product: int, num: int) -> List[Rating]:
"""
Recommends the top "num" number of users for a given product and
returns a list of Rating objects sorted by the predicted rating in
@@ -179,7 +185,7 @@ class MatrixFactorizationModel(JavaModelWrapper, JavaSaveable, JavaLoader):
return list(self.call("recommendUsers", product, num))
@since("1.4.0")
- def recommendProducts(self, user, num):
+ def recommendProducts(self, user: int, num: int) -> List[Rating]:
"""
Recommends the top "num" number of products for a given user and
returns a list of Rating objects sorted by the predicted rating in
@@ -187,14 +193,14 @@ class MatrixFactorizationModel(JavaModelWrapper, JavaSaveable, JavaLoader):
"""
return list(self.call("recommendProducts", user, num))
- def recommendProductsForUsers(self, num):
+ def recommendProductsForUsers(self, num: int) -> RDD[Tuple[int, Tuple[Rating, ...]]]:
"""
Recommends the top "num" number of products for all users. The
number of recommendations returned per user may be less than "num".
"""
return self.call("wrappedRecommendProductsForUsers", num)
- def recommendUsersForProducts(self, num):
+ def recommendUsersForProducts(self, num: int) -> RDD[Tuple[int, Tuple[Rating, ...]]]:
"""
Recommends the top "num" number of users for all products. The
number of recommendations returned per product may be less than
@@ -202,17 +208,18 @@ class MatrixFactorizationModel(JavaModelWrapper, JavaSaveable, JavaLoader):
"""
return self.call("wrappedRecommendUsersForProducts", num)
- @property
+ @property # type: ignore[misc]
@since("1.4.0")
- def rank(self):
+ def rank(self) -> int:
"""Rank for the features in this model"""
return self.call("rank")
@classmethod
@since("1.3.1")
- def load(cls, sc, path):
+ def load(cls, sc: SparkContext, path: str) -> "MatrixFactorizationModel":
"""Load a model from the given path"""
model = cls._load_java(sc, path)
+ assert sc._jvm is not None
wrapper = sc._jvm.org.apache.spark.mllib.api.python.MatrixFactorizationModelWrapper(model)
return MatrixFactorizationModel(wrapper)
@@ -224,7 +231,7 @@ class ALS:
"""
@classmethod
- def _prepare(cls, ratings):
+ def _prepare(cls, ratings: Any) -> RDD[Rating]:
if isinstance(ratings, RDD):
pass
elif isinstance(ratings, DataFrame):
@@ -245,8 +252,15 @@ class ALS:
@classmethod
def train(
- cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, nonnegative=False, seed=None
- ):
+ cls,
+ ratings: Union[RDD[Rating], RDD[Tuple[int, int, float]]],
+ rank: int,
+ iterations: int = 5,
+ lambda_: float = 0.01,
+ blocks: int = -1,
+ nonnegative: bool = False,
+ seed: Optional[int] = None,
+ ) -> MatrixFactorizationModel:
"""
Train a matrix factorization model given an RDD of ratings by users
for a subset of products. The ratings matrix is approximated as the
@@ -296,15 +310,15 @@ class ALS:
@classmethod
def trainImplicit(
cls,
- ratings,
- rank,
- iterations=5,
- lambda_=0.01,
- blocks=-1,
- alpha=0.01,
- nonnegative=False,
- seed=None,
- ):
+ ratings: Union[RDD[Rating], RDD[Tuple[int, int, float]]],
+ rank: int,
+ iterations: int = 5,
+ lambda_: float = 0.01,
+ blocks: int = -1,
+ alpha: float = 0.01,
+ nonnegative: bool = False,
+ seed: Optional[int] = None,
+ ) -> MatrixFactorizationModel:
"""
Train a matrix factorization model given an RDD of 'implicit
preferences' of users for a subset of products. The ratings matrix
@@ -356,7 +370,7 @@ class ALS:
return MatrixFactorizationModel(model)
-def _test():
+def _test() -> None:
import doctest
import pyspark.mllib.recommendation
from pyspark.sql import SQLContext
diff --git a/python/pyspark/mllib/recommendation.pyi b/python/pyspark/mllib/recommendation.pyi
deleted file mode 100644
index 43d2872..0000000
--- a/python/pyspark/mllib/recommendation.pyi
+++ /dev/null
@@ -1,71 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from typing import List, Optional, Tuple, Type, Union
-
-import array
-from collections import namedtuple
-
-from pyspark.context import SparkContext
-from pyspark.rdd import RDD
-from pyspark.mllib.common import JavaModelWrapper
-from pyspark.mllib.util import JavaLoader, JavaSaveable
-
-class Rating(namedtuple("Rating", ["user", "product", "rating"])):
- def __reduce__(self) -> Tuple[Type[Rating], Tuple[int, int, float]]: ...
-
-class MatrixFactorizationModel(
- JavaModelWrapper, JavaSaveable, JavaLoader[MatrixFactorizationModel]
-):
- def predict(self, user: int, product: int) -> float: ...
- def predictAll(self, user_product: RDD[Tuple[int, int]]) -> RDD[Rating]: ...
- def userFeatures(self) -> RDD[Tuple[int, array.array]]: ...
- def productFeatures(self) -> RDD[Tuple[int, array.array]]: ...
- def recommendUsers(self, product: int, num: int) -> List[Rating]: ...
- def recommendProducts(self, user: int, num: int) -> List[Rating]: ...
- def recommendProductsForUsers(self, num: int) -> RDD[Tuple[int, Tuple[Rating, ...]]]: ...
- def recommendUsersForProducts(self, num: int) -> RDD[Tuple[int, Tuple[Rating, ...]]]: ...
- @property
- def rank(self) -> int: ...
- @classmethod
- def load(cls, sc: SparkContext, path: str) -> MatrixFactorizationModel: ...
-
-class ALS:
- @classmethod
- def train(
- cls,
- ratings: Union[RDD[Rating], RDD[Tuple[int, int, float]]],
- rank: int,
- iterations: int = ...,
- lambda_: float = ...,
- blocks: int = ...,
- nonnegative: bool = ...,
- seed: Optional[int] = ...,
- ) -> MatrixFactorizationModel: ...
- @classmethod
- def trainImplicit(
- cls,
- ratings: Union[RDD[Rating], RDD[Tuple[int, int, float]]],
- rank: int,
- iterations: int = ...,
- lambda_: float = ...,
- blocks: int = ...,
- alpha: float = ...,
- nonnegative: bool = ...,
- seed: Optional[int] = ...,
- ) -> MatrixFactorizationModel: ...
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org