You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by me...@apache.org on 2015/04/23 02:22:30 UTC
spark git commit: [SPARK-6827] [MLLIB] Wrap
FPGrowthModel.freqItemsets and make it consistent with Java API
Repository: spark
Updated Branches:
refs/heads/master baf865ddc -> f4f39981f
[SPARK-6827] [MLLIB] Wrap FPGrowthModel.freqItemsets and make it consistent with Java API
Make PySpark ```FPGrowthModel.freqItemsets``` consistent with Java/Scala API like ```MatrixFactorizationModel.userFeatures```
It returns an RDD in which each tuple is composed of an array and a long value.
I think it's difficult to implement namedtuples to wrap the output because the items of freqItemsets can be of any type with arbitrary length, which makes it tedious to implement the corresponding SerDe function.
Author: Yanbo Liang <yb...@gmail.com>
Closes #5614 from yanboliang/spark-6827 and squashes the following commits:
da8c404 [Yanbo Liang] use namedtuple
5532e78 [Yanbo Liang] Wrap FPGrowthModel.freqItemsets and make it consistent with Java API
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f4f39981
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f4f39981
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f4f39981
Branch: refs/heads/master
Commit: f4f39981f4f5e88c30eec7d0b107e2c3cdc268c9
Parents: baf865d
Author: Yanbo Liang <yb...@gmail.com>
Authored: Wed Apr 22 17:22:26 2015 -0700
Committer: Xiangrui Meng <me...@databricks.com>
Committed: Wed Apr 22 17:22:26 2015 -0700
----------------------------------------------------------------------
python/pyspark/mllib/fpm.py | 15 ++++++++++++---
1 file changed, 12 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/f4f39981/python/pyspark/mllib/fpm.py
----------------------------------------------------------------------
diff --git a/python/pyspark/mllib/fpm.py b/python/pyspark/mllib/fpm.py
index 628ccc0..d8df02b 100644
--- a/python/pyspark/mllib/fpm.py
+++ b/python/pyspark/mllib/fpm.py
@@ -15,6 +15,10 @@
# limitations under the License.
#
+import numpy
+from numpy import array
+from collections import namedtuple
+
from pyspark import SparkContext
from pyspark.rdd import ignore_unicode_prefix
from pyspark.mllib.common import JavaModelWrapper, callMLlibFunc, inherit_doc
@@ -36,14 +40,14 @@ class FPGrowthModel(JavaModelWrapper):
>>> rdd = sc.parallelize(data, 2)
>>> model = FPGrowth.train(rdd, 0.6, 2)
>>> sorted(model.freqItemsets().collect())
- [([u'a'], 4), ([u'c'], 3), ([u'c', u'a'], 3)]
+ [FreqItemset(items=[u'a'], freq=4), FreqItemset(items=[u'c'], freq=3), ...
"""
def freqItemsets(self):
"""
- Get the frequent itemsets of this model
+ Returns the frequent itemsets of this model.
"""
- return self.call("getFreqItemsets")
+ return self.call("getFreqItemsets").map(lambda x: (FPGrowth.FreqItemset(x[0], x[1])))
class FPGrowth(object):
@@ -67,6 +71,11 @@ class FPGrowth(object):
model = callMLlibFunc("trainFPGrowthModel", data, float(minSupport), int(numPartitions))
return FPGrowthModel(model)
+ class FreqItemset(namedtuple("FreqItemset", ["items", "freq"])):
+ """
+ Represents an (items, freq) tuple.
+ """
+
def _test():
import doctest
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org