You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by jk...@apache.org on 2015/06/30 03:40:37 UTC

spark git commit: [SPARK-8456] [ML] Ngram featurizer python

Repository: spark
Updated Branches:
  refs/heads/master 4c1808be4 -> 620605a4a


[SPARK-8456] [ML] Ngram featurizer python

Python API for N-gram feature transformer

Author: Feynman Liang <fl...@databricks.com>

Closes #6960 from feynmanliang/ngram-featurizer-python and squashes the following commits:

f9e37c9 [Feynman Liang] Remove debugging code
4dd81f4 [Feynman Liang] Fix typo and doctest
06c79ac [Feynman Liang] Style guide
26c1175 [Feynman Liang] Add python NGram API


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/620605a4
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/620605a4
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/620605a4

Branch: refs/heads/master
Commit: 620605a4a1123afaab2674e38251f1231dea17ce
Parents: 4c1808b
Author: Feynman Liang <fl...@databricks.com>
Authored: Mon Jun 29 18:40:30 2015 -0700
Committer: Joseph K. Bradley <jo...@databricks.com>
Committed: Mon Jun 29 18:40:30 2015 -0700

----------------------------------------------------------------------
 python/pyspark/ml/feature.py | 71 ++++++++++++++++++++++++++++++++++++++-
 python/pyspark/ml/tests.py   | 11 ++++++
 2 files changed, 81 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/620605a4/python/pyspark/ml/feature.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index ddb33f4..8804dac 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -21,7 +21,7 @@ from pyspark.ml.util import keyword_only
 from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaTransformer
 from pyspark.mllib.common import inherit_doc
 
-__all__ = ['Binarizer', 'HashingTF', 'IDF', 'IDFModel', 'Normalizer', 'OneHotEncoder',
+__all__ = ['Binarizer', 'HashingTF', 'IDF', 'IDFModel', 'NGram', 'Normalizer', 'OneHotEncoder',
            'PolynomialExpansion', 'RegexTokenizer', 'StandardScaler', 'StandardScalerModel',
            'StringIndexer', 'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer',
            'Word2Vec', 'Word2VecModel']
@@ -266,6 +266,75 @@ class IDFModel(JavaModel):
 
 
 @inherit_doc
+@ignore_unicode_prefix
+class NGram(JavaTransformer, HasInputCol, HasOutputCol):
+    """
+    A feature transformer that converts the input array of strings into an array of n-grams. Null
+    values in the input array are ignored.
+    It returns an array of n-grams where each n-gram is represented by a space-separated string of
+    words.
+    When the input is empty, an empty array is returned.
+    When the input array length is less than n (number of elements per n-gram), no n-grams are
+    returned.
+
+    >>> df = sqlContext.createDataFrame([Row(inputTokens=["a", "b", "c", "d", "e"])])
+    >>> ngram = NGram(n=2, inputCol="inputTokens", outputCol="nGrams")
+    >>> ngram.transform(df).head()
+    Row(inputTokens=[u'a', u'b', u'c', u'd', u'e'], nGrams=[u'a b', u'b c', u'c d', u'd e'])
+    >>> # Change n-gram length
+    >>> ngram.setParams(n=4).transform(df).head()
+    Row(inputTokens=[u'a', u'b', u'c', u'd', u'e'], nGrams=[u'a b c d', u'b c d e'])
+    >>> # Temporarily modify output column.
+    >>> ngram.transform(df, {ngram.outputCol: "output"}).head()
+    Row(inputTokens=[u'a', u'b', u'c', u'd', u'e'], output=[u'a b c d', u'b c d e'])
+    >>> ngram.transform(df).head()
+    Row(inputTokens=[u'a', u'b', u'c', u'd', u'e'], nGrams=[u'a b c d', u'b c d e'])
+    >>> # Must use keyword arguments to specify params.
+    >>> ngram.setParams("text")
+    Traceback (most recent call last):
+        ...
+    TypeError: Method setParams forces keyword arguments.
+    """
+
+    # a placeholder to make it appear in the generated doc
+    n = Param(Params._dummy(), "n", "number of elements per n-gram (>=1)")
+
+    @keyword_only
+    def __init__(self, n=2, inputCol=None, outputCol=None):
+        """
+        __init__(self, n=2, inputCol=None, outputCol=None)
+        """
+        super(NGram, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.NGram", self.uid)
+        self.n = Param(self, "n", "number of elements per n-gram (>=1)")
+        self._setDefault(n=2)
+        kwargs = self.__init__._input_kwargs
+        self.setParams(**kwargs)
+
+    @keyword_only
+    def setParams(self, n=2, inputCol=None, outputCol=None):
+        """
+        setParams(self, n=2, inputCol=None, outputCol=None)
+        Sets params for this NGram.
+        """
+        kwargs = self.setParams._input_kwargs
+        return self._set(**kwargs)
+
+    def setN(self, value):
+        """
+        Sets the value of :py:attr:`n`.
+        """
+        self._paramMap[self.n] = value
+        return self
+
+    def getN(self):
+        """
+        Gets the value of n or its default value.
+        """
+        return self.getOrDefault(self.n)
+
+
+@inherit_doc
 class Normalizer(JavaTransformer, HasInputCol, HasOutputCol):
     """
      Normalize a vector to have unit norm using the given p-norm.

http://git-wip-us.apache.org/repos/asf/spark/blob/620605a4/python/pyspark/ml/tests.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index 6adbf16..c151d21 100644
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -252,6 +252,17 @@ class FeatureTests(PySparkTestCase):
         output = idf0m.transform(dataset)
         self.assertIsNotNone(output.head().idf)
 
+    def test_ngram(self):
+        sqlContext = SQLContext(self.sc)
+        dataset = sqlContext.createDataFrame([
+            ([["a", "b", "c", "d", "e"]])], ["input"])
+        ngram0 = NGram(n=4, inputCol="input", outputCol="output")
+        self.assertEqual(ngram0.getN(), 4)
+        self.assertEqual(ngram0.getInputCol(), "input")
+        self.assertEqual(ngram0.getOutputCol(), "output")
+        transformedDF = ngram0.transform(dataset)
+        self.assertEquals(transformedDF.head().output, ["a b c d", "b c d e"])
+
 
 if __name__ == "__main__":
     unittest.main()


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org