Posted to commits@spark.apache.org by rx...@apache.org on 2015/06/29 23:15:21 UTC

spark git commit: [SPARK-8056][SQL] Design an easier way to construct schema for both Scala and Python

Repository: spark
Updated Branches:
  refs/heads/master 27ef85451 -> f6fc254ec


[SPARK-8056][SQL] Design an easier way to construct schema for both Scala and Python

I've added functionality to construct a new StructType by chaining add() calls, similar to how we add parameters to a new SparkContext.

I've also added tests for this style of construction.
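
For reference, a minimal sketch of the new Scala usage this patch enables (illustrative only; assumes a Spark build containing this change, with imports from org.apache.spark.sql.types):

    import org.apache.spark.sql.types._

    // Build a schema incrementally instead of passing a pre-built Array[StructField].
    val schema = new StructType()
      .add("id", LongType, nullable = false)           // name + DataType + nullability
      .add(StructField("name", StringType, true))      // a ready-made StructField
      .add("score", "double")                          // data type given as a string

    // Equivalent to:
    // StructType(Array(
    //   StructField("id", LongType, false),
    //   StructField("name", StringType, true),
    //   StructField("score", DoubleType, true)))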

Author: Ilya Ganelin <il...@capitalone.com>

Closes #6686 from ilganeli/SPARK-8056B and squashes the following commits:

27c1de1 [Ilya Ganelin] Rename
467d836 [Ilya Ganelin] Removed from_string in favor of _parse_Datatype_json_value
5fef5a4 [Ilya Ganelin] Updates for type parsing
4085489 [Ilya Ganelin] Style errors
3670cf5 [Ilya Ganelin] added string to DataType conversion
8109e00 [Ilya Ganelin] Fixed error in tests
41ab686 [Ilya Ganelin] Fixed style errors
e7ba7e0 [Ilya Ganelin] Moved some python tests to tests.py. Added cleaner handling of null data type and added test for correctness of input format
15868fa [Ilya Ganelin] Fixed python errors
b79b992 [Ilya Ganelin] Merge remote-tracking branch 'upstream/master' into SPARK-8056B
a3369fc [Ilya Ganelin] Fixing space errors
e240040 [Ilya Ganelin] Style
bab7823 [Ilya Ganelin] Constructor error
73d4677 [Ilya Ganelin] Style
4ed00d9 [Ilya Ganelin] Fixed default arg
67df57a [Ilya Ganelin] Removed Foo
04cbf0c [Ilya Ganelin] Added comments for single object
0484d7a [Ilya Ganelin] Restored second method
6aeb740 [Ilya Ganelin] Style
689e54d [Ilya Ganelin] Style
f497e9e [Ilya Ganelin] Got rid of old code
e3c7a88 [Ilya Ganelin] Fixed doctest failure
a62ccde [Ilya Ganelin] Style
966ac06 [Ilya Ganelin] style checks
dabb7e6 [Ilya Ganelin] Added Python tests
a3f4152 [Ilya Ganelin] added python bindings and better comments
e6e536c [Ilya Ganelin] Added extra space
7529a2e [Ilya Ganelin] Fixed formatting
d388f86 [Ilya Ganelin] Fixed small bug
c4e3bf5 [Ilya Ganelin] Reverted to using parse. Updated parse to support long
d7634b6 [Ilya Ganelin] Reverted to fromString to properly support types
22c39d5 [Ilya Ganelin] replaced FromString with DataTypeParser.parse. Replaced empty constructor initializing a null to have it instead create a new array to allow appends to it.
faca398 [Ilya Ganelin] [SPARK-8056] Replaced default argument usage. Updated usage and code for DataType.fromString
1acf76e [Ilya Ganelin] Scala style
e31c674 [Ilya Ganelin] Fixed bug in test
8dc0795 [Ilya Ganelin] Added tests for creation of StructType object with new methods
fdf7e9f [Ilya Ganelin] [SPARK-8056] Created add methods to facilitate building new StructType objects.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f6fc254e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f6fc254e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f6fc254e

Branch: refs/heads/master
Commit: f6fc254ec4ce5f103d45da6d007b4066ce751236
Parents: 27ef854
Author: Ilya Ganelin <il...@capitalone.com>
Authored: Mon Jun 29 14:15:15 2015 -0700
Committer: Reynold Xin <rx...@databricks.com>
Committed: Mon Jun 29 14:15:15 2015 -0700

----------------------------------------------------------------------
 python/pyspark/sql/tests.py                     |  29 ++++++
 python/pyspark/sql/types.py                     |  52 +++++++++-
 .../apache/spark/sql/types/DataTypeParser.scala |   2 +-
 .../org/apache/spark/sql/types/StructType.scala | 104 ++++++++++++++++++-
 .../apache/spark/sql/types/DataTypeSuite.scala  |  31 ++++++
 5 files changed, 212 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/f6fc254e/python/pyspark/sql/tests.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index ffee43a..34f397d 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -516,6 +516,35 @@ class SQLTests(ReusedPySparkTestCase):
         self.assertEqual([Row(a=2, b=1, c=3), Row(a=4, b=1, c=4)],
                          df.filter(df.a.between(df.b, df.c)).collect())
 
+    def test_struct_type(self):
+        from pyspark.sql.types import StructType, StringType, StructField
+        struct1 = StructType().add("f1", StringType(), True).add("f2", StringType(), True, None)
+        struct2 = StructType([StructField("f1", StringType(), True),
+                              StructField("f2", StringType(), True, None)])
+        self.assertEqual(struct1, struct2)
+
+        struct1 = StructType().add("f1", StringType(), True).add("f2", StringType(), True, None)
+        struct2 = StructType([StructField("f1", StringType(), True)])
+        self.assertNotEqual(struct1, struct2)
+
+        struct1 = (StructType().add(StructField("f1", StringType(), True))
+                   .add(StructField("f2", StringType(), True, None)))
+        struct2 = StructType([StructField("f1", StringType(), True),
+                              StructField("f2", StringType(), True, None)])
+        self.assertEqual(struct1, struct2)
+
+        struct1 = (StructType().add(StructField("f1", StringType(), True))
+                   .add(StructField("f2", StringType(), True, None)))
+        struct2 = StructType([StructField("f1", StringType(), True)])
+        self.assertNotEqual(struct1, struct2)
+
+        # Catch exception raised during improper construction
+        try:
+            struct1 = StructType().add("name")
+            self.assertEqual(1, 0)
+        except ValueError:
+            self.assertEqual(1, 1)
+
     def test_save_and_load(self):
         df = self.df
         tmpPath = tempfile.mkdtemp()

http://git-wip-us.apache.org/repos/asf/spark/blob/f6fc254e/python/pyspark/sql/types.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index 932686e..ae9344e 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -355,8 +355,7 @@ class StructType(DataType):
 
     This is the data type representing a :class:`Row`.
     """
-
-    def __init__(self, fields):
+    def __init__(self, fields=None):
         """
         >>> struct1 = StructType([StructField("f1", StringType(), True)])
         >>> struct2 = StructType([StructField("f1", StringType(), True)])
@@ -368,8 +367,53 @@ class StructType(DataType):
         >>> struct1 == struct2
         False
         """
-        assert all(isinstance(f, DataType) for f in fields), "fields should be a list of DataType"
-        self.fields = fields
+        if not fields:
+            self.fields = []
+        else:
+            self.fields = fields
+            assert all(isinstance(f, StructField) for f in fields),\
+                "fields should be a list of StructField"
+
+    def add(self, field, data_type=None, nullable=True, metadata=None):
+        """
+        Construct a StructType by adding new elements to it to define the schema. The method accepts
+        either:
+            a) A single parameter which is a StructField object.
+            b) Between 2 and 4 parameters as (name, data_type, nullable (optional),
+             metadata (optional)). The data_type parameter may be either a String or a DataType object.
+
+        >>> struct1 = StructType().add("f1", StringType(), True).add("f2", StringType(), True, None)
+        >>> struct2 = StructType([StructField("f1", StringType(), True),\
+         StructField("f2", StringType(), True, None)])
+        >>> struct1 == struct2
+        True
+        >>> struct1 = StructType().add(StructField("f1", StringType(), True))
+        >>> struct2 = StructType([StructField("f1", StringType(), True)])
+        >>> struct1 == struct2
+        True
+        >>> struct1 = StructType().add("f1", "string", True)
+        >>> struct2 = StructType([StructField("f1", StringType(), True)])
+        >>> struct1 == struct2
+        True
+
+        :param field: Either the name of the field or a StructField object
+        :param data_type: If present, the DataType of the StructField to create
+        :param nullable: Whether the field to add should be nullable (default True)
+        :param metadata: Any additional metadata (default None)
+        :return: a new updated StructType
+        """
+        if isinstance(field, StructField):
+            self.fields.append(field)
+        else:
+            if isinstance(field, str) and data_type is None:
+                raise ValueError("Must specify DataType if passing name of struct_field to create.")
+
+            if isinstance(data_type, str):
+                data_type_f = _parse_datatype_json_value(data_type)
+            else:
+                data_type_f = data_type
+            self.fields.append(StructField(field, data_type_f, nullable, metadata))
+        return self
 
     def simpleString(self):
         return 'struct<%s>' % (','.join(f.simpleString() for f in self.fields))

http://git-wip-us.apache.org/repos/asf/spark/blob/f6fc254e/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataTypeParser.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataTypeParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataTypeParser.scala
index 04f3379..6b43224 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataTypeParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataTypeParser.scala
@@ -44,7 +44,7 @@ private[sql] trait DataTypeParser extends StandardTokenParsers {
     "(?i)tinyint".r ^^^ ByteType |
     "(?i)smallint".r ^^^ ShortType |
     "(?i)double".r ^^^ DoubleType |
-    "(?i)bigint".r ^^^ LongType |
+    "(?i)(?:bigint|long)".r ^^^ LongType |
     "(?i)binary".r ^^^ BinaryType |
     "(?i)boolean".r ^^^ BooleanType |
     fixedDecimalType |
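
With the parser change above, "long" is now accepted as a synonym for "bigint" when a data type is supplied as a string. A small illustrative sketch (not part of the commit itself):

    import org.apache.spark.sql.types._

    val schema = new StructType()
      .add("a", "long")      // previously only "bigint" mapped to LongType
      .add("b", "bigint")

    // Both fields end up as LongType:
    //   schema("a") == StructField("a", LongType, true)
    //   schema("b") == StructField("b", LongType, true)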

http://git-wip-us.apache.org/repos/asf/spark/blob/f6fc254e/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
index 193c08a..2db0a35 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
@@ -94,7 +94,7 @@ import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Attribute}
 case class StructType(fields: Array[StructField]) extends DataType with Seq[StructField] {
 
   /** No-arg constructor for kryo. */
-  protected def this() = this(null)
+  def this() = this(Array.empty[StructField])
 
   /** Returns all field names in an array. */
   def fieldNames: Array[String] = fields.map(_.name)
@@ -104,6 +104,108 @@ case class StructType(fields: Array[StructField]) extends DataType with Seq[Stru
   private lazy val nameToIndex: Map[String, Int] = fieldNames.zipWithIndex.toMap
 
   /**
+   * Creates a new [[StructType]] by adding a new field.
+   * {{{
+   * val struct = (new StructType)
+   *   .add(StructField("a", IntegerType, true))
+   *   .add(StructField("b", LongType, false))
+   *   .add(StructField("c", StringType, true))
+   *}}}
+   */
+  def add(field: StructField): StructType = {
+    StructType(fields :+ field)
+  }
+
+  /**
+   * Creates a new [[StructType]] by adding a new nullable field with no metadata.
+   *
+   * val struct = (new StructType)
+   *   .add("a", IntegerType)
+   *   .add("b", LongType)
+   *   .add("c", StringType)
+   */
+  def add(name: String, dataType: DataType): StructType = {
+    StructType(fields :+ new StructField(name, dataType, nullable = true, Metadata.empty))
+  }
+
+  /**
+   * Creates a new [[StructType]] by adding a new field with no metadata.
+   *
+   * val struct = (new StructType)
+   *   .add("a", IntegerType, true)
+   *   .add("b", LongType, false)
+   *   .add("c", StringType, true)
+   */
+  def add(name: String, dataType: DataType, nullable: Boolean): StructType = {
+    StructType(fields :+ new StructField(name, dataType, nullable, Metadata.empty))
+  }
+
+  /**
+   * Creates a new [[StructType]] by adding a new field and specifying metadata.
+   * {{{
+   * val struct = (new StructType)
+   *   .add("a", IntegerType, true, Metadata.empty)
+   *   .add("b", LongType, false, Metadata.empty)
+   *   .add("c", StringType, true, Metadata.empty)
+   * }}}
+   */
+  def add(
+      name: String,
+      dataType: DataType,
+      nullable: Boolean,
+      metadata: Metadata): StructType = {
+    StructType(fields :+ new StructField(name, dataType, nullable, metadata))
+  }
+
+  /**
+   * Creates a new [[StructType]] by adding a new nullable field with no metadata where the
+   * dataType is specified as a String.
+   *
+   * {{{
+   * val struct = (new StructType)
+   *   .add("a", "int")
+   *   .add("b", "long")
+   *   .add("c", "string")
+   * }}}
+   */
+  def add(name: String, dataType: String): StructType = {
+    add(name, DataTypeParser.parse(dataType), nullable = true, Metadata.empty)
+  }
+
+  /**
+   * Creates a new [[StructType]] by adding a new field with no metadata where the
+   * dataType is specified as a String.
+   *
+   * {{{
+   * val struct = (new StructType)
+   *   .add("a", "int", true)
+   *   .add("b", "long", false)
+   *   .add("c", "string", true)
+   * }}}
+   */
+  def add(name: String, dataType: String, nullable: Boolean): StructType = {
+    add(name, DataTypeParser.parse(dataType), nullable, Metadata.empty)
+  }
+
+  /**
+   * Creates a new [[StructType]] by adding a new field and specifying metadata where the
+   * dataType is specified as a String.
+   * {{{
+   * val struct = (new StructType)
+   *   .add("a", "int", true, Metadata.empty)
+   *   .add("b", "long", false, Metadata.empty)
+   *   .add("c", "string", true, Metadata.empty)
+   * }}}
+   */
+  def add(
+      name: String,
+      dataType: String,
+      nullable: Boolean,
+      metadata: Metadata): StructType = {
+    add(name, DataTypeParser.parse(dataType), nullable, metadata)
+  }
+
+  /**
    * Extracts a [[StructField]] of the given name. If the [[StructType]] object does not
    * have a name matching the given name, `null` will be returned.
    */
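
Note that each Scala add overload returns a new StructType (fields :+ field) and leaves the receiver untouched, whereas the Python add appends to self.fields and returns self. A short sketch of the Scala behaviour (illustrative only):

    import org.apache.spark.sql.types._

    val base = new StructType().add("a", IntegerType)
    val extended = base.add("b", LongType)

    // base.length == 1 and extended.length == 2: add never mutates the original schema.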

http://git-wip-us.apache.org/repos/asf/spark/blob/f6fc254e/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
index 077c0ad..14e7b4a 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala
@@ -33,6 +33,37 @@ class DataTypeSuite extends SparkFunSuite {
     assert(MapType(StringType, IntegerType, true) === map)
   }
 
+  test("construct with add") {
+    val struct = (new StructType)
+      .add("a", IntegerType, true)
+      .add("b", LongType, false)
+      .add("c", StringType, true)
+
+    assert(StructField("b", LongType, false) === struct("b"))
+  }
+
+  test("construct with add from StructField") {
+    // Test creation from StructField type
+    val struct = (new StructType)
+      .add(StructField("a", IntegerType, true))
+      .add(StructField("b", LongType, false))
+      .add(StructField("c", StringType, true))
+
+    assert(StructField("b", LongType, false) === struct("b"))
+  }
+
+  test("construct with String DataType") {
+    // Test creation with DataType as String
+    val struct = (new StructType)
+      .add("a", "int", true)
+      .add("b", "long", false)
+      .add("c", "string", true)
+
+    assert(StructField("a", IntegerType, true) === struct("a"))
+    assert(StructField("b", LongType, false) === struct("b"))
+    assert(StructField("c", StringType, true) === struct("c"))
+  }
+
   test("extract fields from a StructType") {
     val struct = StructType(
       StructField("a", IntegerType, true) ::

