You are viewing a plain text version of this content. The canonical link for it is here.

Posted to reviews@spark.apache.org by GitBox <gi...@apache.org> on 2020/03/03 20:18:39 UTC

[GitHub] [spark] MaxGekk commented on a change in pull request #22666: [SPARK-25672][SQL] schema_of_csv() - schema inference from an example

MaxGekk commented on a change in pull request #22666: [SPARK-25672][SQL] schema_of_csv() - schema inference from an example
URL: https://github.com/apache/spark/pull/22666#discussion_r387269929
 
 

 ##########
 File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExprUtils.scala
 ##########
 @@ -19,14 +19,39 @@ package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.util.ArrayBasedMapData
-import org.apache.spark.sql.types.{MapType, StringType, StructType}
+import org.apache.spark.sql.types.{DataType, MapType, StringType, StructType}
+import org.apache.spark.unsafe.types.UTF8String
 
 object ExprUtils {
 
-  def evalSchemaExpr(exp: Expression): StructType = exp match {
-    case Literal(s, StringType) => StructType.fromDDL(s.toString)
+  def evalSchemaExpr(exp: Expression): StructType = {
+    // Use `DataType.fromDDL` since the type string can be struct<...>.
+    val dataType = exp match {
+      case Literal(s, StringType) =>
+        DataType.fromDDL(s.toString)
+      case e @ SchemaOfCsv(_: Literal, _) =>
+        val ddlSchema = e.eval(EmptyRow).asInstanceOf[UTF8String]
+        DataType.fromDDL(ddlSchema.toString)
+      case e => throw new AnalysisException(
+        "Schema should be specified in DDL format as a string literal or output of " +
+          s"the schema_of_csv function instead of ${e.sql}")
+    }
+
+    if (!dataType.isInstanceOf[StructType]) {
+      throw new AnalysisException(
+        s"Schema should be struct type but got ${dataType.sql}.")
+    }
+    dataType.asInstanceOf[StructType]
+  }
+
+  def evalTypeExpr(exp: Expression): DataType = exp match {
+    case Literal(s, StringType) => DataType.fromDDL(s.toString)
 
 Review comment:
   For example, a column with CSV string may be a result of string functions. So, you could just invoke the functions with an particular inputs. Currently, we force people to materialize an example and copy-past it to `schema_of_csv()`. That could cause maintainability issues, so, users should keep in sync the example in `schema_of_csv()` with the code which forms CSV column.
   
   I prepared the PR https://github.com/apache/spark/pull/27777 to avoid the restriction which is not necessary from my point of view.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org