You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by do...@apache.org on 2020/04/12 20:15:18 UTC
[spark] branch master updated: [SPARK-31416][SQL] Check more
strictly that a field name can be used as a valid Java identifier for
codegen
This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 6cd0bef [SPARK-31416][SQL] Check more strictly that a field name can be used as a valid Java identifier for codegen
6cd0bef is described below
commit 6cd0bef7fe872b5d9472a2140db188fb9f1eede8
Author: Kousuke Saruta <sa...@oss.nttdata.com>
AuthorDate: Sun Apr 12 13:14:41 2020 -0700
[SPARK-31416][SQL] Check more strictly that a field name can be used as a valid Java identifier for codegen
### What changes were proposed in this pull request?
Check more strictly that a field name can be used as a valid Java identifier in `ScalaReflection.serializerFor`
To check that, `SourceVersion` is used so that we do not need to manually add reserved keywords to the check for future Java versions (e.g., underscore, var, yield).
### Why are the changes needed?
In the current implementation, `enum` is not checked even though it's a reserved keyword.
Also, there are lots of invalid characters and character sequences, including numeric literals, that are not checked.
So we can't get a better error message with the following code.
```
case class Data(`0`: Int)
Seq(Data(1)).toDF.show
20/04/11 03:24:24 ERROR CodeGenerator: failed to compile: org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 43, Column 1: Expression "value_0 = value_3" is not a type
org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 43, Column 1: Expression "value_0 = value_3" is not a type
...
```
### Does this PR introduce any user-facing change?
Yes. With this change and the code example above, we can get the following error message.
```
java.lang.UnsupportedOperationException: `0` is not a valid identifier of Java and cannot be used as field name
- root class: "Data"
...
```
### How was this patch tested?
Add another assertion to existing test case.
Closes #28184 from sarutak/improve-identifier-check.
Authored-by: Kousuke Saruta <sa...@oss.nttdata.com>
Signed-off-by: Dongjoon Hyun <do...@apache.org>
---
.../apache/spark/sql/catalyst/ScalaReflection.scala | 20 ++++++++++----------
.../sql/catalyst/expressions/objects/objects.scala | 3 +--
.../scala/org/apache/spark/sql/DatasetSuite.scala | 10 ----------
.../spark/sql/ScalaReflectionRelationSuite.scala | 21 +++++++++++++++++++++
4 files changed, 32 insertions(+), 22 deletions(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala
index 3694832..9c8da32 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala
@@ -17,6 +17,8 @@
package org.apache.spark.sql.catalyst
+import javax.lang.model.SourceVersion
+
import org.apache.commons.lang3.reflect.ConstructorUtils
import org.apache.spark.internal.Logging
@@ -539,9 +541,10 @@ object ScalaReflection extends ScalaReflection {
val params = getConstructorParameters(t)
val fields = params.map { case (fieldName, fieldType) =>
- if (javaKeywords.contains(fieldName)) {
- throw new UnsupportedOperationException(s"`$fieldName` is a reserved keyword and " +
- "cannot be used as field name\n" + walkedTypePath)
+ if (SourceVersion.isKeyword(fieldName) ||
+ !SourceVersion.isIdentifier(encodeFieldNameToIdentifier(fieldName))) {
+ throw new UnsupportedOperationException(s"`$fieldName` is not a valid identifier of " +
+ "Java and cannot be used as field name\n" + walkedTypePath)
}
// SPARK-26730 inputObject won't be null with If's guard below. And KnownNotNull
@@ -784,13 +787,6 @@ object ScalaReflection extends ScalaReflection {
}
}
- private val javaKeywords = Set("abstract", "assert", "boolean", "break", "byte", "case", "catch",
- "char", "class", "const", "continue", "default", "do", "double", "else", "extends", "false",
- "final", "finally", "float", "for", "goto", "if", "implements", "import", "instanceof", "int",
- "interface", "long", "native", "new", "null", "package", "private", "protected", "public",
- "return", "short", "static", "strictfp", "super", "switch", "synchronized", "this", "throw",
- "throws", "transient", "true", "try", "void", "volatile", "while")
-
val typeJavaMapping = Map[DataType, Class[_]](
BooleanType -> classOf[Boolean],
ByteType -> classOf[Byte],
@@ -849,6 +845,10 @@ object ScalaReflection extends ScalaReflection {
Seq.empty
}
}
+
+ def encodeFieldNameToIdentifier(fieldName: String): String = {
+ TermName(fieldName).encodedName.toString
+ }
}
/**
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala
index 54abd09..d5de95c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala
@@ -28,7 +28,6 @@ import org.apache.spark.{SparkConf, SparkEnv}
import org.apache.spark.serializer._
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, ScalaReflection}
-import org.apache.spark.sql.catalyst.ScalaReflection.universe.TermName
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.codegen._
@@ -311,7 +310,7 @@ case class Invoke(
override def nullable: Boolean = targetObject.nullable || needNullCheck || returnNullable
override def children: Seq[Expression] = targetObject +: arguments
- private lazy val encodedFunctionName = TermName(functionName).encodedName.toString
+ private lazy val encodedFunctionName = ScalaReflection.encodeFieldNameToIdentifier(functionName)
@transient lazy val method = targetObject.dataType match {
case ObjectType(cls) =>
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
index b4ed4ec..af65957 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
@@ -1222,14 +1222,6 @@ class DatasetSuite extends QueryTest
assert(result == Set(ClassData("a", 1) -> null, ClassData("b", 2) -> ClassData("x", 2)))
}
- test("better error message when use java reserved keyword as field name") {
- val e = intercept[UnsupportedOperationException] {
- Seq(InvalidInJava(1)).toDS()
- }
- assert(e.getMessage.contains(
- "`abstract` is a reserved keyword and cannot be used as field name"))
- }
-
test("Dataset should support flat input object to be null") {
checkDataset(Seq("a", null).toDS(), "a", null)
}
@@ -1964,8 +1956,6 @@ case class ClassNullableData(a: String, b: Integer)
case class NestedStruct(f: ClassData)
case class DeepNestedStruct(f: NestedStruct)
-case class InvalidInJava(`abstract`: Int)
-
/**
* A class used to test serialization using encoders. This class throws exceptions when using
* Java serialization -- so the only way it can be "serialized" is through our encoders.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala
index 7e305e0..d51a461 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala
@@ -74,9 +74,14 @@ case class ComplexReflectData(
mapFieldContainsNull: Map[Int, Option[Long]],
dataField: Data)
+case class InvalidInJava(`abstract`: Int)
+
class ScalaReflectionRelationSuite extends SparkFunSuite with SharedSparkSession {
import testImplicits._
+ // To avoid syntax error thrown by genjavadoc, make this case class non-top level and private.
+ private case class InvalidInJava2(`0`: Int)
+
test("query case class RDD") {
val data = ReflectData("a", 1, 1L, 1.toFloat, 1.toDouble, 1.toShort, 1.toByte, true,
new java.math.BigDecimal(1), Date.valueOf("1970-01-01"), new Timestamp(12345), Seq(1, 2, 3),
@@ -142,4 +147,20 @@ class ScalaReflectionRelationSuite extends SparkFunSuite with SharedSparkSession
Map(10 -> 100L, 20 -> 200L, 30 -> null),
Row(null, "abc"))))
}
+
+ test("better error message when use java reserved keyword as field name") {
+ val e = intercept[UnsupportedOperationException] {
+ Seq(InvalidInJava(1)).toDS()
+ }
+ assert(e.getMessage.contains(
+ "`abstract` is not a valid identifier of Java and cannot be used as field name"))
+ }
+
+ test("better error message when use invalid java identifier as field name") {
+ val e1 = intercept[UnsupportedOperationException] {
+ Seq(InvalidInJava2(1)).toDS()
+ }
+ assert(e1.getMessage.contains(
+ "`0` is not a valid identifier of Java and cannot be used as field name"))
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org