You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2017/08/24 10:24:06 UTC
spark git commit: [SPARK-21804][SQL] json_tuple returns null values within repeated columns except the first one

Repository: spark
Updated Branches:
  refs/heads/master 846bc61cf -> 95713eb4f


[SPARK-21804][SQL] json_tuple returns null values within repeated columns except the first one

## What changes were proposed in this pull request?

When json_tuple in extracting values from JSON it returns null values within repeated columns except the first one as below:

``` scala
scala> spark.sql("""SELECT json_tuple('{"a":1, "b":2}', 'a', 'b', 'a')""").show()
+---+---+----+
| c0| c1|  c2|
+---+---+----+
|  1|  2|null|
+---+---+----+
```

I think this should be consistent with Hive's implementation:
```
hive> SELECT json_tuple('{"a": 1, "b": 2}', 'a', 'a');
...
1    1
```

In this PR, we located all the matched indices in `fieldNames` instead of returning the first matched index, i.e., indexOf.

## How was this patch tested?

Added test in JsonExpressionsSuite.

Author: Jen-Ming Chung <je...@gmail.com>

Closes #19017 from jmchung/SPARK-21804.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/95713eb4
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/95713eb4
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/95713eb4

Branch: refs/heads/master
Commit: 95713eb4f22de4e16617a605f74a1d6373ed270b
Parents: 846bc61
Author: Jen-Ming Chung <je...@gmail.com>
Authored: Thu Aug 24 19:24:00 2017 +0900
Committer: hyukjinkwon <gu...@gmail.com>
Committed: Thu Aug 24 19:24:00 2017 +0900

----------------------------------------------------------------------
 .../sql/catalyst/expressions/jsonExpressions.scala      | 12 ++++++++++--
 .../sql/catalyst/expressions/JsonExpressionsSuite.scala | 10 ++++++++++
 2 files changed, 20 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/95713eb4/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
index c375737..ee5da1a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
@@ -436,7 +436,8 @@ case class JsonTuple(children: Seq[Expression])
     while (parser.nextToken() != JsonToken.END_OBJECT) {
       if (parser.getCurrentToken == JsonToken.FIELD_NAME) {
         // check to see if this field is desired in the output
-        val idx = fieldNames.indexOf(parser.getCurrentName)
+        val jsonField = parser.getCurrentName
+        var idx = fieldNames.indexOf(jsonField)
         if (idx >= 0) {
           // it is, copy the child tree to the correct location in the output row
           val output = new ByteArrayOutputStream()
@@ -447,7 +448,14 @@ case class JsonTuple(children: Seq[Expression])
               generator => copyCurrentStructure(generator, parser)
             }
 
-            row(idx) = UTF8String.fromBytes(output.toByteArray)
+            val jsonValue = UTF8String.fromBytes(output.toByteArray)
+
+            // SPARK-21804: json_tuple returns null values within repeated columns
+            // except the first one; so that we need to check the remaining fields.
+            do {
+              row(idx) = jsonValue
+              idx = fieldNames.indexOf(jsonField, idx + 1)
+            } while (idx >= 0)
           }
         }
       }

http://git-wip-us.apache.org/repos/asf/spark/blob/95713eb4/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala
index 1cd2b4f..9991bda 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala
@@ -373,6 +373,16 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
       InternalRow(UTF8String.fromString("1"), null, UTF8String.fromString("2")))
   }
 
+  test("SPARK-21804: json_tuple returns null values within repeated columns except the first one") {
+    checkJsonTuple(
+      JsonTuple(Literal("""{"f1": 1, "f2": 2}""") ::
+        NonFoldableLiteral("f1") ::
+        NonFoldableLiteral("cast(NULL AS STRING)") ::
+        NonFoldableLiteral("f1") ::
+        Nil),
+      InternalRow(UTF8String.fromString("1"), null, UTF8String.fromString("1")))
+  }
+
   val gmtId = Option(DateTimeUtils.TimeZoneGMT.getID)
 
   test("from_json") {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org