You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@spark.apache.org by "Vincenzo Cerminara (Jira)" <ji...@apache.org> on 2021/05/05 18:54:00 UTC

[jira] [Updated] (SPARK-35320) from_json cannot parse maps with timestamp as key

     [ https://issues.apache.org/jira/browse/SPARK-35320?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Vincenzo Cerminara updated SPARK-35320:
---------------------------------------
    Description: 
I have a json that contains a {{map<timestamp,string>}} like the following
{code:json}
{
  "map": {
    "2021-05-05T20:05:08": "sampleValue"
  }
}
{code}
The key of the map is a string containing a formatted timestamp and I want to parse it as a Java {{Map<Instant,String>}} using the {{from_json}} Spark SQL function (see the {{Sample}} class in the code below).
{code:java}
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

import java.io.Serializable;
import java.time.Instant;
import java.util.List;
import java.util.Map;

import static org.apache.spark.sql.functions.*;

/**
 * Minimal reproduction for SPARK-35320: {@code from_json} applied to a JSON
 * object whose map keys are formatted timestamp strings.
 *
 * <p>{@link #main(String[])} runs two scenarios against a local Spark session:
 * one where the timestamp is the map value and one where it is the map key.
 */
public class TimestampAsJsonMapKey {

    /** Bean whose {@code map} property is keyed by {@link Instant}; used to derive the schema in {@code notWorkingTest}. */
    public static class Sample implements Serializable {
        private Map<Instant, String> map;
        
        public Map<Instant, String> getMap() {
            return map;
        }
        
        public void setMap(Map<Instant, String> map) {
            this.map = map;
        }
    }

    /** Bean whose {@code map} property has {@link Instant} values; used to derive the schema in {@code workingTest}. */
    public static class InvertedSample implements Serializable {
        private Map<String, Instant> map;
        
        public Map<String, Instant> getMap() {
            return map;
        }
        
        public void setMap(Map<String, Instant> map) {
            this.map = map;
        }
    }

    /**
     * Builds (or reuses) a Spark session with master {@code local[1]} and runs
     * {@code workingTest} followed by {@code notWorkingTest}.
     *
     * @param args unused
     */
    public static void main(String[] args) {

        final SparkSession spark = SparkSession
                .builder()
                .appName("Timestamp As Json Map Key Test")
                .master("local[1]")
                .getOrCreate();

        workingTest(spark);

        notWorkingTest(spark);

    }

    /**
     * Parses a JSON document in which the timestamp string is the map
     * <em>value</em>, using the schema derived from {@link InvertedSample},
     * and prints the result with {@code show(false)}.
     *
     * @param spark session used to create the single-element input dataset
     */
    private static void workingTest(SparkSession spark) {
        //language=JSON
        final String invertedSampleJson = "{ \"map\": { \"sampleValue\": \"2021-05-05T20:05:08\" } }";

        final Dataset<String> samplesDf = spark.createDataset(List.of(invertedSampleJson), Encoders.STRING());

        // The schema passed to from_json comes from the bean encoder, not from a hand-written StructType.
        final Dataset<Row> parsedDf = samplesDf.select(from_json(col("value"), Encoders.bean(InvertedSample.class).schema()));

        parsedDf.show(false);
    }

    /**
     * Parses a JSON document in which the timestamp string is the map
     * <em>key</em>, using the schema derived from {@link Sample}, and prints
     * the result with {@code show(false)}.
     *
     * <p>According to the accompanying report, running this method fails with
     * a {@code ClassCastException} ({@code UTF8String} cannot be cast to
     * {@code java.lang.Long}).
     *
     * @param spark session used to create the single-element input dataset
     */
    private static void notWorkingTest(SparkSession spark) {
        //language=JSON
        final String sampleJson = "{ \"map\": { \"2021-05-05T20:05:08\": \"sampleValue\" } }";

        final Dataset<String> samplesDf = spark.createDataset(List.of(sampleJson), Encoders.STRING());

        // Same call shape as workingTest; only the bean class (and thus the map key/value types) differs.
        final Dataset<Row> parsedDf = samplesDf.select(from_json(col("value"), Encoders.bean(Sample.class).schema()));

        parsedDf.show(false);
    }
}
{code}
When I run the {{notWorkingTest}} method it fails with the following exception:
{noformat}
Exception in thread "main" java.lang.ClassCastException: class org.apache.spark.unsafe.types.UTF8String cannot be cast to class java.lang.Long (org.apache.spark.unsafe.types.UTF8String is in unnamed module of loader 'app'; java.lang.Long is in module java.base of loader 'bootstrap')
	at scala.runtime.BoxesRunTime.unboxToLong(BoxesRunTime.java:107)
	at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$8$adapted(Cast.scala:297)
	at org.apache.spark.sql.catalyst.expressions.CastBase.buildCast(Cast.scala:285)
	at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$7(Cast.scala:297)
	at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$12(Cast.scala:329)
	at org.apache.spark.sql.catalyst.expressions.CastBase.buildCast(Cast.scala:285)
	at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$11(Cast.scala:321)
	at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$14(Cast.scala:359)
	at org.apache.spark.sql.catalyst.expressions.CastBase.buildCast(Cast.scala:285)
	at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$13(Cast.scala:352)
	at org.apache.spark.sql.catalyst.expressions.CastBase.nullSafeEval(Cast.scala:815)
	at org.apache.spark.sql.catalyst.expressions.UnaryExpression.eval(Expression.scala:461)
	at org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:156)
	at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(InterpretedMutableProjection.scala:83)
	at org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation$$anonfun$apply$17.$anonfun$applyOrElse$71(Optimizer.scala:1508)
{noformat}
It seems that if a {{timestamp}} is the key in a map, it must necessarily be of type long and cannot be of type {{string}}.

 
----
 In the {{workingTest}} method, by contrast, I have an inverted map (the timestamp appears as the value in this case, not as the key) and it works correctly.

  was:
I have a json that contains a {{map<timestamp,string>}} like the following
{code:json}
{
  "map": {
    "2021-05-05T20:05:08": "sampleValue"
  }
}
{code}
 The key of the map is a string containing a formatted timestamp and I want to parse it in a Java {{Map<Instant,String>}} using the {{from_json}} Spark SQL function (see the {{Sample}} class in the code below).
{code:java}
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

import java.io.Serializable;
import java.time.Instant;
import java.util.List;
import java.util.Map;

import static org.apache.spark.sql.functions.*;

/**
 * Minimal reproduction for SPARK-35320: {@code from_json} applied to a JSON
 * object whose map keys are formatted timestamp strings.
 *
 * <p>{@link #main(String[])} runs two scenarios against a local Spark session:
 * one where the timestamp is the map value and one where it is the map key.
 */
public class TimestampAsJsonMapKey {

    /** Bean whose {@code map} property is keyed by {@link Instant}; used to derive the schema in {@code notWorkingTest}. */
    public static class Sample implements Serializable {
        private Map<Instant, String> map;
        
        public Map<Instant, String> getMap() {
            return map;
        }
        
        public void setMap(Map<Instant, String> map) {
            this.map = map;
        }
    }

    /** Bean whose {@code map} property has {@link Instant} values; used to derive the schema in {@code workingTest}. */
    public static class InvertedSample implements Serializable {
        private Map<String, Instant> map;
        
        public Map<String, Instant> getMap() {
            return map;
        }
        
        public void setMap(Map<String, Instant> map) {
            this.map = map;
        }
    }

    /**
     * Builds (or reuses) a Spark session with master {@code local[1]} and runs
     * {@code workingTest} followed by {@code notWorkingTest}.
     *
     * @param args unused
     */
    public static void main(String[] args) {

        final SparkSession spark = SparkSession
                .builder()
                .appName("Timestamp As Json Map Key Test")
                .master("local[1]")
                .getOrCreate();

        workingTest(spark);

        notWorkingTest(spark);

    }

    /**
     * Parses a JSON document in which the timestamp string is the map
     * <em>value</em>, using the schema derived from {@link InvertedSample},
     * and prints the result with {@code show(false)}.
     *
     * @param spark session used to create the single-element input dataset
     */
    private static void workingTest(SparkSession spark) {
        //language=JSON
        final String invertedSampleJson = "{ \"map\": { \"sampleValue\": \"2021-05-05T20:05:08\" } }";

        final Dataset<String> samplesDf = spark.createDataset(List.of(invertedSampleJson), Encoders.STRING());

        // The schema passed to from_json comes from the bean encoder, not from a hand-written StructType.
        final Dataset<Row> parsedDf = samplesDf.select(from_json(col("value"), Encoders.bean(InvertedSample.class).schema()));

        parsedDf.show(false);
    }

    /**
     * Parses a JSON document in which the timestamp string is the map
     * <em>key</em>, using the schema derived from {@link Sample}, and prints
     * the result with {@code show(false)}.
     *
     * <p>According to the accompanying report, running this method fails with
     * a {@code ClassCastException} ({@code UTF8String} cannot be cast to
     * {@code java.lang.Long}).
     *
     * @param spark session used to create the single-element input dataset
     */
    private static void notWorkingTest(SparkSession spark) {
        //language=JSON
        final String sampleJson = "{ \"map\": { \"2021-05-05T20:05:08\": \"sampleValue\" } }";

        final Dataset<String> samplesDf = spark.createDataset(List.of(sampleJson), Encoders.STRING());

        // Same call shape as workingTest; only the bean class (and thus the map key/value types) differs.
        final Dataset<Row> parsedDf = samplesDf.select(from_json(col("value"), Encoders.bean(Sample.class).schema()));

        parsedDf.show(false);
    }
}
{code}
 

When I run the {{notWorkingTest}} method it fails with the following exception:
{noformat}
Exception in thread "main" java.lang.ClassCastException: class org.apache.spark.unsafe.types.UTF8String cannot be cast to class java.lang.Long (org.apache.spark.unsafe.types.UTF8String is in unnamed module of loader 'app'; java.lang.Long is in module java.base of loader 'bootstrap')
	at scala.runtime.BoxesRunTime.unboxToLong(BoxesRunTime.java:107)
	at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$8$adapted(Cast.scala:297)
	at org.apache.spark.sql.catalyst.expressions.CastBase.buildCast(Cast.scala:285)
	at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$7(Cast.scala:297)
	at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$12(Cast.scala:329)
	at org.apache.spark.sql.catalyst.expressions.CastBase.buildCast(Cast.scala:285)
	at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$11(Cast.scala:321)
	at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$14(Cast.scala:359)
	at org.apache.spark.sql.catalyst.expressions.CastBase.buildCast(Cast.scala:285)
	at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$13(Cast.scala:352)
	at org.apache.spark.sql.catalyst.expressions.CastBase.nullSafeEval(Cast.scala:815)
	at org.apache.spark.sql.catalyst.expressions.UnaryExpression.eval(Expression.scala:461)
	at org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:156)
	at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(InterpretedMutableProjection.scala:83)
	at org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation$$anonfun$apply$17.$anonfun$applyOrElse$71(Optimizer.scala:1508)
{noformat}
It seems that if a {{timestamp}} is the key in a map, it must necessarily be of type long and cannot be of type {{string}}.
----
 In the {{workingTest}} method, by contrast, I have an inverted map (the timestamp appears as the value in this case, not as the key) and it works correctly.


> from_json cannot parse maps with timestamp as key
> -------------------------------------------------
>
>                 Key: SPARK-35320
>                 URL: https://issues.apache.org/jira/browse/SPARK-35320
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 3.0.1, 3.1.1
>         Environment: * Java 11
>  * Spark 3.0.1/3.1.1
>  * Scala 2.12
>            Reporter: Vincenzo Cerminara
>            Priority: Minor
>
> I have a json that contains a {{map<timestamp,string>}} like the following
> {code:json}
> {
>   "map": {
>     "2021-05-05T20:05:08": "sampleValue"
>   }
> }
> {code}
> The key of the map is a string containing a formatted timestamp and I want to parse it as a Java {{Map<Instant,String>}} using the {{from_json}} Spark SQL function (see the {{Sample}} class in the code below).
> {code:java}
> import org.apache.spark.sql.Dataset;
> import org.apache.spark.sql.Encoders;
> import org.apache.spark.sql.Row;
> import org.apache.spark.sql.SparkSession;
> import java.io.Serializable;
> import java.time.Instant;
> import java.util.List;
> import java.util.Map;
> import static org.apache.spark.sql.functions.*;
> public class TimestampAsJsonMapKey {
>     public static class Sample implements Serializable {
>         private Map<Instant, String> map;
>         
>         public Map<Instant, String> getMap() {
>             return map;
>         }
>         
>         public void setMap(Map<Instant, String> map) {
>             this.map = map;
>         }
>     }
>     public static class InvertedSample implements Serializable {
>         private Map<String, Instant> map;
>         
>         public Map<String, Instant> getMap() {
>             return map;
>         }
>         
>         public void setMap(Map<String, Instant> map) {
>             this.map = map;
>         }
>     }
>     public static void main(String[] args) {
>         final SparkSession spark = SparkSession
>                 .builder()
>                 .appName("Timestamp As Json Map Key Test")
>                 .master("local[1]")
>                 .getOrCreate();
>         workingTest(spark);
>         notWorkingTest(spark);
>     }
>     private static void workingTest(SparkSession spark) {
>         //language=JSON
>         final String invertedSampleJson = "{ \"map\": { \"sampleValue\": \"2021-05-05T20:05:08\" } }";
>         final Dataset<String> samplesDf = spark.createDataset(List.of(invertedSampleJson), Encoders.STRING());
>         final Dataset<Row> parsedDf = samplesDf.select(from_json(col("value"), Encoders.bean(InvertedSample.class).schema()));
>         parsedDf.show(false);
>     }
>     private static void notWorkingTest(SparkSession spark) {
>         //language=JSON
>         final String sampleJson = "{ \"map\": { \"2021-05-05T20:05:08\": \"sampleValue\" } }";
>         final Dataset<String> samplesDf = spark.createDataset(List.of(sampleJson), Encoders.STRING());
>         final Dataset<Row> parsedDf = samplesDf.select(from_json(col("value"), Encoders.bean(Sample.class).schema()));
>         parsedDf.show(false);
>     }
> }
> {code}
> When I run the {{notWorkingTest}} method it fails with the following exception:
> {noformat}
> Exception in thread "main" java.lang.ClassCastException: class org.apache.spark.unsafe.types.UTF8String cannot be cast to class java.lang.Long (org.apache.spark.unsafe.types.UTF8String is in unnamed module of loader 'app'; java.lang.Long is in module java.base of loader 'bootstrap')
> 	at scala.runtime.BoxesRunTime.unboxToLong(BoxesRunTime.java:107)
> 	at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$8$adapted(Cast.scala:297)
> 	at org.apache.spark.sql.catalyst.expressions.CastBase.buildCast(Cast.scala:285)
> 	at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$7(Cast.scala:297)
> 	at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$12(Cast.scala:329)
> 	at org.apache.spark.sql.catalyst.expressions.CastBase.buildCast(Cast.scala:285)
> 	at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$11(Cast.scala:321)
> 	at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$14(Cast.scala:359)
> 	at org.apache.spark.sql.catalyst.expressions.CastBase.buildCast(Cast.scala:285)
> 	at org.apache.spark.sql.catalyst.expressions.CastBase.$anonfun$castToString$13(Cast.scala:352)
> 	at org.apache.spark.sql.catalyst.expressions.CastBase.nullSafeEval(Cast.scala:815)
> 	at org.apache.spark.sql.catalyst.expressions.UnaryExpression.eval(Expression.scala:461)
> 	at org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:156)
> 	at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(InterpretedMutableProjection.scala:83)
> 	at org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation$$anonfun$apply$17.$anonfun$applyOrElse$71(Optimizer.scala:1508)
> {noformat}
> It seems that if a {{timestamp}} is the key in a map, it must necessarily be of type long and cannot be of type {{string}}.
>  
> ----
>  In the {{workingTest}} method, by contrast, I have an inverted map (the timestamp appears as the value in this case, not as the key) and it works correctly.



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org