You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@spark.apache.org by "Andy Grove (Jira)" <ji...@apache.org> on 2022/01/28 17:32:00 UTC

[jira] [Updated] (SPARK-38060) Inconsistent behavior from JSON option allowNonNumericNumbers

     [ https://issues.apache.org/jira/browse/SPARK-38060?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Andy Grove updated SPARK-38060:
-------------------------------
    Description: 
The behavior of the JSON option allowNonNumericNumbers is not consistent:

1. Some NaN and Infinity values are still parsed when the option is set to false

2. Some values are parsed differently depending on whether they are quoted or not (see results for positive and negative Infinity)
h2. Input data
{code:java}
{ "number": "NaN" }
{ "number": NaN }
{ "number": "+INF" }
{ "number": +INF }
{ "number": "-INF" }
{ "number": -INF }
{ "number": "INF" }
{ "number": INF }
{ "number": Infinity }
{ "number": +Infinity }
{ "number": -Infinity }
{ "number": "Infinity" }
{ "number": "+Infinity" }
{ "number": "-Infinity" }
{code}
h2. Setup
{code:java}
import org.apache.spark.sql.types._

val schema = StructType(Seq(StructField("number", DataTypes.FloatType, false))) {code}
h2. allowNonNumericNumbers = false
{code:java}
val df = spark.read.format("json").schema(schema).option("allowNonNumericNumbers", "false").json("nan_valid.json")

df.show

+---------+
|   number|
+---------+
|      NaN|
|     null|
|     null|
|     null|
|     null|
|     null|
|     null|
|     null|
|     null|
|     null|
|     null|
| Infinity|
|     null|
|-Infinity|
+---------+ {code}
h2. allowNonNumericNumbers = true
{code:java}
val df = spark.read.format("json").schema(schema).option("allowNonNumericNumbers", "true").json("nan_valid.json") 

df.show

+---------+
|   number|
+---------+
|      NaN|
|      NaN|
|     null|
| Infinity|
|     null|
|-Infinity|
|     null|
|     null|
| Infinity|
| Infinity|
|-Infinity|
| Infinity|
|     null|
|-Infinity|
+---------+{code}

  was:
The behavior of the JSON option allowNonNumericNumbers is not consistent and still supports parsing NaN and Infinity values in some cases when the option is set to false.
h2. Input data
{code:java}
{ "number": "NaN" }
{ "number": NaN }
{ "number": "+INF" }
{ "number": +INF }
{ "number": "-INF" }
{ "number": -INF }
{ "number": "INF" }
{ "number": INF }
{ "number": Infinity }
{ "number": +Infinity }
{ "number": -Infinity }
{ "number": "Infinity" }
{ "number": "+Infinity" }
{ "number": "-Infinity" }
{code}
h2. Setup
{code:java}
import org.apache.spark.sql.types._

val schema = StructType(Seq(StructField("number", DataTypes.FloatType, false))) {code}
h2. allowNonNumericNumbers = false
{code:java}
spark.read.format("json").schema(schema).option("allowNonNumericNumbers", "false").json("nan_valid.json")

df.show

+---------+
|   number|
+---------+
|      NaN|
|     null|
|     null|
|     null|
|     null|
|     null|
|     null|
|     null|
|     null|
|     null|
|     null|
| Infinity|
|     null|
|-Infinity|
+---------+ {code}
h2. allowNonNumericNumbers = true
{code:java}
val df = spark.read.format("json").schema(schema).option("allowNonNumericNumbers", "true").json("nan_valid.json") 

df.show

+---------+
|   number|
+---------+
|      NaN|
|      NaN|
|     null|
| Infinity|
|     null|
|-Infinity|
|     null|
|     null|
| Infinity|
| Infinity|
|-Infinity|
| Infinity|
|     null|
|-Infinity|
+---------+{code}


> Inconsistent behavior from JSON option allowNonNumericNumbers
> -------------------------------------------------------------
>
>                 Key: SPARK-38060
>                 URL: https://issues.apache.org/jira/browse/SPARK-38060
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 3.2.0
>         Environment: Running Spark 3.2.0 in local mode on Ubuntu 20.04.3 LTS
>            Reporter: Andy Grove
>            Priority: Minor
>
> The behavior of the JSON option allowNonNumericNumbers is not consistent:
> 1. Some NaN and Infinity values are still parsed when the option is set to false
> 2. Some values are parsed differently depending on whether they are quoted or not (see results for positive and negative Infinity)
> h2. Input data
> {code:java}
> { "number": "NaN" }
> { "number": NaN }
> { "number": "+INF" }
> { "number": +INF }
> { "number": "-INF" }
> { "number": -INF }
> { "number": "INF" }
> { "number": INF }
> { "number": Infinity }
> { "number": +Infinity }
> { "number": -Infinity }
> { "number": "Infinity" }
> { "number": "+Infinity" }
> { "number": "-Infinity" }
> {code}
> h2. Setup
> {code:java}
> import org.apache.spark.sql.types._
> val schema = StructType(Seq(StructField("number", DataTypes.FloatType, false))) {code}
> h2. allowNonNumericNumbers = false
> {code:java}
> val df = spark.read.format("json").schema(schema).option("allowNonNumericNumbers", "false").json("nan_valid.json")
> df.show
> +---------+
> |   number|
> +---------+
> |      NaN|
> |     null|
> |     null|
> |     null|
> |     null|
> |     null|
> |     null|
> |     null|
> |     null|
> |     null|
> |     null|
> | Infinity|
> |     null|
> |-Infinity|
> +---------+ {code}
> h2. allowNonNumericNumbers = true
> {code:java}
> val df = spark.read.format("json").schema(schema).option("allowNonNumericNumbers", "true").json("nan_valid.json") 
> df.show
> +---------+
> |   number|
> +---------+
> |      NaN|
> |      NaN|
> |     null|
> | Infinity|
> |     null|
> |-Infinity|
> |     null|
> |     null|
> | Infinity|
> | Infinity|
> |-Infinity|
> | Infinity|
> |     null|
> |-Infinity|
> +---------+{code}



--
This message was sent by Atlassian Jira
(v8.20.1#820001)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org