You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@spark.apache.org by "Yuming Wang (Jira)" <ji...@apache.org> on 2021/02/23 23:42:00 UTC

[jira] [Updated] (SPARK-34512) Disable validate default values when parsing Avro schemas

     [ https://issues.apache.org/jira/browse/SPARK-34512?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Yuming Wang updated SPARK-34512:
--------------------------------
    Description: 
How to reproduce this issue:
{code:scala}
  // Add this test to HiveSerDeReadWriteSuite
  test("SPARK-34512") {
    withTable("t1") {
      hiveClient.runSqlHive(
        """
          |CREATE TABLE t1
          |  ROW FORMAT SERDE
          |  'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
          |  STORED AS INPUTFORMAT
          |  'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
          |  OUTPUTFORMAT
          |  'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
          |  TBLPROPERTIES (
          |    'avro.schema.literal'='{
          |      "namespace": "org.apache.spark.sql.hive.test",
          |      "name": "schema_with_default_value",
          |      "type": "record",
          |      "fields": [
          |         {
          |           "name": "ARRAY_WITH_DEFAULT",
          |           "type": {"type": "array", "items": "string"},
          |           "default": null
          |         }
          |       ]
          |    }')
          |""".stripMargin)

      spark.sql("select * from t1").show
    }
  }
{code}


{noformat}
org.apache.avro.AvroTypeException: Invalid default for field ARRAY_WITH_DEFAULT: null not a {"type":"array","items":"string"}
	at org.apache.avro.Schema.validateDefault(Schema.java:1571)
	at org.apache.avro.Schema.access$500(Schema.java:87)
	at org.apache.avro.Schema$Field.<init>(Schema.java:544)
	at org.apache.avro.Schema.parse(Schema.java:1678)
	at org.apache.avro.Schema$Parser.parse(Schema.java:1425)
	at org.apache.avro.Schema$Parser.parse(Schema.java:1413)
	at org.apache.hadoop.hive.serde2.avro.AvroSerdeUtils.getSchemaFor(AvroSerdeUtils.java:268)
	at org.apache.hadoop.hive.serde2.avro.AvroSerdeUtils.determineSchemaOrThrowException(AvroSerdeUtils.java:111)
	at org.apache.hadoop.hive.serde2.avro.AvroSerDe.determineSchemaOrReturnErrorSchema(AvroSerDe.java:187)
	at org.apache.hadoop.hive.serde2.avro.AvroSerDe.initialize(AvroSerDe.java:107)
	at org.apache.hadoop.hive.serde2.avro.AvroSerDe.initialize(AvroSerDe.java:83)
	at org.apache.hadoop.hive.serde2.SerDeUtils.initializeSerDe(SerDeUtils.java:533)
	at org.apache.hadoop.hive.metastore.MetaStoreUtils.getDeserializer(MetaStoreUtils.java:450)
	at org.apache.hadoop.hive.metastore.MetaStoreUtils.getDeserializer(MetaStoreUtils.java:437)
	at org.apache.hadoop.hive.ql.metadata.Table.getDeserializerFromMetaStore(Table.java:281)
	at org.apache.hadoop.hive.ql.metadata.Table.getDeserializer(Table.java:263)
	at org.apache.hadoop.hive.ql.metadata.Table.getColsInternal(Table.java:641)
	at org.apache.hadoop.hive.ql.metadata.Table.getCols(Table.java:624)
	at org.apache.hadoop.hive.ql.metadata.Hive.createTable(Hive.java:831)
	at org.apache.hadoop.hive.ql.metadata.Hive.createTable(Hive.java:867)
	at org.apache.hadoop.hive.ql.exec.DDLTask.createTable(DDLTask.java:4356)
	at org.apache.hadoop.hive.ql.exec.DDLTask.execute(DDLTask.java:354)
	at org.apache.hadoop.hive.ql.exec.Task.executeTask(Task.java:199)
	at org.apache.hadoop.hive.ql.exec.TaskRunner.runSequential(TaskRunner.java:100)
	at org.apache.hadoop.hive.ql.Driver.launchTask(Driver.java:2183)
	at org.apache.hadoop.hive.ql.Driver.execute(Driver.java:1839)
	at org.apache.hadoop.hive.ql.Driver.runInternal(Driver.java:1526)
	at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1237)
	at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1227)
	at org.apache.spark.sql.hive.client.HiveClientImpl.$anonfun$runHive$1(HiveClientImpl.scala:820)
	at org.apache.spark.sql.hive.client.HiveClientImpl.$anonfun$withHiveState$1(HiveClientImpl.scala:291)
	at org.apache.spark.sql.hive.client.HiveClientImpl.liftedTree1$1(HiveClientImpl.scala:224)
	at org.apache.spark.sql.hive.client.HiveClientImpl.retryLocked(HiveClientImpl.scala:223)
	at org.apache.spark.sql.hive.client.HiveClientImpl.withHiveState(HiveClientImpl.scala:273)
	at org.apache.spark.sql.hive.client.HiveClientImpl.runHive(HiveClientImpl.scala:800)
	at org.apache.spark.sql.hive.client.HiveClientImpl.runSqlHive(HiveClientImpl.scala:787)

{noformat}

It worked before; this is a regression.


  was:
How to reproduce this issue:
{code:scala}

{code}


> Disable validate default values when parsing Avro schemas
> ---------------------------------------------------------
>
>                 Key: SPARK-34512
>                 URL: https://issues.apache.org/jira/browse/SPARK-34512
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 3.2.0
>            Reporter: Yuming Wang
>            Priority: Major
>
> How to reproduce this issue:
> {code:scala}
>   // Add this test to HiveSerDeReadWriteSuite
>   test("SPARK-34512") {
>     withTable("t1") {
>       hiveClient.runSqlHive(
>         """
>           |CREATE TABLE t1
>           |  ROW FORMAT SERDE
>           |  'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
>           |  STORED AS INPUTFORMAT
>           |  'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
>           |  OUTPUTFORMAT
>           |  'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
>           |  TBLPROPERTIES (
>           |    'avro.schema.literal'='{
>           |      "namespace": "org.apache.spark.sql.hive.test",
>           |      "name": "schema_with_default_value",
>           |      "type": "record",
>           |      "fields": [
>           |         {
>           |           "name": "ARRAY_WITH_DEFAULT",
>           |           "type": {"type": "array", "items": "string"},
>           |           "default": null
>           |         }
>           |       ]
>           |    }')
>           |""".stripMargin)
>       spark.sql("select * from t1").show
>     }
>   }
> {code}
> {noformat}
> org.apache.avro.AvroTypeException: Invalid default for field ARRAY_WITH_DEFAULT: null not a {"type":"array","items":"string"}
> 	at org.apache.avro.Schema.validateDefault(Schema.java:1571)
> 	at org.apache.avro.Schema.access$500(Schema.java:87)
> 	at org.apache.avro.Schema$Field.<init>(Schema.java:544)
> 	at org.apache.avro.Schema.parse(Schema.java:1678)
> 	at org.apache.avro.Schema$Parser.parse(Schema.java:1425)
> 	at org.apache.avro.Schema$Parser.parse(Schema.java:1413)
> 	at org.apache.hadoop.hive.serde2.avro.AvroSerdeUtils.getSchemaFor(AvroSerdeUtils.java:268)
> 	at org.apache.hadoop.hive.serde2.avro.AvroSerdeUtils.determineSchemaOrThrowException(AvroSerdeUtils.java:111)
> 	at org.apache.hadoop.hive.serde2.avro.AvroSerDe.determineSchemaOrReturnErrorSchema(AvroSerDe.java:187)
> 	at org.apache.hadoop.hive.serde2.avro.AvroSerDe.initialize(AvroSerDe.java:107)
> 	at org.apache.hadoop.hive.serde2.avro.AvroSerDe.initialize(AvroSerDe.java:83)
> 	at org.apache.hadoop.hive.serde2.SerDeUtils.initializeSerDe(SerDeUtils.java:533)
> 	at org.apache.hadoop.hive.metastore.MetaStoreUtils.getDeserializer(MetaStoreUtils.java:450)
> 	at org.apache.hadoop.hive.metastore.MetaStoreUtils.getDeserializer(MetaStoreUtils.java:437)
> 	at org.apache.hadoop.hive.ql.metadata.Table.getDeserializerFromMetaStore(Table.java:281)
> 	at org.apache.hadoop.hive.ql.metadata.Table.getDeserializer(Table.java:263)
> 	at org.apache.hadoop.hive.ql.metadata.Table.getColsInternal(Table.java:641)
> 	at org.apache.hadoop.hive.ql.metadata.Table.getCols(Table.java:624)
> 	at org.apache.hadoop.hive.ql.metadata.Hive.createTable(Hive.java:831)
> 	at org.apache.hadoop.hive.ql.metadata.Hive.createTable(Hive.java:867)
> 	at org.apache.hadoop.hive.ql.exec.DDLTask.createTable(DDLTask.java:4356)
> 	at org.apache.hadoop.hive.ql.exec.DDLTask.execute(DDLTask.java:354)
> 	at org.apache.hadoop.hive.ql.exec.Task.executeTask(Task.java:199)
> 	at org.apache.hadoop.hive.ql.exec.TaskRunner.runSequential(TaskRunner.java:100)
> 	at org.apache.hadoop.hive.ql.Driver.launchTask(Driver.java:2183)
> 	at org.apache.hadoop.hive.ql.Driver.execute(Driver.java:1839)
> 	at org.apache.hadoop.hive.ql.Driver.runInternal(Driver.java:1526)
> 	at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1237)
> 	at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1227)
> 	at org.apache.spark.sql.hive.client.HiveClientImpl.$anonfun$runHive$1(HiveClientImpl.scala:820)
> 	at org.apache.spark.sql.hive.client.HiveClientImpl.$anonfun$withHiveState$1(HiveClientImpl.scala:291)
> 	at org.apache.spark.sql.hive.client.HiveClientImpl.liftedTree1$1(HiveClientImpl.scala:224)
> 	at org.apache.spark.sql.hive.client.HiveClientImpl.retryLocked(HiveClientImpl.scala:223)
> 	at org.apache.spark.sql.hive.client.HiveClientImpl.withHiveState(HiveClientImpl.scala:273)
> 	at org.apache.spark.sql.hive.client.HiveClientImpl.runHive(HiveClientImpl.scala:800)
> 	at org.apache.spark.sql.hive.client.HiveClientImpl.runSqlHive(HiveClientImpl.scala:787)
> {noformat}
> It worked before; this is a regression.



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org