You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@spark.apache.org by Arun Patel <ar...@gmail.com> on 2016/09/08 21:31:15 UTC
spark-xml to avro - SchemaParseException: Can't redefine
I'm trying to convert XML to AVRO, but I am getting a SchemaParseException
for 'Rules', which exists in two separate containers. Any
thoughts?
XML is attached.
df =
sqlContext.read.format('com.databricks.spark.xml').options(rowTag='GGLResponse',attributePrefix='').load('GGL.xml')
df.show()
+--------------------+--------------------+---+--------------------+
| ResponseDataset| ResponseHeader|ns2| xmlns|
+--------------------+--------------------+---+--------------------+
|[[[[[1,1],[SD2000...|[2016-07-26T16:28...|GGL|http://www.xxxx.c...|
+--------------------+--------------------+---+--------------------+
>>> df.printSchema()
root
|-- ResponseDataset: struct (nullable = true)
| |-- ResponseFileGGL: struct (nullable = true)
| | |-- OfferSets: struct (nullable = true)
| | | |-- OfferSet: struct (nullable = true)
| | | | |-- OfferSetHeader: struct (nullable = true)
| | | | | |-- OfferSetIdentifier: long (nullable = true)
| | | | | |-- TotalOffersProcessed: long (nullable = true)
| | | | |-- Offers: struct (nullable = true)
| | | | | |-- Identifier: string (nullable = true)
| | | | | |-- Offer: struct (nullable = true)
| | | | | | |-- Rules: struct (nullable = true)
| | | | | | | |-- Rule: array (nullable = true)
| | | | | | | | |-- element: struct (containsNull
= true)
| | | | | | | | | |-- BorrowerIdentifier: long
(nullable = true)
| | | | | | | | | |-- RuleIdentifier: long
(nullable = true)
| | | | | |-- PartyRoleIdentifier: long (nullable = true)
| | | | | |-- SuffixIdentifier: string (nullable = true)
| | | | | |-- UCP: string (nullable = true)
| | | | |-- Pool: struct (nullable = true)
| | | | | |-- Identifier: string (nullable = true)
| | | | | |-- PartyRoleIdentifier: long (nullable = true)
| | | | | |-- Rules: struct (nullable = true)
| | | | | | |-- Rule: array (nullable = true)
| | | | | | | |-- element: struct (containsNull =
true)
| | | | | | | | |-- BIdentifier: long (nullable =
true)
| | | | | | | | |-- RIdentifier: long (nullable =
true)
| | | | | |-- SuffixIdentifier: string (nullable = true)
| | | | | |-- UCP: string (nullable = true)
| | |-- ResultHeader: struct (nullable = true)
| | | |-- RequestDateTime: string (nullable = true)
| | | |-- ResultDateTime: string (nullable = true)
| |-- ResponseFileUUID: string (nullable = true)
| |-- ResponseFileVersion: double (nullable = true)
|-- ResponseHeader: struct (nullable = true)
| |-- ResponseDateTime: string (nullable = true)
| |-- SessionIdentifier: string (nullable = true)
|-- ns2: string (nullable = true)
|-- xmlns: string (nullable = true)
df.write.format('com.databricks.spark.avro').save('ggl_avro')
16/09/08 17:07:20 INFO MemoryStore: Block broadcast_73 stored as values in
memory (estimated size 233.5 KB, free 772.4 KB)
16/09/08 17:07:20 INFO MemoryStore: Block broadcast_73_piece0 stored as
bytes in memory (estimated size 28.2 KB, free 800.6 KB)
16/09/08 17:07:20 INFO BlockManagerInfo: Added broadcast_73_piece0 in
memory on localhost:29785 (size: 28.2 KB, free: 511.4 MB)
16/09/08 17:07:20 INFO SparkContext: Created broadcast 73 from
newAPIHadoopFile at XmlFile.scala:39
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/hdp/2.4.2.0-258/spark/python/pyspark/sql/readwriter.py", line
397, in save
self._jwrite.save(path)
File
"/usr/hdp/2.4.2.0-258/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py",
line 813, in __call__
File "/usr/hdp/2.4.2.0-258/spark/python/pyspark/sql/utils.py", line 45,
in deco
return f(*a, **kw)
File
"/usr/hdp/2.4.2.0-258/spark/python/lib/py4j-0.9-src.zip/py4j/protocol.py",
line 308, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o426.save.
: org.apache.avro.SchemaParseException: Can't redefine: Rules
at
org.apache.avro.SchemaBuilder$NameContext.put(SchemaBuilder.java:936)
at
org.apache.avro.SchemaBuilder$NameContext.access$600(SchemaBuilder.java:884)
at
org.apache.avro.SchemaBuilder$NamespacedBuilder.completeSchema(SchemaBuilder.java:470)
at
org.apache.avro.SchemaBuilder$RecordBuilder.fields(SchemaBuilder.java:1734)
....
Re: spark-xml to avro - SchemaParseException: Can't redefine
Posted by Arun Patel <ar...@gmail.com>.
Thank you Yong. I just looked at it.
There was a pull request (#73
<https://github.com/databricks/spark-avro/pull/73>) as well. Is anything
wrong with that fix? Can I use a similar fix?
On Thu, Sep 8, 2016 at 8:53 PM, Yong Zhang <ja...@hotmail.com> wrote:
> Did you take a look at this -> https://github.com/databricks/
> spark-avro/issues/54
>
>
>
> Yong
> <https://github.com/databricks/spark-avro/issues/54>
> spark-avro fails to save DF with nested records having the ...
> <https://github.com/databricks/spark-avro/issues/54>
> github.com
> sixers changed the title from Save DF with nested records with the same
> name to spark-avro fails to save DF with nested records having the same
> name Jun 23, 2015
>
>
>
> ------------------------------
> *From:* Arun Patel <ar...@gmail.com>
> *Sent:* Thursday, September 8, 2016 5:31 PM
> *To:* user
> *Subject:* spark-xml to avro - SchemaParseException: Can't redefine
>
> I'm trying to convert XML to AVRO, but I am getting a SchemaParseException
> for 'Rules', which exists in two separate containers. Any
> thoughts?
>
> XML is attached.
>
> df = sqlContext.read.format('com.databricks.spark.xml').
> options(rowTag='GGLResponse',attributePrefix='').load('GGL.xml')
> df.show()
> +--------------------+--------------------+---+--------------------+
> | ResponseDataset| ResponseHeader|ns2| xmlns|
> +--------------------+--------------------+---+--------------------+
> |[[[[[1,1],[SD2000...|[2016-07-26T16:28...|GGL|http://www.xxxx.c...|
> +--------------------+--------------------+---+--------------------+
>
> >>> df.printSchema()
> root
> |-- ResponseDataset: struct (nullable = true)
> | |-- ResponseFileGGL: struct (nullable = true)
> | | |-- OfferSets: struct (nullable = true)
> | | | |-- OfferSet: struct (nullable = true)
> | | | | |-- OfferSetHeader: struct (nullable = true)
> | | | | | |-- OfferSetIdentifier: long (nullable = true)
> | | | | | |-- TotalOffersProcessed: long (nullable = true)
> | | | | |-- Offers: struct (nullable = true)
> | | | | | |-- Identifier: string (nullable = true)
> | | | | | |-- Offer: struct (nullable = true)
> | | | | | | |-- Rules: struct (nullable = true)
> | | | | | | | |-- Rule: array (nullable = true)
> | | | | | | | | |-- element: struct
> (containsNull = true)
> | | | | | | | | | |-- BorrowerIdentifier:
> long (nullable = true)
> | | | | | | | | | |-- RuleIdentifier: long
> (nullable = true)
> | | | | | |-- PartyRoleIdentifier: long (nullable = true)
> | | | | | |-- SuffixIdentifier: string (nullable = true)
> | | | | | |-- UCP: string (nullable = true)
> | | | | |-- Pool: struct (nullable = true)
> | | | | | |-- Identifier: string (nullable = true)
> | | | | | |-- PartyRoleIdentifier: long (nullable = true)
> | | | | | |-- Rules: struct (nullable = true)
> | | | | | | |-- Rule: array (nullable = true)
> | | | | | | | |-- element: struct (containsNull =
> true)
> | | | | | | | | |-- BIdentifier: long (nullable
> = true)
> | | | | | | | | |-- RIdentifier: long (nullable
> = true)
> | | | | | |-- SuffixIdentifier: string (nullable = true)
> | | | | | |-- UCP: string (nullable = true)
> | | |-- ResultHeader: struct (nullable = true)
> | | | |-- RequestDateTime: string (nullable = true)
> | | | |-- ResultDateTime: string (nullable = true)
> | |-- ResponseFileUUID: string (nullable = true)
> | |-- ResponseFileVersion: double (nullable = true)
> |-- ResponseHeader: struct (nullable = true)
> | |-- ResponseDateTime: string (nullable = true)
> | |-- SessionIdentifier: string (nullable = true)
> |-- ns2: string (nullable = true)
> |-- xmlns: string (nullable = true)
>
>
> df.write.format('com.databricks.spark.avro').save('ggl_avro')
>
> 16/09/08 17:07:20 INFO MemoryStore: Block broadcast_73 stored as values in
> memory (estimated size 233.5 KB, free 772.4 KB)
> 16/09/08 17:07:20 INFO MemoryStore: Block broadcast_73_piece0 stored as
> bytes in memory (estimated size 28.2 KB, free 800.6 KB)
> 16/09/08 17:07:20 INFO BlockManagerInfo: Added broadcast_73_piece0 in
> memory on localhost:29785 (size: 28.2 KB, free: 511.4 MB)
> 16/09/08 17:07:20 INFO SparkContext: Created broadcast 73 from
> newAPIHadoopFile at XmlFile.scala:39
> Traceback (most recent call last):
> File "<stdin>", line 1, in <module>
> File "/usr/hdp/2.4.2.0-258/spark/python/pyspark/sql/readwriter.py",
> line 397, in save
> self._jwrite.save(path)
> File "/usr/hdp/2.4.2.0-258/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py",
> line 813, in __call__
> File "/usr/hdp/2.4.2.0-258/spark/python/pyspark/sql/utils.py", line 45,
> in deco
> return f(*a, **kw)
> File "/usr/hdp/2.4.2.0-258/spark/python/lib/py4j-0.9-src.zip/py4j/protocol.py",
> line 308, in get_return_value
> py4j.protocol.Py4JJavaError: An error occurred while calling o426.save.
> : org.apache.avro.SchemaParseException: Can't redefine: Rules
> at org.apache.avro.SchemaBuilder$NameContext.put(SchemaBuilder.
> java:936)
> at org.apache.avro.SchemaBuilder$NameContext.access$600(
> SchemaBuilder.java:884)
> at org.apache.avro.SchemaBuilder$NamespacedBuilder.
> completeSchema(SchemaBuilder.java:470)
> at org.apache.avro.SchemaBuilder$RecordBuilder.fields(
> SchemaBuilder.java:1734)
> ....
>
>
Re: spark-xml to avro - SchemaParseException: Can't redefine
Posted by Yong Zhang <ja...@hotmail.com>.
Did you take a look at this -> https://github.com/databricks/spark-avro/issues/54
Yong
[https://avatars0.githubusercontent.com/u/1457102?v=3&s=400]<https://github.com/databricks/spark-avro/issues/54>
spark-avro fails to save DF with nested records having the ...<https://github.com/databricks/spark-avro/issues/54>
github.com
sixers changed the title from Save DF with nested records with the same name to spark-avro fails to save DF with nested records having the same name Jun 23, 2015
________________________________
From: Arun Patel <ar...@gmail.com>
Sent: Thursday, September 8, 2016 5:31 PM
To: user
Subject: spark-xml to avro - SchemaParseException: Can't redefine
I'm trying to convert XML to AVRO, but I am getting a SchemaParseException for 'Rules', which exists in two separate containers. Any thoughts?
XML is attached.
df = sqlContext.read.format('com.databricks.spark.xml').options(rowTag='GGLResponse',attributePrefix='').load('GGL.xml')
df.show()
+--------------------+--------------------+---+--------------------+
| ResponseDataset| ResponseHeader|ns2| xmlns|
+--------------------+--------------------+---+--------------------+
|[[[[[1,1],[SD2000...|[2016-07-26T16:28...|GGL|http://www.xxxx.c...|
+--------------------+--------------------+---+--------------------+
>>> df.printSchema()
root
|-- ResponseDataset: struct (nullable = true)
| |-- ResponseFileGGL: struct (nullable = true)
| | |-- OfferSets: struct (nullable = true)
| | | |-- OfferSet: struct (nullable = true)
| | | | |-- OfferSetHeader: struct (nullable = true)
| | | | | |-- OfferSetIdentifier: long (nullable = true)
| | | | | |-- TotalOffersProcessed: long (nullable = true)
| | | | |-- Offers: struct (nullable = true)
| | | | | |-- Identifier: string (nullable = true)
| | | | | |-- Offer: struct (nullable = true)
| | | | | | |-- Rules: struct (nullable = true)
| | | | | | | |-- Rule: array (nullable = true)
| | | | | | | | |-- element: struct (containsNull = true)
| | | | | | | | | |-- BorrowerIdentifier: long (nullable = true)
| | | | | | | | | |-- RuleIdentifier: long (nullable = true)
| | | | | |-- PartyRoleIdentifier: long (nullable = true)
| | | | | |-- SuffixIdentifier: string (nullable = true)
| | | | | |-- UCP: string (nullable = true)
| | | | |-- Pool: struct (nullable = true)
| | | | | |-- Identifier: string (nullable = true)
| | | | | |-- PartyRoleIdentifier: long (nullable = true)
| | | | | |-- Rules: struct (nullable = true)
| | | | | | |-- Rule: array (nullable = true)
| | | | | | | |-- element: struct (containsNull = true)
| | | | | | | | |-- BIdentifier: long (nullable = true)
| | | | | | | | |-- RIdentifier: long (nullable = true)
| | | | | |-- SuffixIdentifier: string (nullable = true)
| | | | | |-- UCP: string (nullable = true)
| | |-- ResultHeader: struct (nullable = true)
| | | |-- RequestDateTime: string (nullable = true)
| | | |-- ResultDateTime: string (nullable = true)
| |-- ResponseFileUUID: string (nullable = true)
| |-- ResponseFileVersion: double (nullable = true)
|-- ResponseHeader: struct (nullable = true)
| |-- ResponseDateTime: string (nullable = true)
| |-- SessionIdentifier: string (nullable = true)
|-- ns2: string (nullable = true)
|-- xmlns: string (nullable = true)
df.write.format('com.databricks.spark.avro').save('ggl_avro')
16/09/08 17:07:20 INFO MemoryStore: Block broadcast_73 stored as values in memory (estimated size 233.5 KB, free 772.4 KB)
16/09/08 17:07:20 INFO MemoryStore: Block broadcast_73_piece0 stored as bytes in memory (estimated size 28.2 KB, free 800.6 KB)
16/09/08 17:07:20 INFO BlockManagerInfo: Added broadcast_73_piece0 in memory on localhost:29785 (size: 28.2 KB, free: 511.4 MB)
16/09/08 17:07:20 INFO SparkContext: Created broadcast 73 from newAPIHadoopFile at XmlFile.scala:39
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/hdp/2.4.2.0-258/spark/python/pyspark/sql/readwriter.py", line 397, in save
self._jwrite.save(path)
File "/usr/hdp/2.4.2.0-258/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py", line 813, in __call__
File "/usr/hdp/2.4.2.0-258/spark/python/pyspark/sql/utils.py", line 45, in deco
return f(*a, **kw)
File "/usr/hdp/2.4.2.0-258/spark/python/lib/py4j-0.9-src.zip/py4j/protocol.py", line 308, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o426.save.
: org.apache.avro.SchemaParseException: Can't redefine: Rules
at org.apache.avro.SchemaBuilder$NameContext.put(SchemaBuilder.java:936)
at org.apache.avro.SchemaBuilder$NameContext.access$600(SchemaBuilder.java:884)
at org.apache.avro.SchemaBuilder$NamespacedBuilder.completeSchema(SchemaBuilder.java:470)
at org.apache.avro.SchemaBuilder$RecordBuilder.fields(SchemaBuilder.java:1734)
....