You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@parquet.apache.org by "cesar matos (JIRA)" <ji...@apache.org> on 2018/09/03 07:58:00 UTC
[jira] [Updated] (PARQUET-1409) Can write but not read parquet file
with nested arrays
[ https://issues.apache.org/jira/browse/PARQUET-1409?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
cesar matos updated PARQUET-1409:
---------------------------------
Environment: (was: This is the code used to generate this file:
{code:java}
val filename = "/tmp/test.parquet"
val path = Paths.get(filename).toFile
val conf = new Configuration()
val schema: Schema = {
val inner = Schema.createRecord("inner", "some doc", "outer", false,
List(new Schema.Field("b", Schema.createArray(Schema.create(Schema.Type.INT)), "", null: Object)).asJava
)
Schema.createRecord("outer", "", "", false,
List(new Schema.Field("a", Schema.createArray(inner), "", null: Object)).asJava
)
}
val os = new FileOutputStream(path)
val outputFile = new RawParquetOutputFile(os)
val parquetWriter: ParquetWriter[GenericRecord] = AvroParquetWriter.builder[GenericRecord](outputFile)
.withConf(conf)
.withSchema(schema)
.build()
val data = Outer(
Array(
Inner(Array(1, 2)),
Inner(Array(3, 4))
)
)
val record = new GenericData.Record(schema)
val fieldA = schema.getField("a").schema()
val recorData = {
val fieldAType = fieldA.getElementType()
data.a.map { x =>
val innerRecord = new GenericData.Record(fieldAType)
innerRecord.put("b", x.b)
innerRecord
}
}
record.put("a", recorData)
parquetWriter.write(record)
parquetWriter.close()
os.close()
{code}
Also if I pass the configuration option
{code:java}
parquet.avro.add-list-element-records = false
{code}
I get a different exception:
org.apache.avro.SchemaParseException: Can't redefine: list)
Description:
I am trying to read a parquet file in scala using the Avro interface (1.10.0). The file was also generated using the same interface.
The data that I am writing looks like this:
{code:java}
case class Inner(b: Array[Int])
case class Outer(a: Array[Inner])
val data = Outer(
Array(
Inner(Array(1, 2)),
Inner(Array(3, 4))
)
)
{code}
Using parquet-tools to read the file looks like this:
{code:java}
$ parquet-tools cat /tmp/test.parquet
a:
.array:
..b:
...array = 1
...array = 2
.array:
..b:
...array = 3
...array = 4
{code}
But while trying to read the file I get the following exception:
{code:java}
Exception in thread "main" org.apache.parquet.io.InvalidRecordException: Parquet/Avro schema mismatch: Avro field 'array' not found
at org.apache.parquet.avro.AvroRecordConverter.getAvroField(AvroRecordConverter.java:225)
at org.apache.parquet.avro.AvroRecordConverter.<init>(AvroRecordConverter.java:130)
at org.apache.parquet.avro.AvroRecordConverter.newConverter(AvroRecordConverter.java:279)
at org.apache.parquet.avro.AvroRecordConverter.newConverter(AvroRecordConverter.java:232)
at org.apache.parquet.avro.AvroRecordConverter.access$100(AvroRecordConverter.java:78)
at org.apache.parquet.avro.AvroRecordConverter$AvroCollectionConverter$ElementConverter.<init>(AvroRecordConverter.java:536)
at org.apache.parquet.avro.AvroRecordConverter$AvroCollectionConverter.<init>(AvroRecordConverter.java:486)
at org.apache.parquet.avro.AvroRecordConverter.newConverter(AvroRecordConverter.java:289)
at org.apache.parquet.avro.AvroRecordConverter.<init>(AvroRecordConverter.java:141)
at org.apache.parquet.avro.AvroRecordConverter.<init>(AvroRecordConverter.java:95)
at org.apache.parquet.avro.AvroRecordMaterializer.<init>(AvroRecordMaterializer.java:33)
at org.apache.parquet.avro.AvroReadSupport.prepareForRead(AvroReadSupport.java:138)
at org.apache.parquet.hadoop.InternalParquetRecordReader.initialize(InternalParquetRecordReader.java:183)
at org.apache.parquet.hadoop.ParquetReader.initReader(ParquetReader.java:156)
at org.apache.parquet.hadoop.ParquetReader.read(ParquetReader.java:135)
at raw.runtime.writer.parquet.avro.Lixo$.main(Lixo.scala:78)
at raw.runtime.writer.parquet.avro.Lixo.main(Lixo.scala)
{code}
This is the code used to generate this file:
{code:java}
val filename = "/tmp/test.parquet"
val path = Paths.get(filename).toFile
val conf = new Configuration()
val schema: Schema = {
val inner = Schema.createRecord("inner", "some doc", "outer", false,
List(new Schema.Field("b", Schema.createArray(Schema.create(Schema.Type.INT)), "", null: Object)).asJava
)
Schema.createRecord("outer", "", "", false,
List(new Schema.Field("a", Schema.createArray(inner), "", null: Object)).asJava
)
}
val os = new FileOutputStream(path)
val outputFile = new RawParquetOutputFile(os)
val parquetWriter: ParquetWriter[GenericRecord] = AvroParquetWriter.builder[GenericRecord](outputFile)
.withConf(conf)
.withSchema(schema)
.build()
val data = Outer(
Array(
Inner(Array(1, 2)),
Inner(Array(3, 4))
)
)
val record = new GenericData.Record(schema)
val fieldA = schema.getField("a").schema()
val recorData = {
val fieldAType = fieldA.getElementType()
data.a.map { x =>
val innerRecord = new GenericData.Record(fieldAType)
innerRecord.put("b", x.b)
innerRecord
}
}
record.put("a", recorData)
parquetWriter.write(record)
parquetWriter.close()
os.close()
{code}
Also if I pass the configuration option
{code:java}
parquet.avro.add-list-element-records = false
{code}
I get a different exception:
org.apache.avro.SchemaParseException: Can't redefine: list
Am I doing something wrong?
was:
I am trying to read a parquet file in scala using the Avro interface (1.10.0). The file was also generated using the same interface.
The data that I am writing looks like this:
{code:java}
case class Inner(b: Array[Int])
case class Outer(a: Array[Inner])
val data = Outer(
Array(
Inner(Array(1, 2)),
Inner(Array(3, 4))
)
)
{code}
Using parquet-tools to read the file looks like this:
{code:java}
$ parquet-tools cat /tmp/test.parquet
a:
.array:
..b:
...array = 1
...array = 2
.array:
..b:
...array = 3
...array = 4
{code}
But while trying to read the file I get the following exception:
{code:java}
Exception in thread "main" org.apache.parquet.io.InvalidRecordException: Parquet/Avro schema mismatch: Avro field 'array' not found
at org.apache.parquet.avro.AvroRecordConverter.getAvroField(AvroRecordConverter.java:225)
at org.apache.parquet.avro.AvroRecordConverter.<init>(AvroRecordConverter.java:130)
at org.apache.parquet.avro.AvroRecordConverter.newConverter(AvroRecordConverter.java:279)
at org.apache.parquet.avro.AvroRecordConverter.newConverter(AvroRecordConverter.java:232)
at org.apache.parquet.avro.AvroRecordConverter.access$100(AvroRecordConverter.java:78)
at org.apache.parquet.avro.AvroRecordConverter$AvroCollectionConverter$ElementConverter.<init>(AvroRecordConverter.java:536)
at org.apache.parquet.avro.AvroRecordConverter$AvroCollectionConverter.<init>(AvroRecordConverter.java:486)
at org.apache.parquet.avro.AvroRecordConverter.newConverter(AvroRecordConverter.java:289)
at org.apache.parquet.avro.AvroRecordConverter.<init>(AvroRecordConverter.java:141)
at org.apache.parquet.avro.AvroRecordConverter.<init>(AvroRecordConverter.java:95)
at org.apache.parquet.avro.AvroRecordMaterializer.<init>(AvroRecordMaterializer.java:33)
at org.apache.parquet.avro.AvroReadSupport.prepareForRead(AvroReadSupport.java:138)
at org.apache.parquet.hadoop.InternalParquetRecordReader.initialize(InternalParquetRecordReader.java:183)
at org.apache.parquet.hadoop.ParquetReader.initReader(ParquetReader.java:156)
at org.apache.parquet.hadoop.ParquetReader.read(ParquetReader.java:135)
at raw.runtime.writer.parquet.avro.Lixo$.main(Lixo.scala:78)
at raw.runtime.writer.parquet.avro.Lixo.main(Lixo.scala)
{code}
Am I doing something wrong?
> Can write but not read parquet file with nested arrays
> ------------------------------------------------------
>
> Key: PARQUET-1409
> URL: https://issues.apache.org/jira/browse/PARQUET-1409
> Project: Parquet
> Issue Type: Bug
> Components: parquet-avro
> Affects Versions: 1.10.0
> Reporter: cesar matos
> Priority: Major
>
> I am trying to read a parquet file in scala using the Avro interface (1.10.0). The file was also generated using the same interface.
> The data that I am writing looks like this:
>
> {code:java}
> case class Inner(b: Array[Int])
> case class Outer(a: Array[Inner])
> val data = Outer(
> Array(
> Inner(Array(1, 2)),
> Inner(Array(3, 4))
> )
> )
> {code}
>
> Using parquet-tools to read the file looks like this:
>
> {code:java}
> $ parquet-tools cat /tmp/test.parquet
> a:
> .array:
> ..b:
> ...array = 1
> ...array = 2
> .array:
> ..b:
> ...array = 3
> ...array = 4
> {code}
>
> But while trying to read the file I get the following exception:
>
>
> {code:java}
>
> Exception in thread "main" org.apache.parquet.io.InvalidRecordException: Parquet/Avro schema mismatch: Avro field 'array' not found
> at org.apache.parquet.avro.AvroRecordConverter.getAvroField(AvroRecordConverter.java:225)
> at org.apache.parquet.avro.AvroRecordConverter.<init>(AvroRecordConverter.java:130)
> at org.apache.parquet.avro.AvroRecordConverter.newConverter(AvroRecordConverter.java:279)
> at org.apache.parquet.avro.AvroRecordConverter.newConverter(AvroRecordConverter.java:232)
> at org.apache.parquet.avro.AvroRecordConverter.access$100(AvroRecordConverter.java:78)
> at org.apache.parquet.avro.AvroRecordConverter$AvroCollectionConverter$ElementConverter.<init>(AvroRecordConverter.java:536)
> at org.apache.parquet.avro.AvroRecordConverter$AvroCollectionConverter.<init>(AvroRecordConverter.java:486)
> at org.apache.parquet.avro.AvroRecordConverter.newConverter(AvroRecordConverter.java:289)
> at org.apache.parquet.avro.AvroRecordConverter.<init>(AvroRecordConverter.java:141)
> at org.apache.parquet.avro.AvroRecordConverter.<init>(AvroRecordConverter.java:95)
> at org.apache.parquet.avro.AvroRecordMaterializer.<init>(AvroRecordMaterializer.java:33)
> at org.apache.parquet.avro.AvroReadSupport.prepareForRead(AvroReadSupport.java:138)
> at org.apache.parquet.hadoop.InternalParquetRecordReader.initialize(InternalParquetRecordReader.java:183)
> at org.apache.parquet.hadoop.ParquetReader.initReader(ParquetReader.java:156)
> at org.apache.parquet.hadoop.ParquetReader.read(ParquetReader.java:135)
> at raw.runtime.writer.parquet.avro.Lixo$.main(Lixo.scala:78)
> at raw.runtime.writer.parquet.avro.Lixo.main(Lixo.scala)
>
> {code}
>
> This is the code used to generate this file:
> {code:java}
> val filename = "/tmp/test.parquet"
> val path = Paths.get(filename).toFile
> val conf = new Configuration()
> val schema: Schema = {
> val inner = Schema.createRecord("inner", "some doc", "outer", false,
> List(new Schema.Field("b", Schema.createArray(Schema.create(Schema.Type.INT)), "", null: Object)).asJava
> )
> Schema.createRecord("outer", "", "", false,
> List(new Schema.Field("a", Schema.createArray(inner), "", null: Object)).asJava
> )
> }
> val os = new FileOutputStream(path)
> val outputFile = new RawParquetOutputFile(os)
> val parquetWriter: ParquetWriter[GenericRecord] = AvroParquetWriter.builder[GenericRecord](outputFile)
> .withConf(conf)
> .withSchema(schema)
> .build()
> val data = Outer(
> Array(
> Inner(Array(1, 2)),
> Inner(Array(3, 4))
> )
> )
> val record = new GenericData.Record(schema)
> val fieldA = schema.getField("a").schema()
> val recorData = {
> val fieldAType = fieldA.getElementType()
> data.a.map { x =>
> val innerRecord = new GenericData.Record(fieldAType)
> innerRecord.put("b", x.b)
> innerRecord
> }
> }
> record.put("a", recorData)
> parquetWriter.write(record)
> parquetWriter.close()
> os.close()
> {code}
> Also if I pass the configuration option
> {code:java}
> parquet.avro.add-list-element-records = false
> {code}
> I get a different exception:
> org.apache.avro.SchemaParseException: Can't redefine: list
>
> Am I doing something wrong?
>
>
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)