You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@avro.apache.org by "Anton Agestam (Jira)" <ji...@apache.org> on 2023/05/24 14:02:00 UTC
[jira] [Updated] (AVRO-3760) Using enum with default symbol, cannot parse future value
[ https://issues.apache.org/jira/browse/AVRO-3760?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Anton Agestam updated AVRO-3760:
--------------------------------
Description:
It seems like support for enum default symbols is broken. In the example below, since the enum declares a default symbol, I expected to be able to add new symbols to the enum and, when decoding data containing a new symbol with the old schema, to get the default symbol back.
{code:python}
import io
from avro.io import DatumReader, DatumWriter, BinaryDecoder, BinaryEncoder
import avro.schema
current_schema = avro.schema.parse("""
{
"fields": [
{
"default": "unknown",
"name": "checksum_algorithm",
"type": {
"name": "ChecksumAlgorithm",
"symbols": [
"unknown",
"xxhash3_64_be"
],
"type": "enum",
"default": "unknown"
}
}
],
"name": "Metadata",
"type": "record"
}
""")
# Future schema adds the "crc32_be" symbol.
future_schema = avro.schema.parse("""
{
"fields": [
{
"default": "unknown",
"name": "checksum_algorithm",
"type": {
"name": "ChecksumAlgorithm",
"symbols": [
"unknown",
"xxhash3_64_be",
"crc32_be"
],
"type": "enum",
"default": "unknown"
}
}
],
"name": "Metadata",
"type": "record"
}
""")
with io.BytesIO() as buffer:
writer = DatumWriter(future_schema)
encoder = BinaryEncoder(buffer)
writer.write({"checksum_algorithm": "crc32_be"}, encoder)
buffer.seek(0)
reader = DatumReader(current_schema)
decoder = BinaryDecoder(buffer)
decoded = reader.read(decoder)
print(decoded)
{code}
Instead, this results in an exception:
{code}
Traceback (most recent call last):
File "reproduce-avro.py", line 58, in <module>
decoded = reader.read(decoder)
File "/Users/anton/.pyenv/versions/karapace/lib/python3.8/site-packages/avro/io.py", line 649, in read
return self.read_data(self.writers_schema, self.readers_schema, decoder)
File "/Users/anton/.pyenv/versions/karapace/lib/python3.8/site-packages/avro/io.py", line 727, in read_data
return self.read_record(writers_schema, readers_schema, decoder)
File "/Users/anton/.pyenv/versions/karapace/lib/python3.8/site-packages/avro/io.py", line 922, in read_record
field_val = self.read_data(field.type, readers_field.type, decoder)
File "/Users/anton/.pyenv/versions/karapace/lib/python3.8/site-packages/avro/io.py", line 720, in read_data
return self.read_enum(writers_schema, readers_schema, decoder)
File "/Users/anton/.pyenv/versions/karapace/lib/python3.8/site-packages/avro/io.py", line 779, in read_enum
raise avro.errors.SchemaResolutionException(
avro.errors.SchemaResolutionException: Can't access enum index 2 for enum with 2 symbols
Writer's Schema: {
"type": "enum",
"default": "unknown",
"name": "ChecksumAlgorithm",
"symbols": [
"unknown",
"xxhash3_64_be"
]
}
Reader's Schema: {
"type": "enum",
"default": "unknown",
"name": "ChecksumAlgorithm",
"symbols": [
"unknown",
"xxhash3_64_be"
]
}
{code}
was:
It seems like support for enum default symbols is broken. In the example below, since the enum declares a default symbol, I expected to be able to add new symbols to the enum and, when decoding data containing a new symbol with the old schema, to get the default symbol back.
{code:python}
import io
from avro.io import DatumReader, DatumWriter, BinaryDecoder, BinaryEncoder
import avro.schema
current_schema = avro.schema.parse("""
{
"fields": [
{
"default": "unknown",
"name": "checksum_algorithm",
"type": {
"name": "ChecksumAlgorithm",
"symbols": [
"unknown",
"xxhash3_64_be"
],
"type": "enum",
"default": "unknown"
}
}
],
"name": "Metadata",
"type": "record"
}
""")
# Future schema adds the "crc32_be" symbol.
future_schema = avro.schema.parse("""
{
"fields": [
{
"default": "unknown",
"name": "checksum_algorithm",
"type": {
"name": "ChecksumAlgorithm",
"symbols": [
"unknown",
"xxhash3_64_be",
"crc32_be"
],
"type": "enum",
"default": "unknown"
}
}
],
"name": "Metadata",
"type": "record"
}
""")
with io.BytesIO() as buffer:
writer = DatumWriter(future_schema)
encoder = BinaryEncoder(buffer)
writer.write({"checksum_algorithm": "crc32_be"}, encoder)
buffer.seek(0)
reader = DatumReader(current_schema)
decoder = BinaryDecoder(buffer)
decoded = reader.read(decoder)
print(decoded)
{code}
Instead, this results in an exception:
{code:java}
Traceback (most recent call last):
File "reproduce-avro.py", line 58, in <module>
decoded = reader.read(decoder)
File "/Users/anton/.pyenv/versions/karapace/lib/python3.8/site-packages/avro/io.py", line 649, in read
return self.read_data(self.writers_schema, self.readers_schema, decoder)
File "/Users/anton/.pyenv/versions/karapace/lib/python3.8/site-packages/avro/io.py", line 727, in read_data
return self.read_record(writers_schema, readers_schema, decoder)
File "/Users/anton/.pyenv/versions/karapace/lib/python3.8/site-packages/avro/io.py", line 922, in read_record
field_val = self.read_data(field.type, readers_field.type, decoder)
File "/Users/anton/.pyenv/versions/karapace/lib/python3.8/site-packages/avro/io.py", line 720, in read_data
return self.read_enum(writers_schema, readers_schema, decoder)
File "/Users/anton/.pyenv/versions/karapace/lib/python3.8/site-packages/avro/io.py", line 779, in read_enum
raise avro.errors.SchemaResolutionException(
avro.errors.SchemaResolutionException: Can't access enum index 2 for enum with 2 symbols
Writer's Schema: {
"type": "enum",
"default": "unknown",
"name": "ChecksumAlgorithm",
"symbols": [
"unknown",
"xxhash3_64_be"
]
}
Reader's Schema: {
"type": "enum",
"default": "unknown",
"name": "ChecksumAlgorithm",
"symbols": [
"unknown",
"xxhash3_64_be"
]
}
{code}
> Using enum with default symbol, cannot parse future value
> ---------------------------------------------------------
>
> Key: AVRO-3760
> URL: https://issues.apache.org/jira/browse/AVRO-3760
> Project: Apache Avro
> Issue Type: Bug
> Components: python
> Affects Versions: 1.11.1
> Environment: {code}
> $ pip freeze | grep -i avro
> avro==1.11.1
> $ python --version
> Python 3.8.16
> {code}
> Reporter: Anton Agestam
> Priority: Major
>
> It seems like support for enum default symbols is broken. In the example below, since the enum declares a default symbol, I expected to be able to add new symbols to the enum and, when decoding data containing a new symbol with the old schema, to get the default symbol back.
> {code:python}
> import io
> from avro.io import DatumReader, DatumWriter, BinaryDecoder, BinaryEncoder
> import avro.schema
> current_schema = avro.schema.parse("""
> {
> "fields": [
> {
> "default": "unknown",
> "name": "checksum_algorithm",
> "type": {
> "name": "ChecksumAlgorithm",
> "symbols": [
> "unknown",
> "xxhash3_64_be"
> ],
> "type": "enum",
> "default": "unknown"
> }
> }
> ],
> "name": "Metadata",
> "type": "record"
> }
> """)
> # Future schema adds the "crc32_be" symbol.
> future_schema = avro.schema.parse("""
> {
> "fields": [
> {
> "default": "unknown",
> "name": "checksum_algorithm",
> "type": {
> "name": "ChecksumAlgorithm",
> "symbols": [
> "unknown",
> "xxhash3_64_be",
> "crc32_be"
> ],
> "type": "enum",
> "default": "unknown"
> }
> }
> ],
> "name": "Metadata",
> "type": "record"
> }
> """)
> with io.BytesIO() as buffer:
> writer = DatumWriter(future_schema)
> encoder = BinaryEncoder(buffer)
> writer.write({"checksum_algorithm": "crc32_be"}, encoder)
> buffer.seek(0)
> reader = DatumReader(current_schema)
> decoder = BinaryDecoder(buffer)
> decoded = reader.read(decoder)
> print(decoded)
> {code}
> Instead, this results in an exception:
> {code}
> Traceback (most recent call last):
> File "reproduce-avro.py", line 58, in <module>
> decoded = reader.read(decoder)
> File "/Users/anton/.pyenv/versions/karapace/lib/python3.8/site-packages/avro/io.py", line 649, in read
> return self.read_data(self.writers_schema, self.readers_schema, decoder)
> File "/Users/anton/.pyenv/versions/karapace/lib/python3.8/site-packages/avro/io.py", line 727, in read_data
> return self.read_record(writers_schema, readers_schema, decoder)
> File "/Users/anton/.pyenv/versions/karapace/lib/python3.8/site-packages/avro/io.py", line 922, in read_record
> field_val = self.read_data(field.type, readers_field.type, decoder)
> File "/Users/anton/.pyenv/versions/karapace/lib/python3.8/site-packages/avro/io.py", line 720, in read_data
> return self.read_enum(writers_schema, readers_schema, decoder)
> File "/Users/anton/.pyenv/versions/karapace/lib/python3.8/site-packages/avro/io.py", line 779, in read_enum
> raise avro.errors.SchemaResolutionException(
> avro.errors.SchemaResolutionException: Can't access enum index 2 for enum with 2 symbols
> Writer's Schema: {
> "type": "enum",
> "default": "unknown",
> "name": "ChecksumAlgorithm",
> "symbols": [
> "unknown",
> "xxhash3_64_be"
> ]
> }
> Reader's Schema: {
> "type": "enum",
> "default": "unknown",
> "name": "ChecksumAlgorithm",
> "symbols": [
> "unknown",
> "xxhash3_64_be"
> ]
> }
> {code}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)