You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@avro.apache.org by "Anton Agestam (Jira)" <ji...@apache.org> on 2023/05/24 14:02:00 UTC

[jira] [Updated] (AVRO-3760) Using enum with default symbol, cannot parse future value

     [ https://issues.apache.org/jira/browse/AVRO-3760?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Anton Agestam updated AVRO-3760:
--------------------------------
    Description: 
It seems like support for enum default symbols is broken. In the example below, both schemas declare a default symbol for the enum, so I expected to be able to add new symbols to the enum in a future schema and get the default symbol back when data containing a new symbol is parsed using the old schema.

{code:python}
import io
from avro.io import DatumReader, DatumWriter, BinaryDecoder, BinaryEncoder
import avro.schema

current_schema = avro.schema.parse("""
{
    "fields": [
        {
            "default": "unknown",
            "name": "checksum_algorithm",
            "type": {
                "name": "ChecksumAlgorithm",
                "symbols": [
                    "unknown",
                    "xxhash3_64_be"
                ],
                "type": "enum",
                "default": "unknown"
            }
        }
    ],
    "name": "Metadata",
    "type": "record"
}
""")

# Future schema adds the "crc32_be" symbol.
future_schema = avro.schema.parse("""
{
    "fields": [
        {
            "default": "unknown",
            "name": "checksum_algorithm",
            "type": {
                "name": "ChecksumAlgorithm",
                "symbols": [
                    "unknown",
                    "xxhash3_64_be",
                    "crc32_be"
                ],
                "type": "enum",
                "default": "unknown"
            }
        }
    ],
    "name": "Metadata",
    "type": "record"
}
""")


with io.BytesIO() as buffer:
    writer = DatumWriter(future_schema)
    encoder = BinaryEncoder(buffer)
    writer.write({"checksum_algorithm": "crc32_be"}, encoder)
    buffer.seek(0)

    reader = DatumReader(current_schema)
    decoder = BinaryDecoder(buffer)
    decoded = reader.read(decoder)

print(decoded)
{code}

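Instead, this results in an exception. Note that the error reports a two-symbol writer's schema, even though the datum was written with the three-symbol future schema: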

{code}
Traceback (most recent call last):
  File "reproduce-avro.py", line 58, in <module>
    decoded = reader.read(decoder)
  File "/Users/anton/.pyenv/versions/karapace/lib/python3.8/site-packages/avro/io.py", line 649, in read
    return self.read_data(self.writers_schema, self.readers_schema, decoder)
  File "/Users/anton/.pyenv/versions/karapace/lib/python3.8/site-packages/avro/io.py", line 727, in read_data
    return self.read_record(writers_schema, readers_schema, decoder)
  File "/Users/anton/.pyenv/versions/karapace/lib/python3.8/site-packages/avro/io.py", line 922, in read_record
    field_val = self.read_data(field.type, readers_field.type, decoder)
  File "/Users/anton/.pyenv/versions/karapace/lib/python3.8/site-packages/avro/io.py", line 720, in read_data
    return self.read_enum(writers_schema, readers_schema, decoder)
  File "/Users/anton/.pyenv/versions/karapace/lib/python3.8/site-packages/avro/io.py", line 779, in read_enum
    raise avro.errors.SchemaResolutionException(
avro.errors.SchemaResolutionException: Can't access enum index 2 for enum with 2 symbols
Writer's Schema: {
  "type": "enum",
  "default": "unknown",
  "name": "ChecksumAlgorithm",
  "symbols": [
    "unknown",
    "xxhash3_64_be"
  ]
}
Reader's Schema: {
  "type": "enum",
  "default": "unknown",
  "name": "ChecksumAlgorithm",
  "symbols": [
    "unknown",
    "xxhash3_64_be"
  ]
}
{code}

  was:
It seems like support for default symbols is broken. In the example below, since I'm using default symbols, I expected to be able to add new values to the enum and see the default value when parsing using the old schema.

{code:python}
import io
from avro.io import DatumReader, DatumWriter, BinaryDecoder, BinaryEncoder
import avro.schema

current_schema = avro.schema.parse("""
{
    "fields": [
        {
            "default": "unknown",
            "name": "checksum_algorithm",
            "type": {
                "name": "ChecksumAlgorithm",
                "symbols": [
                    "unknown",
                    "xxhash3_64_be"
                ],
                "type": "enum",
                "default": "unknown"
            }
        }
    ],
    "name": "Metadata",
    "type": "record"
}
""")

# Future schema adds the "crc32_be" symbol.
future_schema = avro.schema.parse("""
{
    "fields": [
        {
            "default": "unknown",
            "name": "checksum_algorithm",
            "type": {
                "name": "ChecksumAlgorithm",
                "symbols": [
                    "unknown",
                    "xxhash3_64_be",
                    "crc32_be"
                ],
                "type": "enum",
                "default": "unknown"
            }
        }
    ],
    "name": "Metadata",
    "type": "record"
}
""")


with io.BytesIO() as buffer:
    writer = DatumWriter(future_schema)
    encoder = BinaryEncoder(buffer)
    writer.write({"checksum_algorithm": "crc32_be"}, encoder)
    buffer.seek(0)

    reader = DatumReader(current_schema)
    decoder = BinaryDecoder(buffer)
    decoded = reader.read(decoder)

print(decoded)
{code}

Instead, this results in an exception:

{code:java}
Traceback (most recent call last):
  File "reproduce-avro.py", line 58, in <module>
    decoded = reader.read(decoder)
  File "/Users/anton/.pyenv/versions/karapace/lib/python3.8/site-packages/avro/io.py", line 649, in read
    return self.read_data(self.writers_schema, self.readers_schema, decoder)
  File "/Users/anton/.pyenv/versions/karapace/lib/python3.8/site-packages/avro/io.py", line 727, in read_data
    return self.read_record(writers_schema, readers_schema, decoder)
  File "/Users/anton/.pyenv/versions/karapace/lib/python3.8/site-packages/avro/io.py", line 922, in read_record
    field_val = self.read_data(field.type, readers_field.type, decoder)
  File "/Users/anton/.pyenv/versions/karapace/lib/python3.8/site-packages/avro/io.py", line 720, in read_data
    return self.read_enum(writers_schema, readers_schema, decoder)
  File "/Users/anton/.pyenv/versions/karapace/lib/python3.8/site-packages/avro/io.py", line 779, in read_enum
    raise avro.errors.SchemaResolutionException(
avro.errors.SchemaResolutionException: Can't access enum index 2 for enum with 2 symbols
Writer's Schema: {
  "type": "enum",
  "default": "unknown",
  "name": "ChecksumAlgorithm",
  "symbols": [
    "unknown",
    "xxhash3_64_be"
  ]
}
Reader's Schema: {
  "type": "enum",
  "default": "unknown",
  "name": "ChecksumAlgorithm",
  "symbols": [
    "unknown",
    "xxhash3_64_be"
  ]
}
{code}


> Using enum with default symbol, cannot parse future value
> ---------------------------------------------------------
>
>                 Key: AVRO-3760
>                 URL: https://issues.apache.org/jira/browse/AVRO-3760
>             Project: Apache Avro
>          Issue Type: Bug
>          Components: python
>    Affects Versions: 1.11.1
>         Environment: {code}
> $ pip freeze | grep -i avro
> avro==1.11.1
> $ python --version
> Python 3.8.16
> {code}
>            Reporter: Anton Agestam
>            Priority: Major
>
> It seems like support for enum default symbols is broken. In the example below, both schemas declare a default symbol for the enum, so I expected to be able to add new symbols to the enum in a future schema and get the default symbol back when data containing a new symbol is parsed using the old schema.
> {code:python}
> import io
> from avro.io import DatumReader, DatumWriter, BinaryDecoder, BinaryEncoder
> import avro.schema
> current_schema = avro.schema.parse("""
> {
>     "fields": [
>         {
>             "default": "unknown",
>             "name": "checksum_algorithm",
>             "type": {
>                 "name": "ChecksumAlgorithm",
>                 "symbols": [
>                     "unknown",
>                     "xxhash3_64_be"
>                 ],
>                 "type": "enum",
>                 "default": "unknown"
>             }
>         }
>     ],
>     "name": "Metadata",
>     "type": "record"
> }
> """)
> # Future schema adds the "crc32_be" symbol.
> future_schema = avro.schema.parse("""
> {
>     "fields": [
>         {
>             "default": "unknown",
>             "name": "checksum_algorithm",
>             "type": {
>                 "name": "ChecksumAlgorithm",
>                 "symbols": [
>                     "unknown",
>                     "xxhash3_64_be",
>                     "crc32_be"
>                 ],
>                 "type": "enum",
>                 "default": "unknown"
>             }
>         }
>     ],
>     "name": "Metadata",
>     "type": "record"
> }
> """)
> with io.BytesIO() as buffer:
>     writer = DatumWriter(future_schema)
>     encoder = BinaryEncoder(buffer)
>     writer.write({"checksum_algorithm": "crc32_be"}, encoder)
>     buffer.seek(0)
>     reader = DatumReader(current_schema)
>     decoder = BinaryDecoder(buffer)
>     decoded = reader.read(decoder)
> print(decoded)
> {code}
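> Instead, this results in an exception. Note that the error reports a two-symbol writer's schema, even though the datum was written with the three-symbol future schema: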
> {code}
> Traceback (most recent call last):
>   File "reproduce-avro.py", line 58, in <module>
>     decoded = reader.read(decoder)
>   File "/Users/anton/.pyenv/versions/karapace/lib/python3.8/site-packages/avro/io.py", line 649, in read
>     return self.read_data(self.writers_schema, self.readers_schema, decoder)
>   File "/Users/anton/.pyenv/versions/karapace/lib/python3.8/site-packages/avro/io.py", line 727, in read_data
>     return self.read_record(writers_schema, readers_schema, decoder)
>   File "/Users/anton/.pyenv/versions/karapace/lib/python3.8/site-packages/avro/io.py", line 922, in read_record
>     field_val = self.read_data(field.type, readers_field.type, decoder)
>   File "/Users/anton/.pyenv/versions/karapace/lib/python3.8/site-packages/avro/io.py", line 720, in read_data
>     return self.read_enum(writers_schema, readers_schema, decoder)
>   File "/Users/anton/.pyenv/versions/karapace/lib/python3.8/site-packages/avro/io.py", line 779, in read_enum
>     raise avro.errors.SchemaResolutionException(
> avro.errors.SchemaResolutionException: Can't access enum index 2 for enum with 2 symbols
> Writer's Schema: {
>   "type": "enum",
>   "default": "unknown",
>   "name": "ChecksumAlgorithm",
>   "symbols": [
>     "unknown",
>     "xxhash3_64_be"
>   ]
> }
> Reader's Schema: {
>   "type": "enum",
>   "default": "unknown",
>   "name": "ChecksumAlgorithm",
>   "symbols": [
>     "unknown",
>     "xxhash3_64_be"
>   ]
> }
> {code}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)