You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@avro.apache.org by dk...@apache.org on 2018/11/29 20:21:39 UTC

[avro] branch master updated: [AVRO-2226] Fixes UnionSchema specificity

This is an automated email from the ASF dual-hosted git repository.

dkulp pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/avro.git


The following commit(s) were added to refs/heads/master by this push:
     new 3e5cf4a  [AVRO-2226] Fixes UnionSchema specificity
3e5cf4a is described below

commit 3e5cf4a94dd1f71e42a45c812f26a53c7b89e945
Author: Andrew Kelleher <an...@venmo.com>
AuthorDate: Fri Sep 14 09:31:56 2018 -0400

    [AVRO-2226] Fixes UnionSchema specificity
    
    Trouble arises in the Python library when deducing the appropriate schema from a list of schemas for a particular datum.
    
    When "null" values are allowed for fields in two separate schemas, there is no way to determine which schema should be used, given a list of schemas that is set as the type definition for a record.
    
    This PR checks that all fields defined on a given datum are _also_ defined in the schema being considered for that datum.
    
    With this bugfix, datums such as `{"foo": "a"}` will no longer "cast" to schemas such as `{"name": "bar", "type": ["long", "null"]}`, as they currently do.
---
 lang/py3/avro/io.py            | 14 +++++++++++---
 lang/py3/avro/tests/test_io.py | 24 ++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/lang/py3/avro/io.py b/lang/py3/avro/io.py
index b944904..7b56dff 100644
--- a/lang/py3/avro/io.py
+++ b/lang/py3/avro/io.py
@@ -135,9 +135,17 @@ def Validate(expected_schema, datum):
     return any(Validate(union_branch, datum)
                for union_branch in expected_schema.schemas)
   elif schema_type in ['record', 'error', 'request']:
-    return (isinstance(datum, dict)
-        and all(Validate(field.type, datum.get(field.name))
-                for field in expected_schema.fields))
+    if not isinstance(datum, dict):
+        return False
+    expected_schema_field_names = set()
+    for field in expected_schema.fields:
+        expected_schema_field_names.add(field.name)
+        if not Validate(field.type, datum.get(field.name)):
+            return False
+    for datum_field in datum.keys():
+        if datum_field not in expected_schema_field_names:
+            return False
+    return True
   else:
     raise AvroTypeException('Unknown Avro schema type: %r' % schema_type)
 
diff --git a/lang/py3/avro/tests/test_io.py b/lang/py3/avro/tests/test_io.py
index 8349ce5..3d58683 100644
--- a/lang/py3/avro/tests/test_io.py
+++ b/lang/py3/avro/tests/test_io.py
@@ -346,6 +346,30 @@ class TestIO(unittest.TestCase):
     self.assertRaises(
         avro_io.AvroTypeException, write_datum, datum_to_write, writer_schema)
 
+  def testUnionSchemaSpecificity(self):
+    union_schema = schema.Parse("""
+        [{
+         "type" : "record",
+         "name" : "A",
+         "fields" : [{"name" : "foo", "type" : ["string", "null"]}]
+        },
+        {
+         "type" : "record",
+         "name" : "B",
+         "fields" : [{"name" : "bar", "type" : ["string", "null"]}]
+        },
+        {
+         "type" : "record",
+         "name" : "AOrB",
+         "fields" : [{"name" : "entity", "type" : ["A", "B"]}]
+        }]
+    """)
+    sch = {s.name: s for s in union_schema.schemas}.get('AOrB')
+    datum_to_read = {'entity': {'foo': 'this is an instance of schema A'}}
+    writer, encoder, datum_writer = write_datum(datum_to_read, sch)
+    datum_read = read_datum(writer, sch, sch)
+    self.assertEqual(datum_to_read, datum_read)
+
 
 if __name__ == '__main__':
   raise Exception('Use run_tests.py')