You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@avro.apache.org by GitBox <gi...@apache.org> on 2020/07/24 21:35:53 UTC

[GitHub] [avro] kojiromike commented on a change in pull request #936: Speculative: Traversal validation

kojiromike commented on a change in pull request #936:
URL: https://github.com/apache/avro/pull/936#discussion_r460289855



##########
File path: lang/py/avro/io.py
##########
@@ -112,84 +164,218 @@ def __init__(self, fail_msg, writers_schema=None, readers_schema=None):
             fail_msg += "\nReader's Schema: %s" % pretty_readers
         schema.AvroException.__init__(self, fail_msg)
 
+
 #
 # Validate
 #
 
-
 def _is_timezone_aware_datetime(dt):
     return dt.tzinfo is not None and dt.tzinfo.utcoffset(dt) is not None
 
 
-_valid = {
-    'null': lambda s, d: d is None,
-    'boolean': lambda s, d: isinstance(d, bool),
-    'string': lambda s, d: isinstance(d, unicode),
-    'bytes': lambda s, d: ((isinstance(d, bytes)) or
-                           (isinstance(d, Decimal) and
-                            getattr(s, 'logical_type', None) == constants.DECIMAL)),
-    'int': lambda s, d: ((isinstance(d, (int, long))) and (INT_MIN_VALUE <= d <= INT_MAX_VALUE) or
-                         (isinstance(d, datetime.date) and
-                          getattr(s, 'logical_type', None) == constants.DATE) or
-                         (isinstance(d, datetime.time) and
-                          getattr(s, 'logical_type', None) == constants.TIME_MILLIS)),
-    'long': lambda s, d: ((isinstance(d, (int, long))) and (LONG_MIN_VALUE <= d <= LONG_MAX_VALUE) or
-                          (isinstance(d, datetime.time) and
-                           getattr(s, 'logical_type', None) == constants.TIME_MICROS) or
-                          (isinstance(d, datetime.date) and
-                           _is_timezone_aware_datetime(d) and
-                           getattr(s, 'logical_type', None) in (constants.TIMESTAMP_MILLIS,
-                                                                constants.TIMESTAMP_MICROS))),
-    'float': lambda s, d: isinstance(d, (int, long, float)),
-    'fixed': lambda s, d: ((isinstance(d, bytes) and len(d) == s.size) or
-                           (isinstance(d, Decimal) and
-                            getattr(s, 'logical_type', None) == constants.DECIMAL)),
-    'enum': lambda s, d: d in s.symbols,
-
-    'array': lambda s, d: isinstance(d, list) and all(validate(s.items, item) for item in d),
-    'map': lambda s, d: (isinstance(d, dict) and all(isinstance(key, unicode) for key in d) and
-                         all(validate(s.values, value) for value in d.values())),
-    'union': lambda s, d: any(validate(branch, d) for branch in s.schemas),
-    'record': lambda s, d: (isinstance(d, dict) and
-                            all(validate(f.type, d.get(f.name)) for f in s.fields) and
-                            {f.name for f in s.fields}.issuperset(d.keys())),
+def validate(expected_schema, datum, raise_on_error=False):
+    """Return True if the provided datum is valid for the excpted schema

Review comment:
       Typo: "excpted" should be "expected".

##########
File path: lang/py/avro/io.py
##########
@@ -112,84 +164,218 @@ def __init__(self, fail_msg, writers_schema=None, readers_schema=None):
             fail_msg += "\nReader's Schema: %s" % pretty_readers
         schema.AvroException.__init__(self, fail_msg)
 
+
 #
 # Validate
 #
 
-
 def _is_timezone_aware_datetime(dt):
     return dt.tzinfo is not None and dt.tzinfo.utcoffset(dt) is not None
 
 
-_valid = {
-    'null': lambda s, d: d is None,
-    'boolean': lambda s, d: isinstance(d, bool),
-    'string': lambda s, d: isinstance(d, unicode),
-    'bytes': lambda s, d: ((isinstance(d, bytes)) or
-                           (isinstance(d, Decimal) and
-                            getattr(s, 'logical_type', None) == constants.DECIMAL)),
-    'int': lambda s, d: ((isinstance(d, (int, long))) and (INT_MIN_VALUE <= d <= INT_MAX_VALUE) or
-                         (isinstance(d, datetime.date) and
-                          getattr(s, 'logical_type', None) == constants.DATE) or
-                         (isinstance(d, datetime.time) and
-                          getattr(s, 'logical_type', None) == constants.TIME_MILLIS)),
-    'long': lambda s, d: ((isinstance(d, (int, long))) and (LONG_MIN_VALUE <= d <= LONG_MAX_VALUE) or
-                          (isinstance(d, datetime.time) and
-                           getattr(s, 'logical_type', None) == constants.TIME_MICROS) or
-                          (isinstance(d, datetime.date) and
-                           _is_timezone_aware_datetime(d) and
-                           getattr(s, 'logical_type', None) in (constants.TIMESTAMP_MILLIS,
-                                                                constants.TIMESTAMP_MICROS))),
-    'float': lambda s, d: isinstance(d, (int, long, float)),
-    'fixed': lambda s, d: ((isinstance(d, bytes) and len(d) == s.size) or
-                           (isinstance(d, Decimal) and
-                            getattr(s, 'logical_type', None) == constants.DECIMAL)),
-    'enum': lambda s, d: d in s.symbols,
-
-    'array': lambda s, d: isinstance(d, list) and all(validate(s.items, item) for item in d),
-    'map': lambda s, d: (isinstance(d, dict) and all(isinstance(key, unicode) for key in d) and
-                         all(validate(s.values, value) for value in d.values())),
-    'union': lambda s, d: any(validate(branch, d) for branch in s.schemas),
-    'record': lambda s, d: (isinstance(d, dict) and
-                            all(validate(f.type, d.get(f.name)) for f in s.fields) and
-                            {f.name for f in s.fields}.issuperset(d.keys())),
+def validate(expected_schema, datum, raise_on_error=False):
+    """Return True if the provided datum is valid for the excpted schema
+
+    If raise_on_error is passed and True, then raise a validation error
+    with specific information about the error encountered in validation.
+
+    :param expected_schema: An avro schema type object representing the schema against
+                            which the datum will be validated.
+    :param datum: The datum to be validated, A python dictionary or some supported type
+    :param raise_on_error: True if a AvroTypeException should be raised immediately when a
+                           validation problem is encountered.
+    :raises: AvroTyeException if datum is invalid and raise_on_error is True
+    :returns: True if datum is valid for expected_schema, False if not.
+    """
+    # use a FIFO queue to process schema nodes breadth first.
+    nodes = deque()
+    nodes.append(ValidationNode(expected_schema, datum, getattr(expected_schema, "name", None)))
+
+    while nodes:
+        current_node = nodes.popleft()
+
+        # _validate_node returns the node for iteration if it is valid. Or it returns None
+        valid_node = _validate_node(current_node)
+
+        if valid_node is not None:
+            # if there are children of this node to append, do so.
+            for child_node in _iterate_node(valid_node):
+                nodes.append(child_node)
+        else:
+            # the current node was not valid.
+            if raise_on_error:
+                raise AvroTypeException(current_node.schema, current_node.datum)
+            else:
+                # preserve the prior validation behavior of returning false when there are problems.
+                return False
+
+    return True
+
+
+def _validate_node(node):
+    """Return the result of applying the appropriate validator function to the provided node"""
+    # breakpoint()
+    key = (node.schema.type, getattr(node.schema, 'logical_type', None))
+    return _VALIDATORS.get(key, _default_validator)(node)
+
+
+def _iterate_node(node):
+    for item in _ITERATORS.get(node.schema.type, _default_iterator)(node):
+        yield ValidationNode(*item)
+
+
+#############
+# Iteration #
+#############
+
+def _default_iterator(_):
+    """Immediately raise StopIteration.
+
+    This exists to prevent problems with iteration over unsupported container types.
+    """
+    for blank in []:

Review comment:
       Why not `raise StopIteration` explicitly?

##########
File path: lang/py/avro/io.py
##########
@@ -112,84 +164,218 @@ def __init__(self, fail_msg, writers_schema=None, readers_schema=None):
             fail_msg += "\nReader's Schema: %s" % pretty_readers
         schema.AvroException.__init__(self, fail_msg)
 
+
 #
 # Validate
 #
 
-
 def _is_timezone_aware_datetime(dt):
     return dt.tzinfo is not None and dt.tzinfo.utcoffset(dt) is not None
 
 
-_valid = {
-    'null': lambda s, d: d is None,
-    'boolean': lambda s, d: isinstance(d, bool),
-    'string': lambda s, d: isinstance(d, unicode),
-    'bytes': lambda s, d: ((isinstance(d, bytes)) or
-                           (isinstance(d, Decimal) and
-                            getattr(s, 'logical_type', None) == constants.DECIMAL)),
-    'int': lambda s, d: ((isinstance(d, (int, long))) and (INT_MIN_VALUE <= d <= INT_MAX_VALUE) or
-                         (isinstance(d, datetime.date) and
-                          getattr(s, 'logical_type', None) == constants.DATE) or
-                         (isinstance(d, datetime.time) and
-                          getattr(s, 'logical_type', None) == constants.TIME_MILLIS)),
-    'long': lambda s, d: ((isinstance(d, (int, long))) and (LONG_MIN_VALUE <= d <= LONG_MAX_VALUE) or
-                          (isinstance(d, datetime.time) and
-                           getattr(s, 'logical_type', None) == constants.TIME_MICROS) or
-                          (isinstance(d, datetime.date) and
-                           _is_timezone_aware_datetime(d) and
-                           getattr(s, 'logical_type', None) in (constants.TIMESTAMP_MILLIS,
-                                                                constants.TIMESTAMP_MICROS))),
-    'float': lambda s, d: isinstance(d, (int, long, float)),
-    'fixed': lambda s, d: ((isinstance(d, bytes) and len(d) == s.size) or
-                           (isinstance(d, Decimal) and
-                            getattr(s, 'logical_type', None) == constants.DECIMAL)),
-    'enum': lambda s, d: d in s.symbols,
-
-    'array': lambda s, d: isinstance(d, list) and all(validate(s.items, item) for item in d),
-    'map': lambda s, d: (isinstance(d, dict) and all(isinstance(key, unicode) for key in d) and
-                         all(validate(s.values, value) for value in d.values())),
-    'union': lambda s, d: any(validate(branch, d) for branch in s.schemas),
-    'record': lambda s, d: (isinstance(d, dict) and
-                            all(validate(f.type, d.get(f.name)) for f in s.fields) and
-                            {f.name for f in s.fields}.issuperset(d.keys())),
+def validate(expected_schema, datum, raise_on_error=False):
+    """Return True if the provided datum is valid for the excpted schema
+
+    If raise_on_error is passed and True, then raise a validation error
+    with specific information about the error encountered in validation.
+
+    :param expected_schema: An avro schema type object representing the schema against
+                            which the datum will be validated.
+    :param datum: The datum to be validated, A python dictionary or some supported type
+    :param raise_on_error: True if a AvroTypeException should be raised immediately when a
+                           validation problem is encountered.
+    :raises: AvroTyeException if datum is invalid and raise_on_error is True

Review comment:
       Typo: "AvroTyeException" should be "AvroTypeException".




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org