You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@avro.apache.org by ko...@apache.org on 2019/11/21 03:36:12 UTC

[avro] branch master updated: AVRO-621: Stricter Name Validation (#718)

This is an automated email from the ASF dual-hosted git repository.

kojiromike pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/avro.git


View the commit online:
https://github.com/apache/avro/commit/f8afef1f98b1f27deb9e73ef804ed74ba2d20bbb

The following commit(s) were added to refs/heads/master by this push:
     new f8afef1  AVRO-621: Stricter Name Validation (#718)
f8afef1 is described below

commit f8afef1f98b1f27deb9e73ef804ed74ba2d20bbb
Author: Michael A. Smith <mi...@smith-li.com>
AuthorDate: Wed Nov 20 22:36:03 2019 -0500

    AVRO-621: Stricter Name Validation (#718)
    
    * AVRO-621: Test an Invalid Name
    
    * AVRO-621: Stricter Name Checking
    
    * AVRO-621 Adhere Better to Spec
    
    * AVRO-621 More Complete Name Tests
---
 lang/py/src/avro/schema.py  | 108 ++++++++++++++++++++++----------------------
 lang/py/test/test_schema.py |  73 ++++++++++++++++--------------
 2 files changed, 91 insertions(+), 90 deletions(-)

diff --git a/lang/py/src/avro/schema.py b/lang/py/src/avro/schema.py
index 60a826d..a6116e2 100644
--- a/lang/py/src/avro/schema.py
+++ b/lang/py/src/avro/schema.py
@@ -42,6 +42,7 @@ from __future__ import absolute_import, division, print_function
 
 import json
 import math
+import re
 import sys
 import warnings
 
@@ -113,6 +114,9 @@ class AvroException(Exception):
 class SchemaParseException(AvroException):
   pass
 
+class InvalidName(SchemaParseException):
+  """User attempted to parse a schema with an invalid name."""
+
 class AvroWarning(UserWarning):
   """Base class for warnings."""
 
@@ -173,68 +177,65 @@ class Schema(object):
 class Name(object):
   """Class to describe Avro name."""
 
-  def __init__(self, name_attr, space_attr, default_space):
-    """
-    Formulate full name according to the specification.
-
-    @arg name_attr: name value read in schema or None.
-    @arg space_attr: namespace value read in schema or None.
-    @ard default_space: the current default space or None.
-    """
-    # Ensure valid ctor args
-    if not (isinstance(name_attr, basestring) or (name_attr is None)):
-      fail_msg = 'Name must be non-empty string or None.'
-      raise SchemaParseException(fail_msg)
-    elif name_attr == "":
-      fail_msg = 'Name must be non-empty string or None.'
-      raise SchemaParseException(fail_msg)
+  # The name portion of a fullname, record field names, and enum symbols must:
+  # start with [A-Za-z_]
+  # subsequently contain only [A-Za-z0-9_]
+  _base_name_pattern = re.compile(r'(?:^|\.)[A-Za-z_][A-Za-z0-9_]*$')
 
-    if not (isinstance(space_attr, basestring) or (space_attr is None)):
-      fail_msg = 'Space must be non-empty string or None.'
-      raise SchemaParseException(fail_msg)
-    elif name_attr == "":
-      fail_msg = 'Space must be non-empty string or None.'
-      raise SchemaParseException(fail_msg)
+  _full = None
 
-    if not (isinstance(default_space, basestring) or (default_space is None)):
-      fail_msg = 'Default space must be non-empty string or None.'
-      raise SchemaParseException(fail_msg)
-    elif name_attr == "":
-      fail_msg = 'Default must be non-empty string or None.'
-      raise SchemaParseException(fail_msg)
+  def __init__(self, name_attr, space_attr, default_space):
+    """The fullname is determined in one of the following ways:
 
-    self._full = None;
+    - A name and namespace are both specified. For example, one might use "name": "X", "namespace": "org.foo" to indicate the fullname org.foo.X.
+    - A fullname is specified. If the name specified contains a dot, then it is assumed to be a fullname, and any namespace also specified is ignored. For example, use "name": "org.foo.X" to indicate the fullname org.foo.X.
+    - A name only is specified, i.e., a name that contains no dots. In this case the namespace is taken from the most tightly enclosing schema or protocol. For example, if "name": "X" is specified, and this occurs within a field of the record definition of org.foo.Y, then the fullname is org.foo.X. If there is no enclosing namespace then the null namespace is used.
 
-    if name_attr is None or name_attr == "":
-        return;
+    References to previously defined names are as in the latter two cases above: if they contain a dot they are a fullname, if they do not contain a dot, the namespace is the namespace of the enclosing definition.
 
-    if (name_attr.find('.') < 0):
-      if (space_attr is not None) and (space_attr != ""):
-        self._full = "%s.%s" % (space_attr, name_attr)
-      else:
-        if (default_space is not None) and (default_space != ""):
-           self._full = "%s.%s" % (default_space, name_attr)
-        else:
-          self._full = name_attr
+    @arg name_attr: name value read in schema or None.
+    @arg space_attr: namespace value read in schema or None. The empty string may be used as a namespace to indicate the null namespace.
+    @arg default_space: the current default space or None.
+    """
+    if name_attr is None:
+      return
+    if name_attr == "":
+      raise SchemaParseException('Name must not be the empty string.')
+
+    if '.' in name_attr or space_attr == "" or not (space_attr or default_space):
+      # The empty string may be used as a namespace to indicate the null namespace.
+      self._full = name_attr
     else:
-        self._full = name_attr
+      self._full = "{!s}.{!s}".format(space_attr or default_space, name_attr)
+
+    self._validate_fullname(self._full)
+
+  def _validate_fullname(self, fullname):
+    for name in fullname.split('.'):
+      if not self._base_name_pattern.search(name):
+        raise InvalidName("{!s} is not a valid Avro name because it "
+                          "does not match the pattern {!s}".format(
+                          name, self._base_name_pattern.pattern))
 
   def __eq__(self, other):
-    if not isinstance(other, Name):
-        return False
-    return (self.fullname == other.fullname)
+    """Equality of names is defined on the fullname and is case-sensitive."""
+    return isinstance(other, Name) and self.fullname == other.fullname
 
-  fullname = property(lambda self: self._full)
+  @property
+  def fullname(self):
+    return self._full
 
-  def get_space(self):
+  @property
+  def space(self):
     """Back out a namespace from full name."""
     if self._full is None:
-        return None
+      return None
+    return self._full.rsplit(".", 1)[0] if "." in self._full else None
+
+  def get_space(self):
+    warnings.warn('Name.get_space() is deprecated in favor of Name.space')
+    return self.space
 
-    if (self._full.find('.') > 0):
-      return self._full.rsplit(".", 1)[0]
-    else:
-      return ""
 
 class Names(object):
   """Track name set and default namespace during parsing."""
@@ -313,16 +314,13 @@ class NamedSchema(Schema):
     # Store name and namespace as they were read in origin schema
     self.set_prop('name', name)
     if namespace is not None:
-      self.set_prop('namespace', new_name.get_space())
+      self.set_prop('namespace', new_name.space)
 
     # Store full name as calculated from name, namespace
     self._fullname = new_name.fullname
 
   def name_ref(self, names):
-    if self.namespace == names.default_namespace:
-      return self.name
-    else:
-      return self.fullname
+    return self.name if self.namespace == names.default_namespace else self.fullname
 
   # read-only properties
   name = property(lambda self: self.get_prop('name'))
@@ -751,7 +749,7 @@ class RecordSchema(NamedSchema):
     if schema_type == 'record':
       old_default = names.default_namespace
       names.default_namespace = Name(name, namespace,
-                                     names.default_namespace).get_space()
+                                     names.default_namespace).space
 
     # Add class members
     field_objects = RecordSchema.make_field_objects(fields, names)
diff --git a/lang/py/test/test_schema.py b/lang/py/test/test_schema.py
index ea6779d..63b0705 100644
--- a/lang/py/test/test_schema.py
+++ b/lang/py/test/test_schema.py
@@ -71,6 +71,7 @@ FIXED_EXAMPLES = [
                    "namespace": "org.apache.hadoop.avro"}),
   InvalidTestSchema({"type": "fixed", "name": "Missing size"}),
   InvalidTestSchema({"type": "fixed", "size": 314}),
+  InvalidTestSchema({"type": "fixed", "size": 314, "name": "dr. spaceman"}, comment='AVRO-621'),
 ]
 
 ENUM_EXAMPLES = [
@@ -315,61 +316,63 @@ class TestSchema(unittest.TestCase):
     # If we've made it this far, the subschema was reasonably stringified; it ccould be reparsed.
     self.assertEqual("X", t.fields[0].type.name)
 
-  # TODO(hammer): more tests
-  def test_fullname(self):
-    """Test schema full names
-
-    The fullname is determined in one of the following ways:
-     * A name and namespace are both specified.  For example,
-       one might use "name": "X", "namespace": "org.foo"
-       to indicate the fullname "org.foo.X".
-     * A fullname is specified.  If the name specified contains
-       a dot, then it is assumed to be a fullname, and any
-       namespace also specified is ignored.  For example,
-       use "name": "org.foo.X" to indicate the
-       fullname "org.foo.X".
-     * A name only is specified, i.e., a name that contains no
-       dots.  In this case the namespace is taken from the most
-       tightly encosing schema or protocol.  For example,
-       if "name": "X" is specified, and this occurs
-       within a field of the record definition
-       of "org.foo.Y", then the fullname is "org.foo.X".
-
-    References to previously defined names are as in the latter
-    two cases above: if they contain a dot they are a fullname, if
-    they do not contain a dot, the namespace is the namespace of
-    the enclosing definition.
-
-    Primitive type names have no namespace and their names may
-    not be defined in any namespace.  A schema may only contain
-    multiple definitions of a fullname if the definitions are
-    equivalent.
-    """
+  def test_name_is_none(self):
+    """When a name is None its namespace is None."""
+    self.assertIsNone(schema.Name(None, None, None).fullname)
+    self.assertIsNone(schema.Name(None, None, None).space)
+
+  def test_name_not_empty_string(self):
+    """A name cannot be the empty string."""
+    self.assertRaises(schema.SchemaParseException, schema.Name, "", None, None)
 
+  def test_name_space_specified(self):
+    """Space combines with a name to become the fullname."""
     # name and namespace specified
     fullname = schema.Name('a', 'o.a.h', None).fullname
     self.assertEqual(fullname, 'o.a.h.a')
 
-    # fullname and namespace specified
+  def test_fullname_space_specified(self):
+    """When name contains dots, namespace should be ignored."""
     fullname = schema.Name('a.b.c.d', 'o.a.h', None).fullname
     self.assertEqual(fullname, 'a.b.c.d')
 
-    # name and default namespace specified
+  def test_name_default_specified(self):
+    """Default space becomes the namespace when the namespace is None."""
     fullname = schema.Name('a', None, 'b.c.d').fullname
     self.assertEqual(fullname, 'b.c.d.a')
 
-    # fullname and default namespace specified
+  def test_fullname_default_specified(self):
+    """When a name contains dots, default space should be ignored."""
     fullname = schema.Name('a.b.c.d', None, 'o.a.h').fullname
     self.assertEqual(fullname, 'a.b.c.d')
 
-    # fullname, namespace, default namespace specified
+  def test_fullname_space_default_specified(self):
+    """When a name contains dots, namespace and default space should be ignored."""
     fullname = schema.Name('a.b.c.d', 'o.a.a', 'o.a.h').fullname
     self.assertEqual(fullname, 'a.b.c.d')
 
-    # name, namespace, default namespace specified
+  def test_name_space_default_specified(self):
+    """When name and space are specified, default space should be ignored."""
     fullname = schema.Name('a', 'o.a.a', 'o.a.h').fullname
     self.assertEqual(fullname, 'o.a.a.a')
 
+  def test_equal_names(self):
+    """Equality of names is defined on the fullname and is case-sensitive."""
+    self.assertEqual(schema.Name('a.b.c.d', None, None), schema.Name('d', 'a.b.c', None))
+    self.assertNotEqual(schema.Name('C.d', None, None), schema.Name('c.d', None, None))
+
+  def test_invalid_name(self):
+    """The name portion of a fullname, record field names, and enum symbols must:
+       start with [A-Za-z_] and subsequently contain only [A-Za-z0-9_]"""
+    self.assertRaises(schema.InvalidName, schema.Name, 'an especially spacey cowboy', None, None)
+    self.assertRaises(schema.InvalidName, schema.Name, '99 problems but a name aint one', None, None)
+
+  def test_null_namespace(self):
+    """The empty string may be used as a namespace to indicate the null namespace."""
+    name = schema.Name('name', "", None)
+    self.assertEqual(name.fullname, "name")
+    self.assertIsNone(name.space)
+
   def test_exception_is_not_swallowed_on_parse_error(self):
     """A specific exception message should appear on a json parse error."""
     try: