You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@avro.apache.org by bu...@apache.org on 2015/09/14 02:07:08 UTC

svn commit: r1702840 - in /avro/branches/branch-1.8/lang/ruby: lib/avro.rb lib/avro/schema.rb lib/avro/schema_normalization.rb test/case_finder.rb test/test_fingerprints.rb test/test_schema_normalization.rb

Author: busbey
Date: Mon Sep 14 00:07:07 2015
New Revision: 1702840

URL: http://svn.apache.org/r1702840
Log:
AVRO-1694. Ruby: Schema normalization and fingerprints. Contributed by Daniel Schierbeck.

* Avro::SchemaNormalization.to_parsing_form converts a schema to Parsing
Canonical Form
* support for MD5 and SHA256 fingerprints

This closes #40


Added:
    avro/branches/branch-1.8/lang/ruby/lib/avro/schema_normalization.rb
    avro/branches/branch-1.8/lang/ruby/test/case_finder.rb
    avro/branches/branch-1.8/lang/ruby/test/test_fingerprints.rb
    avro/branches/branch-1.8/lang/ruby/test/test_schema_normalization.rb
Modified:
    avro/branches/branch-1.8/lang/ruby/lib/avro.rb
    avro/branches/branch-1.8/lang/ruby/lib/avro/schema.rb

Modified: avro/branches/branch-1.8/lang/ruby/lib/avro.rb
URL: http://svn.apache.org/viewvc/avro/branches/branch-1.8/lang/ruby/lib/avro.rb?rev=1702840&r1=1702839&r2=1702840&view=diff
==============================================================================
--- avro/branches/branch-1.8/lang/ruby/lib/avro.rb (original)
+++ avro/branches/branch-1.8/lang/ruby/lib/avro.rb Mon Sep 14 00:07:07 2015
@@ -39,3 +39,4 @@ require 'avro/io'
 require 'avro/data_file'
 require 'avro/protocol'
 require 'avro/ipc'
+require 'avro/schema_normalization'

Modified: avro/branches/branch-1.8/lang/ruby/lib/avro/schema.rb
URL: http://svn.apache.org/viewvc/avro/branches/branch-1.8/lang/ruby/lib/avro/schema.rb?rev=1702840&r1=1702839&r2=1702840&view=diff
==============================================================================
--- avro/branches/branch-1.8/lang/ruby/lib/avro/schema.rb (original)
+++ avro/branches/branch-1.8/lang/ruby/lib/avro/schema.rb Mon Sep 14 00:07:07 2015
@@ -137,6 +137,18 @@ module Avro
     # Deprecated in favor of {#type_sym}.
     def type; @type_sym.to_s; end
 
+    # Returns the MD5 fingerprint of the schema as an Integer.
+    def md5_fingerprint
+      parsing_form = SchemaNormalization.to_parsing_form(self)
+      Digest::MD5.hexdigest(parsing_form).to_i(16)
+    end
+
+    # Returns the SHA-256 fingerprint of the schema as an Integer.
+    def sha256_fingerprint
+      parsing_form = SchemaNormalization.to_parsing_form(self)
+      Digest::SHA256.hexdigest(parsing_form).to_i(16)
+    end
+
     def ==(other, seen=nil)
       other.is_a?(Schema) && type_sym == other.type_sym
     end

Added: avro/branches/branch-1.8/lang/ruby/lib/avro/schema_normalization.rb
URL: http://svn.apache.org/viewvc/avro/branches/branch-1.8/lang/ruby/lib/avro/schema_normalization.rb?rev=1702840&view=auto
==============================================================================
--- avro/branches/branch-1.8/lang/ruby/lib/avro/schema_normalization.rb (added)
+++ avro/branches/branch-1.8/lang/ruby/lib/avro/schema_normalization.rb Mon Sep 14 00:07:07 2015
@@ -0,0 +1,83 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+module Avro
+  class SchemaNormalization
+    def self.to_parsing_form(schema)
+      new.to_parsing_form(schema)
+    end
+
+    def initialize
+      @processed_names = []
+    end
+
+    def to_parsing_form(schema)
+      JSON.dump(normalize_schema(schema))
+    end
+
+    private
+
+    def normalize_schema(schema)
+      type = schema.type_sym.to_s
+
+      if Schema::NAMED_TYPES.include?(type)
+        if @processed_names.include?(schema.name)
+          return schema.name
+        else
+          @processed_names << schema.name
+        end
+      end
+
+      case type
+      when *Schema::PRIMITIVE_TYPES
+        type
+      when "record"
+        fields = schema.fields.map {|field| normalize_field(field) }
+
+        normalize_named_type(schema, fields: fields)
+      when "enum"
+        normalize_named_type(schema, symbols: schema.symbols)
+      when "fixed"
+        normalize_named_type(schema, size: schema.size)
+      when "array"
+        { type: type, items: normalize_schema(schema.items) }
+      when "map"
+        { type: type, values: normalize_schema(schema.values) }
+      when "union"
+        if schema.schemas.nil?
+          []
+        else
+          schema.schemas.map {|s| normalize_schema(s) }
+        end
+      else
+        raise "unknown type #{type}"
+      end
+    end
+
+    def normalize_field(field)
+      {
+        name: field.name,
+        type: normalize_schema(field.type)
+      }
+    end
+
+    def normalize_named_type(schema, attributes = {})
+      name = Name.make_fullname(schema.name, schema.namespace)
+
+      { name: name, type: schema.type_sym.to_s }.merge(attributes)
+    end
+  end
+end

Added: avro/branches/branch-1.8/lang/ruby/test/case_finder.rb
URL: http://svn.apache.org/viewvc/avro/branches/branch-1.8/lang/ruby/test/case_finder.rb?rev=1702840&view=auto
==============================================================================
--- avro/branches/branch-1.8/lang/ruby/test/case_finder.rb (added)
+++ avro/branches/branch-1.8/lang/ruby/test/case_finder.rb Mon Sep 14 00:07:07 2015
@@ -0,0 +1,67 @@
+class CaseFinder
+  PATH = File.expand_path("../../../../share/test/data/schema-tests.txt", __FILE__)
+
+  Case = Struct.new(:id, :input, :canonical, :fingerprint)
+
+  def self.cases
+    new.cases
+  end
+
+  def initialize
+    @scanner = StringScanner.new(File.read(PATH))
+    @cases = []
+  end
+
+  def cases
+    until @scanner.eos?
+      test_case = scan_case
+      @cases << test_case if test_case
+    end
+
+    @cases
+  end
+
+  private
+
+  def scan_case
+    if id = @scanner.scan(/\/\/ \d+\n/)
+      while @scanner.skip(/\/\/ .*\n/); end
+
+      input = scan_input
+      canonical = scan_canonical
+      fingerprint = scan_fingerprint
+
+      Case.new(id, input, canonical, fingerprint)
+    else
+      @scanner.skip(/.*\n/)
+      nil
+    end
+  end
+
+  def scan_item(name)
+    if @scanner.scan(/<<#{name}\n/)
+      lines = []
+      while line = @scanner.scan(/.+\n/)
+        break if line.chomp == name
+        lines << line
+      end
+      lines.join
+    elsif @scanner.scan(/<<#{name} /)
+      input = @scanner.scan(/.+$/)
+      @scanner.skip(/\n/)
+      input
+    end
+  end
+
+  def scan_input
+    scan_item("INPUT")
+  end
+
+  def scan_canonical
+    scan_item("canonical")
+  end
+
+  def scan_fingerprint
+    scan_item("fingerprint")
+  end
+end

Added: avro/branches/branch-1.8/lang/ruby/test/test_fingerprints.rb
URL: http://svn.apache.org/viewvc/avro/branches/branch-1.8/lang/ruby/test/test_fingerprints.rb?rev=1702840&view=auto
==============================================================================
--- avro/branches/branch-1.8/lang/ruby/test/test_fingerprints.rb (added)
+++ avro/branches/branch-1.8/lang/ruby/test/test_fingerprints.rb Mon Sep 14 00:07:07 2015
@@ -0,0 +1,37 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+require 'test_help'
+
+class TestFingerprints < Test::Unit::TestCase
+  def test_md5_fingerprint
+    schema = Avro::Schema.parse <<-SCHEMA
+      { "type": "int" }
+    SCHEMA
+
+    assert_equal 318112854175969537208795771590915775282,
+      schema.md5_fingerprint
+  end
+
+  def test_sha256_fingerprint
+    schema = Avro::Schema.parse <<-SCHEMA
+      { "type": "int" }
+    SCHEMA
+
+    assert_equal 28572620203319713300323544804233350633246234624932075150020181448463213378117,
+      schema.sha256_fingerprint
+  end
+end

Added: avro/branches/branch-1.8/lang/ruby/test/test_schema_normalization.rb
URL: http://svn.apache.org/viewvc/avro/branches/branch-1.8/lang/ruby/test/test_schema_normalization.rb?rev=1702840&view=auto
==============================================================================
--- avro/branches/branch-1.8/lang/ruby/test/test_schema_normalization.rb (added)
+++ avro/branches/branch-1.8/lang/ruby/test/test_schema_normalization.rb Mon Sep 14 00:07:07 2015
@@ -0,0 +1,171 @@
+# -*- coding: utf-8 -*-
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+require 'test_help'
+require 'case_finder'
+
+class TestSchemaNormalization < Test::Unit::TestCase
+  def test_primitives
+    %w[null boolean string bytes int long float double].each do |type|
+      schema = Avro::Schema.parse(<<-JSON)
+        { "type": "#{type}" }
+      JSON
+
+      canonical_form = Avro::SchemaNormalization.to_parsing_form(schema)
+
+      assert_equal %("#{type}"), canonical_form
+    end
+  end
+
+  def test_records
+    schema = Avro::Schema.parse(<<-JSON)
+      {
+        "type": "record",
+        "name": "test",
+        "namespace": "random",
+        "doc": "some record",
+        "fields": [
+          { "name": "height", "type": "int", "doc": "the height" }
+        ]
+      }
+    JSON
+
+    expected_type = <<-JSON.strip
+      {"name":"random.test","type":"record","fields":[{"name":"height","type":"int"}]}
+    JSON
+
+    canonical_form = Avro::SchemaNormalization.to_parsing_form(schema)
+
+    assert_equal expected_type, canonical_form
+  end
+
+  def test_recursive_records
+    schema = Avro::Schema.parse(<<-JSON)
+      {
+        "type": "record",
+        "name": "item",
+        "fields": [
+          { "name": "next", "type": "item" }
+        ]
+      }
+    JSON
+
+    expected_type = <<-JSON.strip
+      {"name":"item","type":"record","fields":[{"name":"next","type":"item"}]}
+    JSON
+
+    canonical_form = Avro::SchemaNormalization.to_parsing_form(schema)
+
+    assert_equal expected_type, canonical_form
+  end
+
+  def test_enums
+    schema = Avro::Schema.parse(<<-JSON)
+      {
+        "type": "enum",
+        "name": "suit",
+        "namespace": "cards",
+        "doc": "the different suits of cards",
+        "symbols": ["club", "hearts", "diamond", "spades"]
+      }
+    JSON
+
+    expected_type = <<-JSON.strip
+      {"name":"cards.suit","type":"enum","symbols":["club","hearts","diamond","spades"]}
+    JSON
+
+    canonical_form = Avro::SchemaNormalization.to_parsing_form(schema)
+
+    assert_equal expected_type, canonical_form
+  end
+
+  def test_fixed
+    schema = Avro::Schema.parse(<<-JSON)
+      {
+        "type": "fixed",
+        "name": "id",
+        "namespace": "db",
+        "size": 64
+      }
+    JSON
+
+    expected_type = <<-JSON.strip
+      {"name":"db.id","type":"fixed","size":64}
+    JSON
+
+    canonical_form = Avro::SchemaNormalization.to_parsing_form(schema)
+
+    assert_equal expected_type, canonical_form
+  end
+
+  def test_arrays
+    schema = Avro::Schema.parse(<<-JSON)
+      {
+        "type": "array",
+        "doc": "the items",
+        "items": "int"
+      }
+    JSON
+
+    expected_type = <<-JSON.strip
+      {"type":"array","items":"int"}
+    JSON
+
+    canonical_form = Avro::SchemaNormalization.to_parsing_form(schema)
+
+    assert_equal expected_type, canonical_form
+  end
+
+  def test_maps
+    schema = Avro::Schema.parse(<<-JSON)
+      {
+        "type": "map",
+        "doc": "the items",
+        "values": "int"
+      }
+    JSON
+
+    expected_type = <<-JSON.strip
+      {"type":"map","values":"int"}
+    JSON
+
+    canonical_form = Avro::SchemaNormalization.to_parsing_form(schema)
+
+    assert_equal expected_type, canonical_form
+  end
+
+  def test_unions
+    schema = Avro::Schema.parse(<<-JSON)
+      ["int", "string"]
+    JSON
+
+    expected_type = <<-JSON.strip
+      ["int","string"]
+    JSON
+
+    canonical_form = Avro::SchemaNormalization.to_parsing_form(schema)
+
+    assert_equal expected_type, canonical_form
+  end
+
+  def test_shared_dataset
+    CaseFinder.cases.each do |test_case|
+      schema = Avro::Schema.parse(test_case.input)
+      assert_equal test_case.canonical, Avro::SchemaNormalization.to_parsing_form(schema)
+    end
+  end
+end