You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@gobblin.apache.org by hu...@apache.org on 2019/06/17 16:57:22 UTC

[incubator-gobblin] branch master updated: [Gobblin-799][GOBBLIN-799] Fix bug in AvroSchemaCheckDefaultStrategy

This is an automated email from the ASF dual-hosted git repository.

hutran pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-gobblin.git


The following commit(s) were added to refs/heads/master by this push:
     new 73cbd9f  [Gobblin-799][GOBBLIN-799] Fix bug in AvroSchemaCheckDefaultStrategy
73cbd9f is described below

commit 73cbd9f3e66713dc12b4209d802b09b2ecd07fe3
Author: Zihan Li <zi...@zihli-mn1.linkedin.biz>
AuthorDate: Mon Jun 17 09:57:16 2019 -0700

    [Gobblin-799][GOBBLIN-799] Fix bug in AvroSchemaCheckDefaultStrategy
    
    Closes #2666 from ZihanLi58/schemaCheckBug
---
 .../AvroSchemaCheckDefaultStrategy.java            |   2 +
 .../util/AvroSchemaCheckDefaultStrategyTest.java   |  51 +++++
 .../expectedSchema.avsc                            | 239 +++++++++++++++++++++
 .../toValidateSchema.avsc                          | 199 +++++++++++++++++
 4 files changed, 491 insertions(+)

diff --git a/gobblin-data-management/src/main/java/org/apache/gobblin/util/schema_check/AvroSchemaCheckDefaultStrategy.java b/gobblin-data-management/src/main/java/org/apache/gobblin/util/schema_check/AvroSchemaCheckDefaultStrategy.java
index 1c7696d..b094183 100644
--- a/gobblin-data-management/src/main/java/org/apache/gobblin/util/schema_check/AvroSchemaCheckDefaultStrategy.java
+++ b/gobblin-data-management/src/main/java/org/apache/gobblin/util/schema_check/AvroSchemaCheckDefaultStrategy.java
@@ -59,6 +59,7 @@ public class AvroSchemaCheckDefaultStrategy implements AvroSchemaCheckStrategy {
           if (toValidate.getFixedSize() != expected.getFixedSize()) {
             return false;
           }
+          return true;
         }
         case ENUM: {
           // expected symbols must contain all toValidate symbols:
@@ -70,6 +71,7 @@ public class AvroSchemaCheckDefaultStrategy implements AvroSchemaCheckStrategy {
           if (!expectedSymbols.containsAll(toValidateSymbols)) {
             return false;
           }
+          return true;
         }
 
         case RECORD: {
diff --git a/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/util/AvroSchemaCheckDefaultStrategyTest.java b/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/util/AvroSchemaCheckDefaultStrategyTest.java
new file mode 100644
index 0000000..c57e188
--- /dev/null
+++ b/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/util/AvroSchemaCheckDefaultStrategyTest.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.gobblin.data.management.util;
+
+import java.io.File;
+import org.apache.avro.Schema;
+import org.apache.gobblin.util.schema_check.AvroSchemaCheckDefaultStrategy;
+import org.junit.Assert;
+import org.testng.annotations.Test;
+
+
+/** Exercises {@link AvroSchemaCheckDefaultStrategy#compare} with matching, mismatching and nested Avro schemas. */
+public class AvroSchemaCheckDefaultStrategyTest {
+  @Test
+  public void testSchemCheckStrategy() throws Exception {
+    // Compatible: the expected schema only adds a "doc" attribute to field "foo".
+    Schema toValidate = new Schema.Parser().parse("{\"type\":\"record\",\"name\":\"baseRecord\",\"fields\":[{\"name\":\"foo\",\"type\":[\"null\",\"long\"],\"default\":null}]}");
+    Schema expected = new Schema.Parser().parse("{\"type\":\"record\",\"name\":\"baseRecord\",\"fields\":[{\"name\":\"foo\",\"type\":[\"null\",\"long\"],\"doc\":\"this is for test\",\"default\":null}]}");
+    AvroSchemaCheckDefaultStrategy strategy = new AvroSchemaCheckDefaultStrategy();
+    Assert.assertTrue(strategy.compare(expected, toValidate));
+    // Incompatible: the expected field is named "foo1" instead of "foo".
+    expected = new Schema.Parser().parse("{\"type\":\"record\",\"name\":\"baseRecord\",\"fields\":[{\"name\":\"foo1\",\"type\":[\"null\",\"long\"],\"doc\":\"this is for test\",\"default\":null}]}");
+    Assert.assertFalse(strategy.compare(expected, toValidate));
+    // Incompatible: the non-null union branch changes from "long" to "int", and then to "float".
+    expected = new Schema.Parser().parse("{\"type\":\"record\",\"name\":\"baseRecord\",\"fields\":[{\"name\":\"foo\",\"type\":[\"null\",\"int\"],\"doc\":\"this is for test\",\"default\":null}]}");
+    Assert.assertFalse(strategy.compare(expected, toValidate));
+    expected = new Schema.Parser().parse("{\"type\":\"record\",\"name\":\"baseRecord\",\"fields\":[{\"name\":\"foo\",\"type\":[\"null\",\"float\"],\"doc\":\"this is for test\",\"default\":null}]}");
+    Assert.assertFalse(strategy.compare(expected, toValidate));
+
+    // Complex nested schema: expectedSchema.avsc carries "doc" attributes that toValidateSchema.avsc omits.
+    // Resources are resolved via URL.toURI() so percent-encoded characters (e.g. spaces) in the path are decoded.
+    toValidate = new Schema.Parser().parse(new File(AvroSchemaCheckDefaultStrategy.class.getClassLoader().getResource("avroSchemaCheckStrategyTest/toValidateSchema.avsc").toURI()));
+    expected = new Schema.Parser().parse(new File(AvroSchemaCheckDefaultStrategy.class.getClassLoader().getResource("avroSchemaCheckStrategyTest/expectedSchema.avsc").toURI()));
+    Assert.assertTrue(strategy.compare(expected, toValidate));
+  }
+}
diff --git a/gobblin-data-management/src/test/resources/avroSchemaCheckStrategyTest/expectedSchema.avsc b/gobblin-data-management/src/test/resources/avroSchemaCheckStrategyTest/expectedSchema.avsc
new file mode 100644
index 0000000..11d6102
--- /dev/null
+++ b/gobblin-data-management/src/test/resources/avroSchemaCheckStrategyTest/expectedSchema.avsc
@@ -0,0 +1,239 @@
+{
+  "type" : "record",
+  "name" : "OfflineComplianceRequestEvent",
+  "namespace" : "com.linkedin.events.compliance",
+  "fields" : [ {
+    "name" : "header",
+    "type" : {
+      "type" : "record",
+      "name" : "EventHeader",
+      "namespace" : "com.linkedin.events",
+      "fields" : [ {
+        "name" : "memberId",
+        "type" : "int",
+        "doc" : "The LinkedIn member ID of the user initiating the action.  LinkedIn member IDs are integers greater than zero.  Guests are represented either as zero or a negative number."
+      }, {
+        "name" : "viewerUrn",
+        "type" : [ "null", "string" ],
+        "doc" : "The LinkedIn URN of the user initiating the action.  For other applications like Slideshare, this should be filled in when the LinkedIn member URN is actually known.  The LinkedIn member URN would be known, for example, when the user has linked their Slideshare account with their LinkedIn account.",
+        "default" : null
+      }, {
+        "name" : "applicationViewerUrn",
+        "type" : [ "null", "string" ],
+        "doc" : "The Application URN of the user initiating the action.  This URN identifies the member within the particular application that the member is using, which may or may not be LinkedIn.  If the user is interacting with LinkedIn then this should be the LinkedIn URN, the same as viewerUrn.  If the member is interacting with a different site, such as Slideshare, then this should be the URN identifying the member in that site.",
+        "default" : null
+      }, {
+        "name" : "csUserUrn",
+        "type" : [ "null", "string" ],
+        "doc" : "The URN of the CS user initiating the action. A CS user is essentially a LinkedIn member with elevated permissions and can perform Admin actions on a page. A non-null value would indicate CS activity on the website. This field is different from the impersonatorId. ImpersonatorId will be populated when a CS user is logged in as (or impersonating) another member. On the other hand, this field will be populated when a CS user logged in as himself has elevated permissions to [...]",
+        "default" : null
+      }, {
+        "name" : "time",
+        "type" : "long",
+        "doc" : "The time of the event"
+      }, {
+        "name" : "server",
+        "type" : "string",
+        "doc" : "The name of the server"
+      }, {
+        "name" : "service",
+        "type" : "string",
+        "doc" : "The name of the service. Synonymous to the com.linkedin.events.monitoring.EventHeader#container field."
+      }, {
+        "name" : "environment",
+        "type" : [ "string", "null" ],
+        "doc" : "The environment the service is running in",
+        "default" : ""
+      }, {
+        "name" : "guid",
+        "type" : {
+          "type" : "fixed",
+          "name" : "Guid",
+          "size" : 16
+        },
+        "doc" : "A unique identifier for the message"
+      }, {
+        "name" : "treeId",
+        "type" : [ "null", {
+          "type" : "fixed",
+          "name" : "fixed_16",
+          "size" : 16
+        } ],
+        "doc" : "Service call tree uuid",
+        "default" : null
+      }, {
+        "name" : "requestId",
+        "type" : [ "null", "int" ],
+        "doc" : "Service call request id",
+        "default" : null
+      }, {
+        "name" : "impersonatorId",
+        "type" : [ "null", "string" ],
+        "doc" : "this is the ID of the CS Agent or Application acting on the users behalf",
+        "default" : null
+      }, {
+        "name" : "version",
+        "type" : [ "null", "string" ],
+        "doc" : "Synonymous to the com.linkedin.events.monitoring.EventHeader#version field. The version that the service which emitted this event was at. For services in multiproducts, this usually comes in the form of {major}.{minor}.{micro} (eg. 0.1.2), however for network services, the version follows a format like so: 0.0.2000-RC8.35047",
+        "default" : null
+      }, {
+        "name" : "instance",
+        "type" : [ "null", "string" ],
+        "doc" : "Synonymous to the com.linkedin.events.monitoring.EventHeader#instance field. The instance ID of the service (eg. i001)",
+        "default" : null
+      }, {
+        "name" : "appName",
+        "type" : [ "null", "string" ],
+        "doc" : "Synonymous to the com.linkedin.events.monitoring.EventHeader#service field. Named 'appName' here since this is what this field actually represents, and 'service' is already used. This is also synonymous to 'appName' in Play and network apps, where on a typical page there would be a <meta name=appName content=biz> tag. For network apps, this would be the container name without the '-tomcat' suffix. So for 'profile-tomcat', it would just be 'profile'. For Play! services, i [...]",
+        "default" : null
+      }, {
+        "name" : "testId",
+        "type" : [ "null", "string" ],
+        "doc" : "A client provided ID that uniquely identifies a particular execution of a test case.  This ID is provided by clients through an ENG_TEST_ID cookie.  The Selenium test framework automatically sets this cookie for each request.  This will be null when there is no ENG_TEST_ID provided.  See https://iwww.corp.linkedin.com/wiki/cf/display/ENGS/Selenium+Framework+Architecture+Documentation for more details on the test framework.  See https://iwww.corp.linkedin.com/wiki/cf/disp [...]",
+        "default" : null
+      }, {
+        "name" : "testSegmentId",
+        "type" : [ "null", "string" ],
+        "doc" : "A client provided ID that uniquely identifies a section of the testing code from a  particular execution of a test case.  This ID is provided by clients through an ENG_TEST_SEGMENT_ID cookie. ",
+        "default" : null
+      }, {
+        "name" : "auditHeader",
+        "type" : [ "null", {
+          "type" : "record",
+          "name" : "KafkaAuditHeader",
+          "fields" : [ {
+            "name" : "time",
+            "type" : "long",
+            "doc" : "The time at which the event was emitted into kafka."
+          }, {
+            "name" : "server",
+            "type" : "string",
+            "doc" : "The fully qualified name of the host from which the event is being emitted."
+          }, {
+            "name" : "instance",
+            "type" : [ "null", "string" ],
+            "doc" : "The instance on the server from which the event is being emitted. e.g. i001"
+          }, {
+            "name" : "appName",
+            "type" : "string",
+            "doc" : "The name of the application from which the event is being emitted. see go/appname"
+          }, {
+            "name" : "messageId",
+            "type" : {
+              "type" : "fixed",
+              "name" : "UUID",
+              "size" : 16
+            },
+            "doc" : "A unique identifier for the message"
+          }, {
+            "name" : "auditVersion",
+            "type" : [ "null", "int" ],
+            "doc" : "The version that is being used for auditing. In version 0, the audit trail buckets events into 10 minute audit windows based on the EventHeader timestamp. In version 1, the audit trail buckets events as follows: if the schema has an outer KafkaAuditHeader, use the outer audit header timestamp for bucketing; else if the EventHeader has an inner KafkaAuditHeader use that inner audit header's timestamp for bucketing",
+            "default" : null
+          }, {
+            "name" : "fabricUrn",
+            "type" : [ "null", "string" ],
+            "doc" : "The fabricUrn of the host from which the event is being emitted. Fabric Urn in the format of urn:li:fabric:{fabric_name}. See go/fabric.",
+            "default" : null
+          } ]
+        } ],
+        "doc" : "Header used by kafka for auditing the data in the kafka pipeline",
+        "default" : null
+      }, {
+        "name" : "pageInstance",
+        "type" : [ "null", {
+          "type" : "record",
+          "name" : "PageInstance",
+          "namespace" : "com.linkedin.events.common",
+          "fields" : [ {
+            "name" : "pageUrn",
+            "type" : "string",
+            "doc" : "The page entity. Example: urn:li:page:<pageKey>."
+          }, {
+            "name" : "trackingId",
+            "type" : {
+              "type" : "fixed",
+              "name" : "TrackingId",
+              "size" : 16
+            },
+            "doc" : "Uniquely identifies this rendering of the page."
+          } ]
+        } ],
+        "doc" : "The instance of a page to which the request that triggered this event is responding.  For more information see go/pageinstance",
+        "default" : null
+      }, {
+        "name" : "clientApplicationInstance",
+        "type" : [ "null", {
+          "type" : "record",
+          "name" : "ApplicationInstance",
+          "namespace" : "com.linkedin.events.common",
+          "fields" : [ {
+            "name" : "applicationUrn",
+            "type" : "string",
+            "doc" : "The application. Example: urn:li:application:<identifier>."
+          }, {
+            "name" : "version",
+            "type" : "string",
+            "doc" : "The internal version number of the running application in standardized version format, see go/version."
+          }, {
+            "name" : "trackingId",
+            "type" : "TrackingId",
+            "doc" : "Uniquely identifies this instantiation of the application.  Created when an application is started from cold.  Preserved through application pause, suspend, loss of focus, background, etc."
+          } ]
+        } ],
+        "doc" : "The particular instance of a client application which triggered this event.  For more information see go/clientApplicationInstance",
+        "default" : null
+      }, {
+        "name" : "originSource",
+        "type" : [ "null", {
+          "type" : "enum",
+          "name" : "OriginSource",
+          "symbols" : [ "QPROD" ]
+        } ],
+        "doc" : "If present, identifies this request as having an origin in a testing mechanism. If null, indicates a normal request from the external internet. For more information see go/originSource",
+        "default" : null
+      } ]
+    },
+    "doc" : "The basic header for this tracking event."
+  }, {
+    "name" : "requestType",
+    "type" : {
+      "type" : "enum",
+      "name" : "ComplianceRequestType",
+      "symbols" : [ "TIME_LIMITED_FILTER", "GLOBAL_FILTER", "GLOBAL_UNFILTER", "RETIRED_URN_FILTER" ]
+    },
+    "doc" : "The type of filter required by this event."
+  }, {
+    "name" : "conditionValueUrn",
+    "type" : "string",
+    "doc" : "Filter all records owned by this urn. Examples: member id urn, lynda member id urn, etc."
+  }, {
+    "name" : "endTime",
+    "type" : [ "null", "long" ],
+    "doc" : "For certain request types, specifies the upper timestamp for which the filter applies. Unit is ms since epoch."
+  }, {
+    "name" : "datasetRestrictionUrns",
+    "type" : [ "null", {
+      "type" : "array",
+      "items" : "string"
+    } ],
+    "doc" : "URN restricting the datasets to which this request should be applied. The request will be applied to a dataset that matches ANY of the given restrictions. To match all datasets, use NULL. An empty array will be reported as an error.",
+    "default" : null
+  }, {
+    "name" : "useCaseRestrictionUrns",
+    "type" : [ "null", {
+      "type" : "array",
+      "items" : "string"
+    } ],
+    "doc" : "URN restricting the querying users to which this request should be applied. The request will be applied to a use case that matches ANY of the given restrictions. To match all use cases, use NULL. An empty array will be reported as an error.",
+    "default" : null
+  }, {
+    "name" : "columnRestrictionUrns",
+    "type" : [ "null", {
+      "type" : "array",
+      "items" : "string"
+    } ],
+    "doc" : "URN restricting the columns to which this request should be applied. The request will be applied to a column that matches ANY of the given restrictions. To match all columns, use NULL. An empty array will be reported as an error.",
+    "default" : null
+  } ]
+}
\ No newline at end of file
diff --git a/gobblin-data-management/src/test/resources/avroSchemaCheckStrategyTest/toValidateSchema.avsc b/gobblin-data-management/src/test/resources/avroSchemaCheckStrategyTest/toValidateSchema.avsc
new file mode 100644
index 0000000..227ddf7
--- /dev/null
+++ b/gobblin-data-management/src/test/resources/avroSchemaCheckStrategyTest/toValidateSchema.avsc
@@ -0,0 +1,199 @@
+{
+  "type" : "record",
+  "name" : "OfflineComplianceRequestEvent",
+  "namespace" : "com.linkedin.events.compliance",
+  "fields" : [ {
+    "name" : "header",
+    "type" : {
+      "type" : "record",
+      "name" : "EventHeader",
+      "namespace" : "com.linkedin.events",
+      "fields" : [ {
+        "name" : "memberId",
+        "type" : "int"
+      }, {
+        "name" : "viewerUrn",
+        "type" : [ "null", "string" ],
+        "default" : null
+      }, {
+        "name" : "applicationViewerUrn",
+        "type" : [ "null", "string" ],
+        "default" : null
+      }, {
+        "name" : "csUserUrn",
+        "type" : [ "null", "string" ],
+        "default" : null
+      }, {
+        "name" : "time",
+        "type" : "long"
+      }, {
+        "name" : "server",
+        "type" : "string"
+      }, {
+        "name" : "service",
+        "type" : "string"
+      }, {
+        "name" : "environment",
+        "type" : [ "string", "null" ],
+        "default" : ""
+      }, {
+        "name" : "guid",
+        "type" : {
+          "type" : "fixed",
+          "name" : "Guid",
+          "size" : 16
+        }
+      }, {
+        "name" : "treeId",
+        "type" : [ "null", {
+          "type" : "fixed",
+          "name" : "fixed_16",
+          "size" : 16
+        } ],
+        "default" : null
+      }, {
+        "name" : "requestId",
+        "type" : [ "null", "int" ],
+        "default" : null
+      }, {
+        "name" : "impersonatorId",
+        "type" : [ "null", "string" ],
+        "default" : null
+      }, {
+        "name" : "version",
+        "type" : [ "null", "string" ],
+        "default" : null
+      }, {
+        "name" : "instance",
+        "type" : [ "null", "string" ],
+        "default" : null
+      }, {
+        "name" : "appName",
+        "type" : [ "null", "string" ],
+        "default" : null
+      }, {
+        "name" : "testId",
+        "type" : [ "null", "string" ],
+        "default" : null
+      }, {
+        "name" : "testSegmentId",
+        "type" : [ "null", "string" ],
+        "default" : null
+      }, {
+        "name" : "auditHeader",
+        "type" : [ "null", {
+          "type" : "record",
+          "name" : "KafkaAuditHeader",
+          "fields" : [ {
+            "name" : "time",
+            "type" : "long"
+          }, {
+            "name" : "server",
+            "type" : "string"
+          }, {
+            "name" : "instance",
+            "type" : [ "null", "string" ]
+          }, {
+            "name" : "appName",
+            "type" : "string"
+          }, {
+            "name" : "messageId",
+            "type" : {
+              "type" : "fixed",
+              "name" : "UUID",
+              "size" : 16
+            }
+          }, {
+            "name" : "auditVersion",
+            "type" : [ "null", "int" ],
+            "default" : null
+          }, {
+            "name" : "fabricUrn",
+            "type" : [ "null", "string" ],
+            "default" : null
+          } ]
+        } ],
+        "default" : null
+      }, {
+        "name" : "pageInstance",
+        "type" : [ "null", {
+          "type" : "record",
+          "name" : "PageInstance",
+          "namespace" : "com.linkedin.events.common",
+          "fields" : [ {
+            "name" : "pageUrn",
+            "type" : "string"
+          }, {
+            "name" : "trackingId",
+            "type" : {
+              "type" : "fixed",
+              "name" : "TrackingId",
+              "size" : 16
+            }
+          } ]
+        } ],
+        "default" : null
+      }, {
+        "name" : "clientApplicationInstance",
+        "type" : [ "null", {
+          "type" : "record",
+          "name" : "ApplicationInstance",
+          "namespace" : "com.linkedin.events.common",
+          "fields" : [ {
+            "name" : "applicationUrn",
+            "type" : "string"
+          }, {
+            "name" : "version",
+            "type" : "string"
+          }, {
+            "name" : "trackingId",
+            "type" : "TrackingId"
+          } ]
+        } ],
+        "default" : null
+      }, {
+        "name" : "originSource",
+        "type" : [ "null", {
+          "type" : "enum",
+          "name" : "OriginSource",
+          "symbols" : [ "QPROD" ]
+        } ],
+        "default" : null
+      } ]
+    }
+  }, {
+    "name" : "requestType",
+    "type" : {
+      "type" : "enum",
+      "name" : "ComplianceRequestType",
+      "symbols" : [ "TIME_LIMITED_FILTER", "GLOBAL_FILTER", "GLOBAL_UNFILTER", "RETIRED_URN_FILTER" ]
+    }
+  }, {
+    "name" : "conditionValueUrn",
+    "type" : "string"
+  }, {
+    "name" : "endTime",
+    "type" : [ "null", "long" ]
+  }, {
+    "name" : "datasetRestrictionUrns",
+    "type" : [ "null", {
+      "type" : "array",
+      "items" : "string"
+    } ],
+    "default" : null
+  }, {
+    "name" : "useCaseRestrictionUrns",
+    "type" : [ "null", {
+      "type" : "array",
+      "items" : "string"
+    } ],
+    "default" : null
+  }, {
+    "name" : "columnRestrictionUrns",
+    "type" : [ "null", {
+      "type" : "array",
+      "items" : "string"
+    } ],
+    "default" : null
+  } ]
+}