You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nifi.apache.org by jo...@apache.org on 2021/03/09 21:32:50 UTC

[nifi] 01/06: NIFI-7969: ValidateRecord enhanced with Force Types From Schema property

This is an automated email from the ASF dual-hosted git repository.

joewitt pushed a commit to branch support/nifi-1.13
in repository https://gitbox.apache.org/repos/asf/nifi.git

commit 4d64a71cd94361f390fb67bf9c4ce34959b279d7
Author: Peter Gyori <pe...@gmail.com>
AuthorDate: Tue Feb 16 19:37:33 2021 +0100

    NIFI-7969: ValidateRecord enhanced with Force Types From Schema property
    
    NIFI-7969: Documentation update
    Clarified that the Force Types From Schema property applies to the data read, whereas the Strict Type Checking property applies to the validation.
    NIFI-7969: Documentation update - updated the property name in additionalDetails.html
    
    This closes #4825.
    
    Signed-off-by: Peter Turcsanyi <tu...@apache.org>
---
 .../nifi/processors/standard/ValidateRecord.java   |  25 ++-
 .../additionalDetails.html                         | 190 +++++++++++++++++++++
 2 files changed, 211 insertions(+), 4 deletions(-)

diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/ValidateRecord.java b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/ValidateRecord.java
index 1210eec..a952aca 100644
--- a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/ValidateRecord.java
+++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/ValidateRecord.java
@@ -172,14 +172,29 @@ public class ValidateRecord extends AbstractProcessor {
     static final PropertyDescriptor STRICT_TYPE_CHECKING = new PropertyDescriptor.Builder()
         .name("strict-type-checking")
         .displayName("Strict Type Checking")
-        .description("If the incoming data has a Record where a field is not of the correct type, this property determine whether how to handle the Record. "
-            + "If true, the Record will still be considered invalid. If false, the Record will be considered valid and the field will be coerced into the "
-            + "correct type (if possible, according to the type coercion supported by the Record Writer).")
+        .description("If the incoming data has a Record where a field is not of the correct type, this property determines how to handle the Record. "
+            + "If true, the Record will be considered invalid. If false, the Record will be considered valid and the field will be coerced into the "
+            + "correct type (if possible, according to the type coercion supported by the Record Writer). "
+            + "This property controls how the data is validated against the validation schema.")
         .expressionLanguageSupported(ExpressionLanguageScope.NONE)
         .allowableValues("true", "false")
         .defaultValue("true")
         .required(true)
         .build();
+    static final PropertyDescriptor COERCE_TYPES = new PropertyDescriptor.Builder()
+            .name("coerce-types")
+            .displayName("Force Types From Reader's Schema")
+            .description("If enabled, the processor will coerce every field to the type specified in the Reader's schema. "
+                + "If the value of a field cannot be coerced to the type, the field will be skipped (will not be read from the input data), "
+                + "thus will not appear in the output. "
+                + "If not enabled, then every field will appear in the output but their types may differ from what is "
+                + "specified in the schema. For details please see the Additional Details page of the processor's Help. "
+                + "This property controls how the data is read by the specified Record Reader.")
+            .expressionLanguageSupported(ExpressionLanguageScope.NONE)
+            .allowableValues("true", "false")
+            .defaultValue("false")
+            .required(true)
+            .build();
     static final PropertyDescriptor VALIDATION_DETAILS_ATTRIBUTE_NAME = new PropertyDescriptor.Builder()
         .name("validation-details-attribute-name")
         .displayName("Validation Details Attribute Name")
@@ -227,6 +242,7 @@ public class ValidateRecord extends AbstractProcessor {
         properties.add(SCHEMA_TEXT);
         properties.add(ALLOW_EXTRA_FIELDS);
         properties.add(STRICT_TYPE_CHECKING);
+        properties.add(COERCE_TYPES);
         properties.add(VALIDATION_DETAILS_ATTRIBUTE_NAME);
         properties.add(MAX_VALIDATION_DETAILS_LENGTH);
         return properties;
@@ -282,6 +298,7 @@ public class ValidateRecord extends AbstractProcessor {
 
         final boolean allowExtraFields = context.getProperty(ALLOW_EXTRA_FIELDS).asBoolean();
         final boolean strictTypeChecking = context.getProperty(STRICT_TYPE_CHECKING).asBoolean();
+        final boolean coerceTypes = context.getProperty(COERCE_TYPES).asBoolean();
 
         RecordSetWriter validWriter = null;
         RecordSetWriter invalidWriter = null;
@@ -306,7 +323,7 @@ public class ValidateRecord extends AbstractProcessor {
 
             try {
                 Record record;
-                while ((record = reader.nextRecord(false, false)) != null) {
+                while ((record = reader.nextRecord(coerceTypes, false)) != null) {
                     final SchemaValidationResult result = validator.validate(record);
                     recordCount++;
 
diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/docs/org.apache.nifi.processors.standard.ValidateRecord/additionalDetails.html b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/docs/org.apache.nifi.processors.standard.ValidateRecord/additionalDetails.html
new file mode 100644
index 0000000..c46443c
--- /dev/null
+++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/docs/org.apache.nifi.processors.standard.ValidateRecord/additionalDetails.html
@@ -0,0 +1,190 @@
+<!DOCTYPE html>
+<html lang="en">
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+      http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<head>
+    <meta charset="utf-8"/>
+    <title>ValidateRecord</title>
+    <link rel="stylesheet" href="../../../../../css/component-usage.css" type="text/css"/>
+    <style>
+		table td:first-child {text-align: center;}
+	</style>
+
+</head>
+
+<body>
+
+<h2>Examples of the effect of the Force Types From Reader's Schema property</h2>
+
+<p>
+    The processor first reads the data from the incoming FlowFile using the specified Record Reader,
+    which uses a schema. Then, depending on the value of the Schema Access Strategy property, the processor
+    can either use the reader's schema, or a different schema to validate the data against.
+    After that, the processor writes the data into the outgoing FlowFile using the specified
+    Record Writer. If the data is valid, the validation schema is used by the writer.
+    If the data is invalid, the writer uses the reader's schema.
+    The <b>Force Types From Reader's Schema</b> property affects the first step:
+    how strictly the reader's schema should be applied when reading the data from the incoming FlowFile.
+    By affecting how the data is read, the value of the Force Types From Reader's Schema property also has an effect on what
+    the output of the ValidateRecord processor is,
+    and also whether the output is forwarded to the <b>valid</b> or the <b>invalid</b> relationship.
+    Below are two examples where the value of this property affects the output significantly.
+</p>
+
+<p>
+    In both examples the input is in XML format and the output is in JSON.
+    In the examples we assume that the same schema is used for reading, validation and writing.
+</p>
+
+<h3>Example 1</h3>
+
+<p>
+    Schema:
+</p>
+<pre><code>{
+    "namespace": "nifi",
+    "name": "test",
+    "type": "record",
+    "fields": [
+        { "name": "field1", "type": "string" },
+        { "name": "field2", "type": "string" }
+    ]
+}
+</code></pre>
+
+<p>
+    Input:
+</p>
+<pre><code>&lt;test&gt;
+    &lt;field1&gt;
+        &lt;sub_field&gt;content&lt;/sub_field&gt;
+    &lt;/field1&gt;
+    &lt;field2&gt;content_of_field_2&lt;/field2&gt;
+&lt;/test&gt;</code></pre>
+
+<p>
+    Output if <b>Force Types From Reader's Schema = true</b>
+    (forwarded to the <b>invalid</b> relationship):
+</p>
+<pre><code>[ {
+    "field2" : "content_of_field_2"
+} ]</code></pre>
+
+<p>
+    Output if <b>Force Types From Reader's Schema = false</b>
+    (forwarded to the <b>invalid</b> relationship):
+</p>
+<pre><code>[ {
+    "field1" : {
+        "sub_field" : "content"
+    },
+    "field2" : "content_of_field_2"
+} ]</code></pre>
+
+<p>
+    As you can see, the FlowFile is forwarded to the invalid relationship in both cases,
+    since the input data does not match the provided Avro schema.
+    However, if <b>Force Types From Reader's Schema = true</b>, only those fields appear in the output
+    that comply with the schema. If <b>Force Types From Reader's Schema = false</b>, all fields appear
+    in the output regardless of whether they comply with the schema or not.
+</p>
+
+<h3>Example 2</h3>
+
+<p>
+    Schema:
+</p>
+<pre><code>{
+    "namespace": "nifi",
+    "name": "test",
+    "type": "record",
+    "fields": [
+        {
+            "name": "field1",
+            "type": {
+                "type": "array",
+                "items": "string"
+            }
+        },
+        {
+            "name": "field2",
+            "type": {
+                "type": "array",
+                "items": "string"
+            }
+        }
+    ]
+}</code></pre>
+
+<p>
+    Input:
+</p>
+<pre><code>&lt;test&gt;
+    &lt;field1&gt;content_1&lt;/field1&gt;
+    &lt;field2&gt;content_2&lt;/field2&gt;
+    &lt;field2&gt;content_3&lt;/field2&gt;
+&lt;/test&gt;</code></pre>
+
+<p>
+    Output if <b>Force Types From Reader's Schema = true</b>
+    (forwarded to the <b>valid</b> relationship):
+</p>
+<pre><code>[ {
+    "field1" : [ "content_1" ],
+    "field2" : [ "content_2", "content_3" ]
+} ]</code></pre>
+
+<p>
+    Output if <b>Force Types From Reader's Schema = false</b>
+    (forwarded to the <b>invalid</b> relationship):
+</p>
+<pre><code>[ {
+    "field1" : "content_1",
+    "field2" : [ "content_2", "content_3" ]
+} ]</code></pre>
+
+<p>
+    The schema expects two fields (field1 and field2), both of type ARRAY.
+    field1 only appears once in the input XML document. If <b>Force Types From Reader's Schema = true</b>,
+    the processor forces this field to be in a type that complies with the schema.
+    So it is put in an array with one element. Since this type coercion can be done,
+    the output is routed to the <b>valid</b> relationship.
+    If <b>Force Types From Reader's Schema = false</b>, the processor does not try to apply
+    type coercion, so field1 appears in the output as a single value. According to the schema,
+    the processor expects an array for field1 but receives a single value, so the output
+    is routed to the <b>invalid</b> relationship.
+</p>
+<p>
+    Schema compliance (and getting routed to the <b>valid</b> or the <b>invalid</b> relationship)
+    does not depend on what Writer is used to produce the output of the ValidateRecord processor.
+    Let us suppose that we used the same schema and input as in <b>Example 2</b>, but instead of
+    JsonRecordSetWriter, we used XMLRecordSetWriter to produce the output.
+    With both <b>Force Types From Reader's Schema = true</b> and <b>Force Types From Reader's Schema = false</b>,
+    the output is:
+</p>
+<pre><code>&lt;test&gt;
+    &lt;field1&gt;content_1&lt;/field1&gt;
+    &lt;field2&gt;content_2&lt;/field2&gt;
+    &lt;field2&gt;content_3&lt;/field2&gt;
+&lt;/test&gt;</code></pre>
+
+<p>
+    However, if <b>Force Types From Reader's Schema = true</b>, this output is routed to the <b>valid</b>
+    relationship, and if <b>Force Types From Reader's Schema = false</b>, it is routed to the <b>invalid</b>
+    relationship.
+</p>
+
+</body>
+</html>
\ No newline at end of file