You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nifi.apache.org by pv...@apache.org on 2021/03/12 08:11:25 UTC

[nifi] branch main updated: NIFI-2702 Support named captures in ExtractText

This is an automated email from the ASF dual-hosted git repository.

pvillard pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/nifi.git


The following commit(s) were added to refs/heads/main by this push:
     new 9f0b47a  NIFI-2702 Support named captures in ExtractText
9f0b47a is described below

commit 9f0b47af7724af2e3d7e11b1226e5419cdf3a8bb
Author: Otto Fowler <ot...@gmail.com>
AuthorDate: Fri Jul 3 13:52:31 2020 -0400

    NIFI-2702 Support named captures in ExtractText
    
    Signed-off-by: Pierre Villard <pi...@gmail.com>
    
    This closes #4384.
---
 .../nifi/processors/standard/ExtractText.java      | 108 +++++-
 .../additionalDetails.html                         | 239 +++++++++++++
 .../nifi/processors/standard/TestExtractText.java  |  27 +-
 .../standard/TestExtractTextNamedGroups.java       | 397 +++++++++++++++++++++
 4 files changed, 746 insertions(+), 25 deletions(-)

diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/ExtractText.java b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/ExtractText.java
index 87b05ff..3f62abf 100644
--- a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/ExtractText.java
+++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/ExtractText.java
@@ -16,6 +16,8 @@
  */
 package org.apache.nifi.processors.standard;
 
+import static io.krakens.grok.api.GrokUtils.getNameGroups;
+
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.charset.Charset;
@@ -32,6 +34,7 @@ import java.util.concurrent.LinkedBlockingQueue;
 import java.util.concurrent.atomic.AtomicReference;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
+
 import org.apache.nifi.annotation.behavior.DynamicProperty;
 import org.apache.nifi.annotation.behavior.EventDriven;
 import org.apache.nifi.annotation.behavior.InputRequirement;
@@ -69,6 +72,8 @@ import org.apache.nifi.stream.io.StreamUtils;
                 + "The results of those Regular Expressions are assigned to FlowFile Attributes.  "
                 + "Regular Expressions are entered by adding user-defined properties; "
                 + "the name of the property maps to the Attribute Name into which the result will be placed.  "
+                + "The attributes are generated differently based on the enabling of named capture groups.  "
+                + "If named capture groups are not enabled:  "
                 + "The first capture group, if any found, will be placed into that attribute name."
                 + "But all capture groups, including the matching string sequence itself will also be "
                 + "provided at that attribute name with an index value provided, with the exception of a capturing group "
@@ -76,7 +81,17 @@ import org.apache.nifi.stream.io.StreamUtils;
                 + "\"abc(def)?(g)\" we would add an attribute \"regex.1\" with a value of \"def\" if the \"def\" matched. If "
                 + "the \"def\" did not match, no attribute named \"regex.1\" would be added but an attribute named \"regex.2\" "
                 + "with a value of \"g\" will be added regardless."
+                + "If named capture groups are enabled:  "
+                + "Each named capture group, if found will be placed into the attributes name with the name provided.  "
+                + "If enabled the matching string sequence itself will be placed into the attribute name.  "
+                + "If multiple matches are enabled, and index will be applied after the first set of matches. "
+                + "The exception is a capturing group that is optional and does not match  "
+                + "For example, given the attribute name \"regex\" and expression \"abc(?<NAMED>def)?(?<NAMED-TWO>g)\"  "
+                + "we would add an attribute \"regex.NAMED\" with the value of \"def\" if the \"def\" matched.  We would  "
+                + " add an attribute \"regex.NAMED-TWO\" with the value of \"g\" if the \"g\" matched regardless.  "
                 + "The value of the property must be a valid Regular Expressions with one or more capturing groups. "
+                + "If named capture groups are enabled, all capture groups must be named.  If they are not, then the  "
+                + "processor configuration will fail validation.  "
                 + "If the Regular Expression matches more than once, only the first match will be used unless the property "
                 + "enabling repeating capture group is set to true. "
                 + "If any provided Regular Expression matches, the FlowFile(s) will be routed to 'matched'. "
@@ -208,6 +223,18 @@ public class ExtractText extends AbstractProcessor {
             .defaultValue("false")
             .build();
 
+    public static final PropertyDescriptor ENABLE_NAMED_GROUPS = new PropertyDescriptor.Builder()
+            .name("extract-text-enable-named-groups")
+            .displayName("Enable named group support")
+            .description("If set to true, when named groups are present in the regular expression, the name of the "
+                    + "group will be used in the attribute name as opposed to the group index.  All capturing groups "
+                    + "must be named, if the number of groups (not including capture group 0) does not equal the "
+                    + "number of named groups validation will fail.")
+            .required(false)
+            .allowableValues("true","false")
+            .defaultValue("false")
+            .build();
+
     public static final Relationship REL_MATCH = new Relationship.Builder()
             .name("matched")
             .description("FlowFiles are routed to this relationship when the Regular Expression is successfully evaluated and the FlowFile is modified as a result")
@@ -245,6 +272,7 @@ public class ExtractText extends AbstractProcessor {
         props.add(UNIX_LINES);
         props.add(INCLUDE_CAPTURE_GROUP_ZERO);
         props.add(ENABLE_REPEATING_CAPTURE_GROUP);
+        props.add(ENABLE_NAMED_GROUPS);
         this.properties = Collections.unmodifiableList(props);
     }
 
@@ -292,6 +320,37 @@ public class ExtractText extends AbstractProcessor {
             }
         }
 
+        // If named groups are enabled, the number of named groups needs to match the number of groups overall
+        final boolean enableNamedGroups = validationContext.getProperty(ENABLE_NAMED_GROUPS).asBoolean();
+        getLogger().debug(String.format("Enable named groups is %s", enableNamedGroups));
+        if (enableNamedGroups) {
+            for (Map.Entry<PropertyDescriptor, String> prop : validationContext.getProperties().entrySet()) {
+                PropertyDescriptor pd = prop.getKey();
+                if (pd.isDynamic()) {
+                    final String value = validationContext.getProperty(pd).getValue();
+                    getLogger().debug(
+                        "Evaluating dynamic property " + pd.getDisplayName() + " (" + pd.getName() + ") with value "
+                            + value);
+                    final Pattern pattern = Pattern.compile(value);
+                    final int numGroups = pattern.matcher("").groupCount();
+                    final int namedGroupCount = getNameGroups(value).size();
+                    if (numGroups!= namedGroupCount) {
+                        getLogger().debug(String
+                            .format("Named group count %d does not match total group count %d", namedGroupCount,
+                                numGroups));
+                        problems.add(new ValidationResult.Builder()
+                            .subject(pd.getDisplayName())
+                            .input(value)
+                            .valid(false)
+                            .explanation("Named group count does not match total group count")
+                            .build()
+                        );
+                    }
+                }
+            }
+        }
+
+
         return problems;
     }
 
@@ -358,29 +417,54 @@ public class ExtractText extends AbstractProcessor {
         final Map<String, Pattern> patternMap = compiledPattersMapRef.get();
 
         final int startGroupIdx = context.getProperty(INCLUDE_CAPTURE_GROUP_ZERO).asBoolean() ? 0 : 1;
+        final boolean useNamedGroups = context.getProperty(ENABLE_NAMED_GROUPS).isSet()
+            ? context.getProperty(ENABLE_NAMED_GROUPS).asBoolean() : false;
 
         for (final Map.Entry<String, Pattern> entry : patternMap.entrySet()) {
-
+            String patternString = entry.getValue().toString();
+            String[] namedGroups = getNameGroups(patternString).toArray(new String[0]);
             final Matcher matcher = entry.getValue().matcher(contentString);
             int j = 0;
 
             while (matcher.find()) {
                 final String baseKey = entry.getKey();
-                int start = j == 0 ? startGroupIdx : 1;
-                for (int i = start; i <= matcher.groupCount(); i++) {
-                    final String key = new StringBuilder(baseKey).append(".").append(i + j).toString();
-                    String value = matcher.group(i);
-                    if (value != null && !value.isEmpty()) {
-                        if (value.length() > maxCaptureGroupLength) {
-                            value = value.substring(0, maxCaptureGroupLength);
+                // group count doesn't include the 0
+                if (useNamedGroups && matcher.groupCount()  == namedGroups.length) {
+                    for ( int i = 0; i < namedGroups.length; i++) {
+                        final StringBuilder builder = new StringBuilder(baseKey).append(".").append(namedGroups[i]);
+                        if (j > 0) {
+                            builder.append(".").append(j);
+                        }
+                        final String key = builder.toString();
+                        String value = matcher.group(namedGroups[i]);
+                        if (value != null && !value.isEmpty()) {
+                            if (value.length() > maxCaptureGroupLength) {
+                                value = value.substring(0, maxCaptureGroupLength);
+                            }
+                            regexResults.put(key, value);
                         }
-                        regexResults.put(key, value);
-                        if (i == 1 && j == 0) {
-                            regexResults.put(baseKey, value);
+                    }
+                    if (startGroupIdx == 0 && j == 0) {
+                        regexResults.put(baseKey, matcher.group(0));
+                    }
+                    j++;
+                } else {
+                    int start = j == 0 ? startGroupIdx : 1;
+                    for (int i = start; i <= matcher.groupCount(); i++) {
+                        final String key = new StringBuilder(baseKey).append(".").append(i + j).toString();
+                        String value = matcher.group(i);
+                        if (value != null && !value.isEmpty()) {
+                            if (value.length() > maxCaptureGroupLength) {
+                                value = value.substring(0, maxCaptureGroupLength);
+                            }
+                            regexResults.put(key, value);
+                            if (i == 1 && j == 0) {
+                                regexResults.put(baseKey, value);
+                            }
                         }
                     }
+                    j += matcher.groupCount();
                 }
-                j += matcher.groupCount();
                 if (!context.getProperty(ENABLE_REPEATING_CAPTURE_GROUP).asBoolean()) {
                     break;
                 }
diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/docs/org.apache.nifi.processors.standard.ExtractText/additionalDetails.html b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/docs/org.apache.nifi.processors.standard.ExtractText/additionalDetails.html
new file mode 100644
index 0000000..475092d
--- /dev/null
+++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/docs/org.apache.nifi.processors.standard.ExtractText/additionalDetails.html
@@ -0,0 +1,239 @@
+<!DOCTYPE html>
+<html lang="en">
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+      http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<head>
+  <meta charset="utf-8"/>
+  <title>ExtractText</title>
+  <link rel="stylesheet" href="../../../../../css/component-usage.css" type="text/css"/>
+</head>
+
+<body>
+<!-- Processor Documentation ================================================== -->
+<h2>Usage Information</h2>
+
+<p>
+  The Extract Text processor provides different results based on whether named capture groups are enabled.
+</p>
+
+<h2>Example</h2>
+<p>
+  Here is a like-for-like example that illustrates this.
+</p>
+<h4>Data</h4>
+<table>
+  <tbody>
+  <tr>
+    <td>
+      <code>
+      <pre>
+        foo\r\nbar1\r\nbar2\r\nbar3\r\nhello\r\nworld\r\n
+      </pre>
+      </code>
+    </td>
+  </tr>
+  </tbody>
+</table>
+
+
+<h3>Without named capture groups</h3>
+<h4>Configuration</h4>
+<table>
+  <thead>
+  <th>Property Name</th>
+  <th>Property Value</th>
+  </thead>
+  <tbody>
+  <tr>
+    <td>regex.result1</td>
+    <td>(?s)(.*)</td>
+  </tr>
+  <tr>
+    <td>regex.result2</td>
+    <td>(?s).*(bar1).*</td>
+  </tr>
+  <tr>
+    <td>regex.result3</td>
+    <td>(?s).*?(bar\d).*</td>
+  </tr>
+  <tr>
+    <td>regex.result4</td>
+    <td>(?s).*?(?:bar\d).*?(bar\d).*?(bar3).*</td>
+  </tr>
+  <tr>
+    <td>regex.result5</td>
+    <td>(?s).*(bar\d).*</td>
+  </tr>
+  <tr>
+    <td>regex.result6</td>
+    <td>(?s)^(.*)$</td>
+  </tr>
+  <tr>
+    <td>regex.result7</td>
+    <td>(?s)(XXX)</td>
+  </tr>
+  </tbody>
+</table>
+
+<h4>Results</h4>
+<table>
+  <thead>
+  <th>Attribute Name</th>
+  <th>Attribute Value</th>
+  </thead>
+  <tbody>
+  <tr>
+    <td>regex.result1</td>
+    <td><code>
+      <pre>foo\r\nbar1\r\nbar2\r\nbar3\r\nhello\r\nworld\r\n</pre>
+    </code></td>
+  </tr>
+  <tr>
+    <td>regex.result2</td>
+    <td>bar1</td>
+  </tr>
+  <tr>
+    <td>regex.result3</td>
+    <td>bar1</td>
+  </tr>
+  <tr>
+    <td>regex.result4</td>
+    <td>bar2</td>
+  </tr>
+  <tr>
+    <td>regex.result4.0</td>
+    <td><code>
+      <pre>foo\r\nbar1\r\nbar2\r\nbar3\r\nhello\r\nworld\r\n</pre>
+    </code></td>
+  </tr>
+  <tr>
+    <td>regex.result4.1</td>
+    <td>bar2</td>
+  </tr>
+  <tr>
+    <td>regex.result4.2</td>
+    <td>bar3</td>
+  </tr>
+  <tr>
+    <td>regex.result5</td>
+    <td>bar3</td>
+  </tr>
+  <tr>
+    <td>regex.result6</td>
+    <td><code>
+      <pre>foo\r\nbar1\r\nbar2\r\nbar3\r\nhello\r\nworld\r\n</pre>
+    </code></td>
+  </tr>
+  <tr>
+    <td>regex.result7</td>
+    <td></td>
+  </tr>
+  </tbody>
+</table>
+
+<h3>With named capture groups</h3>
+<h4>Configuration</h4>
+<table>
+  <thead>
+  <th>Property Name</th>
+  <th>Property Value</th>
+  </thead>
+  <tbody>
+  <tr>
+    <td>Enable named group support</td>
+    <td>True</td>
+  </tr>
+  <tr>
+    <td>regex.result1</td>
+    <td>(?s)(?&lt;ALL&gt;.*)</td>
+  </tr>
+  <tr>
+    <td>regex.result2</td>
+    <td>(?s).*(?&lt;BAR1&gt;bar1).*</td>
+  </tr>
+  <tr>
+    <td>regex.result3</td>
+    <td>(?s).*?(?&lt;BAR1&gt;bar\d).*</td>
+  </tr>
+  <tr>
+    <td>regex.result4</td>
+    <td>(?s).*?(?:bar\d).*?(?&lt;BAR2&gt;bar\d).*?(?&lt;BAR3&gt;bar3).*</td>
+  </tr>
+  <tr>
+    <td>regex.result5</td>
+    <td>(?s).*(?&lt;BAR3&gt;bar\d).*</td>
+  </tr>
+  <tr>
+    <td>regex.result6</td>
+    <td>(?s)^(?&lt;ALL&gt;.*)$</td>
+  </tr>
+  <tr>
+    <td>regex.result7</td>
+    <td>(?s)(?&lt;MISS&gt;XXX)</td>
+  </tr>
+  </tbody>
+</table>
+
+<h4>Results</h4>
+<table>
+  <thead>
+  <th>Attribute Name</th>
+  <th>Attribute Value</th>
+  </thead>
+  <tbody>
+  <tr>
+    <td>regex.result1</td>
+    <td><code>
+      <pre>foo\r\nbar1\r\nbar2\r\nbar3\r\nhello\r\nworld\r\n</pre>
+    </code></td>
+  </tr>
+  <tr>
+    <td>regex.result2.BAR1</td>
+    <td>bar1</td>
+  </tr>
+  <tr>
+    <td>regex.result3.BAR1</td>
+    <td>bar1</td>
+  </tr>
+  <tr>
+    <td>regex.result4.BAR2</td>
+    <td>bar2</td>
+  </tr>
+  <tr>
+    <td>regex.result4.BAR2</td>
+    <td>bar2</td>
+  </tr>
+  <tr>
+    <td>regex.result4.BAR3</td>
+    <td>bar3</td>
+  </tr>
+  <tr>
+    <td>regex.result5.BAR3</td>
+    <td>bar3</td>
+  </tr>
+  <tr>
+    <td>regex.result6.ALL</td>
+    <td><code>
+      <pre>foo\r\nbar1\r\nbar2\r\nbar3\r\nhello\r\nworld\r\n</pre>
+    </code></td>
+  </tr>
+  <tr>
+    <td>regex.result7.MISS</td>
+    <td></td>
+  </tr>
+  </tbody>
+</table>
+</body>
+</html>
diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestExtractText.java b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestExtractText.java
index e323c2d..7a1696b 100644
--- a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestExtractText.java
+++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestExtractText.java
@@ -20,6 +20,7 @@ import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 
 import java.io.UnsupportedEncodingException;
+import java.nio.charset.StandardCharsets;
 import java.util.Set;
 import java.util.regex.Pattern;
 import org.apache.nifi.processor.Relationship;
@@ -45,7 +46,7 @@ public class TestExtractText {
         testRunner.setProperty("regex.result6", "(?s)^(.*)$");
         testRunner.setProperty("regex.result7", "(?s)(XXX)");
 
-        testRunner.enqueue(SAMPLE_STRING.getBytes("UTF-8"));
+        testRunner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
         testRunner.run();
 
         testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
@@ -99,7 +100,7 @@ public class TestExtractText {
         testRunner.setProperty("regex.result6", "^(.*)$");
         testRunner.setProperty("regex.result7", "^(XXX)$");
 
-        testRunner.enqueue(SAMPLE_STRING.getBytes("UTF-8"));
+        testRunner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
         testRunner.run();
 
         testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
@@ -131,7 +132,7 @@ public class TestExtractText {
         testRunner.setProperty("regex.result6", "^(.*)$");
         testRunner.setProperty("regex.result7", "^(XXX)$");
 
-        testRunner.enqueue(SAMPLE_STRING.getBytes("UTF-8"));
+        testRunner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
         testRunner.run();
 
         testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
@@ -165,7 +166,7 @@ public class TestExtractText {
         testRunner.setProperty("regex.result6", "^(.*)$");
         testRunner.setProperty("regex.result7", "^(XXX)$");
 
-        testRunner.enqueue(SAMPLE_STRING.getBytes("UTF-8"));
+        testRunner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
         testRunner.run();
 
         testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
@@ -198,7 +199,7 @@ public class TestExtractText {
         testRunner.setProperty("regex.result5b", "(?:bar\\d\\r?\\n)*(bar\\d)");
         testRunner.setProperty("regex.result7", "^(XXX)$");
 
-        testRunner.enqueue("YYY".getBytes("UTF-8"));
+        testRunner.enqueue("YYY".getBytes(StandardCharsets.UTF_8));
         testRunner.run();
 
         testRunner.assertAllFlowFilesTransferred(ExtractText.REL_NO_MATCH, 1);
@@ -232,7 +233,7 @@ public class TestExtractText {
         testRunner.setProperty("regex.result1", "(foo)");
         testRunner.setProperty("regex.result2", "(world)");
 
-        testRunner.enqueue(SAMPLE_STRING.getBytes("UTF-8"));
+        testRunner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
         testRunner.run();
 
         testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
@@ -312,7 +313,7 @@ public class TestExtractText {
         final ExtractText processor = new ExtractText();
         final TestRunner testRunner = TestRunners.newTestRunner(processor);
 
-        testRunner.enqueue("foo".getBytes("UTF-8"));
+        testRunner.enqueue("foo".getBytes(StandardCharsets.UTF_8));
         testRunner.run();
 
         Set<Relationship> relationships = processor.getRelationships();
@@ -329,7 +330,7 @@ public class TestExtractText {
 
         testRunner.setProperty(attributeKey, "(?s)(.*)");
 
-        testRunner.enqueue(SAMPLE_STRING.getBytes("UTF-8"));
+        testRunner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
         testRunner.run();
 
         testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
@@ -346,7 +347,7 @@ public class TestExtractText {
         testRunner.setProperty(ExtractText.ENABLE_REPEATING_CAPTURE_GROUP, "true");
         final String attributeKey = "regex.result";
         testRunner.setProperty(attributeKey, "(?s)(\\w+)");
-        testRunner.enqueue("This is my text".getBytes("UTF-8"));
+        testRunner.enqueue("This is my text".getBytes(StandardCharsets.UTF_8));
         testRunner.run();
         testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
         final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
@@ -370,7 +371,7 @@ public class TestExtractText {
         testRunner.setProperty(ExtractText.ENABLE_REPEATING_CAPTURE_GROUP, "true");
         final String attributeKey = "regex.result";
         testRunner.setProperty(attributeKey, "(\\w+)=(\\d+)");
-        testRunner.enqueue("a=1,b=10,c=100".getBytes("UTF-8"));
+        testRunner.enqueue("a=1,b=10,c=100".getBytes(StandardCharsets.UTF_8));
         testRunner.run();
         testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
         final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
@@ -403,7 +404,7 @@ public class TestExtractText {
 
         testRunner.setProperty(attributeKey, "(?s)(.*)");
 
-        testRunner.enqueue(SAMPLE_STRING.getBytes("UTF-8"));
+        testRunner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
         testRunner.run();
 
         testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
@@ -422,7 +423,7 @@ public class TestExtractText {
         testRunner.setProperty(attributeKey, "(?s).*");
 
         // Act
-        testRunner.enqueue(SAMPLE_STRING.getBytes("UTF-8"));
+        testRunner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
         testRunner.run();
 
         // Assert
@@ -443,7 +444,7 @@ public class TestExtractText {
         testRunner.setProperty(attributeKey, "(?s).*");
 
         // Act
-        testRunner.enqueue(SAMPLE_STRING.getBytes("UTF-8"));
+        testRunner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
 
         // Validation should fail because nothing will match
         testRunner.run();
diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestExtractTextNamedGroups.java b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestExtractTextNamedGroups.java
new file mode 100644
index 0000000..9be1c47
--- /dev/null
+++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestExtractTextNamedGroups.java
@@ -0,0 +1,397 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nifi.processors.standard;
+
+import static org.apache.nifi.processors.standard.ExtractText.ENABLE_NAMED_GROUPS;
+
+import java.io.UnsupportedEncodingException;
+import java.nio.charset.StandardCharsets;
+
+import org.apache.nifi.util.MockFlowFile;
+import org.apache.nifi.util.TestRunner;
+import org.apache.nifi.util.TestRunners;
+import org.junit.Test;
+
+public class TestExtractTextNamedGroups {
+
+    static final String SAMPLE_STRING = "foo\r\nbar1\r\nbar2\r\nbar3\r\nhello\r\nworld\r\n"; // shared CRLF-delimited fixture
+
+    @Test
+    public void testProcessor() throws Exception {
+        // Every pattern carries an inline (?s) flag; with named groups enabled each capture is
+        // expected under "<property name>.<GROUP NAME>".
+        final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
+
+        testRunner.setProperty(ENABLE_NAMED_GROUPS, "true");
+        testRunner.setProperty("regex.result1", "(?s)(?<ALL>.*)");
+        testRunner.setProperty("regex.result2", "(?s).*(?<BAR1>bar1).*");
+        testRunner.setProperty("regex.result3", "(?s).*?(?<BAR1>bar\\d).*");
+        testRunner.setProperty("regex.result4", "(?s).*?(?:bar\\d).*?(?<BAR2>bar\\d).*?(?<BAR3>bar3).*");
+        testRunner.setProperty("regex.result5", "(?s).*(?<BAR3>bar\\d).*");
+        testRunner.setProperty("regex.result6", "(?s)^(?<ALL>.*)$");
+        testRunner.setProperty("regex.result7", "(?s)(?<MISS>XXX)");
+        testRunner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
+        testRunner.run();
+
+        testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
+        final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
+        out.assertAttributeEquals("regex.result1.ALL", SAMPLE_STRING);
+        out.assertAttributeEquals("regex.result2.BAR1", "bar1");
+        out.assertAttributeEquals("regex.result3.BAR1", "bar1");
+        out.assertAttributeEquals("regex.result4.BAR2", "bar2");
+        out.assertAttributeEquals("regex.result4.BAR3", "bar3");
+        out.assertAttributeEquals("regex.result5.BAR3", "bar3");
+        out.assertAttributeEquals("regex.result6.ALL", SAMPLE_STRING);
+        out.assertAttributeEquals("regex.result7.MISS", null); // "XXX" is not in the sample text
+    }
+
+    @Test
+    public void testWithUnmatchedOptionalCapturingGroup() {
+        final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
+        testRunner.setProperty(ENABLE_NAMED_GROUPS, "true");
+        testRunner.setProperty("regex", "abc(?<DEF>def)?(?<G>g)");
+        // First pass: the optional DEF group takes no part in the match, so no attribute is expected for it.
+        testRunner.enqueue("abcg");
+        testRunner.run();
+        testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
+        final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
+        out.assertAttributeNotExists("regex.DEF");
+        out.assertAttributeEquals("regex.G", "g");
+        // Second pass on a cleared runner: DEF participates, so both named groups must be present.
+        testRunner.clearTransferState();
+        testRunner.enqueue("abcdefg");
+        testRunner.run();
+        testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
+        final MockFlowFile out2 = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
+        out2.assertAttributeEquals("regex.DEF", "def");
+        out2.assertAttributeEquals("regex.G", "g");
+    }
+
+    @Test
+    public void testProcessorWithDotall() throws Exception {
+        // Dot-matches-newline is switched on through the DOTALL processor property rather than an
+        // inline flag, so '.' is expected to span the CRLF line breaks in SAMPLE_STRING.
+        final TestRunner runner = TestRunners.newTestRunner(new ExtractText());
+        runner.setProperty(ENABLE_NAMED_GROUPS, "true");
+        runner.setProperty(ExtractText.DOTALL, "true");
+
+        runner.setProperty("regex.result1", "(?<TOUT>.*)");
+        runner.setProperty("regex.result2", ".*(?<BAR1>bar1).*");
+        runner.setProperty("regex.result3", ".*?(?<BAR1>bar\\d).*"); // reluctant prefix: first bar wins
+        runner.setProperty("regex.result4", ".*?(?:bar\\d).*?(?<BAR2>bar\\d).*"); // reluctant, skipping one bar: second bar wins
+        runner.setProperty("regex.result5", ".*(?<BAR3>bar\\d).*"); // greedy prefix: last bar wins
+        runner.setProperty("regex.result6", "^(?<TOUT>.*)$");
+        runner.setProperty("regex.result7", "^(?<NO>XXX)$");
+
+        runner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
+        runner.run();
+
+        runner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
+        final MockFlowFile matched = runner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
+        // Each capture is looked up under "<property name>.<GROUP NAME>".
+        matched.assertAttributeEquals("regex.result1.TOUT", SAMPLE_STRING);
+        matched.assertAttributeEquals("regex.result2.BAR1", "bar1");
+        matched.assertAttributeEquals("regex.result3.BAR1", "bar1");
+        matched.assertAttributeEquals("regex.result4.BAR2", "bar2");
+        matched.assertAttributeEquals("regex.result5.BAR3", "bar3");
+        matched.assertAttributeEquals("regex.result6.TOUT", SAMPLE_STRING);
+        matched.assertAttributeEquals("regex.result7.NO", null);
+    }
+
+    @Test
+    public void testProcessorWithMultiline() throws Exception {
+        // MULTILINE only (no DOTALL): the expectations below show '.' staying within a single line,
+        // so patterns must spell out \r\n themselves to reach across lines.
+        final TestRunner runner = TestRunners.newTestRunner(new ExtractText());
+        runner.setProperty(ENABLE_NAMED_GROUPS, "true");
+        runner.setProperty(ExtractText.MULTILINE, "true");
+
+        runner.setProperty("regex.result1", "(?<ALL>.*)");
+        runner.setProperty("regex.result2", "(?<BAR1>bar1)");
+        runner.setProperty("regex.result3", ".*?(?<BAR1>bar\\d).*");
+        runner.setProperty("regex.result4", ".*?(?:bar\\d).*?(?<NULL>bar\\d).*");
+        runner.setProperty("regex.result4b", "bar\\d\\r\\n(?<BAR2>bar\\d)");
+        runner.setProperty("regex.result5", ".*(?<BAR2>bar\\d).*");
+        runner.setProperty("regex.result5b", "(?:bar\\d\\r?\\n)*(?<BAR3>bar\\d)");
+        runner.setProperty("regex.result6", "^(?<ALL>.*)$");
+        runner.setProperty("regex.result7", "^(?<NO>XXX)$");
+
+        runner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
+        runner.run();
+
+        runner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
+        final MockFlowFile matched = runner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
+        matched.assertAttributeEquals("regex.result1.ALL", "foo"); // everything on the first line only
+        matched.assertAttributeEquals("regex.result2.BAR1", "bar1");
+        matched.assertAttributeEquals("regex.result3.BAR1", "bar1");
+        matched.assertAttributeEquals("regex.result4.NULL", null); // no single line contains two bars
+        matched.assertAttributeEquals("regex.result4b.BAR2", "bar2"); // pattern spells out the line break
+        matched.assertAttributeEquals("regex.result5.BAR2", "bar1"); // still the first bar: no line holds more than one
+        matched.assertAttributeEquals("regex.result5b.BAR3", "bar3"); // pattern spells out the line breaks
+        matched.assertAttributeEquals("regex.result6.ALL", "foo"); // all of the first line
+        matched.assertAttributeEquals("regex.result7.NO", null); // pattern never matches
+    }
+
+    @Test
+    public void testProcessorWithMultilineAndDotall() throws Exception {
+        // MULTILINE and DOTALL together: the expectations below show '.' crossing the CRLF line
+        // breaks again, so whole-text captures equal SAMPLE_STRING.
+        final TestRunner runner = TestRunners.newTestRunner(new ExtractText());
+        runner.setProperty(ENABLE_NAMED_GROUPS, "true");
+        runner.setProperty(ExtractText.MULTILINE, "true");
+        runner.setProperty(ExtractText.DOTALL, "true");
+
+        runner.setProperty("regex.result1", "(?<ALL>.*)");
+        runner.setProperty("regex.result2", "(?<BAR1>bar1)");
+        runner.setProperty("regex.result3", ".*?(?<BAR1>bar\\d).*");
+        runner.setProperty("regex.result4", ".*?(?:bar\\d).*?(?<BAR2>bar\\d).*");
+        runner.setProperty("regex.result4b", "bar\\d\\r\\n(?<BAR2>bar\\d)");
+        runner.setProperty("regex.result5", ".*(?<BAR3>bar\\d).*");
+        runner.setProperty("regex.result5b", "(?:bar\\d\\r?\\n)*(?<BAR3>bar\\d)");
+        runner.setProperty("regex.result6", "^(?<ALL>.*)$");
+        runner.setProperty("regex.result7", "^(?<MISS>XXX)$");
+
+        runner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
+        runner.run();
+
+        runner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
+        final MockFlowFile matched = runner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
+
+        matched.assertAttributeEquals("regex.result1.ALL", SAMPLE_STRING);
+        matched.assertAttributeEquals("regex.result2.BAR1", "bar1");
+        matched.assertAttributeEquals("regex.result3.BAR1", "bar1");
+        matched.assertAttributeEquals("regex.result4.BAR2", "bar2");
+        matched.assertAttributeEquals("regex.result4b.BAR2", "bar2");
+        matched.assertAttributeEquals("regex.result5.BAR3", "bar3");
+        matched.assertAttributeEquals("regex.result5b.BAR3", "bar3");
+        matched.assertAttributeEquals("regex.result6.ALL", SAMPLE_STRING);
+        matched.assertAttributeEquals("regex.result7.MISS", null);
+    }
+
+    @Test
+    public void testProcessorWithNoMatches() throws Exception {
+        // None of the configured patterns can match the "YYY" payload, so the flow file must be
+        // routed to REL_NO_MATCH and carry none of the named-group attributes.
+        final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
+        testRunner.setProperty(ENABLE_NAMED_GROUPS, "true");
+        testRunner.setProperty(ExtractText.MULTILINE, "true");
+        testRunner.setProperty(ExtractText.DOTALL, "true");
+
+        testRunner.setProperty("regex.result2", "(?<NONE>bar1)");
+        testRunner.setProperty("regex.result3", ".*?(?<NONE>bar\\d).*");
+        testRunner.setProperty("regex.result4", ".*?(?:bar\\d).*?(?<NONE>bar\\d).*");
+        testRunner.setProperty("regex.result4b", "bar\\d\\r\\n(?<NONE>bar\\d)");
+        testRunner.setProperty("regex.result5", ".*(?<NONE>bar\\d).*");
+        testRunner.setProperty("regex.result5b", "(?:bar\\d\\r?\\n)*(?<NONE>bar\\d)");
+        testRunner.setProperty("regex.result7", "^(?<NONE>XXX)$");
+
+        testRunner.enqueue("YYY".getBytes(StandardCharsets.UTF_8));
+        testRunner.run();
+
+        testRunner.assertAllFlowFilesTransferred(ExtractText.REL_NO_MATCH, 1);
+        final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_NO_MATCH).get(0);
+
+        // Only attributes that a configured property could have produced are checked; there is no
+        // "regex.result1" or "regex.result6" property in this test, so those keys are not probed.
+        out.assertAttributeEquals("regex.result2.NONE", null);
+        out.assertAttributeEquals("regex.result3.NONE", null);
+        out.assertAttributeEquals("regex.result4.NONE", null);
+        out.assertAttributeEquals("regex.result4b.NONE", null);
+        out.assertAttributeEquals("regex.result5.NONE", null);
+        out.assertAttributeEquals("regex.result5b.NONE", null);
+        out.assertAttributeEquals("regex.result7.NONE", null);
+    }
+
+    @Test
+    public void testNoFlowFile() throws UnsupportedEncodingException {
+        // Nothing is enqueued: run() must still complete and nothing may reach REL_MATCH.
+        final TestRunner runner = TestRunners.newTestRunner(new ExtractText());
+        runner.setProperty(ENABLE_NAMED_GROUPS, "true");
+        runner.run();
+        runner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 0);
+    }
+
+    @Test
+    public void testMatchOutsideBuffer() throws Exception {
+        final TestRunner runner = TestRunners.newTestRunner(new ExtractText());
+        runner.setProperty(ENABLE_NAMED_GROUPS, "true");
+        // Cap the evaluated content at 3 bytes, i.e. just the leading "foo" of SAMPLE_STRING.
+        runner.setProperty(ExtractText.MAX_BUFFER_SIZE, "3 B");
+
+        runner.setProperty("regex.result1", "(?<FOO>foo)");
+        runner.setProperty("regex.result2", "(?<WORLD>world)");
+
+        runner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
+        runner.run();
+
+        runner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
+        final MockFlowFile matched = runner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
+
+        matched.assertAttributeEquals("regex.result1.FOO", "foo");
+        matched.assertAttributeEquals("regex.result2.WORLD", null); // null because "world" lies outside the 3-byte buffer
+    }
+
+    @Test
+    public void testIncludeZeroCaptureGroupProperty() throws Exception {
+        // INCLUDE_CAPTURE_GROUP_ZERO is left at its default here; the whole match is expected under
+        // the bare property name in addition to the named-group attribute.
+        final TestRunner runner = TestRunners.newTestRunner(new ExtractText());
+        runner.setProperty(ENABLE_NAMED_GROUPS, "true");
+        final String key = "regex.result";
+        runner.setProperty(key, "(?s)(?<ALL>.*)");
+
+        runner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
+        runner.run();
+
+        runner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
+        final MockFlowFile matched = runner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
+
+        // Both the group-zero attribute and the named-group attribute must exist and hold the full text
+        matched.assertAttributeExists(key);
+        matched.assertAttributeExists(key + ".ALL");
+        matched.assertAttributeEquals(key, SAMPLE_STRING);
+        matched.assertAttributeEquals(key + ".ALL", SAMPLE_STRING);
+    }
+
+    @Test
+    public void testFindAll() throws Exception {
+        final TestRunner runner = TestRunners.newTestRunner(new ExtractText());
+        runner.setProperty(ENABLE_NAMED_GROUPS, "true");
+        runner.setProperty(ExtractText.ENABLE_REPEATING_CAPTURE_GROUP, "true");
+        final String key = "regex.result";
+        runner.setProperty(key, "(?s)(?<W>\\w+)");
+        runner.enqueue("This is my text".getBytes(StandardCharsets.UTF_8));
+        runner.run();
+        runner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
+        final MockFlowFile matched = runner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
+        // First match lands on key and key.W; each repeated match of W gets a ".1", ".2", ... suffix
+        matched.assertAttributeExists(key);
+        matched.assertAttributeExists(key + ".W");
+        matched.assertAttributeExists(key + ".W.1");
+        matched.assertAttributeExists(key + ".W.2");
+        matched.assertAttributeExists(key + ".W.3");
+        matched.assertAttributeEquals(key, "This");
+        matched.assertAttributeEquals(key + ".W", "This");
+        matched.assertAttributeEquals(key + ".W.1", "is");
+        matched.assertAttributeEquals(key + ".W.2", "my");
+        matched.assertAttributeEquals(key + ".W.3", "text");
+    }
+
+    @Test
+    public void testFindAllPair() throws Exception {
+        final TestRunner runner = TestRunners.newTestRunner(new ExtractText());
+        runner.setProperty(ENABLE_NAMED_GROUPS, "true");
+        runner.setProperty(ExtractText.ENABLE_REPEATING_CAPTURE_GROUP, "true");
+        final String key = "regex.result";
+        runner.setProperty(key, "(?<LEFT>\\w+)=(?<RIGHT>\\d+)");
+        runner.enqueue("a=1,b=10,c=100".getBytes(StandardCharsets.UTF_8));
+        runner.run();
+        runner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
+        final MockFlowFile matched = runner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
+        // Three key=value pairs in the payload: the first is unsuffixed, the repeats use ".1" and ".2"
+        matched.assertAttributeExists(key);
+        matched.assertAttributeExists(key + ".LEFT");
+        matched.assertAttributeExists(key + ".RIGHT");
+        matched.assertAttributeExists(key + ".LEFT.1");
+        matched.assertAttributeExists(key + ".RIGHT.1");
+        matched.assertAttributeExists(key + ".LEFT.2");
+        matched.assertAttributeExists(key + ".RIGHT.2");
+        matched.assertAttributeNotExists(key + ".LEFT.3"); // no fourth pair, so no further attributes
+        matched.assertAttributeNotExists(key + ".RIGHT.3"); // no fourth pair, so no further attributes
+        matched.assertAttributeEquals(key, "a=1");
+        matched.assertAttributeEquals(key + ".LEFT", "a");
+        matched.assertAttributeEquals(key + ".RIGHT", "1");
+        matched.assertAttributeEquals(key + ".LEFT.1", "b");
+        matched.assertAttributeEquals(key + ".RIGHT.1", "10");
+        matched.assertAttributeEquals(key + ".LEFT.2", "c");
+        matched.assertAttributeEquals(key + ".RIGHT.2", "100");
+    }
+
+    @Test
+    public void testIgnoreZeroCaptureGroupProperty() throws Exception {
+        // With INCLUDE_CAPTURE_GROUP_ZERO set to "false" only the named-group attribute is expected;
+        // the bare property name must not appear as an attribute.
+        final TestRunner runner = TestRunners.newTestRunner(new ExtractText());
+        runner.setProperty(ENABLE_NAMED_GROUPS, "true");
+        runner.setProperty(ExtractText.INCLUDE_CAPTURE_GROUP_ZERO, "false");
+
+        final String key = "regex.result";
+        runner.setProperty(key, "(?s)(?<ALL>.*)");
+
+        runner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
+        runner.run();
+
+        runner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
+        final MockFlowFile matched = runner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
+
+        // Group zero is suppressed; the named capture still holds the full text
+        matched.assertAttributeNotExists(key);
+        matched.assertAttributeEquals(key + ".ALL", SAMPLE_STRING);
+    }
+
+    @Test
+    public void testShouldAllowNoCaptureGroups() throws Exception {
+        // Arrange: named groups enabled, but the pattern declares no group at all
+        final TestRunner runner = TestRunners.newTestRunner(new ExtractText());
+        runner.setProperty(ENABLE_NAMED_GROUPS, "true");
+        final String key = "regex.result";
+        runner.setProperty(key, "(?s).*");
+
+        // Act
+        runner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
+        runner.run();
+
+        // Assert
+        runner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
+        final MockFlowFile matched = runner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
+
+        // The pattern has neither an explicit nor a named capture group,
+        // so the whole match is stored under the bare property name
+        matched.assertAttributeEquals(key, SAMPLE_STRING);
+    }
+
+    @Test(expected = AssertionError.class)
+    public void testShouldNotAllowNoCaptureGroupsIfZeroDisabled() throws Exception {
+        // Arrange: group zero is disabled and the pattern declares no capture group, so the
+        // configuration leaves nothing that could be extracted
+        final TestRunner runner = TestRunners.newTestRunner(new ExtractText());
+        runner.setProperty(ENABLE_NAMED_GROUPS, "true");
+        runner.setProperty(ExtractText.INCLUDE_CAPTURE_GROUP_ZERO, "false");
+        runner.setProperty("regex.result", "(?s).*");
+
+        runner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
+
+        // Act: run() is expected to raise AssertionError (declared on @Test above);
+        // per the original note, validation should fail because nothing will match
+        runner.run();
+    }
+
+    @Test(expected = AssertionError.class)
+    public void testInvalidIfGroupCountsDoNotMatch() {
+        // Arrange: the pattern mixes two unnamed capture groups with a single named one
+        final TestRunner runner = TestRunners.newTestRunner(new ExtractText());
+        runner.setProperty(ENABLE_NAMED_GROUPS, "true");
+        runner.setProperty(ExtractText.INCLUDE_CAPTURE_GROUP_ZERO, "false");
+        runner.setProperty("notValidOne", "^(beginning)\\s(middle)\\s(?<END>end)$");
+
+        runner.enqueue("beginning middle end".getBytes(StandardCharsets.UTF_8));
+
+        // Act: run() is expected to raise AssertionError; validation should fail because the
+        // number of groups does not match the number of named groups
+        runner.run();
+    }
+}