You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nifi.apache.org by pv...@apache.org on 2021/03/12 08:11:25 UTC
[nifi] branch main updated: NIFI-2702 Support named captures in
ExtractText
This is an automated email from the ASF dual-hosted git repository.
pvillard pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/nifi.git
The following commit(s) were added to refs/heads/main by this push:
new 9f0b47a NIFI-2702 Support named captures in ExtractText
9f0b47a is described below
commit 9f0b47af7724af2e3d7e11b1226e5419cdf3a8bb
Author: Otto Fowler <ot...@gmail.com>
AuthorDate: Fri Jul 3 13:52:31 2020 -0400
NIFI-2702 Support named captures in ExtractText
Signed-off-by: Pierre Villard <pi...@gmail.com>
This closes #4384.
---
.../nifi/processors/standard/ExtractText.java | 108 +++++-
.../additionalDetails.html | 239 +++++++++++++
.../nifi/processors/standard/TestExtractText.java | 27 +-
.../standard/TestExtractTextNamedGroups.java | 397 +++++++++++++++++++++
4 files changed, 746 insertions(+), 25 deletions(-)
diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/ExtractText.java b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/ExtractText.java
index 87b05ff..3f62abf 100644
--- a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/ExtractText.java
+++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/ExtractText.java
@@ -16,6 +16,8 @@
*/
package org.apache.nifi.processors.standard;
+import static io.krakens.grok.api.GrokUtils.getNameGroups;
+
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
@@ -32,6 +34,7 @@ import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicReference;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+
import org.apache.nifi.annotation.behavior.DynamicProperty;
import org.apache.nifi.annotation.behavior.EventDriven;
import org.apache.nifi.annotation.behavior.InputRequirement;
@@ -69,6 +72,8 @@ import org.apache.nifi.stream.io.StreamUtils;
+ "The results of those Regular Expressions are assigned to FlowFile Attributes. "
+ "Regular Expressions are entered by adding user-defined properties; "
+ "the name of the property maps to the Attribute Name into which the result will be placed. "
+ + "The attributes are generated differently based on the enabling of named capture groups. "
+ + "If named capture groups are not enabled: "
+ "The first capture group, if any found, will be placed into that attribute name."
+ "But all capture groups, including the matching string sequence itself will also be "
+ "provided at that attribute name with an index value provided, with the exception of a capturing group "
@@ -76,7 +81,17 @@ import org.apache.nifi.stream.io.StreamUtils;
+ "\"abc(def)?(g)\" we would add an attribute \"regex.1\" with a value of \"def\" if the \"def\" matched. If "
+ "the \"def\" did not match, no attribute named \"regex.1\" would be added but an attribute named \"regex.2\" "
+ "with a value of \"g\" will be added regardless."
+ + "If named capture groups are enabled: "
+ + "Each named capture group, if found will be placed into the attributes name with the name provided. "
+ + "If enabled the matching string sequence itself will be placed into the attribute name. "
+ + "If multiple matches are enabled, an index will be applied after the first set of matches. "
+ + "The exception is a capturing group that is optional and does not match. "
+ + "For example, given the attribute name \"regex\" and expression \"abc(?<NAMED>def)?(?<NAMED-TWO>g)\" "
+ + "we would add an attribute \"regex.NAMED\" with the value of \"def\" if the \"def\" matched. We would "
+ + " add an attribute \"regex.NAMED-TWO\" with the value of \"g\" if the \"g\" matched regardless. "
+ "The value of the property must be a valid Regular Expressions with one or more capturing groups. "
+ + "If named capture groups are enabled, all capture groups must be named. If they are not, then the "
+ + "processor configuration will fail validation. "
+ "If the Regular Expression matches more than once, only the first match will be used unless the property "
+ "enabling repeating capture group is set to true. "
+ "If any provided Regular Expression matches, the FlowFile(s) will be routed to 'matched'. "
@@ -208,6 +223,18 @@ public class ExtractText extends AbstractProcessor {
.defaultValue("false")
.build();
+ public static final PropertyDescriptor ENABLE_NAMED_GROUPS = new PropertyDescriptor.Builder()
+ .name("extract-text-enable-named-groups")
+ .displayName("Enable named group support")
+ .description("If set to true, when named groups are present in the regular expression, the name of the "
+ + "group will be used in the attribute name as opposed to the group index. All capturing groups "
+ + "must be named, if the number of groups (not including capture group 0) does not equal the "
+ + "number of named groups validation will fail.")
+ .required(false)
+ .allowableValues("true","false")
+ .defaultValue("false")
+ .build();
+
public static final Relationship REL_MATCH = new Relationship.Builder()
.name("matched")
.description("FlowFiles are routed to this relationship when the Regular Expression is successfully evaluated and the FlowFile is modified as a result")
@@ -245,6 +272,7 @@ public class ExtractText extends AbstractProcessor {
props.add(UNIX_LINES);
props.add(INCLUDE_CAPTURE_GROUP_ZERO);
props.add(ENABLE_REPEATING_CAPTURE_GROUP);
+ props.add(ENABLE_NAMED_GROUPS);
this.properties = Collections.unmodifiableList(props);
}
@@ -292,6 +320,37 @@ public class ExtractText extends AbstractProcessor {
}
}
+ // If named groups are enabled, the number of named groups needs to match the number of groups overall
+ final boolean enableNamedGroups = validationContext.getProperty(ENABLE_NAMED_GROUPS).asBoolean();
+ getLogger().debug(String.format("Enable named groups is %s", enableNamedGroups));
+ if (enableNamedGroups) {
+ for (Map.Entry<PropertyDescriptor, String> prop : validationContext.getProperties().entrySet()) {
+ PropertyDescriptor pd = prop.getKey();
+ if (pd.isDynamic()) {
+ final String value = validationContext.getProperty(pd).getValue();
+ getLogger().debug(
+ "Evaluating dynamic property " + pd.getDisplayName() + " (" + pd.getName() + ") with value "
+ + value);
+ final Pattern pattern = Pattern.compile(value);
+ final int numGroups = pattern.matcher("").groupCount();
+ final int namedGroupCount = getNameGroups(value).size();
+ if (numGroups!= namedGroupCount) {
+ getLogger().debug(String
+ .format("Named group count %d does not match total group count %d", namedGroupCount,
+ numGroups));
+ problems.add(new ValidationResult.Builder()
+ .subject(pd.getDisplayName())
+ .input(value)
+ .valid(false)
+ .explanation("Named group count does not match total group count")
+ .build()
+ );
+ }
+ }
+ }
+ }
+
+
return problems;
}
@@ -358,29 +417,54 @@ public class ExtractText extends AbstractProcessor {
final Map<String, Pattern> patternMap = compiledPattersMapRef.get();
final int startGroupIdx = context.getProperty(INCLUDE_CAPTURE_GROUP_ZERO).asBoolean() ? 0 : 1;
+ final boolean useNamedGroups = context.getProperty(ENABLE_NAMED_GROUPS).isSet()
+ ? context.getProperty(ENABLE_NAMED_GROUPS).asBoolean() : false;
for (final Map.Entry<String, Pattern> entry : patternMap.entrySet()) {
-
+ String patternString = entry.getValue().toString();
+ String[] namedGroups = getNameGroups(patternString).toArray(new String[0]);
final Matcher matcher = entry.getValue().matcher(contentString);
int j = 0;
while (matcher.find()) {
final String baseKey = entry.getKey();
- int start = j == 0 ? startGroupIdx : 1;
- for (int i = start; i <= matcher.groupCount(); i++) {
- final String key = new StringBuilder(baseKey).append(".").append(i + j).toString();
- String value = matcher.group(i);
- if (value != null && !value.isEmpty()) {
- if (value.length() > maxCaptureGroupLength) {
- value = value.substring(0, maxCaptureGroupLength);
+ // group count doesn't include the 0
+ if (useNamedGroups && matcher.groupCount() == namedGroups.length) {
+ for ( int i = 0; i < namedGroups.length; i++) {
+ final StringBuilder builder = new StringBuilder(baseKey).append(".").append(namedGroups[i]);
+ if (j > 0) {
+ builder.append(".").append(j);
+ }
+ final String key = builder.toString();
+ String value = matcher.group(namedGroups[i]);
+ if (value != null && !value.isEmpty()) {
+ if (value.length() > maxCaptureGroupLength) {
+ value = value.substring(0, maxCaptureGroupLength);
+ }
+ regexResults.put(key, value);
}
- regexResults.put(key, value);
- if (i == 1 && j == 0) {
- regexResults.put(baseKey, value);
+ }
+ if (startGroupIdx == 0 && j == 0) {
+ regexResults.put(baseKey, matcher.group(0));
+ }
+ j++;
+ } else {
+ int start = j == 0 ? startGroupIdx : 1;
+ for (int i = start; i <= matcher.groupCount(); i++) {
+ final String key = new StringBuilder(baseKey).append(".").append(i + j).toString();
+ String value = matcher.group(i);
+ if (value != null && !value.isEmpty()) {
+ if (value.length() > maxCaptureGroupLength) {
+ value = value.substring(0, maxCaptureGroupLength);
+ }
+ regexResults.put(key, value);
+ if (i == 1 && j == 0) {
+ regexResults.put(baseKey, value);
+ }
}
}
+ j += matcher.groupCount();
}
- j += matcher.groupCount();
if (!context.getProperty(ENABLE_REPEATING_CAPTURE_GROUP).asBoolean()) {
break;
}
diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/docs/org.apache.nifi.processors.standard.ExtractText/additionalDetails.html b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/docs/org.apache.nifi.processors.standard.ExtractText/additionalDetails.html
new file mode 100644
index 0000000..475092d
--- /dev/null
+++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/resources/docs/org.apache.nifi.processors.standard.ExtractText/additionalDetails.html
@@ -0,0 +1,239 @@
+<!DOCTYPE html>
+<html lang="en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<head>
+ <meta charset="utf-8"/>
+ <title>ExtractText</title>
+ <link rel="stylesheet" href="../../../../../css/component-usage.css" type="text/css"/>
+</head>
+
+<body>
+<!-- Processor Documentation ================================================== -->
+<h2>Usage Information</h2>
+
+<p>
+ The Extract Text processor provides different results based on whether named capture groups are enabled.
+</p>
+
+<h2>Example</h2>
+<p>
+ Here is a like for like example that illustrates this.
+</p>
+<h4>Data</h4>
+<table>
+ <tbody>
+ <tr>
+ <td>
+ <code>
+ <pre>
+ foo\r\nbar1\r\nbar2\r\nbar3\r\nhello\r\nworld\r\n
+ </pre>
+ </code>
+ </td>
+ </tr>
+ </tbody>
+</table>
+
+
+<h3>Without named capture groups</h3>
+<h4>Configuration</h4>
+<table>
+ <thead>
+ <th>Property Name</th>
+ <th>Property Value</th>
+ </thead>
+ <tbody>
+ <tr>
+ <td>regex.result1</td>
+ <td>(?s)(.*)</td>
+ </tr>
+ <tr>
+ <td>regex.result2</td>
+ <td>(?s).*(bar1).*</td>
+ </tr>
+ <tr>
+ <td>regex.result3</td>
+ <td>(?s).*?(bar\d).*</td>
+ </tr>
+ <tr>
+ <td>regex.result4</td>
+ <td>(?s).*?(?:bar\d).*?(bar\d).*?(bar3).*</td>
+ </tr>
+ <tr>
+ <td>regex.result5</td>
+ <td>(?s).*(bar\d).*</td>
+ </tr>
+ <tr>
+ <td>regex.result6</td>
+ <td>(?s)^(.*)$</td>
+ </tr>
+ <tr>
+ <td>regex.result7</td>
+ <td>(?s)(XXX)</td>
+ </tr>
+ </tbody>
+</table>
+
+<h4>Results</h4>
+<table>
+ <thead>
+ <th>Attribute Name</th>
+ <th>Attribute Value</th>
+ </thead>
+ <tbody>
+ <tr>
+ <td>regex.result1</td>
+ <td><code>
+ <pre>foo\r\nbar1\r\nbar2\r\nbar3\r\nhello\r\nworld\r\n</pre>
+ </code></td>
+ </tr>
+ <tr>
+ <td>regex.result2</td>
+ <td>bar1</td>
+ </tr>
+ <tr>
+ <td>regex.result3</td>
+ <td>bar1</td>
+ </tr>
+ <tr>
+ <td>regex.result4</td>
+ <td>bar2</td>
+ </tr>
+ <tr>
+ <td>regex.result4.0</td>
+ <td><code>
+ <pre>foo\r\nbar1\r\nbar2\r\nbar3\r\nhello\r\nworld\r\n</pre>
+ </code></td>
+ </tr>
+ <tr>
+ <td>regex.result4.1</td>
+ <td>bar2</td>
+ </tr>
+ <tr>
+ <td>regex.result4.2</td>
+ <td>bar3</td>
+ </tr>
+ <tr>
+ <td>regex.result5</td>
+ <td>bar3</td>
+ </tr>
+ <tr>
+ <td>regex.result6</td>
+ <td><code>
+ <pre>foo\r\nbar1\r\nbar2\r\nbar3\r\nhello\r\nworld\r\n</pre>
+ </code></td>
+ </tr>
+ <tr>
+ <td>regex.result7</td>
+ <td></td>
+ </tr>
+ </tbody>
+</table>
+
+<h3>With named capture groups</h3>
+<h4>Configuration</h4>
+<table>
+ <thead>
+ <th>Property Name</th>
+ <th>Property Value</th>
+ </thead>
+ <tbody>
+ <tr>
+ <td>Enable named group support</td>
+ <td>True</td>
+ </tr>
+ <tr>
+ <td>regex.result1</td>
+ <td>(?s)(?&lt;ALL&gt;.*)</td>
+ </tr>
+ <tr>
+ <td>regex.result2</td>
+ <td>(?s).*(?&lt;BAR1&gt;bar1).*</td>
+ </tr>
+ <tr>
+ <td>regex.result3</td>
+ <td>(?s).*?(?&lt;BAR1&gt;bar\d).*</td>
+ </tr>
+ <tr>
+ <td>regex.result4</td>
+ <td>(?s).*?(?:bar\d).*?(?&lt;BAR2&gt;bar\d).*?(?&lt;BAR3&gt;bar3).*</td>
+ </tr>
+ <tr>
+ <td>regex.result5</td>
+ <td>(?s).*(?&lt;BAR3&gt;bar\d).*</td>
+ </tr>
+ <tr>
+ <td>regex.result6</td>
+ <td>(?s)^(?&lt;ALL&gt;.*)$</td>
+ </tr>
+ <tr>
+ <td>regex.result7</td>
+ <td>(?s)(?&lt;MISS&gt;XXX)</td>
+ </tr>
+ </tbody>
+</table>
+
+<h4>Results</h4>
+<table>
+ <thead>
+ <th>Attribute Name</th>
+ <th>Attribute Value</th>
+ </thead>
+ <tbody>
+ <tr>
+ <td>regex.result1.ALL</td>
+ <td><code>
+ <pre>foo\r\nbar1\r\nbar2\r\nbar3\r\nhello\r\nworld\r\n</pre>
+ </code></td>
+ </tr>
+ <tr>
+ <td>regex.result2.BAR1</td>
+ <td>bar1</td>
+ </tr>
+ <tr>
+ <td>regex.result3.BAR1</td>
+ <td>bar1</td>
+ </tr>
+ <tr>
+ <td>regex.result4.BAR2</td>
+ <td>bar2</td>
+ </tr>
+ <tr>
+ <td>regex.result4.BAR2</td>
+ <td>bar2</td>
+ </tr>
+ <tr>
+ <td>regex.result4.BAR3</td>
+ <td>bar3</td>
+ </tr>
+ <tr>
+ <td>regex.result5.BAR3</td>
+ <td>bar3</td>
+ </tr>
+ <tr>
+ <td>regex.result6.ALL</td>
+ <td><code>
+ <pre>foo\r\nbar1\r\nbar2\r\nbar3\r\nhello\r\nworld\r\n</pre>
+ </code></td>
+ </tr>
+ <tr>
+ <td>regex.result7.MISS</td>
+ <td></td>
+ </tr>
+ </tbody>
+</table>
+</body>
+</html>
diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestExtractText.java b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestExtractText.java
index e323c2d..7a1696b 100644
--- a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestExtractText.java
+++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestExtractText.java
@@ -20,6 +20,7 @@ import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.UnsupportedEncodingException;
+import java.nio.charset.StandardCharsets;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.nifi.processor.Relationship;
@@ -45,7 +46,7 @@ public class TestExtractText {
testRunner.setProperty("regex.result6", "(?s)^(.*)$");
testRunner.setProperty("regex.result7", "(?s)(XXX)");
- testRunner.enqueue(SAMPLE_STRING.getBytes("UTF-8"));
+ testRunner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
testRunner.run();
testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
@@ -99,7 +100,7 @@ public class TestExtractText {
testRunner.setProperty("regex.result6", "^(.*)$");
testRunner.setProperty("regex.result7", "^(XXX)$");
- testRunner.enqueue(SAMPLE_STRING.getBytes("UTF-8"));
+ testRunner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
testRunner.run();
testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
@@ -131,7 +132,7 @@ public class TestExtractText {
testRunner.setProperty("regex.result6", "^(.*)$");
testRunner.setProperty("regex.result7", "^(XXX)$");
- testRunner.enqueue(SAMPLE_STRING.getBytes("UTF-8"));
+ testRunner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
testRunner.run();
testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
@@ -165,7 +166,7 @@ public class TestExtractText {
testRunner.setProperty("regex.result6", "^(.*)$");
testRunner.setProperty("regex.result7", "^(XXX)$");
- testRunner.enqueue(SAMPLE_STRING.getBytes("UTF-8"));
+ testRunner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
testRunner.run();
testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
@@ -198,7 +199,7 @@ public class TestExtractText {
testRunner.setProperty("regex.result5b", "(?:bar\\d\\r?\\n)*(bar\\d)");
testRunner.setProperty("regex.result7", "^(XXX)$");
- testRunner.enqueue("YYY".getBytes("UTF-8"));
+ testRunner.enqueue("YYY".getBytes(StandardCharsets.UTF_8));
testRunner.run();
testRunner.assertAllFlowFilesTransferred(ExtractText.REL_NO_MATCH, 1);
@@ -232,7 +233,7 @@ public class TestExtractText {
testRunner.setProperty("regex.result1", "(foo)");
testRunner.setProperty("regex.result2", "(world)");
- testRunner.enqueue(SAMPLE_STRING.getBytes("UTF-8"));
+ testRunner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
testRunner.run();
testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
@@ -312,7 +313,7 @@ public class TestExtractText {
final ExtractText processor = new ExtractText();
final TestRunner testRunner = TestRunners.newTestRunner(processor);
- testRunner.enqueue("foo".getBytes("UTF-8"));
+ testRunner.enqueue("foo".getBytes(StandardCharsets.UTF_8));
testRunner.run();
Set<Relationship> relationships = processor.getRelationships();
@@ -329,7 +330,7 @@ public class TestExtractText {
testRunner.setProperty(attributeKey, "(?s)(.*)");
- testRunner.enqueue(SAMPLE_STRING.getBytes("UTF-8"));
+ testRunner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
testRunner.run();
testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
@@ -346,7 +347,7 @@ public class TestExtractText {
testRunner.setProperty(ExtractText.ENABLE_REPEATING_CAPTURE_GROUP, "true");
final String attributeKey = "regex.result";
testRunner.setProperty(attributeKey, "(?s)(\\w+)");
- testRunner.enqueue("This is my text".getBytes("UTF-8"));
+ testRunner.enqueue("This is my text".getBytes(StandardCharsets.UTF_8));
testRunner.run();
testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
@@ -370,7 +371,7 @@ public class TestExtractText {
testRunner.setProperty(ExtractText.ENABLE_REPEATING_CAPTURE_GROUP, "true");
final String attributeKey = "regex.result";
testRunner.setProperty(attributeKey, "(\\w+)=(\\d+)");
- testRunner.enqueue("a=1,b=10,c=100".getBytes("UTF-8"));
+ testRunner.enqueue("a=1,b=10,c=100".getBytes(StandardCharsets.UTF_8));
testRunner.run();
testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
@@ -403,7 +404,7 @@ public class TestExtractText {
testRunner.setProperty(attributeKey, "(?s)(.*)");
- testRunner.enqueue(SAMPLE_STRING.getBytes("UTF-8"));
+ testRunner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
testRunner.run();
testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
@@ -422,7 +423,7 @@ public class TestExtractText {
testRunner.setProperty(attributeKey, "(?s).*");
// Act
- testRunner.enqueue(SAMPLE_STRING.getBytes("UTF-8"));
+ testRunner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
testRunner.run();
// Assert
@@ -443,7 +444,7 @@ public class TestExtractText {
testRunner.setProperty(attributeKey, "(?s).*");
// Act
- testRunner.enqueue(SAMPLE_STRING.getBytes("UTF-8"));
+ testRunner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
// Validation should fail because nothing will match
testRunner.run();
diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestExtractTextNamedGroups.java b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestExtractTextNamedGroups.java
new file mode 100644
index 0000000..9be1c47
--- /dev/null
+++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestExtractTextNamedGroups.java
@@ -0,0 +1,397 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nifi.processors.standard;
+
+import static org.apache.nifi.processors.standard.ExtractText.ENABLE_NAMED_GROUPS;
+
+import java.io.UnsupportedEncodingException;
+import java.nio.charset.StandardCharsets;
+
+import org.apache.nifi.util.MockFlowFile;
+import org.apache.nifi.util.TestRunner;
+import org.apache.nifi.util.TestRunners;
+import org.junit.Test;
+
+public class TestExtractTextNamedGroups {
+
+ final String SAMPLE_STRING = "foo\r\nbar1\r\nbar2\r\nbar3\r\nhello\r\nworld\r\n";
+
+ @Test
+ public void testProcessor() throws Exception {
+
+ final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
+
+ testRunner.setProperty(ENABLE_NAMED_GROUPS, "true");
+ testRunner.setProperty("regex.result1", "(?s)(?<ALL>.*)");
+ testRunner.setProperty("regex.result2", "(?s).*(?<BAR1>bar1).*");
+ testRunner.setProperty("regex.result3", "(?s).*?(?<BAR1>bar\\d).*");
+ testRunner.setProperty("regex.result4", "(?s).*?(?:bar\\d).*?(?<BAR2>bar\\d).*?(?<BAR3>bar3).*");
+ testRunner.setProperty("regex.result5", "(?s).*(?<BAR3>bar\\d).*");
+ testRunner.setProperty("regex.result6", "(?s)^(?<ALL>.*)$");
+ testRunner.setProperty("regex.result7", "(?s)(?<MISS>XXX)");
+ testRunner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
+ testRunner.run();
+
+ testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
+ final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
+ java.util.Map<String,String> attributes = out.getAttributes();
+ out.assertAttributeEquals("regex.result1.ALL", SAMPLE_STRING);
+ out.assertAttributeEquals("regex.result2.BAR1", "bar1");
+ out.assertAttributeEquals("regex.result3.BAR1", "bar1");
+ out.assertAttributeEquals("regex.result4.BAR2", "bar2");
+ out.assertAttributeEquals("regex.result4.BAR3", "bar3");
+ out.assertAttributeEquals("regex.result5.BAR3", "bar3");
+ out.assertAttributeEquals("regex.result6.ALL", SAMPLE_STRING);
+ out.assertAttributeEquals("regex.result7.MISS", null);
+ }
+
+ @Test
+ public void testWithUnmatchedOptionalCapturingGroup() {
+ final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
+ testRunner.setProperty(ENABLE_NAMED_GROUPS, "true");
+ testRunner.setProperty("regex", "abc(?<DEF>def)?(?<G>g)");
+ testRunner.enqueue("abcg");
+ testRunner.run();
+
+ testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
+ final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
+ out.assertAttributeNotExists("regex.DEF");
+ out.assertAttributeEquals("regex.G", "g");
+
+ testRunner.clearTransferState();
+
+ testRunner.enqueue("abcdefg");
+ testRunner.run();
+ final MockFlowFile out2 = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
+ out2.assertAttributeEquals("regex.DEF", "def");
+ out2.assertAttributeEquals("regex.G", "g");
+ }
+
+ @Test
+ public void testProcessorWithDotall() throws Exception {
+
+ final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
+ testRunner.setProperty(ENABLE_NAMED_GROUPS, "true");
+
+ testRunner.setProperty(ExtractText.DOTALL, "true");
+
+ testRunner.setProperty("regex.result1", "(?<TOUT>.*)");
+ testRunner.setProperty("regex.result2", ".*(?<BAR1>bar1).*");
+ testRunner.setProperty("regex.result3", ".*?(?<BAR1>bar\\d).*"); // reluctant gets first
+ testRunner.setProperty("regex.result4", ".*?(?:bar\\d).*?(?<BAR2>bar\\d).*"); // reluctant w/ repeated pattern gets second
+ testRunner.setProperty("regex.result5", ".*(?<BAR3>bar\\d).*"); // greedy gets last
+ testRunner.setProperty("regex.result6", "^(?<TOUT>.*)$");
+ testRunner.setProperty("regex.result7", "^(?<NO>XXX)$");
+
+ testRunner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
+ testRunner.run();
+
+ testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
+ final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
+ out.assertAttributeEquals("regex.result1.TOUT", SAMPLE_STRING);
+ out.assertAttributeEquals("regex.result2.BAR1", "bar1");
+ out.assertAttributeEquals("regex.result3.BAR1", "bar1");
+ out.assertAttributeEquals("regex.result4.BAR2", "bar2");
+ out.assertAttributeEquals("regex.result5.BAR3", "bar3");
+ out.assertAttributeEquals("regex.result6.TOUT", SAMPLE_STRING);
+ out.assertAttributeEquals("regex.result7.NO", null);
+
+ }
+
+ @Test
+ public void testProcessorWithMultiline() throws Exception {
+
+ final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
+ testRunner.setProperty(ENABLE_NAMED_GROUPS, "true");
+
+ testRunner.setProperty(ExtractText.MULTILINE, "true");
+
+ testRunner.setProperty("regex.result1", "(?<ALL>.*)");
+ testRunner.setProperty("regex.result2", "(?<BAR1>bar1)");
+ testRunner.setProperty("regex.result3", ".*?(?<BAR1>bar\\d).*");
+ testRunner.setProperty("regex.result4", ".*?(?:bar\\d).*?(?<NULL>bar\\d).*");
+ testRunner.setProperty("regex.result4b", "bar\\d\\r\\n(?<BAR2>bar\\d)");
+ testRunner.setProperty("regex.result5", ".*(?<BAR2>bar\\d).*");
+ testRunner.setProperty("regex.result5b", "(?:bar\\d\\r?\\n)*(?<BAR3>bar\\d)");
+ testRunner.setProperty("regex.result6", "^(?<ALL>.*)$");
+ testRunner.setProperty("regex.result7", "^(?<NO>XXX)$");
+
+ testRunner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
+ testRunner.run();
+
+ testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
+ final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
+ out.assertAttributeEquals("regex.result1.ALL", "foo"); // matches everything on the first line
+ out.assertAttributeEquals("regex.result2.BAR1", "bar1");
+ out.assertAttributeEquals("regex.result3.BAR1", "bar1");
+ out.assertAttributeEquals("regex.result4.NULL", null); // null because no line has two bar's
+ out.assertAttributeEquals("regex.result4b.BAR2", "bar2"); // included newlines in regex
+ out.assertAttributeEquals("regex.result5.BAR2", "bar1"); //still gets first because no lines with multiple bar's
+ out.assertAttributeEquals("regex.result5b.BAR3", "bar3"); // included newlines in regex
+ out.assertAttributeEquals("regex.result6.ALL", "foo"); // matches all of first line
+ out.assertAttributeEquals("regex.result7.NO", null); // no match
+ }
+
+ @Test
+ public void testProcessorWithMultilineAndDotall() throws Exception {
+
+ final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
+ testRunner.setProperty(ENABLE_NAMED_GROUPS, "true");
+
+ testRunner.setProperty(ExtractText.MULTILINE, "true");
+ testRunner.setProperty(ExtractText.DOTALL, "true");
+
+ testRunner.setProperty("regex.result1", "(?<ALL>.*)");
+ testRunner.setProperty("regex.result2", "(?<BAR1>bar1)");
+ testRunner.setProperty("regex.result3", ".*?(?<BAR1>bar\\d).*");
+ testRunner.setProperty("regex.result4", ".*?(?:bar\\d).*?(?<BAR2>bar\\d).*");
+ testRunner.setProperty("regex.result4b", "bar\\d\\r\\n(?<BAR2>bar\\d)");
+ testRunner.setProperty("regex.result5", ".*(?<BAR3>bar\\d).*");
+ testRunner.setProperty("regex.result5b", "(?:bar\\d\\r?\\n)*(?<BAR3>bar\\d)");
+ testRunner.setProperty("regex.result6", "^(?<ALL>.*)$");
+ testRunner.setProperty("regex.result7", "^(?<MISS>XXX)$");
+
+ testRunner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
+ testRunner.run();
+
+ testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
+ final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
+
+ out.assertAttributeEquals("regex.result1.ALL", SAMPLE_STRING);
+ out.assertAttributeEquals("regex.result2.BAR1", "bar1");
+ out.assertAttributeEquals("regex.result3.BAR1", "bar1");
+ out.assertAttributeEquals("regex.result4.BAR2", "bar2");
+ out.assertAttributeEquals("regex.result4b.BAR2", "bar2");
+ out.assertAttributeEquals("regex.result5.BAR3", "bar3");
+ out.assertAttributeEquals("regex.result5b.BAR3", "bar3");
+ out.assertAttributeEquals("regex.result6.ALL", SAMPLE_STRING);
+ out.assertAttributeEquals("regex.result7.MISS", null);
+ }
+
+ @Test
+ public void testProcessorWithNoMatches() throws Exception {
+
+ final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
+ testRunner.setProperty(ENABLE_NAMED_GROUPS, "true");
+
+ testRunner.setProperty(ExtractText.MULTILINE, "true");
+ testRunner.setProperty(ExtractText.DOTALL, "true");
+
+ testRunner.setProperty("regex.result2", "(?<NONE>bar1)");
+ testRunner.setProperty("regex.result3", ".*?(?<NONE>bar\\d).*");
+ testRunner.setProperty("regex.result4", ".*?(?:bar\\d).*?(?<NONE>bar\\d).*");
+ testRunner.setProperty("regex.result4b", "bar\\d\\r\\n(?<NONE>bar\\d)");
+ testRunner.setProperty("regex.result5", ".*(?<NONE>bar\\d).*");
+ testRunner.setProperty("regex.result5b", "(?:bar\\d\\r?\\n)*(?<NONE>bar\\d)");
+ testRunner.setProperty("regex.result7", "^(?<NONE>XXX)$");
+
+ testRunner.enqueue("YYY".getBytes(StandardCharsets.UTF_8));
+ testRunner.run();
+
+ testRunner.assertAllFlowFilesTransferred(ExtractText.REL_NO_MATCH, 1);
+ final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_NO_MATCH).get(0);
+
+ out.assertAttributeEquals("regex.result1.NONE", null);
+ out.assertAttributeEquals("regex.result2.NONE", null);
+ out.assertAttributeEquals("regex.result3.NONE", null);
+ out.assertAttributeEquals("regex.result4.NONE", null);
+ out.assertAttributeEquals("regex.result4b.NONE", null);
+ out.assertAttributeEquals("regex.result5.NONE", null);
+ out.assertAttributeEquals("regex.result5b.NONE", null);
+ out.assertAttributeEquals("regex.result6.NONE", null);
+ out.assertAttributeEquals("regex.result7.NONE", null);
+ }
+
+ @Test
+ public void testNoFlowFile() throws UnsupportedEncodingException {
+ final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
+ testRunner.setProperty(ENABLE_NAMED_GROUPS, "true");
+ testRunner.run();
+ testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 0);
+
+ }
+
+ @Test
+ public void testMatchOutsideBuffer() throws Exception {
+ // Cap the buffer at 3 bytes so that only the leading "foo" is read.
+ final TestRunner runner = TestRunners.newTestRunner(new ExtractText());
+ runner.setProperty(ENABLE_NAMED_GROUPS, "true");
+ runner.setProperty(ExtractText.MAX_BUFFER_SIZE, "3 B");
+ runner.setProperty("regex.result1", "(?<FOO>foo)");
+ runner.setProperty("regex.result2", "(?<WORLD>world)");
+
+ final byte[] content = SAMPLE_STRING.getBytes(StandardCharsets.UTF_8);
+ runner.enqueue(content);
+ runner.run();
+
+ runner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
+ final MockFlowFile matched = runner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
+ matched.assertAttributeEquals("regex.result1.FOO", "foo");
+ // null because "world" lies outside the 3-byte buffer
+ matched.assertAttributeEquals("regex.result2.WORLD", null);
+ }
+
+ @Test
+ public void testIncludeZeroCaptureGroupProperty() throws Exception {
+ // INCLUDE_CAPTURE_GROUP_ZERO is left unset here, so the assertions below rely on its default.
+ final String baseKey = "regex.result";
+ final String namedKey = baseKey + ".ALL";
+ final TestRunner runner = TestRunners.newTestRunner(new ExtractText());
+ runner.setProperty(ENABLE_NAMED_GROUPS, "true");
+ runner.setProperty(baseKey, "(?s)(?<ALL>.*)");
+
+ runner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
+ runner.run();
+
+ runner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
+ final MockFlowFile matched = runner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
+
+ // Both the zero capture group (bare key) and the named group must be present with the full text.
+ matched.assertAttributeExists(baseKey);
+ matched.assertAttributeExists(namedKey);
+ matched.assertAttributeEquals(baseKey, SAMPLE_STRING);
+ matched.assertAttributeEquals(namedKey, SAMPLE_STRING);
+ }
+
+ @Test
+ public void testFindAll() throws Exception {
+ // Repeating capture is on: each successive word is expected under W, W.1, W.2, W.3.
+ final String baseKey = "regex.result";
+ final String[] suffixes = {"", ".W", ".W.1", ".W.2", ".W.3"};
+ final String[] expected = {"This", "This", "is", "my", "text"};
+
+ final TestRunner runner = TestRunners.newTestRunner(new ExtractText());
+ runner.setProperty(ENABLE_NAMED_GROUPS, "true");
+ runner.setProperty(ExtractText.ENABLE_REPEATING_CAPTURE_GROUP, "true");
+ runner.setProperty(baseKey, "(?s)(?<W>\\w+)");
+ runner.enqueue("This is my text".getBytes(StandardCharsets.UTF_8));
+ runner.run();
+ runner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
+ final MockFlowFile matched = runner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
+ // Check presence of every attribute (zero capture group included) first, then the values.
+ for (final String suffix : suffixes) {
+ matched.assertAttributeExists(baseKey + suffix);
+ }
+ for (int i = 0; i < suffixes.length; i++) {
+ matched.assertAttributeEquals(baseKey + suffixes[i], expected[i]);
+ }
+ }
+
+ @Test
+ public void testFindAllPair() throws Exception {
+ // Three "name=number" pairs: the first is expected under LEFT/RIGHT, the rest under .1 and .2.
+ final String baseKey = "regex.result";
+ final String[] suffixes = {"", ".LEFT", ".RIGHT", ".LEFT.1", ".RIGHT.1", ".LEFT.2", ".RIGHT.2"};
+ final String[] expected = {"a=1", "a", "1", "b", "10", "c", "100"};
+
+ final TestRunner runner = TestRunners.newTestRunner(new ExtractText());
+ runner.setProperty(ENABLE_NAMED_GROUPS, "true");
+ runner.setProperty(ExtractText.ENABLE_REPEATING_CAPTURE_GROUP, "true");
+ runner.setProperty(baseKey, "(?<LEFT>\\w+)=(?<RIGHT>\\d+)");
+ runner.enqueue("a=1,b=10,c=100".getBytes(StandardCharsets.UTF_8));
+ runner.run();
+ runner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
+ final MockFlowFile matched = runner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
+
+ // Check presence of every attribute (zero capture group included) first.
+ for (final String suffix : suffixes) {
+ matched.assertAttributeExists(baseKey + suffix);
+ }
+
+ // The input holds only three pairs, so there must be no fourth set of attributes.
+ matched.assertAttributeNotExists(baseKey + ".LEFT.3");
+ matched.assertAttributeNotExists(baseKey + ".RIGHT.3");
+
+ for (int i = 0; i < suffixes.length; i++) {
+ matched.assertAttributeEquals(baseKey + suffixes[i], expected[i]);
+ }
+ }
+
+ @Test
+ public void testIgnoreZeroCaptureGroupProperty() throws Exception {
+ final String baseKey = "regex.result";
+
+ final TestRunner runner = TestRunners.newTestRunner(new ExtractText());
+ runner.setProperty(ENABLE_NAMED_GROUPS, "true");
+ // Switch off capture group zero; the regex still has one named group to extract.
+ runner.setProperty(ExtractText.INCLUDE_CAPTURE_GROUP_ZERO, "false");
+ runner.setProperty(baseKey, "(?s)(?<ALL>.*)");
+
+ runner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
+ runner.run();
+
+ runner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
+ final MockFlowFile matched = runner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
+
+ // Ensure the zero capture group (the bare key) is not in the resultant attributes,
+ // while the named group still carries the full text.
+ matched.assertAttributeNotExists(baseKey);
+ matched.assertAttributeEquals(baseKey + ".ALL", SAMPLE_STRING);
+ }
+
+ @Test
+ public void testShouldAllowNoCaptureGroups() throws Exception {
+ // Arrange: a regex with no capture groups, named or otherwise
+ final String baseKey = "regex.result";
+ final TestRunner runner = TestRunners.newTestRunner(new ExtractText());
+ runner.setProperty(ENABLE_NAMED_GROUPS, "true");
+ runner.setProperty(baseKey, "(?s).*");
+
+ // Act
+ final byte[] content = SAMPLE_STRING.getBytes(StandardCharsets.UTF_8);
+ runner.enqueue(content);
+ runner.run();
+
+ // Assert
+ runner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
+ final MockFlowFile matched = runner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
+
+ // With no capture group in the regex, the bare key is expected to hold the whole match.
+ matched.assertAttributeEquals(baseKey, SAMPLE_STRING);
+ }
+
+ @Test
+ public void testShouldNotAllowNoCaptureGroupsIfZeroDisabled() throws Exception {
+ // Arrange: capture group zero is disabled and the only regex has no capture
+ // groups, which leaves the processor with nothing it could extract.
+ final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
+ testRunner.setProperty(ENABLE_NAMED_GROUPS, "true");
+ testRunner.setProperty(ExtractText.INCLUDE_CAPTURE_GROUP_ZERO, "false");
+ final String attributeKey = "regex.result";
+ testRunner.setProperty(attributeKey, "(?s).*");
+
+ // Assert: check validity directly rather than expecting an AssertionError from
+ // run(); a method-wide expected = AssertionError.class would also be satisfied
+ // by an unrelated assertion failure raised anywhere else in this test.
+ testRunner.assertNotValid();
+ }
+
+ @Test
+ public void testInvalidIfGroupCountsDoNotMatch() {
+ final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
+ testRunner.setProperty(ENABLE_NAMED_GROUPS, "true");
+ testRunner.setProperty(ExtractText.INCLUDE_CAPTURE_GROUP_ZERO, "false");
+ final String attributeKey = "notValidOne";
+ // Three capture groups, but only the last one (END) is named.
+ testRunner.setProperty(attributeKey,"^(beginning)\\s(middle)\\s(?<END>end)$");
+
+ // Validation should fail because the number of groups does not match the number
+ // of named groups. Asserting that directly is narrower than a method-wide
+ // expected = AssertionError.class, which any stray assertion failure satisfies.
+ testRunner.assertNotValid();
+ }
+}