You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nifi.apache.org by sz...@apache.org on 2022/04/26 14:39:29 UTC
[nifi-minifi-cpp] 02/03: MINIFICPP-1675 Use regex.h instead of std::regex when using libstdc++
This is an automated email from the ASF dual-hosted git repository.
szaszm pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/nifi-minifi-cpp.git
commit b0f0873d74ea7a19d4c2cda7ed79a99ea46bd88d
Author: Gabor Gyimesi <ga...@gmail.com>
AuthorDate: Tue Apr 26 16:04:05 2022 +0200
MINIFICPP-1675 Use regex.h instead of std::regex when using libstdc++
Due to a bug in the libstdc++ implementation of std::regex, it is unsafe
to use std::regex_match and std::regex_search on large texts, as the
process can crash due to a stack overflow. In this commit, std::regex
usages are replaced with minifi::utils::Regex, which uses regex.h if
libstdc++ is used; otherwise it uses std::regex.
There are a few exceptions:
- std::regex is still used for regex_replace calls
- AppendHostInfo still uses std::regex, as the POSIX regex grammar is not
sufficient for easily filtering interface names, and there is no risk
of running into the bug because the interface names are short
- In tests and mocks we still use std::regex, as they are only run in our
test environments with controlled inputs and sometimes use regex
patterns not available in the POSIX grammar
Closes #1300
Signed-off-by: Marton Szasz <sz...@apache.org>
---
encrypt-config/tests/ConfigFileEncryptorTests.cpp | 6 +-
extensions/aws/s3/S3Wrapper.cpp | 8 +-
.../azure/processors/ListAzureDataLakeStorage.cpp | 6 +-
extensions/azure/storage/AzureDataLakeStorage.cpp | 10 +-
extensions/azure/storage/DataLakeStorageClient.h | 6 +-
extensions/civetweb/processors/ListenHTTP.cpp | 4 +-
extensions/civetweb/processors/ListenHTTP.h | 6 +-
extensions/expression-language/Expression.cpp | 17 +-
extensions/http-curl/client/HTTPClient.cpp | 8 +-
.../KubernetesControllerService.cpp | 10 +-
.../KubernetesControllerService.h | 7 +-
extensions/librdkafka/PublishKafka.cpp | 8 +-
extensions/librdkafka/PublishKafka.h | 4 +-
extensions/pcap/CapturePacket.cpp | 8 +-
extensions/sftp/processors/ListSFTP.cpp | 13 +-
extensions/sftp/processors/ListSFTP.h | 6 +-
.../processors/AttributesToJSON.cpp | 4 +-
.../processors/AttributesToJSON.h | 4 +-
.../processors/DefragmentText.cpp | 6 +-
.../processors/DefragmentText.h | 4 +-
.../standard-processors/processors/ExtractText.cpp | 14 +-
.../standard-processors/processors/GetFile.cpp | 6 +-
.../standard-processors/processors/ListenSyslog.h | 1 +
.../standard-processors/processors/RouteText.cpp | 25 +-
.../standard-processors/processors/RouteText.h | 4 +-
.../standard-processors/processors/TailFile.cpp | 18 +-
.../tests/unit/ExtractTextTests.cpp | 32 +++
.../tests/unit/RouteTextTests.cpp | 6 +-
.../tests/unit/TailFileTests.cpp | 10 +-
.../windows-event-log/wel/MetadataWalker.cpp | 4 +-
extensions/windows-event-log/wel/MetadataWalker.h | 4 +-
libminifi/include/utils/HTTPUtils.h | 8 +-
libminifi/include/utils/RegexUtils.h | 173 ++++++++++++
libminifi/include/utils/StringUtils.h | 9 -
libminifi/src/core/yaml/YamlConfiguration.cpp | 10 +-
libminifi/src/utils/RegexUtils.cpp | 295 +++++++++++++++++++++
libminifi/src/utils/StringUtils.cpp | 10 -
libminifi/test/unit/RegexUtilsTests.cpp | 129 +++++++++
libminifi/test/unit/StringUtilsTests.cpp | 23 --
39 files changed, 758 insertions(+), 168 deletions(-)
diff --git a/encrypt-config/tests/ConfigFileEncryptorTests.cpp b/encrypt-config/tests/ConfigFileEncryptorTests.cpp
index 32894da4a..61439ed73 100644
--- a/encrypt-config/tests/ConfigFileEncryptorTests.cpp
+++ b/encrypt-config/tests/ConfigFileEncryptorTests.cpp
@@ -17,11 +17,11 @@
#include <fstream>
#include <optional>
-#include <regex>
#include <string>
#include "ConfigFileEncryptor.h"
#include "properties/Configuration.h"
+#include "utils/RegexUtils.h"
#include "TestBase.h"
#include "Catch.h"
@@ -46,8 +46,8 @@ bool check_encryption(const ConfigFile& test_file, const std::string& property_n
auto length = base64_length(utils::crypto::EncryptionType::nonceLength()) +
utils::crypto::EncryptionType::separator().size() +
base64_length(original_value_length + utils::crypto::EncryptionType::macLength());
- std::regex pattern("[0-9A-Za-z/+=|]{" + std::to_string(length) + "}");
- return std::regex_match(*encrypted_value, pattern);
+ utils::Regex pattern("[0-9A-Za-z/+=|]{" + std::to_string(length) + "}");
+ return utils::regexMatch(*encrypted_value, pattern);
}
} // namespace
diff --git a/extensions/aws/s3/S3Wrapper.cpp b/extensions/aws/s3/S3Wrapper.cpp
index fc6fa9a9e..757bf0210 100644
--- a/extensions/aws/s3/S3Wrapper.cpp
+++ b/extensions/aws/s3/S3Wrapper.cpp
@@ -20,7 +20,6 @@
#include "S3Wrapper.h"
#include <memory>
-#include <regex>
#include <utility>
#include <vector>
@@ -28,6 +27,7 @@
#include "utils/StringUtils.h"
#include "utils/file/FileUtils.h"
#include "utils/gsl.h"
+#include "utils/RegexUtils.h"
namespace org {
namespace apache {
@@ -56,9 +56,9 @@ void S3Wrapper::setCannedAcl(Aws::S3::Model::PutObjectRequest& request, const st
}
Expiration S3Wrapper::getExpiration(const std::string& expiration) {
- std::regex expr("expiry-date=\"(.*)\", rule-id=\"(.*)\"");
- std::smatch matches;
- const bool matched = std::regex_search(expiration, matches, expr);
+ minifi::utils::Regex expr("expiry-date=\"(.*)\", rule-id=\"(.*)\"");
+ minifi::utils::SMatch matches;
+ const bool matched = minifi::utils::regexSearch(expiration, matches, expr);
if (!matched || matches.size() < 3)
return Expiration{};
return Expiration{matches[1], matches[2]};
diff --git a/extensions/azure/processors/ListAzureDataLakeStorage.cpp b/extensions/azure/processors/ListAzureDataLakeStorage.cpp
index 187dbf38c..685a93f63 100644
--- a/extensions/azure/processors/ListAzureDataLakeStorage.cpp
+++ b/extensions/azure/processors/ListAzureDataLakeStorage.cpp
@@ -112,16 +112,16 @@ std::optional<storage::ListAzureDataLakeStorageParameters> ListAzureDataLakeStor
return std::nullopt;
}
- auto createFilterRegex = [&context](const std::string& property_name) -> std::optional<std::regex> {
+ auto createFilterRegex = [&context](const std::string& property_name) -> std::optional<minifi::utils::Regex> {
try {
std::string filter_str;
context.getProperty(property_name, filter_str);
if (!filter_str.empty()) {
- return std::regex(filter_str);
+ return minifi::utils::Regex(filter_str);
}
return std::nullopt;
- } catch (const std::regex_error&) {
+ } catch (const minifi::Exception&) {
throw Exception(PROCESS_SCHEDULE_EXCEPTION, property_name + " regex is invalid");
}
};
diff --git a/extensions/azure/storage/AzureDataLakeStorage.cpp b/extensions/azure/storage/AzureDataLakeStorage.cpp
index 9c0ce6345..0ba2aeb73 100644
--- a/extensions/azure/storage/AzureDataLakeStorage.cpp
+++ b/extensions/azure/storage/AzureDataLakeStorage.cpp
@@ -20,7 +20,6 @@
#include "AzureDataLakeStorage.h"
-#include <regex>
#include <string_view>
#include "AzureDataLakeStorageClient.h"
@@ -29,11 +28,12 @@
#include "utils/StringUtils.h"
#include "utils/gsl.h"
#include "utils/GeneralUtils.h"
+#include "utils/RegexUtils.h"
namespace org::apache::nifi::minifi::azure::storage {
namespace {
-bool matchesPathFilter(std::string_view base_directory, const std::optional<std::regex>& path_regex, std::string path) {
+bool matchesPathFilter(std::string_view base_directory, const std::optional<minifi::utils::Regex>& path_regex, std::string path) {
gsl_Expects(utils::implies(!base_directory.empty(), minifi::utils::StringUtils::startsWith(path, base_directory)));
if (!path_regex) {
return true;
@@ -43,15 +43,15 @@ bool matchesPathFilter(std::string_view base_directory, const std::optional<std:
path = path.size() == base_directory.size() ? "" : path.substr(base_directory.size() + 1);
}
- return std::regex_match(path, *path_regex);
+ return minifi::utils::regexMatch(path, *path_regex);
}
-bool matchesFileFilter(const std::optional<std::regex>& file_regex, const std::string& filename) {
+bool matchesFileFilter(const std::optional<minifi::utils::Regex>& file_regex, const std::string& filename) {
if (!file_regex) {
return true;
}
- return std::regex_match(filename, *file_regex);
+ return minifi::utils::regexMatch(filename, *file_regex);
}
} // namespace
diff --git a/extensions/azure/storage/DataLakeStorageClient.h b/extensions/azure/storage/DataLakeStorageClient.h
index a02bfca8f..38a0a07f7 100644
--- a/extensions/azure/storage/DataLakeStorageClient.h
+++ b/extensions/azure/storage/DataLakeStorageClient.h
@@ -23,7 +23,6 @@
#include <optional>
#include <memory>
#include <vector>
-#include <regex>
#include "AzureStorageCredentials.h"
@@ -31,6 +30,7 @@
#include "io/InputStream.h"
#include "azure/storage/files/datalake/protocol/datalake_rest_client.hpp"
#include "utils/Enum.h"
+#include "utils/RegexUtils.h"
namespace org::apache::nifi::minifi::azure::storage {
@@ -63,8 +63,8 @@ struct FetchAzureDataLakeStorageParameters : public AzureDataLakeStorageFileOper
struct ListAzureDataLakeStorageParameters : public AzureDataLakeStorageParameters {
bool recurse_subdirectories = true;
- std::optional<std::regex> path_regex;
- std::optional<std::regex> file_regex;
+ std::optional<minifi::utils::Regex> path_regex;
+ std::optional<minifi::utils::Regex> file_regex;
};
class DataLakeStorageClient {
diff --git a/extensions/civetweb/processors/ListenHTTP.cpp b/extensions/civetweb/processors/ListenHTTP.cpp
index 3853e5ff2..d88f0082a 100644
--- a/extensions/civetweb/processors/ListenHTTP.cpp
+++ b/extensions/civetweb/processors/ListenHTTP.cpp
@@ -319,7 +319,7 @@ void ListenHTTP::Handler::setHeaderAttributes(const mg_request_info *req_info, c
if (strcmp("filename", header->name) == 0) {
flow_file->setAttribute("filename", header->value);
- } else if (std::regex_match(header->name, headers_as_attrs_regex_)) {
+ } else if (utils::regexMatch(header->name, headers_as_attrs_regex_)) {
flow_file->setAttribute(header->name, header->value);
}
}
@@ -373,7 +373,7 @@ bool ListenHTTP::Handler::authRequest(mg_connection *conn, const mg_request_info
// If this is a two-way TLS connection, authorize the peer against the configured pattern
bool authorized = true;
if (req_info->is_ssl && req_info->client_cert != nullptr) {
- if (!std::regex_match(req_info->client_cert->subject, auth_dn_regex_)) {
+ if (!utils::regexMatch(req_info->client_cert->subject, auth_dn_regex_)) {
mg_printf(conn, "HTTP/1.1 403 Forbidden\r\n"
"Content-Type: text/html\r\n"
"Content-Length: 0\r\n\r\n");
diff --git a/extensions/civetweb/processors/ListenHTTP.h b/extensions/civetweb/processors/ListenHTTP.h
index a02f7a442..76c9e2c32 100644
--- a/extensions/civetweb/processors/ListenHTTP.h
+++ b/extensions/civetweb/processors/ListenHTTP.h
@@ -21,7 +21,6 @@
#include <map>
#include <memory>
-#include <regex>
#include <string>
#include <utility>
@@ -35,6 +34,7 @@
#include "utils/MinifiConcurrentQueue.h"
#include "utils/gsl.h"
#include "utils/Export.h"
+#include "utils/RegexUtils.h"
namespace org {
namespace apache {
@@ -116,8 +116,8 @@ class ListenHTTP : public core::Processor {
void enqueueRequest(mg_connection *conn, const mg_request_info *req_info, std::unique_ptr<io::BufferStream>);
std::string base_uri_;
- std::regex auth_dn_regex_;
- std::regex headers_as_attrs_regex_;
+ utils::Regex auth_dn_regex_;
+ utils::Regex headers_as_attrs_regex_;
core::ProcessContext *process_context_;
std::shared_ptr<core::logging::Logger> logger_ = core::logging::LoggerFactory<ListenHTTP>::getLogger();
std::map<std::string, ResponseBody> response_uri_map_;
diff --git a/extensions/expression-language/Expression.cpp b/extensions/expression-language/Expression.cpp
index e4168a765..fbce1b104 100644
--- a/extensions/expression-language/Expression.cpp
+++ b/extensions/expression-language/Expression.cpp
@@ -32,6 +32,7 @@
#include "utils/StringUtils.h"
#include "utils/OsUtils.h"
#include "expression/Expression.h"
+#include "utils/RegexUtils.h"
#ifndef DISABLE_CURL
#ifdef WIN32
@@ -822,16 +823,16 @@ Value expr_replaceEmpty(const std::vector<Value> &args) {
Value expr_matches(const std::vector<Value> &args) {
const auto &subject = args[0].asString();
- const std::regex expr = std::regex(args[1].asString());
+ const auto expr = utils::Regex(args[1].asString());
- return Value(std::regex_match(subject.begin(), subject.end(), expr));
+ return Value(utils::regexMatch(subject, expr));
}
Value expr_find(const std::vector<Value> &args) {
const auto &subject = args[0].asString();
- const std::regex expr = std::regex(args[1].asString());
+ const auto expr = utils::Regex(args[1].asString());
- return Value(std::regex_search(subject.begin(), subject.end(), expr));
+ return Value(utils::regexSearch(subject, expr));
}
#endif // EXPRESSION_LANGUAGE_USE_REGEX
@@ -1184,7 +1185,7 @@ Expression make_allMatchingAttributes(const std::string &function_name, const st
std::vector<Expression> out_exprs;
for (const auto &arg : args) {
- const std::regex attr_regex = std::regex(arg(params).asString());
+ const auto attr_regex = utils::Regex(arg(params).asString());
const auto cur_flow_file = params.flow_file.lock();
std::map<std::string, std::string> attrs;
@@ -1193,7 +1194,7 @@ Expression make_allMatchingAttributes(const std::string &function_name, const st
}
for (const auto &attr : attrs) {
- if (std::regex_match(attr.first.begin(), attr.first.end(), attr_regex)) {
+ if (utils::regexMatch(attr.first, attr_regex)) {
out_exprs.emplace_back(make_dynamic([=](const Parameters& /*params*/,
const std::vector<Expression>& /*sub_exprs*/) -> Value {
std::string attr_val;
@@ -1237,7 +1238,7 @@ Expression make_anyMatchingAttribute(const std::string &function_name, const std
std::vector<Expression> out_exprs;
for (const auto &arg : args) {
- const std::regex attr_regex = std::regex(arg(params).asString());
+ const auto attr_regex = utils::Regex(arg(params).asString());
const auto cur_flow_file = params.flow_file.lock();
std::map<std::string, std::string> attrs;
@@ -1246,7 +1247,7 @@ Expression make_anyMatchingAttribute(const std::string &function_name, const std
}
for (const auto &attr : attrs) {
- if (std::regex_match(attr.first.begin(), attr.first.end(), attr_regex)) {
+ if (utils::regexMatch(attr.first, attr_regex)) {
out_exprs.emplace_back(make_dynamic([=](const Parameters& /*params*/,
const std::vector<Expression>& /*sub_exprs*/) -> Value {
std::string attr_val;
diff --git a/extensions/http-curl/client/HTTPClient.cpp b/extensions/http-curl/client/HTTPClient.cpp
index a63c6bb66..16298aa24 100644
--- a/extensions/http-curl/client/HTTPClient.cpp
+++ b/extensions/http-curl/client/HTTPClient.cpp
@@ -23,12 +23,12 @@
#include <vector>
#include <string>
#include <algorithm>
-#include <regex>
#include "Exception.h"
#include "utils/gsl.h"
#include "utils/StringUtils.h"
#include "core/Resource.h"
+#include "utils/RegexUtils.h"
namespace org {
namespace apache {
@@ -394,9 +394,9 @@ bool HTTPClient::matches(const std::string &value, const std::string &sregex) {
if (sregex == ".*")
return true;
try {
- std::regex rgx(sregex);
- return std::regex_search(value, rgx);
- } catch (const std::regex_error &) {
+ utils::Regex rgx(sregex);
+ return utils::regexSearch(value, rgx);
+ } catch (const Exception &) {
return false;
}
}
diff --git a/extensions/kubernetes/controllerservice/KubernetesControllerService.cpp b/extensions/kubernetes/controllerservice/KubernetesControllerService.cpp
index 3322db063..18d8fbb8e 100644
--- a/extensions/kubernetes/controllerservice/KubernetesControllerService.cpp
+++ b/extensions/kubernetes/controllerservice/KubernetesControllerService.cpp
@@ -116,17 +116,17 @@ void KubernetesControllerService::onEnable() {
std::string namespace_filter;
if (getProperty(NamespaceFilter.getName(), namespace_filter) && !namespace_filter.empty()) {
- namespace_filter_ = std::regex{namespace_filter};
+ namespace_filter_ = utils::Regex{namespace_filter};
}
std::string pod_name_filter;
if (getProperty(PodNameFilter.getName(), pod_name_filter) && !pod_name_filter.empty()) {
- pod_name_filter_ = std::regex{pod_name_filter};
+ pod_name_filter_ = utils::Regex{pod_name_filter};
}
std::string container_name_filter;
if (getProperty(ContainerNameFilter.getName(), container_name_filter) && !container_name_filter.empty()) {
- container_name_filter_ = std::regex{container_name_filter};
+ container_name_filter_ = utils::Regex{container_name_filter};
}
}
@@ -199,8 +199,8 @@ std::optional<std::vector<KubernetesControllerService::AttributeMap>> Kubernetes
}
bool KubernetesControllerService::matchesRegexFilters(const std::string& name_space, const std::string& pod_name, const std::string& container_name) const {
- static constexpr auto matchesFilter = [](const std::string& target, const std::optional<std::regex>& filter) {
- return !filter || std::regex_match(target, *filter);
+ static constexpr auto matchesFilter = [](const std::string& target, const std::optional<utils::Regex>& filter) {
+ return !filter || utils::regexMatch(target, *filter);
};
return matchesFilter(name_space, namespace_filter_) &&
matchesFilter(pod_name, pod_name_filter_) &&
diff --git a/extensions/kubernetes/controllerservice/KubernetesControllerService.h b/extensions/kubernetes/controllerservice/KubernetesControllerService.h
index e923749c1..51bbc8777 100644
--- a/extensions/kubernetes/controllerservice/KubernetesControllerService.h
+++ b/extensions/kubernetes/controllerservice/KubernetesControllerService.h
@@ -24,6 +24,7 @@
#include "controllers/AttributeProviderService.h"
#include "core/logging/Logger.h"
#include "core/Property.h"
+#include "utils/RegexUtils.h"
namespace org::apache::nifi::minifi::controllers {
@@ -48,9 +49,9 @@ class KubernetesControllerService : public AttributeProviderService {
std::mutex initialization_mutex_;
bool initialized_ = false;
- std::optional<std::regex> namespace_filter_;
- std::optional<std::regex> pod_name_filter_;
- std::optional<std::regex> container_name_filter_;
+ std::optional<utils::Regex> namespace_filter_;
+ std::optional<utils::Regex> pod_name_filter_;
+ std::optional<utils::Regex> container_name_filter_;
std::shared_ptr<core::logging::Logger> logger_;
std::unique_ptr<APIClient> api_client_;
};
diff --git a/extensions/librdkafka/PublishKafka.cpp b/extensions/librdkafka/PublishKafka.cpp
index a45eba348..39111795b 100644
--- a/extensions/librdkafka/PublishKafka.cpp
+++ b/extensions/librdkafka/PublishKafka.cpp
@@ -283,12 +283,12 @@ class ReadCallback : public InputStreamCallback {
});
}
- static rd_kafka_headers_unique_ptr make_headers(const core::FlowFile& flow_file, std::regex& attribute_name_regex) {
+ static rd_kafka_headers_unique_ptr make_headers(const core::FlowFile& flow_file, utils::Regex& attribute_name_regex) {
const gsl::owner<rd_kafka_headers_t*> result{ rd_kafka_headers_new(8) };
if (!result) { throw std::bad_alloc{}; }
for (const auto& kv : flow_file.getAttributes()) {
- if (std::regex_search(kv.first, attribute_name_regex)) {
+ if (utils::regexSearch(kv.first, attribute_name_regex)) {
rd_kafka_header_add(result, kv.first.c_str(), kv.first.size(), kv.second.c_str(), kv.second.size());
}
}
@@ -337,7 +337,7 @@ class ReadCallback : public InputStreamCallback {
rd_kafka_topic_t* const rkt,
rd_kafka_t* const rk,
const core::FlowFile& flowFile,
- std::regex& attributeNameRegex,
+ utils::Regex& attributeNameRegex,
std::shared_ptr<PublishKafka::Messages> messages,
const size_t flow_file_index,
const bool fail_empty_flow_files,
@@ -510,7 +510,7 @@ void PublishKafka::onSchedule(const std::shared_ptr<core::ProcessContext> &conte
// Attributes to Send as Headers
std::string value;
if (context->getProperty(AttributeNameRegex.getName(), value) && !value.empty()) {
- attributeNameRegex_ = std::regex(value);
+ attributeNameRegex_ = utils::Regex(value);
logger_->log_debug("PublishKafka: AttributeNameRegex [%s]", value);
}
diff --git a/extensions/librdkafka/PublishKafka.h b/extensions/librdkafka/PublishKafka.h
index 3108c548a..153cc1955 100644
--- a/extensions/librdkafka/PublishKafka.h
+++ b/extensions/librdkafka/PublishKafka.h
@@ -30,7 +30,6 @@
#include <condition_variable>
#include <utility>
#include <vector>
-#include <regex>
#include "KafkaProcessorBase.h"
#include "utils/GeneralUtils.h"
@@ -43,6 +42,7 @@
#include "controllers/SSLContextService.h"
#include "rdkafka.h"
#include "KafkaConnection.h"
+#include "utils/RegexUtils.h"
namespace org {
namespace apache {
@@ -133,7 +133,7 @@ class PublishKafka : public KafkaProcessorBase {
uint32_t batch_size_{};
uint64_t target_batch_payload_size_{};
uint64_t max_flow_seg_size_{};
- std::regex attributeNameRegex_;
+ utils::Regex attributeNameRegex_;
std::atomic<bool> interrupted_{false};
std::mutex messages_mutex_; // If both connection_mutex_ and messages_mutex_ are needed, always take connection_mutex_ first to avoid deadlock
diff --git a/extensions/pcap/CapturePacket.cpp b/extensions/pcap/CapturePacket.cpp
index 2a5ae0213..96c1eaa0d 100644
--- a/extensions/pcap/CapturePacket.cpp
+++ b/extensions/pcap/CapturePacket.cpp
@@ -16,7 +16,6 @@
* limitations under the License.
*/
-#include <regex>
#include <memory>
#include <algorithm>
#include <cctype>
@@ -43,6 +42,7 @@
#include "ResourceClaim.h"
#include "utils/StringUtils.h"
#include "utils/ByteArrayCallback.h"
+#include "utils/RegexUtils.h"
namespace org {
namespace apache {
@@ -167,9 +167,9 @@ void CapturePacket::onSchedule(const std::shared_ptr<core::ProcessContext> &cont
bool found_match = false;
std::string matching_regex = "";
for (const auto &filter : allowed_interfaces) {
- std::regex r(filter);
- std::smatch m;
- if (std::regex_match(name, m, r)) {
+ utils::Regex r(filter);
+ utils::SMatch m;
+ if (utils::regexMatch(name, m, r)) {
matching_regex = filter;
found_match = true;
break;
diff --git a/extensions/sftp/processors/ListSFTP.cpp b/extensions/sftp/processors/ListSFTP.cpp
index 01bb1663b..28a256ae2 100644
--- a/extensions/sftp/processors/ListSFTP.cpp
+++ b/extensions/sftp/processors/ListSFTP.cpp
@@ -34,7 +34,6 @@
#include <vector>
#include <tuple>
#include <deque>
-#include <regex>
#include "utils/ByteArrayCallback.h"
#include "utils/TimeUtil.h"
@@ -236,9 +235,9 @@ void ListSFTP::onSchedule(const std::shared_ptr<core::ProcessContext> &context,
}
if (context->getProperty(FileFilterRegex.getName(), file_filter_regex_)) {
try {
- compiled_file_filter_regex_ = std::regex(file_filter_regex_);
+ compiled_file_filter_regex_ = utils::Regex(file_filter_regex_);
file_filter_regex_set_ = true;
- } catch (const std::regex_error &e) {
+ } catch (const Exception &e) {
logger_->log_error("Failed to compile File Filter Regex \"%s\"", file_filter_regex_.c_str());
file_filter_regex_set_ = false;
}
@@ -247,9 +246,9 @@ void ListSFTP::onSchedule(const std::shared_ptr<core::ProcessContext> &context,
}
if (context->getProperty(PathFilterRegex.getName(), path_filter_regex_)) {
try {
- compiled_path_filter_regex_ = std::regex(path_filter_regex_);
+ compiled_path_filter_regex_ = utils::Regex(path_filter_regex_);
path_filter_regex_set_ = true;
- } catch (const std::regex_error &e) {
+ } catch (const Exception &e) {
logger_->log_error("Failed to compile Path Filter Regex \"%s\"", path_filter_regex_.c_str());
path_filter_regex_set_ = false;
}
@@ -398,7 +397,7 @@ bool ListSFTP::filterFile(const std::string& parent_path, const std::string& fil
/* File Filter Regex */
if (file_filter_regex_set_) {
bool match = false;
- match = std::regex_search(filename, compiled_file_filter_regex_);
+ match = utils::regexSearch(filename, compiled_file_filter_regex_);
if (!match) {
logger_->log_debug("Ignoring \"%s/%s\" because it did not match the File Filter Regex \"%s\"",
parent_path.c_str(),
@@ -420,7 +419,7 @@ bool ListSFTP::filterDirectory(const std::string& parent_path, const std::string
if (path_filter_regex_set_) {
std::string dir_path = utils::file::FileUtils::concat_path(parent_path, filename, true /*force_posix*/);
bool match = false;
- match = std::regex_search(dir_path, compiled_path_filter_regex_);
+ match = utils::regexSearch(dir_path, compiled_path_filter_regex_);
if (!match) {
logger_->log_debug("Not recursing into \"%s\" because it did not match the Path Filter Regex \"%s\"",
dir_path.c_str(),
diff --git a/extensions/sftp/processors/ListSFTP.h b/extensions/sftp/processors/ListSFTP.h
index d2f7ac52e..a416a270d 100644
--- a/extensions/sftp/processors/ListSFTP.h
+++ b/extensions/sftp/processors/ListSFTP.h
@@ -25,7 +25,6 @@
#include <set>
#include <tuple>
#include <vector>
-#include <regex>
#include "SFTPProcessorBase.h"
#include "core/Processor.h"
@@ -33,6 +32,7 @@
#include "core/Property.h"
#include "utils/Id.h"
#include "controllers/keyvalue/PersistableKeyValueStoreService.h"
+#include "utils/RegexUtils.h"
namespace org {
namespace apache {
@@ -114,8 +114,8 @@ class ListSFTP : public SFTPProcessorBase {
std::string path_filter_regex_;
bool file_filter_regex_set_;
bool path_filter_regex_set_;
- std::regex compiled_file_filter_regex_;
- std::regex compiled_path_filter_regex_;
+ utils::Regex compiled_file_filter_regex_;
+ utils::Regex compiled_path_filter_regex_;
bool ignore_dotted_files_;
std::string target_system_timestamp_precision_;
std::string entity_tracking_initial_listing_target_;
diff --git a/extensions/standard-processors/processors/AttributesToJSON.cpp b/extensions/standard-processors/processors/AttributesToJSON.cpp
index 7f8b350be..69dd573d6 100644
--- a/extensions/standard-processors/processors/AttributesToJSON.cpp
+++ b/extensions/standard-processors/processors/AttributesToJSON.cpp
@@ -89,7 +89,7 @@ void AttributesToJSON::onSchedule(core::ProcessContext* context, core::ProcessSe
attribute_list_ = utils::StringUtils::splitAndTrimRemovingEmpty(value, ",");
}
if (context->getProperty(AttributesRegularExpression.getName(), value) && !value.empty()) {
- attributes_regular_expression_ = std::regex(value);
+ attributes_regular_expression_ = utils::Regex(value);
}
write_destination_ = WriteDestination::parse(utils::parsePropertyWithAllowableValuesOrThrow(*context, Destination.getName(), WriteDestination::values()).c_str());
context->getProperty(IncludeCoreAttributes.getName(), include_core_attributes_);
@@ -114,7 +114,7 @@ std::optional<std::unordered_set<std::string>> AttributesToJSON::getAttributesTo
if (attributes_regular_expression_) {
for (const auto& [key, value] : flowfile_attributes) {
- if (std::regex_match(key, attributes_regular_expression_.value())) {
+ if (utils::regexMatch(key, attributes_regular_expression_.value())) {
attributes.insert(key);
}
}
diff --git a/extensions/standard-processors/processors/AttributesToJSON.h b/extensions/standard-processors/processors/AttributesToJSON.h
index d04cd2ce5..35149adad 100644
--- a/extensions/standard-processors/processors/AttributesToJSON.h
+++ b/extensions/standard-processors/processors/AttributesToJSON.h
@@ -25,7 +25,6 @@
#include <unordered_set>
#include <memory>
#include <map>
-#include <regex>
#include "rapidjson/document.h"
#include "core/FlowFile.h"
@@ -35,6 +34,7 @@
#include "io/StreamPipe.h"
#include "utils/Enum.h"
#include "utils/Export.h"
+#include "utils/RegexUtils.h"
namespace org {
namespace apache {
@@ -90,7 +90,7 @@ class AttributesToJSON : public core::Processor {
std::shared_ptr<core::logging::Logger> logger_ = core::logging::LoggerFactory<AttributesToJSON>::getLogger();
std::vector<std::string> attribute_list_;
- std::optional<std::regex> attributes_regular_expression_;
+ std::optional<utils::Regex> attributes_regular_expression_;
WriteDestination write_destination_;
bool include_core_attributes_ = true;
bool null_value_ = false;
diff --git a/extensions/standard-processors/processors/DefragmentText.cpp b/extensions/standard-processors/processors/DefragmentText.cpp
index fda9dc071..3d5411e0a 100644
--- a/extensions/standard-processors/processors/DefragmentText.cpp
+++ b/extensions/standard-processors/processors/DefragmentText.cpp
@@ -81,7 +81,7 @@ void DefragmentText::onSchedule(core::ProcessContext* context, core::ProcessSess
std::string pattern_str;
if (context->getProperty(Pattern.getName(), pattern_str) && !pattern_str.empty()) {
- pattern_ = std::regex(pattern_str);
+ pattern_ = utils::Regex(pattern_str);
logger_->log_trace("The Pattern is configured to be %s", pattern_str);
} else {
throw Exception(PROCESS_SCHEDULE_EXCEPTION, "Pattern property missing or invalid");
@@ -204,7 +204,7 @@ struct ReadFlowFileContent : public InputStreamCallback {
}
};
-size_t getSplitPosition(const std::smatch& last_match, DefragmentText::PatternLocation pattern_location) {
+size_t getSplitPosition(const utils::SMatch& last_match, DefragmentText::PatternLocation pattern_location) {
size_t split_position = last_match.position(0);
if (pattern_location == DefragmentText::PatternLocation::END_OF_MESSAGE) {
split_position += last_match.length(0);
@@ -220,7 +220,7 @@ bool DefragmentText::splitFlowFileAtLastPattern(core::ProcessSession *session,
std::shared_ptr<core::FlowFile> &split_after_last_pattern) const {
ReadFlowFileContent read_flow_file_content;
session->read(original_flow_file, &read_flow_file_content);
- auto last_regex_match = utils::StringUtils::getLastRegexMatch(read_flow_file_content.content, pattern_);
+ auto last_regex_match = utils::getLastRegexMatch(read_flow_file_content.content, pattern_);
if (!last_regex_match.ready()) {
split_before_last_pattern = session->clone(original_flow_file);
split_after_last_pattern = nullptr;
diff --git a/extensions/standard-processors/processors/DefragmentText.h b/extensions/standard-processors/processors/DefragmentText.h
index eef72727f..163510a8b 100644
--- a/extensions/standard-processors/processors/DefragmentText.h
+++ b/extensions/standard-processors/processors/DefragmentText.h
@@ -17,7 +17,6 @@
#pragma once
-#include <regex>
#include <memory>
#include <string>
#include <set>
@@ -29,6 +28,7 @@
#include "core/logging/LoggerConfiguration.h"
#include "utils/Enum.h"
#include "serialization/PayloadSerializer.h"
+#include "utils/RegexUtils.h"
namespace org::apache::nifi::minifi::processors {
@@ -97,7 +97,7 @@ class DefragmentText : public core::Processor {
};
- std::regex pattern_;
+ utils::Regex pattern_;
PatternLocation pattern_location_;
std::optional<std::chrono::milliseconds> max_age_;
std::optional<size_t> max_size_;
diff --git a/extensions/standard-processors/processors/ExtractText.cpp b/extensions/standard-processors/processors/ExtractText.cpp
index 8763ca40a..68cd6d4e3 100644
--- a/extensions/standard-processors/processors/ExtractText.cpp
+++ b/extensions/standard-processors/processors/ExtractText.cpp
@@ -23,7 +23,6 @@
#include <memory>
#include <map>
#include <set>
-#include <regex>
#include <iostream>
#include <sstream>
#include <utility>
@@ -34,6 +33,7 @@
#include "core/Resource.h"
#include "core/FlowFile.h"
#include "utils/gsl.h"
+#include "utils/RegexUtils.h"
namespace org {
namespace apache {
@@ -149,9 +149,9 @@ int64_t ExtractText::ReadCallback::process(const std::shared_ptr<io::BaseStream>
if (regex_mode) {
bool insensitive;
- std::regex_constants::syntax_option_type regex_flags = std::regex::ECMAScript; // ECMAScript is the default behaviour
+ std::vector<utils::Regex::Mode> regex_flags;
if (ctx_->getProperty(InsensitiveMatch.getName(), insensitive) && insensitive) {
- regex_flags |= std::regex_constants::icase;
+ regex_flags.push_back(utils::Regex::Mode::ICASE);
}
bool ignoregroupzero;
@@ -179,9 +179,9 @@ int64_t ExtractText::ReadCallback::process(const std::shared_ptr<io::BaseStream>
int matchcount = 0;
try {
- std::regex rgx(value, regex_flags);
- std::smatch matches;
- while (std::regex_search(workStr, matches, rgx)) {
+ utils::Regex rgx(value, regex_flags);
+ utils::SMatch matches;
+ while (utils::regexSearch(workStr, matches, rgx)) {
size_t i = ignoregroupzero ? 1 : 0;
for (; i < matches.size(); ++i, ++matchcount) {
@@ -199,7 +199,7 @@ int64_t ExtractText::ReadCallback::process(const std::shared_ptr<io::BaseStream>
}
workStr = matches.suffix();
}
- } catch (const std::regex_error &e) {
+ } catch (const Exception &e) {
logger_->log_error("%s error encountered when trying to construct regular expression from property (key: %s) value: %s",
e.what(), k, value);
continue;
diff --git a/extensions/standard-processors/processors/GetFile.cpp b/extensions/standard-processors/processors/GetFile.cpp
index e99afd105..721860f36 100644
--- a/extensions/standard-processors/processors/GetFile.cpp
+++ b/extensions/standard-processors/processors/GetFile.cpp
@@ -27,7 +27,6 @@
#include <memory>
#include <set>
#include <string>
-#include <regex>
#include "utils/StringUtils.h"
#include "utils/file/FileUtils.h"
@@ -37,6 +36,7 @@
#include "core/Resource.h"
#include "core/TypedValues.h"
#include "utils/FileReaderCallback.h"
+#include "utils/RegexUtils.h"
using namespace std::literals::chrono_literals;
@@ -261,8 +261,8 @@ bool GetFile::fileMatchesRequestCriteria(std::string fullName, std::string name,
if (request.ignoreHiddenFile && utils::file::is_hidden(fullName))
return false;
- std::regex rgx(request.fileFilter);
- if (!std::regex_search(name, rgx)) {
+ utils::Regex rgx(request.fileFilter);
+ if (!utils::regexSearch(name, rgx)) {
return false;
}
diff --git a/extensions/standard-processors/processors/ListenSyslog.h b/extensions/standard-processors/processors/ListenSyslog.h
index d8c92423b..e85deaf52 100644
--- a/extensions/standard-processors/processors/ListenSyslog.h
+++ b/extensions/standard-processors/processors/ListenSyslog.h
@@ -21,6 +21,7 @@
#include <utility>
#include <string>
#include <memory>
+#include <regex>
#include "core/Processor.h"
#include "core/logging/Logger.h"
diff --git a/extensions/standard-processors/processors/RouteText.cpp b/extensions/standard-processors/processors/RouteText.cpp
index e1eab5c67..3c97339f0 100644
--- a/extensions/standard-processors/processors/RouteText.cpp
+++ b/extensions/standard-processors/processors/RouteText.cpp
@@ -135,7 +135,7 @@ void RouteText::onSchedule(core::ProcessContext* context, core::ProcessSessionFa
matching_ = utils::parseEnumProperty<Matching>(*context, MatchingStrategy);
context->getProperty(TrimWhitespace.getName(), trim_);
case_policy_ = context->getProperty<bool>(IgnoreCase).value_or(false) ? CasePolicy::IGNORE_CASE : CasePolicy::CASE_SENSITIVE;
- group_regex_ = context->getProperty(GroupingRegex) | utils::map([] (const auto& str) {return std::regex(str);});
+ group_regex_ = context->getProperty(GroupingRegex) | utils::map([] (const auto& str) {return utils::Regex(str);});
segmentation_ = utils::parseEnumProperty<Segmentation>(*context, SegmentationStrategy);
context->getProperty(GroupingFallbackValue.getName(), group_fallback_);
}
@@ -227,7 +227,7 @@ class RouteText::MatchingContext {
flow_file_(std::move(flow_file)),
case_policy_(case_policy) {}
- const std::regex& getRegexProperty(const core::Property& prop) {
+ const utils::Regex& getRegexProperty(const core::Property& prop) {
auto it = regex_values_.find(prop.getName());
if (it != regex_values_.end()) {
return it->second;
@@ -236,11 +236,11 @@ class RouteText::MatchingContext {
if (!process_context_.getDynamicProperty(prop, value, flow_file_)) {
throw Exception(PROCESSOR_EXCEPTION, "Missing dynamic property: '" + prop.getName() + "'");
}
- std::regex::flag_type flags = std::regex::ECMAScript;
+ std::vector<utils::Regex::Mode> flags;
if (case_policy_ == CasePolicy::IGNORE_CASE) {
- flags |= std::regex::icase;
+ flags.push_back(utils::Regex::Mode::ICASE);
}
- return (regex_values_[prop.getName()] = std::regex(value, flags));
+ return (regex_values_[prop.getName()] = utils::Regex(value, flags));
}
const std::string& getStringProperty(const core::Property& prop) {
@@ -275,7 +275,7 @@ class RouteText::MatchingContext {
CasePolicy case_policy_;
std::map<std::string, std::string> string_values_;
- std::map<std::string, std::regex> regex_values_;
+ std::map<std::string, utils::Regex> regex_values_;
struct OwningSearcher {
OwningSearcher(std::string str, CasePolicy case_policy)
@@ -425,12 +425,12 @@ bool RouteText::matchSegment(MatchingContext& context, const Segment& segment, c
return utils::StringUtils::equals(segment.value_, context.getStringProperty(prop), case_policy_ == CasePolicy::CASE_SENSITIVE);
}
case Matching::CONTAINS_REGEX: {
- std::match_results<std::string_view::const_iterator> match_result;
- return std::regex_search(segment.value_.begin(), segment.value_.end(), match_result, context.getRegexProperty(prop));
+ std::string segment_str = std::string(segment.value_);
+ return utils::regexSearch(segment_str, context.getRegexProperty(prop));
}
case Matching::MATCHES_REGEX: {
- std::match_results<std::string_view::const_iterator> match_result;
- return std::regex_match(segment.value_.begin(), segment.value_.end(), match_result, context.getRegexProperty(prop));
+ std::string segment_str = std::string(segment.value_);
+ return utils::regexMatch(segment_str, context.getRegexProperty(prop));
}
}
throw Exception(PROCESSOR_EXCEPTION, "Unknown matching strategy");
@@ -440,8 +440,9 @@ std::optional<std::string> RouteText::getGroup(const std::string_view& segment)
if (!group_regex_) {
return std::nullopt;
}
- std::match_results<std::string_view::const_iterator> match_result;
- if (!std::regex_match(segment.begin(), segment.end(), match_result, group_regex_.value())) {
+ utils::SMatch match_result;
+ std::string segment_str = std::string(segment);
+ if (!utils::regexMatch(segment_str, match_result, group_regex_.value())) {
return group_fallback_;
}
// WARNING!! using a temporary std::string causes the omission of delimiters
diff --git a/extensions/standard-processors/processors/RouteText.h b/extensions/standard-processors/processors/RouteText.h
index b30dbadf1..636e5b04c 100644
--- a/extensions/standard-processors/processors/RouteText.h
+++ b/extensions/standard-processors/processors/RouteText.h
@@ -17,7 +17,6 @@
#pragma once
-#include <regex>
#include <optional>
#include <string_view>
#include <map>
@@ -27,6 +26,7 @@
#include "Processor.h"
#include "utils/Enum.h"
#include "utils/Export.h"
+#include "utils/RegexUtils.h"
namespace org::apache::nifi::minifi::processors {
@@ -111,7 +111,7 @@ class RouteText : public core::Processor {
Segmentation segmentation_;
bool trim_{true};
CasePolicy case_policy_{CasePolicy::CASE_SENSITIVE};
- std::optional<std::regex> group_regex_;
+ std::optional<utils::Regex> group_regex_;
std::string group_fallback_;
std::map<std::string, core::Property> dynamic_properties_;
diff --git a/extensions/standard-processors/processors/TailFile.cpp b/extensions/standard-processors/processors/TailFile.cpp
index 098e794ce..2e065a6ba 100644
--- a/extensions/standard-processors/processors/TailFile.cpp
+++ b/extensions/standard-processors/processors/TailFile.cpp
@@ -30,7 +30,6 @@
#include <string>
#include <utility>
#include <vector>
-#include <regex>
#include "range/v3/action/sort.hpp"
#include "range/v3/range/conversion.hpp"
@@ -48,6 +47,7 @@
#include "core/ProcessContext.h"
#include "core/ProcessSession.h"
#include "core/Resource.h"
+#include "utils/RegexUtils.h"
namespace org {
@@ -631,8 +631,8 @@ std::vector<TailState> TailFile::findAllRotatedFiles(const TailState &state) con
std::vector<TailStateWithMtime> matched_files_with_mtime;
auto collect_matching_files = [&](const std::string &path, const std::string &file_name) -> bool {
- std::regex pattern_regex(pattern);
- if (file_name != state.file_name_ && std::regex_match(file_name, pattern_regex)) {
+ utils::Regex pattern_regex(pattern);
+ if (file_name != state.file_name_ && utils::regexMatch(file_name, pattern_regex)) {
std::string full_file_name = path + utils::file::get_separator() + file_name;
TailStateWithMtime::TimePoint mtime{utils::file::last_write_time_point(full_file_name)};
logger_->log_debug("File %s with mtime %" PRId64 " matches rolling filename pattern %s, so we are reading it", file_name, int64_t{mtime.time_since_epoch().count()}, pattern);
@@ -653,8 +653,8 @@ std::vector<TailState> TailFile::findRotatedFilesAfterLastReadTime(const TailSta
std::vector<TailStateWithMtime> matched_files_with_mtime;
auto collect_matching_files = [&](const std::string &path, const std::string &file_name) -> bool {
- std::regex pattern_regex(pattern);
- if (file_name != state.file_name_ && std::regex_match(file_name, pattern_regex)) {
+ utils::Regex pattern_regex(pattern);
+ if (file_name != state.file_name_ && utils::regexMatch(file_name, pattern_regex)) {
std::string full_file_name = path + utils::file::get_separator() + file_name;
TailStateWithMtime::TimePoint mtime{utils::file::last_write_time_point(full_file_name)};
logger_->log_debug("File %s with mtime %" PRId64 " matches rolling filename pattern %s", file_name, int64_t{mtime.time_since_epoch().count()}, pattern);
@@ -867,9 +867,9 @@ void TailFile::checkForRemovedFiles() {
for (const auto &kv : tail_states_) {
const std::string &full_file_name = kv.first;
const TailState &state = kv.second;
- std::regex pattern_regex(file_to_tail_);
+ utils::Regex pattern_regex(file_to_tail_);
if (utils::file::file_size(state.fileNameWithPath()) == 0u ||
- !std::regex_match(state.file_name_, pattern_regex)) {
+ !utils::regexMatch(state.file_name_, pattern_regex)) {
file_names_to_remove.push_back(full_file_name);
}
}
@@ -882,8 +882,8 @@ void TailFile::checkForRemovedFiles() {
void TailFile::checkForNewFiles(core::ProcessContext& context) {
auto add_new_files_callback = [&](const std::string &path, const std::string &file_name) -> bool {
std::string full_file_name = path + utils::file::get_separator() + file_name;
- std::regex file_to_tail_regex(file_to_tail_);
- if (!containsKey(tail_states_, full_file_name) && std::regex_match(file_name, file_to_tail_regex)) {
+ utils::Regex file_to_tail_regex(file_to_tail_);
+ if (!containsKey(tail_states_, full_file_name) && utils::regexMatch(file_name, file_to_tail_regex)) {
tail_states_.emplace(full_file_name, TailState{path, file_name});
}
return true;
diff --git a/extensions/standard-processors/tests/unit/ExtractTextTests.cpp b/extensions/standard-processors/tests/unit/ExtractTextTests.cpp
index 09a47ca5d..f41f74e7a 100644
--- a/extensions/standard-processors/tests/unit/ExtractTextTests.cpp
+++ b/extensions/standard-processors/tests/unit/ExtractTextTests.cpp
@@ -29,6 +29,7 @@
#include "core/Core.h"
#include "unit/ProvenanceTestHelper.h"
#include "repository/VolatileContentRepository.h"
+#include "utils/TestUtils.h"
#include "core/FlowFile.h"
#include "core/Processor.h"
@@ -180,3 +181,34 @@ TEST_CASE("Test usage of ExtractText in regex mode", "[extracttextRegexTest]") {
LogTestController::getInstance().reset();
}
+
+TEST_CASE("Test usage of ExtractText in regex mode with large regex matches", "[extracttextRegexTest]") {
+ TestController test_controller;
+ LogTestController::getInstance().setTrace<org::apache::nifi::minifi::processors::ExtractText>();
+ LogTestController::getInstance().setTrace<org::apache::nifi::minifi::processors::GetFile>();
+ LogTestController::getInstance().setTrace<org::apache::nifi::minifi::processors::LogAttribute>();
+
+ std::shared_ptr<TestPlan> plan = test_controller.createPlan();
+ std::shared_ptr<TestRepository> repo = std::make_shared<TestRepository>();
+
+ auto dir = test_controller.createTempDirectory();
+ REQUIRE(!dir.empty());
+ auto getfile = plan->addProcessor("GetFile", "GetFile");
+ plan->setProperty(getfile, org::apache::nifi::minifi::processors::GetFile::Directory.getName(), dir);
+ plan->setProperty(getfile, org::apache::nifi::minifi::processors::GetFile::KeepSourceFile.getName(), "true");
+
+ auto extract_text_processor = plan->addProcessor("ExtractText", "ExtractText", core::Relationship("success", "description"), true);
+ plan->setProperty(extract_text_processor, org::apache::nifi::minifi::processors::ExtractText::RegexMode.getName(), "true");
+ plan->setProperty(extract_text_processor, "RegexAttr", "Speed limit (.*)", true);
+
+ auto log_attribute_processor = plan->addProcessor("LogAttribute", "outputLogAttribute", core::Relationship("success", "description"), true);
+ plan->setProperty(log_attribute_processor, org::apache::nifi::minifi::processors::LogAttribute::AttributesToLog.getName(), TEST_ATTR);
+
+ std::string additional_long_string(100'000, '.');
+ utils::putFileToDir(dir, TEST_FILE, "Speed limit 80" + additional_long_string);
+
+ test_controller.runSession(plan);
+
+ REQUIRE(LogTestController::getInstance().contains("key:RegexAttr.0 value:80"));
+ LogTestController::getInstance().reset();
+}
diff --git a/extensions/standard-processors/tests/unit/RouteTextTests.cpp b/extensions/standard-processors/tests/unit/RouteTextTests.cpp
index 4e4d1b3a7..b59159cad 100644
--- a/extensions/standard-processors/tests/unit/RouteTextTests.cpp
+++ b/extensions/standard-processors/tests/unit/RouteTextTests.cpp
@@ -402,7 +402,7 @@ TEST_CASE_METHOD(RouteTextController, "RouteText grouping uses empty strings for
proc_->setProperty(processors::RouteText::RoutingStrategy, "Dynamic Routing");
proc_->setProperty(processors::RouteText::SegmentationStrategy, "Per Line");
proc_->setProperty(processors::RouteText::MatchingStrategy, "Contains");
- proc_->setProperty(processors::RouteText::GroupingRegex, "group(.)(?:\\.(.))?.*");
+ proc_->setProperty(processors::RouteText::GroupingRegex, "group(.)(\\..)?.*");
proc_->setDynamicProperty("A", "toA");
@@ -416,8 +416,8 @@ TEST_CASE_METHOD(RouteTextController, "RouteText grouping uses empty strings for
std::map<std::string, FlowFilePatternVec> expected;
expected["A"] = {
- FlowFilePattern{}.attr("RouteText.Group", "1, 1").content("group1.1:toA(one)\ngroup1.1:toA(two)\n"),
- FlowFilePattern{}.attr("RouteText.Group", "1, 2").content("group1.2:toA(three)\n"),
+ FlowFilePattern{}.attr("RouteText.Group", "1, .1").content("group1.1:toA(one)\ngroup1.1:toA(two)\n"),
+ FlowFilePattern{}.attr("RouteText.Group", "1, .2").content("group1.2:toA(three)\n"),
FlowFilePattern{}.attr("RouteText.Group", "2, ").content("group2:toA(four)\ngroup2:toA(five)")
};
expected["matched"] = FlowFilePatternVec{};
diff --git a/extensions/standard-processors/tests/unit/TailFileTests.cpp b/extensions/standard-processors/tests/unit/TailFileTests.cpp
index 089d8121c..e7f54d889 100644
--- a/extensions/standard-processors/tests/unit/TailFileTests.cpp
+++ b/extensions/standard-processors/tests/unit/TailFileTests.cpp
@@ -1824,11 +1824,11 @@ TEST_CASE("TailFile can use an AttributeProviderService", "[AttributeProviderSer
CHECK(LogTestController::getInstance().contains("key:test.animal value:dog"));
CHECK(LogTestController::getInstance().contains("key:test.animal value:dolphin"));
- CHECK_FALSE(LogTestController::getInstance().contains("key:test.fruit value:strawberry"));
- CHECK_FALSE(LogTestController::getInstance().contains("key:test.uid value:002"));
- CHECK_FALSE(LogTestController::getInstance().contains("key:test.uid value:003"));
- CHECK_FALSE(LogTestController::getInstance().contains("key:test.animal value:elephant"));
- CHECK_FALSE(LogTestController::getInstance().contains("key:test.animal value:horse"));
+ CHECK_FALSE(LogTestController::getInstance().contains("key:test.fruit value:strawberry", 0s, 0ms));
+ CHECK_FALSE(LogTestController::getInstance().contains("key:test.uid value:002", 0s, 0ms));
+ CHECK_FALSE(LogTestController::getInstance().contains("key:test.uid value:003", 0s, 0ms));
+ CHECK_FALSE(LogTestController::getInstance().contains("key:test.animal value:elephant", 0s, 0ms));
+ CHECK_FALSE(LogTestController::getInstance().contains("key:test.animal value:horse", 0s, 0ms));
LogTestController::getInstance().reset();
}
diff --git a/extensions/windows-event-log/wel/MetadataWalker.cpp b/extensions/windows-event-log/wel/MetadataWalker.cpp
index 7d68676bf..9ac4f02ed 100644
--- a/extensions/windows-event-log/wel/MetadataWalker.cpp
+++ b/extensions/windows-event-log/wel/MetadataWalker.cpp
@@ -52,11 +52,11 @@ bool MetadataWalker::for_each(pugi::xml_node &node) {
return input;
};
- if (std::regex_match(attr.name(), regex_)) {
+ if (utils::regexMatch(attr.name(), regex_)) {
updateText(node, attr.name(), idUpdate);
}
- if (std::regex_match(attr.value(), regex_)) {
+ if (utils::regexMatch(attr.value(), regex_)) {
updateText(node, attr.value(), idUpdate);
}
}
diff --git a/extensions/windows-event-log/wel/MetadataWalker.h b/extensions/windows-event-log/wel/MetadataWalker.h
index f779a7d2d..f4938d7a9 100644
--- a/extensions/windows-event-log/wel/MetadataWalker.h
+++ b/extensions/windows-event-log/wel/MetadataWalker.h
@@ -27,7 +27,6 @@
#include <map>
#include <sstream>
#include <string>
-#include <regex>
#include <vector>
#include "core/Core.h"
@@ -39,6 +38,7 @@
#include "concurrentqueue.h"
#include "pugixml.hpp"
+#include "utils/RegexUtils.h"
namespace org {
namespace apache {
@@ -97,7 +97,7 @@ class MetadataWalker : public pugi::xml_tree_walker {
const WindowsEventLogMetadata& windows_event_log_metadata_;
std::string log_name_;
- std::regex regex_;
+ utils::Regex regex_;
std::string regex_str_;
bool update_xml_;
bool resolve_;
diff --git a/libminifi/include/utils/HTTPUtils.h b/libminifi/include/utils/HTTPUtils.h
index 275cda2d4..8e62af8e3 100644
--- a/libminifi/include/utils/HTTPUtils.h
+++ b/libminifi/include/utils/HTTPUtils.h
@@ -20,9 +20,9 @@
#define LIBMINIFI_INCLUDE_UTILS_HTTPUTILS_H_
#include <string>
-#include <regex>
#include "io/ClientSocket.h"
+#include "utils/RegexUtils.h"
/**
This function, unfortunately, assumes that we're parsing http components of a local host. On windows this is problematic
@@ -36,9 +36,9 @@ inline bool parse_http_components(const std::string &url, std::string &port, std
std::string regex_str = "^(http|https)://(localhost:)([0-9]+)?(/.*)$";
#endif
- auto rgx = std::regex(regex_str, std::regex_constants::icase);
- std::smatch matches;
- if (std::regex_search(url, matches, rgx)) {
+ auto rgx = org::apache::nifi::minifi::utils::Regex(regex_str, {org::apache::nifi::minifi::utils::Regex::Mode::ICASE});
+ org::apache::nifi::minifi::utils::SMatch matches;
+ if (org::apache::nifi::minifi::utils::regexSearch(url, matches, rgx)) {
if (matches.size() >= 5) {
scheme = matches[1];
port = matches[3];
diff --git a/libminifi/include/utils/RegexUtils.h b/libminifi/include/utils/RegexUtils.h
new file mode 100644
index 000000000..f0b7cdf0f
--- /dev/null
+++ b/libminifi/include/utils/RegexUtils.h
@@ -0,0 +1,173 @@
+/**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <string>
+#include <string_view>
+#include <vector>
+#include <cstddef>
+
+// There is a bug in std::regex implementation of libstdc++ which causes stack overflow on long matches: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86164
+// Due to this bug we should use regex.h for regex searches if libstdc++ is used until a fix is released.
+#if defined(__GLIBCXX__) || defined(__GLIBCPP__)
+#include <regex.h>
+#else
+#include <regex>
+#define NO_MORE_REGFREEE
+#endif
+
+namespace org::apache::nifi::minifi::utils {
+
+class Regex;
+
+#ifdef NO_MORE_REGFREEE
+using SMatch = std::smatch;
+#else
+class SMatch {
+ struct Regmatch;
+ struct SuffixWrapper;
+ public:
+ struct Iterator {
+ using iterator_category = std::forward_iterator_tag;
+ using difference_type = std::ptrdiff_t;
+ using value_type = Regmatch;
+ using pointer = value_type*;
+ using reference = value_type&;
+
+ Iterator() : regmatch_(nullptr) {
+ }
+
+ explicit Iterator(Regmatch* regmatch)
+ : regmatch_(regmatch) {
+ }
+
+ reference operator*() const { return *regmatch_; }
+ pointer operator->() { return regmatch_; }
+
+ Iterator& operator++() { regmatch_++; return *this; }
+ Iterator operator++(int) { Iterator tmp = *this; ++(*this); return tmp; }
+
+ friend bool operator== (const Iterator& a, const Iterator& b) { return a.regmatch_ == b.regmatch_; }
+ friend bool operator!= (const Iterator& a, const Iterator& b) { return a.regmatch_ != b.regmatch_; }
+
+ private:
+ pointer regmatch_;
+ };
+
+ SuffixWrapper suffix() const;
+ const Regmatch& operator[](std::size_t index) const;
+ Iterator begin() { return Iterator(&matches_[0]); }
+ Iterator end() { return Iterator(&matches_[matches_.size()]); }
+
+ std::size_t size() const;
+ bool ready() const;
+ std::size_t position(std::size_t index) const;
+ std::size_t length(std::size_t index) const;
+
+ private:
+ struct Regmatch {
+ operator std::string() const {
+ return str();
+ }
+
+ std::string str() const {
+ if (match.rm_so == -1) {
+ return "";
+ }
+ return std::string(pattern.begin() + match.rm_so, pattern.begin() + match.rm_eo);
+ }
+
+ regmatch_t match;
+ std::string_view pattern;
+ };
+
+ struct SuffixWrapper {
+ operator std::string() const {
+ return str();
+ }
+
+ std::string str() const {
+ return suffix;
+ }
+
+ std::string suffix;
+ };
+
+ void clear();
+
+ std::vector<Regmatch> matches_;
+ std::string pattern_;
+
+ friend bool regexMatch(const std::string &pattern, SMatch& match, const Regex& regex);
+ friend bool regexSearch(const std::string &pattern, SMatch& match, const Regex& regex);
+ friend utils::SMatch getLastRegexMatch(const std::string& str, const utils::Regex& pattern);
+};
+#endif
+
+class Regex {
+ public:
+ enum class Mode { ICASE };
+
+ Regex();
+ explicit Regex(const std::string &value);
+ explicit Regex(const std::string &value,
+ const std::vector<Mode> &mode);
+ Regex(const Regex &);
+ Regex& operator=(const Regex &);
+ Regex(Regex&& other);
+ Regex& operator=(Regex&& other);
+ ~Regex();
+
+ private:
+ std::string regex_str_;
+ bool valid_;
+
+#ifdef NO_MORE_REGFREEE
+ std::regex compiled_regex_;
+ std::regex_constants::syntax_option_type regex_mode_;
+#else
+ void compileRegex(regex_t& regex, const std::string& regex_string) const;
+
+ regex_t compiled_regex_;
+ regex_t compiled_full_input_regex_;
+ int regex_mode_;
+#endif
+
+ friend bool regexMatch(const std::string &pattern, const Regex& regex);
+ friend bool regexMatch(const std::string &pattern, SMatch& match, const Regex& regex);
+ friend bool regexSearch(const std::string &pattern, const Regex& regex);
+ friend bool regexSearch(const std::string &pattern, SMatch& match, const Regex& regex);
+ friend SMatch getLastRegexMatch(const std::string& pattern, const utils::Regex& regex);
+};
+
+bool regexMatch(const std::string &pattern, const Regex& regex);
+bool regexMatch(const std::string &pattern, SMatch& match, const Regex& regex);
+
+bool regexSearch(const std::string &pattern, const Regex& regex);
+bool regexSearch(const std::string &pattern, SMatch& match, const Regex& regex);
+
+/**
+ * Returns the last match of a regular expression within the given string
+ * @param pattern incoming string
+ * @param regex the regex to be matched
+ * @return the last valid SMatch or a default constructed SMatch (ready() != true) if no matches have been found
+ */
+SMatch getLastRegexMatch(const std::string& pattern, const utils::Regex& regex);
+
+} // namespace org::apache::nifi::minifi::utils
diff --git a/libminifi/include/utils/StringUtils.h b/libminifi/include/utils/StringUtils.h
index 5b1902e42..d43486bc2 100644
--- a/libminifi/include/utils/StringUtils.h
+++ b/libminifi/include/utils/StringUtils.h
@@ -23,7 +23,6 @@
#include <iostream>
#include <map>
#include <optional>
-#include <regex>
#include <sstream>
#include <string>
#include <string_view>
@@ -485,14 +484,6 @@ class StringUtils {
return str;
}
- /**
- * Returns the last match of a regular expression within the given string
- * @param str incoming string
- * @param pattern the regex to be matched
- * @return the last valid std::smatch or a default constructed smatch (ready() != true) if no matches have been found
- */
- static std::smatch getLastRegexMatch(const std::string& str, const std::regex& pattern);
-
static std::string escapeUnprintableBytes(gsl::span<const std::byte> data);
private:
diff --git a/libminifi/src/core/yaml/YamlConfiguration.cpp b/libminifi/src/core/yaml/YamlConfiguration.cpp
index 4d09d49c1..299c7c8f9 100644
--- a/libminifi/src/core/yaml/YamlConfiguration.cpp
+++ b/libminifi/src/core/yaml/YamlConfiguration.cpp
@@ -29,7 +29,7 @@
#include "utils/TimeUtil.h"
#ifdef YAML_CONFIGURATION_USE_REGEX
-#include <regex>
+#include "utils/RegexUtils.h"
#endif // YAML_CONFIGURATION_USE_REGEX
namespace org {
@@ -826,8 +826,8 @@ void YamlConfiguration::validateComponentProperties(ConfigurableComponent& compo
}
for (const auto &excl_pair : excl_props) {
- std::regex excl_expr(excl_pair.second);
- if (std::regex_match(component_properties.at(excl_pair.first).getValue().to_string(), excl_expr)) {
+ utils::Regex excl_expr(excl_pair.second);
+ if (utils::regexMatch(component_properties.at(excl_pair.first).getValue().to_string(), excl_expr)) {
std::string reason = utils::StringUtils::join_pack("property '", prop_pair.second.getName(),
"' must not be set when the value of property '", excl_pair.first, "' matches '", excl_pair.second, "'");
raiseComponentError(component_name, yaml_section, reason);
@@ -840,8 +840,8 @@ void YamlConfiguration::validateComponentProperties(ConfigurableComponent& compo
const auto &prop_regex_str = prop_pair.second.getValidRegex();
if (!prop_regex_str.empty()) {
- std::regex prop_regex(prop_regex_str);
- if (!std::regex_match(prop_pair.second.getValue().to_string(), prop_regex)) {
+ utils::Regex prop_regex(prop_regex_str);
+ if (!utils::regexMatch(prop_pair.second.getValue().to_string(), prop_regex)) {
std::string reason = utils::StringUtils::join_pack("property '", prop_pair.second.getName(), "' does not match validation pattern '", prop_regex_str, "'");
raiseComponentError(component_name, yaml_section, reason);
}
diff --git a/libminifi/src/utils/RegexUtils.cpp b/libminifi/src/utils/RegexUtils.cpp
new file mode 100644
index 000000000..0b9d9d38f
--- /dev/null
+++ b/libminifi/src/utils/RegexUtils.cpp
@@ -0,0 +1,295 @@
+/**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/RegexUtils.h"
+
+#include <iostream>
+#include <vector>
+
+#include "Exception.h"
+
+#ifndef NO_MORE_REGFREEE
+namespace {
+
+std::size_t getMaxGroupCountOfRegex(const std::string& regex) {
+ return std::count(regex.begin(), regex.end(), '(') + 1;
+}
+
+} // namespace
+#endif
+
+namespace org::apache::nifi::minifi::utils {
+
+#ifndef NO_MORE_REGFREEE
+SMatch::SuffixWrapper SMatch::suffix() const {
+ if ((size_t) matches_[0].match.rm_eo >= pattern_.size()) {
+ return SuffixWrapper{std::string()};
+ } else {
+ return SuffixWrapper{pattern_.substr(matches_[0].match.rm_eo)};
+ }
+}
+
+const SMatch::Regmatch& SMatch::operator[](std::size_t index) const {
+ return matches_[index];
+}
+
+std::size_t SMatch::size() const {
+ std::size_t count = 0;
+ for (const auto &m : matches_) {
+ if (m.match.rm_so == -1) {
+ break;
+ }
+ ++count;
+ }
+ return count;
+}
+
+bool SMatch::ready() const {
+ return !matches_.empty();
+}
+
+std::size_t SMatch::position(std::size_t index) const {
+ return matches_.at(index).match.rm_so;
+}
+
+std::size_t SMatch::length(std::size_t index) const {
+ return matches_.at(index).match.rm_eo - matches_.at(index).match.rm_so;
+}
+
+void SMatch::clear() {
+ matches_.clear();
+ pattern_.clear();
+}
+#endif
+
+Regex::Regex() : Regex::Regex("") {}
+
+Regex::Regex(const std::string &value) : Regex::Regex(value, {}) {}
+
+Regex::Regex(const std::string &value,
+ const std::vector<Regex::Mode> &mode)
+ : regex_str_(value),
+ valid_(false) {
+ // Create regex mode
+#ifdef NO_MORE_REGFREEE
+ regex_mode_ = std::regex_constants::ECMAScript;
+#else
+ regex_mode_ = REG_EXTENDED;
+#endif
+ for (const auto m : mode) {
+ switch (m) {
+ case Mode::ICASE:
+#ifdef NO_MORE_REGFREEE
+ regex_mode_ |= std::regex_constants::icase;
+#else
+ regex_mode_ |= REG_ICASE;
+#endif
+ break;
+ }
+ }
+#ifdef NO_MORE_REGFREEE
+ try {
+ compiled_regex_ = std::regex(regex_str_, regex_mode_);
+ valid_ = true;
+ } catch (const std::regex_error &e) {
+ throw Exception(REGEX_EXCEPTION, e.what());
+ }
+#else
+ compileRegex(compiled_regex_, regex_str_);
+ compileRegex(compiled_full_input_regex_, '^' + regex_str_ + '$');
+ valid_ = true;
+#endif
+}
+
+Regex::Regex(const Regex& other)
+#ifndef NO_MORE_REGFREEE
+ : valid_(false),
+ regex_mode_(REG_EXTENDED)
+#endif
+{
+ *this = other;
+}
+
+Regex& Regex::operator=(const Regex& other) {
+ if (this == &other) {
+ return *this;
+ }
+
+ regex_str_ = other.regex_str_;
+ regex_mode_ = other.regex_mode_;
+#ifdef NO_MORE_REGFREEE
+ compiled_regex_ = other.compiled_regex_;
+#else
+ if (valid_) {
+ regfree(&compiled_regex_);
+ regfree(&compiled_full_input_regex_);
+ }
+ compileRegex(compiled_regex_, regex_str_);
+ compileRegex(compiled_full_input_regex_, '^' + regex_str_ + '$');
+#endif
+ valid_ = other.valid_;
+ return *this;
+}
+
+Regex::Regex(Regex&& other)
+#ifndef NO_MORE_REGFREEE
+ : valid_(false),
+ regex_mode_(REG_EXTENDED)
+#endif
+{
+ *this = std::move(other);
+}
+
+Regex& Regex::operator=(Regex&& other) {
+ if (this == &other) {
+ return *this;
+ }
+
+ regex_str_ = std::move(other.regex_str_);
+ regex_mode_ = other.regex_mode_;
+#ifdef NO_MORE_REGFREEE
+ compiled_regex_ = std::move(other.compiled_regex_);
+#else
+ if (valid_) {
+ regfree(&compiled_regex_);
+ regfree(&compiled_full_input_regex_);
+ }
+ compiled_regex_ = other.compiled_regex_;
+ compiled_full_input_regex_ = other.compiled_full_input_regex_;
+#endif
+ valid_ = other.valid_;
+ other.valid_ = false;
+ return *this;
+}
+
+Regex::~Regex() {
+#ifndef NO_MORE_REGFREEE
+ if (valid_) {
+ regfree(&compiled_regex_);
+ regfree(&compiled_full_input_regex_);
+ }
+#endif
+}
+
+#ifndef NO_MORE_REGFREEE
+void Regex::compileRegex(regex_t& regex, const std::string& regex_string) const {
+ int err_code = regcomp(&regex, regex_string.c_str(), regex_mode_);
+ if (err_code) {
+ const size_t size = regerror(err_code, &regex, nullptr, 0);
+ std::vector<char> msg(size);
+ regerror(err_code, &regex, msg.data(), msg.size());
+ throw Exception(REGEX_EXCEPTION, std::string(msg.begin(), msg.end()));
+ }
+}
+#endif
+
+bool regexSearch(const std::string &pattern, const Regex& regex) {
+ if (!regex.valid_) {
+ return false;
+ }
+#ifdef NO_MORE_REGFREEE
+ return std::regex_search(pattern, regex.compiled_regex_);
+#else
+ std::vector<regmatch_t> match;
+ match.resize(getMaxGroupCountOfRegex(regex.regex_str_));
+ return regexec(&regex.compiled_regex_, pattern.c_str(), match.size(), match.data(), 0) == 0;
+#endif
+}
+
+bool regexSearch(const std::string &pattern, SMatch& match, const Regex& regex) {
+ if (!regex.valid_) {
+ return false;
+ }
+#ifdef NO_MORE_REGFREEE
+ return std::regex_search(pattern, match, regex.compiled_regex_);
+#else
+ match.clear();
+ std::vector<regmatch_t> regmatches;
+ regmatches.resize(getMaxGroupCountOfRegex(regex.regex_str_));
+ bool result = regexec(&regex.compiled_regex_, pattern.c_str(), regmatches.size(), regmatches.data(), 0) == 0;
+ match.pattern_ = pattern;
+ for (const auto& regmatch : regmatches) {
+ match.matches_.push_back(SMatch::Regmatch{regmatch, match.pattern_});
+ }
+ return result;
+#endif
+}
+
+bool regexMatch(const std::string &pattern, const Regex& regex) {
+ if (!regex.valid_) {
+ return false;
+ }
+#ifdef NO_MORE_REGFREEE
+ return std::regex_match(pattern, regex.compiled_regex_);
+#else
+ std::vector<regmatch_t> match;
+ match.resize(getMaxGroupCountOfRegex(regex.regex_str_));
+ return regexec(&regex.compiled_full_input_regex_, pattern.c_str(), match.size(), match.data(), 0) == 0;
+#endif
+}
+
+bool regexMatch(const std::string &pattern, SMatch& match, const Regex& regex) {
+ if (!regex.valid_) {
+ return false;
+ }
+#ifdef NO_MORE_REGFREEE
+ return std::regex_match(pattern, match, regex.compiled_regex_);
+#else
+ match.clear();
+ std::vector<regmatch_t> regmatches;
+ regmatches.resize(getMaxGroupCountOfRegex(regex.regex_str_));
+ bool result = regexec(&regex.compiled_full_input_regex_, pattern.c_str(), regmatches.size(), regmatches.data(), 0) == 0;
+ match.pattern_ = pattern;
+ for (const auto& regmatch : regmatches) {
+ match.matches_.push_back(SMatch::Regmatch{regmatch, match.pattern_});
+ }
+ return result;
+#endif
+}
+
+SMatch getLastRegexMatch(const std::string& pattern, const utils::Regex& regex) {
+#ifdef NO_MORE_REGFREEE
+ auto matches = std::sregex_iterator(pattern.begin(), pattern.end(), regex.compiled_regex_);
+ std::smatch last_match;
+ while (matches != std::sregex_iterator()) {
+ last_match = *matches;
+ matches = std::next(matches);
+ }
+ return last_match;
+#else
+ SMatch search_result;
+ SMatch last_match;
+ auto current_str = pattern;
+ while (regexSearch(current_str, search_result, regex)) {
+ last_match = search_result;
+ current_str = search_result.suffix();
+ }
+
+ auto diff = pattern.size() - last_match.pattern_.size();
+ last_match.pattern_ = pattern;
+ for (auto& match : last_match.matches_) {
+ if (match.match.rm_so >= 0) {
+ match.match.rm_so += diff;
+ match.match.rm_eo += diff;
+ }
+ }
+ return last_match;
+#endif
+}
+
+} // namespace org::apache::nifi::minifi::utils
diff --git a/libminifi/src/utils/StringUtils.cpp b/libminifi/src/utils/StringUtils.cpp
index a4ae0851c..8913963fb 100644
--- a/libminifi/src/utils/StringUtils.cpp
+++ b/libminifi/src/utils/StringUtils.cpp
@@ -474,16 +474,6 @@ std::string StringUtils::to_base64(const gsl::span<const std::byte> raw_data, bo
return buf;
}
-std::smatch StringUtils::getLastRegexMatch(const std::string& str, const std::regex& pattern) {
- auto matches = std::sregex_iterator(str.begin(), str.end(), pattern);
- std::smatch last_match;
- while (matches != std::sregex_iterator()) {
- last_match = *matches;
- matches = std::next(matches);
- }
- return last_match;
-}
-
std::string StringUtils::escapeUnprintableBytes(gsl::span<const std::byte> data) {
constexpr const char* hex_digits = "0123456789abcdef";
std::string result;
diff --git a/libminifi/test/unit/RegexUtilsTests.cpp b/libminifi/test/unit/RegexUtilsTests.cpp
new file mode 100644
index 000000000..efec1124d
--- /dev/null
+++ b/libminifi/test/unit/RegexUtilsTests.cpp
@@ -0,0 +1,129 @@
+/**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <string>
+#include <vector>
+
+#include "Exception.h"
+#include "utils/RegexUtils.h"
+#include "../TestBase.h"
+#include "../Catch.h"
+
+using org::apache::nifi::minifi::utils::Regex;
+using org::apache::nifi::minifi::Exception;
+namespace minifi = org::apache::nifi::minifi;
+
+TEST_CASE("TestRegexUtils::single_match", "[regex1]") {
+ std::string pat = "Speed limit 130 | Speed limit 80";
+ std::string rgx1 = "Speed limit ([0-9]+)";
+ std::vector<Regex::Mode> mode = {Regex::Mode::ICASE};
+ Regex r1(rgx1, mode);
+ REQUIRE(minifi::utils::regexSearch(pat, r1));
+}
+
+TEST_CASE("TestRegexUtils::invalid_construction", "[regex2]") {
+ std::string pat = "Speed limit 130 | Speed limit 80";
+ std::string rgx1 = "Speed limit ([0-9]+)";
+ std::string rgx2 = "[Invalid)A(F)";
+ std::vector<Regex::Mode> mode = {Regex::Mode::ICASE};
+ Regex r1(rgx1, mode);
+ REQUIRE_THROWS_WITH(Regex(rgx2, mode), Catch::Contains("Regex Operation"));
+}
+
+TEST_CASE("TestRegexUtils::empty_input", "[regex3]") {
+ std::string pat = "";
+ std::string rgx1 = "Speed limit ([0-9]+)";
+ std::string rgx2 = "";
+ std::string rgx3 = "(.*)";
+ std::vector<Regex::Mode> mode = {Regex::Mode::ICASE};
+ Regex r1(rgx1, mode);
+ REQUIRE(!minifi::utils::regexSearch(pat, r1));
+ Regex r2(rgx2, mode);
+ REQUIRE(minifi::utils::regexSearch(pat, r2));
+ REQUIRE(!minifi::utils::regexSearch("LMN", r1));
+ Regex r3(rgx3);
+ REQUIRE(minifi::utils::regexSearch(pat, r3));
+}
+
+TEST_CASE("TestRegexUtils::check_mode", "[regex4]") {
+ std::string pat = "Speed limit 130 | Speed limit 80";
+ std::string rgx1 = "sPeeD limIt ([0-9]+)";
+ Regex r1(rgx1);
+ REQUIRE(!minifi::utils::regexSearch(pat, r1));
+ std::vector<Regex::Mode> mode = {Regex::Mode::ICASE};
+ Regex r2(rgx1, mode);
+ REQUIRE(minifi::utils::regexSearch(pat, r2));
+}
+
+TEST_CASE("TestRegexUtils::regexMatch works correctly", "[matchesFullInput]") {
+ REQUIRE(minifi::utils::regexMatch("", Regex("")) == true);
+ REQUIRE(minifi::utils::regexMatch("input", Regex("")) == false);
+ REQUIRE(minifi::utils::regexMatch("input", Regex(".*")) == true);
+ REQUIRE(minifi::utils::regexMatch("input", Regex("np")) == false);
+ REQUIRE(minifi::utils::regexMatch("input", Regex(".*np.*")) == true);
+ REQUIRE(minifi::utils::regexMatch("input", Regex("(in|out)put")) == true);
+ REQUIRE(minifi::utils::regexMatch("input", Regex("inpu[aeiou]*")) == false);
+}
+
+TEST_CASE("TestRegexUtils::regexSearch works with groups", "[matchesFullInput]") {
+ std::string pat = "Speed limit 130 | Speed limit 80";
+ std::string rgx1 = "Speed limit ([0-9]+)";
+ Regex r1(rgx1);
+ minifi::utils::SMatch matches;
+ REQUIRE(minifi::utils::regexSearch(pat, matches, r1));
+ REQUIRE(matches.size() == 2);
+ REQUIRE(matches[0].str() == "Speed limit 130");
+ REQUIRE(matches[1].str() == "130");
+ REQUIRE(" | Speed limit 80" == matches.suffix().str());
+}
+
+TEST_CASE("TestRegexUtils::regexMatch works with groups", "[matchesFullInput]") {
+ std::string pat = "Speed limit 130 all the way";
+ std::string rgx1 = "Speed limit ([0-9]+) (.*)";
+ Regex r1(rgx1);
+ minifi::utils::SMatch matches;
+ REQUIRE(minifi::utils::regexMatch(pat, matches, r1));
+ REQUIRE(matches.size() == 3);
+ REQUIRE(matches[0].str() == "Speed limit 130 all the way");
+ REQUIRE(matches[1].str() == "130");
+ REQUIRE(matches[2].str() == "all the way");
+ REQUIRE("" == matches.suffix().str());
+}
+
+TEST_CASE("TestRegexUtils::getLastRegexMatch works correctly", "[getLastRegexMatch]") {
+ utils::Regex pattern("<[0-9]+>");
+ {
+ std::string content = "Foo";
+ auto last_match = minifi::utils::getLastRegexMatch(content, pattern);
+ REQUIRE_FALSE(last_match.ready());
+ }
+ {
+ std::string content = "<1> Foo";
+ auto last_match = minifi::utils::getLastRegexMatch(content, pattern);
+ REQUIRE(last_match.ready());
+ CHECK(last_match.length(0) == 3);
+ CHECK(last_match.position(0) == 0);
+ }
+ {
+ std::string content = "<1> Foo<2> Bar<3> Baz<10> Qux";
+ auto last_match = minifi::utils::getLastRegexMatch(content, pattern);
+ REQUIRE(last_match.ready());
+ CHECK(last_match.length(0) == 4);
+ CHECK(last_match.position(0) == 21);
+ }
+}
diff --git a/libminifi/test/unit/StringUtilsTests.cpp b/libminifi/test/unit/StringUtilsTests.cpp
index 799540162..46c4b1573 100644
--- a/libminifi/test/unit/StringUtilsTests.cpp
+++ b/libminifi/test/unit/StringUtilsTests.cpp
@@ -521,29 +521,6 @@ TEST_CASE("StringUtils::removeFramingCharacters works correctly", "[removeFramin
REQUIRE(utils::StringUtils::removeFramingCharacters("\"\"abba\"\"", '"') == "\"abba\"");
}
-TEST_CASE("StringUtils::getLastRegexMatch works correctly", "[getLastRegexMatch]") {
- std::regex pattern("<[0-9]+>");
- {
- std::string content = "Foo";
- auto last_match = StringUtils::getLastRegexMatch(content, pattern);
- REQUIRE_FALSE(last_match.ready());
- }
- {
- std::string content = "<1> Foo";
- auto last_match = StringUtils::getLastRegexMatch(content, pattern);
- REQUIRE(last_match.ready());
- CHECK(last_match.length(0) == 3);
- CHECK(last_match.position(0) == 0);
- }
- {
- std::string content = "<1> Foo<2> Bar<3> Baz<10> Qux";
- auto last_match = StringUtils::getLastRegexMatch(content, pattern);
- REQUIRE(last_match.ready());
- CHECK(last_match.length(0) == 4);
- CHECK(last_match.position(0) == 21);
- }
-}
-
// ignore terminating \0 character
template<size_t N>
gsl::span<const std::byte> from_cstring(const char (&str)[N]) {