You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@nifi.apache.org by GitBox <gi...@apache.org> on 2019/04/08 10:09:29 UTC

[GitHub] [nifi-minifi-cpp] arpadboda commented on a change in pull request #533: MINIFICPP-726 - Enhance ExtractText to have more feature parity with …

arpadboda commented on a change in pull request #533: MINIFICPP-726 - Enhance ExtractText to have more feature parity with …
URL: https://github.com/apache/nifi-minifi-cpp/pull/533#discussion_r272972983
 
 

 ##########
 File path: libminifi/src/processors/ExtractText.cpp
 ##########
 @@ -104,7 +143,64 @@ int64_t ExtractText::ReadCallback::process(std::shared_ptr<io::BaseStream> strea
     }
   }
 
-  flowFile_->setAttribute(attrKey, contentStream.str());
+  if(regex_mode) {
+    std::regex_constants::syntax_option_type regex_mode = std::regex_constants::ECMAScript;
+
+    bool insensitive = InsensitiveMatch.getDefaultValue();
+    if(ctx_->getProperty(InsensitiveMatch.getName(), insensitive) && insensitive) {
+      regex_mode |= std::regex_constants::icase;
+    }
+
+    bool ignoregroupzero = IgnoreCaptureGroupZero.getDefaultValue();
+    ctx_->getProperty(IgnoreCaptureGroupZero.getName(), ignoregroupzero);
+
+    bool repeatingcapture = EnableRepeatingCaptureGroup.getDefaultValue();
+    ctx_->getProperty(EnableRepeatingCaptureGroup.getName(), repeatingcapture);
+
+    int maxCaptureSize = MaxCaptureGroupLen.getDefaultValue();
+    ctx_->getProperty(MaxCaptureGroupLen.getName(), maxCaptureSize);
+
+    std::string contentStr = contentStream.str();
+
+    std::map<std::string, std::string> regexAttributes;
+
+    for (const auto& k : ctx_->getDynamicPropertyKeys()){
+      std::string value;
+      ctx_->getDynamicProperty(k, value);
+
+      std::regex rgx(value, regex_mode);
+
+      std::smatch matches;
+
+      std::string workStr = contentStr;
+
+      int matchcount = 0;
+
+      while(std::regex_search(workStr, matches, rgx)) {
+        size_t i = ignoregroupzero ? 1 : 0;
+
+        for (; i < matches.size(); ++i, ++matchcount) {
+          std::string value = matches[i].str();
+          if(value.length() > maxCaptureSize) {
+            value = value.substr(0, maxCaptureSize);
+          }
+          if(matchcount == 0) {
+            regexAttributes[k] = value;
+          }
+          regexAttributes[k + '.' + std::to_string(matchcount)] = value;
+        }
+        if(!repeatingcapture) {
+          break;
+        }
+        workStr = matches.suffix();
+      }
+    }
+    for(const auto& kv : regexAttributes) {
+      flowFile_->setAttribute(kv.first, kv.second);
+    }
 
 Review comment:
   The only thing missing here is routing to "unmatched" in case there is no match.
   However, I think that would be a breaking change, so I would prefer to do that in the scope of a follow-up scheduled for 1.0.
   This change can be part of the next minor release. 
   @apiri, do you agree, or do you have a different opinion?

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services