You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@trafficserver.apache.org by wk...@apache.org on 2024/04/15 14:58:38 UTC

(trafficserver) branch master updated: Avoid including pcre2.h in Regex.h. (#11246)

This is an automated email from the ASF dual-hosted git repository.

wkaras pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/trafficserver.git


The following commit(s) were added to refs/heads/master by this push:
     new 08506ef708 Avoid including pcre2.h in Regex.h. (#11246)
08506ef708 is described below

commit 08506ef708b9bb05c6254be1a521f7d60f12e1cf
Author: Walt Karas <wk...@yahooinc.com>
AuthorDate: Mon Apr 15 10:58:32 2024 -0400

    Avoid including pcre2.h in Regex.h. (#11246)
---
 include/tsutil/Regex.h |  46 ++++++++++++--------
 src/tsutil/Regex.cc    | 115 +++++++++++++++++++++++++++----------------------
 2 files changed, 93 insertions(+), 68 deletions(-)

diff --git a/include/tsutil/Regex.h b/include/tsutil/Regex.h
index c4ca8feb03..44e10d53b2 100644
--- a/include/tsutil/Regex.h
+++ b/include/tsutil/Regex.h
@@ -28,14 +28,14 @@
 #include <vector>
 #include <memory>
 
-#define PCRE2_CODE_UNIT_WIDTH 8
-#include <pcre2.h>
-
 /// @brief Match flags for regular expression evaluation.
+///
+/// @internal These values are copied from pcre2.h, to avoid having to include it.  The values are checked (with
+/// static_assert) in Regex.cc against PCRE2 named constants, in case they change in future PCRE2 releases.
 enum REFlags {
-  RE_CASE_INSENSITIVE = PCRE2_CASELESS,  ///< Ignore case (default: case sensitive).
-  RE_UNANCHORED       = PCRE2_MULTILINE, ///< Unanchored (DFA defaults to anchored).
-  RE_ANCHORED         = PCRE2_ANCHORED,  ///< Anchored (Regex defaults to unanchored).
+  RE_CASE_INSENSITIVE = 0x00000008u, ///< Ignore case (default: case sensitive).
+  RE_UNANCHORED       = 0x00000400u, ///< Unanchored (DFA defaults to anchored).
+  RE_ANCHORED         = 0x80000000u, ///< Anchored (Regex defaults to unanchored).
 };
 
 /// @brief Wrapper for PCRE2 match data.
@@ -63,19 +63,25 @@ public:
   size_t *get_ovector_pointer();
   int32_t size() const;
 
-protected:
-  pcre2_match_data *get_match_data();
-  void set_subject(std::string_view subject);
-  void set_size(int32_t size);
-
 private:
   constexpr static uint32_t DEFAULT_MATCHES = 10;
   static void *malloc(size_t size, void *caller);
-  pcre2_match_data *_match_data = nullptr;
   std::string_view _subject;
   char _buffer[24 + 96 + 16 * DEFAULT_MATCHES]; // 24 bytes for the general context, 96 bytes overhead, 16 bytes per match.
   size_t _buffer_bytes_used = 0;
   int32_t _size             = 0;
+
+  /// @internal This effectively wraps a void* so that we can avoid requiring the pcre2.h include for the user of the Regex
+  /// API (see Regex.cc).
+  struct _MatchData;
+  class _MatchDataPtr
+  {
+    friend struct _MatchData;
+
+  private:
+    void *_ptr = nullptr;
+  };
+  _MatchDataPtr _match_data;
 };
 
 /// @brief Wrapper for PCRE2 regular expression.
@@ -135,11 +141,17 @@ public:
   int get_capture_count();
 
 private:
-  // @internal - Because the PCRE header is badly done, we can't forward declare the PCRE
-  // enough to use as pointers. For some reason the header defines in name only a struct and
-  // then aliases it to the standard name, rather than simply declare the latter in name only.
-  // The goal is completely wrap PCRE and not include that header in client code.
-  pcre2_code *_code = nullptr;
+  /// @internal This effectively wraps a void* so that we can avoid requiring the pcre2.h include for the user of the Regex
+  /// API (see Regex.cc).
+  struct _Code;
+  class _CodePtr
+  {
+    friend struct _Code;
+
+  private:
+    void *_ptr = nullptr;
+  };
+  _CodePtr _code;
 };
 
 /** Deterministic Finite state Automata container.
diff --git a/src/tsutil/Regex.cc b/src/tsutil/Regex.cc
index 37e4b4337b..fb2cddbb2e 100644
--- a/src/tsutil/Regex.cc
+++ b/src/tsutil/Regex.cc
@@ -23,11 +23,18 @@
 
 #include "tsutil/Regex.h"
 
+#define PCRE2_CODE_UNIT_WIDTH 8
+#include <pcre2.h>
+
 #include <array>
 #include <assert.h>
 #include <vector>
 #include <mutex>
 
+static_assert(RE_CASE_INSENSITIVE == PCRE2_CASELESS, "Update RE_CASE_INSERSITIVE for current PCRE2 version.");
+static_assert(RE_UNANCHORED == PCRE2_MULTILINE, "Update RE_MULTILINE for current PCRE2 version.");
+static_assert(RE_ANCHORED == PCRE2_ANCHORED, "Update RE_ANCHORED for current PCRE2 version.");
+
 //----------------------------------------------------------------------------
 namespace
 {
@@ -145,13 +152,27 @@ RegexContextCleanup::push_back(RegexContext *ctx)
 
 } // namespace
 
+//----------------------------------------------------------------------------
+struct RegexMatches::_MatchData {
+  static pcre2_match_data *
+  get(_MatchDataPtr const &p)
+  {
+    return static_cast<pcre2_match_data *>(p._ptr);
+  }
+  static void
+  set(_MatchDataPtr &p, pcre2_match_data *ptr)
+  {
+    p._ptr = ptr;
+  }
+};
+
 //----------------------------------------------------------------------------
 RegexMatches::RegexMatches(uint32_t size)
 {
   pcre2_general_context *ctx = pcre2_general_context_create(
     &RegexMatches::malloc, [](void *, void *) -> void {}, static_cast<void *>(this));
 
-  _match_data = pcre2_match_data_create(size, ctx);
+  _MatchData::set(_match_data, pcre2_match_data_create(size, ctx));
 }
 
 //----------------------------------------------------------------------------
@@ -175,8 +196,9 @@ RegexMatches::malloc(size_t size, void *caller)
 //----------------------------------------------------------------------------
 RegexMatches::~RegexMatches()
 {
-  if (_match_data != nullptr) {
-    pcre2_match_data_free(_match_data);
+  auto ptr = _MatchData::get(_match_data);
+  if (ptr != nullptr) {
+    pcre2_match_data_free(ptr);
   }
 }
 
@@ -184,7 +206,7 @@ RegexMatches::~RegexMatches()
 size_t *
 RegexMatches::get_ovector_pointer()
 {
-  return pcre2_get_ovector_pointer(_match_data);
+  return pcre2_get_ovector_pointer(_MatchData::get(_match_data));
 }
 
 //----------------------------------------------------------------------------
@@ -194,52 +216,46 @@ RegexMatches::size() const
   return _size;
 }
 
-//----------------------------------------------------------------------------
-pcre2_match_data *
-RegexMatches::get_match_data()
-{
-  return _match_data;
-}
-
-//----------------------------------------------------------------------------
-void
-RegexMatches::set_size(int32_t size)
-{
-  _size = size;
-}
-
-//----------------------------------------------------------------------------
-void
-RegexMatches::set_subject(std::string_view subject)
-{
-  _subject = subject;
-}
-
 //----------------------------------------------------------------------------
 std::string_view
 RegexMatches::operator[](size_t index) const
 {
   // check if the index is valid
-  if (index >= pcre2_get_ovector_count(_match_data)) {
+  if (index >= pcre2_get_ovector_count(_MatchData::get(_match_data))) {
     return std::string_view();
   }
 
-  PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(_match_data);
+  PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(_MatchData::get(_match_data));
   return std::string_view(_subject.data() + ovector[2 * index], ovector[2 * index + 1] - ovector[2 * index]);
 }
 
+//----------------------------------------------------------------------------
+struct Regex::_Code {
+  static pcre2_code *
+  get(_CodePtr const &p)
+  {
+    return static_cast<pcre2_code *>(p._ptr);
+  }
+  static void
+  set(_CodePtr &p, pcre2_code *ptr)
+  {
+    p._ptr = ptr;
+  }
+};
+
 //----------------------------------------------------------------------------
 Regex::Regex(Regex &&that) noexcept
 {
-  _code      = that._code;
-  that._code = nullptr;
+  _code = that._code;
+  _Code::set(that._code, nullptr);
 }
 
 //----------------------------------------------------------------------------
 Regex::~Regex()
 {
-  if (_code != nullptr) {
-    pcre2_code_free(_code);
+  auto ptr = _Code::get(_code);
+  if (ptr != nullptr) {
+    pcre2_code_free(ptr);
   }
 }
 
@@ -258,8 +274,8 @@ bool
 Regex::compile(std::string_view pattern, std::string &error, int &erroroffset, uint32_t flags)
 {
   // free the existing compiled regex if there is one
-  if (_code != nullptr) {
-    pcre2_code_free(_code);
+  if (auto ptr = _Code::get(_code); ptr != nullptr) {
+    pcre2_code_free(ptr);
   }
 
   // get the RegexContext instance - should only be null when shutting down
@@ -270,9 +286,9 @@ Regex::compile(std::string_view pattern, std::string &error, int &erroroffset, u
 
   PCRE2_SIZE error_offset;
   int error_code;
-  _code = pcre2_compile(reinterpret_cast<PCRE2_SPTR>(pattern.data()), pattern.size(), flags, &error_code, &error_offset,
-                        regex_context->get_compile_context());
-  if (!_code) {
+  auto code = pcre2_compile(reinterpret_cast<PCRE2_SPTR>(pattern.data()), pattern.size(), flags, &error_code, &error_offset,
+                            regex_context->get_compile_context());
+  if (!code) {
     erroroffset = error_offset;
 
     // get pcre2 error message
@@ -283,7 +299,9 @@ Regex::compile(std::string_view pattern, std::string &error, int &erroroffset, u
   }
 
   // support for JIT
-  pcre2_jit_compile(_code, PCRE2_JIT_COMPLETE);
+  pcre2_jit_compile(code, PCRE2_JIT_COMPLETE);
+
+  _Code::set(_code, code);
 
   return true;
 }
@@ -292,7 +310,7 @@ Regex::compile(std::string_view pattern, std::string &error, int &erroroffset, u
 bool
 Regex::exec(std::string_view subject) const
 {
-  if (_code == nullptr) {
+  if (_Code::get(_code) == nullptr) {
     return false;
   }
   RegexMatches matches;
@@ -305,28 +323,23 @@ Regex::exec(std::string_view subject) const
 int32_t
 Regex::exec(std::string_view subject, RegexMatches &matches) const
 {
-  // check if there is a compiled regex
-  if (_code == nullptr) {
-    return 0;
-  }
+  auto code = _Code::get(_code);
 
-  // get the RegexContext instance - should only be null when shutting down
-  RegexContext *regex_context = RegexContext::get_instance();
-  if (regex_context == nullptr) {
+  // check if there is a compiled regex
+  if (code == nullptr) {
     return 0;
   }
+  int count = pcre2_match(code, reinterpret_cast<PCRE2_SPTR>(subject.data()), subject.size(), 0, 0,
+                          RegexMatches::_MatchData::get(matches._match_data), RegexContext::get_instance()->get_match_context());
 
-  int count = pcre2_match(_code, reinterpret_cast<PCRE2_SPTR>(subject.data()), subject.size(), 0, 0, matches.get_match_data(),
-                          regex_context->get_match_context());
-
-  matches.set_size(count);
+  matches._size = count;
 
   if (count < 0) {
     return count;
   }
 
   if (count > 0) {
-    matches.set_subject(subject);
+    matches._subject = subject;
   }
 
   return count;
@@ -337,7 +350,7 @@ int32_t
 Regex::get_capture_count()
 {
   int captures = -1;
-  if (pcre2_pattern_info(_code, PCRE2_INFO_CAPTURECOUNT, &captures) != 0) {
+  if (pcre2_pattern_info(_Code::get(_code), PCRE2_INFO_CAPTURECOUNT, &captures) != 0) {
     return -1;
   }
   return captures;