You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@trafficserver.apache.org by bc...@apache.org on 2018/07/05 17:18:46 UTC
[trafficserver] 01/03: Search cache for urls matching the regex input

This is an automated email from the ASF dual-hosted git repository.

bcall pushed a commit to branch 8.0.x
in repository https://gitbox.apache.org/repos/asf/trafficserver.git

commit 0ba368a52a04f9bcf581a05df46b3013b50512c9
Author: Persia Aziz <pe...@yahoo-inc.com>
AuthorDate: Wed May 2 10:04:06 2018 -0500

    Search cache for urls matching the regex input
    
    (cherry picked from commit 9aaee09eafe6e386515e7ec219079a6c967c1819)
---
 src/traffic_cache_tool/CacheDefs.h  | 47 ++++++++++++++++++++++++++++++-------
 src/traffic_cache_tool/CacheScan.cc | 33 +++++++++++++++++---------
 src/traffic_cache_tool/CacheScan.h  | 13 +++++++---
 src/traffic_cache_tool/CacheTool.cc | 22 ++++++++++++-----
 4 files changed, 86 insertions(+), 29 deletions(-)

diff --git a/src/traffic_cache_tool/CacheDefs.h b/src/traffic_cache_tool/CacheDefs.h
index 6944a27..2ec599c 100644
--- a/src/traffic_cache_tool/CacheDefs.h
+++ b/src/traffic_cache_tool/CacheDefs.h
@@ -275,14 +275,45 @@ public:
 
 class DFA;
 // this class matches url of the format : scheme://hostname:port/path;params?query
+
 struct url_matcher {
-  // R"(^https?\:\/\/^[a-z A-Z 0-9]\.[a-z A-Z 0-9 \.]+)"
+  // Build a matcher from the file at `path`, which holds one regular expression per line.
+  // Blank lines are skipped and a trailing '\r' is stripped; `port` is compiled regardless of the file.
+  url_matcher(ts::FilePath const &path) // file contains a list of regex
+  {
+    if (port.compile(R"([0-9]+$)") != 0) {
+      std::cout << "Check your regular expression" << std::endl;
+    }
+    ts::BulkFile cfile(path);
+    if (cfile.load() != 0) {
+      std::cout << "Unable to load the regex file" << std::endl;
+      return;
+    }
+    ts::TextView fileContent = cfile.content();
+    std::vector<std::string> str_vec; // owns the pattern text while compile() runs
+    while (fileContent) {
+      ts::TextView line = fileContent.take_prefix_at('\n');
+      std::string reg_str(line.data(), line.size());
+      if (!reg_str.empty() && reg_str.back() == '\r') {
+        reg_str.pop_back();
+      }
+      if (!reg_str.empty()) {
+        std::cout << "regex input\n" << reg_str << std::endl;
+        str_vec.push_back(reg_str);
+      }
+    }
+    // Borrow pointers instead of leaking ats_strdup copies; assumes compile() keeps none - TODO confirm.
+    std::vector<const char *> patterns;
+    for (auto const &str : str_vec) {
+      patterns.push_back(str.c_str());
+    }
+    if (patterns.empty() || regex.compile(patterns.data(), static_cast<int>(patterns.size())) != 0) {
+      std::cout << "Check your regular expression" << std::endl;
+    }
+  }
+
   url_matcher()
   {
-    /*if (regex.compile(R"(^https?\:\/\/^[a-z A-Z 0-9][\. a-z A-Z 0-9 ]+(\:[0-9]\/)?.*))") != 0) {
-        std::cout<<"Check your regular expression"<<std::endl;
-    }*/
-    //  (\w+\:[\w\W]+\@)? (:[0-9]+)?(\/.*)
     if (regex.compile(R"(^(https?\:\/\/)") != 0) {
       std::cout << "Check your regular expression" << std::endl;
       return;
@@ -294,14 +325,14 @@ struct url_matcher {
   }
 
   ~url_matcher() {}
+
   uint8_t
   match(const char *hostname) const
   {
     if (regex.match(hostname) != -1) {
       return 1;
     }
-    //   if(url_with_user.match(hostname) != -1)
-    //       return 2;
+
     return 0;
   }
   uint8_t
@@ -310,8 +341,6 @@ struct url_matcher {
     if (port.match(hostname, length) != -1) {
       return 1;
     }
-    //   if(url_with_user.match(hostname) != -1)
-    //       return 2;
     return 0;
   }
 
diff --git a/src/traffic_cache_tool/CacheScan.cc b/src/traffic_cache_tool/CacheScan.cc
index 2cbd471..06be535 100644
--- a/src/traffic_cache_tool/CacheScan.cc
+++ b/src/traffic_cache_tool/CacheScan.cc
@@ -34,7 +34,7 @@ const int HTTP_ALT_MARSHAL_SIZE = ROUND(sizeof(HTTPCacheAlt), HDR_PTR_SIZE);
 namespace ct
 {
 Errata
-CacheScan::Scan()
+CacheScan::Scan(bool search)
 {
   int64_t guessed_size = 1048576; // 1M
   Errata zret;
@@ -63,7 +63,7 @@ CacheScan::Scan()
             std::cout << "Failed to read content from the Stripe.  " << strerror(errno) << std::endl;
           } else {
             Doc *doc = reinterpret_cast<Doc *>(stripe_buff2);
-            get_alternates(doc->hdr(), doc->hlen);
+            get_alternates(doc->hdr(), doc->hlen, search);
           }
           dir_bitset[dir_to_offset(e, seg)] = true;
           e                                 = next_dir(e, seg);
@@ -125,12 +125,12 @@ CacheScan::unmarshal(MIMEFieldBlockImpl *mf, intptr_t offset)
 {
   Errata zret;
   HDR_UNMARSHAL_PTR(mf->m_next, MIMEFieldBlockImpl, offset);
-
+  ts::MemSpan mf_mem((char *)mf, mf->m_length);
   for (uint32_t index = 0; index < mf->m_freetop; index++) {
     MIMEField *field = &(mf->m_field_slots[index]);
 
     // check if out of bounds
-    if (((char *)field - (char *)mf) > mf->m_length) {
+    if (!mf_mem.contains((char *)field)) {
       zret.push(0, 0, "Out of bounds memory in the deserialized MIMEFieldBlockImpl");
       return zret;
     }
@@ -358,7 +358,7 @@ CacheScan::check_url(ts::MemSpan &mem, URLImpl *url)
 }
 
 Errata
-CacheScan::get_alternates(const char *buf, int length)
+CacheScan::get_alternates(const char *buf, int length, bool search)
 {
   Errata zret;
   ink_assert(!(((intptr_t)buf) & 3)); // buf must be aligned
@@ -378,7 +378,7 @@ CacheScan::get_alternates(const char *buf, int length)
       } else if (!a->m_request_hdr.m_http) {
         std::cerr << "no http object found in the request header object" << std::endl;
         return zret;
-      } else if (((char *)a->m_request_hdr.m_http - buf) > length) {
+      } else if (!doc_mem.contains((char *)a->m_request_hdr.m_http)) {
         std::cerr << "out of bounds request header in the alternate" << std::endl;
         return zret;
       }
@@ -386,12 +386,23 @@ CacheScan::get_alternates(const char *buf, int length)
       auto *url = a->m_request_hdr.m_http->u.req.m_url_impl;
       if (check_url(doc_mem, url)) {
         std::string str;
-        ts::bwprint(str, "stripe: {} : {}://{}:{}/{};{}?{}", std::string_view(this->stripe->hashText),
-                    std::string_view(url->m_ptr_scheme, url->m_len_scheme), std::string_view(url->m_ptr_host, url->m_len_host),
-                    std::string_view(url->m_ptr_port, url->m_len_port), std::string_view(url->m_ptr_path, url->m_len_path),
-                    std::string_view(url->m_ptr_params, url->m_len_params), std::string_view(url->m_ptr_query, url->m_len_query));
 
-        std::cout << str << std::endl;
+        if (search) {
+          ts::bwprint(str, "{}://{}:{}/{};{}?{}", std::string_view(url->m_ptr_scheme, url->m_len_scheme),
+                      std::string_view(url->m_ptr_host, url->m_len_host), std::string_view(url->m_ptr_port, url->m_len_port),
+                      std::string_view(url->m_ptr_path, url->m_len_path), std::string_view(url->m_ptr_params, url->m_len_params),
+                      std::string_view(url->m_ptr_query, url->m_len_query));
+          if (u_matcher->match(str.data())) {
+            str = this->stripe->hashText + " " + str;
+            std::cout << "match found " << str << std::endl;
+          }
+        } else {
+          ts::bwprint(str, "stripe: {} : {}://{}:{}/{};{}?{}", std::string_view(this->stripe->hashText),
+                      std::string_view(url->m_ptr_scheme, url->m_len_scheme), std::string_view(url->m_ptr_host, url->m_len_host),
+                      std::string_view(url->m_ptr_port, url->m_len_port), std::string_view(url->m_ptr_path, url->m_len_path),
+                      std::string_view(url->m_ptr_params, url->m_len_params), std::string_view(url->m_ptr_query, url->m_len_query));
+          std::cout << str << std::endl;
+        }
       } else {
         std::cerr << "The retrieved url object is invalid" << std::endl;
       }
diff --git a/src/traffic_cache_tool/CacheScan.h b/src/traffic_cache_tool/CacheScan.h
index 2a359fb..3a7cdff 100644
--- a/src/traffic_cache_tool/CacheScan.h
+++ b/src/traffic_cache_tool/CacheScan.h
@@ -40,12 +40,19 @@ namespace ct
 class CacheScan
 {
   Stripe *stripe;
+  url_matcher *u_matcher;
 
 public:
-  CacheScan(Stripe *str) : stripe(str){};
+  // Scanner for `str`. A url_matcher is created only when path.has_path() is true;
+  // otherwise u_matcher is set to null instead of being left uninitialized.
+  // NOTE(review): u_matcher is never deleted in the code visible here - confirm ownership.
+  CacheScan(Stripe *str, ts::FilePath const &path)
+    : stripe(str), u_matcher(path.has_path() ? new url_matcher(path) : nullptr) {}
+  // Scanner without a regex file; u_matcher is null.
+  CacheScan(Stripe *str) : stripe(str), u_matcher(nullptr) {}
+  Errata Scan(bool search = false);
+  Errata get_alternates(const char *buf, int length, bool search);
   int unmarshal(HdrHeap *hh, int buf_length, int obj_type, HdrHeapObjImpl **found_obj, RefCountObj *block_ref);
-  Errata Scan();
-  Errata get_alternates(const char *buf, int length);
   Errata unmarshal(char *buf, int len, RefCountObj *block_ref);
   Errata unmarshal(HTTPHdrImpl *obj, intptr_t offset);
   Errata unmarshal(URLImpl *obj, intptr_t offset);
diff --git a/src/traffic_cache_tool/CacheTool.cc b/src/traffic_cache_tool/CacheTool.cc
index 7b1f4cb..c59f355 100644
--- a/src/traffic_cache_tool/CacheTool.cc
+++ b/src/traffic_cache_tool/CacheTool.cc
@@ -749,6 +749,7 @@ Span::loadDevice()
           }
           _len = _header->num_blocks;
         } else {
+          zret = Errata::Message(0, 0, _path, " header is uninitialized or invalid");
           std::cout << "Span: " << _path << " header is uninitialized or invalid" << std::endl;
           _len = round_down(_geometry.totalsz) - _base;
         }
@@ -1351,26 +1352,35 @@ Get_Response(FilePath const &input_file_path)
   return zret;
 }
 
-void static scan_span(Span *span)
+void static scan_span(Span *span, ts::FilePath const &regex_path)
 {
   for (auto strp : span->_stripes) {
     strp->loadMeta();
     strp->loadDir();
-    CacheScan cs(strp);
-    cs.Scan();
+
+    if (regex_path.has_path()) {
+      CacheScan cs(strp, regex_path);
+      cs.Scan(true);
+    } else {
+      CacheScan cs(strp);
+      cs.Scan(false);
+    }
   }
 }
 
 Errata
-Scan_Cache()
+Scan_Cache(ts::FilePath const &regex_path)
 {
   Errata zret;
   Cache cache;
   std::vector<std::thread> threadPool;
   if ((zret = cache.loadSpan(SpanFile))) {
+    if (zret.size()) {
+      return zret;
+    }
     cache.dumpSpans(Cache::SpanDumpDepth::SPAN);
     for (auto sp : cache._spans) {
-      threadPool.emplace_back(scan_span, sp); // move constructor is necessary since std::thread is non copyable
+      threadPool.emplace_back(scan_span, sp, regex_path);
     }
     for (auto &th : threadPool)
       th.join();
@@ -1443,7 +1453,7 @@ main(int argc, char *argv[])
   Commands.add(std::string("init"), std::string(" Initializes uninitialized span"),
                [&](int, char *argv[]) { return Init_disk(input_url_file); });
   Commands.add(std::string("scan"), std::string(" Scans the whole cache and lists the urls of the cached contents"),
-               [&](int, char *argv[]) { return Scan_Cache(); });
+               [&](int, char *argv[]) { return Scan_Cache(input_url_file); });
   Commands.setArgIndex(optind);
 
   if (help) {