You are viewing a plain text version of this content. The canonical link for it is here.
Posted to cvs@httpd.apache.org by ni...@apache.org on 2010/09/21 20:42:21 UTC

svn commit: r999533 - in /httpd/httpd/trunk: CHANGES include/ap_regex.h server/Makefile.in server/util_pcre.c server/util_regex.c

Author: niq
Date: Tue Sep 21 18:42:20 2010
New Revision: 999533

URL: http://svn.apache.org/viewvc?rev=999533&view=rev
Log:
Introduce ap_rxplus class: higher-level regexps supporting perl-style
regexp operations.

Added:
    httpd/httpd/trunk/server/util_regex.c
Modified:
    httpd/httpd/trunk/CHANGES
    httpd/httpd/trunk/include/ap_regex.h
    httpd/httpd/trunk/server/Makefile.in
    httpd/httpd/trunk/server/util_pcre.c

Modified: httpd/httpd/trunk/CHANGES
URL: http://svn.apache.org/viewvc/httpd/httpd/trunk/CHANGES?rev=999533&r1=999532&r2=999533&view=diff
==============================================================================
--- httpd/httpd/trunk/CHANGES [utf-8] (original)
+++ httpd/httpd/trunk/CHANGES [utf-8] Tue Sep 21 18:42:20 2010
@@ -64,6 +64,10 @@ Changes with Apache 2.3.9
      and sub-directories of matched directories are no longer implicitly
      matched.  PR49809 [Eric Covener]
 
+  *) Regexps: introduce new higher-level regexp utility including parsing
+     and executing perl-style regexp ops (e.g s/foo/bar/i) and regexp memory
+     [Nick Kew]
+
 Changes with Apache 2.3.8
 
   *) suexec: Support large log files. PR 45856. [Stefan Fritsch]

Modified: httpd/httpd/trunk/include/ap_regex.h
URL: http://svn.apache.org/viewvc/httpd/httpd/trunk/include/ap_regex.h?rev=999533&r1=999532&r2=999533&view=diff
==============================================================================
--- httpd/httpd/trunk/include/ap_regex.h (original)
+++ httpd/httpd/trunk/include/ap_regex.h Tue Sep 21 18:42:20 2010
@@ -63,7 +63,7 @@ POSSIBILITY OF SUCH DAMAGE.
 extern "C" {
 #endif
 
-/* Options for ap_regexec: */
+/* Options for ap_regcomp, ap_regexec, and ap_rxplus versions: */
 
 #define AP_REG_ICASE    0x01 /** use a case-insensitive match */
 #define AP_REG_NEWLINE  0x02 /** don't match newlines against '.' etc */
@@ -73,6 +73,10 @@ extern "C" {
 #define AP_REG_EXTENDED (0)  /** unused */
 #define AP_REG_NOSUB    (0)  /** unused */
 
+#define AP_REG_MULTI 0x10    /* perl's /g (needs fixing) */
+#define AP_REG_NOMEM 0x20    /* nomem in our code */
+#define AP_REG_DOTALL 0x40   /* perl's /s flag */
+
 /* Error values: */
 enum {
   AP_REG_ASSERT = 1,  /** internal error ? */
@@ -134,6 +138,80 @@ AP_DECLARE(apr_size_t) ap_regerror(int e
  */
 AP_DECLARE(void) ap_regfree(ap_regex_t *preg);
 
+/* ap_rxplus: higher-level regexps */
+
+typedef struct {
+    ap_regex_t rx;
+    apr_uint32_t flags;
+    const char *subs;
+    const char *match;
+    apr_size_t nmatch;
+    ap_regmatch_t *pmatch;
+} ap_rxplus_t;
+
+/**
+ * Compile a pattern into a regexp.
+ * supports perl-like formats
+ *    match-string
+ *    /match-string/flags
+ *    s/match-string/replacement-string/flags
+ *    Intended to support more perl-like stuff as and when round tuits happen
+ * match-string is anything supported by ap_regcomp
+ * replacement-string is a substitution string as supported in ap_pregsub
+ * flags should correspond with perl syntax: treat failure to do so as a bug
+ *                                           (documentation TBD)
+ * @param pool Pool to allocate from
+ * @param pattern Pattern to compile
+ * @return Compiled regexp, or NULL in case of compile/syntax error
+ */
+AP_DECLARE(ap_rxplus_t*) ap_rxplus_compile(apr_pool_t *pool, const char *pattern);
+/**
+ * Apply a regexp operation to a string.
+ * @param pool Pool to allocate from
+ * @param rx The regex match to apply
+ * @param pattern The string to apply it to
+ *                NOTE: This MUST be kept in scope to use regexp memory
+ * @param newpattern The modified string (ignored if the operation doesn't
+ *                                        modify the string)
+ * @return Number of times a match happens.  Normally 0 (no match) or 1
+ *         (match found), but may be greater if a transforming pattern
+ *         is applied with the 'g' flag.
+ */
+AP_DECLARE(int) ap_rxplus_exec(apr_pool_t *pool, ap_rxplus_t *rx,
+                               const char *pattern, char **newpattern);
+#ifdef DOXYGEN
+/**
+ * Number of matches in the regexp operation's memory
+ * This may be 0 if no match is in memory, or up to nmatch from compilation
+ * @param rx The regexp
+ * @return Number of matches in memory
+ */
+AP_DECLARE(int) ap_rxplus_nmatch(ap_rxplus_t *rx);
+#else
+#define ap_rxplus_nmatch(rx) (((rx)->match != NULL) ? (rx)->nmatch : 0)
+#endif
+/**
+ * Get a pointer to a match from regex memory
+ * NOTE: this relies on the match pattern from the last call to
+ *       ap_rxplus_exec still being valid (i.e. not freed or out-of-scope)
+ * @param rx The regexp
+ * @param n The match number to retrieve (must be between 0 and nmatch)
+ * @param len Returns the length of the match.
+ * @param match Returns the match pattern
+ */
+AP_DECLARE(void) ap_rxplus_match(ap_rxplus_t *rx, int n, int *len,
+                                 const char **match);
+/**
+ * Get a match from regex memory in a string copy
+ * NOTE: this relies on the match pattern from the last call to
+ *       ap_rxplus_exec still being valid (i.e. not freed or out-of-scope)
+ * @param pool Pool to allocate from
+ * @param rx The regexp
+ * @param n The match number to retrieve (must be between 0 and nmatch)
+ * @return The matched string
+ */
+AP_DECLARE(char*) ap_rxplus_pmatch(apr_pool_t *pool, ap_rxplus_t *rx, int n);
+
 #ifdef __cplusplus
 }   /* extern "C" */
 #endif

Modified: httpd/httpd/trunk/server/Makefile.in
URL: http://svn.apache.org/viewvc/httpd/httpd/trunk/server/Makefile.in?rev=999533&r1=999532&r2=999533&view=diff
==============================================================================
--- httpd/httpd/trunk/server/Makefile.in (original)
+++ httpd/httpd/trunk/server/Makefile.in Tue Sep 21 18:42:20 2010
@@ -12,7 +12,7 @@ LTLIBRARY_SOURCES = \
 	util_script.c util_md5.c util_cfgtree.c util_ebcdic.c util_time.c \
 	connection.c listen.c util_mutex.c mpm_common.c mpm_unix.c \
 	util_charset.c util_cookies.c util_debug.c util_xml.c \
-	util_expr.c util_filter.c util_pcre.c exports.c \
+	util_expr.c util_filter.c util_pcre.c util_regex.c exports.c \
 	scoreboard.c error_bucket.c protocol.c core.c request.c provider.c \
 	eoc_bucket.c eor_bucket.c core_filters.c
 

Modified: httpd/httpd/trunk/server/util_pcre.c
URL: http://svn.apache.org/viewvc/httpd/httpd/trunk/server/util_pcre.c?rev=999533&r1=999532&r2=999533&view=diff
==============================================================================
--- httpd/httpd/trunk/server/util_pcre.c (original)
+++ httpd/httpd/trunk/server/util_pcre.c Tue Sep 21 18:42:20 2010
@@ -128,6 +128,7 @@ int options = 0;
 
 if ((cflags & AP_REG_ICASE) != 0) options |= PCRE_CASELESS;
 if ((cflags & AP_REG_NEWLINE) != 0) options |= PCRE_MULTILINE;
+if ((cflags & AP_REG_DOTALL) != 0) options |= PCRE_DOTALL;
 
 preg->re_pcre = pcre_compile(pattern, options, &errorptr, &erroffset, NULL);
 preg->re_erroffset = erroffset;

Added: httpd/httpd/trunk/server/util_regex.c
URL: http://svn.apache.org/viewvc/httpd/httpd/trunk/server/util_regex.c?rev=999533&view=auto
==============================================================================
--- httpd/httpd/trunk/server/util_regex.c (added)
+++ httpd/httpd/trunk/server/util_regex.c Tue Sep 21 18:42:20 2010
@@ -0,0 +1,261 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "apr.h"
+#include "apr_lib.h"
+#include "apr_pools.h"
+#include "apr_strings.h"
+#include "ap_config.h"
+#include "ap_regex.h"
+#include "httpd.h"
+
+AP_DECLARE(ap_rxplus_t*) ap_rxplus_compile(apr_pool_t *pool,
+                                           const char *pattern)
+{
+    /* perl style patterns
+     * add support for more as and when wanted
+     * substitute: s/rx/subs/
+     * match: m/rx/ or just /rx/
+     */
+
+    /* allow any nonalnum delimiter as first or second char.
+     * If we ever use this with non-string pattern we'll need an extra check
+     */
+    const char *endp = 0;
+    const char *str = pattern;
+    const char *rxstr;
+    ap_rxplus_t *ret = apr_pcalloc(pool, sizeof(ap_rxplus_t));
+    char delim = 0;
+    enum { SUBSTITUTE = 's', MATCH = 'm'} action = MATCH;
+    if (!apr_isalnum(pattern[0])) {
+        delim = *str++;
+    }
+    else if (pattern[0] == 's' && !apr_isalnum(pattern[1])) {
+        action = SUBSTITUTE;
+        delim = pattern[1];
+        str += 2;
+    }
+    else if (pattern[0] == 'm' && !apr_isalnum(pattern[1])) {
+        delim = pattern[1];
+        str += 2;
+    }
+    /* TODO: support perl's after/before */
+    /* FIXME: fix these simplminded delims */
+
+    /* we think there's a delimiter.  Allow for it not to be if unmatched */
+    if (delim) {
+        endp = ap_strchr_c(str, delim);
+    }
+    if (!endp) { /* there's no delim  or flags */
+        if (ap_regcomp(&ret->rx, pattern, 0) == 0) {
+            apr_pool_cleanup_register(pool, &ret->rx, (void*) ap_regfree,
+                                      apr_pool_cleanup_null);
+            return ret;
+        }
+        else {
+            return NULL;
+        }
+    }
+
+    /* We have a delimiter.  Use it to extract the regexp */
+    rxstr = apr_pstrndup(pool, str, endp-str);
+
+    /* If it's a substitution, we need the replacement string
+     * TODO: possible future enhancement - support other parsing
+     * in the replacement string.
+     */
+    if (action == SUBSTITUTE) {
+        str = endp+1;
+        if (!*str || (endp = ap_strchr_c(str, delim), !endp)) {
+            /* missing replacement string is an error */
+            return NULL;
+        }
+        ret->subs = apr_pstrndup(pool, str, (endp-str));
+    }
+
+    /* anything after the current delimiter is flags */
+    while (*++endp) {
+        switch (*endp) {
+        case 'i': ret->flags |= AP_REG_ICASE; break;
+        case 'm': ret->flags |= AP_REG_NEWLINE; break;
+        case 'n': ret->flags |= AP_REG_NOMEM; break;
+        case 'g': ret->flags |= AP_REG_MULTI; break;
+        case 's': ret->flags |= AP_REG_DOTALL; break;
+        case '^': ret->flags |= AP_REG_NOTBOL; break;
+        case '$': ret->flags |= AP_REG_NOTEOL; break;
+        default: break; /* we should probably be stricter here */
+        }
+    }
+    if (ap_regcomp(&ret->rx, rxstr, ret->flags) == 0) {
+        apr_pool_cleanup_register(pool, &ret->rx, (void*) ap_regfree,
+                                  apr_pool_cleanup_null);
+    }
+    else {
+        return NULL;
+    }
+    if (!(ret->flags & AP_REG_NOMEM)) {
+        /* count size of memory required, starting at 1 for the whole-match
+         * Simpleminded should be fine 'cos regcomp already checked syntax
+         */
+        ret->nmatch = 1;
+        while (*rxstr) {
+            switch (*rxstr++) {
+            case '\\':  /* next char is escaped - skip it */
+                if (*rxstr != 0) {
+                    ++rxstr;
+                }
+                break;
+            case '(':   /* unescaped bracket implies memory */
+                ++ret->nmatch;
+                break;
+            default:
+                break;
+            }
+        }
+        ret->pmatch = apr_palloc(pool, ret->nmatch*sizeof(ap_regmatch_t));
+    }
+    return ret;
+}
+
+AP_DECLARE(int) ap_rxplus_exec(apr_pool_t *pool, ap_rxplus_t *rx,
+                               const char *pattern, char **newpattern)
+                               //int max_iterations)
+{
+#if 1
+    int ret = 1;
+    int startl, oldl, newl, diffsz;
+    const char *remainder;
+    char *subs;
+/* snrf process_regexp from mod_headers */
+    if (ap_regexec(&rx->rx, pattern, rx->nmatch, rx->pmatch, rx->flags) != 0) {
+        rx->match = NULL;
+        return 0; /* no match, nothing to do */
+    }
+    rx->match = pattern;
+    if (rx->subs) {
+        *newpattern = ap_pregsub(pool, rx->subs, pattern,
+                                 rx->nmatch, rx->pmatch);
+        if (!*newpattern) {
+            return 0; /* FIXME - should we do more to handle error? */
+        }
+        startl = rx->pmatch[0].rm_so;
+        oldl = rx->pmatch[0].rm_eo - startl;
+        newl = strlen(*newpattern);
+        diffsz = newl - oldl;
+        remainder = pattern + startl + oldl;
+        if (rx->flags & AP_REG_MULTI) {
+            /* recurse to do any further matches */
+            char *subs;
+            ret += ap_rxplus_exec(pool, rx, remainder, &subs);
+            if (ret > 1) {
+                /* a further substitution happened */
+                diffsz += strlen(subs) - strlen(remainder);
+                remainder = subs;
+            }
+        }
+        subs  = apr_palloc(pool, strlen(pattern) + 1 + diffsz);
+        memcpy(subs, pattern, startl);
+        memcpy(subs+startl, *newpattern, newl);
+        strcpy(subs+startl+newl, remainder);
+        *newpattern = subs;
+    }
+    return ret;
+        
+
+
+
+#else
+
+
+
+
+
+
+
+
+
+
+
+
+    if (!(rx->flags & AP_REG_MULTI) || (rx->subs == NULL)) {
+        max_iterations = 1;
+    }
+    /* FIXME: multi-matching is incorrect */
+    while (max_iterations-- > 0) {
+        if (ap_regexec(&rx->rx, pattern, rx->nmatch, rx->pmatch, rx->flags)
+            == 0) {
+            ret++;
+            if (rx->subs) {
+                rx->match = pattern;
+                *newpattern = ap_pregsub(pool, rx->subs, pattern,
+                                         rx->nmatch, rx->pmatch);
+                pattern = *newpattern;
+                if (pattern == NULL) {
+                    max_iterations = 0;
+                }
+            }
+        }
+        else {
+            max_iterations = 0;
+        }
+    }
+
+    if (ret == 0 || rx->flags&AP_REG_NOMEM) {
+        rx->match = NULL;  /* no match, so don't pretend to remember a match */
+    }
+    else {
+#if 0
+        /* FIXME - should we be 'safe' and take the performance hit,
+         * or just document thou-shalt-keep-pattern-in-scope?
+         */
+        if (rx->match == inpattern) {
+            rx->match = apr_pstrdup(pool, inpattern);
+        }
+#endif
+    }
+    return ret;
+#endif
+}
+#ifdef DOXYGEN
+AP_DECLARE(int) ap_rxplus_nmatch(ap_rxplus_t *rx)
+{
+    return (rx->match != NULL) ? rx->nmatch : 0;
+}
+#endif
+
+/* If this blows up on you, see the notes in the header/apidoc
+ * rx->match is a pointer and it's your responsibility to ensure
+ * it hasn't gone out-of-scope since the last ap_rxplus_exec
+ */
+AP_DECLARE(void) ap_rxplus_match(ap_rxplus_t *rx, int n, int *len,
+                                 const char **match)
+{
+    if (n >= 0 && n < ap_rxplus_nmatch(rx)) {
+        *match = rx->match + rx->pmatch[n].rm_so;
+        *len = rx->pmatch[n].rm_eo - rx->pmatch[n].rm_so;
+    }
+    else {
+        *len = -1;
+        *match = NULL;
+    }
+}
+AP_DECLARE(char*) ap_rxplus_pmatch(apr_pool_t *pool, ap_rxplus_t *rx, int n)
+{
+    int len;
+    const char *match;
+    ap_rxplus_match(rx, n, &len, &match);
+    return (match != NULL) ? apr_pstrndup(pool, match, len) : NULL;
+}



Re: svn commit: r999533 - in /httpd/httpd/trunk: CHANGES include/ap_regex.h server/Makefile.in server/util_pcre.c server/util_regex.c

Posted by Ruediger Pluem <rp...@apache.org>.

On 09/22/2010 08:23 AM, Ruediger Pluem wrote:
> 
> On 09/21/2010 08:42 PM, niq@apache.org wrote:
>> Author: niq
>> Date: Tue Sep 21 18:42:20 2010
>> New Revision: 999533
>>
>> URL: http://svn.apache.org/viewvc?rev=999533&view=rev
>> Log:
>> Introduce ap_rxplus class: higher-level regexps supporting perl-style
>> regexp operations.
>>
>> Added:
>>     httpd/httpd/trunk/server/util_regex.c
>> Modified:
>>     httpd/httpd/trunk/CHANGES
>>     httpd/httpd/trunk/include/ap_regex.h
>>     httpd/httpd/trunk/server/Makefile.in
>>     httpd/httpd/trunk/server/util_pcre.c
>>
> 
>> Added: httpd/httpd/trunk/server/util_regex.c
>> URL: http://svn.apache.org/viewvc/httpd/httpd/trunk/server/util_regex.c?rev=999533&view=auto
>> ==============================================================================
>> --- httpd/httpd/trunk/server/util_regex.c (added)
>> +++ httpd/httpd/trunk/server/util_regex.c Tue Sep 21 18:42:20 2010
> 
>> +
>> +AP_DECLARE(int) ap_rxplus_exec(apr_pool_t *pool, ap_rxplus_t *rx,
>> +                               const char *pattern, char **newpattern)
>> +                               //int max_iterations)
> 
> 
> This looks like a C++ style comment.


Never mind. I see now that all this stuff is already addressed.

Regards

Rüdiger


Re: svn commit: r999533 - in /httpd/httpd/trunk: CHANGES include/ap_regex.h server/Makefile.in server/util_pcre.c server/util_regex.c

Posted by Ruediger Pluem <rp...@apache.org>.

On 09/21/2010 08:42 PM, niq@apache.org wrote:
> Author: niq
> Date: Tue Sep 21 18:42:20 2010
> New Revision: 999533
> 
> URL: http://svn.apache.org/viewvc?rev=999533&view=rev
> Log:
> Introduce ap_rxplus class: higher-level regexps supporting perl-style
> regexp operations.
> 
> Added:
>     httpd/httpd/trunk/server/util_regex.c
> Modified:
>     httpd/httpd/trunk/CHANGES
>     httpd/httpd/trunk/include/ap_regex.h
>     httpd/httpd/trunk/server/Makefile.in
>     httpd/httpd/trunk/server/util_pcre.c
> 

> Added: httpd/httpd/trunk/server/util_regex.c
> URL: http://svn.apache.org/viewvc/httpd/httpd/trunk/server/util_regex.c?rev=999533&view=auto
> ==============================================================================
> --- httpd/httpd/trunk/server/util_regex.c (added)
> +++ httpd/httpd/trunk/server/util_regex.c Tue Sep 21 18:42:20 2010

> +
> +AP_DECLARE(int) ap_rxplus_exec(apr_pool_t *pool, ap_rxplus_t *rx,
> +                               const char *pattern, char **newpattern)
> +                               //int max_iterations)


This looks like a C++ style comment.

> +{
> +#if 1
> +    int ret = 1;
> +    int startl, oldl, newl, diffsz;
> +    const char *remainder;
> +    char *subs;
> +/* snrf process_regexp from mod_headers */
> +    if (ap_regexec(&rx->rx, pattern, rx->nmatch, rx->pmatch, rx->flags) != 0) {
> +        rx->match = NULL;
> +        return 0; /* no match, nothing to do */
> +    }
> +    rx->match = pattern;
> +    if (rx->subs) {
> +        *newpattern = ap_pregsub(pool, rx->subs, pattern,
> +                                 rx->nmatch, rx->pmatch);
> +        if (!*newpattern) {
> +            return 0; /* FIXME - should we do more to handle error? */
> +        }
> +        startl = rx->pmatch[0].rm_so;
> +        oldl = rx->pmatch[0].rm_eo - startl;
> +        newl = strlen(*newpattern);
> +        diffsz = newl - oldl;
> +        remainder = pattern + startl + oldl;
> +        if (rx->flags & AP_REG_MULTI) {
> +            /* recurse to do any further matches */
> +            char *subs;
> +            ret += ap_rxplus_exec(pool, rx, remainder, &subs);
> +            if (ret > 1) {
> +                /* a further substitution happened */
> +                diffsz += strlen(subs) - strlen(remainder);
> +                remainder = subs;
> +            }
> +        }
> +        subs  = apr_palloc(pool, strlen(pattern) + 1 + diffsz);
> +        memcpy(subs, pattern, startl);
> +        memcpy(subs+startl, *newpattern, newl);
> +        strcpy(subs+startl+newl, remainder);
> +        *newpattern = subs;
> +    }
> +    return ret;
> +        
> +
> +
> +
> +#else
> +
> +
> +
> +
> +
> +
> +
> +
> +
> +
> +
> +
> +    if (!(rx->flags & AP_REG_MULTI) || (rx->subs == NULL)) {
> +        max_iterations = 1;
> +    }
> +    /* FIXME: multi-matching is incorrect */
> +    while (max_iterations-- > 0) {
> +        if (ap_regexec(&rx->rx, pattern, rx->nmatch, rx->pmatch, rx->flags)
> +            == 0) {
> +            ret++;
> +            if (rx->subs) {
> +                rx->match = pattern;
> +                *newpattern = ap_pregsub(pool, rx->subs, pattern,
> +                                         rx->nmatch, rx->pmatch);
> +                pattern = *newpattern;
> +                if (pattern == NULL) {
> +                    max_iterations = 0;
> +                }
> +            }
> +        }
> +        else {
> +            max_iterations = 0;
> +        }
> +    }
> +
> +    if (ret == 0 || rx->flags&AP_REG_NOMEM) {
> +        rx->match = NULL;  /* no match, so don't pretend to remember a match */
> +    }
> +    else {
> +#if 0
> +        /* FIXME - should we be 'safe' and take the performance hit,
> +         * or just document thou-shalt-keep-pattern-in-scope?
> +         */
> +        if (rx->match == inpattern) {
> +            rx->match = apr_pstrdup(pool, inpattern);
> +        }
> +#endif
> +    }
> +    return ret;
> +#endif

Why do you commit dead code that is never used?

Regards

Rüdiger