You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucy.apache.org by nw...@apache.org on 2013/03/02 21:06:35 UTC

[lucy-commits] [15/15] git commit: refs/heads/c-bindings-wip2 - Implement POSIX RegexTokenizer

Updated Branches:
  refs/heads/c-bindings-wip2 [created] 24d06ccd8


Implement POSIX RegexTokenizer


Project: http://git-wip-us.apache.org/repos/asf/lucy/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/24d06ccd
Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/24d06ccd
Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/24d06ccd

Branch: refs/heads/c-bindings-wip2
Commit: 24d06ccd88d6d093dbf54638cbc8b9e81f770d34
Parents: 0fc39d5
Author: Nick Wellnhofer <we...@aevum.de>
Authored: Sat Mar 2 21:03:52 2013 +0100
Committer: Nick Wellnhofer <we...@aevum.de>
Committed: Sat Mar 2 21:03:52 2013 +0100

----------------------------------------------------------------------
 c/src/Lucy/Analysis/RegexTokenizer.c |  114 ++++++++++++++++++++++++++++-
 1 files changed, 110 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucy/blob/24d06ccd/c/src/Lucy/Analysis/RegexTokenizer.c
----------------------------------------------------------------------
diff --git a/c/src/Lucy/Analysis/RegexTokenizer.c b/c/src/Lucy/Analysis/RegexTokenizer.c
index 2f21afb..46b0fe4 100644
--- a/c/src/Lucy/Analysis/RegexTokenizer.c
+++ b/c/src/Lucy/Analysis/RegexTokenizer.c
@@ -17,15 +17,62 @@
 #define C_LUCY_REGEXTOKENIZER
 
 #include "CFBind.h"
+
+#include <string.h>
+
+#if defined(CHY_HAS_REGEX_H)
+  #include <regex.h>
+#elif defined(CHY_HAS_PCREPOSIX_H)
+  #include <pcreposix.h>
+#else
+  #error No regex headers found.
+#endif
+
+#include "Clownfish/Util/Memory.h"
+#include "Clownfish/Util/StringHelper.h"
 #include "Lucy/Analysis/RegexTokenizer.h"
 #include "Lucy/Analysis/Token.h"
 #include "Lucy/Analysis/Inversion.h"
 
+static uint32_t
+S_count_code_points(const char *string, size_t len);
+
 lucy_RegexTokenizer*
 lucy_RegexTokenizer_init(lucy_RegexTokenizer *self,
                          const lucy_CharBuf *pattern) {
-    THROW(LUCY_ERR, "TODO");
-    UNREACHABLE_RETURN(lucy_RegexTokenizer*);
+    lucy_Analyzer_init((lucy_Analyzer*)self);
+
+    const char *pattern_ptr;
+    if (pattern) {
+        self->pattern = Lucy_CB_Clone(pattern);
+        pattern_ptr = (char*)Lucy_CB_Get_Ptr8(self->pattern);
+    }
+    else {
+        pattern_ptr = "[[:alnum:]]+('[[:alnum:]]+)*";
+        self->pattern
+            = lucy_CB_new_from_trusted_utf8(pattern_ptr, strlen(pattern_ptr));
+    }
+
+    // TODO: Make sure that we use a UTF-8 locale.
+
+    int flags = REG_EXTENDED;
+#ifdef CHY_HAS_REG_ENHANCED
+    flags |= REG_ENHANCED;
+#endif
+    regex_t *re = LUCY_MALLOCATE(sizeof(regex_t));
+    int errcode = regcomp(re, pattern_ptr, flags);
+    if (errcode) {
+        size_t errbuf_size = regerror(errcode, re, NULL, 0);
+        char *errbuf = (char*)LUCY_MALLOCATE(errbuf_size);
+        regerror(errcode, re, errbuf, errbuf_size);
+        regfree(re);
+        LUCY_FREEMEM(re);
+        THROW(LUCY_ERR, "%s", errbuf);
+    }
+
+    self->token_re = re;
+
+    return self;
 }
 
 void
@@ -35,14 +82,73 @@ lucy_RegexTokenizer_set_token_re(lucy_RegexTokenizer *self, void *token_re) {
 
 void
 lucy_RegexTokenizer_destroy(lucy_RegexTokenizer *self) {
-    THROW(LUCY_ERR, "TODO");
+    CFISH_DECREF(self->pattern);
+    regex_t *re = (regex_t*)self->token_re;
+    if (re) {
+        regfree(re);
+        LUCY_FREEMEM(re);
+    }
+    LUCY_SUPER_DESTROY(self, LUCY_REGEXTOKENIZER);
 }
 
 void
 lucy_RegexTokenizer_tokenize_str(lucy_RegexTokenizer *self,
                                  const char *string, size_t string_len,
                                  lucy_Inversion *inversion) {
-    THROW(LUCY_ERR, "TODO");
+    /* Null-terminate string. This could be avoided by using the non-standard
+     * REG_STARTEND flag.
+     */
+    char *c_string = (char*)LUCY_MALLOCATE(string_len + 1);
+    memcpy(c_string, string, string_len);
+    c_string[string_len] = '\0';
+
+    regex_t    *re  = (regex_t*)self->token_re;
+    const char *ptr = c_string;
+    uint32_t    off = 0; // Code points
+    regmatch_t match;
+
+    int errcode = regexec(re, ptr, 1, &match, 0);
+    while (errcode == 0) {
+        const char *match_start = ptr + match.rm_so;
+        size_t      match_len   = match.rm_eo - match.rm_so;
+        uint32_t    start_off   = off + S_count_code_points(ptr, match.rm_so);
+        uint32_t    end_off     = start_off + S_count_code_points(match_start,
+                                                                  match_len); 
+
+        // Add a token to the new inversion.
+        lucy_Token *token = lucy_Token_new(match_start, match_len, start_off,
+                                           end_off, 1.0f, 1);
+        Lucy_Inversion_Append(inversion, token);
+
+        ptr += match.rm_eo;
+        off  = end_off;
+        errcode = regexec(re, ptr, 1, &match, REG_NOTBOL);
+    }
+
+    if (errcode != REG_NOMATCH) {
+        size_t errbuf_size = regerror(errcode, re, NULL, 0);
+        char *errbuf = (char*)LUCY_MALLOCATE(errbuf_size);
+        regerror(errcode, re, errbuf, errbuf_size);
+        THROW(LUCY_ERR, "%s", errbuf);
+    }
+
+    LUCY_FREEMEM(c_string);
 }
 
+static uint32_t
+S_count_code_points(const char *string, size_t len) {
+    uint32_t num_code_points = 0;
+    size_t i = 0;
+
+    while (i < len) {
+        i += lucy_StrHelp_UTF8_COUNT[(uint8_t)(string[i])];
+        ++num_code_points;
+    }
+
+    if (i != len) {
+        THROW(LUCY_ERR, "Match between code point boundaries in '%s'", string);
+    }
+
+    return num_code_points;
+}
 


Re: [lucy-dev] Re: git commit: refs/heads/c-bindings-wip2 - Implement POSIX RegexTokenizer

Posted by Kurt Starsinic <ks...@gmail.com>.
FYI, Perl's support is moving along pretty quickly:

http://perl5.git.perl.org/perl.git/blob/HEAD:/pod/perlunicode.pod#l969

- Kurt


On Tue, Mar 5, 2013 at 8:19 AM, Nick Wellnhofer <we...@aevum.de> wrote:

> On 05/03/2013 05:05, Marvin Humphrey wrote:
>
>> On Sat, Mar 2, 2013 at 12:06 PM,  <nw...@apache.org> wrote:
>>
>> We may want to consider allowing builds without a fully functional
>> RegexTokenizer in the future.  At some point, we'll publish a public API
>> for
>> extending Analyzer from C, and it's not hard to imagine people creating
>> their
>> own tokenizer for a dedicated app which doesn't need RegexTokenizer.
>>
>
> Yes, we could make RegexTokenizer optional. I don't see a problem with
> that.
>
>
>  +    // TODO: Make sure that we use a UTF-8 locale.
>>>
>>
>> PCRE has a UTF-8 mode, if I recall correctly.  Would things be easier if
>> we
>> make PCRE a mandatory prerequisite for a functioning RegexTokenizer?
>>
>
> I implemented the POSIX RegexTokenizer because it was very easy to do.
> PCRE is next on my list. Maybe we should support multiple regex flavors:
>
>     RegexTokenizer_new(CharBuf *pattern, CharBuf *flavor)
>
> That might be useful for other host languages, too. But for
> interoperability between host languages, it would be better to have a
> single, universally supported syntax.
>
>
>  I'm not totally up to speed on the standards, but it seems to me that it
>> would
>> be better to prefer Unicode regular expressions over POSIX, if we have to
>> choose.
>>
>>      http://www.unicode.org/reports/tr18/
>>
>
> Unicode TR18 doesn't specify a particular regex syntax. It only says how a
> regex engine should behave with regard to Unicode.
>
> POSIX regexes should work with UTF-8 strings when using a UTF-8 locale.
> Other than that, they probably don't support much of TR18. Also note that
> even Perl's support for TR18 isn't complete:
>
> http://perldoc.perl.org/perlunicode.html#Unicode-Regular-Expression-Support-Level
>
> But most other regex engines aside from ICU are a lot worse, AFAIK.
>
> Nick
>
>

Re: [lucy-dev] git commit: refs/heads/c-bindings-wip2 - Implement POSIX RegexTokenizer

Posted by Marvin Humphrey <ma...@rectangular.com>.
On Mon, Mar 11, 2013 at 4:31 PM, Nick Wellnhofer <we...@aevum.de> wrote:

> We could simply include the type of the regex engine and even the particular
> version in the serialization and equality test of RegexTokenizer. This
> should safely guard against using different engines for indexing.
>
> Whether regex engines are selected at compile time, by choosing a dedicated
> class, or by a constructor argument shouldn't make a difference then. I'd
> prefer the latter approach.

Having the equality test fail leads to a harsh consequence, though -- when the
regex engine changes (e.g. because you upgraded the host language) all your
apps start throwing exceptions as soon as they try to open the index.  And
it's quite possible that the changes in the regex behavior don't even affect
your app or cause only minor degradation.

This problem of degraded recall is the same one we face with all Analyzer
behavior changes.  The best solution for many users is to live with a window
of inferior search results while refreshing the index after an upgrade.

If we introduce extra fields specifying the engine and possibly the version,
how about leaving them undefined by default, falling back to whatever is
available?  It's a little strange to have an opt-in which ties your hands on
upgrade, though...

Fortunately, with StandardTokenizer and EasyAnalyzer moving to the forefront
in all our sample code, we can assume that fewer people use RegexTokenizer
these days and all this matters less than it once did. :)

Marvin Humphrey

Re: [lucy-dev] git commit: refs/heads/c-bindings-wip2 - Implement POSIX RegexTokenizer

Posted by Nick Wellnhofer <we...@aevum.de>.
On Mar 11, 2013, at 23:21 , Marvin Humphrey <ma...@rectangular.com> wrote:

> What might theoretically be useful is specifying a regex engine for the sake
> of index portability across hosts -- for example, specifying that a Perl build
> of Lucy use PCRE instead of Perl's regex engine.  There are a couple of ways
> we could do that.
> 
> One option would be to offer a compile-time configuration option for
> RegexTokenizer.  However, incompatible configurations would fail silently,
> producing subtly different results under the inappropriate engine rather than
> bombing out.
> 
> A more reliable technique would be to provide dedicated classes such as
> "PCRETokenizer" which are associated with specific regex engines.  However,
> such an approach has notable cost because the regex engine code would need to
> be bundled to protect against incompatibilities across regex engine versions.

We could simply include the type of the regex engine and even the particular version in the serialization and equality test of RegexTokenizer. This should safely guard against using different engines for indexing.

Whether regex engines are selected at compile time, by choosing a dedicated class, or by a constructor argument shouldn't make a difference then. I'd prefer the latter approach.

Nick


Re: [lucy-dev] Re: git commit: refs/heads/c-bindings-wip2 - Implement POSIX RegexTokenizer

Posted by Marvin Humphrey <ma...@rectangular.com>.
On Tue, Mar 5, 2013 at 5:19 AM, Nick Wellnhofer <we...@aevum.de> wrote:
> I implemented the POSIX RegexTokenizer because it was very easy to do. PCRE
> is next on my list. Maybe we should support multiple regex flavors:
>
>     RegexTokenizer_new(CharBuf *pattern, CharBuf *flavor)

In the context of tokenizing, regular expression engines are more notable
for producing subtly incompatible results than for offering substantially
different functionality.  I don't think that offering different regular
expression flavors at runtime lets our users accomplish much as far as
tokenization that they couldn't accomplish otherwise.

What might theoretically be useful is specifying a regex engine for the sake
of index portability across hosts -- for example, specifying that a Perl build
of Lucy use PCRE instead of Perl's regex engine.  There are a couple of ways
we could do that.

One option would be to offer a compile-time configuration option for
RegexTokenizer.  However, incompatible configurations would fail silently,
producing subtly different results under the inappropriate engine rather than
bombing out.

A more reliable technique would be to provide dedicated classes such as
"PCRETokenizer" which are associated with specific regex engines.  However,
such an approach has notable cost because the regex engine code would need to
be bundled to protect against incompatibilities across regex engine versions.

> That might be useful for other host languages, too. But for interoperability
> between host languages, it would be better to have a single, universally
> supported syntax.

The only way we're going to get reliable interoperability across host
languages for tokenization based on regular expressions is to bundle a regex
engine such as PCRE with every Lucy build.  Which doesn't seem worthwhile at
this time.

Even between different versions of the host language, interop will be
imperfect -- index compatibility will be compromised on host language upgrades
as the host language devs fix bugs, add features, update to the latest and
greatest version of Unicode, etc..

But that's fine.  That's what RegexTokenizer is now -- a wrapper around the
host's regex facilities which allows users to leverage their existing
expertise.  We prize host integration, and RegexTokenizer is an expression of
that.  People just need to be aware that they may need to regenerate their
indexes when the regex engine behavior changes.

Marvin Humphrey

[lucy-dev] Re: git commit: refs/heads/c-bindings-wip2 - Implement POSIX RegexTokenizer

Posted by Nick Wellnhofer <we...@aevum.de>.
On 05/03/2013 05:05, Marvin Humphrey wrote:
> On Sat, Mar 2, 2013 at 12:06 PM,  <nw...@apache.org> wrote:
>
> We may want to consider allowing builds without a fully functional
> RegexTokenizer in the future.  At some point, we'll publish a public API for
> extending Analyzer from C, and it's not hard to imagine people creating their
> own tokenizer for a dedicated app which doesn't need RegexTokenizer.

Yes, we could make RegexTokenizer optional. I don't see a problem with that.

>> +    // TODO: Make sure that we use a UTF-8 locale.
>
> PCRE has a UTF-8 mode, if I recall correctly.  Would things be easier if we
> make PCRE a mandatory prerequisite for a functioning RegexTokenizer?

I implemented the POSIX RegexTokenizer because it was very easy to do. 
PCRE is next on my list. Maybe we should support multiple regex flavors:

     RegexTokenizer_new(CharBuf *pattern, CharBuf *flavor)

That might be useful for other host languages, too. But for 
interoperability between host languages, it would be better to have a 
single, universally supported syntax.

> I'm not totally up to speed on the standards, but it seems to me that it would
> be better to prefer Unicode regular expressions over POSIX, if we have to
> choose.
>
>      http://www.unicode.org/reports/tr18/

Unicode TR18 doesn't specify a particular regex syntax. It only says how 
a regex engine should behave with regard to Unicode.

POSIX regexes should work with UTF-8 strings when using a UTF-8 locale. 
Other than that, they probably don't support much of TR18. Also note 
that even Perl's support for TR18 isn't complete:

http://perldoc.perl.org/perlunicode.html#Unicode-Regular-Expression-Support-Level

But most other regex engines aside from ICU are a lot worse, AFAIK.

Nick


[lucy-dev] Re: [lucy-commits] [15/15] git commit: refs/heads/c-bindings-wip2 - Implement POSIX RegexTokenizer

Posted by Marvin Humphrey <ma...@rectangular.com>.
On Sat, Mar 2, 2013 at 12:06 PM,  <nw...@apache.org> wrote:

> Implement POSIX RegexTokenizer

> Project: http://git-wip-us.apache.org/repos/asf/lucy/repo
> Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/24d06ccd
> Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/24d06ccd
> Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/24d06ccd

> +#if defined(CHY_HAS_REGEX_H)
> +  #include <regex.h>
> +#elif defined(CHY_HAS_PCREPOSIX_H)
> +  #include <pcreposix.h>
> +#else
> +  #error No regex headers found.
> +#endif

We may want to consider allowing builds without a fully functional
RegexTokenizer in the future.  At some point, we'll publish a public API for
extending Analyzer from C, and it's not hard to imagine people creating their
own tokenizer for a dedicated app which doesn't need RegexTokenizer.

> +    // TODO: Make sure that we use a UTF-8 locale.

PCRE has a UTF-8 mode, if I recall correctly.  Would things be easier if we
make PCRE a mandatory prerequisite for a functioning RegexTokenizer?

I'm not totally up to speed on the standards, but it seems to me that it would
be better to prefer Unicode regular expressions over POSIX, if we have to
choose.

    http://www.unicode.org/reports/tr18/

Marvin Humphrey