You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucy.apache.org by ma...@apache.org on 2015/08/03 23:59:17 UTC

[4/9] lucy git commit: Port RegexTokenizer stubs to CGO.

Port RegexTokenizer stubs to CGO.


Project: http://git-wip-us.apache.org/repos/asf/lucy/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/44fc440f
Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/44fc440f
Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/44fc440f

Branch: refs/heads/master
Commit: 44fc440fdc419b655fb4c482afb63b9020138011
Parents: dab9a88
Author: Marvin Humphrey <ma...@rectangular.com>
Authored: Sun Jul 19 12:57:13 2015 -0700
Committer: Marvin Humphrey <ma...@rectangular.com>
Committed: Fri Jul 31 17:39:28 2015 -0700

----------------------------------------------------------------------
 go/cfext/lucy.c | 157 +++------------------------------------------------
 go/lucy/lucy.go |  48 ++++++++++++++++
 2 files changed, 56 insertions(+), 149 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucy/blob/44fc440f/go/cfext/lucy.c
----------------------------------------------------------------------
diff --git a/go/cfext/lucy.c b/go/cfext/lucy.c
index d1044df..5773f16 100644
--- a/go/cfext/lucy.c
+++ b/go/cfext/lucy.c
@@ -55,175 +55,34 @@
 #include "Lucy/Store/OutStream.h"
 #include "Lucy/Util/Freezer.h"
 
-#if defined(CHY_HAS_PCRE_H)
-
-#include <pcre.h>
-
-static uint32_t
-S_count_code_points(const char *string, size_t len);
-
 bool
 RegexTokenizer_is_available(void) {
-    return true;
+    return false;
 }
 
 RegexTokenizer*
-RegexTokenizer_init(RegexTokenizer *self, String *pattern) {
-    Analyzer_init((Analyzer*)self);
-    RegexTokenizerIVARS *const ivars = RegexTokenizer_IVARS(self);
-
-    char *pattern_buf = NULL;
-    const char *pattern_ptr;
-    if (pattern) {
-        ivars->pattern = Str_Clone(pattern);
-        pattern_buf = Str_To_Utf8(ivars->pattern);
-        pattern_ptr = pattern_buf;
-    }
-    else {
-        pattern_ptr = "\\w+(?:['\\x{2019}]\\w+)*";
-        ivars->pattern
-            = Str_new_from_trusted_utf8(pattern_ptr, strlen(pattern_ptr));
-    }
-
-    int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
-#ifdef PCRE_BSR_UNICODE
-    // Available since PCRE 7.4
-    options |= PCRE_BSR_UNICODE;
-#endif
-#ifdef PCRE_NEWLINE_LF
-    // Available since PCRE 6.7
-    options |= PCRE_NEWLINE_LF;
-#endif
-    const char *err_ptr;
-    int err_offset;
-    pcre *re = pcre_compile(pattern_ptr, options, &err_ptr, &err_offset, NULL);
-    if (pattern_buf) {
-        FREEMEM(pattern_buf);
-    }
-    if (!re) {
-        THROW(ERR, "%s", err_ptr);
-    }
-
-    // TODO: Check whether pcre_study improves performance
-
-    ivars->token_re = re;
-
-    return self;
-}
-
-void
-RegexTokenizer_Destroy_IMP(RegexTokenizer *self) {
-    RegexTokenizerIVARS *const ivars = RegexTokenizer_IVARS(self);
-    DECREF(ivars->pattern);
-    pcre *re = (pcre*)ivars->token_re;
-    if (re) {
-        pcre_free(re);
-    }
-    SUPER_DESTROY(self, REGEXTOKENIZER);
-}
-
-void
-RegexTokenizer_Tokenize_Utf8_IMP(RegexTokenizer *self, const char *string,
-                                 size_t string_len, Inversion *inversion) {
-    RegexTokenizerIVARS *const ivars = RegexTokenizer_IVARS(self);
-    pcre      *re          = (pcre*)ivars->token_re;
-    int        byte_offset = 0;
-    uint32_t   cp_offset   = 0; // Code points
-    int        options     = PCRE_NO_UTF8_CHECK;
-    int        ovector[3];
-
-    int return_code = pcre_exec(re, NULL, string, string_len, byte_offset,
-                                options, ovector, 3);
-    while (return_code >= 0) {
-        const char *match     = string + ovector[0];
-        size_t      match_len = ovector[1] - ovector[0];
-
-        uint32_t cp_before  = S_count_code_points(string + byte_offset,
-                                                  ovector[0] - byte_offset);
-        uint32_t cp_start   = cp_offset + cp_before;
-        uint32_t cp_matched = S_count_code_points(match, match_len);
-        uint32_t cp_end     = cp_start + cp_matched;
-
-        // Add a token to the new inversion.
-        Token *token = Token_new(match, match_len, cp_start, cp_end, 1.0f, 1);
-        Inversion_Append(inversion, token);
-
-        byte_offset = ovector[1];
-        cp_offset   = cp_end;
-        return_code = pcre_exec(re, NULL, string, string_len, byte_offset,
-                                options, ovector, 3);
-    }
-
-    if (return_code != PCRE_ERROR_NOMATCH) {
-        THROW(ERR, "pcre_exec failed: %d", return_code);
-    }
-}
-
-static uint32_t
-S_count_code_points(const char *string, size_t len) {
-    uint32_t num_code_points = 0;
-    size_t i = 0;
-
-    while (i < len) {
-        i += StrHelp_UTF8_COUNT[(uint8_t)(string[i])];
-        ++num_code_points;
-    }
-
-    if (i != len) {
-        THROW(ERR, "Match between code point boundaries in '%s'", string);
-    }
-
-    return num_code_points;
-}
-
-#else // CHY_HAS_PCRE_H
-
-bool
-RegexTokenizer_is_available(void) {
-    return false;
-}
+(*GOLUCY_RegexTokenizer_init_BRIDGE)(RegexTokenizer *self, String *pattern);
 
 RegexTokenizer*
 RegexTokenizer_init(RegexTokenizer *self, String *pattern) {
-    UNUSED_VAR(self);
-    UNUSED_VAR(pattern);
-    THROW(ERR,
-          "RegexTokenizer is not available because Lucy was compiled"
-          " without PCRE.");
-    UNREACHABLE_RETURN(RegexTokenizer*);
+    return GOLUCY_RegexTokenizer_init_BRIDGE(self, pattern);
 }
 
-void
-RegexTokenizer_Set_Token_RE_IMP(RegexTokenizer *self, void *token_re) {
-    UNUSED_VAR(self);
-    UNUSED_VAR(token_re);
-    THROW(ERR,
-          "RegexTokenizer is not available because Lucy was compiled"
-          " without PCRE.");
-}
+RegexTokenizer_Destroy_t GOLUCY_RegexTokenizer_Destroy_BRIDGE;
 
 void
 RegexTokenizer_Destroy_IMP(RegexTokenizer *self) {
-    UNUSED_VAR(self);
-    THROW(ERR,
-          "RegexTokenizer is not available because Lucy was compiled"
-          " without PCRE.");
+    GOLUCY_RegexTokenizer_Destroy_BRIDGE(self);
 }
 
+RegexTokenizer_Tokenize_Utf8_t GOLUCY_RegexTokenizer_Tokenize_Utf8_BRIDGE;
+
 void
 RegexTokenizer_Tokenize_Utf8_IMP(RegexTokenizer *self, const char *string,
                                  size_t string_len, Inversion *inversion) {
-    UNUSED_VAR(self);
-    UNUSED_VAR(string);
-    UNUSED_VAR(string_len);
-    UNUSED_VAR(inversion);
-    THROW(ERR,
-          "RegexTokenizer is not available because Lucy was compiled"
-          " without PCRE.");
+    GOLUCY_RegexTokenizer_Tokenize_Utf8_BRIDGE(self, string, string_len, inversion);
 }
 
-#endif // CHY_HAS_PCRE_H
-
 /********************************** Doc ********************************/
 
 Doc*

http://git-wip-us.apache.org/repos/asf/lucy/blob/44fc440f/go/lucy/lucy.go
----------------------------------------------------------------------
diff --git a/go/lucy/lucy.go b/go/lucy/lucy.go
index 908599a..13bdafa 100644
--- a/go/lucy/lucy.go
+++ b/go/lucy/lucy.go
@@ -17,11 +17,59 @@
 package lucy
 
 /*
+#define C_LUCY_REGEXTOKENIZER
+
 #include "lucy_parcel.h"
+#include "Lucy/Analysis/RegexTokenizer.h"
+
+extern lucy_RegexTokenizer*
+GOLUCY_RegexTokenizer_init(lucy_RegexTokenizer *self, cfish_String *pattern);
+extern lucy_RegexTokenizer*
+(*GOLUCY_RegexTokenizer_init_BRIDGE)(lucy_RegexTokenizer *self,
+									 cfish_String *pattern);
+extern void
+GOLUCY_RegexTokenizer_Destroy(lucy_RegexTokenizer *self);
+extern void
+(*GOLUCY_RegexTokenizer_Destroy_BRIDGE)(lucy_RegexTokenizer *self);
+extern void
+GOLUCY_RegexTokenizer_Tokenize_Utf8(lucy_RegexTokenizer *self, char *str,
+									size_t string_len, lucy_Inversion *inversion);
+extern void
+(*GOLUCY_RegexTokenizer_Tokenize_Utf8_BRIDGE)(lucy_RegexTokenizer *self, const char *str,
+											  size_t string_len, lucy_Inversion *inversion);
+
+
+// C symbols linked into a Go-built package archive are not visible to
+// external C code -- but internal code *can* see symbols from outside.
+// This allows us to fake up symbol export by assigning values only known
+// internally to external symbols during Go package initialization.
+static CFISH_INLINE void
+GOLUCY_glue_exported_symbols() {
+	GOLUCY_RegexTokenizer_init_BRIDGE = GOLUCY_RegexTokenizer_init;
+	GOLUCY_RegexTokenizer_Destroy_BRIDGE = GOLUCY_RegexTokenizer_Destroy;
+	GOLUCY_RegexTokenizer_Tokenize_Utf8_BRIDGE
+		= (LUCY_RegexTokenizer_Tokenize_Utf8_t)GOLUCY_RegexTokenizer_Tokenize_Utf8;
+}
+
 */
 import "C"
 import _ "git-wip-us.apache.org/repos/asf/lucy-clownfish.git/runtime/go/clownfish"
 
 func init() {
+	C.GOLUCY_glue_exported_symbols()
 	C.lucy_bootstrap_parcel()
 }
+
+//export GOLUCY_RegexTokenizer_init
+func GOLUCY_RegexTokenizer_init(rt *C.lucy_RegexTokenizer, pattern *C.cfish_String) *C.lucy_RegexTokenizer {
+	return nil
+}
+
+//export GOLUCY_RegexTokenizer_Destroy
+func GOLUCY_RegexTokenizer_Destroy(rt *C.lucy_RegexTokenizer) {
+}
+
+//export GOLUCY_RegexTokenizer_Tokenize_Utf8
+func GOLUCY_RegexTokenizer_Tokenize_Utf8(rt *C.lucy_RegexTokenizer, str *C.char,
+	stringLen C.size_t, inversion *C.lucy_Inversion) {
+}