You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucy.apache.org by ma...@apache.org on 2015/08/03 23:59:17 UTC
[4/9] lucy git commit: Port RegexTokenizer stubs to CGO.
Port RegexTokenizer stubs to CGO.
Project: http://git-wip-us.apache.org/repos/asf/lucy/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/44fc440f
Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/44fc440f
Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/44fc440f
Branch: refs/heads/master
Commit: 44fc440fdc419b655fb4c482afb63b9020138011
Parents: dab9a88
Author: Marvin Humphrey <ma...@rectangular.com>
Authored: Sun Jul 19 12:57:13 2015 -0700
Committer: Marvin Humphrey <ma...@rectangular.com>
Committed: Fri Jul 31 17:39:28 2015 -0700
----------------------------------------------------------------------
go/cfext/lucy.c | 157 +++------------------------------------------------
go/lucy/lucy.go | 48 ++++++++++++++++
2 files changed, 56 insertions(+), 149 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucy/blob/44fc440f/go/cfext/lucy.c
----------------------------------------------------------------------
diff --git a/go/cfext/lucy.c b/go/cfext/lucy.c
index d1044df..5773f16 100644
--- a/go/cfext/lucy.c
+++ b/go/cfext/lucy.c
@@ -55,175 +55,34 @@
#include "Lucy/Store/OutStream.h"
#include "Lucy/Util/Freezer.h"
-#if defined(CHY_HAS_PCRE_H)
-
-#include <pcre.h>
-
-static uint32_t
-S_count_code_points(const char *string, size_t len);
-
bool
RegexTokenizer_is_available(void) {
- return true;
+ return false;
}
RegexTokenizer*
-RegexTokenizer_init(RegexTokenizer *self, String *pattern) {
- Analyzer_init((Analyzer*)self);
- RegexTokenizerIVARS *const ivars = RegexTokenizer_IVARS(self);
-
- char *pattern_buf = NULL;
- const char *pattern_ptr;
- if (pattern) {
- ivars->pattern = Str_Clone(pattern);
- pattern_buf = Str_To_Utf8(ivars->pattern);
- pattern_ptr = pattern_buf;
- }
- else {
- pattern_ptr = "\\w+(?:['\\x{2019}]\\w+)*";
- ivars->pattern
- = Str_new_from_trusted_utf8(pattern_ptr, strlen(pattern_ptr));
- }
-
- int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
-#ifdef PCRE_BSR_UNICODE
- // Available since PCRE 7.4
- options |= PCRE_BSR_UNICODE;
-#endif
-#ifdef PCRE_NEWLINE_LF
- // Available since PCRE 6.7
- options |= PCRE_NEWLINE_LF;
-#endif
- const char *err_ptr;
- int err_offset;
- pcre *re = pcre_compile(pattern_ptr, options, &err_ptr, &err_offset, NULL);
- if (pattern_buf) {
- FREEMEM(pattern_buf);
- }
- if (!re) {
- THROW(ERR, "%s", err_ptr);
- }
-
- // TODO: Check whether pcre_study improves performance
-
- ivars->token_re = re;
-
- return self;
-}
-
-void
-RegexTokenizer_Destroy_IMP(RegexTokenizer *self) {
- RegexTokenizerIVARS *const ivars = RegexTokenizer_IVARS(self);
- DECREF(ivars->pattern);
- pcre *re = (pcre*)ivars->token_re;
- if (re) {
- pcre_free(re);
- }
- SUPER_DESTROY(self, REGEXTOKENIZER);
-}
-
-void
-RegexTokenizer_Tokenize_Utf8_IMP(RegexTokenizer *self, const char *string,
- size_t string_len, Inversion *inversion) {
- RegexTokenizerIVARS *const ivars = RegexTokenizer_IVARS(self);
- pcre *re = (pcre*)ivars->token_re;
- int byte_offset = 0;
- uint32_t cp_offset = 0; // Code points
- int options = PCRE_NO_UTF8_CHECK;
- int ovector[3];
-
- int return_code = pcre_exec(re, NULL, string, string_len, byte_offset,
- options, ovector, 3);
- while (return_code >= 0) {
- const char *match = string + ovector[0];
- size_t match_len = ovector[1] - ovector[0];
-
- uint32_t cp_before = S_count_code_points(string + byte_offset,
- ovector[0] - byte_offset);
- uint32_t cp_start = cp_offset + cp_before;
- uint32_t cp_matched = S_count_code_points(match, match_len);
- uint32_t cp_end = cp_start + cp_matched;
-
- // Add a token to the new inversion.
- Token *token = Token_new(match, match_len, cp_start, cp_end, 1.0f, 1);
- Inversion_Append(inversion, token);
-
- byte_offset = ovector[1];
- cp_offset = cp_end;
- return_code = pcre_exec(re, NULL, string, string_len, byte_offset,
- options, ovector, 3);
- }
-
- if (return_code != PCRE_ERROR_NOMATCH) {
- THROW(ERR, "pcre_exec failed: %d", return_code);
- }
-}
-
-static uint32_t
-S_count_code_points(const char *string, size_t len) {
- uint32_t num_code_points = 0;
- size_t i = 0;
-
- while (i < len) {
- i += StrHelp_UTF8_COUNT[(uint8_t)(string[i])];
- ++num_code_points;
- }
-
- if (i != len) {
- THROW(ERR, "Match between code point boundaries in '%s'", string);
- }
-
- return num_code_points;
-}
-
-#else // CHY_HAS_PCRE_H
-
-bool
-RegexTokenizer_is_available(void) {
- return false;
-}
+(*GOLUCY_RegexTokenizer_init_BRIDGE)(RegexTokenizer *self, String *pattern);
RegexTokenizer*
RegexTokenizer_init(RegexTokenizer *self, String *pattern) {
- UNUSED_VAR(self);
- UNUSED_VAR(pattern);
- THROW(ERR,
- "RegexTokenizer is not available because Lucy was compiled"
- " without PCRE.");
- UNREACHABLE_RETURN(RegexTokenizer*);
+ return GOLUCY_RegexTokenizer_init_BRIDGE(self, pattern);
}
-void
-RegexTokenizer_Set_Token_RE_IMP(RegexTokenizer *self, void *token_re) {
- UNUSED_VAR(self);
- UNUSED_VAR(token_re);
- THROW(ERR,
- "RegexTokenizer is not available because Lucy was compiled"
- " without PCRE.");
-}
+RegexTokenizer_Destroy_t GOLUCY_RegexTokenizer_Destroy_BRIDGE;
void
RegexTokenizer_Destroy_IMP(RegexTokenizer *self) {
- UNUSED_VAR(self);
- THROW(ERR,
- "RegexTokenizer is not available because Lucy was compiled"
- " without PCRE.");
+ GOLUCY_RegexTokenizer_Destroy_BRIDGE(self);
}
+RegexTokenizer_Tokenize_Utf8_t GOLUCY_RegexTokenizer_Tokenize_Utf8_BRIDGE;
+
void
RegexTokenizer_Tokenize_Utf8_IMP(RegexTokenizer *self, const char *string,
size_t string_len, Inversion *inversion) {
- UNUSED_VAR(self);
- UNUSED_VAR(string);
- UNUSED_VAR(string_len);
- UNUSED_VAR(inversion);
- THROW(ERR,
- "RegexTokenizer is not available because Lucy was compiled"
- " without PCRE.");
+ GOLUCY_RegexTokenizer_Tokenize_Utf8_BRIDGE(self, string, string_len, inversion);
}
-#endif // CHY_HAS_PCRE_H
-
/********************************** Doc ********************************/
Doc*
http://git-wip-us.apache.org/repos/asf/lucy/blob/44fc440f/go/lucy/lucy.go
----------------------------------------------------------------------
diff --git a/go/lucy/lucy.go b/go/lucy/lucy.go
index 908599a..13bdafa 100644
--- a/go/lucy/lucy.go
+++ b/go/lucy/lucy.go
@@ -17,11 +17,59 @@
package lucy
/*
+#define C_LUCY_REGEXTOKENIZER
+
#include "lucy_parcel.h"
+#include "Lucy/Analysis/RegexTokenizer.h"
+
+extern lucy_RegexTokenizer*
+GOLUCY_RegexTokenizer_init(lucy_RegexTokenizer *self, cfish_String *pattern);
+extern lucy_RegexTokenizer*
+(*GOLUCY_RegexTokenizer_init_BRIDGE)(lucy_RegexTokenizer *self,
+ cfish_String *pattern);
+extern void
+GOLUCY_RegexTokenizer_Destroy(lucy_RegexTokenizer *self);
+extern void
+(*GOLUCY_RegexTokenizer_Destroy_BRIDGE)(lucy_RegexTokenizer *self);
+extern void
+GOLUCY_RegexTokenizer_Tokenize_Utf8(lucy_RegexTokenizer *self, char *str,
+ size_t string_len, lucy_Inversion *inversion);
+extern void
+(*GOLUCY_RegexTokenizer_Tokenize_Utf8_BRIDGE)(lucy_RegexTokenizer *self, const char *str,
+ size_t string_len, lucy_Inversion *inversion);
+
+
+// C symbols linked into a Go-built package archive are not visible to
+// external C code -- but internal code *can* see symbols from outside.
+// This allows us to fake up symbol export by assigning values only known
+// internally to external symbols during Go package initialization.
+static CFISH_INLINE void
+GOLUCY_glue_exported_symbols() {
+ GOLUCY_RegexTokenizer_init_BRIDGE = GOLUCY_RegexTokenizer_init;
+ GOLUCY_RegexTokenizer_Destroy_BRIDGE = GOLUCY_RegexTokenizer_Destroy;
+ GOLUCY_RegexTokenizer_Tokenize_Utf8_BRIDGE
+ = (LUCY_RegexTokenizer_Tokenize_Utf8_t)GOLUCY_RegexTokenizer_Tokenize_Utf8;
+}
+
*/
import "C"
import _ "git-wip-us.apache.org/repos/asf/lucy-clownfish.git/runtime/go/clownfish"
func init() {
+ C.GOLUCY_glue_exported_symbols()
C.lucy_bootstrap_parcel()
}
+
+//export GOLUCY_RegexTokenizer_init
+func GOLUCY_RegexTokenizer_init(rt *C.lucy_RegexTokenizer, pattern *C.cfish_String) *C.lucy_RegexTokenizer {
+ return nil
+}
+
+//export GOLUCY_RegexTokenizer_Destroy
+func GOLUCY_RegexTokenizer_Destroy(rt *C.lucy_RegexTokenizer) {
+}
+
+//export GOLUCY_RegexTokenizer_Tokenize_Utf8
+func GOLUCY_RegexTokenizer_Tokenize_Utf8(rt *C.lucy_RegexTokenizer, str *C.char,
+ stringLen C.size_t, inversion *C.lucy_Inversion) {
+}