You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucy.apache.org by nw...@apache.org on 2013/03/11 22:52:48 UTC
[lucy-commits] [14/18] git commit: refs/heads/master - Implement RegexTokenizer for
C bindings using PCRE
Implement RegexTokenizer for C bindings using PCRE
Project: http://git-wip-us.apache.org/repos/asf/lucy/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/49dbbec6
Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/49dbbec6
Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/49dbbec6
Branch: refs/heads/master
Commit: 49dbbec6e81420e2eb7ae815846f522b244e82d0
Parents: 049bd82
Author: Nick Wellnhofer <we...@aevum.de>
Authored: Sat Mar 2 21:03:52 2013 +0100
Committer: Nick Wellnhofer <we...@aevum.de>
Committed: Sat Mar 9 17:51:55 2013 +0100
----------------------------------------------------------------------
c/src/Lucy/Analysis/RegexTokenizer.c | 153 ++++++++++++++++++++++++++---
common/charmonizer.main | 6 +-
2 files changed, 145 insertions(+), 14 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucy/blob/49dbbec6/c/src/Lucy/Analysis/RegexTokenizer.c
----------------------------------------------------------------------
diff --git a/c/src/Lucy/Analysis/RegexTokenizer.c b/c/src/Lucy/Analysis/RegexTokenizer.c
index 2f21afb..ae4c7d1 100644
--- a/c/src/Lucy/Analysis/RegexTokenizer.c
+++ b/c/src/Lucy/Analysis/RegexTokenizer.c
@@ -15,34 +15,161 @@
*/
#define C_LUCY_REGEXTOKENIZER
+#define CHY_USE_SHORT_NAMES
+#define LUCY_USE_SHORT_NAMES
+
+#include "charmony.h"
+
+#include <string.h>
-#include "CFBind.h"
#include "Lucy/Analysis/RegexTokenizer.h"
+#include "Clownfish/CharBuf.h"
+#include "Clownfish/Err.h"
+#include "Clownfish/Util/Memory.h"
+#include "Clownfish/Util/StringHelper.h"
#include "Lucy/Analysis/Token.h"
#include "Lucy/Analysis/Inversion.h"
-lucy_RegexTokenizer*
-lucy_RegexTokenizer_init(lucy_RegexTokenizer *self,
- const lucy_CharBuf *pattern) {
- THROW(LUCY_ERR, "TODO");
- UNREACHABLE_RETURN(lucy_RegexTokenizer*);
+#if defined(HAS_PCRE_H)
+
+#include <pcre.h>
+
+static uint32_t
+S_count_code_points(const char *string, size_t len);
+
+// Initialize a RegexTokenizer: store the token pattern on the object and
+// compile it with PCRE.
+//
+// self    - the tokenizer being initialized.
+// pattern - the token regex; when NULL a built-in default pattern is used.
+//
+// Returns self.  Throws an Err carrying the message reported by
+// pcre_compile if the pattern cannot be compiled.
+RegexTokenizer*
+RegexTokenizer_init(RegexTokenizer *self, const CharBuf *pattern) {
+    Analyzer_init((Analyzer*)self);
+
+    // regex_src is the C string handed to pcre_compile below.
+    const char *regex_src;
+    if (!pattern) {
+        regex_src = "\\w+(?:['\\x{2019}]\\w+)*";
+        self->pattern
+            = CB_new_from_trusted_utf8(regex_src, strlen(regex_src));
+    }
+    else {
+        self->pattern = CB_Clone(pattern);
+        regex_src     = (char*)CB_Get_Ptr8(self->pattern);
+    }
+
+    // UTF-8 mode, with PCRE's own UTF-8 validity check switched off.
+    const int compile_flags = PCRE_BSR_UNICODE
+                              | PCRE_NEWLINE_LF
+                              | PCRE_UTF8
+                              | PCRE_NO_UTF8_CHECK;
+    const char *compile_err;
+    int         compile_err_pos;
+    pcre *compiled = pcre_compile(regex_src, compile_flags, &compile_err,
+                                  &compile_err_pos, NULL);
+    if (compiled == NULL) {
+        THROW(ERR, "%s", compile_err);
+    }
+
+    // TODO: Check whether pcre_study improves performance
+
+    self->token_re = compiled;
+    return self;
}
void
-lucy_RegexTokenizer_set_token_re(lucy_RegexTokenizer *self, void *token_re) {
- THROW(LUCY_ERR, "TODO");
+RegexTokenizer_set_token_re(RegexTokenizer *self, void *token_re) { // Placeholder: not implemented yet.
+ THROW(ERR, "TODO"); // Always throws; neither argument is used.
}
void
-lucy_RegexTokenizer_destroy(lucy_RegexTokenizer *self) {
- THROW(LUCY_ERR, "TODO");
+RegexTokenizer_destroy(RegexTokenizer *self) {
+ DECREF(self->pattern);
+ pcre *re = (pcre*)self->token_re;
+ if (re) {
+ pcre_free(re);
+ }
+ SUPER_DESTROY(self, REGEXTOKENIZER);
}
void
-lucy_RegexTokenizer_tokenize_str(lucy_RegexTokenizer *self,
+RegexTokenizer_tokenize_str(RegexTokenizer *self,
const char *string, size_t string_len,
- lucy_Inversion *inversion) {
- THROW(LUCY_ERR, "TODO");
+ Inversion *inversion) {
+ pcre *re = (pcre*)self->token_re;
+ int byte_offset = 0;
+ uint32_t cp_offset = 0; // Code points
+ int options = PCRE_NO_UTF8_CHECK;
+ int ovector[3];
+
+ int return_code = pcre_exec(re, NULL, string, string_len, byte_offset,
+ options, ovector, 3);
+ while (return_code >= 0) {
+ const char *match = string + ovector[0];
+ size_t match_len = ovector[1] - ovector[0];
+
+ uint32_t cp_before = S_count_code_points(string + byte_offset,
+ ovector[0] - byte_offset);
+ uint32_t cp_start = cp_offset + cp_before;
+ uint32_t cp_matched = S_count_code_points(match, match_len);
+ uint32_t cp_end = cp_start + cp_matched;
+
+ // Add a token to the new inversion.
+ Token *token = Token_new(match, match_len, cp_start, cp_end, 1.0f, 1);
+ Inversion_Append(inversion, token);
+
+ byte_offset = ovector[1];
+ cp_offset = cp_end;
+ return_code = pcre_exec(re, NULL, string, string_len, byte_offset,
+ options, ovector, 3);
+ }
+
+ if (return_code != PCRE_ERROR_NOMATCH) {
+ THROW(ERR, "pcre_exec failed: %d", return_code);
+ }
+}
+
+// Count the code points in the `len` bytes starting at `string`, advancing
+// each step by the StrHelp_UTF8_COUNT entry for the byte at the current
+// position.  Throws if the final step overshoots `len`, i.e. the range ends
+// in the middle of a multi-byte sequence.
+// NOTE(review): assumes every table entry consulted is non-zero; a zero
+// entry would stall the loop -- confirm against StrHelp_UTF8_COUNT.
+static uint32_t
+S_count_code_points(const char *string, size_t len) {
+    uint32_t count = 0;
+    size_t   pos   = 0;
+
+    for (; pos < len; ++count) {
+        pos += StrHelp_UTF8_COUNT[(uint8_t)(string[pos])];
+    }
+
+    if (pos != len) {
+        THROW(ERR, "Match between code point boundaries in '%s'", string);
+    }
+
+    return count;
+}
+
+#else // HAS_PCRE_H
+
+// Stub compiled when HAS_PCRE_H is not defined: construction always fails.
+// Throws unconditionally; `self` and `pattern` are ignored.
+RegexTokenizer*
+RegexTokenizer_init(RegexTokenizer *self, const CharBuf *pattern) {
+    THROW(ERR,
+          "RegexTokenizer is not available"
+          " because Lucy was compiled without PCRE.");
+    UNREACHABLE_RETURN(RegexTokenizer*);
+}
+
+// Stub compiled when HAS_PCRE_H is not defined.
+// Throws unconditionally; `self` and `token_re` are ignored.
+void
+RegexTokenizer_set_token_re(RegexTokenizer *self, void *token_re) {
+    THROW(ERR,
+          "RegexTokenizer is not available"
+          " because Lucy was compiled without PCRE.");
+}
+
+// Stub compiled when HAS_PCRE_H is not defined.
+// Throws unconditionally and releases nothing; `self` is ignored.
+void
+RegexTokenizer_destroy(RegexTokenizer *self) {
+    THROW(ERR,
+          "RegexTokenizer is not available"
+          " because Lucy was compiled without PCRE.");
+}
+
+// Stub compiled when HAS_PCRE_H is not defined.
+// Throws unconditionally; no tokens are produced and all arguments are
+// ignored.
+void
+RegexTokenizer_tokenize_str(RegexTokenizer *self, const char *string,
+                            size_t string_len, Inversion *inversion) {
+    THROW(ERR,
+          "RegexTokenizer is not available"
+          " because Lucy was compiled without PCRE.");
}
+#endif // HAS_PCRE_H
http://git-wip-us.apache.org/repos/asf/lucy/blob/49dbbec6/common/charmonizer.main
----------------------------------------------------------------------
diff --git a/common/charmonizer.main b/common/charmonizer.main
index 83de908..c0d30b8 100644
--- a/common/charmonizer.main
+++ b/common/charmonizer.main
@@ -278,8 +278,12 @@ S_write_makefile() {
chaz_MakeRule_add_prereq(rule, json_parser_c);
chaz_MakeRule_add_prereq(rule, "$(AUTOGEN_DIR)");
+ const char *link_flags = "";
+ if (chaz_HeadCheck_check_header("pcre.h")) {
+ link_flags = "-lpcre";
+ }
chaz_MakeFile_add_shared_obj(makefile, "$(LUCY_SHOBJ)", "$(LUCY_OBJS)",
- "");
+ link_flags);
chaz_MakeFile_add_rule(makefile, "$(TEST_LUCY_OBJS)", "$(AUTOGEN_DIR)");