You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucy.apache.org by nw...@apache.org on 2011/12/05 21:58:04 UTC
[lucy-commits] svn commit: r1210622 - in /incubator/lucy/branches/LUCY-196-uax-tokenizer/core/Lucy/Analysis: StandardTokenizer.c StandardTokenizer.cfh

Author: nwellnhof
Date: Mon Dec  5 20:58:03 2011
New Revision: 1210622

URL: http://svn.apache.org/viewvc?rev=1210622&view=rev
Log:
UAX #29 StandardTokenizer

Added:
    incubator/lucy/branches/LUCY-196-uax-tokenizer/core/Lucy/Analysis/StandardTokenizer.c
    incubator/lucy/branches/LUCY-196-uax-tokenizer/core/Lucy/Analysis/StandardTokenizer.cfh

Added: incubator/lucy/branches/LUCY-196-uax-tokenizer/core/Lucy/Analysis/StandardTokenizer.c
URL: http://svn.apache.org/viewvc/incubator/lucy/branches/LUCY-196-uax-tokenizer/core/Lucy/Analysis/StandardTokenizer.c?rev=1210622&view=auto
==============================================================================
--- incubator/lucy/branches/LUCY-196-uax-tokenizer/core/Lucy/Analysis/StandardTokenizer.c (added)
+++ incubator/lucy/branches/LUCY-196-uax-tokenizer/core/Lucy/Analysis/StandardTokenizer.c Mon Dec  5 20:58:03 2011
@@ -0,0 +1,245 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define C_LUCY_STANDARDTOKENIZER
+#define C_LUCY_TOKEN
+#define C_LUCY_I32ARRAY
+#include "Lucy/Util/ToolSet.h"
+
+#include "Lucy/Analysis/StandardTokenizer.h"
+#include "Lucy/Analysis/Token.h"
+#include "Lucy/Analysis/Inversion.h"
+
+/*
+ * We use a modified version of the Word_Break property defined in UAX #29.
+ * CR, LF, Newline and all undefined characters map to 0. WB_ASingle
+ * designates characters that are Alphabetic but are excluded from ALetter.
+ * WB_Extend_Format includes characters in both Extend and Format. The other
+ * WB_* values correspond to the standard properties.
+ *
+ * The tables are in a compressed format that uses a three-stage lookup
+ * scheme. They're generated with the perl script gen_word_break_tables.pl
+ * in devel/bin.
+ */
+
+/* Word break classes as stored in the lookup tables. Class 0 has no name
+ * here; it covers CR, LF, Newline and all undefined characters.
+ *
+ * WB_ASingle through WB_Katakana (1-4) must stay contiguous: the tokenizer
+ * detects the start of a word with a range check over exactly these values.
+ * NOTE(review): the numbers presumably have to match what the table
+ * generator script emits -- confirm before renumbering.
+ */
+#define WB_ASingle        1
+#define WB_ALetter        2
+#define WB_Numeric        3
+#define WB_Katakana       4
+#define WB_ExtendNumLet   5
+#define WB_Extend_Format  6
+#define WB_MidNumLet      7
+#define WB_MidLetter      8
+#define WB_MidNum         9
+#include "WordBreak.tab"
+
+/* Cursor into the text being tokenized. Both fields are moved together by
+ * S_iter_advance, so the same position is known in bytes and in characters.
+ */
+typedef struct lucy_StringIter {
+    size_t byte_pos;    /* offset from the start of the text, in bytes */
+    size_t char_pos;    /* number of characters advanced over so far */
+} lucy_StringIter;
+
+/* Forward declarations of the file-local helpers defined further down. */
+
+static int
+S_parse_single(const char *text, size_t len, lucy_StringIter *iter,
+               Inversion *inversion);
+
+static int
+S_parse_word(const char *text, size_t len, lucy_StringIter *iter,
+             int state, Inversion *inversion);
+
+static int
+S_wb_lookup(const char *ptr);
+
+static void
+S_iter_advance(const char *text, lucy_StringIter *iter);
+
+static int
+S_skip_extend_format(const char *text, size_t len, lucy_StringIter *iter);
+
+/* Allocate a blank StandardTokenizer object and hand it to the initializer.
+ * Returns whatever StandardTokenizer_init returns.
+ */
+StandardTokenizer*
+StandardTokenizer_new() {
+    StandardTokenizer *tokenizer
+        = (StandardTokenizer*)VTable_Make_Obj(STANDARDTOKENIZER);
+
+    return StandardTokenizer_init(tokenizer);
+}
+
+/* Initialize a StandardTokenizer. The class adds no state of its own here,
+ * so only the Analyzer base initializer is run. Returns self.
+ */
+StandardTokenizer*
+StandardTokenizer_init(StandardTokenizer *self) {
+    Analyzer *base = (Analyzer*)self;
+
+    Analyzer_init(base);
+
+    return self;
+}
+
+/* Re-tokenize an existing Inversion: the text of every Token pulled from
+ * `inversion` is split into words, and all resulting Tokens are collected in
+ * a freshly created Inversion, which is returned.
+ */
+Inversion*
+StandardTokenizer_transform(StandardTokenizer *self, Inversion *inversion) {
+    Inversion *result = Inversion_new(NULL);
+    Token     *source;
+
+    // Inversion_Next yields NULL once the input is exhausted.
+    for (source = Inversion_Next(inversion);
+         source != NULL;
+         source = Inversion_Next(inversion)
+        ) {
+        StandardTokenizer_Tokenize_Str(self, source->text, source->len,
+                                    result);
+    }
+
+    return result;
+}
+
+/* Tokenize the contents of `text` and return the Tokens in a freshly
+ * created Inversion.
+ */
+Inversion*
+StandardTokenizer_transform_text(StandardTokenizer *self, CharBuf *text) {
+    Inversion *result = Inversion_new(NULL);
+    char      *utf8   = (char*)CB_Get_Ptr8(text);
+    size_t     size   = CB_Get_Size(text);
+
+    StandardTokenizer_Tokenize_Str(self, utf8, size, result);
+
+    return result;
+}
+
+/* Split `text` (`len` bytes) into words and append one Token per word to
+ * `inversion`.
+ *
+ * The text is scanned character by character. A character whose word break
+ * class is WB_ASingle, WB_ALetter, WB_Numeric or WB_Katakana starts a word;
+ * everything else is stepped over. `self` is not consulted.
+ */
+void
+StandardTokenizer_tokenize_str(StandardTokenizer *self, const char *text,
+                               size_t len, Inversion *inversion) {
+    lucy_StringIter iter = { 0, 0 };
+
+    while (iter.byte_pos < len) {
+        int wb = S_wb_lookup(text + iter.byte_pos);
+
+        // The parsers return the class of the character they stopped on,
+        // which may itself start the next word, so keep going until a
+        // character that doesn't start a word turns up.
+        for (;;) {
+            if (wb == WB_ASingle) {
+                wb = S_parse_single(text, len, &iter, inversion);
+            }
+            else if (wb >= WB_ALetter && wb <= WB_Katakana) {
+                wb = S_parse_word(text, len, &iter, wb, inversion);
+            }
+            else {
+                break;
+            }
+
+            // A parser may have consumed the rest of the text.
+            if (iter.byte_pos >= len) { return; }
+        }
+
+        // Step over the character that doesn't start a word.
+        S_iter_advance(text, &iter);
+    }
+}
+
+/*
+ * Emit a word made of exactly one base codepoint plus any extend or format
+ * characters that follow it. This handles Alphabetic characters without the
+ * ALetter word break property: ideographs, Hiragana, and "complex content".
+ *
+ * On entry the iterator sits on the base character. On return it sits on
+ * the first character after the word, and that character's word break
+ * property is the return value (see S_skip_extend_format for the value
+ * returned when the end of the text is reached).
+ */
+static int
+S_parse_single(const char *text, size_t len, lucy_StringIter *iter,
+               Inversion *inversion) {
+    const lucy_StringIter word_start = *iter;
+
+    // Consumes the base character and everything attached to it.
+    const int next_wb = S_skip_extend_format(text, len, iter);
+
+    const char *word     = text + word_start.byte_pos;
+    size_t      word_len = iter->byte_pos - word_start.byte_pos;
+    Token *token = Token_new(word, word_len, word_start.char_pos,
+                             iter->char_pos, 1.0f, 1);
+    Inversion_Append(inversion, token);
+
+    return next_wb;
+}
+
+/*
+ * Parse a word starting with an ALetter, Numeric or Katakana character and
+ * append it to the inversion as a Token.
+ *
+ * text, len  - UTF-8 buffer being tokenized and its size in bytes.
+ * iter       - on entry, sits on the first character of the word. On
+ *              return, sits on the character whose property is returned.
+ *              That is not always the character right after the token: a
+ *              Mid* character that turned out not to join two word parts
+ *              is skipped but is not part of the token.
+ * state      - word break property of the first character.
+ * inversion  - receives the new Token.
+ *
+ * Returns the word break property of the character the iterator ends up
+ * on. The value is only meaningful while the iterator is still inside the
+ * text; callers must check iter->byte_pos against len first.
+ */
+static int
+S_parse_word(const char *text, size_t len, lucy_StringIter *iter,
+             int state, Inversion *inversion) {
+    int wb = -1;
+    lucy_StringIter start = *iter;
+    S_iter_advance(text, iter);
+    // `end` trails `iter`: it marks the end of what belongs to the token.
+    lucy_StringIter end = *iter;
+
+    while (iter->byte_pos < len) {
+        int joins = 0;
+        wb = S_wb_lookup(text + iter->byte_pos);
+
+        switch (wb) {
+          case WB_ALetter:
+          case WB_Numeric:
+            // Letters and digits join each other and ExtendNumLet, but
+            // never follow Katakana directly (UAX #29 WB5, WB8-WB10).
+            if (state != WB_Katakana) {
+                state = wb;
+                joins = 1;
+            }
+            break;
+          case WB_Katakana:
+            // Katakana only follows Katakana or ExtendNumLet (WB13, WB13b).
+            if (state != WB_ALetter && state != WB_Numeric) {
+                state = wb;
+                joins = 1;
+            }
+            break;
+          case WB_ExtendNumLet:
+            state = wb;
+            joins = 1;
+            break;
+          case WB_Extend_Format:
+            // Transparent: part of the word, but leaves the state alone.
+            joins = 1;
+            break;
+          case WB_MidNumLet:
+          case WB_MidLetter:
+          case WB_MidNum:
+            // A Mid* character only joins when it is of the right kind for
+            // the current state and is followed by a character of that
+            // same class.
+            if ((state == WB_ALetter && wb != WB_MidNum)
+                || (state == WB_Numeric && wb != WB_MidLetter)
+               ) {
+                wb = S_skip_extend_format(text, len, iter);
+                joins = (wb == state);
+            }
+            break;
+          default:
+            break;
+        }
+
+        if (!joins) { break; }
+
+        S_iter_advance(text, iter);
+        end = *iter;
+    }
+
+    Token *token = Token_new(text + start.byte_pos,
+                             end.byte_pos - start.byte_pos,
+                             start.char_pos, end.char_pos, 1.0f, 1);
+    Inversion_Append(inversion, token);
+
+    return wb;
+}
+
+/*
+ * Decode the UTF-8 character at `ptr` and return its word break class by
+ * walking the three lookup tables. Code points whose first-stage index lies
+ * outside wb_table1 yield 0.
+ */
+static int
+S_wb_lookup(const char *ptr) {
+    uint32_t code_point = StrHelp_decode_utf8_char(ptr);
+    uint32_t upper      = code_point >> WB_TABLE2_SHIFT;
+    uint32_t stage1     = upper >> WB_TABLE1_SHIFT;
+
+    if (stage1 >= WB_TABLE1_SIZE) {
+        return 0;
+    }
+
+    // Each stage supplies the high bits of the index into the next one.
+    uint32_t stage2 = (wb_table1[stage1] << WB_TABLE1_SHIFT)
+                      | (upper & WB_TABLE1_MASK);
+    uint32_t stage3 = (wb_table2[stage2] << WB_TABLE2_SHIFT)
+                      | (code_point & WB_TABLE2_MASK);
+
+    return wb_table3[stage3];
+}
+
+/*
+ * Move the iterator forward by one character. The byte offset grows by the
+ * StrHelp_UTF8_COUNT entry for the byte at the current position; the
+ * character offset grows by one. No bounds check is made here.
+ */
+static void
+S_iter_advance(const char *text, lucy_StringIter *iter) {
+    const uint8_t lead_byte = (uint8_t)text[iter->byte_pos];
+
+    iter->byte_pos += StrHelp_UTF8_COUNT[lead_byte];
+    iter->char_pos++;
+}
+
+/*
+ * Step over the current character, then over any Extend and Format
+ * characters that follow it.
+ *
+ * Returns the word break property of the character the iterator stops on.
+ * If the end of the text is reached instead, the result does not describe
+ * a current character: it is -1 when nothing was looked up, or
+ * WB_Extend_Format when at least one such character was skipped.
+ */
+static int
+S_skip_extend_format(const char *text, size_t len, lucy_StringIter *iter) {
+    int wb = -1;
+
+    for (;;) {
+        S_iter_advance(text, iter);
+
+        if (iter->byte_pos >= len) {
+            return wb;
+        }
+
+        wb = S_wb_lookup(text + iter->byte_pos);
+
+        if (wb != WB_Extend_Format) {
+            return wb;
+        }
+    }
+}
+
+/* Equality test. No fields are compared: an object is equal to `self` if
+ * it is the very same object or passes the Obj_Is_A check against
+ * STANDARDTOKENIZER.
+ */
+bool_t
+StandardTokenizer_equals(StandardTokenizer *self, Obj *other) {
+    // Same object: nothing more to check.
+    if ((StandardTokenizer*)other == self) {
+        return true;
+    }
+
+    return Obj_Is_A(other, STANDARDTOKENIZER) ? true : false;
+}
+
+

Added: incubator/lucy/branches/LUCY-196-uax-tokenizer/core/Lucy/Analysis/StandardTokenizer.cfh
URL: http://svn.apache.org/viewvc/incubator/lucy/branches/LUCY-196-uax-tokenizer/core/Lucy/Analysis/StandardTokenizer.cfh?rev=1210622&view=auto
==============================================================================
--- incubator/lucy/branches/LUCY-196-uax-tokenizer/core/Lucy/Analysis/StandardTokenizer.cfh (added)
+++ incubator/lucy/branches/LUCY-196-uax-tokenizer/core/Lucy/Analysis/StandardTokenizer.cfh Mon Dec  5 20:58:03 2011
@@ -0,0 +1,57 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+parcel Lucy;
+
+/** Split a string into tokens.
+ *
+ * Generically, "tokenizing" is a process of breaking up a string into an
+ * array of "tokens".  For instance, the string "three blind mice" might be
+ * tokenized into "three", "blind", "mice".
+ *
+ * Lucy::Analysis::StandardTokenizer breaks up the text at the word
+ * boundaries defined in Unicode Standard Annex #29. It then returns those
+ * words that start with an alphabetic or numeric character.
+ */
+class Lucy::Analysis::StandardTokenizer
+    inherits Lucy::Analysis::Analyzer {
+
+    /** Allocate a new StandardTokenizer and initialize it with init().
+     */
+    inert incremented StandardTokenizer*
+    new();
+
+    /** Constructor.  Takes no arguments.
+     */
+    public inert StandardTokenizer*
+    init(StandardTokenizer *self);
+
+    /** Tokenize the text of every Token in the supplied Inversion and
+     * return the resulting Tokens in a new Inversion.
+     */
+    public incremented Inversion*
+    Transform(StandardTokenizer *self, Inversion *inversion);
+
+    /** Tokenize the supplied text and return the resulting Tokens in a new
+     * Inversion.
+     */
+    public incremented Inversion*
+    Transform_Text(StandardTokenizer *self, CharBuf *text);
+
+    /** Tokenize the supplied string and add any Tokens generated to the
+     * supplied Inversion.
+     *
+     * @param text The string to tokenize.
+     * @param len Size of <code>text</code> in bytes.
+     * @param inversion The Inversion that receives the Tokens.
+     */
+    void
+    Tokenize_Str(StandardTokenizer *self, const char *text, size_t len,
+                 Inversion *inversion);
+
+    /** Return true if <code>other</code> is the same object or is a
+     * StandardTokenizer.  No further state is compared.
+     */
+    public bool_t
+    Equals(StandardTokenizer *self, Obj *other);
+}
+
+