Posted to commits@lucy.apache.org by nw...@apache.org on 2011/12/12 15:19:18 UTC

[lucy-commits] svn commit: r1213252 [1/4] - in /incubator/lucy/trunk: core/Lucy/Analysis/ core/Lucy/Test/Analysis/ devel/bin/ devel/conf/ modules/unicode/ucd/ perl/ perl/buildlib/Lucy/ perl/lib/Lucy/ perl/lib/Lucy/Analysis/ perl/t/ perl/t/core/

Author: nwellnhof
Date: Mon Dec 12 14:19:17 2011
New Revision: 1213252

URL: http://svn.apache.org/viewvc?rev=1213252&view=rev
Log:
LUCY-196 UAX #29 tokenizer

Merge branch LUCY-196-uax-tokenizer. This adds the new StandardTokenizer.

Added:
    incubator/lucy/trunk/core/Lucy/Analysis/StandardTokenizer.c
    incubator/lucy/trunk/core/Lucy/Analysis/StandardTokenizer.cfh
    incubator/lucy/trunk/core/Lucy/Test/Analysis/TestStandardTokenizer.c
    incubator/lucy/trunk/core/Lucy/Test/Analysis/TestStandardTokenizer.cfh
    incubator/lucy/trunk/devel/bin/UnicodeTable.pm
    incubator/lucy/trunk/devel/bin/gen_word_break_data.pl
    incubator/lucy/trunk/modules/unicode/ucd/
    incubator/lucy/trunk/modules/unicode/ucd/WordBreak.tab
    incubator/lucy/trunk/modules/unicode/ucd/WordBreakTest.json
    incubator/lucy/trunk/perl/lib/Lucy/Analysis/StandardTokenizer.pm
    incubator/lucy/trunk/perl/t/158-standard_tokenizer.t
    incubator/lucy/trunk/perl/t/core/158-standard_tokenizer.t
Modified:
    incubator/lucy/trunk/devel/conf/rat-excludes
    incubator/lucy/trunk/perl/MANIFEST
    incubator/lucy/trunk/perl/buildlib/Lucy/Build.pm
    incubator/lucy/trunk/perl/lib/Lucy/Test.pm

Added: incubator/lucy/trunk/core/Lucy/Analysis/StandardTokenizer.c
URL: http://svn.apache.org/viewvc/incubator/lucy/trunk/core/Lucy/Analysis/StandardTokenizer.c?rev=1213252&view=auto
==============================================================================
--- incubator/lucy/trunk/core/Lucy/Analysis/StandardTokenizer.c (added)
+++ incubator/lucy/trunk/core/Lucy/Analysis/StandardTokenizer.c Mon Dec 12 14:19:17 2011
@@ -0,0 +1,301 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define C_LUCY_STANDARDTOKENIZER
+#define C_LUCY_TOKEN
+#include "Lucy/Util/ToolSet.h"
+
+#include "Lucy/Analysis/StandardTokenizer.h"
+#include "Lucy/Analysis/Token.h"
+#include "Lucy/Analysis/Inversion.h"
+
+/*
+ * We use a modified version of the Word_Break property defined in UAX #29.
+ * CR, LF, Newline and all undefined characters map to 0. WB_ASingle
+ * designates characters that are Alphabetic but are excluded from ALetter.
+ * WB_Extend_Format covers characters in either Extend or Format. The other
+ * WB_* values correspond to the standard properties.
+ *
+ * The tables are in a compressed format that uses a three-stage lookup
+ * scheme. They're generated with the Perl script gen_word_break_data.pl
+ * in devel/bin.
+ */
+
+#define WB_ASingle        1
+#define WB_ALetter        2
+#define WB_Numeric        3
+#define WB_Katakana       4
+#define WB_ExtendNumLet   5
+#define WB_Extend_Format  6
+#define WB_MidNumLet      7
+#define WB_MidLetter      8
+#define WB_MidNum         9
+
+#include "WordBreak.tab"
+
+typedef struct lucy_StringIter {
+    size_t byte_pos;
+    size_t char_pos;
+} lucy_StringIter;
+
+static int
+S_parse_single(const char *text, size_t len, lucy_StringIter *iter,
+               Inversion *inversion);
+
+static int
+S_parse_word(const char *text, size_t len, lucy_StringIter *iter,
+             int state, Inversion *inversion);
+
+static int
+S_wb_lookup(const char *ptr);
+
+static void
+S_iter_advance(const char *text, lucy_StringIter *iter);
+
+static int
+S_skip_extend_format(const char *text, size_t len, lucy_StringIter *iter);
+
+StandardTokenizer*
+StandardTokenizer_new() {
+    StandardTokenizer *self = (StandardTokenizer*)VTable_Make_Obj(STANDARDTOKENIZER);
+    return StandardTokenizer_init(self);
+}
+
+StandardTokenizer*
+StandardTokenizer_init(StandardTokenizer *self) {
+    Analyzer_init((Analyzer*)self);
+    return self;
+}
+
+Inversion*
+StandardTokenizer_transform(StandardTokenizer *self, Inversion *inversion) {
+    Inversion *new_inversion = Inversion_new(NULL);
+    Token *token;
+
+    while (NULL != (token = Inversion_Next(inversion))) {
+        StandardTokenizer_Tokenize_Str(self, token->text, token->len,
+                                    new_inversion);
+    }
+
+    return new_inversion;
+}
+
+Inversion*
+StandardTokenizer_transform_text(StandardTokenizer *self, CharBuf *text) {
+    Inversion *new_inversion = Inversion_new(NULL);
+    StandardTokenizer_Tokenize_Str(self, (char*)CB_Get_Ptr8(text),
+                                CB_Get_Size(text), new_inversion);
+    return new_inversion;
+}
+
+void
+StandardTokenizer_tokenize_str(StandardTokenizer *self, const char *text,
+                               size_t len, Inversion *inversion) {
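+    // Reject text that ends in a truncated multi-byte UTF-8 sequence;
+    // the scan below reads continuation bytes without further bounds checks.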
+    if ((len >= 1 && (uint8_t)text[len-1] >= 0xC0)
+    ||  (len >= 2 && (uint8_t)text[len-2] >= 0xE0)
+    ||  (len >= 3 && (uint8_t)text[len-3] >= 0xF0)) {
+        THROW(ERR, "Invalid UTF-8 sequence");
+    }
+
+    lucy_StringIter iter = { 0, 0 };
+
+    while (iter.byte_pos < len) {
+        int wb = S_wb_lookup(text + iter.byte_pos);
+
+        while (wb >= WB_ASingle && wb <= WB_ExtendNumLet) {
+            if (wb == WB_ASingle) {
+                wb = S_parse_single(text, len, &iter, inversion);
+            }
+            else {
+                wb = S_parse_word(text, len, &iter, wb, inversion);
+            }
+            if (iter.byte_pos >= len) return;
+        }
+
+        S_iter_advance(text, &iter);
+    }
+}
+
+/*
+ * Parse a word consisting of a single codepoint followed by extend or
+ * format characters. Used for Alphabetic characters that don't have the
+ * ALetter word break property: ideographs, Hiragana, and "complex content".
+ * Advances the iterator and returns the word break property of the current
+ * character.
+ */
+static int
+S_parse_single(const char *text, size_t len, lucy_StringIter *iter,
+               Inversion *inversion) {
+    lucy_StringIter start = *iter;
+    int wb = S_skip_extend_format(text, len, iter);
+
+    Token *token = Token_new(text + start.byte_pos,
+                             iter->byte_pos - start.byte_pos,
+                             start.char_pos, iter->char_pos, 1.0f, 1);
+    Inversion_Append(inversion, token);
+
+    return wb;
+}
+
+/*
+ * Parse a word starting with an ALetter, Numeric or Katakana character.
+ * Advances the iterator and returns the word break property of the current
+ * character.
+ */
+static int
+S_parse_word(const char *text, size_t len, lucy_StringIter *iter,
+             int state, Inversion *inversion) {
+    int wb = -1;
+    lucy_StringIter start = *iter;
+    S_iter_advance(text, iter);
+    lucy_StringIter end = *iter;
+
+    while (iter->byte_pos < len) {
+        wb = S_wb_lookup(text + iter->byte_pos);
+
+        switch(wb) {
+          case WB_ALetter:
+          case WB_Numeric:
+            if (state == WB_Katakana) { goto word_break; }
+            break;
+          case WB_Katakana:
+            if (state == WB_ALetter || state == WB_Numeric) {
+                goto word_break;
+            }
+            break;
+          case WB_ExtendNumLet:
+            break;
+          case WB_Extend_Format:
+            // keep state
+            wb = state;
+            break;
+          case WB_MidNumLet:
+          case WB_MidLetter:
+          case WB_MidNum:
+            if ((state == WB_ALetter && wb != WB_MidNum)
+            ||  (state == WB_Numeric && wb != WB_MidLetter)) {
+                wb = S_skip_extend_format(text, len, iter);
+                if (wb == state) { break; }
+            }
+            goto word_break;
+          default:
+            goto word_break;
+        }
+
+        state = wb;
+        S_iter_advance(text, iter);
+        end = *iter;
+    }
+
+    Token *token;
+  word_break:
+    token = Token_new(text + start.byte_pos, end.byte_pos - start.byte_pos,
+                      start.char_pos, end.char_pos, 1.0f, 1);
+    Inversion_Append(inversion, token);
+
+    return wb;
+}
+
+/*
+ * Conceptually, the word break property table is split into rows that
+ * contain 64 columns and planes that contain 64 rows (not to be confused with
+ * the 65,536-character Unicode planes). So bits 0-5 of a code point contain
+ * the column index into a row, bits 6-11 contain the row index into a plane,
+ * and bits 12-20 contain the plane index.
+ *
+ * To save space, identical rows are merged so the row table contains only
+ * unique rows and the plane table contains row indices remapped to row ids.
+ * Then, identical planes are merged, and a plane map table is created with
+ * plane indices remapped to plane ids.
+ *
+ * The row and plane tables are simple one-dimensional arrays created by
+ * concatenating all unique rows and planes. So looking up an entry can be
+ * done by left shifting the id and ORing the index.
+ */
+
+#define WB_TABLE_LOOKUP(table, id, index) table [ ((id) << 6) | (index) ]
+
+static int
+S_wb_lookup(const char *ptr) {
+    uint8_t start = *(uint8_t*)ptr++;
+
+    if (start < 0x80) { return wb_ascii[start]; }
+
+    size_t plane_id, row_index;
+
+    if (start < 0xE0) {
+        if (start < 0xC0) {
+            THROW(ERR, "Invalid UTF-8 sequence");
+        }
+        // two byte sequence
+        // 110rrrrr 10cccccc
+        plane_id  = 0;
+        row_index = start & 0x1F;
+    }
+    else {
+        size_t plane_index;
+        if (start < 0xF0) {
+            // three byte sequence
+            // 1110pppp 10rrrrrr 10cccccc
+            plane_index = start & 0x0F;
+        }
+        else {
+            // four byte sequence
+            // 11110ppp 10pppppp 10rrrrrr 10cccccc
+            plane_index = ((start & 0x07) << 6) | (*ptr++ & 0x3F);
+        }
+        if (plane_index >= WB_PLANE_MAP_SIZE) { return 0; }
+        plane_id  = wb_plane_map[plane_index];
+        row_index = *ptr++ & 0x3F;
+    }
+
+    size_t row_id = WB_TABLE_LOOKUP(wb_planes, plane_id, row_index);
+    size_t column_index = *ptr++ & 0x3F;
+    return WB_TABLE_LOOKUP(wb_rows, row_id, column_index);
+}
+
+static void
+S_iter_advance(const char *text, lucy_StringIter *iter) {
+    iter->byte_pos += StrHelp_UTF8_COUNT[*(uint8_t*)(text + iter->byte_pos)];
+    iter->char_pos += 1;
+}
+
+/*
+ * Advances the iterator skipping over Extend and Format characters.
+ * Returns the word break property of the current character.
+ */
+static int
+S_skip_extend_format(const char *text, size_t len, lucy_StringIter *iter) {
+    int wb = -1;
+
+    do {
+        S_iter_advance(text, iter);
+        if (iter->byte_pos >= len) { break; }
+        wb = S_wb_lookup(text + iter->byte_pos);
+    } while (wb == WB_Extend_Format);
+
+    return wb;
+}
+
+bool_t
+StandardTokenizer_equals(StandardTokenizer *self, Obj *other) {
+    StandardTokenizer *const twin = (StandardTokenizer*)other;
+    if (twin == self)                        { return true; }
+    if (!Obj_Is_A(other, STANDARDTOKENIZER)) { return false; }
+    return true;
+}
+
+
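
The three-stage scheme above, sketched in Perl for readers following along
(wb_ascii, wb_plane_map, wb_planes, and wb_rows are the real table names from
WordBreak.tab; indexing directly by code point is an illustration, since
S_wb_lookup extracts the same bit fields from the UTF-8 bytes instead):

    # Hypothetical helper, assuming the generated tables have been loaded
    # into Perl arrays; $plane_map_size mirrors WB_PLANE_MAP_SIZE.
    sub wb_lookup_sketch {
        my ($cp) = @_;                        # a Unicode code point
        return $wb_ascii[$cp] if $cp < 0x80;  # fast path for ASCII
        my $col   = $cp & 0x3F;               # bits 0-5:   column in a row
        my $row   = ( $cp >> 6 ) & 0x3F;      # bits 6-11:  row in a plane
        my $plane = $cp >> 12;                # bits 12-20: plane index
        return 0 if $plane >= $plane_map_size;    # unassigned planes => 0
        my $plane_id = $wb_plane_map[$plane];
        my $row_id   = $wb_planes[ ( $plane_id << 6 ) | $row ];
        return $wb_rows[ ( $row_id << 6 ) | $col ];
    }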

Added: incubator/lucy/trunk/core/Lucy/Analysis/StandardTokenizer.cfh
URL: http://svn.apache.org/viewvc/incubator/lucy/trunk/core/Lucy/Analysis/StandardTokenizer.cfh?rev=1213252&view=auto
==============================================================================
--- incubator/lucy/trunk/core/Lucy/Analysis/StandardTokenizer.cfh (added)
+++ incubator/lucy/trunk/core/Lucy/Analysis/StandardTokenizer.cfh Mon Dec 12 14:19:17 2011
@@ -0,0 +1,57 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+parcel Lucy;
+
+/** Split a string into tokens.
+ *
+ * Generically, "tokenizing" is a process of breaking up a string into an
+ * array of "tokens".  For instance, the string "three blind mice" might be
+ * tokenized into "three", "blind", "mice".
+ *
+ * Lucy::Analysis::StandardTokenizer breaks up the text at the word
+ * boundaries defined in Unicode Standard Annex #29. It then returns those
+ * words that start with an alphabetic or numeric character.
+ */
+class Lucy::Analysis::StandardTokenizer
+    inherits Lucy::Analysis::Analyzer {
+
+    inert incremented StandardTokenizer*
+    new();
+
+    /** Constructor.  Takes no arguments.
+     */
+    public inert StandardTokenizer*
+    init(StandardTokenizer *self);
+
+    public incremented Inversion*
+    Transform(StandardTokenizer *self, Inversion *inversion);
+
+    public incremented Inversion*
+    Transform_Text(StandardTokenizer *self, CharBuf *text);
+
+    /** Tokenize the supplied string and add any Tokens generated to the
+     * supplied Inversion.
+     */
+    void
+    Tokenize_Str(StandardTokenizer *self, const char *text, size_t len,
+                 Inversion *inversion);
+
+    public bool_t
+    Equals(StandardTokenizer *self, Obj *other);
+}
+
+
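
On the Perl side this class surfaces as Lucy::Analysis::StandardTokenizer
(added under perl/lib/Lucy/Analysis/ in this commit). A minimal usage sketch,
assuming the bindings expose Analyzer's split() helper, which the C tests
exercise via StandardTokenizer_Split:

    use Lucy::Analysis::StandardTokenizer;

    my $tokenizer = Lucy::Analysis::StandardTokenizer->new;
    my $words     = $tokenizer->split("three blind mice");
    # $words: [ "three", "blind", "mice" ]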

Added: incubator/lucy/trunk/core/Lucy/Test/Analysis/TestStandardTokenizer.c
URL: http://svn.apache.org/viewvc/incubator/lucy/trunk/core/Lucy/Test/Analysis/TestStandardTokenizer.c?rev=1213252&view=auto
==============================================================================
--- incubator/lucy/trunk/core/Lucy/Test/Analysis/TestStandardTokenizer.c (added)
+++ incubator/lucy/trunk/core/Lucy/Test/Analysis/TestStandardTokenizer.c Mon Dec 12 14:19:17 2011
@@ -0,0 +1,130 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define C_LUCY_TESTSTANDARDTOKENIZER
+#include "Lucy/Util/ToolSet.h"
+
+#include "Lucy/Test.h"
+#include "Lucy/Test/Analysis/TestStandardTokenizer.h"
+#include "Lucy/Analysis/StandardTokenizer.h"
+#include "Lucy/Store/FSFolder.h"
+#include "Lucy/Util/Json.h"
+
+static void
+test_Dump_Load_and_Equals(TestBatch *batch) {
+    StandardTokenizer *tokenizer = StandardTokenizer_new();
+    Obj *dump  = StandardTokenizer_Dump(tokenizer);
+    StandardTokenizer *clone = (StandardTokenizer*)StandardTokenizer_Load(tokenizer, dump);
+
+    TEST_TRUE(batch,
+              StandardTokenizer_Equals(tokenizer, (Obj*)clone),
+              "Dump => Load round trip");
+
+    DECREF(tokenizer);
+    DECREF(dump);
+    DECREF(clone);
+}
+
+static void
+test_tokenizer(TestBatch *batch) {
+    StandardTokenizer *tokenizer = StandardTokenizer_new();
+
+    ZombieCharBuf *word = ZCB_WRAP_STR(
+        " ."
+        "tha\xCC\x82t's"
+        ":"
+        "1,02\xC2\xADZ4.38"
+        "\xE0\xB8\x81\xC2\xAD\xC2\xAD"
+        "\xF0\xA0\x80\x80"
+        "a"
+        "/",
+        35);
+    VArray *got = StandardTokenizer_Split(tokenizer, (CharBuf*)word);
+    CharBuf *token = (CharBuf*)VA_Fetch(got, 0);
+    TEST_TRUE(batch,
+              token
+              && CB_Is_A(token, CHARBUF)
+              && CB_Equals_Str(token, "tha\xcc\x82t's", 8),
+              "Token: %s", CB_Get_Ptr8(token));
+    token = (CharBuf*)VA_Fetch(got, 1);
+    TEST_TRUE(batch,
+              token
+              && CB_Is_A(token, CHARBUF)
+              && CB_Equals_Str(token, "1,02\xC2\xADZ4.38", 11),
+              "Token: %s", CB_Get_Ptr8(token));
+    token = (CharBuf*)VA_Fetch(got, 2);
+    TEST_TRUE(batch,
+              token
+              && CB_Is_A(token, CHARBUF)
+              && CB_Equals_Str(token, "\xE0\xB8\x81\xC2\xAD\xC2\xAD", 7),
+              "Token: %s", CB_Get_Ptr8(token));
+    token = (CharBuf*)VA_Fetch(got, 3);
+    TEST_TRUE(batch,
+              token
+              && CB_Is_A(token, CHARBUF)
+              && CB_Equals_Str(token, "\xF0\xA0\x80\x80", 4),
+              "Token: %s", CB_Get_Ptr8(token));
+    token = (CharBuf*)VA_Fetch(got, 4);
+    TEST_TRUE(batch,
+              token
+              && CB_Is_A(token, CHARBUF)
+              && CB_Equals_Str(token, "a", 1),
+              "Token: %s", CB_Get_Ptr8(token));
+    DECREF(got);
+
+    CharBuf  *path           = CB_newf("modules");
+    FSFolder *modules_folder = FSFolder_new(path);
+    if (!FSFolder_Check(modules_folder)) {
+        DECREF(modules_folder);
+        CB_setf(path, "../modules");
+        modules_folder = FSFolder_new(path);
+        if (!FSFolder_Check(modules_folder)) {
+            THROW(ERR, "Can't open modules folder");
+        }
+    }
+    CB_setf(path, "unicode/ucd/WordBreakTest.json");
+    VArray *tests = (VArray*)Json_slurp_json((Folder*)modules_folder, path);
+    if (!tests) { RETHROW(Err_get_error()); }
+
+    for (uint32_t i = 0, max = VA_Get_Size(tests); i < max; i++) {
+        Hash *test = (Hash*)VA_Fetch(tests, i);
+        CharBuf *text = (CharBuf*)Hash_Fetch_Str(test, "text", 4);
+        VArray *wanted = (VArray*)Hash_Fetch_Str(test, "words", 5);
+        VArray *got = StandardTokenizer_Split(tokenizer, text);
+        TEST_TRUE(batch, VA_Equals(wanted, (Obj*)got), "UCD test #%d", i + 1);
+        DECREF(got);
+    }
+
+    DECREF(tests);
+    DECREF(modules_folder);
+    DECREF(path);
+
+    DECREF(tokenizer);
+}
+
+void
+TestStandardTokenizer_run_tests() {
+    TestBatch *batch = TestBatch_new(984);
+
+    TestBatch_Plan(batch);
+
+    test_Dump_Load_and_Equals(batch);
+    test_tokenizer(batch);
+
+    DECREF(batch);
+}
+
+
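
The WordBreakTest.json cases consumed by the loop above come from the UCD test
suite via devel/bin/gen_word_break_data.pl; each entry pairs an input string
with the words the tokenizer should produce. As a Perl structure (shape per
the generator; values illustrative):

    # One decoded test case:
    my $case = {
        text  => "A\x{00AD}B 1",           # input, includes a Format char
        words => [ "A\x{00AD}B", "1" ],    # expected tokens
    };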

Added: incubator/lucy/trunk/core/Lucy/Test/Analysis/TestStandardTokenizer.cfh
URL: http://svn.apache.org/viewvc/incubator/lucy/trunk/core/Lucy/Test/Analysis/TestStandardTokenizer.cfh?rev=1213252&view=auto
==============================================================================
--- incubator/lucy/trunk/core/Lucy/Test/Analysis/TestStandardTokenizer.cfh (added)
+++ incubator/lucy/trunk/core/Lucy/Test/Analysis/TestStandardTokenizer.cfh Mon Dec 12 14:19:17 2011
@@ -0,0 +1,24 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+parcel Lucy;
+
+inert class Lucy::Test::Analysis::TestStandardTokenizer {
+    inert void
+    run_tests();
+}
+
+

Added: incubator/lucy/trunk/devel/bin/UnicodeTable.pm
URL: http://svn.apache.org/viewvc/incubator/lucy/trunk/devel/bin/UnicodeTable.pm?rev=1213252&view=auto
==============================================================================
--- incubator/lucy/trunk/devel/bin/UnicodeTable.pm (added)
+++ incubator/lucy/trunk/devel/bin/UnicodeTable.pm Mon Dec 12 14:19:17 2011
@@ -0,0 +1,426 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+package UnicodeTable;
+use strict;
+
+=head1 NAME
+
+UnicodeTable - Create compressed Unicode tables for C programs
+
+=head1 SYNOPSIS
+
+    my $table = UnicodeTable->read(
+        filename => $filename,
+        type     => 'Enumerated',
+        map      => \%map,
+    );
+
+    my $comp = $table->compress($shift);
+
+    $comp->dump( $file, $name );
+
+=head1 DESCRIPTION
+
+This module creates compressed tables used to look up Unicode properties
+in C programs. To compress a table, it's split into blocks of a fixed
+size. Identical blocks are discovered and only unique blocks are written to
+the compressed table. An additional map table is created to map original
+block indices to block ids.
+
+The map tables can then be compressed again using the same algorithm.
+
+Powers of two are used as block sizes, so the table indices used to look up
+values can be computed with bit operations.
+
+=head1 METHODS
+
+=head2 new
+
+    my $table = UnicodeTable->new(
+        values    => \@values,
+        default   => $default,
+        max       => $max,
+        shift     => $shift,
+        map_table => $map_table,
+    );
+
+\@values is an arrayref with the table values, $max is the maximum value.
+The default value for undefined table entries is $default or 0.
+$shift and $map_table are used for compressed tables.
+
+=cut
+
+sub new {
+    my $class = shift;
+
+    my $opts = @_ == 1 ? $_[0] : {@_};
+    my $self = bless( {}, $class );
+
+    for my $name (qw(values default max shift map_table)) {
+        $self->{$name} = $opts->{$name};
+    }
+
+    $self->{default} = 0
+        if !defined( $self->{default} );
+    $self->{mask} = ( 1 << $self->{shift} ) - 1
+        if defined( $self->{shift} );
+
+    return $self;
+}
+
+=head2 read
+
+    my $table = UnicodeTable->read(
+        filename => $filename,
+        type     => $type,
+        map      => \%map,
+        default  => $default,
+    );
+
+Reads a table from a Unicode data text file. $type is either 'Enumerated'
+or 'Boolean'. \%map is a hashref that maps property values to integers.
+For booleans, these integers are ORed. $default is the default value passed
+to L<new>.
+
+=cut
+
+sub read {
+    my $class = shift;
+
+    my $opts = @_ == 1 ? $_[0] : {@_};
+    my $max = 0;
+    my @values;
+
+    my $filename = $opts->{filename};
+    die('filename missing') if !defined($filename);
+    my $type = $opts->{type} or die('type missing');
+    my $map  = $opts->{map}  or die('map missing');
+    $type = lc($type);
+
+    open( my $file, '<', $filename )
+        or die("$filename: $!\n");
+
+    while ( my $line = $file->getline ) {
+        $line =~ s/\s*(#.*)?\z//s;
+        next if $line eq '';
+        my ( $chars, $prop ) = split( /\s*;\s*/, $line );
+        my $val = $map->{$prop};
+
+        if ( !defined($val) ) {
+            if ( $type eq 'boolean' ) {
+                next;
+            }
+            else {
+                die("unknown property '$prop'");
+            }
+        }
+
+        $max = $val if $val > $max;
+
+        if ( $chars =~ /^[0-9A-Fa-f]+\z/ ) {
+            my $i = hex($chars);
+            if ( $type eq 'boolean' ) {
+                $values[$i] |= $val;
+            }
+            else {
+                $values[$i] = $val;
+            }
+        }
+        elsif ( $chars =~ /^(\w+)\.\.(\w+)\z/ ) {
+            my ( $l, $r ) = ( hex($1), hex($2) );
+            die("invalid range '$chars'") if $l > $r;
+
+            for ( my $i = $l; $i <= $r; ++$i ) {
+                if ( $type eq 'boolean' ) {
+                    $values[$i] |= $val;
+                }
+                else {
+                    $values[$i] = $val;
+                }
+            }
+        }
+        else {
+            die("invalid range '$chars'");
+        }
+    }
+
+    close($file);
+
+    return $class->new(
+        values  => \@values,
+        default => $opts->{default},
+        max     => $max,
+    );
+}
+
+=head2 shift
+
+=head2 mask
+
+=head2 max
+
+=head2 map_table
+
+Accessors
+
+=cut
+
+sub shift {
+    return $_[0]->{shift};
+}
+
+sub mask {
+    return $_[0]->{mask};
+}
+
+sub max {
+    return $_[0]->{max};
+}
+
+sub map_table {
+    return $_[0]->{map_table};
+}
+
+=head2 set
+
+    $table->set($i, $value);
+
+Set entry at index $i to $value. Don't use with compressed tables.
+
+=cut
+
+sub set {
+    my ( $self, $i, $value ) = @_;
+    $self->{values}[$i] = $value;
+    $self->{max} = $value if $value > $self->{max};
+}
+
+=head2 size
+
+    my $size = $table->size;
+
+Storage size of the table in bytes.
+
+=cut
+
+sub size {
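+    # CORE::shift, because this package defines its own shift() accessor.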
+    my $self = CORE::shift;
+
+    my $max = $self->{max};
+    my $bytes = $max < 0x100 ? 1 : $max < 0x10000 ? 2 : 4;
+
+    return @{ $self->{values} } * $bytes;
+}
+
+=head2 lookup
+
+    my $value = $table->lookup($i);
+
+Look up the value at index $i. Also works with compressed tables.
+
+=cut
+
+sub lookup {
+    my ( $self, $i ) = @_;
+
+    my $map_table = $self->{map_table};
+
+    if ($map_table) {
+        my $shift = $self->{shift};
+        my $id    = $map_table->lookup( $i >> $shift );
+        my $j     = ( $id << $shift ) | ( $i & $self->{mask} );
+        return $self->{values}->[$j];
+    }
+    else {
+        my $val = $self->{values}->[$i];
+        return $self->{default} if !defined($val);
+        return $val;
+    }
+}
+
+=head2 compress
+
+    my $compressed_table = $table->compress($shift);
+
+Returns a compressed version of this table which is linked to a second
+map table. Blocks of size (1 << $shift) are used.
+
+=cut
+
+sub compress {
+    my ( $self, $shift ) = @_;
+
+    my $values      = $self->{values};
+    my $default     = $self->{default};
+    my $block_size  = 1 << $shift;
+    my $block_count = 0;
+    my ( @compressed, @map_values, %block_ids );
+
+    for ( my $start = 0; $start < @$values; $start += $block_size ) {
+        my @block;
+
+        for ( my $i = $start; $i < $start + $block_size; ++$i ) {
+            my $val = $values->[$i];
+            $val = $default if !defined($val);
+            push( @block, $val );
+        }
+
+        my $str = join( '|', @block );
+        my $block_id = $block_ids{$str};
+
+        if ( !defined($block_id) ) {
+            $block_id = $block_count++;
+            $block_ids{$str} = $block_id;
+            push( @compressed, @block );
+        }
+
+        push( @map_values, $block_id );
+    }
+
+    # find default for map table
+
+    my @default_block;
+
+    for ( my $i = 0; $i < $block_size; ++$i ) {
+        push( @default_block, $default );
+    }
+
+    my $str = join( '|', @default_block );
+    my $default_block_id = $block_ids{$str};
+
+    if ( !defined($default_block_id) ) {
+        $default_block_id = $block_count++;
+        push( @compressed, @default_block );
+    }
+
+    my $map_table = UnicodeTable->new(
+        values  => \@map_values,
+        default => $default_block_id,
+        max     => $block_count - 1,
+    );
+
+    return UnicodeTable->new(
+        values    => \@compressed,
+        default   => $default,
+        max       => $self->{max},
+        shift     => $shift,
+        map_table => $map_table,
+    );
+}
+
+=head2 compress_map
+
+    my $map_table = $table->compress_map($shift);
+
+Compress the map table of a table for multi-stage lookup. Returns the
+compressed map table.
+
+=cut
+
+sub compress_map {
+    my ( $self, $shift ) = @_;
+
+    my $comp = $self->{map_table}->compress($shift);
+    $self->{map_table} = $comp;
+
+    return $comp;
+}
+
+=head2 dump
+
+    $table->dump($file, $name);
+
+Dump the table as C code to filehandle $file. The table name is $name.
+
+=cut
+
+sub dump {
+    my ( $self, $file, $name ) = @_;
+
+    my $values  = $self->{values};
+    my $size    = @$values;
+    my $uc_name = uc($name);
+
+    print $file (<<"EOF") if $self->{shift};
+#define ${uc_name}_SHIFT $self->{shift}
+#define ${uc_name}_MASK  $self->{mask}
+EOF
+    print $file (<<"EOF");
+#define ${uc_name}_SIZE  $size
+
+EOF
+
+    my $max           = $self->{max};
+    my $bits          = $max < 0x100 ? 8 : $max < 0x10000 ? 16 : 32;
+    my $pad           = length($max);
+    my $vals_per_line = int( 76 / ( $pad + 2 ) );
+
+    print $file ("static const uint${bits}_t $name\[$size] = {\n");
+
+    my $i = 0;
+
+    while ( $i < $size ) {
+        printf $file ( "    \%${pad}d", $values->[$i] );
+
+        my $end = $i + $vals_per_line;
+        $end = $size if $end > $size;
+
+        while ( ++$i < $end ) {
+            printf $file ( ", \%${pad}d", $values->[$i] );
+        }
+
+        print $file (',') if $i < $size;
+        print $file ("\n");
+    }
+
+    print $file ("};\n");
+}
+
+sub calc_sizes {
+    my ( $self, $range2, $range1 ) = @_;
+
+    for ( my $shift2 = $range2->[0]; $shift2 <= $range2->[1]; ++$shift2 ) {
+        my $comp      = $self->compress($shift2);
+        my $map_table = $comp->map_table;
+        my $size3     = $comp->size;
+
+        for ( my $shift1 = $range1->[0]; $shift1 <= $range1->[1]; ++$shift1 )
+        {
+            my $comp_map_table = $map_table->compress($shift1);
+
+            my $size1 = $comp_map_table->map_table->size;
+            my $size2 = $comp_map_table->size;
+
+            printf(
+                "shift %2d %2d: %6d + %6d + %6d = %7d bytes, %4d %4d\n",
+                $shift1,                         $shift2,
+                $size1,                          $size2,
+                $size3,                          $size1 + $size2 + $size3,
+                $comp_map_table->map_table->max, $comp_map_table->max,
+            );
+        }
+
+        print("\n");
+    }
+}
+
+=head1 AUTHOR
+
+Nick Wellnhofer <we...@aevum.de>
+
+=cut
+
+1;
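
A worked sketch of the block-merge compression this module implements, with
hypothetical values and a shift of 2 (blocks of 4):

    # values:        1 1 1 1   0 0 0 0   1 1 1 1
    # blocks of 4:   [1,1,1,1] [0,0,0,0] [1,1,1,1]
    # unique blocks: id 0 => [1,1,1,1], id 1 => [0,0,0,0]
    # map table:     [0, 1, 0]    (block index => block id)
    #
    # lookup($i) on the compressed table then reduces to:
    my $id = $map_table->lookup( $i >> 2 );          # which unique block
    my $v  = $values[ ( $id << 2 ) | ( $i & 3 ) ];   # offset within it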

Added: incubator/lucy/trunk/devel/bin/gen_word_break_data.pl
URL: http://svn.apache.org/viewvc/incubator/lucy/trunk/devel/bin/gen_word_break_data.pl?rev=1213252&view=auto
==============================================================================
--- incubator/lucy/trunk/devel/bin/gen_word_break_data.pl (added)
+++ incubator/lucy/trunk/devel/bin/gen_word_break_data.pl Mon Dec 12 14:19:17 2011
@@ -0,0 +1,245 @@
+#!/usr/bin/perl
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+=head1 NAME
+
+gen_word_break_data.pl - Generate word break table and tests
+
+=head1 SYNOPSIS
+
+    perl gen_word_break_data.pl [-c] UCD_SRC_DIR
+
+=head1 DESCRIPTION
+
+This script generates the tables used to look up Unicode word break properties
+for the StandardTokenizer. It also converts the word break test suite in
+the UCD to JSON.
+
+UCD_SRC_DIR should point to a directory containing the files
+WordBreakProperty.txt, WordBreakTest.txt, and DerivedCoreProperties.txt from
+the Unicode Character Database available at
+L<http://www.unicode.org/Public/6.0.0/ucd/>.
+
+=head1 OUTPUT FILES
+
+    modules/unicode/ucd/WordBreak.tab
+    modules/unicode/ucd/WordBreakTest.json
+
+=head1 OPTIONS
+
+=head2 -c
+
+Show total table size for different shift values
+
+=cut
+
+use strict;
+
+use Getopt::Std;
+use JSON;
+use UnicodeTable;
+
+my $output_dir     = '../../modules/unicode/ucd';
+my $table_filename = "$output_dir/WordBreak.tab";
+my $tests_filename = "$output_dir/WordBreakTest.json";
+
+my %wb_map = (
+    CR           => 0,
+    LF           => 0,
+    Newline      => 0,
+    ALetter      => 2,
+    Numeric      => 3,
+    Katakana     => 4,
+    ExtendNumLet => 5,
+    Extend       => 6,
+    Format       => 6,
+    MidNumLet    => 7,
+    MidLetter    => 8,
+    MidNum       => 9,
+);
+
+my %opts;
+if ( !getopts( 'c', \%opts ) || @ARGV != 1 ) {
+    print STDERR ("Usage: $0 [-c] UCD_SRC_DIR\n");
+    exit;
+}
+
+my $src_dir = $ARGV[0];
+
+my $wb = UnicodeTable->read(
+    filename => "$src_dir/WordBreakProperty.txt",
+    type     => 'Enumerated',
+    map      => \%wb_map,
+);
+my $alpha = UnicodeTable->read(
+    filename => "$src_dir/DerivedCoreProperties.txt",
+    type     => 'Boolean',
+    map      => { Alphabetic => 1 },
+);
+
+# Set characters in Alphabetic but not in Word_Break to WB_ASingle = 1
+for ( my $i = 0; $i < 0x30000; ++$i ) {
+    if ( !$wb->lookup($i) && $alpha->lookup($i) ) {
+        $wb->set( $i, 1 );
+    }
+}
+
+if ( $opts{c} ) {
+    $wb->calc_sizes( [ 2, 6 ], [ 3, 9 ] );
+    exit;
+}
+
+# Optimize for UTF-8
+my $row_shift   = 6;
+my $plane_shift = 6;
+
+my $wb_ascii = UnicodeTable->new(
+    values => [],
+    max    => 0,
+);
+
+for ( my $i = 0; $i < 0x80; ++$i ) {
+    $wb_ascii->set( $i, $wb->lookup($i) );
+}
+
+my $wb_rows      = $wb->compress($row_shift);
+my $wb_planes    = $wb_rows->compress_map($plane_shift);
+my $wb_plane_map = $wb_planes->map_table;
+
+# test compressed table
+
+for ( my $i = 0; $i < 0x110000; ++$i ) {
+    my $v1 = $wb->lookup($i);
+    my $v2 = $wb_rows->lookup($i);
+    die("test for code point $i failed, want $v1, got $v2")
+        if $v1 != $v2;
+}
+
+# dump tables
+
+open( my $out_file, '>', $table_filename )
+    or die("$table_filename: $!\n");
+
+print $out_file (<DATA>);
+
+$wb_ascii->dump( $out_file, 'wb_ascii' );
+print $out_file ("\n");
+$wb_plane_map->dump( $out_file, 'wb_plane_map' );
+print $out_file ("\n");
+$wb_planes->dump( $out_file, 'wb_planes' );
+print $out_file ("\n");
+$wb_rows->dump( $out_file, 'wb_rows' );
+
+close($out_file);
+
+# convert UCD test suite
+
+open( my $in_file, '<', "$src_dir/WordBreakTest.txt" )
+    or die("$src_dir/WordBreakTest.txt: $!\n");
+binmode( $in_file, ':utf8' );
+
+my @tests;
+
+while (<$in_file>) {
+    s/\s*(#.*)?\z//s;
+    next if $_ eq '';
+    my @items = split(/\s+/);
+    my $word  = '';
+    my $text  = '';
+    my @words;
+
+    for ( my $i = 0; $i + 1 < @items; $i += 2 ) {
+        my ( $break, $code ) = ( $items[$i], hex( $items[ $i + 1 ] ) );
+        my $chr = chr($code);
+        $text .= $chr;
+
+        if ( $break eq "\xF7" ) {    # division sign
+            if ( $word ne '' ) {
+                push( @words, $word );
+                $word = '';
+            }
+
+            my $prop = $wb->lookup($code);
+            $word = $chr if $prop >= 1 && $prop <= 5;
+        }
+        elsif ( $break eq "\xD7" ) {    # multiplication sign
+            $word .= $chr if $word ne '';
+        }
+        else {
+            die("invalid break character '$break'");
+        }
+    }
+
+    push( @words, $word ) if $word ne '';
+
+    push(
+        @tests,
+        {   text  => $text,
+            words => \@words,
+        }
+    );
+}
+
+close($in_file);
+
+open( $out_file, '>', $tests_filename )
+    or die("$tests_filename: $!\n");
+print $out_file ( JSON->new->utf8->pretty->encode( \@tests ) );
+close($out_file);
+
+__DATA__
+/*
+
+This file is generated with devel/bin/gen_word_break_data.pl. DO NOT EDIT!
+The contents of this file are derived from the Unicode Character Database,
+version 6.0.0, available from http://www.unicode.org/Public/6.0.0/ucd/.
+The Unicode copyright and permission notice follows.
+
+Copyright (c) 1991-2011 Unicode, Inc. All rights reserved. Distributed under
+the Terms of Use in http://www.unicode.org/copyright.html.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+the Unicode data files and any associated documentation (the "Data Files") or
+Unicode software and any associated documentation (the "Software") to deal in
+the Data Files or Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, and/or sell copies
+of the Data Files or Software, and to permit persons to whom the Data Files or
+Software are furnished to do so, provided that (a) the above copyright
+notice(s) and this permission notice appear with all copies of the Data Files
+or Software, (b) both the above copyright notice(s) and this permission notice
+appear in associated documentation, and (c) there is clear notice in each
+modified Data File or in the Software as well as in the documentation
+associated with the Data File(s) or Software that the data or software has been
+modified.
+
+THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD
+PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN
+THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
+DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA FILES OR
+SOFTWARE.
+
+Except as contained in this notice, the name of a copyright holder shall not be
+used in advertising or otherwise to promote the sale, use or other dealings in
+these Data Files or Software without prior written authorization of the
+copyright holder.
+
+*/
+
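
For reference, the UCD property files parsed above use the standard
'range ; value' line format, which the regexes in UnicodeTable::read expect
(illustrative lines in the style of WordBreakProperty.txt):

    0041..005A    ; ALetter    # a code point range
    00AD          ; Format     # a single code point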

Modified: incubator/lucy/trunk/devel/conf/rat-excludes
URL: http://svn.apache.org/viewvc/incubator/lucy/trunk/devel/conf/rat-excludes?rev=1213252&r1=1213251&r2=1213252&view=diff
==============================================================================
--- incubator/lucy/trunk/devel/conf/rat-excludes (original)
+++ incubator/lucy/trunk/devel/conf/rat-excludes Mon Dec 12 14:19:17 2011
@@ -48,8 +48,11 @@ modules/analysis/snowstem/source/test/te
 # This file is autogenerated, as indicated in the comment at the top.
 modules/analysis/snowstop/source/snowball_stoplists.c
 
-# The Unicode license as applied to utf8proc was dealt with in LEGAL-110.
+# The Unicode license as applied to utf8proc and the Unicode Character Database
+# was dealt with in LEGAL-110.
 modules/unicode/utf8proc/utf8proc_data.c
+modules/unicode/ucd/WordBreak.tab
+modules/unicode/ucd/WordBreakTest.json
 
 # For whatever reason, RAT does not recognize the MIT license of utf8proc.h
 # and utf8proc.c.