Posted to commits@lucy.apache.org by nw...@apache.org on 2015/01/12 15:43:26 UTC

lucy git commit: Add and fix StandardTokenizer comments

Repository: lucy
Updated Branches:
  refs/heads/master 51f7418de -> 2bc4edbd9


Add and fix StandardTokenizer comments


Project: http://git-wip-us.apache.org/repos/asf/lucy/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/2bc4edbd
Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/2bc4edbd
Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/2bc4edbd

Branch: refs/heads/master
Commit: 2bc4edbd9de809eb542f152ab8e19fda6fa77532
Parents: 51f7418
Author: Nick Wellnhofer <we...@aevum.de>
Authored: Mon Jan 12 15:41:38 2015 +0100
Committer: Nick Wellnhofer <we...@aevum.de>
Committed: Mon Jan 12 15:41:38 2015 +0100

----------------------------------------------------------------------
 core/Lucy/Analysis/StandardTokenizer.c   | 19 +++++++++++--------
 core/Lucy/Analysis/StandardTokenizer.cfh |  2 +-
 devel/bin/gen_word_break_data.pl         | 16 +++++++++++++++-
 3 files changed, 27 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucy/blob/2bc4edbd/core/Lucy/Analysis/StandardTokenizer.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Analysis/StandardTokenizer.c b/core/Lucy/Analysis/StandardTokenizer.c
index 318dcd7..23b25e3 100644
--- a/core/Lucy/Analysis/StandardTokenizer.c
+++ b/core/Lucy/Analysis/StandardTokenizer.c
@@ -31,7 +31,7 @@
  *
  * The tables are in a compressed format that uses a three-stage lookup
  * scheme. They're generated with the perl script gen_word_break_tables.pl
- * in devel/bin.
+ * in devel/bin. The WB_* constants must match the values used in the script.
  */
 
 #define WB_ASingle          1
@@ -137,9 +137,9 @@ StandardTokenizer_Tokenize_Utf8_IMP(StandardTokenizer *self, const char *text,
 /*
  * Parse a word consisting of a single codepoint followed by extend or
  * format characters. Used for Alphabetic characters that don't have the
- * ALetter word break property: ideographs, Hiragana, and "complex content".
- * Advances the iterator and returns the word break property of the current
- * character.
+ * ALetter word break property: ideographs, Hiragana, and "complex context".
+ * Advances the iterator and returns the word break property of the character
+ * following the word.
  */
 static int
 S_parse_single(const char *text, size_t len, lucy_StringIter *iter,
@@ -156,9 +156,12 @@ S_parse_single(const char *text, size_t len, lucy_StringIter *iter,
 }
 
 /*
- * Parse a word starting with an ALetter, Numeric, Katakana, or ExtendNumLet
- * character. Advances the iterator and returns the word break property of the
- * current character.
+ * Parse a word starting with an ALetter, Hebrew_Letter, Numeric, Katakana, or
+ * ExtendNumLet character. Advances the iterator and returns the word break
+ * property of the character following the word.
+ *
+ * TODO: Words consisting only of ExtendNumLet characters (General_Category
+ * Pc, typically underscores) should be ignored.
  */
 static int
 S_parse_word(const char *text, size_t len, lucy_StringIter *iter,
@@ -320,7 +323,7 @@ S_iter_advance(const char *text, lucy_StringIter *iter) {
 
 /*
  * Advances the iterator skipping over Extend and Format characters.
- * Returns the word break property of the current character.
+ * Returns the word break property of the following character.
  */
 static int
 S_skip_extend_format(const char *text, size_t len, lucy_StringIter *iter) {
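
The "three-stage lookup scheme" referenced in the updated header comment above is a common way to compress per-codepoint property tables: the codepoint is split into three indexes, and identical blocks of values can be shared between entries. Below is a minimal, self-contained sketch of the idea only; the table names, block sizes, toy property values, and the lack of block deduplication are all illustrative assumptions and do not reflect the tables actually generated by the script in devel/bin.

    #include <stdint.h>
    #include <stdio.h>

    enum { TOY_NONE = 0, TOY_ALPHA = 1, TOY_DIGIT = 2 };

    #define NUM_CP     512   /* toy codepoint range                */
    #define BLOCK_LEN   16   /* codepoints per stage-3 block       */
    #define PLANE_LEN    8   /* stage-3 blocks per stage-2 "plane" */

    static uint8_t  stage3[NUM_CP];                            /* property values          */
    static uint16_t stage2[NUM_CP / BLOCK_LEN];                /* start offsets in stage3  */
    static uint16_t stage1[NUM_CP / (BLOCK_LEN * PLANE_LEN)];  /* start offsets in stage2  */

    /* Runtime lookup: three chained array reads, no per-range branching. */
    static int
    toy_lookup(uint32_t cp) {
        uint32_t plane = cp / (BLOCK_LEN * PLANE_LEN);
        uint32_t block = stage1[plane] + (cp / BLOCK_LEN) % PLANE_LEN;
        return stage3[stage2[block] + cp % BLOCK_LEN];
    }

    int
    main(void) {
        /* Fill the tables without the deduplication a real generator performs;
         * sharing identical blocks across stage-2 entries is what makes the
         * scheme compact in practice. */
        for (uint32_t cp = 0; cp < NUM_CP; cp++) {
            if      (cp >= 'a' && cp <= 'z') { stage3[cp] = TOY_ALPHA; }
            else if (cp >= '0' && cp <= '9') { stage3[cp] = TOY_DIGIT; }
        }
        for (uint32_t i = 0; i < NUM_CP / BLOCK_LEN; i++) { stage2[i] = i * BLOCK_LEN; }
        for (uint32_t i = 0; i < NUM_CP / (BLOCK_LEN * PLANE_LEN); i++) {
            stage1[i] = i * PLANE_LEN;
        }

        printf("'q' -> %d, '7' -> %d, '!' -> %d\n",
               toy_lookup('q'), toy_lookup('7'), toy_lookup('!'));
        return 0;
    }

In the real source, the WB_* constants play the role of the TOY_* values above, which is why the added comment stresses that they must stay in sync with the generator script.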

http://git-wip-us.apache.org/repos/asf/lucy/blob/2bc4edbd/core/Lucy/Analysis/StandardTokenizer.cfh
----------------------------------------------------------------------
diff --git a/core/Lucy/Analysis/StandardTokenizer.cfh b/core/Lucy/Analysis/StandardTokenizer.cfh
index 2bac5c7..211309f 100644
--- a/core/Lucy/Analysis/StandardTokenizer.cfh
+++ b/core/Lucy/Analysis/StandardTokenizer.cfh
@@ -24,7 +24,7 @@ parcel Lucy;
  *
  * Lucy::Analysis::StandardTokenizer breaks up the text at the word
  * boundaries defined in Unicode Standard Annex #29. It then returns those
- * words that start with an alphabetic or numeric character.
+ * words that contain alphabetic or numeric characters.
  */
 public class Lucy::Analysis::StandardTokenizer
     inherits Lucy::Analysis::Analyzer {
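
The reworded class comment describes the filtering that happens after UAX #29 segmentation: a segment survives only if it contains at least one alphabetic or numeric character, not merely if it starts with one. The distinction matters for segments that begin with a non-alphanumeric character, for example an ExtendNumLet underscore followed by digits (see also the TODO in the .c diff above). A hedged sketch of that rule; the is_alnum_cp() helper is a stand-in for a real Unicode property test, handles ASCII only, and is not part of Lucy's API:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Stand-in property test; a real check would consult the generated tables. */
    static bool
    is_alnum_cp(uint32_t cp) {
        return (cp >= '0' && cp <= '9')
            || (cp >= 'A' && cp <= 'Z')
            || (cp >= 'a' && cp <= 'z');
    }

    /* Keep a segment if any of its codepoints is alphabetic or numeric. */
    static bool
    keep_segment(const uint32_t *cps, size_t len) {
        for (size_t i = 0; i < len; i++) {
            if (is_alnum_cp(cps[i])) { return true; }
        }
        return false;   /* pure punctuation/whitespace segments are dropped */
    }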

http://git-wip-us.apache.org/repos/asf/lucy/blob/2bc4edbd/devel/bin/gen_word_break_data.pl
----------------------------------------------------------------------
diff --git a/devel/bin/gen_word_break_data.pl b/devel/bin/gen_word_break_data.pl
index c94d18a..eb64fea 100755
--- a/devel/bin/gen_word_break_data.pl
+++ b/devel/bin/gen_word_break_data.pl
@@ -95,7 +95,21 @@ my $alpha = UnicodeTable->read(
     map      => { Alphabetic => 1 },
 );
 
-# Set characters in Alphabetic but not in Word_Break to WB_ASingle = 1
+# Many characters don't have a Word_Break property and form a single word.
+# In order to include them in the tokenizing process, we use a custom
+# property "ASingle" with value 1.
+#
+# For now, this property is used for all Alphabetic characters without a
+# Word_Break property: Ideographic, Hiragana, and Complex_Context.
+#
+# There are also non-alphabetic, numeric characters without a WordBreak
+# property that possibly should be included:
+#
+# - Decimal numbers (General_Category Nd) with East_Asian_Width F (Fullwidth)
+# - Other numbers (General_Category No)
+#
+# These are ignored for now.
+
 for ( my $i = 0; $i < 0x30000; ++$i ) {
     if ( !$wb->lookup($i) && $alpha->lookup($i) ) {
         $wb->set( $i, 1 );
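
To see how the custom "ASingle" class plays out at tokenize time, here is a deliberately simplified, hypothetical sketch: a codepoint in that class forms a word by itself, while runs of letter and number codepoints are grouped into one word. The class values, the toy classifier, and the omission of Extend/Format and MidLetter/MidNum handling are assumptions made for illustration only; none of this is Lucy's actual implementation.

    #include <stdint.h>
    #include <stdio.h>

    enum { WB_None = 0, WB_ASingle = 1, WB_ALetter = 2, WB_Numeric = 3 };

    /* Toy classifier: ASCII letters and digits only; anything at or above
     * U+3040 stands in for an "ASingle" character (ideograph, Hiragana,
     * complex context). */
    static int
    toy_class(uint32_t cp) {
        if ((cp >= 'a' && cp <= 'z') || (cp >= 'A' && cp <= 'Z')) { return WB_ALetter; }
        if (cp >= '0' && cp <= '9')                               { return WB_Numeric; }
        if (cp >= 0x3040)                                         { return WB_ASingle; }
        return WB_None;
    }

    int
    main(void) {
        /* "ab1 <U+3042>x" as codepoints; U+3042 (Hiragana A) has no
         * Word_Break property but is Alphabetic, hence ASingle here. */
        const uint32_t text[] = { 'a', 'b', '1', ' ', 0x3042, 'x' };
        const size_t   len    = sizeof(text) / sizeof(text[0]);

        for (size_t i = 0; i < len; ) {
            int cls = toy_class(text[i]);
            if (cls == WB_None) { i++; continue; }       /* skip separators */
            size_t start = i++;
            if (cls != WB_ASingle) {
                /* Group a run of letter/number codepoints into one word. */
                while (i < len) {
                    int next = toy_class(text[i]);
                    if (next == WB_None || next == WB_ASingle) { break; }
                    i++;
                }
            }
            printf("word: codepoints %zu..%zu\n", start, i - 1);
        }
        return 0;
    }

With this toy input the output is three words: "ab1", the single Hiragana codepoint, and "x", which mirrors the behavior the comments describe for Alphabetic characters that have no Word_Break property of their own.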