You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucy.apache.org by nw...@apache.org on 2013/03/25 21:56:33 UTC

[lucy-commits] [3/3] git commit: refs/heads/master - Upgrade StandardTokenizer to Unicode 6.2.0

Upgrade StandardTokenizer to Unicode 6.2.0


Project: http://git-wip-us.apache.org/repos/asf/lucy/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/0df8da88
Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/0df8da88
Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/0df8da88

Branch: refs/heads/master
Commit: 0df8da88c95ab33edf4a71c63f1d3f720a23117d
Parents: 1dced22
Author: Nick Wellnhofer <we...@aevum.de>
Authored: Mon Mar 25 21:51:06 2013 +0100
Committer: Nick Wellnhofer <we...@aevum.de>
Committed: Mon Mar 25 21:51:06 2013 +0100

----------------------------------------------------------------------
 core/Lucy/Analysis/StandardTokenizer.c          |    6 +-
 core/Lucy/Test/Analysis/TestStandardTokenizer.c |   12 +-
 devel/bin/UnicodeTable.pm                       |    5 +-
 devel/bin/gen_word_break_data.pl                |   29 +-
 modules/unicode/ucd/WordBreak.tab               |  946 +++++++++---------
 modules/unicode/ucd/WordBreakTest.json          |  511 ++++++++++
 6 files changed, 1034 insertions(+), 475 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucy/blob/0df8da88/core/Lucy/Analysis/StandardTokenizer.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Analysis/StandardTokenizer.c b/core/Lucy/Analysis/StandardTokenizer.c
index a24c2b8..2a8665e 100644
--- a/core/Lucy/Analysis/StandardTokenizer.c
+++ b/core/Lucy/Analysis/StandardTokenizer.c
@@ -151,9 +151,9 @@ S_parse_single(const char *text, size_t len, lucy_StringIter *iter,
 }
 
 /*
- * Parse a word starting with an ALetter, Numeric or Katakana character.
- * Advances the iterator and returns the word break property of the current
- * character.
+ * Parse a word starting with an ALetter, Numeric, Katakana, or ExtendNumLet
+ * character. Advances the iterator and returns the word break property of the
+ * current character.
  */
 static int
 S_parse_word(const char *text, size_t len, lucy_StringIter *iter,

http://git-wip-us.apache.org/repos/asf/lucy/blob/0df8da88/core/Lucy/Test/Analysis/TestStandardTokenizer.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Test/Analysis/TestStandardTokenizer.c b/core/Lucy/Test/Analysis/TestStandardTokenizer.c
index b7851a6..9e550a0 100644
--- a/core/Lucy/Test/Analysis/TestStandardTokenizer.c
+++ b/core/Lucy/Test/Analysis/TestStandardTokenizer.c
@@ -26,20 +26,24 @@
 
 TestStandardTokenizer*
 TestStandardTokenizer_new(TestFormatter *formatter) {
-    TestStandardTokenizer *self = (TestStandardTokenizer*)VTable_Make_Obj(TESTSTANDARDTOKENIZER);
+    TestStandardTokenizer *self
+        = (TestStandardTokenizer*)VTable_Make_Obj(TESTSTANDARDTOKENIZER);
     return TestStandardTokenizer_init(self, formatter);
 }
 
 TestStandardTokenizer*
-TestStandardTokenizer_init(TestStandardTokenizer *self, TestFormatter *formatter) {
-    return (TestStandardTokenizer*)TestBatch_init((TestBatch*)self, 984, formatter);
+TestStandardTokenizer_init(TestStandardTokenizer *self,
+                           TestFormatter *formatter) {
+    TestBatch_init((TestBatch*)self, 1084, formatter);
+    return self;
 }
 
 static void
 test_Dump_Load_and_Equals(TestBatch *batch) {
     StandardTokenizer *tokenizer = StandardTokenizer_new();
     Obj *dump  = StandardTokenizer_Dump(tokenizer);
-    StandardTokenizer *clone = (StandardTokenizer*)StandardTokenizer_Load(tokenizer, dump);
+    StandardTokenizer *clone
+        = (StandardTokenizer*)StandardTokenizer_Load(tokenizer, dump);
 
     TEST_TRUE(batch,
               StandardTokenizer_Equals(tokenizer, (Obj*)clone),

http://git-wip-us.apache.org/repos/asf/lucy/blob/0df8da88/devel/bin/UnicodeTable.pm
----------------------------------------------------------------------
diff --git a/devel/bin/UnicodeTable.pm b/devel/bin/UnicodeTable.pm
index b233bfd..6cd1ed5 100644
--- a/devel/bin/UnicodeTable.pm
+++ b/devel/bin/UnicodeTable.pm
@@ -16,6 +16,8 @@
 package UnicodeTable;
 use strict;
 
+use IO::File;
+
 =head1 NAME
 
 UnicodeTable - Create compressed Unicode tables for C programs
@@ -110,7 +112,8 @@ sub read {
     my $map  = $opts->{map}  or die('map missing');
     $type = lc($type);
 
-    open( my $file, '<', $filename )
+    my $file = IO::File->new;
+    $file->open( $filename, '<' )
         or die("$filename: $!\n");
 
     while ( my $line = $file->getline ) {

http://git-wip-us.apache.org/repos/asf/lucy/blob/0df8da88/devel/bin/gen_word_break_data.pl
----------------------------------------------------------------------
diff --git a/devel/bin/gen_word_break_data.pl b/devel/bin/gen_word_break_data.pl
old mode 100644
new mode 100755
index d72df05..9bcf916
--- a/devel/bin/gen_word_break_data.pl
+++ b/devel/bin/gen_word_break_data.pl
@@ -32,7 +32,7 @@ the UCD to JSON.
 UCD_SRC_DIR should point to a directory containing the files
 WordBreakProperty.txt, WordBreakTest.txt, and DerivedCoreProperties.txt from
 the Unicode Character Database available at
-L<http://www.unicode.org/Public/6.0.0/ucd/>.
+L<http://www.unicode.org/Public/6.2.0/ucd/>.
 
 =head1 OUTPUT FILES
 
@@ -58,18 +58,19 @@ my $table_filename = "$output_dir/WordBreak.tab";
 my $tests_filename = "$output_dir/WordBreakTest.json";
 
 my %wb_map = (
-    CR           => 0,
-    LF           => 0,
-    Newline      => 0,
-    ALetter      => 2,
-    Numeric      => 3,
-    Katakana     => 4,
-    ExtendNumLet => 5,
-    Extend       => 6,
-    Format       => 6,
-    MidNumLet    => 7,
-    MidLetter    => 8,
-    MidNum       => 9,
+    CR                 => 0,
+    LF                 => 0,
+    Newline            => 0,
+    Regional_Indicator => 0,  # These are symbols, so ignore them.
+    ALetter            => 2,
+    Numeric            => 3,
+    Katakana           => 4,
+    ExtendNumLet       => 5,
+    Extend             => 6,
+    Format             => 6,
+    MidNumLet          => 7,
+    MidLetter          => 8,
+    MidNum             => 9,
 );
 
 my %opts;
@@ -206,7 +207,7 @@ __DATA__
 
 This file is generated with devel/bin/gen_word_break_data.pl. DO NOT EDIT!
 The contents of this file are derived from the Unicode Character Database,
-version 6.0.0, available from http://www.unicode.org/Public/6.0.0/ucd/.
+version 6.2.0, available from http://www.unicode.org/Public/6.2.0/ucd/.
 The Unicode copyright and permission notice follows.
 
 Copyright (c) 1991-2011 Unicode, Inc. All rights reserved. Distributed under