You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucy.apache.org by nw...@apache.org on 2013/03/25 21:56:33 UTC
[lucy-commits] [3/3] git commit: refs/heads/master - Upgrade StandardTokenizer to Unicode 6.2.0
Upgrade StandardTokenizer to Unicode 6.2.0
Project: http://git-wip-us.apache.org/repos/asf/lucy/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/0df8da88
Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/0df8da88
Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/0df8da88
Branch: refs/heads/master
Commit: 0df8da88c95ab33edf4a71c63f1d3f720a23117d
Parents: 1dced22
Author: Nick Wellnhofer <we...@aevum.de>
Authored: Mon Mar 25 21:51:06 2013 +0100
Committer: Nick Wellnhofer <we...@aevum.de>
Committed: Mon Mar 25 21:51:06 2013 +0100
----------------------------------------------------------------------
core/Lucy/Analysis/StandardTokenizer.c | 6 +-
core/Lucy/Test/Analysis/TestStandardTokenizer.c | 12 +-
devel/bin/UnicodeTable.pm | 5 +-
devel/bin/gen_word_break_data.pl | 29 +-
modules/unicode/ucd/WordBreak.tab | 946 +++++++++---------
modules/unicode/ucd/WordBreakTest.json | 511 ++++++++++
6 files changed, 1034 insertions(+), 475 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucy/blob/0df8da88/core/Lucy/Analysis/StandardTokenizer.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Analysis/StandardTokenizer.c b/core/Lucy/Analysis/StandardTokenizer.c
index a24c2b8..2a8665e 100644
--- a/core/Lucy/Analysis/StandardTokenizer.c
+++ b/core/Lucy/Analysis/StandardTokenizer.c
@@ -151,9 +151,9 @@ S_parse_single(const char *text, size_t len, lucy_StringIter *iter,
}
/*
- * Parse a word starting with an ALetter, Numeric or Katakana character.
- * Advances the iterator and returns the word break property of the current
- * character.
+ * Parse a word starting with an ALetter, Numeric, Katakana, or ExtendNumLet
+ * character. Advances the iterator and returns the word break property of the
+ * current character.
*/
static int
S_parse_word(const char *text, size_t len, lucy_StringIter *iter,
http://git-wip-us.apache.org/repos/asf/lucy/blob/0df8da88/core/Lucy/Test/Analysis/TestStandardTokenizer.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Test/Analysis/TestStandardTokenizer.c b/core/Lucy/Test/Analysis/TestStandardTokenizer.c
index b7851a6..9e550a0 100644
--- a/core/Lucy/Test/Analysis/TestStandardTokenizer.c
+++ b/core/Lucy/Test/Analysis/TestStandardTokenizer.c
@@ -26,20 +26,24 @@
TestStandardTokenizer*
TestStandardTokenizer_new(TestFormatter *formatter) {
- TestStandardTokenizer *self = (TestStandardTokenizer*)VTable_Make_Obj(TESTSTANDARDTOKENIZER);
+ TestStandardTokenizer *self
+ = (TestStandardTokenizer*)VTable_Make_Obj(TESTSTANDARDTOKENIZER);
return TestStandardTokenizer_init(self, formatter);
}
TestStandardTokenizer*
-TestStandardTokenizer_init(TestStandardTokenizer *self, TestFormatter *formatter) {
- return (TestStandardTokenizer*)TestBatch_init((TestBatch*)self, 984, formatter);
+TestStandardTokenizer_init(TestStandardTokenizer *self,
+ TestFormatter *formatter) {
+ TestBatch_init((TestBatch*)self, 1084, formatter);
+ return self;
}
static void
test_Dump_Load_and_Equals(TestBatch *batch) {
StandardTokenizer *tokenizer = StandardTokenizer_new();
Obj *dump = StandardTokenizer_Dump(tokenizer);
- StandardTokenizer *clone = (StandardTokenizer*)StandardTokenizer_Load(tokenizer, dump);
+ StandardTokenizer *clone
+ = (StandardTokenizer*)StandardTokenizer_Load(tokenizer, dump);
TEST_TRUE(batch,
StandardTokenizer_Equals(tokenizer, (Obj*)clone),
http://git-wip-us.apache.org/repos/asf/lucy/blob/0df8da88/devel/bin/UnicodeTable.pm
----------------------------------------------------------------------
diff --git a/devel/bin/UnicodeTable.pm b/devel/bin/UnicodeTable.pm
index b233bfd..6cd1ed5 100644
--- a/devel/bin/UnicodeTable.pm
+++ b/devel/bin/UnicodeTable.pm
@@ -16,6 +16,8 @@
package UnicodeTable;
use strict;
+use IO::File;
+
=head1 NAME
UnicodeTable - Create compressed Unicode tables for C programs
@@ -110,7 +112,8 @@ sub read {
my $map = $opts->{map} or die('map missing');
$type = lc($type);
- open( my $file, '<', $filename )
+ my $file = IO::File->new;
+ $file->open( $filename, '<' )
or die("$filename: $!\n");
while ( my $line = $file->getline ) {
http://git-wip-us.apache.org/repos/asf/lucy/blob/0df8da88/devel/bin/gen_word_break_data.pl
----------------------------------------------------------------------
diff --git a/devel/bin/gen_word_break_data.pl b/devel/bin/gen_word_break_data.pl
old mode 100644
new mode 100755
index d72df05..9bcf916
--- a/devel/bin/gen_word_break_data.pl
+++ b/devel/bin/gen_word_break_data.pl
@@ -32,7 +32,7 @@ the UCD to JSON.
UCD_SRC_DIR should point to a directory containing the files
WordBreakProperty.txt, WordBreakTest.txt, and DerivedCoreProperties.txt from
the Unicode Character Database available at
-L<http://www.unicode.org/Public/6.0.0/ucd/>.
+L<http://www.unicode.org/Public/6.2.0/ucd/>.
=head1 OUTPUT FILES
@@ -58,18 +58,19 @@ my $table_filename = "$output_dir/WordBreak.tab";
my $tests_filename = "$output_dir/WordBreakTest.json";
my %wb_map = (
- CR => 0,
- LF => 0,
- Newline => 0,
- ALetter => 2,
- Numeric => 3,
- Katakana => 4,
- ExtendNumLet => 5,
- Extend => 6,
- Format => 6,
- MidNumLet => 7,
- MidLetter => 8,
- MidNum => 9,
+ CR => 0,
+ LF => 0,
+ Newline => 0,
+ Regional_Indicator => 0, # These are symbols, so ignore them.
+ ALetter => 2,
+ Numeric => 3,
+ Katakana => 4,
+ ExtendNumLet => 5,
+ Extend => 6,
+ Format => 6,
+ MidNumLet => 7,
+ MidLetter => 8,
+ MidNum => 9,
);
my %opts;
@@ -206,7 +207,7 @@ __DATA__
This file is generated with devel/bin/gen_word_break_data.pl. DO NOT EDIT!
The contents of this file are derived from the Unicode Character Database,
-version 6.0.0, available from http://www.unicode.org/Public/6.0.0/ucd/.
+version 6.2.0, available from http://www.unicode.org/Public/6.2.0/ucd/.
The Unicode copyright and permission notice follows.
Copyright (c) 1991-2011 Unicode, Inc. All rights reserved. Distributed under