You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucy.apache.org by nw...@apache.org on 2011/12/06 01:41:20 UTC

[lucy-commits] svn commit: r1210721 - /incubator/lucy/branches/LUCY-196-uax-tokenizer/devel/bin/UnicodeTable.pm

Author: nwellnhof
Date: Tue Dec  6 00:41:19 2011
New Revision: 1210721

URL: http://svn.apache.org/viewvc?rev=1210721&view=rev
Log:
Handle default values of Unicode tables correctly

Modified:
    incubator/lucy/branches/LUCY-196-uax-tokenizer/devel/bin/UnicodeTable.pm

Modified: incubator/lucy/branches/LUCY-196-uax-tokenizer/devel/bin/UnicodeTable.pm
URL: http://svn.apache.org/viewvc/incubator/lucy/branches/LUCY-196-uax-tokenizer/devel/bin/UnicodeTable.pm?rev=1210721&r1=1210720&r2=1210721&view=diff
==============================================================================
--- incubator/lucy/branches/LUCY-196-uax-tokenizer/devel/bin/UnicodeTable.pm (original)
+++ incubator/lucy/branches/LUCY-196-uax-tokenizer/devel/bin/UnicodeTable.pm Tue Dec  6 00:41:19 2011
@@ -50,13 +50,15 @@ can be computed using bit operations.
 =head2 new
 
     my $table = UnicodeTable->new(
-        table => \@table,
-        max   => $max,
-        shift => $shift,
-        index => $index,
+        table   => \@table,
+        default => $default,
+        max     => $max,
+        shift   => $shift,
+        index   => $index,
     );
 
 \@table is an arrayref with the table values, $max is the maximum value.
+The default value for undefined table entries is $default, or 0 if omitted.
 $shift and $index are used for compressed tables.
 
 =cut
@@ -67,10 +69,12 @@ sub new {
     my $opts = @_ == 1 ? $_[0] : {@_};
     my $self = bless( {}, $class );
 
-    for my $name (qw(table max shift index)) {
+    for my $name (qw(table default max shift index)) {
         $self->{$name} = $opts->{$name};
     }
 
+    $self->{default} = 0
+        if !defined( $self->{default} );
     $self->{mask} = ( 1 << $self->{shift} ) - 1
         if defined( $self->{shift} );
 
@@ -83,11 +87,13 @@ sub new {
         filename => $filename,
         type     => $type,
         map      => \%map,
+        default  => $default,
     );
 
 Reads a table from a Unicode data text file. $type is either 'Enumerated'
 or 'Boolean'. \%map is a hashref that maps property values to integers.
-For booleans, these integers are ORed.
+For booleans, these integers are ORed. $default is the default value passed
+to L</new>.
 
 =cut
 
@@ -153,11 +159,10 @@ sub read {
 
     close($file);
 
-    return bless(
-        {   table => \@table,
-            max   => $max,
-        },
-        $class
+    return $class->new(
+        table   => \@table,
+        default => $opts->{default},
+        max     => $max,
     );
 }
 
@@ -203,6 +208,7 @@ Set entry at index $i to $value. Don't u
 sub set {
     my ( $self, $i, $value ) = @_;
     $self->{table}[$i] = $value;
+    $self->{max} = $value if $value > $self->{max};
 }
 
 =head2 size
@@ -237,11 +243,13 @@ sub lookup {
 
     if ($index) {
         $i = $index->mangle_index($i);
-        return 0 if !defined($i);
+        return $self->{default} if !defined($i);
         return $self->{table}->[$i];
     }
     else {
-        return $self->{table}->[$i] || 0;
+        my $val = $self->{table}->[$i];
+        return $self->{default} if !defined($val);
+        return $val;
     }
 }
 
@@ -285,41 +293,61 @@ sub compress {
     my ( $self, $shift ) = @_;
 
     my $table       = $self->{table};
+    my $default     = $self->{default};
     my $block_size  = 1 << $shift;
     my $block_count = 0;
-    my ( @compressed, @index, %blocks );
+    my ( @compressed, @index, %block_nums );
 
     for ( my $start = 0; $start < @$table; $start += $block_size ) {
         my @block;
 
         for ( my $i = $start; $i < $start + $block_size; ++$i ) {
-            push( @block, $table->[$i] || 0 );
+            my $val = $table->[$i];
+            $val = $default if !defined($val);
+            push( @block, $val );
         }
 
         my $str = join( '|', @block );
-        my $block = $blocks{$str};
+        my $block_num = $block_nums{$str};
 
-        if ( !defined($block) ) {
-            $block = $block_count;
-            $blocks{$str} = $block;
-            ++$block_count;
+        if ( !defined($block_num) ) {
+            $block_num = $block_count++;
+            $block_nums{$str} = $block_num;
             push( @compressed, @block );
         }
 
-        push( @index, $block );
+        push( @index, $block_num );
+    }
+
+    # find default for index table
+
+    my @default_block;
+
+    for ( my $i = 0; $i < $block_size; ++$i ) {
+        push( @default_block, $default );
+    }
+
+    my $str = join( '|', @default_block );
+    my $default_block_num = $block_nums{$str};
+
+    if ( !defined($default_block_num) ) {
+        $default_block_num = $block_count++;
+        push( @compressed, @default_block );
     }
 
     my $index = UnicodeTable->new(
-        table => \@index,
-        max   => $block_count - 1,
-        shift => $shift,
+        table   => \@index,
+        default => $default_block_num,
+        max     => $block_count - 1,
+        shift   => $shift,
     );
 
     return UnicodeTable->new(
-        table => \@compressed,
-        max   => $self->{max},
-        shift => $self->{shift},
-        index => $index,
+        table   => \@compressed,
+        default => $default,
+        max     => $self->{max},
+        shift   => $self->{shift},
+        index   => $index,
     );
 }