You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucy.apache.org by ma...@apache.org on 2010/12/29 22:27:11 UTC

[lucy-commits] svn commit: r1053745 - in /incubator/lucy/trunk/perl: t/154-tokenizer.t xs/Lucy/Analysis/Tokenizer.c

Author: marvin
Date: Wed Dec 29 21:27:10 2010
New Revision: 1053745

URL: http://svn.apache.org/viewvc?rev=1053745&view=rev
Log:
Forbid \p and \P constructs in Tokenizer patterns, both when constructing a Tokenizer and when loading one from a dump.

Modified:
    incubator/lucy/trunk/perl/t/154-tokenizer.t
    incubator/lucy/trunk/perl/xs/Lucy/Analysis/Tokenizer.c

Modified: incubator/lucy/trunk/perl/t/154-tokenizer.t
URL: http://svn.apache.org/viewvc/incubator/lucy/trunk/perl/t/154-tokenizer.t?rev=1053745&r1=1053744&r2=1053745&view=diff
==============================================================================
--- incubator/lucy/trunk/perl/t/154-tokenizer.t (original)
+++ incubator/lucy/trunk/perl/t/154-tokenizer.t Wed Dec 29 21:27:10 2010
@@ -16,7 +16,7 @@
 use strict;
 use warnings;
 
-use Test::More tests => 11;
+use Test::More tests => 15;
 use Lucy::Test;
 
 my $tokenizer   = Lucy::Analysis::Tokenizer->new;
@@ -76,3 +76,27 @@ is_deeply(
 $tokenizer = Lucy::Analysis::Tokenizer->new( token_re => qr/../ );
 is_deeply( $tokenizer->split('aabbcc'),
     [qw( aa bb cc )], "back compat with token_re argument" );
+
+eval {
+    my $toke
+        = Lucy::Analysis::Tokenizer->new(
+        pattern => '\\p{Carp::confess}' );
+};
+like( $@, qr/\\p/, "\\p forbidden in pattern" );
+
+eval {
+    my $toke
+        = Lucy::Analysis::Tokenizer->new(
+        pattern => '\\P{Carp::confess}' );
+};
+like( $@, qr/\\P/, "\\P forbidden in pattern" );
+
+$tokenizer = Lucy::Analysis::Tokenizer->new( pattern => '\\w+' );
+my $dump = $tokenizer->dump;
+$dump->{pattern} = "\\p{Carp::confess}";
+eval { $tokenizer->load($dump) };
+like( $@, qr/\\p/, "\\p forbidden during load" );
+
+$dump->{pattern} = "\\P{Carp::confess}";
+eval { $tokenizer->load($dump) };
+like( $@, qr/\\P/, "\\P forbidden during load" );

Modified: incubator/lucy/trunk/perl/xs/Lucy/Analysis/Tokenizer.c
URL: http://svn.apache.org/viewvc/incubator/lucy/trunk/perl/xs/Lucy/Analysis/Tokenizer.c?rev=1053745&r1=1053744&r2=1053745&view=diff
==============================================================================
--- incubator/lucy/trunk/perl/xs/Lucy/Analysis/Tokenizer.c (original)
+++ incubator/lucy/trunk/perl/xs/Lucy/Analysis/Tokenizer.c Wed Dec 29 21:27:10 2010
@@ -38,10 +38,19 @@ lucy_Tokenizer_init(lucy_Tokenizer *self
 
     lucy_Analyzer_init((lucy_Analyzer*)self);
     #define DEFAULT_PATTERN "\\w+(?:['\\x{2019}]\\w+)*"
-    self->pattern = pattern 
-                  ? Lucy_CB_Clone(pattern)
-                  : lucy_CB_new_from_trusted_utf8(DEFAULT_PATTERN,
-                      sizeof(DEFAULT_PATTERN) - 1);
+    if (pattern) {
+        if (   Lucy_CB_Find_Str(pattern, "\\p", 2) != -1
+            || Lucy_CB_Find_Str(pattern, "\\P", 2) != -1
+        ) {
+            LUCY_DECREF(self);
+            THROW(LUCY_ERR, "\\p and \\P constructs forbidden");
+        }
+        self->pattern = Lucy_CB_Clone(pattern);
+    }
+    else {
+        self->pattern = lucy_CB_new_from_trusted_utf8(DEFAULT_PATTERN,
+            sizeof(DEFAULT_PATTERN) - 1);
+    }
 
     // Acquire a compiled regex engine for matching one token. 
     token_re_sv = (SV*)lucy_Host_callback_host(LUCY_TOKENIZER,