You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucy.apache.org by ma...@apache.org on 2011/03/17 22:22:09 UTC
[lucy-commits] svn commit: r1082705 - in /incubator/lucy/trunk/perl: lib/Lucy/Docs/Cookbook/CustomQueryParser.pod sample/FlatQueryParser.pm

Author: marvin
Date: Thu Mar 17 21:22:09 2011
New Revision: 1082705

URL: http://svn.apache.org/viewvc?rev=1082705&view=rev
Log:
LUCY-135 
Streamline the FlatQueryParser sample class, eliminating discussion of grammar
based parsers and dependency on Parse::RecDescent.

Modified:
    incubator/lucy/trunk/perl/lib/Lucy/Docs/Cookbook/CustomQueryParser.pod
    incubator/lucy/trunk/perl/sample/FlatQueryParser.pm

Modified: incubator/lucy/trunk/perl/lib/Lucy/Docs/Cookbook/CustomQueryParser.pod
URL: http://svn.apache.org/viewvc/incubator/lucy/trunk/perl/lib/Lucy/Docs/Cookbook/CustomQueryParser.pod?rev=1082705&r1=1082704&r2=1082705&view=diff
==============================================================================
--- incubator/lucy/trunk/perl/lib/Lucy/Docs/Cookbook/CustomQueryParser.pod (original)
+++ incubator/lucy/trunk/perl/lib/Lucy/Docs/Cookbook/CustomQueryParser.pod Thu Mar 17 21:22:09 2011
@@ -19,28 +19,8 @@ Lucy::Docs::Cookbook::CustomQueryParser 
 
 =head1 ABSTRACT
 
-Implement a custom search query language using
-L<Lucy::Search::QueryParser> and L<Parse::RecDescent>.
-
-=head1 Grammar-based vs. hand-rolled
-
-There are two classic strategies for writing a text parser.
-
-=over
-
-=item 1
-
-Create a grammar-based parser using Perl modules like Parse::RecDescent or
-Parse::YAPP, C utilities like lex and yacc, etc.
-
-=item 2
-
-Hand-roll your own parser.
-
-=back
-
-We'll start off with hand-rolling, but we'll ultimately move to the
-grammar-based parsing technique because of its superior flexibility.
+Implement a custom search query language using a subclass of
+L<Lucy::Search::QueryParser>.
 
 =head1 The language
 
@@ -53,14 +33,11 @@ collections.
 
 Later, we'll add support for trailing wildcards.
 
-=head1 Single-field regex-based parser
+=head1 Single-field parser
 
-Hand-rolling a parser can be labor-intensive, but our proposed query language
-is simple enough that chewing up the query string with some simple regular
-expressions will do the trick.
-
-We'll use a fixed field name of "content", and a fixed choice of English
-PolyAnalyzer.
+Our initial parser implementation will generate queries against a single fixed
+field, "content", and it will analyze text using a fixed choice of English
+PolyAnalyzer.  We won't subclass Lucy::Search::QueryParser just yet.
 
     package FlatQueryParser;
     use Lucy::Search::TermQuery;
@@ -104,7 +81,7 @@ single token and splits on whitespace ev
         my ( $self, $query_string ) = @_;
         my @tokens;
         while ( length $query_string ) {
-            if ( $query_string =~ s/^\s*// ) {
+            if ( $query_string =~ s/^\s+// ) {
                 next;    # skip whitespace
             }
             elsif ( $query_string =~ s/^("[^"]*(?:"|$))// ) {
@@ -153,53 +130,7 @@ ORQuery.
         return $or_query;
     }
 
-=head1 Single-field Parse::RecDescent-based parser
-
-Instead of using regular expressions to tokenize the string, we can use
-Parse::RecDescent.
-
-    my $grammar = <<'END_GRAMMAR';
-    
-    leaf_queries:
-        leaf_query(s?)
-        { $item{'leaf_query(s?)'} }
-    
-    leaf_query:
-          phrase_query
-        | term_query
-    
-    term_query:
-        /(\S+)/
-        { $1 }
-    
-    phrase_query:
-        /("[^"]*(?:"|$))/   # terminated by either quote or end of string
-        { $1 }
-    
-    END_GRAMMAR
-    
-    sub new { 
-        my $analyzer = Lucy::Analysis::PolyAnalyzer->new(
-            language => 'en',
-        );
-        my $rd_parser = Parse::RecDescent->new($grammar);
-        return bless { 
-            field     => 'content',
-            analyzer  => $analyzer,
-            rd_parser => $rd_parser,
-        }, __PACKAGE__;
-    }
-
-The behavior of a Parse::RecDescent parser based on the grammar above is
-exactly the same as that of our regex-based tokenization routine from before,
-so we can leave parse() intact and simply change _tokenize():
-
-    sub _tokenize {
-        my ( $self, $query_string ) = @_;
-        return $self->{rd_parser}->leaf_queries($query_string);
-    }
-
-=head1 Multi-field Parse::RecDescent-based parser
+=head1 Multi-field parser
 
 Most often, the end user will want their search query to match not only a
 single 'content' field, but also 'title' and so on.  To make that happen, we
@@ -212,12 +143,14 @@ have to turn queries such as this...
     (title:foo OR content:foo) AND NOT (title:bar OR content:bar)
 
 Rather than continue with our own from-scratch parser class and write the
-routines to accomplish that expansion, we're now going to subclass QueryParser
+routines to accomplish that expansion, we're now going to subclass Lucy::Search::QueryParser
 and take advantage of some of its existing methods.
 
 Our first parser implementation had the "content" field name and the choice of
 English PolyAnalyzer hard-coded for simplicity, but we don't need to do that
-this time -- QueryParser's constructor requires a Schema which conveys field
+once we subclass Lucy::Search::QueryParser.  QueryParser's constructor --
+which we will inherit, allowing us to eliminate our own constructor --
+requires a Schema which conveys field
 and Analyzer information, so we can just defer to that.
 
     package FlatQueryParser;
@@ -225,74 +158,27 @@ and Analyzer information, so we can just
     use Lucy::Search::TermQuery;
     use Lucy::Search::PhraseQuery;
     use Lucy::Search::ORQuery;
-    use Lucy::Search::NoMatchQuery;
     use PrefixQuery;
-    use Parse::RecDescent;
     use Carp;
     
-    our %rd_parser;
-    
-    sub new { 
-        my $class = shift;
-        my $self = $class->SUPER::new(@_);
-        $rd_parser{$$self} = Parse::RecDescent->new($grammar);
-        return $self;
-    }
-    
-    sub DESTROY {
-        my $self = shift;
-        delete $rd_parser{$$self};
-        $self->SUPER::DESTROY;
-    }
+    # Inherit new()
 
-If we modify our Parse::RecDescent grammar slightly, we can eliminate the
-_tokenize(), _make_term_query(), and _make_phrase_query() helper subs, and our
-parse() subroutine can be chopped way down.  We'll have the C<term_query> and
-C<phrase_query> productions generate LeafQuery objects, and add a C<tree>
-production which joins the leaves together with an ORQuery.
+We're also going to jettison our _make_term_query() and _make_phrase_query()
+helper subs and chop our parse() subroutine way down.  Our revised parse()
+routine will generate Lucy::Search::LeafQuery objects instead of TermQueries
+and PhraseQueries:
 
-    my $grammar = <<'END_GRAMMAR';
-    
-    tree:
-        leaf_queries
-        { 
-            $return = Lucy::Search::ORQuery->new;
-            $return->add_child($_) for @{ $item[1] };
-        }
-    
-    leaf_queries:
-        leaf_query(s?)
-        { $item{'leaf_query(s)'} }
-    
-    leaf_query:
-          phrase_query
-        | term_query
-    
-    term_query:
-        /(\S+)/
-        { Lucy::Search::LeafQuery->new( text => $1 ) }
-    
-    phrase_query:
-        /("[^"]*(?:"|$))/   # terminated by either quote or end of string
-        { Lucy::Search::LeafQuery->new( text => $1 ) }
-    
-    END_GRAMMAR
-    
-    ...
-    
     sub parse {
-        my ( $self, $query_string ) = @_; 
-        my $tree = $self->tree($query_string);
-        return $tree ? $self->expand($tree) :
-        Lucy::Search::NoMatchQuery->new;
-    }
-    
-    sub tree {
-        my ( $self, $query_string ) = @_; 
-        return $rd_parser{$$self}->tree($query_string);
+        my ( $self, $query_string ) = @_;
+        my $tokens = $self->_tokenize($query_string);
+        my $or_query = Lucy::Search::ORQuery->new;
+        for my $token (@$tokens) {
+            my $leaf_query = Lucy::Search::LeafQuery->new( text => $token );
+            $or_query->add_child($leaf_query);
+        }
+        return $self->expand($or_query);
     }
 
-
 The magic happens in QueryParser's expand() method, which walks the ORQuery
 object we supply to it looking for LeafQuery objects, and calls expand_leaf()
 for each one it finds.  expand_leaf() performs field-specific analysis,
@@ -302,22 +188,9 @@ into C<(title:foo OR content:foo)>.
 
 =head1 Extending the query language
 
-To add support for trailing wildcards to our query language, first we need to
-modify our grammar, adding a C<prefix_query> production and tweaking the
-C<leaf_query> production to accommodate it.
-
-    leaf_query:
-          phrase_query
-        | prefix_query
-        | term_query
-    
-    prefix_query:
-        /(\w+\*)/
-        { Lucy::Search::LeafQuery->new( text => $1 ) }
-
-Second, we need to override expand_leaf() to accommodate PrefixQuery,
-while deferring to its original implementation on TermQuery and
-PhraseQuery.
+To add support for trailing wildcards to our query language, we need to
+override expand_leaf() to accommodate PrefixQuery, while deferring to the
+parent class implementation on TermQuery and PhraseQuery.
 
     sub expand_leaf {
         my ( $self, $leaf_query ) = @_;
@@ -338,10 +211,17 @@ PhraseQuery.
         }
     }
 
+Ordinarily, those asterisks would have been stripped when running tokens
+through the PolyAnalyzer -- query strings containing "foo*" would produce
+TermQueries for the term "foo".  Our override intercepts tokens with trailing
+asterisks and processes them as PrefixQueries before C<SUPER::expand_leaf> can
+discard them, so that a search for "foo*" can match "food", "foosball", and so
+on.
+
 =head1 Usage
 
-Insert any of our custom parsers into the search.cgi sample app to get a feel
-for how they behave:
+Insert our custom parser into the search.cgi sample app to get a feel for how
+it behaves:
 
     my $parser = FlatQueryParser->new( schema => $searcher->get_schema );
     my $query  = $parser->parse( decode( 'UTF-8', $cgi->param('q') || '' ) );

Modified: incubator/lucy/trunk/perl/sample/FlatQueryParser.pm
URL: http://svn.apache.org/viewvc/incubator/lucy/trunk/perl/sample/FlatQueryParser.pm?rev=1082705&r1=1082704&r2=1082705&view=diff
==============================================================================
--- incubator/lucy/trunk/perl/sample/FlatQueryParser.pm (original)
+++ incubator/lucy/trunk/perl/sample/FlatQueryParser.pm Thu Mar 17 21:22:09 2011
@@ -21,69 +21,38 @@ use base qw( Lucy::Search::QueryParser )
 use Lucy::Search::TermQuery;
 use Lucy::Search::PhraseQuery;
 use Lucy::Search::ORQuery;
-use Lucy::Search::NoMatchQuery;
 use PrefixQuery;
-use Parse::RecDescent;
 use Carp;
 
-our %rd_parser;
-
-my $grammar = <<'END_GRAMMAR';
-
-tree:
-    leaf_queries
-    { 
-        $return = Lucy::Search::ORQuery->new;
-        $return->add_child($_) for @{ $item[1] };
-    }
-
-leaf_queries:
-    leaf_query(s?)
-    { $item{'leaf_query(s?)'} }
-
-leaf_query:
-      phrase_query
-    | prefix_query
-    | term_query
-    
-term_query:
-    /(\S+)/
-    { Lucy::Search::LeafQuery->new( text => $1 ) }
-
-phrase_query:
-    /("[^"]*(?:"|$))/   # terminated by either quote or end of string
-    { Lucy::Search::LeafQuery->new( text => $1 ) }
-    
-prefix_query:
-    /(\w+\*)/
-    { Lucy::Search::LeafQuery->new( text => $1 ) }
-
-END_GRAMMAR
-
-sub new {
-    my $class = shift;
-    my $self  = $class->SUPER::new(@_);
-    $rd_parser{$$self} = Parse::RecDescent->new($grammar);
-    return $self;
-}
-
-sub DESTROY {
-    my $self = shift;
-    delete $rd_parser{$$self};
-    $self->SUPER::DESTROY;
-}
+# Inherit new()
 
 sub parse {
     my ( $self, $query_string ) = @_;
-    my $tree = $self->tree($query_string);
-    return $tree
-        ? $self->expand($tree)
-        : Lucy::Search::NoMatchQuery->new;
+    my $tokens = $self->_tokenize($query_string);
+    my $or_query = Lucy::Search::ORQuery->new;
+    for my $token (@$tokens) {
+        my $leaf_query = Lucy::Search::LeafQuery->new( text => $token );
+        $or_query->add_child($leaf_query);
+    }
+    return $self->expand($or_query);
 }
 
-sub tree {
+sub _tokenize {
     my ( $self, $query_string ) = @_;
-    return $rd_parser{$$self}->tree($query_string);
+    my @tokens;
+    while ( length $query_string ) {
+        if ( $query_string =~ s/^\s+// ) {
+            next;    # skip whitespace
+        }
+        elsif ( $query_string =~ s/^("[^"]*(?:"|$))// ) {
+            push @tokens, $1;    # double-quoted phrase
+        }
+        else {
+            $query_string =~ s/(\S+)//;
+            push @tokens, $1;    # single word
+        }
+    }
+    return \@tokens;
 }
 
 sub expand_leaf {