You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucy.apache.org by ma...@apache.org on 2010/06/19 02:38:33 UTC

svn commit: r956163 - in /lucene/lucy/trunk: core/Lucy/Search/ORMatcher.bp core/Lucy/Search/ORMatcher.c perl/t/518-or_scorer.t

Author: marvin
Date: Sat Jun 19 00:38:33 2010
New Revision: 956163

URL: http://svn.apache.org/viewvc?rev=956163&view=rev
Log:
LUCY-113:
Add ORMatcher and ORScorer.

Added:
    lucene/lucy/trunk/core/Lucy/Search/ORMatcher.bp   (with props)
    lucene/lucy/trunk/core/Lucy/Search/ORMatcher.c   (with props)
    lucene/lucy/trunk/perl/t/518-or_scorer.t   (with props)

Added: lucene/lucy/trunk/core/Lucy/Search/ORMatcher.bp
URL: http://svn.apache.org/viewvc/lucene/lucy/trunk/core/Lucy/Search/ORMatcher.bp?rev=956163&view=auto
==============================================================================
--- lucene/lucy/trunk/core/Lucy/Search/ORMatcher.bp (added)
+++ lucene/lucy/trunk/core/Lucy/Search/ORMatcher.bp Sat Jun 19 00:38:33 2010
@@ -0,0 +1,101 @@
+parcel Lucy;
+
+__C__
+#include "Lucy/Search/Matcher.h"
+
+/* A wrapper for a Matcher which caches the result of Matcher_Get_Doc_ID().
+ */
+typedef struct lucy_HeapedMatcherDoc {
+    lucy_Matcher *matcher;
+    int32_t   doc;
+} lucy_HeapedMatcherDoc;
+
+#ifdef LUCY_USE_SHORT_NAMES
+  #define HeapedMatcherDoc              lucy_HeapedMatcherDoc
+#endif
+
+__END_C__
+
+/** Matcher which unions the doc id sets of other Matchers using a priority
+ * queue.
+ */
+
+class Lucy::Search::ORMatcher extends Lucy::Search::PolyMatcher {
+
+    HeapedMatcherDoc **heap;
+    HeapedMatcherDoc **pool;    /* Pool of HMDs to minimize mallocs */
+    char              *blob;    /* single allocation for all HMDs */
+    HeapedMatcherDoc  *top_hmd; /* cached top elem */
+    uint32_t           size;
+    uint32_t           max_size;
+
+    inert incremented ORMatcher*
+    new(VArray *children);
+
+    /** 
+     * @param children An array of Matchers.
+     */
+    inert incremented ORMatcher*
+    init(ORMatcher *self, VArray *children);
+
+    public void
+    Destroy(ORMatcher *self);
+
+    public int32_t
+    Next(ORMatcher *self);
+
+    public int32_t
+    Advance(ORMatcher *self, int32_t target);
+
+    public int32_t 
+    Get_Doc_ID(ORMatcher *self);
+}
+
+/**
+ * Union results of multiple Matchers.
+ * 
+ * ORScorer collates the output of multiple subscorers, summing their scores
+ * whenever they match the same document.
+ */
+class Lucy::Search::ORScorer extends Lucy::Search::ORMatcher {
+
+    float            *scores;
+    int32_t           doc_id;
+
+    inert incremented ORScorer* 
+    new(VArray *children, Similarity *similarity);
+
+    inert ORScorer* 
+    init(ORScorer *self, VArray *children, Similarity *similarity);
+
+    public void
+    Destroy(ORScorer *self);
+
+    public int32_t
+    Next(ORScorer *self);
+
+    public int32_t
+    Advance(ORScorer *self, int32_t target);
+
+    public float
+    Score(ORScorer *self);
+
+    public int32_t 
+    Get_Doc_ID(ORScorer *self);
+}
+
+/* Copyright 2010 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+

Propchange: lucene/lucy/trunk/core/Lucy/Search/ORMatcher.bp
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/lucy/trunk/core/Lucy/Search/ORMatcher.c
URL: http://svn.apache.org/viewvc/lucene/lucy/trunk/core/Lucy/Search/ORMatcher.c?rev=956163&view=auto
==============================================================================
--- lucene/lucy/trunk/core/Lucy/Search/ORMatcher.c (added)
+++ lucene/lucy/trunk/core/Lucy/Search/ORMatcher.c Sat Jun 19 00:38:33 2010
@@ -0,0 +1,416 @@
+#define C_LUCY_ORMATCHER
+#define C_LUCY_ORSCORER
+#include "Lucy/Util/ToolSet.h"
+
+#include "Lucy/Search/ORMatcher.h"
+#include "Lucy/Index/Similarity.h"
+
+// Add an element to the queue.  Unsafe -- bounds checking of queue size is
+// left to the caller.
+static void 
+S_add_element(ORMatcher *self, Matcher *matcher, int32_t doc_id);
+
+// Empty out the queue. 
+static void 
+S_clear(ORMatcher *self);
+
+// Call Matcher_Next() on the top queue element and adjust the queue,
+// removing the element if Matcher_Next() returns false. 
+static INLINE int32_t
+SI_top_next(ORMatcher *self);
+
+// Call Matcher_Advance() on the top queue element and adjust the queue,
+// removing the element if Matcher_Advance() returns false. 
+static INLINE int32_t
+SI_top_advance(ORMatcher *self, int32_t target);
+
+/* React to a change in the top element, or "root" -- presumably the update of
+ * its doc_id resulting from a call to Matcher_Next() or Matcher_Advance().
+ * If the Matcher has been exhausted, remove it from the queue and replace it
+ * with the bottom node (i.e. perform "root replacement").  In either case,
+ * perform a "sift down" to restore the heap property.  Return the doc id of
+ * the root node, or 0 if the queue has been emptied.
+ */
+static int32_t
+S_adjust_root(ORMatcher *self);
+
+// Take the bottom node (which probably violates the heap property when this
+// is called) and bubble it up through the heap until the heap property is
+// restored.
+static void
+S_bubble_up(ORMatcher *self);
+
+// Take the top node (which probably violates the heap property when this
+// is called) and sift it down through the heap until the heap property is
+// restored.
+static void
+S_sift_down(ORMatcher *self);
+
+ORMatcher*
+ORMatcher_new(VArray *children) 
+{
+    ORMatcher *self = (ORMatcher*)VTable_Make_Obj(ORMATCHER);
+    return ORMatcher_init(self, children);
+}
+
+static ORMatcher*
+S_ormatcher_init2(ORMatcher *self, VArray *children, Similarity *sim)
+{
+    size_t amount_to_malloc;
+    uint32_t i;
+
+    // Init. 
+    PolyMatcher_init((PolyMatcher*)self, children, sim);
+    self->size = 0;
+
+    // Derive. 
+    self->max_size = VA_Get_Size(children);
+
+    // Allocate. 
+    self->heap = (HeapedMatcherDoc**)CALLOCATE(self->max_size + 1, sizeof(HeapedMatcherDoc*));
+
+    // Create a pool of HMDs.  Encourage CPU cache hits by using a single
+    // allocation for all of them.
+    amount_to_malloc = (self->max_size + 1) * sizeof(HeapedMatcherDoc);
+    self->blob = (char*)MALLOCATE(amount_to_malloc);
+    self->pool = (HeapedMatcherDoc**)CALLOCATE(self->max_size + 1, sizeof(HeapedMatcherDoc*));
+    for (i = 1; i <= self->max_size; i++) {
+        size_t offset = i * sizeof(HeapedMatcherDoc);
+        HeapedMatcherDoc *hmd = (HeapedMatcherDoc*)(self->blob + offset);
+        self->pool[i] = hmd;
+    }
+
+    // Prime queue. 
+    for (i = 0; i < self->max_size; i++) {
+        Matcher *matcher = (Matcher*)VA_Fetch(children, i);
+        S_add_element(self, (Matcher*)INCREF(matcher), 0);
+    }
+    
+    return self;
+}
+
+ORMatcher*
+ORMatcher_init(ORMatcher *self, VArray *children)
+{
+    return S_ormatcher_init2(self, children, NULL);
+}
+
+void
+ORMatcher_destroy(ORMatcher *self)
+{
+    if (self->blob) { S_clear(self); }
+    FREEMEM(self->blob);
+    FREEMEM(self->pool);
+    FREEMEM(self->heap);
+    SUPER_DESTROY(self, ORMATCHER);
+}
+
+int32_t
+ORMatcher_next(ORMatcher *self)
+{
+    if (self->size == 0) { 
+        return 0; 
+    }
+    else {
+        int32_t last_doc_id = self->top_hmd->doc;
+        while (self->top_hmd->doc == last_doc_id) {
+            int32_t top_doc_id = SI_top_next(self);
+            if (!top_doc_id && self->size == 0) {
+                return 0;
+            }
+        }
+        return self->top_hmd->doc;
+    }
+}
+
+int32_t
+ORMatcher_advance(ORMatcher *self, int32_t target)
+{
+    if (!self->size) { return 0; }
+    do {
+        int32_t least = SI_top_advance(self, target);
+        if (least >= target) { return least; }
+        if (!least) { 
+            if (!self->size) { return 0; } 
+        }
+    } while (true);
+}
+
+int32_t
+ORMatcher_get_doc_id(ORMatcher *self)
+{
+    return self->top_hmd->doc;
+}
+
+static void 
+S_clear(ORMatcher *self) 
+{
+    HeapedMatcherDoc **const heap = self->heap;
+    HeapedMatcherDoc **const pool = self->pool;
+
+    // Node 0 is held empty, to make the algo clearer. 
+    for ( ; self->size > 0; self->size--) {
+        HeapedMatcherDoc *hmd = heap[ self->size ];
+        heap[ self->size ] = NULL;
+        DECREF(hmd->matcher);
+
+        // Put HMD back in pool. 
+        pool[ self->size ] = hmd;
+    }   
+}
+
+static INLINE int32_t
+SI_top_next(ORMatcher *self)
+{
+    HeapedMatcherDoc *const top_hmd = self->top_hmd;
+    top_hmd->doc = Matcher_Next(top_hmd->matcher);
+    return S_adjust_root(self);
+}
+
+static INLINE int32_t
+SI_top_advance(ORMatcher *self, int32_t target)
+{
+    HeapedMatcherDoc *const top_hmd = self->top_hmd;
+    top_hmd->doc = Matcher_Advance(top_hmd->matcher, target);
+    return S_adjust_root(self);
+}
+
+static void 
+S_add_element(ORMatcher *self, Matcher *matcher, int32_t doc_id)
+{
+    HeapedMatcherDoc **const heap = self->heap;
+    HeapedMatcherDoc **const pool = self->pool;
+    HeapedMatcherDoc *hmd;
+
+    // Increment size. 
+    self->size++;
+
+    // Put element at the bottom of the heap. 
+    hmd          = pool[ self->size ];
+    hmd->matcher = matcher;
+    hmd->doc     = doc_id;
+    heap[ self->size ] = hmd;
+
+    // Adjust heap. 
+    S_bubble_up(self);
+}
+
+static int32_t
+S_adjust_root(ORMatcher *self)
+{
+    HeapedMatcherDoc *const top_hmd = self->top_hmd;
+
+    // Inlined pop. 
+    if (!top_hmd->doc) {
+        HeapedMatcherDoc *const last_hmd = self->heap[ self->size ];
+
+        // Last to first. 
+        DECREF(top_hmd->matcher);
+        top_hmd->matcher = last_hmd->matcher;
+        top_hmd->doc     = last_hmd->doc;
+        self->heap[ self->size ] = NULL;
+
+        // Put back in pool. 
+        self->pool[ self->size ] = last_hmd;
+
+        self->size--;
+        if (self->size == 0) 
+            return 0;
+    }
+
+    // Move queue no matter what. 
+    S_sift_down(self);
+
+    return self->top_hmd->doc;
+}
+
+static void
+S_bubble_up(ORMatcher *self) 
+{
+    HeapedMatcherDoc **const heap = self->heap;
+    uint32_t i = self->size;
+    uint32_t j = i >> 1;
+    HeapedMatcherDoc *const node = heap[i]; // save bottom node 
+
+    while (j > 0 && node->doc < heap[j]->doc) {
+        heap[i] = heap[j];
+        i = j;
+        j = j >> 1;
+    }
+    heap[i] = node;
+    self->top_hmd = heap[1];
+}
+
+static void
+S_sift_down(ORMatcher *self) 
+{
+    HeapedMatcherDoc **const heap = self->heap;
+    uint32_t i = 1;
+    uint32_t j = i << 1;
+    uint32_t k = j + 1;
+    HeapedMatcherDoc *const node = heap[i]; // save top node 
+
+    // Find smaller child. 
+    if (k <= self->size && heap[k]->doc < heap[j]->doc) {
+        j = k;
+    }
+
+    while (j <= self->size && heap[j]->doc < node->doc) {
+        heap[i] = heap[j];
+        i = j;
+        j = i << 1;
+        k = j + 1;
+        if (k <= self->size && heap[k]->doc < heap[j]->doc) {
+            j = k;
+        }
+    }
+    heap[i] = node;
+    
+    self->top_hmd = heap[1];
+}
+
+/***************************************************************************/
+
+/* When this is called, all children are past the current self->doc_id.  The
+ * least doc_id amongst them becomes the new self->doc_id, and they are all
+ * advanced so that they are once again out in front of it.  While they are
+ * advancing, their scores are cached in an array, to be summed during
+ * Score().
+ */
+static int32_t
+S_advance_after_current(ORScorer *self);
+
+ORScorer*
+ORScorer_new(VArray *children, Similarity *sim) 
+{
+    ORScorer *self = (ORScorer*)VTable_Make_Obj(ORSCORER);
+    return ORScorer_init(self, children, sim);
+}
+
+ORScorer*
+ORScorer_init(ORScorer *self, VArray *children, Similarity *sim) 
+{
+    S_ormatcher_init2((ORMatcher*)self, children, sim);
+    self->doc_id           = 0;
+    self->scores           = (float*)MALLOCATE(self->num_kids * sizeof(float));
+
+    // Establish the state of all child matchers being past the current doc
+    // id, by invoking ORMatcher's Next() method. 
+    ORMatcher_next((ORMatcher*)self);
+
+    return self;
+}
+
+void
+ORScorer_destroy(ORScorer *self) 
+{
+    FREEMEM(self->scores);
+    SUPER_DESTROY(self, ORSCORER);
+}
+
+int32_t
+ORScorer_next(ORScorer *self)
+{
+    return S_advance_after_current(self);
+}
+
+static int32_t
+S_advance_after_current(ORScorer *self) 
+{
+    float *const     scores = self->scores;
+    Matcher *child;
+
+    // Get the top Matcher, or bail because there are no Matchers left. 
+    if (!self->size) { return 0; }
+    else             { child = self->top_hmd->matcher; }
+
+    // The top matcher will already be at the correct doc, so start there. 
+    self->doc_id        = self->top_hmd->doc;
+    scores[0]           = Matcher_Score(child);
+    self->matching_kids = 1;
+
+    do {
+        // Attempt to advance past current doc. 
+        int32_t top_doc_id = SI_top_next((ORMatcher*)self);
+        if (!top_doc_id) {
+            if (!self->size) {
+                break; // bail, no more to advance 
+            }
+        }
+
+        if (top_doc_id != self->doc_id) {
+            // Bail, least doc in queue is now past the one we're scoring. 
+            break;
+        }
+        else {
+            // Accumulate score. 
+            child = self->top_hmd->matcher;
+            scores[ self->matching_kids ] = Matcher_Score(child);
+            self->matching_kids++;
+        }
+    } while (true);
+
+    return self->doc_id;
+}
+
+int32_t
+ORScorer_advance(ORScorer *self, int32_t target)
+{
+    // Return sentinel once exhausted. 
+    if (!self->size) { return 0; }
+
+    // Succeed if we're already past and still on a valid doc. 
+    if (target <= self->doc_id) {
+        return self->doc_id;
+    }
+
+    do {
+        // If all matchers are caught up, accumulate score and return. 
+        if (self->top_hmd->doc >= target) {
+            return S_advance_after_current(self);
+        }
+
+        // Not caught up yet, so keep skipping matchers. 
+        if ( !SI_top_advance((ORMatcher*)self, target) ) {
+            if (!self->size) { return 0; }
+        }
+    } while (true);
+}
+
+int32_t
+ORScorer_get_doc_id(ORScorer *self)
+{
+    return self->doc_id;
+}
+
+float
+ORScorer_score(ORScorer *self)
+{
+    float *const scores = self->scores;
+    float score = 0.0f; 
+    uint32_t i;
+
+    // Accumulate score, then factor in coord bonus. 
+    for (i = 0; i < self->matching_kids; i++) {
+        score += scores[i];
+    }
+    score *= self->coord_factors[ self->matching_kids ];
+
+    return score;
+}
+
+/* Copyright 2010 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+

Propchange: lucene/lucy/trunk/core/Lucy/Search/ORMatcher.c
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/lucy/trunk/perl/t/518-or_scorer.t
URL: http://svn.apache.org/viewvc/lucene/lucy/trunk/perl/t/518-or_scorer.t?rev=956163&view=auto
==============================================================================
--- lucene/lucy/trunk/perl/t/518-or_scorer.t (added)
+++ lucene/lucy/trunk/perl/t/518-or_scorer.t Sat Jun 19 00:38:33 2010
@@ -0,0 +1,68 @@
+use strict;
+use warnings;
+use lib 'buildlib';
+
+use Test::More skip_all => 'Requires SortCollector';
+# use Test::More tests => 900;
+use Lucy::Test;
+use LucyX::Search::MockScorer;
+use Lucy::Test::TestUtils qw( modulo_set doc_ids_from_td_coll );
+
+my $sim = Lucy::Index::Similarity::LuceneSimilarity->new;
+
+for my $interval_a ( 1 .. 10 ) {
+    for my $interval_b ( 5 .. 10 ) {
+        check_scorer( $interval_a, $interval_b );
+        for my $interval_c ( 30, 75 ) {
+            check_scorer( $interval_a, $interval_b, $interval_c );
+            check_scorer( $interval_c, $interval_b, $interval_a );
+        }
+    }
+}
+
+sub check_scorer {
+    my @intervals = @_;
+    my @doc_id_arrays = map { modulo_set( $_, 100 ) } @intervals;
+    my $subscorers
+        = Lucy::Object::VArray->new( capacity => scalar @intervals );
+    for my $doc_id_array (@doc_id_arrays) {
+        my $mock = LucyX::Search::MockScorer->new(
+            doc_ids => $doc_id_array,
+            scores  => [ (1) x scalar @$doc_id_array ],
+        );
+        $subscorers->push($mock);
+    }
+
+    my $or_scorer = Lucy::Search::ORScorer->new(
+        similarity => $sim,
+        children   => $subscorers,
+    );
+    my $collector
+        = Lucy::Search::Collector::SortCollector->new( wanted => 100 );
+    $or_scorer->collect( collector => $collector );
+    my ( $got_by_score, $got_by_id ) = doc_ids_from_td_coll($collector);
+    my ( $expected_by_count, $expected_by_id )
+        = union_doc_id_sets(@doc_id_arrays);
+    is( scalar @$got_by_id,
+        scalar @$expected_by_id,
+        "total hits: @intervals"
+    );
+    is_deeply( $got_by_id, $expected_by_id, "got all docs: @intervals" );
+    is_deeply( $got_by_score, $expected_by_count,
+        "scores accumulated: @intervals" );
+}
+
+sub union_doc_id_sets {
+    my @arrays = @_;
+    my %scores;
+    for my $array (@arrays) {
+        $scores{$_} += 1 for @$array;
+    }
+    my @by_count_then_id = sort { $scores{$b} <=> $scores{$a} or $a <=> $b }
+        keys %scores;
+    my @by_id = sort { $a <=> $b } keys %scores;
+    return ( \@by_count_then_id, \@by_id );
+}
+
+# Trigger destruction.
+undef $sim;

Propchange: lucene/lucy/trunk/perl/t/518-or_scorer.t
------------------------------------------------------------------------------
    svn:eol-style = native