You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucy.apache.org by ma...@apache.org on 2010/06/19 02:38:33 UTC
svn commit: r956163 - in /lucene/lucy/trunk: core/Lucy/Search/ORMatcher.bp
core/Lucy/Search/ORMatcher.c perl/t/518-or_scorer.t
Author: marvin
Date: Sat Jun 19 00:38:33 2010
New Revision: 956163
URL: http://svn.apache.org/viewvc?rev=956163&view=rev
Log:
LUCY-113:
Add ORMatcher and ORScorer.
Added:
lucene/lucy/trunk/core/Lucy/Search/ORMatcher.bp (with props)
lucene/lucy/trunk/core/Lucy/Search/ORMatcher.c (with props)
lucene/lucy/trunk/perl/t/518-or_scorer.t (with props)
Added: lucene/lucy/trunk/core/Lucy/Search/ORMatcher.bp
URL: http://svn.apache.org/viewvc/lucene/lucy/trunk/core/Lucy/Search/ORMatcher.bp?rev=956163&view=auto
==============================================================================
--- lucene/lucy/trunk/core/Lucy/Search/ORMatcher.bp (added)
+++ lucene/lucy/trunk/core/Lucy/Search/ORMatcher.bp Sat Jun 19 00:38:33 2010
@@ -0,0 +1,101 @@
+parcel Lucy;
+
+__C__
+#include "Lucy/Search/Matcher.h"
+
+/* A wrapper for a Matcher which caches the result of Matcher_Get_Doc_ID().
+ */
+typedef struct lucy_HeapedMatcherDoc {
+ lucy_Matcher *matcher;
+ int32_t doc;
+} lucy_HeapedMatcherDoc;
+
+#ifdef LUCY_USE_SHORT_NAMES
+ #define HeapedMatcherDoc lucy_HeapedMatcherDoc
+#endif
+
+__END_C__
+
+/** Matcher which unions the doc id sets of other Matchers using a priority
+ * queue.
+ */
+
+class Lucy::Search::ORMatcher extends Lucy::Search::PolyMatcher {
+
+ HeapedMatcherDoc **heap;
+ HeapedMatcherDoc **pool; /* Pool of HMDs to minimize mallocs */
+ char *blob; /* single allocation for all HMDs */
+ HeapedMatcherDoc *top_hmd; /* cached top elem */
+ uint32_t size;
+ uint32_t max_size;
+
+ inert incremented ORMatcher*
+ new(VArray *children);
+
+ /**
+ * @param children An array of Matchers.
+ */
+ inert incremented ORMatcher*
+ init(ORMatcher *self, VArray *children);
+
+ public void
+ Destroy(ORMatcher *self);
+
+ public int32_t
+ Next(ORMatcher *self);
+
+ public int32_t
+ Advance(ORMatcher *self, int32_t target);
+
+ public int32_t
+ Get_Doc_ID(ORMatcher *self);
+}
+
+/**
+ * Union results of multiple Matchers.
+ *
+ * ORScorer collates the output of multiple subscorers, summing their scores
+ * whenever they match the same document.
+ */
+class Lucy::Search::ORScorer extends Lucy::Search::ORMatcher {
+
+ float *scores;
+ int32_t doc_id;
+
+ inert incremented ORScorer*
+ new(VArray *children, Similarity *similarity);
+
+ inert ORScorer*
+ init(ORScorer *self, VArray *children, Similarity *similarity);
+
+ public void
+ Destroy(ORScorer *self);
+
+ public int32_t
+ Next(ORScorer *self);
+
+ public int32_t
+ Advance(ORScorer *self, int32_t target);
+
+ public float
+ Score(ORScorer *self);
+
+ public int32_t
+ Get_Doc_ID(ORScorer *self);
+}
+
+/* Copyright 2010 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
Propchange: lucene/lucy/trunk/core/Lucy/Search/ORMatcher.bp
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/lucy/trunk/core/Lucy/Search/ORMatcher.c
URL: http://svn.apache.org/viewvc/lucene/lucy/trunk/core/Lucy/Search/ORMatcher.c?rev=956163&view=auto
==============================================================================
--- lucene/lucy/trunk/core/Lucy/Search/ORMatcher.c (added)
+++ lucene/lucy/trunk/core/Lucy/Search/ORMatcher.c Sat Jun 19 00:38:33 2010
@@ -0,0 +1,416 @@
+#define C_LUCY_ORMATCHER
+#define C_LUCY_ORSCORER
+#include "Lucy/Util/ToolSet.h"
+
+#include "Lucy/Search/ORMatcher.h"
+#include "Lucy/Index/Similarity.h"
+
+// Add an element to the queue. Unsafe -- bounds checking of queue size is
+// left to the caller.
+static void
+S_add_element(ORMatcher *self, Matcher *matcher, int32_t doc_id);
+
+// Empty out the queue.
+static void
+S_clear(ORMatcher *self);
+
+// Call Matcher_Next() on the top queue element and adjust the queue,
+// removing the element if Matcher_Next() returns false.
+static INLINE int32_t
+SI_top_next(ORMatcher *self);
+
+// Call Matcher_Advance() on the top queue element and adjust the queue,
+// removing the element if Matcher_Advance() returns false.
+static INLINE int32_t
+SI_top_advance(ORMatcher *self, int32_t target);
+
+/* React to a change in the top element, or "root" -- presumably the update of
+ * its doc_id resulting from a call to Matcher_Next() or Matcher_Advance().
+ * If the Matcher has been exhausted, remove it from the queue and replace it
+ * with the bottom node (i.e. perform "root replacement"). In either case,
+ * perform a "sift down" to restore the heap property. Return the doc id of
+ * the root node, or 0 if the queue has been emptied.
+ */
+static int32_t
+S_adjust_root(ORMatcher *self);
+
+// Take the bottom node (which probably violates the heap property when this
+// is called) and bubble it up through the heap until the heap property is
+// restored.
+static void
+S_bubble_up(ORMatcher *self);
+
+// Take the top node (which probably violates the heap property when this
+// is called) and sift it down through the heap until the heap property is
+// restored.
+static void
+S_sift_down(ORMatcher *self);
+
+ORMatcher*
+ORMatcher_new(VArray *children)
+{
+ ORMatcher *self = (ORMatcher*)VTable_Make_Obj(ORMATCHER);
+ return ORMatcher_init(self, children);
+}
+
+static ORMatcher*
+S_ormatcher_init2(ORMatcher *self, VArray *children, Similarity *sim)
+{
+ size_t amount_to_malloc;
+ uint32_t i;
+
+ // Init.
+ PolyMatcher_init((PolyMatcher*)self, children, sim);
+ self->size = 0;
+
+ // Derive.
+ self->max_size = VA_Get_Size(children);
+
+ // Allocate.
+ self->heap = (HeapedMatcherDoc**)CALLOCATE(self->max_size + 1, sizeof(HeapedMatcherDoc*));
+
+ // Create a pool of HMDs. Encourage CPU cache hits by using a single
+ // allocation for all of them.
+ amount_to_malloc = (self->max_size + 1) * sizeof(HeapedMatcherDoc);
+ self->blob = (char*)MALLOCATE(amount_to_malloc);
+ self->pool = (HeapedMatcherDoc**)CALLOCATE(self->max_size + 1, sizeof(HeapedMatcherDoc*));
+ for (i = 1; i <= self->max_size; i++) {
+ size_t offset = i * sizeof(HeapedMatcherDoc);
+ HeapedMatcherDoc *hmd = (HeapedMatcherDoc*)(self->blob + offset);
+ self->pool[i] = hmd;
+ }
+
+ // Prime queue.
+ for (i = 0; i < self->max_size; i++) {
+ Matcher *matcher = (Matcher*)VA_Fetch(children, i);
+ S_add_element(self, (Matcher*)INCREF(matcher), 0);
+ }
+
+ return self;
+}
+
+ORMatcher*
+ORMatcher_init(ORMatcher *self, VArray *children)
+{
+ return S_ormatcher_init2(self, children, NULL);
+}
+
+void
+ORMatcher_destroy(ORMatcher *self)
+{
+ if (self->blob) { S_clear(self); }
+ FREEMEM(self->blob);
+ FREEMEM(self->pool);
+ FREEMEM(self->heap);
+ SUPER_DESTROY(self, ORMATCHER);
+}
+
+int32_t
+ORMatcher_next(ORMatcher *self)
+{
+ if (self->size == 0) {
+ return 0;
+ }
+ else {
+ int32_t last_doc_id = self->top_hmd->doc;
+ while (self->top_hmd->doc == last_doc_id) {
+ int32_t top_doc_id = SI_top_next(self);
+ if (!top_doc_id && self->size == 0) {
+ return 0;
+ }
+ }
+ return self->top_hmd->doc;
+ }
+}
+
+int32_t
+ORMatcher_advance(ORMatcher *self, int32_t target)
+{
+ if (!self->size) { return 0; }
+ do {
+ int32_t least = SI_top_advance(self, target);
+ if (least >= target) { return least; }
+ if (!least) {
+ if (!self->size) { return 0; }
+ }
+ } while (true);
+}
+
+int32_t
+ORMatcher_get_doc_id(ORMatcher *self)
+{
+ return self->top_hmd->doc;
+}
+
+static void
+S_clear(ORMatcher *self)
+{
+ HeapedMatcherDoc **const heap = self->heap;
+ HeapedMatcherDoc **const pool = self->pool;
+
+ // Node 0 is held empty, to make the algo clearer.
+ for ( ; self->size > 0; self->size--) {
+ HeapedMatcherDoc *hmd = heap[ self->size ];
+ heap[ self->size ] = NULL;
+ DECREF(hmd->matcher);
+
+ // Put HMD back in pool.
+ pool[ self->size ] = hmd;
+ }
+}
+
+static INLINE int32_t
+SI_top_next(ORMatcher *self)
+{
+ HeapedMatcherDoc *const top_hmd = self->top_hmd;
+ top_hmd->doc = Matcher_Next(top_hmd->matcher);
+ return S_adjust_root(self);
+}
+
+static INLINE int32_t
+SI_top_advance(ORMatcher *self, int32_t target)
+{
+ HeapedMatcherDoc *const top_hmd = self->top_hmd;
+ top_hmd->doc = Matcher_Advance(top_hmd->matcher, target);
+ return S_adjust_root(self);
+}
+
+static void
+S_add_element(ORMatcher *self, Matcher *matcher, int32_t doc_id)
+{
+ HeapedMatcherDoc **const heap = self->heap;
+ HeapedMatcherDoc **const pool = self->pool;
+ HeapedMatcherDoc *hmd;
+
+ // Increment size.
+ self->size++;
+
+ // Put element at the bottom of the heap.
+ hmd = pool[ self->size ];
+ hmd->matcher = matcher;
+ hmd->doc = doc_id;
+ heap[ self->size ] = hmd;
+
+ // Adjust heap.
+ S_bubble_up(self);
+}
+
+static int32_t
+S_adjust_root(ORMatcher *self)
+{
+ HeapedMatcherDoc *const top_hmd = self->top_hmd;
+
+ // Inlined pop.
+ if (!top_hmd->doc) {
+ HeapedMatcherDoc *const last_hmd = self->heap[ self->size ];
+
+ // Last to first.
+ DECREF(top_hmd->matcher);
+ top_hmd->matcher = last_hmd->matcher;
+ top_hmd->doc = last_hmd->doc;
+ self->heap[ self->size ] = NULL;
+
+ // Put back in pool.
+ self->pool[ self->size ] = last_hmd;
+
+ self->size--;
+ if (self->size == 0)
+ return 0;
+ }
+
+ // Move queue no matter what.
+ S_sift_down(self);
+
+ return self->top_hmd->doc;
+}
+
+static void
+S_bubble_up(ORMatcher *self)
+{
+ HeapedMatcherDoc **const heap = self->heap;
+ uint32_t i = self->size;
+ uint32_t j = i >> 1;
+ HeapedMatcherDoc *const node = heap[i]; // save bottom node
+
+ while (j > 0 && node->doc < heap[j]->doc) {
+ heap[i] = heap[j];
+ i = j;
+ j = j >> 1;
+ }
+ heap[i] = node;
+ self->top_hmd = heap[1];
+}
+
+static void
+S_sift_down(ORMatcher *self)
+{
+ HeapedMatcherDoc **const heap = self->heap;
+ uint32_t i = 1;
+ uint32_t j = i << 1;
+ uint32_t k = j + 1;
+ HeapedMatcherDoc *const node = heap[i]; // save top node
+
+ // Find smaller child.
+ if (k <= self->size && heap[k]->doc < heap[j]->doc) {
+ j = k;
+ }
+
+ while (j <= self->size && heap[j]->doc < node->doc) {
+ heap[i] = heap[j];
+ i = j;
+ j = i << 1;
+ k = j + 1;
+ if (k <= self->size && heap[k]->doc < heap[j]->doc) {
+ j = k;
+ }
+ }
+ heap[i] = node;
+
+ self->top_hmd = heap[1];
+}
+
+/***************************************************************************/
+
+/* When this is called, all children are past the current self->doc_id. The
+ * least doc_id amongst them becomes the new self->doc_id, and they are all
+ * advanced so that they are once again out in front of it. While they are
+ * advancing, their scores are cached in an array, to be summed during
+ * Score().
+ */
+static int32_t
+S_advance_after_current(ORScorer *self);
+
+ORScorer*
+ORScorer_new(VArray *children, Similarity *sim)
+{
+ ORScorer *self = (ORScorer*)VTable_Make_Obj(ORSCORER);
+ return ORScorer_init(self, children, sim);
+}
+
+ORScorer*
+ORScorer_init(ORScorer *self, VArray *children, Similarity *sim)
+{
+ S_ormatcher_init2((ORMatcher*)self, children, sim);
+ self->doc_id = 0;
+ self->scores = (float*)MALLOCATE(self->num_kids * sizeof(float));
+
+ // Establish the state of all child matchers being past the current doc
+ // id, by invoking ORMatcher's Next() method.
+ ORMatcher_next((ORMatcher*)self);
+
+ return self;
+}
+
+void
+ORScorer_destroy(ORScorer *self)
+{
+ FREEMEM(self->scores);
+ SUPER_DESTROY(self, ORSCORER);
+}
+
+int32_t
+ORScorer_next(ORScorer *self)
+{
+ return S_advance_after_current(self);
+}
+
+static int32_t
+S_advance_after_current(ORScorer *self)
+{
+ float *const scores = self->scores;
+ Matcher *child;
+
+ // Get the top Matcher, or bail because there are no Matchers left.
+ if (!self->size) { return 0; }
+ else { child = self->top_hmd->matcher; }
+
+ // The top matcher will already be at the correct doc, so start there.
+ self->doc_id = self->top_hmd->doc;
+ scores[0] = Matcher_Score(child);
+ self->matching_kids = 1;
+
+ do {
+ // Attempt to advance past current doc.
+ int32_t top_doc_id = SI_top_next((ORMatcher*)self);
+ if (!top_doc_id) {
+ if (!self->size) {
+ break; // bail, no more to advance
+ }
+ }
+
+ if (top_doc_id != self->doc_id) {
+ // Bail, least doc in queue is now past the one we're scoring.
+ break;
+ }
+ else {
+ // Accumulate score.
+ child = self->top_hmd->matcher;
+ scores[ self->matching_kids ] = Matcher_Score(child);
+ self->matching_kids++;
+ }
+ } while (true);
+
+ return self->doc_id;
+}
+
+int32_t
+ORScorer_advance(ORScorer *self, int32_t target)
+{
+ // Return sentinel once exhausted.
+ if (!self->size) { return 0; }
+
+ // Succeed if we're already past and still on a valid doc.
+ if (target <= self->doc_id) {
+ return self->doc_id;
+ }
+
+ do {
+ // If all matchers are caught up, accumulate score and return.
+ if (self->top_hmd->doc >= target) {
+ return S_advance_after_current(self);
+ }
+
+ // Not caught up yet, so keep skipping matchers.
+ if ( !SI_top_advance((ORMatcher*)self, target) ) {
+ if (!self->size) { return 0; }
+ }
+ } while (true);
+}
+
+int32_t
+ORScorer_get_doc_id(ORScorer *self)
+{
+ return self->doc_id;
+}
+
+float
+ORScorer_score(ORScorer *self)
+{
+ float *const scores = self->scores;
+ float score = 0.0f;
+ uint32_t i;
+
+ // Accumulate score, then factor in coord bonus.
+ for (i = 0; i < self->matching_kids; i++) {
+ score += scores[i];
+ }
+ score *= self->coord_factors[ self->matching_kids ];
+
+ return score;
+}
+
+/* Copyright 2010 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
Propchange: lucene/lucy/trunk/core/Lucy/Search/ORMatcher.c
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/lucy/trunk/perl/t/518-or_scorer.t
URL: http://svn.apache.org/viewvc/lucene/lucy/trunk/perl/t/518-or_scorer.t?rev=956163&view=auto
==============================================================================
--- lucene/lucy/trunk/perl/t/518-or_scorer.t (added)
+++ lucene/lucy/trunk/perl/t/518-or_scorer.t Sat Jun 19 00:38:33 2010
@@ -0,0 +1,68 @@
+use strict;
+use warnings;
+use lib 'buildlib';
+
+use Test::More skip_all => 'Requires SortCollector';
+# use Test::More tests => 900;
+use Lucy::Test;
+use LucyX::Search::MockScorer;
+use Lucy::Test::TestUtils qw( modulo_set doc_ids_from_td_coll );
+
+my $sim = Lucy::Index::Similarity::LuceneSimilarity->new;
+
+for my $interval_a ( 1 .. 10 ) {
+ for my $interval_b ( 5 .. 10 ) {
+ check_scorer( $interval_a, $interval_b );
+ for my $interval_c ( 30, 75 ) {
+ check_scorer( $interval_a, $interval_b, $interval_c );
+ check_scorer( $interval_c, $interval_b, $interval_a );
+ }
+ }
+}
+
+sub check_scorer {
+ my @intervals = @_;
+ my @doc_id_arrays = map { modulo_set( $_, 100 ) } @intervals;
+ my $subscorers
+ = Lucy::Object::VArray->new( capacity => scalar @intervals );
+ for my $doc_id_array (@doc_id_arrays) {
+ my $mock = LucyX::Search::MockScorer->new(
+ doc_ids => $doc_id_array,
+ scores => [ (1) x scalar @$doc_id_array ],
+ );
+ $subscorers->push($mock);
+ }
+
+ my $or_scorer = Lucy::Search::ORScorer->new(
+ similarity => $sim,
+ children => $subscorers,
+ );
+ my $collector
+ = Lucy::Search::Collector::SortCollector->new( wanted => 100 );
+ $or_scorer->collect( collector => $collector );
+ my ( $got_by_score, $got_by_id ) = doc_ids_from_td_coll($collector);
+ my ( $expected_by_count, $expected_by_id )
+ = union_doc_id_sets(@doc_id_arrays);
+ is( scalar @$got_by_id,
+ scalar @$expected_by_id,
+ "total hits: @intervals"
+ );
+ is_deeply( $got_by_id, $expected_by_id, "got all docs: @intervals" );
+ is_deeply( $got_by_score, $expected_by_count,
+ "scores accumulated: @intervals" );
+}
+
+sub union_doc_id_sets {
+ my @arrays = @_;
+ my %scores;
+ for my $array (@arrays) {
+ $scores{$_} += 1 for @$array;
+ }
+ my @by_count_then_id = sort { $scores{$b} <=> $scores{$a} or $a <=> $b }
+ keys %scores;
+ my @by_id = sort { $a <=> $b } keys %scores;
+ return ( \@by_count_then_id, \@by_id );
+}
+
+# Trigger destruction.
+undef $sim;
Propchange: lucene/lucy/trunk/perl/t/518-or_scorer.t
------------------------------------------------------------------------------
svn:eol-style = native