You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucy.apache.org by ma...@apache.org on 2014/07/01 21:35:06 UTC

[1/8] git commit: refs/heads/master - Free sorted_ids in SortFieldWriter a little earlier

Repository: lucy
Updated Branches:
  refs/heads/master 4d500c56b -> eb10a8c68


Free sorted_ids in SortFieldWriter a little earlier


Project: http://git-wip-us.apache.org/repos/asf/lucy/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/dd1a1985
Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/dd1a1985
Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/dd1a1985

Branch: refs/heads/master
Commit: dd1a19852c7cb191c91599587d03a26484c5a6fa
Parents: 22663de
Author: Nick Wellnhofer <we...@aevum.de>
Authored: Thu Sep 26 02:28:04 2013 +0200
Committer: Marvin Humphrey <ma...@rectangular.com>
Committed: Wed Jun 18 21:22:59 2014 -0700

----------------------------------------------------------------------
 core/Lucy/Index/SortFieldWriter.c | 2 ++
 1 file changed, 2 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucy/blob/dd1a1985/core/Lucy/Index/SortFieldWriter.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Index/SortFieldWriter.c b/core/Lucy/Index/SortFieldWriter.c
index f34b68e..0d07ce5 100644
--- a/core/Lucy/Index/SortFieldWriter.c
+++ b/core/Lucy/Index/SortFieldWriter.c
@@ -471,6 +471,8 @@ SortFieldWriter_Refill_IMP(SortFieldWriter *self) {
     if (ivars->run_tick > ivars->run_max) {
         DECREF(ivars->sort_cache);
         ivars->sort_cache = NULL;
+        FREEMEM(ivars->sorted_ids);
+        ivars->sorted_ids = NULL;
     }
 
     return count;


[7/8] git commit: refs/heads/master - Port t/015-sort_external.t to C

Posted by ma...@apache.org.
Port t/015-sort_external.t to C

Original commit by Nick Wellnhofer, updated by Marvin Humphrey for
changes to the SortExternal API.


Project: http://git-wip-us.apache.org/repos/asf/lucy/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/73d6925e
Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/73d6925e
Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/73d6925e

Branch: refs/heads/master
Commit: 73d6925e8b6597a41939132feec7ebd2f8bf6f0c
Parents: be4e183
Author: Nick Wellnhofer <we...@aevum.de>
Authored: Thu Oct 17 18:56:40 2013 +0200
Committer: Marvin Humphrey <ma...@rectangular.com>
Committed: Tue Jul 1 12:09:49 2014 -0700

----------------------------------------------------------------------
 core/Lucy/Test.c                         |   2 +
 core/Lucy/Test/Util/TestSortExternal.c   | 287 ++++++++++++++++++++++++++
 core/Lucy/Test/Util/TestSortExternal.cfh |  29 +++
 core/Lucy/Util/BBSortEx.c                |  20 ++
 core/Lucy/Util/BBSortEx.cfh              |   6 +
 perl/buildlib/Lucy/Build/Binding/Util.pm |   9 -
 perl/lib/Lucy/Util/BBSortEx.pm           |  25 ---
 perl/t/core/015-sort_external.t          |  23 +++
 8 files changed, 367 insertions(+), 34 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucy/blob/73d6925e/core/Lucy/Test.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Test.c b/core/Lucy/Test.c
index 62c9bdf..196ae6a 100644
--- a/core/Lucy/Test.c
+++ b/core/Lucy/Test.c
@@ -81,6 +81,7 @@
 #include "Lucy/Test/Util/TestJson.h"
 #include "Lucy/Test/Util/TestMemoryPool.h"
 #include "Lucy/Test/Util/TestPriorityQueue.h"
+#include "Lucy/Test/Util/TestSortExternal.h"
 
 TestSuite*
 Test_create_test_suite() {
@@ -88,6 +89,7 @@ Test_create_test_suite() {
 
     TestSuite_Add_Batch(suite, (TestBatch*)TestPriQ_new());
     TestSuite_Add_Batch(suite, (TestBatch*)TestBitVector_new());
+    TestSuite_Add_Batch(suite, (TestBatch*)TestSortExternal_new());
     TestSuite_Add_Batch(suite, (TestBatch*)TestMemPool_new());
     TestSuite_Add_Batch(suite, (TestBatch*)TestIxFileNames_new());
     TestSuite_Add_Batch(suite, (TestBatch*)TestJson_new());

http://git-wip-us.apache.org/repos/asf/lucy/blob/73d6925e/core/Lucy/Test/Util/TestSortExternal.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Test/Util/TestSortExternal.c b/core/Lucy/Test/Util/TestSortExternal.c
new file mode 100644
index 0000000..2e04c83
--- /dev/null
+++ b/core/Lucy/Test/Util/TestSortExternal.c
@@ -0,0 +1,287 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#define TESTLUCY_USE_SHORT_NAMES
+#include "Lucy/Util/ToolSet.h"
+#include "Lucy/Test/Util/TestSortExternal.h"
+
+#include "Clownfish/TestHarness/TestBatchRunner.h"
+#include "Lucy/Util/BBSortEx.h"
+#include "Lucy/Util/SortExternal.h"
+
+static ByteBuf *a_bb;
+static ByteBuf *b_bb;
+static ByteBuf *c_bb;
+static ByteBuf *d_bb;
+static ByteBuf *x_bb;
+static ByteBuf *y_bb;
+static ByteBuf *z_bb;
+
+TestSortExternal*
+TestSortExternal_new() {
+    return (TestSortExternal*)VTable_Make_Obj(TESTSORTEXTERNAL);
+}
+
+static void
+S_init_bytebufs() {
+    a_bb = BB_new_bytes("a", 1);
+    b_bb = BB_new_bytes("b", 1);
+    c_bb = BB_new_bytes("c", 1);
+    d_bb = BB_new_bytes("d", 1);
+    x_bb = BB_new_bytes("x", 1);
+    y_bb = BB_new_bytes("y", 1);
+    z_bb = BB_new_bytes("z", 1);
+}
+
+static void
+S_destroy_bytebufs() {
+    DECREF(a_bb);
+    DECREF(b_bb);
+    DECREF(c_bb);
+    DECREF(d_bb);
+    DECREF(x_bb);
+    DECREF(y_bb);
+    DECREF(z_bb);
+}
+
+static void
+test_bbsortex(TestBatchRunner *runner) {
+    BBSortEx *sortex = BBSortEx_new(4, NULL);
+
+    BBSortEx_Feed(sortex, INCREF(c_bb));
+    TEST_INT_EQ(runner, BBSortEx_Buffer_Count(sortex), 1,
+                "feed elem into cache");
+
+    BBSortEx_Feed(sortex, INCREF(b_bb));
+    BBSortEx_Feed(sortex, INCREF(d_bb));
+    BBSortEx_Sort_Buffer(sortex);
+
+    {
+        VArray *cache  = BBSortEx_Peek_Cache(sortex);
+        VArray *wanted = VA_new(3);
+        VA_Push(wanted, INCREF(b_bb));
+        VA_Push(wanted, INCREF(c_bb));
+        VA_Push(wanted, INCREF(d_bb));
+        TEST_TRUE(runner, VA_Equals(cache, (Obj*)wanted), "sort cache");
+        DECREF(wanted);
+        DECREF(cache);
+    }
+
+    BBSortEx_Feed(sortex, INCREF(a_bb));
+    TEST_INT_EQ(runner, BBSortEx_Buffer_Count(sortex), 0,
+                "cache flushed automatically when mem_thresh crossed");
+    TEST_INT_EQ(runner, BBSortEx_Get_Num_Runs(sortex), 1, "run added");
+
+    VArray *external = VA_new(3);
+    VA_Push(external, INCREF(x_bb));
+    VA_Push(external, INCREF(y_bb));
+    VA_Push(external, INCREF(z_bb));
+    BBSortEx *run = BBSortEx_new(0x1000000, external);
+    BBSortEx_Add_Run(sortex, (SortExternal*)run);
+    BBSortEx_Flip(sortex);
+
+    {
+        VArray *got = VA_new(7);
+        Obj *object;
+        while (NULL != (object = BBSortEx_Fetch(sortex))) {
+            VA_Push(got, object);
+        }
+
+        VArray *wanted = VA_new(7);
+        VA_Push(wanted, INCREF(a_bb));
+        VA_Push(wanted, INCREF(b_bb));
+        VA_Push(wanted, INCREF(c_bb));
+        VA_Push(wanted, INCREF(d_bb));
+        VA_Push(wanted, INCREF(x_bb));
+        VA_Push(wanted, INCREF(y_bb));
+        VA_Push(wanted, INCREF(z_bb));
+
+        TEST_TRUE(runner, VA_Equals(got, (Obj*)wanted), "Add_Run");
+
+        DECREF(wanted);
+        DECREF(got);
+    }
+
+    DECREF(external);
+    DECREF(sortex);
+}
+
+static void
+test_clear_buffer(TestBatchRunner *runner) {
+    BBSortEx *sortex = BBSortEx_new(4, NULL);
+
+    BBSortEx_Feed(sortex, INCREF(c_bb));
+    BBSortEx_Clear_Buffer(sortex);
+    TEST_INT_EQ(runner, BBSortEx_Buffer_Count(sortex), 0, "Clear_Buffer");
+
+    BBSortEx_Feed(sortex, INCREF(b_bb));
+    BBSortEx_Feed(sortex, INCREF(a_bb));
+    BBSortEx_Flush(sortex);
+    BBSortEx_Flip(sortex);
+    Obj *object = BBSortEx_Peek(sortex);
+    TEST_TRUE(runner, BB_Equals(a_bb, object), "Peek");
+
+    VArray *got = VA_new(2);
+    while (NULL != (object = BBSortEx_Fetch(sortex))) {
+        VA_Push(got, object);
+    }
+    VArray *wanted = VA_new(2);
+    VA_Push(wanted, INCREF(a_bb));
+    VA_Push(wanted, INCREF(b_bb));
+    TEST_TRUE(runner, VA_Equals(got, (Obj*)wanted),
+              "elements cleared via Clear_Buffer truly cleared");
+
+    DECREF(wanted);
+    DECREF(got);
+    DECREF(sortex);
+}
+
+static void
+S_test_sort(TestBatchRunner *runner, VArray *bytebufs, uint32_t mem_thresh,
+            const char *test_name) {
+    uint32_t   size     = VA_Get_Size(bytebufs);
+    BBSortEx  *sortex   = BBSortEx_new(mem_thresh, NULL);
+    ByteBuf  **shuffled = (ByteBuf**)MALLOCATE(size * sizeof(ByteBuf*));
+
+    for (int i = 0; i < size; ++i) {
+        shuffled[i] = (ByteBuf*)CERTIFY(VA_Fetch(bytebufs, i), BYTEBUF);
+    }
+    for (int i = size - 1; i > 0; --i) {
+        int shuffle_pos = rand() % (i + 1);
+        ByteBuf *temp = shuffled[shuffle_pos];
+        shuffled[shuffle_pos] = shuffled[i];
+        shuffled[i] = temp;
+    }
+    for (int i = 0; i < size; ++i) {
+        BBSortEx_Feed(sortex, INCREF(shuffled[i]));
+    }
+
+    BBSortEx_Flip(sortex);
+    VArray *got = VA_new(size);
+    Obj *object;
+    while (NULL != (object = BBSortEx_Fetch(sortex))) {
+        VA_Push(got, object);
+    }
+    TEST_TRUE(runner, VA_Equals(got, (Obj*)bytebufs), test_name);
+
+    FREEMEM(shuffled);
+    DECREF(got);
+    DECREF(sortex);
+}
+
+static void
+S_test_sort_letters(TestBatchRunner *runner, const char *letters,
+                    uint32_t mem_thresh, const char *test_name) {
+    size_t  num_letters = strlen(letters);
+    VArray *bytebufs    = VA_new(num_letters);
+
+    for (int i = 0; i < num_letters; ++i) {
+        char ch = letters[i];
+        size_t size = ch == '_' ? 0 : 1;
+        ByteBuf *bytebuf = BB_new_bytes(&ch, size);
+        VA_Push(bytebufs, (Obj*)bytebuf);
+    }
+
+    S_test_sort(runner, bytebufs, mem_thresh, test_name);
+
+    DECREF(bytebufs);
+}
+
+static void
+test_sort_letters(TestBatchRunner *runner) {
+    S_test_sort_letters(runner, "abcdefghijklmnopqrstuvwxyz", 0x1000000,
+                        "sort letters");
+    S_test_sort_letters(runner, "aaabcdxxxxxxyy", 0x1000000,
+                        "sort repeated letters");
+    S_test_sort_letters(runner, "__abcdefghijklmnopqrstuvwxyz", 0x1000000,
+                        "sort letters and empty strings");
+    S_test_sort_letters(runner, "abcdefghijklmnopqrstuvwxyz", 30,
+                        "... with an absurdly low mem_thresh");
+    S_test_sort_letters(runner, "abcdefghijklmnopqrstuvwxyz", 1,
+                        "... with an even lower mem_thresh");
+}
+
+static void
+test_sort_nothing(TestBatchRunner *runner) {
+    BBSortEx *sortex = BBSortEx_new(0x1000000, NULL);
+    BBSortEx_Flip(sortex);
+    TEST_TRUE(runner, BBSortEx_Fetch(sortex) == NULL,
+              "Sorting nothing returns undef");
+    DECREF(sortex);
+}
+
+static void
+test_sort_packed_ints(TestBatchRunner *runner) {
+    size_t  num_ints = 11001;
+    VArray *bytebufs = VA_new(num_ints);
+
+    for (uint32_t i = 0; i < num_ints; ++i) {
+        char buf[4];
+        buf[0] = i >> 24;
+        buf[1] = i >> 16;
+        buf[2] = i >> 8;
+        buf[3] = i;
+        ByteBuf *bytebuf = BB_new_bytes(&buf, 4);
+        VA_Push(bytebufs, (Obj*)bytebuf);
+    }
+
+    S_test_sort(runner, bytebufs, 5000, "Sorting packed integers...");
+
+    DECREF(bytebufs);
+}
+
+static void
+test_sort_random_strings(TestBatchRunner *runner) {
+    size_t  num_strings = 1001;
+    VArray *bytebufs    = VA_new(num_strings);
+
+    for (uint32_t i = 0; i < num_strings; ++i) {
+        char buf[1201];
+        int size = rand() % 1200 + 1;
+        for (int i = 0; i < size; ++i) {
+            buf[i] = rand();
+        }
+        ByteBuf *bytebuf = BB_new_bytes(&buf, size);
+        VA_Push(bytebufs, (Obj*)bytebuf);
+    }
+
+    VA_Sort(bytebufs, NULL, NULL);
+    S_test_sort(runner, bytebufs, 15000,
+                "Random binary strings of random length");
+
+    DECREF(bytebufs);
+}
+
+void
+TestSortExternal_Run_IMP(TestSortExternal *self, TestBatchRunner *runner) {
+    TestBatchRunner_Plan(runner, (TestBatch*)self, 16);
+
+    srand((unsigned int)time((time_t*)NULL));
+    S_init_bytebufs();
+    test_bbsortex(runner);
+    test_clear_buffer(runner);
+    test_sort_letters(runner);
+    test_sort_nothing(runner);
+    test_sort_packed_ints(runner);
+    test_sort_random_strings(runner);
+    S_destroy_bytebufs();
+}
+
+

http://git-wip-us.apache.org/repos/asf/lucy/blob/73d6925e/core/Lucy/Test/Util/TestSortExternal.cfh
----------------------------------------------------------------------
diff --git a/core/Lucy/Test/Util/TestSortExternal.cfh b/core/Lucy/Test/Util/TestSortExternal.cfh
new file mode 100644
index 0000000..028b322
--- /dev/null
+++ b/core/Lucy/Test/Util/TestSortExternal.cfh
@@ -0,0 +1,29 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+parcel TestLucy;
+
+class Lucy::Test::Util::TestSortExternal
+    inherits Clownfish::TestHarness::TestBatch {
+
+    inert incremented TestSortExternal*
+    new();
+
+    void
+    Run(TestSortExternal *self, TestBatchRunner *runner);
+}
+
+

http://git-wip-us.apache.org/repos/asf/lucy/blob/73d6925e/core/Lucy/Util/BBSortEx.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Util/BBSortEx.c b/core/Lucy/Util/BBSortEx.c
index ff96c73..c9201bb 100644
--- a/core/Lucy/Util/BBSortEx.c
+++ b/core/Lucy/Util/BBSortEx.c
@@ -169,4 +169,24 @@ BBSortEx_Compare_IMP(BBSortEx *self, void *va, void *vb) {
     return BB_compare((ByteBuf**)va, (ByteBuf**)vb);
 }
 
+VArray*
+BBSortEx_Peek_Cache_IMP(BBSortEx *self) {
+    BBSortExIVARS *const ivars = BBSortEx_IVARS(self);
+    uint32_t   count  = ivars->buf_max - ivars->buf_tick;
+    Obj      **buffer = ivars->buffer;
+    VArray    *retval = VA_new(count);
+
+    for (uint32_t i = ivars->buf_tick; i < ivars->buf_max; ++i) {
+        VA_Push(retval, INCREF(buffer[i]));
+    }
+
+    return retval;
+}
+
+uint32_t
+BBSortEx_Get_Num_Runs_IMP(BBSortEx *self) {
+    BBSortExIVARS *const ivars = BBSortEx_IVARS(self);
+    return VA_Get_Size(ivars->runs);
+}
+
 

http://git-wip-us.apache.org/repos/asf/lucy/blob/73d6925e/core/Lucy/Util/BBSortEx.cfh
----------------------------------------------------------------------
diff --git a/core/Lucy/Util/BBSortEx.cfh b/core/Lucy/Util/BBSortEx.cfh
index 8ef64c8..6fd8656 100644
--- a/core/Lucy/Util/BBSortEx.cfh
+++ b/core/Lucy/Util/BBSortEx.cfh
@@ -51,6 +51,12 @@ class Lucy::Util::BBSortEx
     int
     Compare(BBSortEx *self, void *va, void *vb);
 
+    incremented VArray*
+    Peek_Cache(BBSortEx *self);
+
+    uint32_t
+    Get_Num_Runs(BBSortEx *self);
+
     public void
     Destroy(BBSortEx *self);
 }

http://git-wip-us.apache.org/repos/asf/lucy/blob/73d6925e/perl/buildlib/Lucy/Build/Binding/Util.pm
----------------------------------------------------------------------
diff --git a/perl/buildlib/Lucy/Build/Binding/Util.pm b/perl/buildlib/Lucy/Build/Binding/Util.pm
index 4ef7dbe..1438011 100644
--- a/perl/buildlib/Lucy/Build/Binding/Util.pm
+++ b/perl/buildlib/Lucy/Build/Binding/Util.pm
@@ -21,21 +21,12 @@ $VERSION = eval $VERSION;
 
 sub bind_all {
     my $class = shift;
-    $class->bind_bbsortex;
     $class->bind_debug;
     $class->bind_freezer;
     $class->bind_indexfilenames;
     $class->bind_sortexternal;
 }
 
-sub bind_bbsortex {
-    my $binding = Clownfish::CFC::Binding::Perl::Class->new(
-        parcel     => "Lucy",
-        class_name => "Lucy::Util::BBSortEx",
-    );
-    Clownfish::CFC::Binding::Perl::Class->register($binding);
-}
-
 sub bind_debug {
     my $xs_code = <<'END_XS_CODE';
 MODULE = Lucy   PACKAGE = Lucy::Util::Debug

http://git-wip-us.apache.org/repos/asf/lucy/blob/73d6925e/perl/lib/Lucy/Util/BBSortEx.pm
----------------------------------------------------------------------
diff --git a/perl/lib/Lucy/Util/BBSortEx.pm b/perl/lib/Lucy/Util/BBSortEx.pm
deleted file mode 100644
index cbba688..0000000
--- a/perl/lib/Lucy/Util/BBSortEx.pm
+++ /dev/null
@@ -1,25 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-package Lucy::Util::BBSortEx;
-use Lucy;
-our $VERSION = '0.003000';
-$VERSION = eval $VERSION;
-
-1;
-
-__END__
-
-

http://git-wip-us.apache.org/repos/asf/lucy/blob/73d6925e/perl/t/core/015-sort_external.t
----------------------------------------------------------------------
diff --git a/perl/t/core/015-sort_external.t b/perl/t/core/015-sort_external.t
new file mode 100644
index 0000000..6d966bb
--- /dev/null
+++ b/perl/t/core/015-sort_external.t
@@ -0,0 +1,23 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+use strict;
+use warnings;
+
+use Lucy::Test;
+my $success = Lucy::Test::run_tests("Lucy::Test::Util::TestSortExternal");
+
+exit($success ? 0 : 1);
+


[4/8] git commit: refs/heads/master - Don't sort documents twice in SortFieldWriter#Refill

Posted by ma...@apache.org.
Don't sort documents twice in SortFieldWriter#Refill

The doc_ids are already sorted in S_lazy_init_sorted_ids. We only have
to make sure that S_lazy_init_sorted_ids uses the doc_id as secondary
sort key.


Project: http://git-wip-us.apache.org/repos/asf/lucy/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/efbc9ace
Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/efbc9ace
Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/efbc9ace

Branch: refs/heads/master
Commit: efbc9acef7a97463b9d98404e6add8362fcf5e4c
Parents: 4d500c5
Author: Nick Wellnhofer <we...@aevum.de>
Authored: Thu Sep 26 01:25:52 2013 +0200
Committer: Marvin Humphrey <ma...@rectangular.com>
Committed: Wed Jun 18 21:22:59 2014 -0700

----------------------------------------------------------------------
 core/Lucy/Index/SortFieldWriter.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucy/blob/efbc9ace/core/Lucy/Index/SortFieldWriter.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Index/SortFieldWriter.c b/core/Lucy/Index/SortFieldWriter.c
index 381f9ad..9fe71c0 100644
--- a/core/Lucy/Index/SortFieldWriter.c
+++ b/core/Lucy/Index/SortFieldWriter.c
@@ -345,7 +345,7 @@ SortFieldWriter_Compare_IMP(SortFieldWriter *self, void *va, void *vb) {
     SFWriterElemIVARS *b = SFWriterElem_IVARS(*(SFWriterElem**)vb);
     int32_t comparison
         = FType_null_back_compare_values(ivars->type, a->value, b->value);
-    if (comparison == 0) { comparison = b->doc_id - a->doc_id; }
+    if (comparison == 0) { comparison = a->doc_id - b->doc_id; }
     return comparison;
 }
 
@@ -356,7 +356,9 @@ S_compare_doc_ids_by_ord_rev(void *context, const void *va, const void *vb) {
     int32_t b = *(int32_t*)vb;
     int32_t ord_a = SortCache_Ordinal(sort_cache, a);
     int32_t ord_b = SortCache_Ordinal(sort_cache, b);
-    return ord_a - ord_b;
+    int32_t comparison = ord_a - ord_b;
+    if (comparison == 0) { comparison = a - b; }
+    return comparison;
 }
 
 static void
@@ -480,7 +482,6 @@ SortFieldWriter_Refill_IMP(SortFieldWriter *self) {
         ivars->run_tick++;
     }
     ivars->run_ord++;
-    SortFieldWriter_Sort_Buffer(self);
 
     if (ivars->run_ord >= ivars->run_cardinality) {
         DECREF(ivars->sort_cache);


[3/8] git commit: refs/heads/master - Make SortFieldWriter#Refill obey the memory limit

Posted by ma...@apache.org.
Make SortFieldWriter#Refill obey the memory limit

The old logic was broken.


Project: http://git-wip-us.apache.org/repos/asf/lucy/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/0cf86d1d
Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/0cf86d1d
Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/0cf86d1d

Branch: refs/heads/master
Commit: 0cf86d1d79cc91723762f6abd34c034a25cb285b
Parents: efbc9ac
Author: Nick Wellnhofer <we...@aevum.de>
Authored: Thu Sep 26 02:04:42 2013 +0200
Committer: Marvin Humphrey <ma...@rectangular.com>
Committed: Wed Jun 18 21:22:59 2014 -0700

----------------------------------------------------------------------
 core/Lucy/Index/SortFieldWriter.c   | 20 +++-----------------
 core/Lucy/Index/SortFieldWriter.cfh |  1 -
 2 files changed, 3 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucy/blob/0cf86d1d/core/Lucy/Index/SortFieldWriter.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Index/SortFieldWriter.c b/core/Lucy/Index/SortFieldWriter.c
index 9fe71c0..8a372a3 100644
--- a/core/Lucy/Index/SortFieldWriter.c
+++ b/core/Lucy/Index/SortFieldWriter.c
@@ -92,7 +92,6 @@ SortFieldWriter_init(SortFieldWriter *self, Schema *schema,
     ivars->sort_cache      = NULL;
     ivars->doc_map         = NULL;
     ivars->sorted_ids      = NULL;
-    ivars->run_ord         = 0;
     ivars->run_tick        = 0;
     ivars->ord_width       = 0;
     ivars->last_val        = NULL;
@@ -450,19 +449,10 @@ SortFieldWriter_Refill_IMP(SortFieldWriter *self) {
     I32Array *const  doc_map    = ivars->doc_map;
     SortCache *const sort_cache = ivars->sort_cache;
 
-    while (ivars->run_ord < ivars->run_cardinality
+    uint32_t count = 0;
+    while (ivars->run_tick <= ivars->run_max
            && MemPool_Get_Consumed(ivars->mem_pool) < ivars->mem_thresh
           ) {
-        Obj *val = SortCache_Value(sort_cache, ivars->run_ord);
-        if (val) {
-            Hash_Store(uniq_vals, val, (Obj*)CFISH_TRUE);
-            DECREF(val);
-            break;
-        }
-        ivars->run_ord++;
-    }
-    uint32_t count = 0;
-    while (ivars->run_tick <= ivars->run_max) {
         int32_t raw_doc_id = ivars->sorted_ids[ivars->run_tick];
         int32_t ord = SortCache_Ordinal(sort_cache, raw_doc_id);
         if (ord != null_ord) {
@@ -476,14 +466,10 @@ SortFieldWriter_Refill_IMP(SortFieldWriter *self) {
                 DECREF(val);
             }
         }
-        else if (ord > ivars->run_ord) {
-            break;
-        }
         ivars->run_tick++;
     }
-    ivars->run_ord++;
 
-    if (ivars->run_ord >= ivars->run_cardinality) {
+    if (ivars->run_tick > ivars->run_max) {
         DECREF(ivars->sort_cache);
         ivars->sort_cache = NULL;
     }

http://git-wip-us.apache.org/repos/asf/lucy/blob/0cf86d1d/core/Lucy/Index/SortFieldWriter.cfh
----------------------------------------------------------------------
diff --git a/core/Lucy/Index/SortFieldWriter.cfh b/core/Lucy/Index/SortFieldWriter.cfh
index 8f749cc..0128c61 100644
--- a/core/Lucy/Index/SortFieldWriter.cfh
+++ b/core/Lucy/Index/SortFieldWriter.cfh
@@ -48,7 +48,6 @@ class Lucy::Index::SortFieldWriter
     int32_t     run_max;
     bool        var_width;
     int32_t    *sorted_ids;
-    int32_t     run_ord;
     int32_t     run_tick;
     int32_t     ord_width;
     Obj        *last_val;


[6/8] git commit: refs/heads/master - Convert t/221-sort_writer.t to C

Posted by ma...@apache.org.
Convert t/221-sort_writer.t to C

Original commit by Nick Wellnhofer, amended by Marvin Humphrey to
replace "cnick" with "nickname".


Project: http://git-wip-us.apache.org/repos/asf/lucy/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/be4e1833
Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/be4e1833
Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/be4e1833

Branch: refs/heads/master
Commit: be4e1833868fd8cca5cba9883d1bcd1cab32f2b4
Parents: d96ef18
Author: Nick Wellnhofer <we...@aevum.de>
Authored: Sat Sep 28 15:56:26 2013 +0200
Committer: Marvin Humphrey <ma...@rectangular.com>
Committed: Tue Jul 1 12:09:44 2014 -0700

----------------------------------------------------------------------
 core/Lucy/Test.c                        |   2 +
 core/Lucy/Test/Index/TestSortWriter.c   | 315 +++++++++++++++++++++++++++
 core/Lucy/Test/Index/TestSortWriter.cfh |  43 ++++
 perl/t/221-sort_writer.t                | 174 ---------------
 perl/t/core/224-sort_writer.t           |  23 ++
 5 files changed, 383 insertions(+), 174 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucy/blob/be4e1833/core/Lucy/Test.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Test.c b/core/Lucy/Test.c
index 5923e98..62c9bdf 100644
--- a/core/Lucy/Test.c
+++ b/core/Lucy/Test.c
@@ -40,6 +40,7 @@
 #include "Lucy/Test/Index/TestSegWriter.h"
 #include "Lucy/Test/Index/TestSegment.h"
 #include "Lucy/Test/Index/TestSnapshot.h"
+#include "Lucy/Test/Index/TestSortWriter.h"
 #include "Lucy/Test/Index/TestTermInfo.h"
 #include "Lucy/Test/Object/TestBitVector.h"
 #include "Lucy/Test/Object/TestI32Array.h"
@@ -121,6 +122,7 @@ Test_create_test_suite() {
     TestSuite_Add_Batch(suite, (TestBatch*)TestHLWriter_new());
     TestSuite_Add_Batch(suite, (TestBatch*)TestPListWriter_new());
     TestSuite_Add_Batch(suite, (TestBatch*)TestSegWriter_new());
+    TestSuite_Add_Batch(suite, (TestBatch*)TestSortWriter_new());
     TestSuite_Add_Batch(suite, (TestBatch*)TestPolyReader_new());
     TestSuite_Add_Batch(suite, (TestBatch*)TestFullTextType_new());
     TestSuite_Add_Batch(suite, (TestBatch*)TestBlobType_new());

http://git-wip-us.apache.org/repos/asf/lucy/blob/be4e1833/core/Lucy/Test/Index/TestSortWriter.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Test/Index/TestSortWriter.c b/core/Lucy/Test/Index/TestSortWriter.c
new file mode 100644
index 0000000..761572f
--- /dev/null
+++ b/core/Lucy/Test/Index/TestSortWriter.c
@@ -0,0 +1,315 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define TESTLUCY_USE_SHORT_NAMES
+#include "Lucy/Util/ToolSet.h"
+
+#include "Lucy/Test/Index/TestSortWriter.h"
+
+#include "Clownfish/TestHarness/TestBatchRunner.h"
+#include "Lucy/Analysis/StandardTokenizer.h"
+#include "Lucy/Document/Doc.h"
+#include "Lucy/Document/HitDoc.h"
+#include "Lucy/Index/DocReader.h"
+#include "Lucy/Index/Indexer.h"
+#include "Lucy/Index/IndexManager.h"
+#include "Lucy/Index/PolyReader.h"
+#include "Lucy/Index/Segment.h"
+#include "Lucy/Index/SegReader.h"
+#include "Lucy/Index/SortCache.h"
+#include "Lucy/Index/SortReader.h"
+#include "Lucy/Index/SortWriter.h"
+#include "Lucy/Plan/FullTextType.h"
+#include "Lucy/Plan/Schema.h"
+#include "Lucy/Plan/StringType.h"
+#include "Lucy/Store/RAMFolder.h"
+
+static String *name_str;
+static String *speed_str;
+static String *weight_str;
+static String *home_str;
+static String *cat_str;
+static String *wheels_str;
+static String *unused_str;
+static String *nope_str;
+
+TestSortWriter*
+TestSortWriter_new() {
+    return (TestSortWriter*)VTable_Make_Obj(TESTSORTWRITER);
+}
+
+static void
+S_init_strings() {
+    name_str   = Str_newf("name");
+    speed_str  = Str_newf("speed");
+    weight_str = Str_newf("weight");
+    home_str   = Str_newf("home");
+    cat_str    = Str_newf("cat");
+    wheels_str = Str_newf("wheels");
+    unused_str = Str_newf("unused");
+    nope_str   = Str_newf("nope");
+}
+
+static void
+S_destroy_strings() {
+    DECREF(name_str);
+    DECREF(speed_str);
+    DECREF(weight_str);
+    DECREF(home_str);
+    DECREF(cat_str);
+    DECREF(wheels_str);
+    DECREF(unused_str);
+    DECREF(nope_str);
+}
+
+static Schema*
+S_create_schema() {
+    Schema *schema = Schema_new();
+
+    StandardTokenizer *tokenizer = StandardTokenizer_new();
+    FullTextType *full_text_type = FullTextType_new((Analyzer*)tokenizer);
+    FullTextType_Set_Sortable(full_text_type, true);
+
+    StringType *string_type = StringType_new();
+    StringType_Set_Sortable(string_type, true);
+
+    StringType *unsortable = StringType_new();
+
+    Schema_Spec_Field(schema, name_str,   (FieldType*)full_text_type);
+    Schema_Spec_Field(schema, speed_str,  (FieldType*)string_type);
+    Schema_Spec_Field(schema, weight_str, (FieldType*)string_type);
+    Schema_Spec_Field(schema, home_str,   (FieldType*)string_type);
+    Schema_Spec_Field(schema, cat_str,    (FieldType*)string_type);
+    Schema_Spec_Field(schema, wheels_str, (FieldType*)string_type);
+    Schema_Spec_Field(schema, unused_str, (FieldType*)string_type);
+    Schema_Spec_Field(schema, nope_str,   (FieldType*)unsortable);
+
+    DECREF(unsortable);
+    DECREF(string_type);
+    DECREF(full_text_type);
+    DECREF(tokenizer);
+
+    return schema;
+}
+
+static void
+S_store_field(Doc *doc, String *field, const char *value) {
+    if (value) {
+        StackString *string = SSTR_WRAP_UTF8(value, strlen(value));
+        Doc_Store(doc, field, (Obj*)string);
+    }
+}
+
+static void
+S_add_doc(Indexer *indexer, const char *name, const char *speed,
+              const char *weight, const char *home, const char *wheels,
+              const char *nope) {
+    Doc *doc   = Doc_new(NULL, 0);
+
+    S_store_field(doc, name_str,   name);
+    S_store_field(doc, speed_str,  speed);
+    S_store_field(doc, weight_str, weight);
+    S_store_field(doc, home_str,   home);
+    S_store_field(doc, cat_str,    "vehicle");
+    S_store_field(doc, wheels_str, wheels);
+    S_store_field(doc, nope_str,   nope);
+
+    Indexer_Add_Doc(indexer, doc, 1.0f);
+
+    DECREF(doc);
+}
+
+static void
+S_test_sort_cache(TestBatchRunner *runner, RAMFolder *folder,
+                  SegReader *seg_reader, const char *gen, bool is_used,
+                  String *field) {
+    Segment *segment   = SegReader_Get_Segment(seg_reader);
+    int32_t  field_num = Seg_Field_Num(segment, field);
+    String  *filename  = Str_newf("seg_%s/sort-%i32.ord", gen, field_num);
+    if (is_used) {
+        TEST_TRUE(runner, RAMFolder_Exists(folder, filename),
+                  "sort files written for %s", Str_Get_Ptr8(field));
+    }
+    else {
+        TEST_TRUE(runner, !RAMFolder_Exists(folder, filename),
+                  "no sort files written for %s", Str_Get_Ptr8(field));
+    }
+    DECREF(filename);
+
+    if (!is_used) { return; }
+
+    SortReader *sort_reader
+        = (SortReader*)SegReader_Obtain(seg_reader,
+                                        VTable_Get_Name(SORTREADER));
+    DocReader *doc_reader
+        = (DocReader*)SegReader_Obtain(seg_reader, VTable_Get_Name(DOCREADER));
+    SortCache *sort_cache
+        = SortReader_Fetch_Sort_Cache(sort_reader, field);
+
+    int32_t doc_max = SegReader_Doc_Max(seg_reader);
+    for (int32_t doc_id = 1; doc_id <= doc_max; ++doc_id) {
+        int32_t  ord         = SortCache_Ordinal(sort_cache, doc_id);
+        Obj     *cache_value = SortCache_Value(sort_cache, ord);
+        HitDoc  *doc         = DocReader_Fetch_Doc(doc_reader, doc_id);
+        Obj     *doc_value   = HitDoc_Extract(doc, field);
+
+        bool is_equal;
+        if (cache_value == NULL || doc_value == NULL) {
+            is_equal = (cache_value == doc_value);
+        }
+        else {
+            is_equal = Obj_Equals(cache_value, doc_value);
+        }
+        TEST_TRUE(runner, is_equal, "correct cached value field %s doc %d",
+                  Str_Get_Ptr8(field), doc_id);
+
+        DECREF(doc_value);
+        DECREF(doc);
+        DECREF(cache_value);
+    }
+}
+
+static void
+test_sort_writer(TestBatchRunner *runner) {
+    Schema    *schema  = S_create_schema();
+    RAMFolder *folder  = RAMFolder_new(NULL);
+
+    {
+        // Add vehicles.
+        Indexer *indexer = Indexer_new(schema, (Obj*)folder, NULL, 0);
+
+        S_add_doc(indexer, "airplane", "0200", "8000", "air", "3", "nyet");
+        S_add_doc(indexer, "bike", "0015", "0025", "land", "2", NULL);
+        S_add_doc(indexer, "car", "0070", "3000", "land",  "4", NULL);
+
+        Indexer_Commit(indexer);
+        DECREF(indexer);
+    }
+
+    {
+        PolyReader *poly_reader = PolyReader_open((Obj*)folder, NULL, NULL);
+        VArray     *seg_readers = PolyReader_Get_Seg_Readers(poly_reader);
+        SegReader  *seg_reader  = (SegReader*)VA_Fetch(seg_readers, 0);
+
+        S_test_sort_cache(runner, folder, seg_reader, "1", true,  name_str);
+        S_test_sort_cache(runner, folder, seg_reader, "1", true,  speed_str);
+        S_test_sort_cache(runner, folder, seg_reader, "1", true,  weight_str);
+        S_test_sort_cache(runner, folder, seg_reader, "1", true,  home_str);
+        S_test_sort_cache(runner, folder, seg_reader, "1", true,  cat_str);
+        S_test_sort_cache(runner, folder, seg_reader, "1", true,  wheels_str);
+        S_test_sort_cache(runner, folder, seg_reader, "1", false, unused_str);
+        S_test_sort_cache(runner, folder, seg_reader, "1", false, nope_str);
+
+        DECREF(poly_reader);
+    }
+
+    {
+        // Add a second segment.
+        NonMergingIndexManager *manager = NMIxManager_new();
+        Indexer *indexer
+            = Indexer_new(schema, (Obj*)folder, (IndexManager*)manager, 0);
+        // no "wheels" field -- test NULL/undef
+        S_add_doc(indexer, "dirigible", "0040", "0000", "air", NULL, NULL);
+        Indexer_Commit(indexer);
+        DECREF(indexer);
+        DECREF(manager);
+    }
+
+    {
+        // Consolidate everything, to test merging.
+        Indexer *indexer = Indexer_new(schema, (Obj*)folder, NULL, 0);
+        StackString *bike_str = SSTR_WRAP_UTF8("bike", 4);
+        Indexer_Delete_By_Term(indexer, name_str, (Obj*)bike_str);
+        // no "wheels" field -- test NULL/undef
+        S_add_doc(indexer, "elephant", "0020", "6000", "land", NULL, NULL);
+        Indexer_Optimize(indexer);
+        Indexer_Commit(indexer);
+        DECREF(indexer);
+    }
+
+    {
+        VArray *filenames = RAMFolder_List_R(folder, NULL);
+        int num_old_seg_files = 0;
+        for (uint32_t i = 0, size = VA_Get_Size(filenames); i < size; ++i) {
+            String *filename = (String*)VA_Fetch(filenames, i);
+            if (Str_Find_Utf8(filename, "seg_1", 5) >= 0
+                || Str_Find_Utf8(filename, "seg_2", 5) >= 0
+               ) {
+                ++num_old_seg_files;
+            }
+        }
+        TEST_INT_EQ(runner, num_old_seg_files, 0,
+                    "all files from earlier segments zapped");
+        DECREF(filenames);
+    }
+
+    {
+        PolyReader *poly_reader = PolyReader_open((Obj*)folder, NULL, NULL);
+        VArray     *seg_readers = PolyReader_Get_Seg_Readers(poly_reader);
+        SegReader  *seg_reader  = (SegReader*)VA_Fetch(seg_readers, 0);
+
+        S_test_sort_cache(runner, folder, seg_reader, "3", true, name_str);
+        S_test_sort_cache(runner, folder, seg_reader, "3", true, speed_str);
+        S_test_sort_cache(runner, folder, seg_reader, "3", true, weight_str);
+        S_test_sort_cache(runner, folder, seg_reader, "3", true, home_str);
+        S_test_sort_cache(runner, folder, seg_reader, "3", true, cat_str);
+        S_test_sort_cache(runner, folder, seg_reader, "3", true, wheels_str);
+
+        DECREF(poly_reader);
+    }
+
+    DECREF(folder);
+    DECREF(schema);
+}
+
+void
+TestSortWriter_Run_IMP(TestSortWriter *self, TestBatchRunner *runner) {
+    TestBatchRunner_Plan(runner, (TestBatch*)self, 57);
+
+    // Force frequent flushes.
+    SortWriter_set_default_mem_thresh(100);
+
+    S_init_strings();
+    test_sort_writer(runner);
+    S_destroy_strings();
+}
+
+NonMergingIndexManager*
+NMIxManager_new() {
+    NonMergingIndexManager *self
+        = (NonMergingIndexManager*)VTable_Make_Obj(NONMERGINGINDEXMANAGER);
+    return NMIxManager_init(self);
+}
+
+NonMergingIndexManager*
+NMIxManager_init(NonMergingIndexManager *self) {
+    IxManager_init((IndexManager*)self, NULL, NULL);
+    return self;
+}
+
+VArray*
+NMIxManager_Recycle_IMP(NonMergingIndexManager *self, PolyReader *reader,
+                        lucy_DeletionsWriter *del_writer, int64_t cutoff,
+                        bool optimize) {
+    UNUSED_VAR(self);
+    UNUSED_VAR(reader);
+    UNUSED_VAR(del_writer);
+    UNUSED_VAR(cutoff);
+    UNUSED_VAR(optimize);
+    return VA_new(0);
+}
+
+

http://git-wip-us.apache.org/repos/asf/lucy/blob/be4e1833/core/Lucy/Test/Index/TestSortWriter.cfh
----------------------------------------------------------------------
diff --git a/core/Lucy/Test/Index/TestSortWriter.cfh b/core/Lucy/Test/Index/TestSortWriter.cfh
new file mode 100644
index 0000000..98e3ec6
--- /dev/null
+++ b/core/Lucy/Test/Index/TestSortWriter.cfh
@@ -0,0 +1,43 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+parcel TestLucy;
+
+class Lucy::Test::Index::TestSortWriter
+    inherits Clownfish::TestHarness::TestBatch {
+
+    inert incremented TestSortWriter*
+    new();
+
+    void
+    Run(TestSortWriter *self, TestBatchRunner *runner);
+}
+
+class Lucy::Test::Index::NonMergingIndexManager nickname NMIxManager
+    inherits Lucy::Index::IndexManager {
+
+    public inert incremented NonMergingIndexManager*
+    new();
+
+    public inert NonMergingIndexManager*
+    init(NonMergingIndexManager *self);
+
+    public incremented VArray*
+    Recycle(NonMergingIndexManager *self, PolyReader *reader,
+            DeletionsWriter *del_writer, int64_t cutoff,
+            bool optimize = false);
+}
+

http://git-wip-us.apache.org/repos/asf/lucy/blob/be4e1833/perl/t/221-sort_writer.t
----------------------------------------------------------------------
diff --git a/perl/t/221-sort_writer.t b/perl/t/221-sort_writer.t
deleted file mode 100644
index 45fb341..0000000
--- a/perl/t/221-sort_writer.t
+++ /dev/null
@@ -1,174 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-use strict;
-use warnings;
-use lib 'buildlib';
-
-package NonMergingIndexManager;
-use base qw( Lucy::Index::IndexManager );
-
-sub recycle {
-    return Clownfish::VArray->new;
-}
-
-package SortSchema;
-use base qw( Lucy::Plan::Schema );
-
-sub new {
-    my $self          = shift->SUPER::new(@_);
-    my $fulltext_type = Lucy::Plan::FullTextType->new(
-        analyzer => Lucy::Analysis::StandardTokenizer->new,
-        sortable => 1,
-    );
-    my $string_type = Lucy::Plan::StringType->new( sortable => 1 );
-    my $unsortable = Lucy::Plan::StringType->new;
-    $self->spec_field( name => 'name',   type => $fulltext_type );
-    $self->spec_field( name => 'speed',  type => $string_type );
-    $self->spec_field( name => 'weight', type => $string_type );
-    $self->spec_field( name => 'home',   type => $string_type );
-    $self->spec_field( name => 'cat',    type => $string_type );
-    $self->spec_field( name => 'wheels', type => $string_type );
-    $self->spec_field( name => 'unused', type => $string_type );
-    $self->spec_field( name => 'nope',   type => $unsortable );
-    return $self;
-}
-
-package main;
-use Lucy::Test;
-use Test::More tests => 57;
-
-# Force frequent flushes.
-Lucy::Index::SortWriter::set_default_mem_thresh(100);
-
-my $airplane = {
-    name   => 'airplane',
-    speed  => '0200',
-    weight => '8000',
-    home   => 'air',
-    cat    => 'vehicle',
-    wheels => 3,
-    nope   => 'nyet',
-};
-my $bike = {
-    name   => 'bike',
-    speed  => '0015',
-    weight => '0025',
-    home   => 'land',
-    cat    => 'vehicle',
-    wheels => 2,
-};
-my $car = {
-    name   => 'car',
-    speed  => '0070',
-    weight => '3000',
-    home   => 'land',
-    cat    => 'vehicle',
-    wheels => 4,
-};
-my $dirigible = {
-    name   => 'dirigible',
-    speed  => '0040',
-    weight => '0000',
-    home   => 'air',
-    cat    => 'vehicle',
-    # no "wheels" field -- test NULL/undef
-};
-my $elephant = {
-    name   => 'elephant',
-    speed  => '0020',
-    weight => '6000',
-    home   => 'land',
-    cat    => 'vehicle',
-    # no "wheels" field -- test NULL/undef
-};
-
-my $folder  = Lucy::Store::RAMFolder->new;
-my $schema  = SortSchema->new;
-my $indexer = Lucy::Index::Indexer->new(
-    index  => $folder,
-    schema => $schema,
-);
-
-# Add vehicles.
-$indexer->add_doc($_) for ( $airplane, $bike, $car );
-
-$indexer->commit;
-
-my $polyreader  = Lucy::Index::IndexReader->open( index => $folder );
-my $seg_reader  = $polyreader->get_seg_readers->[0];
-my $sort_reader = $seg_reader->obtain("Lucy::Index::SortReader");
-my $doc_reader  = $seg_reader->obtain("Lucy::Index::DocReader");
-my $segment     = $seg_reader->get_segment;
-
-for my $field (qw( name speed weight home cat wheels )) {
-    my $field_num = $segment->field_num($field);
-    ok( $folder->exists("seg_1/sort-$field_num.ord"),
-        "sort files written for $field" );
-    my $sort_cache = $sort_reader->fetch_sort_cache($field);
-    for ( 1 .. $seg_reader->doc_max ) {
-        is( $sort_cache->value( ord => $sort_cache->ordinal($_) ),
-            $doc_reader->fetch_doc($_)->{$field},
-            "correct cached value doc $_ "
-        );
-    }
-}
-
-for my $field (qw( unused nope )) {
-    my $field_num = $segment->field_num($field);
-    ok( !$folder->exists("seg_1/sort-$field_num.ord"),
-        "no sort files written for $field" );
-}
-
-# Add a second segment.
-$indexer = Lucy::Index::Indexer->new(
-    index   => $folder,
-    schema  => $schema,
-    manager => NonMergingIndexManager->new,
-);
-$indexer->add_doc($dirigible);
-$indexer->commit;
-
-# Consolidate everything, to test merging.
-$indexer = Lucy::Index::Indexer->new(
-    index  => $folder,
-    schema => $schema,
-);
-$indexer->delete_by_term( field => 'name', term => 'bike' );
-$indexer->add_doc($elephant);
-$indexer->optimize;
-$indexer->commit;
-
-my $num_old_seg_files = scalar grep {m/seg_[12]/} @{ $folder->list_r };
-is( $num_old_seg_files, 0, "all files from earlier segments zapped" );
-
-$polyreader  = Lucy::Index::IndexReader->open( index => $folder );
-$seg_reader  = $polyreader->get_seg_readers->[0];
-$sort_reader = $seg_reader->obtain("Lucy::Index::SortReader");
-$doc_reader  = $seg_reader->obtain("Lucy::Index::DocReader");
-$segment     = $seg_reader->get_segment;
-
-for my $field (qw( name speed weight home cat wheels )) {
-    my $field_num = $segment->field_num($field);
-    ok( $folder->exists("seg_3/sort-$field_num.ord"),
-        "sort files written for $field" );
-    my $sort_cache = $sort_reader->fetch_sort_cache($field);
-    for ( 1 .. $seg_reader->doc_max ) {
-        is( $sort_cache->value( ord => $sort_cache->ordinal($_) ),
-            $doc_reader->fetch_doc($_)->{$field},
-            "correct cached value field $field doc $_ "
-        );
-    }
-}

http://git-wip-us.apache.org/repos/asf/lucy/blob/be4e1833/perl/t/core/224-sort_writer.t
----------------------------------------------------------------------
diff --git a/perl/t/core/224-sort_writer.t b/perl/t/core/224-sort_writer.t
new file mode 100644
index 0000000..b987614
--- /dev/null
+++ b/perl/t/core/224-sort_writer.t
@@ -0,0 +1,23 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+use strict;
+use warnings;
+
+use Lucy::Test;
+my $success = Lucy::Test::run_tests("Lucy::Test::Index::TestSortWriter");
+
+exit($success ? 0 : 1);
+


[2/8] git commit: refs/heads/master - Initialize SortFieldWriter#run_tick to 1

Posted by ma...@apache.org.
Initialize SortFieldWriter#run_tick to 1

Make sure we never use a run_tick of 0.


Project: http://git-wip-us.apache.org/repos/asf/lucy/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/22663dea
Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/22663dea
Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/22663dea

Branch: refs/heads/master
Commit: 22663dea422a186700379b80021fb202ca982dbd
Parents: 0cf86d1
Author: Nick Wellnhofer <we...@aevum.de>
Authored: Thu Sep 26 02:13:49 2013 +0200
Committer: Marvin Humphrey <ma...@rectangular.com>
Committed: Wed Jun 18 21:22:59 2014 -0700

----------------------------------------------------------------------
 core/Lucy/Index/SortFieldWriter.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucy/blob/22663dea/core/Lucy/Index/SortFieldWriter.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Index/SortFieldWriter.c b/core/Lucy/Index/SortFieldWriter.c
index 8a372a3..f34b68e 100644
--- a/core/Lucy/Index/SortFieldWriter.c
+++ b/core/Lucy/Index/SortFieldWriter.c
@@ -92,7 +92,7 @@ SortFieldWriter_init(SortFieldWriter *self, Schema *schema,
     ivars->sort_cache      = NULL;
     ivars->doc_map         = NULL;
     ivars->sorted_ids      = NULL;
-    ivars->run_tick        = 0;
+    ivars->run_tick        = 1;
     ivars->ord_width       = 0;
     ivars->last_val        = NULL;
 
@@ -217,7 +217,6 @@ SortFieldWriter_Add_Segment_IMP(SortFieldWriter *self, SegReader *reader,
     run_ivars->run_max    = SegReader_Doc_Max(reader);
     run_ivars->run_cardinality = SortCache_Get_Cardinality(sort_cache);
     run_ivars->null_ord   = SortCache_Get_Null_Ord(sort_cache);
-    run_ivars->run_tick   = 1;
     SortFieldWriter_Add_Run(self, (SortExternal*)run);
 }
 


[8/8] git commit: refs/heads/master - Port t/028-sortexrun.t to C

Posted by ma...@apache.org.
Port t/028-sortexrun.t to C

Original commit by Nick Wellnhofer, amended by Marvin Humphrey for
changes to the SortExternal API.


Project: http://git-wip-us.apache.org/repos/asf/lucy/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/eb10a8c6
Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/eb10a8c6
Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/eb10a8c6

Branch: refs/heads/master
Commit: eb10a8c6842c3483cff014a8a7b04de0ed5c0858
Parents: 73d6925
Author: Nick Wellnhofer <we...@aevum.de>
Authored: Fri Oct 18 22:15:04 2013 +0200
Committer: Marvin Humphrey <ma...@rectangular.com>
Committed: Tue Jul 1 12:09:49 2014 -0700

----------------------------------------------------------------------
 core/Lucy/Test/Util/TestSortExternal.c | 38 ++++++++++++++++++++++-
 core/Lucy/Util/BBSortEx.c              |  8 +++++
 core/Lucy/Util/BBSortEx.cfh            |  3 ++
 perl/t/028-sortexrun.t                 | 48 -----------------------------
 4 files changed, 48 insertions(+), 49 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucy/blob/eb10a8c6/core/Lucy/Test/Util/TestSortExternal.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Test/Util/TestSortExternal.c b/core/Lucy/Test/Util/TestSortExternal.c
index 2e04c83..19e83cf 100644
--- a/core/Lucy/Test/Util/TestSortExternal.c
+++ b/core/Lucy/Test/Util/TestSortExternal.c
@@ -269,9 +269,44 @@ test_sort_random_strings(TestBatchRunner *runner) {
     DECREF(bytebufs);
 }
 
+static void
+test_run(TestBatchRunner *runner) {
+    VArray *letters = VA_new(26);
+    for (int i = 0; i < 26; ++i) {
+        char ch = 'a' + i;
+        ByteBuf *bytebuf = BB_new_bytes(&ch, 1);
+        VA_Push(letters, (Obj*)bytebuf);
+    }
+    BBSortEx *run = BBSortEx_new(0x1000000, letters);
+    BBSortEx_Set_Mem_Thresh(run, 5);
+
+    BBSortEx_Refill(run);
+    TEST_INT_EQ(runner, BBSortEx_Buffer_Count(run), 5,
+                "Refill doesn't exceed memory threshold");
+
+    Obj *endpost = BBSortEx_Peek_Last(run);
+    ByteBuf *wanted = BB_new_bytes("e", 1);
+    TEST_TRUE(runner, BB_Equals(wanted, endpost), "Peek_Last");
+
+    VArray *elems = VA_new(26);
+    do {
+        while (BBSortEx_Buffer_Count(run) > 0) {
+            Obj *object = BBSortEx_Fetch(run);
+            VA_Push(elems, object);
+        }
+    } while (BBSortEx_Refill(run) > 0);
+    TEST_TRUE(runner, VA_Equals(elems, (Obj*)letters), "retrieve all elems");
+
+    DECREF(elems);
+    DECREF(wanted);
+    DECREF(endpost);
+    DECREF(letters);
+    DECREF(run);
+}
+
 void
 TestSortExternal_Run_IMP(TestSortExternal *self, TestBatchRunner *runner) {
-    TestBatchRunner_Plan(runner, (TestBatch*)self, 16);
+    TestBatchRunner_Plan(runner, (TestBatch*)self, 19);
 
     srand((unsigned int)time((time_t*)NULL));
     S_init_bytebufs();
@@ -281,6 +316,7 @@ TestSortExternal_Run_IMP(TestSortExternal *self, TestBatchRunner *runner) {
     test_sort_nothing(runner);
     test_sort_packed_ints(runner);
     test_sort_random_strings(runner);
+    test_run(runner);
     S_destroy_bytebufs();
 }
 

http://git-wip-us.apache.org/repos/asf/lucy/blob/eb10a8c6/core/Lucy/Util/BBSortEx.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Util/BBSortEx.c b/core/Lucy/Util/BBSortEx.c
index c9201bb..66e0568 100644
--- a/core/Lucy/Util/BBSortEx.c
+++ b/core/Lucy/Util/BBSortEx.c
@@ -183,6 +183,14 @@ BBSortEx_Peek_Cache_IMP(BBSortEx *self) {
     return retval;
 }
 
+Obj*
+BBSortEx_Peek_Last_IMP(BBSortEx *self) {
+    BBSortExIVARS *const ivars = BBSortEx_IVARS(self);
+    uint32_t count = ivars->buf_max - ivars->buf_tick;
+    if (count == 0) { return NULL; }
+    return INCREF(ivars->buffer[count-1]);
+}
+
 uint32_t
 BBSortEx_Get_Num_Runs_IMP(BBSortEx *self) {
     BBSortExIVARS *const ivars = BBSortEx_IVARS(self);

http://git-wip-us.apache.org/repos/asf/lucy/blob/eb10a8c6/core/Lucy/Util/BBSortEx.cfh
----------------------------------------------------------------------
diff --git a/core/Lucy/Util/BBSortEx.cfh b/core/Lucy/Util/BBSortEx.cfh
index 6fd8656..a43b1fd 100644
--- a/core/Lucy/Util/BBSortEx.cfh
+++ b/core/Lucy/Util/BBSortEx.cfh
@@ -54,6 +54,9 @@ class Lucy::Util::BBSortEx
     incremented VArray*
     Peek_Cache(BBSortEx *self);
 
+    incremented nullable Obj*
+    Peek_Last(BBSortEx *self);
+
     uint32_t
     Get_Num_Runs(BBSortEx *self);
 

http://git-wip-us.apache.org/repos/asf/lucy/blob/eb10a8c6/perl/t/028-sortexrun.t
----------------------------------------------------------------------
diff --git a/perl/t/028-sortexrun.t b/perl/t/028-sortexrun.t
deleted file mode 100644
index f0df52a..0000000
--- a/perl/t/028-sortexrun.t
+++ /dev/null
@@ -1,48 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-use strict;
-use warnings;
-use lib 'buildlib';
-
-use Test::More skip_all => 'Disabled until test ported to C';
-#use Test::More tests => 5;
-use Lucy::Test;
-use Clownfish qw( to_perl );
-
-my $letters = Clownfish::VArray->new;
-$letters->push( Clownfish::ByteBuf->new($_) ) for 'a' .. 'z';
-my $run = Lucy::Util::BBSortEx->new( external => $letters );
-$run->set_mem_thresh(5);
-
-my $num_in_cache = $run->refill;
-is( $run->cache_count, 5, "Read_Elem puts the brakes on Refill" );
-my $endpost = $run->peek_last;
-is( $endpost, 'e', "Peek_Last" );
-$endpost = Clownfish::ByteBuf->new('b');
-my $slice = $run->pop_slice($endpost);
-is( scalar @$slice, 2, "Pop_Slice gets only less-than-or-equal elems" );
-@$slice = map { to_perl($_) } @$slice;
-is_deeply( $slice, [qw( a b )], "Pop_Slice picks highest elems" );
-
-my @got = qw( a b );
-while (1) {
-    $endpost = $run->peek_last;
-    $slice   = $run->pop_slice( Clownfish::ByteBuf->new($endpost) );
-    push @got, map { to_perl($_) } @$slice;
-    last unless $run->refill;
-}
-is_deeply( \@got, [ 'a' .. 'z' ], "retrieve all elems" );
-


[5/8] git commit: refs/heads/master - Use counting sort to sort doc_ids in SortFieldWriter#Refill

Posted by ma...@apache.org.
Use counting sort to sort doc_ids in SortFieldWriter#Refill

Since we already have the ordinals for each doc_id, we can use a
counting sort. This uses a temporary array of size run_cardinality but
runs in linear time.


Project: http://git-wip-us.apache.org/repos/asf/lucy/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/d96ef18e
Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/d96ef18e
Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/d96ef18e

Branch: refs/heads/master
Commit: d96ef18e4c8e1ef19ce55be5926a0728254df30d
Parents: dd1a198
Author: Nick Wellnhofer <we...@aevum.de>
Authored: Thu Sep 26 19:43:42 2013 +0200
Committer: Marvin Humphrey <ma...@rectangular.com>
Committed: Wed Jun 18 21:23:00 2014 -0700

----------------------------------------------------------------------
 core/Lucy/Index/SortFieldWriter.c | 53 +++++++++++++++++++++-------------
 1 file changed, 33 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucy/blob/d96ef18e/core/Lucy/Index/SortFieldWriter.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Index/SortFieldWriter.c b/core/Lucy/Index/SortFieldWriter.c
index 0d07ce5..b6faec5 100644
--- a/core/Lucy/Index/SortFieldWriter.c
+++ b/core/Lucy/Index/SortFieldWriter.c
@@ -347,30 +347,43 @@ SortFieldWriter_Compare_IMP(SortFieldWriter *self, void *va, void *vb) {
     return comparison;
 }
 
-static int
-S_compare_doc_ids_by_ord_rev(void *context, const void *va, const void *vb) {
-    SortCache *sort_cache = (SortCache*)context;
-    int32_t a = *(int32_t*)va;
-    int32_t b = *(int32_t*)vb;
-    int32_t ord_a = SortCache_Ordinal(sort_cache, a);
-    int32_t ord_b = SortCache_Ordinal(sort_cache, b);
-    int32_t comparison = ord_a - ord_b;
-    if (comparison == 0) { comparison = a - b; }
-    return comparison;
-}
-
 static void
 S_lazy_init_sorted_ids(SortFieldWriter *self) {
     SortFieldWriterIVARS *const ivars = SortFieldWriter_IVARS(self);
-    if (!ivars->sorted_ids) {
-        ivars->sorted_ids
-            = (int32_t*)MALLOCATE((ivars->run_max + 1) * sizeof(int32_t));
-        for (int32_t i = 0, max = ivars->run_max; i <= max; i++) {
-            ivars->sorted_ids[i] = i;
-        }
-        Sort_quicksort(ivars->sorted_ids + 1, ivars->run_max, sizeof(int32_t),
-                       S_compare_doc_ids_by_ord_rev, ivars->sort_cache);
+    if (ivars->sorted_ids) { return; }
+
+    // Counting sort. Could be optimized by working directly on the
+    // ordinal arrays.
+
+    SortCache *sort_cache      = ivars->sort_cache;
+    int32_t    run_cardinality = ivars->run_cardinality;
+    int32_t    run_max         = ivars->run_max;
+
+    // Count.
+    int32_t *counts = (int32_t*)CALLOCATE(run_cardinality, sizeof(int32_t));
+    for (int32_t doc_id = 0; doc_id <= run_max; ++doc_id) {
+        int32_t ord = SortCache_Ordinal(sort_cache, doc_id);
+        ++counts[ord];
+    }
+
+    // Compute partial sums.
+    int32_t sum = 0;
+    for (int32_t ord = 0; ord < run_cardinality; ++ord) {
+        int32_t count = counts[ord];
+        counts[ord] = sum;
+        sum += count;
     }
+
+    // Distribute.
+    int32_t *sorted_ids = (int32_t*)MALLOCATE((run_max + 1) * sizeof(int32_t));
+    for (int32_t doc_id = 0; doc_id <= run_max; ++doc_id) {
+        int32_t ord = SortCache_Ordinal(sort_cache, doc_id);
+        int32_t pos = counts[ord]++;
+        sorted_ids[pos] = doc_id;
+    }
+
+    ivars->sorted_ids = sorted_ids;
+    FREEMEM(counts);
 }
 
 void