You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2016/06/11 15:49:28 UTC

[12/21] lucene-solr:branch_6x: LUCENE-6766: implement STRING sort, using segment-local ordinals

LUCENE-6766: implement STRING sort, using segment-local ordinals


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/b62cad33
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/b62cad33
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/b62cad33

Branch: refs/heads/branch_6x
Commit: b62cad334c0777749ae45a1dbf959a4e6b7767ea
Parents: f4022dc
Author: Mike McCandless <mi...@apache.org>
Authored: Sat May 7 11:46:17 2016 -0400
Committer: Mike McCandless <mi...@apache.org>
Committed: Sat Jun 11 11:48:40 2016 -0400

----------------------------------------------------------------------
 .../org/apache/lucene/index/MergeState.java     | 49 +++++++++-----------
 .../org/apache/lucene/index/MultiSorter.java    | 42 ++++++++++++++++-
 .../apache/lucene/index/TestIndexSorting.java   | 34 ++++++++++++++
 .../lucene/index/MockRandomMergePolicy.java     |  1 +
 4 files changed, 98 insertions(+), 28 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b62cad33/lucene/core/src/java/org/apache/lucene/index/MergeState.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/MergeState.java b/lucene/core/src/java/org/apache/lucene/index/MergeState.java
index 32e0480..31065e3 100644
--- a/lucene/core/src/java/org/apache/lucene/index/MergeState.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MergeState.java
@@ -213,34 +213,29 @@ public class MergeState {
     //System.out.println("MergeState.maybeSortReaders indexSort=" + indexSort);
 
     for (CodecReader leaf : originalReaders) {
-      if (leaf instanceof SegmentReader) {
-        SegmentReader segmentReader = (SegmentReader) leaf;
-        Sort segmentSort = segmentReader.getSegmentInfo().info.getIndexSort();
-        //System.out.println("  leaf=" + leaf + " sort=" + segmentSort);
-
-        if (segmentSort == null) {
-          // TODO: fix IW to also sort when flushing?  It's somewhat tricky because of stored fields and term vectors, which write "live"
-          // to the files on each indexed document:
-
-          // This segment was written by flush, so documents are not yet sorted, so we sort them now:
-          Sorter.DocMap sortDocMap = sorter.sort(leaf);
-          if (sortDocMap != null) {
-            //System.out.println("    sort!");
-            // nocommit what about MergedReaderWrapper in here?
-            leaf = SlowCodecReaderWrapper.wrap(SortingLeafReader.wrap(leaf, sortDocMap));
-            leafDocMaps[readers.size()] = new DocMap() {
-                @Override
-                public int get(int docID) {
-                  return sortDocMap.oldToNew(docID);
-                }
-              };
-          }
-
-        } else if (segmentSort.equals(indexSort) == false) {
-          throw new IllegalArgumentException("index sort mismatch: merged segment has sort=" + indexSort + " but to-be-merged segment has sort=" + segmentSort);
+      Sort segmentSort = leaf.getIndexSort();
+      //System.out.println("  leaf=" + leaf + " sort=" + segmentSort);
+
+      if (segmentSort == null) {
+        // TODO: fix IW to also sort when flushing?  It's somewhat tricky because of stored fields and term vectors, which write "live"
+        // to the files on each indexed document:
+
+        // This segment was written by flush, so documents are not yet sorted, so we sort them now:
+        Sorter.DocMap sortDocMap = sorter.sort(leaf);
+        if (sortDocMap != null) {
+          //System.out.println("    sort!");
+          // nocommit what about MergedReaderWrapper in here?
+          leaf = SlowCodecReaderWrapper.wrap(SortingLeafReader.wrap(leaf, sortDocMap));
+          leafDocMaps[readers.size()] = new DocMap() {
+              @Override
+              public int get(int docID) {
+                return sortDocMap.oldToNew(docID);
+              }
+            };
         }
-      } else {
-        throw new IllegalArgumentException("cannot sort index with foreign readers; leaf=" + leaf);
+
+      } else if (segmentSort.equals(indexSort) == false) {
+        throw new IllegalArgumentException("index sort mismatch: merged segment has sort=" + indexSort + " but to-be-merged segment has sort=" + segmentSort);
       }
 
       readers.add(leaf);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b62cad33/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java
index 7f71eb5..ca1ebe5 100644
--- a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java
@@ -123,7 +123,47 @@ final class MultiSorter {
 
   private static CrossReaderComparator getComparator(List<CodecReader> readers, SortField sortField) throws IOException {
     switch(sortField.getType()) {
-    // ncommit: use segment-local ords for string sort
+
+    case STRING:
+      {
+        // this uses the efficient segment-local ordinal map:
+        MultiReader multiReader = new MultiReader(readers.toArray(new LeafReader[readers.size()]));
+        final SortedDocValues sorted = MultiDocValues.getSortedValues(multiReader, sortField.getField());
+        final int[] docStarts = new int[readers.size()];
+        List<LeafReaderContext> leaves = multiReader.leaves();
+        for(int i=0;i<readers.size();i++) {
+          docStarts[i] = leaves.get(i).docBase;
+        }
+        final int missingOrd;
+        if (sortField.getMissingValue() == SortField.STRING_LAST) {
+          missingOrd = Integer.MIN_VALUE;
+        } else {
+          missingOrd = Integer.MAX_VALUE;
+        }
+
+        final int reverseMul;
+        if (sortField.getReverse()) {
+          reverseMul = -1;
+        } else {
+          reverseMul = 1;
+        }
+
+        return new CrossReaderComparator() {
+          @Override
+          public int compare(int readerIndexA, int docIDA, int readerIndexB, int docIDB) {
+            int ordA = sorted.getOrd(docStarts[readerIndexA] + docIDA);
+            if (ordA == -1) {
+              ordA = missingOrd;
+            }
+            int ordB = sorted.getOrd(docStarts[readerIndexB] + docIDB);
+            if (ordB == -1) {
+              ordB = missingOrd;
+            }
+            return reverseMul * Integer.compare(ordA, ordB);
+          }
+        };
+      }
+
     case INT:
       {
         List<NumericDocValues> values = new ArrayList<>();

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b62cad33/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java
index 8df81ba..1da6c82 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java
@@ -79,6 +79,40 @@ import org.junit.BeforeClass;
 
 public class TestIndexSorting extends LuceneTestCase {
 
+  public void testBasicString() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
+    Sort indexSort = new Sort(new SortField("foo", SortField.Type.STRING));
+    iwc.setIndexSort(indexSort);
+    IndexWriter w = new IndexWriter(dir, iwc);
+    Document doc = new Document();
+    doc.add(new SortedDocValuesField("foo", new BytesRef("zzz")));
+    w.addDocument(doc);
+    // so we get more than one segment, so that forceMerge actually does merge, since we only get a sorted segment by merging:
+    w.commit();
+
+    doc = new Document();
+    doc.add(new SortedDocValuesField("foo", new BytesRef("aaa")));
+    w.addDocument(doc);
+    w.commit();
+
+    doc = new Document();
+    doc.add(new SortedDocValuesField("foo", new BytesRef("mmm")));
+    w.addDocument(doc);
+    w.forceMerge(1);
+
+    DirectoryReader r = DirectoryReader.open(w);
+    LeafReader leaf = getOnlyLeafReader(r);
+    assertEquals(3, leaf.maxDoc());
+    SortedDocValues values = leaf.getSortedDocValues("foo");
+    assertEquals("aaa", values.get(0).utf8ToString());
+    assertEquals("mmm", values.get(1).utf8ToString());
+    assertEquals("zzz", values.get(2).utf8ToString());
+    r.close();
+    w.close();
+    dir.close();
+  }
+
   public void testSortOnMerge(boolean withDeletes) throws IOException {
     Directory dir = newDirectory();
     IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b62cad33/lucene/test-framework/src/java/org/apache/lucene/index/MockRandomMergePolicy.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/MockRandomMergePolicy.java b/lucene/test-framework/src/java/org/apache/lucene/index/MockRandomMergePolicy.java
index 9389888..f32e4d3 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/index/MockRandomMergePolicy.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/MockRandomMergePolicy.java
@@ -146,6 +146,7 @@ public class MockRandomMergePolicy extends MergePolicy {
 
     @Override
     public CodecReader wrapForMerge(CodecReader reader) throws IOException {
+
       // wrap it (e.g. prevent bulk merge etc)
       // TODO: cut this over to FilterCodecReader api, we can explicitly
       // enable/disable bulk merge for portions of the index we want.