You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2016/06/11 15:49:28 UTC
[12/21] lucene-solr:branch_6x: LUCENE-6766: implement STRING sort, using segment-local ordinals
LUCENE-6766: implement STRING sort, using segment-local ordinals
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/b62cad33
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/b62cad33
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/b62cad33
Branch: refs/heads/branch_6x
Commit: b62cad334c0777749ae45a1dbf959a4e6b7767ea
Parents: f4022dc
Author: Mike McCandless <mi...@apache.org>
Authored: Sat May 7 11:46:17 2016 -0400
Committer: Mike McCandless <mi...@apache.org>
Committed: Sat Jun 11 11:48:40 2016 -0400
----------------------------------------------------------------------
.../org/apache/lucene/index/MergeState.java | 49 +++++++++-----------
.../org/apache/lucene/index/MultiSorter.java | 42 ++++++++++++++++-
.../apache/lucene/index/TestIndexSorting.java | 34 ++++++++++++++
.../lucene/index/MockRandomMergePolicy.java | 1 +
4 files changed, 98 insertions(+), 28 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b62cad33/lucene/core/src/java/org/apache/lucene/index/MergeState.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/MergeState.java b/lucene/core/src/java/org/apache/lucene/index/MergeState.java
index 32e0480..31065e3 100644
--- a/lucene/core/src/java/org/apache/lucene/index/MergeState.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MergeState.java
@@ -213,34 +213,29 @@ public class MergeState {
//System.out.println("MergeState.maybeSortReaders indexSort=" + indexSort);
for (CodecReader leaf : originalReaders) {
- if (leaf instanceof SegmentReader) {
- SegmentReader segmentReader = (SegmentReader) leaf;
- Sort segmentSort = segmentReader.getSegmentInfo().info.getIndexSort();
- //System.out.println(" leaf=" + leaf + " sort=" + segmentSort);
-
- if (segmentSort == null) {
- // TODO: fix IW to also sort when flushing? It's somewhat tricky because of stored fields and term vectors, which write "live"
- // to the files on each indexed document:
-
- // This segment was written by flush, so documents are not yet sorted, so we sort them now:
- Sorter.DocMap sortDocMap = sorter.sort(leaf);
- if (sortDocMap != null) {
- //System.out.println(" sort!");
- // nocommit what about MergedReaderWrapper in here?
- leaf = SlowCodecReaderWrapper.wrap(SortingLeafReader.wrap(leaf, sortDocMap));
- leafDocMaps[readers.size()] = new DocMap() {
- @Override
- public int get(int docID) {
- return sortDocMap.oldToNew(docID);
- }
- };
- }
-
- } else if (segmentSort.equals(indexSort) == false) {
- throw new IllegalArgumentException("index sort mismatch: merged segment has sort=" + indexSort + " but to-be-merged segment has sort=" + segmentSort);
+ Sort segmentSort = leaf.getIndexSort();
+ //System.out.println(" leaf=" + leaf + " sort=" + segmentSort);
+
+ if (segmentSort == null) {
+ // TODO: fix IW to also sort when flushing? It's somewhat tricky because of stored fields and term vectors, which write "live"
+ // to the files on each indexed document:
+
+ // This segment was written by flush, so documents are not yet sorted, so we sort them now:
+ Sorter.DocMap sortDocMap = sorter.sort(leaf);
+ if (sortDocMap != null) {
+ //System.out.println(" sort!");
+ // nocommit what about MergedReaderWrapper in here?
+ leaf = SlowCodecReaderWrapper.wrap(SortingLeafReader.wrap(leaf, sortDocMap));
+ leafDocMaps[readers.size()] = new DocMap() {
+ @Override
+ public int get(int docID) {
+ return sortDocMap.oldToNew(docID);
+ }
+ };
}
- } else {
- throw new IllegalArgumentException("cannot sort index with foreign readers; leaf=" + leaf);
+
+ } else if (segmentSort.equals(indexSort) == false) {
+ throw new IllegalArgumentException("index sort mismatch: merged segment has sort=" + indexSort + " but to-be-merged segment has sort=" + segmentSort);
}
readers.add(leaf);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b62cad33/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java
index 7f71eb5..ca1ebe5 100644
--- a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java
@@ -123,7 +123,47 @@ final class MultiSorter {
private static CrossReaderComparator getComparator(List<CodecReader> readers, SortField sortField) throws IOException {
switch(sortField.getType()) {
- // ncommit: use segment-local ords for string sort
+
+ case STRING:
+ {
+ // Compare by ordinal across segments: view all readers as one MultiReader so that
+ // MultiDocValues.getSortedValues can hand back a single SortedDocValues for the sort field.
+ MultiReader multiReader = new MultiReader(readers.toArray(new LeafReader[readers.size()]));
+ final SortedDocValues sorted = MultiDocValues.getSortedValues(multiReader, sortField.getField());
+ // docStarts[i] is the docBase of reader i inside the MultiReader view:
+ final int[] docStarts = new int[readers.size()];
+ List<LeafReaderContext> leaves = multiReader.leaves();
+ for(int i=0;i<readers.size();i++) {
+ docStarts[i] = leaves.get(i).docBase;
+ }
+ // Docs without a value (ord == -1) get a sentinel ord: with STRING_LAST it must compare
+ // greater than every real ord, otherwise (the default) smaller than every real ord.
+ // reverseMul is applied afterwards, exactly as it is for docs that do have a value.
+ final int missingOrd;
+ if (sortField.getMissingValue() == SortField.STRING_LAST) {
+ missingOrd = Integer.MAX_VALUE;
+ } else {
+ missingOrd = Integer.MIN_VALUE;
+ }
+
+ final int reverseMul = sortField.getReverse() ? -1 : 1;
+
+ return new CrossReaderComparator() {
+ @Override
+ public int compare(int readerIndexA, int docIDA, int readerIndexB, int docIDB) {
+ int ordA = sorted.getOrd(docStarts[readerIndexA] + docIDA);
+ if (ordA == -1) {
+ ordA = missingOrd;
+ }
+ int ordB = sorted.getOrd(docStarts[readerIndexB] + docIDB);
+ if (ordB == -1) {
+ ordB = missingOrd;
+ }
+ return reverseMul * Integer.compare(ordA, ordB);
+ }
+ };
+ }
+
case INT:
{
List<NumericDocValues> values = new ArrayList<>();
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b62cad33/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java
index 8df81ba..1da6c82 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java
@@ -79,6 +79,40 @@ import org.junit.BeforeClass;
public class TestIndexSorting extends LuceneTestCase {
+ public void testBasicString() throws Exception { // checks that a merge under a STRING index sort orders docs by their "foo" value
+ Directory dir = newDirectory();
+ IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
+ Sort indexSort = new Sort(new SortField("foo", SortField.Type.STRING));
+ iwc.setIndexSort(indexSort);
+ IndexWriter w = new IndexWriter(dir, iwc);
+ Document doc = new Document();
+ doc.add(new SortedDocValuesField("foo", new BytesRef("zzz"))); // added first, expected last once sorted
+ w.addDocument(doc);
+ // commit here so that we get more than one segment and forceMerge actually does merge, since we only get a sorted segment by merging:
+ w.commit();
+
+ doc = new Document();
+ doc.add(new SortedDocValuesField("foo", new BytesRef("aaa"))); // added second, expected first once sorted
+ w.addDocument(doc);
+ w.commit();
+
+ doc = new Document();
+ doc.add(new SortedDocValuesField("foo", new BytesRef("mmm"))); // added third, expected in the middle once sorted
+ w.addDocument(doc);
+ w.forceMerge(1); // merge everything down to a single segment
+
+ DirectoryReader r = DirectoryReader.open(w);
+ LeafReader leaf = getOnlyLeafReader(r);
+ assertEquals(3, leaf.maxDoc());
+ SortedDocValues values = leaf.getSortedDocValues("foo");
+ assertEquals("aaa", values.get(0).utf8ToString()); // docIDs 0..2 are expected to follow ascending "foo" order
+ assertEquals("mmm", values.get(1).utf8ToString());
+ assertEquals("zzz", values.get(2).utf8ToString());
+ r.close();
+ w.close();
+ dir.close();
+ }
+
public void testSortOnMerge(boolean withDeletes) throws IOException {
Directory dir = newDirectory();
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b62cad33/lucene/test-framework/src/java/org/apache/lucene/index/MockRandomMergePolicy.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/MockRandomMergePolicy.java b/lucene/test-framework/src/java/org/apache/lucene/index/MockRandomMergePolicy.java
index 9389888..f32e4d3 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/index/MockRandomMergePolicy.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/MockRandomMergePolicy.java
@@ -146,6 +146,7 @@ public class MockRandomMergePolicy extends MergePolicy {
@Override
public CodecReader wrapForMerge(CodecReader reader) throws IOException {
+
// wrap it (e.g. prevent bulk merge etc)
// TODO: cut this over to FilterCodecReader api, we can explicitly
// enable/disable bulk merge for portions of the index we want.