You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2016/06/11 15:49:20 UTC
[04/21] lucene-solr:branch_6x: LUCENE-6766: initial patch
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fdc220ee/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java
index a90d625..a6b5599 100644
--- a/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java
+++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java
@@ -18,16 +18,19 @@ package org.apache.lucene.index;
import java.io.PrintStream;
+import java.util.EnumSet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.DocumentsWriterPerThread.IndexingChain;
import org.apache.lucene.index.IndexWriter.IndexReaderWarmer;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.InfoStream;
import org.apache.lucene.util.PrintStreamInfoStream;
-import org.apache.lucene.util.SetOnce;
import org.apache.lucene.util.SetOnce.AlreadySetException;
+import org.apache.lucene.util.SetOnce;
/**
* Holds all the configuration that is used to create an {@link IndexWriter}.
@@ -439,6 +442,27 @@ public final class IndexWriterConfig extends LiveIndexWriterConfig {
return this;
}
+ /** We only allow sorting on these types */
+ private static final EnumSet<SortField.Type> ALLOWED_INDEX_SORT_TYPES = EnumSet.of(SortField.Type.STRING,
+ SortField.Type.INT,
+ SortField.Type.FLOAT,
+ SortField.Type.LONG,
+ SortField.Type.DOUBLE,
+ SortField.Type.BYTES);
+
+ /**
+ * Set the {@link Sort} order to use when merging segments. Note that newly flushed segments will remain unsorted.
+ */
+ public IndexWriterConfig setIndexSort(Sort sort) {
+ for(SortField sortField : sort.getSort()) {
+ if (ALLOWED_INDEX_SORT_TYPES.contains(sortField.getType()) == false) {
+ throw new IllegalArgumentException("invalid SortField type: must be one of " + ALLOWED_INDEX_SORT_TYPES + " but got: " + sortField);
+ }
+ }
+ this.indexSort = sort;
+ return this;
+ }
+
@Override
public String toString() {
StringBuilder sb = new StringBuilder(super.toString());
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fdc220ee/lucene/core/src/java/org/apache/lucene/index/LeafReader.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/LeafReader.java b/lucene/core/src/java/org/apache/lucene/index/LeafReader.java
index 9622d4e..44e61e2 100644
--- a/lucene/core/src/java/org/apache/lucene/index/LeafReader.java
+++ b/lucene/core/src/java/org/apache/lucene/index/LeafReader.java
@@ -20,6 +20,7 @@ package org.apache.lucene.index;
import java.io.IOException;
import org.apache.lucene.index.IndexReader.ReaderClosedListener;
+import org.apache.lucene.search.Sort;
import org.apache.lucene.util.Bits;
/** {@code LeafReader} is an abstract class, providing an interface for accessing an
@@ -312,4 +313,7 @@ public abstract class LeafReader extends IndexReader {
* @lucene.internal
*/
public abstract void checkIntegrity() throws IOException;
+
+ /** Returns null if this leaf is unsorted, or the {@link Sort} that it was sorted by */
+ public abstract Sort getIndexSort();
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fdc220ee/lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java b/lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java
index 1a0002c..cec70c0 100644
--- a/lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java
+++ b/lucene/core/src/java/org/apache/lucene/index/LiveIndexWriterConfig.java
@@ -23,6 +23,7 @@ import org.apache.lucene.index.DocumentsWriterPerThread.IndexingChain;
import org.apache.lucene.index.IndexWriter.IndexReaderWarmer;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Sort;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.InfoStream;
@@ -94,6 +95,9 @@ public class LiveIndexWriterConfig {
/** True if calls to {@link IndexWriter#close()} should first do a commit. */
protected boolean commitOnClose = IndexWriterConfig.DEFAULT_COMMIT_ON_CLOSE;
+ /** The sort order to use to write merged segments. */
+ protected Sort indexSort = null;
+
// used by IndexWriterConfig
LiveIndexWriterConfig(Analyzer analyzer) {
this.analyzer = analyzer;
@@ -445,6 +449,14 @@ public class LiveIndexWriterConfig {
return commitOnClose;
}
+ /**
+ * Returns the index-time {@link Sort} order, or null if none was set. Merged
+ * segments will be written in this order.
+ */
+ public Sort getIndexSort() {
+ return indexSort;
+ }
+
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
@@ -467,6 +479,7 @@ public class LiveIndexWriterConfig {
sb.append("perThreadHardLimitMB=").append(getRAMPerThreadHardLimitMB()).append("\n");
sb.append("useCompoundFile=").append(getUseCompoundFile()).append("\n");
sb.append("commitOnClose=").append(getCommitOnClose()).append("\n");
+ sb.append("indexSort=").append(getIndexSort()).append("\n");
return sb.toString();
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fdc220ee/lucene/core/src/java/org/apache/lucene/index/MappingMultiPostingsEnum.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/MappingMultiPostingsEnum.java b/lucene/core/src/java/org/apache/lucene/index/MappingMultiPostingsEnum.java
index a06c34f..c4333bc 100644
--- a/lucene/core/src/java/org/apache/lucene/index/MappingMultiPostingsEnum.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MappingMultiPostingsEnum.java
@@ -18,8 +18,11 @@ package org.apache.lucene.index;
import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
import org.apache.lucene.index.MultiPostingsEnum.EnumWithSlice;
+import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
/**
@@ -30,52 +33,63 @@ import org.apache.lucene.util.BytesRef;
*/
final class MappingMultiPostingsEnum extends PostingsEnum {
- private MultiPostingsEnum.EnumWithSlice[] subs;
- int numSubs;
- int upto;
- MergeState.DocMap currentMap;
- PostingsEnum current;
- int currentBase;
- int doc = -1;
- private MergeState mergeState;
MultiPostingsEnum multiDocsAndPositionsEnum;
final String field;
+ final DocIDMerger<MappingPostingsSub> docIDMerger;
+ private MappingPostingsSub current;
+ private final MappingPostingsSub[] allSubs;
+ private final List<MappingPostingsSub> subs = new ArrayList<>();
+
+ private static class MappingPostingsSub extends DocIDMerger.Sub {
+ public PostingsEnum postings;
+
+ public MappingPostingsSub(MergeState.DocMap docMap, Bits liveDocs) {
+ super(docMap, liveDocs);
+ }
+
+ @Override
+ public int nextDoc() {
+ try {
+ return postings.nextDoc();
+ } catch (IOException ioe) {
+ throw new RuntimeException(ioe);
+ }
+ }
+ }
/** Sole constructor. */
- public MappingMultiPostingsEnum(String field, MergeState mergeState) {
+ public MappingMultiPostingsEnum(String field, MergeState mergeState) throws IOException {
this.field = field;
- this.mergeState = mergeState;
+ allSubs = new MappingPostingsSub[mergeState.fieldsProducers.length];
+ for(int i=0;i<allSubs.length;i++) {
+ // nocommit delDocMaps?
+ allSubs[i] = new MappingPostingsSub(mergeState.docMaps[i], mergeState.liveDocs[i]);
+ }
+ this.docIDMerger = new DocIDMerger<MappingPostingsSub>(subs, allSubs.length, mergeState.segmentInfo.getIndexSort() != null);
}
- MappingMultiPostingsEnum reset(MultiPostingsEnum postingsEnum) {
- this.numSubs = postingsEnum.getNumSubs();
- this.subs = postingsEnum.getSubs();
- upto = -1;
- doc = -1;
- current = null;
+ MappingMultiPostingsEnum reset(MultiPostingsEnum postingsEnum) throws IOException {
this.multiDocsAndPositionsEnum = postingsEnum;
+ MultiPostingsEnum.EnumWithSlice[] subsArray = postingsEnum.getSubs();
+ int count = postingsEnum.getNumSubs();
+ subs.clear();
+ for(int i=0;i<count;i++) {
+ MappingPostingsSub sub = allSubs[subsArray[i].slice.readerIndex];
+ sub.postings = subsArray[i].postingsEnum;
+ subs.add(sub);
+ }
+ docIDMerger.reset();
return this;
}
- /** How many sub-readers we are merging.
- * @see #getSubs */
- public int getNumSubs() {
- return numSubs;
- }
-
- /** Returns sub-readers we are merging. */
- public EnumWithSlice[] getSubs() {
- return subs;
- }
-
@Override
public int freq() throws IOException {
- return current.freq();
+ return current.postings.freq();
}
@Override
public int docID() {
- return doc;
+ return current.mappedDocID;
}
@Override
@@ -85,66 +99,47 @@ final class MappingMultiPostingsEnum extends PostingsEnum {
@Override
public int nextDoc() throws IOException {
- while(true) {
- if (current == null) {
- if (upto == numSubs-1) {
- return this.doc = NO_MORE_DOCS;
- } else {
- upto++;
- final int reader = subs[upto].slice.readerIndex;
- current = subs[upto].postingsEnum;
- currentBase = mergeState.docBase[reader];
- currentMap = mergeState.docMaps[reader];
- }
- }
-
- int doc = current.nextDoc();
- if (doc != NO_MORE_DOCS) {
- // compact deletions
- doc = currentMap.get(doc);
- if (doc == -1) {
- continue;
- }
- return this.doc = currentBase + doc;
- } else {
- current = null;
- }
+ current = docIDMerger.next();
+ if (current == null) {
+ return NO_MORE_DOCS;
+ } else {
+ return current.mappedDocID;
}
}
@Override
public int nextPosition() throws IOException {
- int pos = current.nextPosition();
+ int pos = current.postings.nextPosition();
if (pos < 0) {
- throw new CorruptIndexException("position=" + pos + " is negative, field=\"" + field + " doc=" + doc,
- mergeState.fieldsProducers[upto].toString());
+ throw new CorruptIndexException("position=" + pos + " is negative, field=\"" + field + " doc=" + current.mappedDocID,
+ current.postings.toString());
} else if (pos > IndexWriter.MAX_POSITION) {
- throw new CorruptIndexException("position=" + pos + " is too large (> IndexWriter.MAX_POSITION=" + IndexWriter.MAX_POSITION + "), field=\"" + field + "\" doc=" + doc,
- mergeState.fieldsProducers[upto].toString());
+ throw new CorruptIndexException("position=" + pos + " is too large (> IndexWriter.MAX_POSITION=" + IndexWriter.MAX_POSITION + "), field=\"" + field + "\" doc=" + current.mappedDocID,
+ current.postings.toString());
}
return pos;
}
@Override
public int startOffset() throws IOException {
- return current.startOffset();
+ return current.postings.startOffset();
}
@Override
public int endOffset() throws IOException {
- return current.endOffset();
+ return current.postings.endOffset();
}
@Override
public BytesRef getPayload() throws IOException {
- return current.getPayload();
+ return current.postings.getPayload();
}
@Override
public long cost() {
long cost = 0;
- for (EnumWithSlice enumWithSlice : subs) {
- cost += enumWithSlice.postingsEnum.cost();
+ for (MappingPostingsSub sub : subs) {
+ cost += sub.postings.cost();
}
return cost;
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fdc220ee/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java b/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java
index 1d67c4a..c42b052 100644
--- a/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java
@@ -58,31 +58,6 @@ import org.apache.lucene.util.FixedBitSet;
*/
public abstract class MergePolicy {
- /** A map of doc IDs. */
- public static abstract class DocMap {
- /** Sole constructor, typically invoked from sub-classes constructors. */
- protected DocMap() {}
-
- /** Return the new doc ID according to its old value. */
- public abstract int map(int old);
-
- /** Useful from an assert. */
- boolean isConsistent(int maxDoc) {
- final FixedBitSet targets = new FixedBitSet(maxDoc);
- for (int i = 0; i < maxDoc; ++i) {
- final int target = map(i);
- if (target < 0 || target >= maxDoc) {
- assert false : "out of range: " + target + " not in [0-" + maxDoc + "[";
- return false;
- } else if (targets.get(target)) {
- assert false : target + " is already taken (" + i + ")";
- return false;
- }
- }
- return true;
- }
- }
-
/** OneMerge provides the information necessary to perform
* an individual primitive merge operation, resulting in
* a single new segment. The merge spec includes the
@@ -140,25 +115,11 @@ public abstract class MergePolicy {
public void mergeFinished() throws IOException {
}
- /** Expert: Get the list of readers to merge. Note that this list does not
- * necessarily match the list of segments to merge and should only be used
- * to feed SegmentMerger to initialize a merge. When a {@link OneMerge}
- * reorders doc IDs, it must override {@link #getDocMap} too so that
- * deletes that happened during the merge can be applied to the newly
- * merged segment. */
- public List<CodecReader> getMergeReaders() throws IOException {
- if (readers == null) {
- throw new IllegalStateException("IndexWriter has not initialized readers from the segment infos yet");
- }
- final List<CodecReader> readers = new ArrayList<>(this.readers.size());
- for (SegmentReader reader : this.readers) {
- if (reader.numDocs() > 0) {
- readers.add(reader);
- }
- }
- return Collections.unmodifiableList(readers);
+ /** Wrap the reader in order to add/remove information to the merged segment. */
+ public CodecReader wrapForMerge(CodecReader reader) throws IOException {
+ return reader;
}
-
+
/**
* Expert: Sets the {@link SegmentCommitInfo} of the merged segment.
* Allows sub-classes to e.g. set diagnostics properties.
@@ -175,20 +136,6 @@ public abstract class MergePolicy {
return info;
}
- /** Expert: If {@link #getMergeReaders()} reorders document IDs, this method
- * must be overridden to return a mapping from the <i>natural</i> doc ID
- * (the doc ID that would result from a natural merge) to the actual doc
- * ID. This mapping is used to apply deletions that happened during the
- * merge to the new segment. */
- public DocMap getDocMap(MergeState mergeState) {
- return new DocMap() {
- @Override
- public int map(int docID) {
- return docID;
- }
- };
- }
-
/** Record that an exception occurred while executing
* this merge */
synchronized void setException(Throwable error) {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fdc220ee/lucene/core/src/java/org/apache/lucene/index/MergeReaderWrapper.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/MergeReaderWrapper.java b/lucene/core/src/java/org/apache/lucene/index/MergeReaderWrapper.java
new file mode 100644
index 0000000..be3513a
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/index/MergeReaderWrapper.java
@@ -0,0 +1,266 @@
+package org.apache.lucene.index;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.codecs.DocValuesProducer;
+import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.NormsProducer;
+import org.apache.lucene.codecs.StoredFieldsReader;
+import org.apache.lucene.codecs.TermVectorsReader;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.util.Bits;
+
+/** This is a hack to make index sorting fast, with a {@link LeafReader} that always returns merge instances when you ask for the codec readers. */
+class MergeReaderWrapper extends LeafReader {
+ final SegmentReader in;
+ final FieldsProducer fields;
+ final NormsProducer norms;
+ final DocValuesProducer docValues;
+ final StoredFieldsReader store;
+ final TermVectorsReader vectors;
+
+ MergeReaderWrapper(SegmentReader in) throws IOException {
+ this.in = in;
+
+ FieldsProducer fields = in.getPostingsReader();
+ if (fields != null) {
+ fields = fields.getMergeInstance();
+ }
+ this.fields = fields;
+
+ NormsProducer norms = in.getNormsReader();
+ if (norms != null) {
+ norms = norms.getMergeInstance();
+ }
+ this.norms = norms;
+
+ DocValuesProducer docValues = in.getDocValuesReader();
+ if (docValues != null) {
+ docValues = docValues.getMergeInstance();
+ }
+ this.docValues = docValues;
+
+ StoredFieldsReader store = in.getFieldsReader();
+ if (store != null) {
+ store = store.getMergeInstance();
+ }
+ this.store = store;
+
+ TermVectorsReader vectors = in.getTermVectorsReader();
+ if (vectors != null) {
+ vectors = vectors.getMergeInstance();
+ }
+ this.vectors = vectors;
+ }
+
+ @Override
+ public void addCoreClosedListener(CoreClosedListener listener) {
+ in.addCoreClosedListener(listener);
+ }
+
+ @Override
+ public void removeCoreClosedListener(CoreClosedListener listener) {
+ in.removeCoreClosedListener(listener);
+ }
+
+ @Override
+ public Fields fields() throws IOException {
+ return fields;
+ }
+
+ @Override
+ public NumericDocValues getNumericDocValues(String field) throws IOException {
+ ensureOpen();
+ FieldInfo fi = getFieldInfos().fieldInfo(field);
+ if (fi == null) {
+ // Field does not exist
+ return null;
+ }
+ if (fi.getDocValuesType() != DocValuesType.NUMERIC) {
+ // Field was not indexed with doc values
+ return null;
+ }
+ return docValues.getNumeric(fi);
+ }
+
+ @Override
+ public BinaryDocValues getBinaryDocValues(String field) throws IOException {
+ ensureOpen();
+ FieldInfo fi = getFieldInfos().fieldInfo(field);
+ if (fi == null) {
+ // Field does not exist
+ return null;
+ }
+ if (fi.getDocValuesType() != DocValuesType.BINARY) {
+ // Field was not indexed with doc values
+ return null;
+ }
+ return docValues.getBinary(fi);
+ }
+
+ @Override
+ public SortedDocValues getSortedDocValues(String field) throws IOException {
+ ensureOpen();
+ FieldInfo fi = getFieldInfos().fieldInfo(field);
+ if (fi == null) {
+ // Field does not exist
+ return null;
+ }
+ if (fi.getDocValuesType() != DocValuesType.SORTED) {
+ // Field was not indexed with doc values
+ return null;
+ }
+ return docValues.getSorted(fi);
+ }
+
+ @Override
+ public SortedNumericDocValues getSortedNumericDocValues(String field) throws IOException {
+ ensureOpen();
+ FieldInfo fi = getFieldInfos().fieldInfo(field);
+ if (fi == null) {
+ // Field does not exist
+ return null;
+ }
+ if (fi.getDocValuesType() != DocValuesType.SORTED_NUMERIC) {
+ // Field was not indexed with doc values
+ return null;
+ }
+ return docValues.getSortedNumeric(fi);
+ }
+
+ @Override
+ public SortedSetDocValues getSortedSetDocValues(String field) throws IOException {
+ ensureOpen();
+ FieldInfo fi = getFieldInfos().fieldInfo(field);
+ if (fi == null) {
+ // Field does not exist
+ return null;
+ }
+ if (fi.getDocValuesType() != DocValuesType.SORTED_SET) {
+ // Field was not indexed with doc values
+ return null;
+ }
+ return docValues.getSortedSet(fi);
+ }
+
+ @Override
+ public Bits getDocsWithField(String field) throws IOException {
+ ensureOpen();
+ FieldInfo fi = getFieldInfos().fieldInfo(field);
+ if (fi == null) {
+ // Field does not exist
+ return null;
+ }
+ if (fi.getDocValuesType() == DocValuesType.NONE) {
+ // Field was not indexed with doc values
+ return null;
+ }
+ return docValues.getDocsWithField(fi);
+ }
+
+ @Override
+ public NumericDocValues getNormValues(String field) throws IOException {
+ ensureOpen();
+ FieldInfo fi = getFieldInfos().fieldInfo(field);
+ if (fi == null || !fi.hasNorms()) {
+ // Field does not exist or does not index norms
+ return null;
+ }
+ return norms.getNorms(fi);
+ }
+
+ @Override
+ public FieldInfos getFieldInfos() {
+ return in.getFieldInfos();
+ }
+
+ @Override
+ public Bits getLiveDocs() {
+ return in.getLiveDocs();
+ }
+
+ @Override
+ public void checkIntegrity() throws IOException {
+ in.checkIntegrity();
+ }
+
+ @Override
+ public Fields getTermVectors(int docID) throws IOException {
+ ensureOpen();
+ checkBounds(docID);
+ if (vectors == null) {
+ return null;
+ }
+ return vectors.get(docID);
+ }
+
+ @Override
+ public PointValues getPointValues() {
+ return in.getPointValues();
+ }
+
+ @Override
+ public int numDocs() {
+ return in.numDocs();
+ }
+
+ @Override
+ public int maxDoc() {
+ return in.maxDoc();
+ }
+
+ @Override
+ public void document(int docID, StoredFieldVisitor visitor) throws IOException {
+ ensureOpen();
+ checkBounds(docID);
+ store.visitDocument(docID, visitor);
+ }
+
+ @Override
+ protected void doClose() throws IOException {
+ in.close();
+ }
+
+ @Override
+ public Object getCoreCacheKey() {
+ return in.getCoreCacheKey();
+ }
+
+ @Override
+ public Object getCombinedCoreAndDeletesKey() {
+ return in.getCombinedCoreAndDeletesKey();
+ }
+
+ private void checkBounds(int docID) {
+ if (docID < 0 || docID >= maxDoc()) {
+ throw new IndexOutOfBoundsException("docID must be >= 0 and < maxDoc=" + maxDoc() + " (got docID=" + docID + ")");
+ }
+ }
+
+ @Override
+ public String toString() {
+ return "MergeReaderWrapper(" + in + ")";
+ }
+
+ @Override
+ public Sort getIndexSort() {
+ return in.getIndexSort();
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fdc220ee/lucene/core/src/java/org/apache/lucene/index/MergeState.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/MergeState.java b/lucene/core/src/java/org/apache/lucene/index/MergeState.java
index 7242785..32e0480 100644
--- a/lucene/core/src/java/org/apache/lucene/index/MergeState.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MergeState.java
@@ -18,6 +18,8 @@ package org.apache.lucene.index;
import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
import java.util.List;
import org.apache.lucene.codecs.DocValuesProducer;
@@ -26,6 +28,7 @@ import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PointsReader;
import org.apache.lucene.codecs.StoredFieldsReader;
import org.apache.lucene.codecs.TermVectorsReader;
+import org.apache.lucene.search.Sort;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.InfoStream;
import org.apache.lucene.util.packed.PackedInts;
@@ -36,6 +39,13 @@ import org.apache.lucene.util.packed.PackedLongValues;
* @lucene.experimental */
public class MergeState {
+ /** Maps document IDs from old segments to document IDs in the new segment */
+ // nocommit in the unsorted case, this should map correctly, e.g. apply per segment docBase
+ public final DocMap[] docMaps;
+
+ // nocommit can we somehow not need to expose this? should IW's reader pool always sort on load...?
+ public final DocMap[] leafDocMaps;
+
/** {@link SegmentInfo} of the newly merged segment. */
public final SegmentInfo segmentInfo;
@@ -60,18 +70,12 @@ public class MergeState {
/** Live docs for each reader */
public final Bits[] liveDocs;
- /** Maps docIDs around deletions. */
- public final DocMap[] docMaps;
-
/** Postings to merge */
public final FieldsProducer[] fieldsProducers;
/** Point readers to merge */
public final PointsReader[] pointsReaders;
- /** New docID base per reader. */
- public final int[] docBase;
-
/** Max docs per reader */
public final int[] maxDocs;
@@ -79,11 +83,13 @@ public class MergeState {
public final InfoStream infoStream;
/** Sole constructor. */
- MergeState(List<CodecReader> readers, SegmentInfo segmentInfo, InfoStream infoStream) throws IOException {
+ MergeState(List<CodecReader> originalReaders, SegmentInfo segmentInfo, InfoStream infoStream) throws IOException {
+
+ final Sort indexSort = segmentInfo.getIndexSort();
+ int numReaders = originalReaders.size();
+ leafDocMaps = new DocMap[numReaders];
+ List<CodecReader> readers = maybeSortReaders(originalReaders, segmentInfo);
- int numReaders = readers.size();
- docMaps = new DocMap[numReaders];
- docBase = new int[numReaders];
maxDocs = new int[numReaders];
fieldsProducers = new FieldsProducer[numReaders];
normsProducers = new NormsProducer[numReaders];
@@ -94,6 +100,7 @@ public class MergeState {
fieldInfos = new FieldInfos[numReaders];
liveDocs = new Bits[numReaders];
+ int numDocs = 0;
for(int i=0;i<numReaders;i++) {
final CodecReader reader = readers.get(i);
@@ -126,126 +133,137 @@ public class MergeState {
if (pointsReaders[i] != null) {
pointsReaders[i] = pointsReaders[i].getMergeInstance();
}
+ numDocs += reader.numDocs();
}
+ segmentInfo.setMaxDoc(numDocs);
+
this.segmentInfo = segmentInfo;
this.infoStream = infoStream;
-
- setDocMaps(readers);
+ this.docMaps = buildDocMaps(readers, indexSort);
}
- // NOTE: removes any "all deleted" readers from mergeState.readers
- private void setDocMaps(List<CodecReader> readers) throws IOException {
- final int numReaders = maxDocs.length;
+ private DocMap[] buildDocMaps(List<CodecReader> readers, Sort indexSort) throws IOException {
- // Remap docIDs
- int docBase = 0;
- for(int i=0;i<numReaders;i++) {
- final CodecReader reader = readers.get(i);
- this.docBase[i] = docBase;
- final DocMap docMap = DocMap.build(reader);
- docMaps[i] = docMap;
- docBase += docMap.numDocs();
- }
+ int numReaders = readers.size();
- segmentInfo.setMaxDoc(docBase);
- }
+ if (indexSort == null) {
+ // no index sort ... we only must map around deletions, and rebase to the merged segment's docID space
- /**
- * Remaps docids around deletes during merge
- */
- public static abstract class DocMap {
+ int totalDocs = 0;
+ DocMap[] docMaps = new DocMap[numReaders];
- DocMap() {}
+ // Remap docIDs around deletions:
+ for (int i = 0; i < numReaders; i++) {
+ LeafReader reader = readers.get(i);
+ Bits liveDocs = reader.getLiveDocs();
- /** Returns the mapped docID corresponding to the provided one. */
- public abstract int get(int docID);
+ final PackedLongValues delDocMap;
+ if (liveDocs != null) {
+ delDocMap = removeDeletes(reader.maxDoc(), liveDocs);
+ } else {
+ delDocMap = null;
+ }
+
+ final int docBase = totalDocs;
+ docMaps[i] = new DocMap() {
+ @Override
+ public int get(int docID) {
+ if (liveDocs == null) {
+ return docBase + docID;
+ } else if (liveDocs.get(docID)) {
+ return docBase + (int) delDocMap.get(docID);
+ } else {
+ return -1;
+ }
+ }
+ };
+ totalDocs += reader.numDocs();
+ }
- /** Returns the total number of documents, ignoring
- * deletions. */
- public abstract int maxDoc();
+ return docMaps;
- /** Returns the number of not-deleted documents. */
- public final int numDocs() {
- return maxDoc() - numDeletedDocs();
+ } else {
+ // do a merge sort of the incoming leaves:
+ return MultiSorter.sort(indexSort, readers);
}
+ }
- /** Returns the number of deleted documents. */
- public abstract int numDeletedDocs();
+ private List<CodecReader> maybeSortReaders(List<CodecReader> originalReaders, SegmentInfo segmentInfo) throws IOException {
- /** Returns true if there are any deletions. */
- public boolean hasDeletions() {
- return numDeletedDocs() > 0;
+ // Default to identity:
+ for(int i=0;i<originalReaders.size();i++) {
+ leafDocMaps[i] = new DocMap() {
+ @Override
+ public int get(int docID) {
+ return docID;
+ }
+ };
}
- /** Creates a {@link DocMap} instance appropriate for
- * this reader. */
- public static DocMap build(CodecReader reader) {
- final int maxDoc = reader.maxDoc();
- if (!reader.hasDeletions()) {
- return new NoDelDocMap(maxDoc);
- }
- final Bits liveDocs = reader.getLiveDocs();
- return build(maxDoc, liveDocs);
+ Sort indexSort = segmentInfo.getIndexSort();
+ if (indexSort == null) {
+ return originalReaders;
}
- static DocMap build(final int maxDoc, final Bits liveDocs) {
- assert liveDocs != null;
- final PackedLongValues.Builder docMapBuilder = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
- int del = 0;
- for (int i = 0; i < maxDoc; ++i) {
- docMapBuilder.add(i - del);
- if (!liveDocs.get(i)) {
- ++del;
- }
- }
- final PackedLongValues docMap = docMapBuilder.build();
- final int numDeletedDocs = del;
- assert docMap.size() == maxDoc;
- return new DocMap() {
-
- @Override
- public int get(int docID) {
- if (!liveDocs.get(docID)) {
- return -1;
+ // If an incoming reader is not sorted, because it was flushed by IW, we sort it here:
+ final Sorter sorter = new Sorter(indexSort);
+ List<CodecReader> readers = new ArrayList<>(originalReaders.size());
+
+ //System.out.println("MergeState.maybeSortReaders indexSort=" + indexSort);
+
+ for (CodecReader leaf : originalReaders) {
+ if (leaf instanceof SegmentReader) {
+ SegmentReader segmentReader = (SegmentReader) leaf;
+ Sort segmentSort = segmentReader.getSegmentInfo().info.getIndexSort();
+ //System.out.println(" leaf=" + leaf + " sort=" + segmentSort);
+
+ if (segmentSort == null) {
+ // TODO: fix IW to also sort when flushing? It's somewhat tricky because of stored fields and term vectors, which write "live"
+ // to the files on each indexed document:
+
+ // This segment was written by flush, so documents are not yet sorted, so we sort them now:
+ Sorter.DocMap sortDocMap = sorter.sort(leaf);
+ if (sortDocMap != null) {
+ //System.out.println(" sort!");
+ // nocommit what about MergeReaderWrapper in here?
+ leaf = SlowCodecReaderWrapper.wrap(SortingLeafReader.wrap(leaf, sortDocMap));
+ leafDocMaps[readers.size()] = new DocMap() {
+ @Override
+ public int get(int docID) {
+ return sortDocMap.oldToNew(docID);
+ }
+ };
}
- return (int) docMap.get(docID);
- }
-
- @Override
- public int maxDoc() {
- return maxDoc;
- }
- @Override
- public int numDeletedDocs() {
- return numDeletedDocs;
+ } else if (segmentSort.equals(indexSort) == false) {
+ throw new IllegalArgumentException("index sort mismatch: merged segment has sort=" + indexSort + " but to-be-merged segment has sort=" + segmentSort);
}
- };
- }
- }
-
- private static final class NoDelDocMap extends DocMap {
-
- private final int maxDoc;
+ } else {
+ throw new IllegalArgumentException("cannot sort index with foreign readers; leaf=" + leaf);
+ }
- NoDelDocMap(int maxDoc) {
- this.maxDoc = maxDoc;
+ readers.add(leaf);
}
- @Override
- public int get(int docID) {
- return docID;
- }
+ return readers;
+ }
- @Override
- public int maxDoc() {
- return maxDoc;
- }
+ /** A map of doc IDs. */
+ public static abstract class DocMap {
+ /** Return the mapped docID or -1 if the given doc is not mapped. */
+ public abstract int get(int docID);
+ }
- @Override
- public int numDeletedDocs() {
- return 0;
+ static PackedLongValues removeDeletes(final int maxDoc, final Bits liveDocs) {
+ final PackedLongValues.Builder docMapBuilder = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
+ int del = 0;
+ for (int i = 0; i < maxDoc; ++i) {
+ docMapBuilder.add(i - del);
+ if (liveDocs.get(i) == false) {
+ ++del;
+ }
}
+ return docMapBuilder.build();
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fdc220ee/lucene/core/src/java/org/apache/lucene/index/MultiFields.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiFields.java b/lucene/core/src/java/org/apache/lucene/index/MultiFields.java
index 1736bac..447e0ae 100644
--- a/lucene/core/src/java/org/apache/lucene/index/MultiFields.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MultiFields.java
@@ -51,6 +51,8 @@ public final class MultiFields extends Fields {
private final ReaderSlice[] subSlices;
private final Map<String,Terms> terms = new ConcurrentHashMap<>();
+ // nocommit: should we somehow throw an exception if "sorted" Fields are passed in?
+
/** Returns a single {@link Fields} instance for this
* reader, merging fields/terms/docs/positions on the
* fly. This method will return null if the reader
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fdc220ee/lucene/core/src/java/org/apache/lucene/index/MultiPostingsEnum.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiPostingsEnum.java b/lucene/core/src/java/org/apache/lucene/index/MultiPostingsEnum.java
index 5456325..573bbe8 100644
--- a/lucene/core/src/java/org/apache/lucene/index/MultiPostingsEnum.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MultiPostingsEnum.java
@@ -57,7 +57,9 @@ public final class MultiPostingsEnum extends PostingsEnum {
return this.parent == parent;
}
- /** Rre-use and reset this instance on the provided slices. */
+ // nocommit is this class supposed to be aware of index sorting too???
+
+ /** Re-use and reset this instance on the provided slices. */
public MultiPostingsEnum reset(final EnumWithSlice[] subs, final int numSubs) {
this.numSubs = numSubs;
for(int i=0;i<numSubs;i++) {
@@ -165,9 +167,6 @@ public final class MultiPostingsEnum extends PostingsEnum {
/** Holds a {@link PostingsEnum} along with the
* corresponding {@link ReaderSlice}. */
public final static class EnumWithSlice {
- EnumWithSlice() {
- }
-
/** {@link PostingsEnum} for this sub-reader. */
public PostingsEnum postingsEnum;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fdc220ee/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java
new file mode 100644
index 0000000..062dde9
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/index/MultiSorter.java
@@ -0,0 +1,221 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.index;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.index.MergeState;
+import org.apache.lucene.search.LeafFieldComparator;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.PriorityQueue;
+import org.apache.lucene.util.packed.PackedInts;
+import org.apache.lucene.util.packed.PackedLongValues;
+
+final class MultiSorter {
+
+ /**
+  * Merge-sorts the documents of the given leaves and returns one {@link MergeState.DocMap} per
+  * leaf (same order as {@code readers}) that maps the leaf's documents into the merged segment.
+  * Deleted documents map to {@code -1}. The documents of each incoming leaf must already be
+  * sorted by the same sort!
+  */
+ static MergeState.DocMap[] sort(Sort sort, List<CodecReader> readers) throws IOException {
+
+   final SortField[] sortFields = sort.getSort();
+   final CrossReaderComparator[] comparators = new CrossReaderComparator[sortFields.length];
+   for (int fieldIndex = 0; fieldIndex < sortFields.length; fieldIndex++) {
+     comparators[fieldIndex] = getComparator(readers, sortFields[fieldIndex]);
+   }
+
+   final int leafCount = readers.size();
+
+   // One cursor per leaf; top() is the cursor whose current document sorts first.
+   PriorityQueue<LeafAndDocID> queue = new PriorityQueue<LeafAndDocID>(leafCount) {
+     @Override
+     public boolean lessThan(LeafAndDocID a, LeafAndDocID b) {
+       for (CrossReaderComparator comparator : comparators) {
+         final int cmp = comparator.compare(a.readerIndex, a.docID, b.readerIndex, b.docID);
+         if (cmp != 0) {
+           return cmp < 0;
+         }
+       }
+
+       // every sort field ties: fall back to leaf order, then docID order within a leaf
+       return a.readerIndex == b.readerIndex ? a.docID < b.docID : a.readerIndex < b.readerIndex;
+     }
+   };
+
+   final PackedLongValues.Builder[] builders = new PackedLongValues.Builder[leafCount];
+
+   for (int leaf = 0; leaf < leafCount; leaf++) {
+     final CodecReader reader = readers.get(leaf);
+     queue.add(new LeafAndDocID(leaf, reader.getLiveDocs(), reader.maxDoc()));
+     builders[leaf] = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
+   }
+
+   // Drain the queue in sorted order; only live documents consume a docID of the merged segment.
+   int nextMergedDocID = 0;
+   while (queue.size() > 0) {
+     final LeafAndDocID head = queue.top();
+     final boolean live = head.liveDocs == null || head.liveDocs.get(head.docID);
+     builders[head.readerIndex].add(nextMergedDocID);
+     if (live) {
+       nextMergedDocID++;
+     }
+     head.docID++;
+     if (head.docID >= head.maxDoc) {
+       // this leaf is exhausted
+       queue.pop();
+     } else {
+       queue.updateTop();
+     }
+   }
+
+   final MergeState.DocMap[] docMaps = new MergeState.DocMap[leafCount];
+   for (int leaf = 0; leaf < leafCount; leaf++) {
+     final PackedLongValues remapped = builders[leaf].build();
+     final Bits liveDocs = readers.get(leaf).getLiveDocs();
+     docMaps[leaf] = new MergeState.DocMap() {
+       @Override
+       public int get(int docID) {
+         if (liveDocs != null && liveDocs.get(docID) == false) {
+           return -1;
+         }
+         return (int) remapped.get(docID);
+       }
+     };
+   }
+
+   return docMaps;
+ }
+
+ /** Cursor over one leaf: which leaf it is, its bounds, and the docID currently being considered. */
+ private static class LeafAndDocID {
+   /** Index of the leaf in the list of readers being sorted. */
+   final int readerIndex;
+   /** Live docs of the leaf, as handed to the constructor; may be null. */
+   final Bits liveDocs;
+   /** Exclusive upper bound for {@link #docID}. */
+   final int maxDoc;
+   /** Current docID within the leaf; starts at 0. */
+   int docID = 0;
+
+   public LeafAndDocID(int readerIndex, Bits liveDocs, int maxDoc) {
+     this.maxDoc = maxDoc;
+     this.liveDocs = liveDocs;
+     this.readerIndex = readerIndex;
+   }
+ }
+
+ /** Compares two documents, each addressed by the index of its reader and its docID in that reader. */
+ private interface CrossReaderComparator {
+   /**
+    * Returns a negative value, zero or a positive value when document A sorts before, equal to
+    * or after document B.
+    */
+   int compare(int readerIndexA, int docIDA, int readerIndexB, int docIDB);
+ }
+
+ /**
+  * Creates a comparator that orders documents from any of the given readers by the numeric doc
+  * values of {@code sortField}. Only {@code INT} and {@code LONG} sort fields are handled so
+  * far. A document without a value compares as the sort field's missing value (0 if none is
+  * set), and a reversed sort field negates the comparison.
+  *
+  * @param readers the leaves whose documents will be compared, addressed by their list index
+  * @param sortField the sort field to compare by
+  * @return a comparator over (reader index, docID) pairs
+  * @throws IllegalArgumentException if the type of {@code sortField} is not handled
+  */
+ private static CrossReaderComparator getComparator(List<CodecReader> readers, SortField sortField) throws IOException {
+   switch(sortField.getType()) {
+   // TODO: use global ords for string sort
+   case INT:
+     {
+       List<NumericDocValues> values = new ArrayList<>();
+       List<Bits> docsWithFields = new ArrayList<>();
+       for(CodecReader reader : readers) {
+         values.add(DocValues.getNumeric(reader, sortField.getField()));
+         docsWithFields.add(DocValues.getDocsWithField(reader, sortField.getField()));
+       }
+
+       final int reverseMul = sortField.getReverse() ? -1 : 1;
+
+       final int missingValue;
+       if (sortField.getMissingValue() != null) {
+         missingValue = (Integer) sortField.getMissingValue();
+       } else {
+         missingValue = 0;
+       }
+
+       return new CrossReaderComparator() {
+         @Override
+         public int compare(int readerIndexA, int docIDA, int readerIndexB, int docIDB) {
+           int valueA;
+           if (docsWithFields.get(readerIndexA).get(docIDA)) {
+             valueA = (int) values.get(readerIndexA).get(docIDA);
+           } else {
+             valueA = missingValue;
+           }
+
+           int valueB;
+           if (docsWithFields.get(readerIndexB).get(docIDB)) {
+             valueB = (int) values.get(readerIndexB).get(docIDB);
+           } else {
+             valueB = missingValue;
+           }
+           return reverseMul * Integer.compare(valueA, valueB);
+         }
+       };
+     }
+   case LONG:
+     // nocommit refactor/share at least numerics here:
+     {
+       List<NumericDocValues> values = new ArrayList<>();
+       List<Bits> docsWithFields = new ArrayList<>();
+       for(CodecReader reader : readers) {
+         values.add(DocValues.getNumeric(reader, sortField.getField()));
+         docsWithFields.add(DocValues.getDocsWithField(reader, sortField.getField()));
+       }
+
+       final int reverseMul = sortField.getReverse() ? -1 : 1;
+
+       // The missing value of a LONG sort field is a Long; casting it to Integer would throw
+       // ClassCastException and an int could not hold it anyway.
+       final long missingValue;
+       if (sortField.getMissingValue() != null) {
+         missingValue = (Long) sortField.getMissingValue();
+       } else {
+         missingValue = 0L;
+       }
+
+       return new CrossReaderComparator() {
+         @Override
+         public int compare(int readerIndexA, int docIDA, int readerIndexB, int docIDB) {
+           // Compare the full 64-bit values: narrowing to int would corrupt the order of
+           // values outside the int range.
+           long valueA;
+           if (docsWithFields.get(readerIndexA).get(docIDA)) {
+             valueA = values.get(readerIndexA).get(docIDA);
+           } else {
+             valueA = missingValue;
+           }
+
+           long valueB;
+           if (docsWithFields.get(readerIndexB).get(docIDB)) {
+             valueB = values.get(readerIndexB).get(docIDB);
+           } else {
+             valueB = missingValue;
+           }
+           return reverseMul * Long.compare(valueA, valueB);
+         }
+       };
+     }
+   // nocommit do the rest:
+   default:
+     throw new IllegalArgumentException("unhandled SortField.getType()=" + sortField.getType());
+   }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fdc220ee/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java b/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java
index 532265f..ef9f28c 100644
--- a/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java
+++ b/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java
@@ -26,6 +26,7 @@ import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
+import org.apache.lucene.search.Sort;
import org.apache.lucene.util.Bits;
/** An {@link LeafReader} which reads multiple, parallel indexes. Each index
@@ -55,6 +56,7 @@ public class ParallelLeafReader extends LeafReader {
private final boolean closeSubReaders;
private final int maxDoc, numDocs;
private final boolean hasDeletions;
+ private final Sort indexSort;
private final SortedMap<String,LeafReader> fieldToReader = new TreeMap<>();
private final SortedMap<String,LeafReader> tvFieldToReader = new TreeMap<>();
@@ -100,8 +102,17 @@ public class ParallelLeafReader extends LeafReader {
// TODO: make this read-only in a cleaner way?
FieldInfos.Builder builder = new FieldInfos.Builder();
+
+ Sort indexSort = null;
+
// build FieldInfos and fieldToReader map:
for (final LeafReader reader : this.parallelReaders) {
+ if (indexSort == null) {
+ indexSort = reader.getIndexSort();
+ } else if (indexSort.equals(reader.getIndexSort()) == false) {
+ throw new IllegalArgumentException("cannot combine LeafReaders that have different index sorts: saw both sort=" + indexSort + " and " + reader.getIndexSort());
+ }
+
final FieldInfos readerFieldInfos = reader.getFieldInfos();
for (FieldInfo fieldInfo : readerFieldInfos) {
// NOTE: first reader having a given field "wins":
@@ -115,6 +126,7 @@ public class ParallelLeafReader extends LeafReader {
}
}
fieldInfos = builder.finish();
+ this.indexSort = indexSort;
// build Fields instance
for (final LeafReader reader : this.parallelReaders) {
@@ -423,4 +435,10 @@ public class ParallelLeafReader extends LeafReader {
ensureOpen();
return parallelReaders;
}
+
+ /** Returns the index sort captured from the parallel readers by the constructor; may be null. */
+ @Override
+ public Sort getIndexSort() {
+ return indexSort;
+ }
+
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fdc220ee/lucene/core/src/java/org/apache/lucene/index/SegmentInfo.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentInfo.java b/lucene/core/src/java/org/apache/lucene/index/SegmentInfo.java
index bed8458..5830201 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SegmentInfo.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SegmentInfo.java
@@ -28,6 +28,7 @@ import java.util.Set;
import java.util.regex.Matcher;
import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.search.Sort;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.TrackingDirectoryWrapper;
import org.apache.lucene.util.StringHelper;
@@ -69,6 +70,8 @@ public final class SegmentInfo {
private final Map<String,String> attributes;
+ private final Sort indexSort;
+
// Tracks the Lucene version this segment was created with, since 3.1. Null
// indicates an older than 3.0 index, and it's used to detect a too old index.
// The format expected is "x.y" - "2.x" for pre-3.0 indexes (or null), and
@@ -93,7 +96,7 @@ public final class SegmentInfo {
*/
public SegmentInfo(Directory dir, Version version, String name, int maxDoc,
boolean isCompoundFile, Codec codec, Map<String,String> diagnostics,
- byte[] id, Map<String,String> attributes) {
+ byte[] id, Map<String,String> attributes, Sort indexSort) {
assert !(dir instanceof TrackingDirectoryWrapper);
this.dir = Objects.requireNonNull(dir);
this.version = Objects.requireNonNull(version);
@@ -107,6 +110,7 @@ public final class SegmentInfo {
throw new IllegalArgumentException("invalid id: " + Arrays.toString(id));
}
this.attributes = Objects.requireNonNull(attributes);
+ this.indexSort = indexSort;
}
/**
@@ -194,13 +198,13 @@ public final class SegmentInfo {
s.append('/').append(delCount);
}
- final String sorter_key = "sorter"; // SortingMergePolicy.SORTER_ID_PROP; // TODO: use this once we can import SortingMergePolicy (currently located in 'misc' instead of 'core')
- final String sorter_val = diagnostics.get(sorter_key);
- if (sorter_val != null) {
- s.append(":[");
- s.append(sorter_key);
- s.append('=');
- s.append(sorter_val);
+ // nocommit does search time "do the right thing" automatically when segment is sorted?
+
+ // nocommit remove sorter_key from diagnostics
+
+ if (indexSort != null) {
+ s.append(":[indexSort=");
+ s.append(indexSort);
s.append(']');
}
@@ -311,5 +315,10 @@ public final class SegmentInfo {
public Map<String,String> getAttributes() {
return attributes;
}
+
+ /**
+  * Returns the sort order of this segment, as passed to the constructor, or null if the
+  * segment has no index sort.
+  */
+ public Sort getIndexSort() {
+ return indexSort;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fdc220ee/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java b/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java
index 8ed93e3..e68f818 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java
@@ -28,6 +28,7 @@ import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PointsReader;
import org.apache.lucene.codecs.StoredFieldsReader;
import org.apache.lucene.codecs.TermVectorsReader;
+import org.apache.lucene.search.Sort;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.Bits;
@@ -303,4 +304,9 @@ public final class SegmentReader extends CodecReader {
ensureOpen();
core.removeCoreClosedListener(listener);
}
+
+ /** Returns the index sort recorded in this segment's {@link SegmentInfo}. */
+ @Override
+ public Sort getIndexSort() {
+ return si.info.getIndexSort();
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fdc220ee/lucene/core/src/java/org/apache/lucene/index/SlowCodecReaderWrapper.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/SlowCodecReaderWrapper.java b/lucene/core/src/java/org/apache/lucene/index/SlowCodecReaderWrapper.java
index 3a73701..2742247 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SlowCodecReaderWrapper.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SlowCodecReaderWrapper.java
@@ -26,6 +26,7 @@ import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PointsReader;
import org.apache.lucene.codecs.StoredFieldsReader;
import org.apache.lucene.codecs.TermVectorsReader;
+import org.apache.lucene.search.Sort;
import org.apache.lucene.util.Bits;
/**
@@ -125,6 +126,16 @@ public final class SlowCodecReaderWrapper {
public void removeCoreClosedListener(CoreClosedListener listener) {
reader.removeCoreClosedListener(listener);
}
+
+ // Includes the string form of the wrapped reader so messages identify which reader was wrapped.
+ @Override
+ public String toString() {
+ return "SlowCodecReaderWrapper(" + reader + ")";
+ }
+
+ // The wrapper does not reorder documents itself; it reports the wrapped reader's index sort.
+ @Override
+ public Sort getIndexSort() {
+ return reader.getIndexSort();
+ }
};
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fdc220ee/lucene/core/src/java/org/apache/lucene/index/Sorter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/Sorter.java b/lucene/core/src/java/org/apache/lucene/index/Sorter.java
new file mode 100644
index 0000000..0ce7d64
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/index/Sorter.java
@@ -0,0 +1,289 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.index;
+
+import java.io.IOException;
+import java.util.Comparator;
+
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.LeafFieldComparator;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.util.TimSorter;
+import org.apache.lucene.util.packed.PackedInts;
+import org.apache.lucene.util.packed.PackedLongValues;
+
+/**
+ * Sorts documents of a given index by returning a permutation on the document
+ * IDs.
+ * @lucene.experimental
+ */
+// nocommit rename to IndexSorter?
+final class Sorter {
+ final Sort sort;
+
+ /**
+  * Creates a new Sorter to sort the index with {@code sort}.
+  *
+  * @throws IllegalArgumentException if {@code sort} refers to the relevance score
+  */
+ Sorter(Sort sort) {
+   final boolean usesScores = sort.needsScores();
+   if (usesScores) {
+     throw new IllegalArgumentException("Cannot sort an index with a Sort that refers to the relevance score");
+   }
+   this.sort = sort;
+ }
+
+ /**
+  * A permutation of doc IDs. For every document ID between {@code 0} and
+  * {@link IndexReader#maxDoc()}, {@code oldToNew(newToOld(docID))} must
+  * return {@code docID}.
+  */
+ abstract static class DocMap {
+
+   /** Maps a doc ID of the original index to its ordinal in the sorted index. */
+   abstract int oldToNew(int docID);
+
+   /** Maps the ordinal of a doc ID in the sorted index back to its doc ID in the original index. */
+   abstract int newToOld(int docID);
+
+   /**
+    * Returns the number of documents in this map. This must be equal to the
+    * {@link org.apache.lucene.index.LeafReader#maxDoc() number of documents} of the
+    * {@link org.apache.lucene.index.LeafReader} which is sorted.
+    */
+   abstract int size();
+ }
+
+ /**
+  * Check consistency of a {@link DocMap}, useful for assertions. A map is consistent when, for
+  * every doc ID {@code i} in {@code [0, size())}, {@code oldToNew(i)} lies in the same range and
+  * {@code newToOld(oldToNew(i)) == i}.
+  *
+  * @return true if the map is consistent. An inconsistent map trips an assertion when
+  *     assertions are enabled and returns false otherwise.
+  */
+ static boolean isConsistent(DocMap docMap) {
+   final int maxDoc = docMap.size();
+   for (int i = 0; i < maxDoc; ++i) {
+     final int newID = docMap.oldToNew(i);
+     // Validate the range before feeding newID back into the map. Otherwise an out-of-range
+     // value could make newToOld throw instead of producing the diagnostic below.
+     assert newID >= 0 && newID < maxDoc : "doc IDs must be in [0-" + maxDoc + "[, got " + newID;
+     if (newID < 0 || newID >= maxDoc) {
+       return false;
+     }
+     final int oldID = docMap.newToOld(newID);
+     assert i == oldID : "mapping is inconsistent: " + i + " --oldToNew--> " + newID + " --newToOld--> " + oldID;
+     if (i != oldID) {
+       return false;
+     }
+   }
+   return true;
+ }
+
+ /** Orders documents, identified by their doc IDs. */
+ abstract static class DocComparator {
+
+   /**
+    * Compares {@code docID1} against {@code docID2}. The return value follows the contract of
+    * {@link Comparator#compare(Object, Object)}.
+    */
+   public abstract int compare(int docID1, int docID2);
+
+ }
+
+ /**
+  * {@link TimSorter} over an array of doc IDs whose entries are ordered by a
+  * {@link Sorter.DocComparator}. The scratch buffer holds 1/64th of the array length, the same
+  * bound that is passed to the {@link TimSorter} constructor.
+  */
+ private static final class DocValueSorter extends TimSorter {
+
+   private final int[] docs;
+   private final int[] scratch;
+   private final Sorter.DocComparator comparator;
+
+   DocValueSorter(int[] docs, Sorter.DocComparator comparator) {
+     super(docs.length / 64);
+     this.comparator = comparator;
+     this.docs = docs;
+     this.scratch = new int[docs.length / 64];
+   }
+
+   @Override
+   protected int compare(int i, int j) {
+     final int left = docs[i];
+     final int right = docs[j];
+     return comparator.compare(left, right);
+   }
+
+   @Override
+   protected void swap(int i, int j) {
+     final int held = docs[j];
+     docs[j] = docs[i];
+     docs[i] = held;
+   }
+
+   @Override
+   protected void copy(int src, int dest) {
+     docs[dest] = docs[src];
+   }
+
+   @Override
+   protected void save(int i, int len) {
+     // stash docs[i .. i+len) at the start of the scratch buffer
+     System.arraycopy(docs, i, scratch, 0, len);
+   }
+
+   @Override
+   protected void restore(int i, int j) {
+     docs[j] = scratch[i];
+   }
+
+   @Override
+   protected int compareSaved(int i, int j) {
+     final int savedDoc = scratch[i];
+     return comparator.compare(savedDoc, docs[j]);
+   }
+ }
+
+ /**
+  * Computes the old-to-new permutation over the given comparator.
+  *
+  * @return the permutation, or null if the documents are already in sorted order
+  */
+ private static Sorter.DocMap sort(final int maxDoc, DocComparator comparator) {
+   // nothing to do if every adjacent pair of documents is already in order
+   boolean alreadySorted = true;
+   for (int doc = 1; alreadySorted && doc < maxDoc; ++doc) {
+     alreadySorted = comparator.compare(doc - 1, doc) <= 0;
+   }
+   if (alreadySorted) {
+     return null;
+   }
+
+   // start from the identity permutation and sort the doc IDs
+   final int[] permutation = new int[maxDoc];
+   for (int doc = 0; doc < maxDoc; doc++) {
+     permutation[doc] = doc;
+   }
+
+   // It can be common to sort a reader, add docs, sort it again, ... and in
+   // that case timSort can save a lot of time
+   final DocValueSorter sorter = new DocValueSorter(permutation, comparator);
+   sorter.sort(0, permutation.length); // permutation is now the newToOld mapping
+
+   // A monotonic PackedLongValues builder is used because it wastes very little
+   // memory if the index is in random order but can save a lot of memory if the
+   // index is already "almost" sorted
+   final PackedLongValues.Builder newToOldBuilder = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
+   for (int oldID : permutation) {
+     newToOldBuilder.add(oldID);
+   }
+   final PackedLongValues newToOldValues = newToOldBuilder.build();
+
+   // invert the mapping, reusing the array
+   for (int newID = 0; newID < maxDoc; ++newID) {
+     permutation[(int) newToOldValues.get(newID)] = newID;
+   } // permutation is now the oldToNew mapping
+
+   final PackedLongValues.Builder oldToNewBuilder = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
+   for (int newID : permutation) {
+     oldToNewBuilder.add(newID);
+   }
+   final PackedLongValues oldToNewValues = oldToNewBuilder.build();
+
+   return new Sorter.DocMap() {
+
+     @Override
+     public int size() {
+       return maxDoc;
+     }
+
+     @Override
+     public int newToOld(int docID) {
+       return (int) newToOldValues.get(docID);
+     }
+
+     @Override
+     public int oldToNew(int docID) {
+       return (int) oldToNewValues.get(docID);
+     }
+   };
+ }
+
+ /**
+  * Returns a mapping from the old document ID to its new location in the
+  * sorted index, computed with {@link #sort(int, DocComparator)} from a
+  * comparator that applies the fields of this sorter's {@link Sort} in order
+  * and breaks ties by doc ID.
+  * <p>
+  * A return value of {@code null} is allowed and means that
+  * {@code reader} is already sorted.
+  * <p>
+  * <b>NOTE:</b> deleted documents are expected to appear in the mapping as
+  * well, they will however be marked as deleted in the sorted view.
+  */
+ DocMap sort(LeafReader reader) throws IOException {
+   final SortField[] sortFields = sort.getSort();
+   final int fieldCount = sortFields.length;
+   final int[] reverseMul = new int[fieldCount];
+   final LeafFieldComparator[] comparators = new LeafFieldComparator[fieldCount];
+
+   for (int i = 0; i < fieldCount; i++) {
+     final SortField field = sortFields[i];
+     if (field.getReverse()) {
+       reverseMul[i] = -1;
+     } else {
+       reverseMul[i] = 1;
+     }
+     final LeafFieldComparator leafComparator = field.getComparator(1, i).getLeafComparator(reader.getContext());
+     leafComparator.setScorer(FAKESCORER);
+     comparators[i] = leafComparator;
+   }
+
+   final DocComparator comparator = new DocComparator() {
+     @Override
+     public int compare(int docID1, int docID2) {
+       try {
+         for (int i = 0; i < comparators.length; i++) {
+           final LeafFieldComparator fieldComparator = comparators[i];
+           // TODO: would be better if copy() didnt cause a term lookup in TermOrdVal & co,
+           // the segments are always the same here...
+           // load docID1 into slot 0, make it the bottom, then compare docID2 against it
+           fieldComparator.copy(0, docID1);
+           fieldComparator.setBottom(0);
+           final int comp = reverseMul[i] * fieldComparator.compareBottom(docID2);
+           if (comp != 0) {
+             return comp;
+           }
+         }
+       } catch (IOException e) {
+         throw new RuntimeException(e);
+       }
+       // docid order tiebreak
+       return Integer.compare(docID1, docID2);
+     }
+   };
+
+   return sort(reader.maxDoc(), comparator);
+ }
+
+ /**
+  * Returns the identifier of this {@link Sorter}, which is the string form of its
+  * {@link Sort}.
+  * <p>This identifier is similar to {@link Object#hashCode()} and should be
+  * chosen so that two instances of this class that sort documents likewise
+  * will have the same identifier. On the contrary, this identifier should be
+  * different on different {@link Sort sorts}.
+  */
+ public String getID() {
+ return sort.toString();
+ }
+
+ // The string form of a Sorter is its identifier, see getID().
+ @Override
+ public String toString() {
+ return getID();
+ }
+
+ /**
+  * Placeholder {@link Scorer} handed to the leaf comparators via {@code setScorer}. It is never
+  * positioned on a document: {@code docID()} is -1, {@code freq()} is 1, {@code score()} is 0
+  * and it cannot be iterated.
+  */
+ static final Scorer FAKESCORER = new Scorer(null) {
+
+   float score;
+   int doc = -1;
+   int freq = 1;
+
+   @Override
+   public int docID() {
+     return doc;
+   }
+
+   @Override
+   public DocIdSetIterator iterator() {
+     throw new UnsupportedOperationException();
+   }
+
+   @Override
+   public int freq() throws IOException {
+     return freq;
+   }
+
+   @Override
+   public float score() throws IOException {
+     return score;
+   }
+ };
+
+}