You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ji...@apache.org on 2017/01/17 13:00:30 UTC
[1/3] lucene-solr:branch_6x: LUCENE-7579: sort segments at flush too
Repository: lucene-solr
Updated Branches:
refs/heads/branch_6x 5d0f90a83 -> 8f5b5a393
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7d96f9f7/lucene/core/src/java/org/apache/lucene/index/SortingStoredFieldsConsumer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingStoredFieldsConsumer.java b/lucene/core/src/java/org/apache/lucene/index/SortingStoredFieldsConsumer.java
new file mode 100644
index 0000000..b3cc1f4
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/index/SortingStoredFieldsConsumer.java
@@ -0,0 +1,206 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.index;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+import java.util.Map;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.codecs.StoredFieldsReader;
+import org.apache.lucene.codecs.StoredFieldsWriter;
+import org.apache.lucene.document.StoredField;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IOUtils;
+
+final class SortingStoredFieldsConsumer extends StoredFieldsConsumer {
+ TrackingTmpOutputDirectoryWrapper tmpDirectory;
+
+ SortingStoredFieldsConsumer(DocumentsWriterPerThread docWriter) {
+ super(docWriter);
+ }
+
+ @Override
+ protected void initStoredFieldsWriter() throws IOException {
+ if (writer == null) {
+ this.tmpDirectory = new TrackingTmpOutputDirectoryWrapper(docWriter.directory);
+ this.writer = docWriter.codec.storedFieldsFormat().fieldsWriter(tmpDirectory, docWriter.getSegmentInfo(),
+ IOContext.DEFAULT);
+ }
+ }
+
+ @Override
+ void flush(SegmentWriteState state, Sorter.DocMap sortMap) throws IOException {
+ super.flush(state, sortMap);
+ if (sortMap == null) {
+ // we're lucky the index is already sorted, just rename the temporary file and return
+ for (Map.Entry<String, String> entry : tmpDirectory.getTemporaryFiles().entrySet()) {
+ tmpDirectory.rename(entry.getValue(), entry.getKey());
+ }
+ return;
+ }
+ StoredFieldsReader reader = docWriter.codec.storedFieldsFormat()
+ .fieldsReader(tmpDirectory, state.segmentInfo, state.fieldInfos, IOContext.DEFAULT);
+ StoredFieldsReader mergeReader = reader.getMergeInstance();
+ StoredFieldsWriter sortWriter = docWriter.codec.storedFieldsFormat()
+ .fieldsWriter(state.directory, state.segmentInfo, IOContext.DEFAULT);
+ try {
+ reader.checkIntegrity();
+ CopyVisitor visitor = new CopyVisitor(sortWriter);
+ for (int docID = 0; docID < state.segmentInfo.maxDoc(); docID++) {
+ sortWriter.startDocument();
+ mergeReader.visitDocument(sortMap.newToOld(docID), visitor);
+ sortWriter.finishDocument();
+ }
+ sortWriter.finish(state.fieldInfos, state.segmentInfo.maxDoc());
+ } finally {
+ IOUtils.close(reader, sortWriter);
+ IOUtils.deleteFiles(tmpDirectory,
+ tmpDirectory.getTemporaryFiles().values());
+ }
+ }
+
+ @Override
+ void abort() {
+ try {
+ super.abort();
+ } finally {
+ IOUtils.deleteFilesIgnoringExceptions(tmpDirectory,
+ tmpDirectory.getTemporaryFiles().values());
+ }
+ }
+
+ /**
+ * A visitor that copies every field it sees in the provided {@link StoredFieldsWriter}.
+ */
+ private static class CopyVisitor extends StoredFieldVisitor implements IndexableField {
+ final StoredFieldsWriter writer;
+ BytesRef binaryValue;
+ String stringValue;
+ Number numericValue;
+ FieldInfo currentField;
+
+
+ CopyVisitor(StoredFieldsWriter writer) {
+ this.writer = writer;
+ }
+
+ @Override
+ public void binaryField(FieldInfo fieldInfo, byte[] value) throws IOException {
+ reset(fieldInfo);
+ // TODO: can we avoid new BR here?
+ binaryValue = new BytesRef(value);
+ write();
+ }
+
+ @Override
+ public void stringField(FieldInfo fieldInfo, byte[] value) throws IOException {
+ reset(fieldInfo);
+ // TODO: can we avoid new String here?
+ stringValue = new String(value, StandardCharsets.UTF_8);
+ write();
+ }
+
+ @Override
+ public void intField(FieldInfo fieldInfo, int value) throws IOException {
+ reset(fieldInfo);
+ numericValue = value;
+ write();
+ }
+
+ @Override
+ public void longField(FieldInfo fieldInfo, long value) throws IOException {
+ reset(fieldInfo);
+ numericValue = value;
+ write();
+ }
+
+ @Override
+ public void floatField(FieldInfo fieldInfo, float value) throws IOException {
+ reset(fieldInfo);
+ numericValue = value;
+ write();
+ }
+
+ @Override
+ public void doubleField(FieldInfo fieldInfo, double value) throws IOException {
+ reset(fieldInfo);
+ numericValue = value;
+ write();
+ }
+
+ @Override
+ public Status needsField(FieldInfo fieldInfo) throws IOException {
+ return Status.YES;
+ }
+
+ @Override
+ public String name() {
+ return currentField.name;
+ }
+
+ @Override
+ public IndexableFieldType fieldType() {
+ return StoredField.TYPE;
+ }
+
+ @Override
+ public BytesRef binaryValue() {
+ return binaryValue;
+ }
+
+ @Override
+ public String stringValue() {
+ return stringValue;
+ }
+
+ @Override
+ public Number numericValue() {
+ return numericValue;
+ }
+
+ @Override
+ public Reader readerValue() {
+ return null;
+ }
+
+ @Override
+ public float boost() {
+ return 1F;
+ }
+
+ @Override
+ public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
+ return null;
+ }
+
+ void reset(FieldInfo field) {
+ currentField = field;
+ binaryValue = null;
+ stringValue = null;
+ numericValue = null;
+ }
+
+ void write() throws IOException {
+ writer.writeField(currentField, this);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7d96f9f7/lucene/core/src/java/org/apache/lucene/index/SortingTermVectorsConsumer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingTermVectorsConsumer.java b/lucene/core/src/java/org/apache/lucene/index/SortingTermVectorsConsumer.java
new file mode 100644
index 0000000..dff808e
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/index/SortingTermVectorsConsumer.java
@@ -0,0 +1,181 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.index;
+
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.Map;
+
+import org.apache.lucene.codecs.TermVectorsReader;
+import org.apache.lucene.codecs.TermVectorsWriter;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.store.FlushInfo;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IOUtils;
+
+final class SortingTermVectorsConsumer extends TermVectorsConsumer {
+ TrackingTmpOutputDirectoryWrapper tmpDirectory;
+
+ public SortingTermVectorsConsumer(DocumentsWriterPerThread docWriter) {
+ super(docWriter);
+ }
+
+ @Override
+ void flush(Map<String, TermsHashPerField> fieldsToFlush, final SegmentWriteState state, Sorter.DocMap sortMap) throws IOException {
+ super.flush(fieldsToFlush, state, sortMap);
+ if (tmpDirectory != null) {
+ if (sortMap == null) {
+ // we're lucky the index is already sorted, just rename the temporary file and return
+ for (Map.Entry<String, String> entry : tmpDirectory.getTemporaryFiles().entrySet()) {
+ tmpDirectory.rename(entry.getValue(), entry.getKey());
+ }
+ return;
+ }
+ TermVectorsReader reader = docWriter.codec.termVectorsFormat()
+ .vectorsReader(tmpDirectory, state.segmentInfo, state.fieldInfos, IOContext.DEFAULT);
+ TermVectorsReader mergeReader = reader.getMergeInstance();
+ TermVectorsWriter writer = docWriter.codec.termVectorsFormat()
+ .vectorsWriter(state.directory, state.segmentInfo, IOContext.DEFAULT);
+ try {
+ reader.checkIntegrity();
+ for (int docID = 0; docID < state.segmentInfo.maxDoc(); docID++) {
+ Fields vectors = mergeReader.get(sortMap.newToOld(docID));
+ writeTermVectors(writer, vectors, state.fieldInfos);
+ }
+ writer.finish(state.fieldInfos, state.segmentInfo.maxDoc());
+ } finally {
+ IOUtils.close(reader, writer);
+ IOUtils.deleteFiles(tmpDirectory,
+ tmpDirectory.getTemporaryFiles().values());
+ }
+ }
+ }
+
+ @Override
+ void initTermVectorsWriter() throws IOException {
+ if (writer == null) {
+ IOContext context = new IOContext(new FlushInfo(docWriter.getNumDocsInRAM(), docWriter.bytesUsed()));
+ tmpDirectory = new TrackingTmpOutputDirectoryWrapper(docWriter.directory);
+ writer = docWriter.codec.termVectorsFormat().vectorsWriter(tmpDirectory, docWriter.getSegmentInfo(), context);
+ lastDocID = 0;
+ }
+ }
+
+ @Override
+ public void abort() {
+ try {
+ super.abort();
+ } finally {
+ IOUtils.deleteFilesIgnoringExceptions(tmpDirectory,
+ tmpDirectory.getTemporaryFiles().values());
+ }
+ }
+
+ /** Safe (but, slowish) default method to copy every vector field in the provided {@link TermVectorsWriter}. */
+ private static void writeTermVectors(TermVectorsWriter writer, Fields vectors, FieldInfos fieldInfos) throws IOException {
+ if (vectors == null) {
+ writer.startDocument(0);
+ writer.finishDocument();
+ return;
+ }
+
+ int numFields = vectors.size();
+ if (numFields == -1) {
+ // count manually! TODO: Maybe enforce that Fields.size() returns something valid?
+ numFields = 0;
+ for (final Iterator<String> it = vectors.iterator(); it.hasNext(); ) {
+ it.next();
+ numFields++;
+ }
+ }
+ writer.startDocument(numFields);
+
+ String lastFieldName = null;
+
+ TermsEnum termsEnum = null;
+ PostingsEnum docsAndPositionsEnum = null;
+
+ int fieldCount = 0;
+ for(String fieldName : vectors) {
+ fieldCount++;
+ final FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldName);
+
+ assert lastFieldName == null || fieldName.compareTo(lastFieldName) > 0: "lastFieldName=" + lastFieldName + " fieldName=" + fieldName;
+ lastFieldName = fieldName;
+
+ final Terms terms = vectors.terms(fieldName);
+ if (terms == null) {
+ // FieldsEnum shouldn't lie...
+ continue;
+ }
+
+ final boolean hasPositions = terms.hasPositions();
+ final boolean hasOffsets = terms.hasOffsets();
+ final boolean hasPayloads = terms.hasPayloads();
+ assert !hasPayloads || hasPositions;
+
+ int numTerms = (int) terms.size();
+ if (numTerms == -1) {
+ // count manually. It is stupid, but needed, as Terms.size() is not a mandatory statistics function
+ numTerms = 0;
+ termsEnum = terms.iterator();
+ while(termsEnum.next() != null) {
+ numTerms++;
+ }
+ }
+
+ writer.startField(fieldInfo, numTerms, hasPositions, hasOffsets, hasPayloads);
+ termsEnum = terms.iterator();
+
+ int termCount = 0;
+ while(termsEnum.next() != null) {
+ termCount++;
+
+ final int freq = (int) termsEnum.totalTermFreq();
+
+ writer.startTerm(termsEnum.term(), freq);
+
+ if (hasPositions || hasOffsets) {
+ docsAndPositionsEnum = termsEnum.postings(docsAndPositionsEnum, PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS);
+ assert docsAndPositionsEnum != null;
+
+ final int docID = docsAndPositionsEnum.nextDoc();
+ assert docID != DocIdSetIterator.NO_MORE_DOCS;
+ assert docsAndPositionsEnum.freq() == freq;
+
+ for(int posUpto=0; posUpto<freq; posUpto++) {
+ final int pos = docsAndPositionsEnum.nextPosition();
+ final int startOffset = docsAndPositionsEnum.startOffset();
+ final int endOffset = docsAndPositionsEnum.endOffset();
+
+ final BytesRef payload = docsAndPositionsEnum.getPayload();
+
+ assert !hasPositions || pos >= 0 ;
+ writer.addPosition(pos, startOffset, endOffset, payload);
+ }
+ }
+ writer.finishTerm();
+ }
+ assert termCount == numTerms;
+ writer.finishField();
+ }
+ assert fieldCount == numFields;
+ writer.finishDocument();
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7d96f9f7/lucene/core/src/java/org/apache/lucene/index/StoredFieldsConsumer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/StoredFieldsConsumer.java b/lucene/core/src/java/org/apache/lucene/index/StoredFieldsConsumer.java
new file mode 100644
index 0000000..56b3e4d
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/index/StoredFieldsConsumer.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.index;
+
+import java.io.IOException;
+
+import org.apache.lucene.codecs.StoredFieldsWriter;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.util.IOUtils;
+
+class StoredFieldsConsumer {
+ final DocumentsWriterPerThread docWriter;
+ StoredFieldsWriter writer;
+ int lastDoc;
+
+ StoredFieldsConsumer(DocumentsWriterPerThread docWriter) {
+ this.docWriter = docWriter;
+ this.lastDoc = -1;
+ }
+
+ protected void initStoredFieldsWriter() throws IOException {
+ if (writer == null) {
+ this.writer =
+ docWriter.codec.storedFieldsFormat().fieldsWriter(docWriter.directory, docWriter.getSegmentInfo(),
+ IOContext.DEFAULT);
+ }
+ }
+
+ void startDocument(int docID) throws IOException {
+ assert lastDoc < docID;
+ initStoredFieldsWriter();
+ while (++lastDoc < docID) {
+ writer.startDocument();
+ writer.finishDocument();
+ }
+ writer.startDocument();
+ }
+
+ void writeField(FieldInfo info, IndexableField field) throws IOException {
+ writer.writeField(info, field);
+ }
+
+ void finishDocument() throws IOException {
+ writer.finishDocument();
+ }
+
+ void finish(int maxDoc) throws IOException {
+ while (lastDoc < maxDoc-1) {
+ startDocument(lastDoc);
+ finishDocument();
+ ++lastDoc;
+ }
+ }
+
+ void flush(SegmentWriteState state, Sorter.DocMap sortMap) throws IOException {
+ try {
+ writer.finish(state.fieldInfos, state.segmentInfo.maxDoc());
+ } finally {
+ IOUtils.close(writer);
+ writer = null;
+ }
+ }
+
+ void abort() {
+ if (writer != null) {
+ IOUtils.closeWhileHandlingException(writer);
+ writer = null;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7d96f9f7/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumer.java b/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumer.java
index da49a8b..46dc63c 100644
--- a/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumer.java
+++ b/lucene/core/src/java/org/apache/lucene/index/TermVectorsConsumer.java
@@ -29,8 +29,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;
-final class TermVectorsConsumer extends TermsHash {
-
+class TermVectorsConsumer extends TermsHash {
TermVectorsWriter writer;
/** Scratch term used by TermVectorsConsumerPerField.finishDocument. */
@@ -54,7 +53,7 @@ final class TermVectorsConsumer extends TermsHash {
}
@Override
- void flush(Map<String, TermsHashPerField> fieldsToFlush, final SegmentWriteState state) throws IOException {
+ void flush(Map<String, TermsHashPerField> fieldsToFlush, final SegmentWriteState state, Sorter.DocMap sortMap) throws IOException {
if (writer != null) {
int numDocs = state.segmentInfo.maxDoc();
assert numDocs > 0;
@@ -82,7 +81,7 @@ final class TermVectorsConsumer extends TermsHash {
}
}
- private void initTermVectorsWriter() throws IOException {
+ void initTermVectorsWriter() throws IOException {
if (writer == null) {
IOContext context = new IOContext(new FlushInfo(docWriter.getNumDocsInRAM(), docWriter.bytesUsed()));
writer = docWriter.codec.termVectorsFormat().vectorsWriter(docWriter.directory, docWriter.getSegmentInfo(), context);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7d96f9f7/lucene/core/src/java/org/apache/lucene/index/TermsHash.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/TermsHash.java b/lucene/core/src/java/org/apache/lucene/index/TermsHash.java
index fb5c78c..bede2f8 100644
--- a/lucene/core/src/java/org/apache/lucene/index/TermsHash.java
+++ b/lucene/core/src/java/org/apache/lucene/index/TermsHash.java
@@ -76,13 +76,13 @@ abstract class TermsHash {
bytePool.reset(false, false);
}
- void flush(Map<String,TermsHashPerField> fieldsToFlush, final SegmentWriteState state) throws IOException {
+ void flush(Map<String,TermsHashPerField> fieldsToFlush, final SegmentWriteState state, Sorter.DocMap sortMap) throws IOException {
if (nextTermsHash != null) {
Map<String,TermsHashPerField> nextChildFields = new HashMap<>();
for (final Map.Entry<String,TermsHashPerField> entry : fieldsToFlush.entrySet()) {
nextChildFields.put(entry.getKey(), entry.getValue().nextPerField);
}
- nextTermsHash.flush(nextChildFields, state);
+ nextTermsHash.flush(nextChildFields, state, sortMap);
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7d96f9f7/lucene/core/src/java/org/apache/lucene/index/TrackingTmpOutputDirectoryWrapper.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/TrackingTmpOutputDirectoryWrapper.java b/lucene/core/src/java/org/apache/lucene/index/TrackingTmpOutputDirectoryWrapper.java
new file mode 100644
index 0000000..fd071c9
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/index/TrackingTmpOutputDirectoryWrapper.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.index;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FilterDirectory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+
+final class TrackingTmpOutputDirectoryWrapper extends FilterDirectory {
+ private final Map<String, String> fileNames = new HashMap<>();
+
+ TrackingTmpOutputDirectoryWrapper(Directory in) {
+ super(in);
+ }
+
+ @Override
+ public IndexOutput createOutput(String name, IOContext context) throws IOException {
+ IndexOutput output = super.createTempOutput(name, "", context);
+ fileNames.put(name, output.getName());
+ return output;
+ }
+
+ @Override
+ public IndexInput openInput(String name, IOContext context) throws IOException {
+ String tmpName = fileNames.get(name);
+ return super.openInput(tmpName, context);
+ }
+
+ public Map<String, String> getTemporaryFiles() {
+ return fileNames;
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7d96f9f7/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java
index 4f0d701..f1fdfbf 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java
@@ -1445,8 +1445,6 @@ public class TestIndexSorting extends LuceneTestCase {
SegmentInfo info = leaf.getSegmentInfo().info;
switch (info.getDiagnostics().get(IndexWriter.SOURCE)) {
case IndexWriter.SOURCE_FLUSH:
- assertNull(info.getIndexSort());
- break;
case IndexWriter.SOURCE_MERGE:
assertEquals(indexSort, info.getIndexSort());
final NumericDocValues values = leaf.getNumericDocValues("foo");
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7d96f9f7/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingLiveDocsFormat.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingLiveDocsFormat.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingLiveDocsFormat.java
index 4837513..f393f18 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingLiveDocsFormat.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingLiveDocsFormat.java
@@ -68,8 +68,13 @@ public class AssertingLiveDocsFormat extends LiveDocsFormat {
@Override
public void writeLiveDocs(MutableBits bits, Directory dir, SegmentCommitInfo info, int newDelCount, IOContext context) throws IOException {
- assert bits instanceof AssertingMutableBits;
- MutableBits raw = (MutableBits) ((AssertingMutableBits)bits).in;
+ MutableBits raw = bits;
+ /**
+ * bits is not necessarily an AssertingMutableBits because index sorting needs to wrap it in a sorted view.
+ */
+ if (bits instanceof AssertingMutableBits) {
+ raw = (MutableBits) ((AssertingMutableBits) bits).in;
+ }
check(raw, info.info.maxDoc(), info.getDelCount() + newDelCount);
in.writeLiveDocs(raw, dir, info, newDelCount, context);
}
[3/3] lucene-solr:branch_6x: LUCENE-7579: sort segments at flush too
Posted by ji...@apache.org.
LUCENE-7579: sort segments at flush too
Segments are now also sorted during flush, and merging
on a sorted index is substantially faster by using some of the same
bulk merge optimizations that non-sorted merging uses
(cherry picked from commit 4ccb9fb)
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/8f5b5a39
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/8f5b5a39
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/8f5b5a39
Branch: refs/heads/branch_6x
Commit: 8f5b5a393d94500e6c7a8beff54e010c45c3b0e3
Parents: 5d0f90a 7d96f9f
Author: Jim Ferenczi <ji...@elastic.co>
Authored: Tue Jan 17 14:00:09 2017 +0100
Committer: Jim Ferenczi <ji...@elastic.co>
Committed: Tue Jan 17 14:00:09 2017 +0100
----------------------------------------------------------------------
lucene/CHANGES.txt | 5 +
.../CompressingStoredFieldsWriter.java | 67 +++++-
.../lucene50/Lucene50StoredFieldsFormat.java | 2 +-
.../codecs/lucene60/Lucene60PointsWriter.java | 11 +-
.../lucene/index/BinaryDocValuesWriter.java | 97 +++++++-
.../lucene/index/DefaultIndexingChain.java | 125 ++++++----
.../org/apache/lucene/index/DocConsumer.java | 2 +-
.../apache/lucene/index/DocValuesWriter.java | 4 +-
.../lucene/index/DocumentsWriterPerThread.java | 31 ++-
.../lucene/index/FreqProxTermsWriter.java | 8 +-
.../org/apache/lucene/index/IndexWriter.java | 10 +-
.../org/apache/lucene/index/MergeState.java | 8 +-
.../apache/lucene/index/NormValuesWriter.java | 54 ++++-
.../lucene/index/NumericDocValuesWriter.java | 142 +++++++++++-
.../apache/lucene/index/PointValuesWriter.java | 110 ++++++++-
.../lucene/index/SortedDocValuesWriter.java | 119 ++++++++--
.../index/SortedNumericDocValuesWriter.java | 163 ++++++++++++-
.../lucene/index/SortedSetDocValuesWriter.java | 227 +++++++++++++++++--
.../java/org/apache/lucene/index/Sorter.java | 4 +-
.../apache/lucene/index/SortingLeafReader.java | 6 +-
.../index/SortingStoredFieldsConsumer.java | 206 +++++++++++++++++
.../index/SortingTermVectorsConsumer.java | 181 +++++++++++++++
.../lucene/index/StoredFieldsConsumer.java | 85 +++++++
.../lucene/index/TermVectorsConsumer.java | 7 +-
.../java/org/apache/lucene/index/TermsHash.java | 4 +-
.../TrackingTmpOutputDirectoryWrapper.java | 53 +++++
.../apache/lucene/index/TestIndexSorting.java | 2 -
27 files changed, 1570 insertions(+), 163 deletions(-)
----------------------------------------------------------------------
[2/3] lucene-solr:branch_6x: LUCENE-7579: sort segments at flush too
Posted by ji...@apache.org.
LUCENE-7579: sort segments at flush too
Segments are now also sorted during flush, and merging
on a sorted index is substantially faster by using some of the same
bulk merge optimizations that non-sorted merging uses
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/7d96f9f7
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/7d96f9f7
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/7d96f9f7
Branch: refs/heads/branch_6x
Commit: 7d96f9f7981dbadda837b5b2cacc3855d19f71aa
Parents: 5d0f90a
Author: Mike McCandless <mi...@apache.org>
Authored: Tue Dec 20 06:45:06 2016 -0500
Committer: Jim Ferenczi <ji...@elastic.co>
Committed: Tue Jan 17 13:09:55 2017 +0100
----------------------------------------------------------------------
lucene/CHANGES.txt | 5 +
.../CompressingStoredFieldsWriter.java | 67 +++++-
.../lucene50/Lucene50StoredFieldsFormat.java | 2 +-
.../codecs/lucene60/Lucene60PointsWriter.java | 11 +-
.../lucene/index/BinaryDocValuesWriter.java | 97 +++++++-
.../lucene/index/DefaultIndexingChain.java | 125 ++++++----
.../org/apache/lucene/index/DocConsumer.java | 2 +-
.../apache/lucene/index/DocValuesWriter.java | 4 +-
.../lucene/index/DocumentsWriterPerThread.java | 31 ++-
.../lucene/index/FreqProxTermsWriter.java | 8 +-
.../org/apache/lucene/index/IndexWriter.java | 10 +-
.../org/apache/lucene/index/MergeState.java | 8 +-
.../apache/lucene/index/NormValuesWriter.java | 54 ++++-
.../lucene/index/NumericDocValuesWriter.java | 142 +++++++++++-
.../apache/lucene/index/PointValuesWriter.java | 110 ++++++++-
.../lucene/index/SortedDocValuesWriter.java | 119 ++++++++--
.../index/SortedNumericDocValuesWriter.java | 163 ++++++++++++-
.../lucene/index/SortedSetDocValuesWriter.java | 227 +++++++++++++++++--
.../java/org/apache/lucene/index/Sorter.java | 4 +-
.../apache/lucene/index/SortingLeafReader.java | 6 +-
.../index/SortingStoredFieldsConsumer.java | 206 +++++++++++++++++
.../index/SortingTermVectorsConsumer.java | 181 +++++++++++++++
.../lucene/index/StoredFieldsConsumer.java | 85 +++++++
.../lucene/index/TermVectorsConsumer.java | 7 +-
.../java/org/apache/lucene/index/TermsHash.java | 4 +-
.../TrackingTmpOutputDirectoryWrapper.java | 53 +++++
.../apache/lucene/index/TestIndexSorting.java | 2 -
.../asserting/AssertingLiveDocsFormat.java | 9 +-
28 files changed, 1577 insertions(+), 165 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7d96f9f7/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 4f6a0e0..8b154ce 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -203,6 +203,11 @@ Optimizations
* LUCENE-7572: Doc values queries now cache their hash code. (Adrien Grand)
+* LUCENE-7579: Segments are now also sorted during flush, and merging
+ on a sorted index is substantially faster by using some of the same
+ bulk merge optimizations that non-sorted merging uses (Jim Ferenczi
+ via Mike McCandless)
+
Other
* LUCENE-7546: Fixed references to benchmark wikipedia data and the Jenkins line-docs file
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7d96f9f7/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java
index cda855d..7f6c74e 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java
@@ -18,13 +18,16 @@ package org.apache.lucene.codecs.compressing;
import java.io.IOException;
+import java.util.ArrayList;
import java.util.Arrays;
+import java.util.List;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.StoredFieldsReader;
import org.apache.lucene.codecs.StoredFieldsWriter;
import org.apache.lucene.codecs.compressing.CompressingStoredFieldsReader.SerializedDocument;
import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.DocIDMerger;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
@@ -44,6 +47,8 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.PackedInts;
+import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
+
/**
* {@link StoredFieldsWriter} impl for {@link CompressingStoredFieldsFormat}.
* @lucene.experimental
@@ -487,16 +492,44 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
@Override
public int merge(MergeState mergeState) throws IOException {
- if (mergeState.needsIndexSort) {
- // TODO: can we gain back some optos even if index is sorted? E.g. if sort results in large chunks of contiguous docs from one sub
- // being copied over...?
- return super.merge(mergeState);
- }
-
int docCount = 0;
int numReaders = mergeState.maxDocs.length;
MatchingReaders matching = new MatchingReaders(mergeState);
+ if (mergeState.needsIndexSort) {
+ /**
+ * If all readers are compressed and they have the same fieldinfos then we can merge the serialized document
+ * directly.
+ */
+ List<CompressingStoredFieldsMergeSub> subs = new ArrayList<>();
+ for(int i=0;i<mergeState.storedFieldsReaders.length;i++) {
+ if (matching.matchingReaders[i] &&
+ mergeState.storedFieldsReaders[i] instanceof CompressingStoredFieldsReader) {
+ CompressingStoredFieldsReader storedFieldsReader = (CompressingStoredFieldsReader) mergeState.storedFieldsReaders[i];
+ storedFieldsReader.checkIntegrity();
+ subs.add(new CompressingStoredFieldsMergeSub(storedFieldsReader, mergeState.docMaps[i], mergeState.maxDocs[i]));
+ } else {
+ return super.merge(mergeState);
+ }
+ }
+
+ final DocIDMerger<CompressingStoredFieldsMergeSub> docIDMerger = DocIDMerger.of(subs, true);
+ while (true) {
+ CompressingStoredFieldsMergeSub sub = docIDMerger.next();
+ if (sub == null) {
+ break;
+ }
+ assert sub.mappedDocID == docCount;
+ SerializedDocument doc = sub.reader.document(sub.docID);
+ startDocument();
+ bufferedDocs.copyBytes(doc.in, doc.length);
+ numStoredFieldsInDoc = doc.numStoredFields;
+ finishDocument();
+ ++docCount;
+ }
+ finish(mergeState.mergeFieldInfos, docCount);
+ return docCount;
+ }
for (int readerIndex=0;readerIndex<numReaders;readerIndex++) {
MergeVisitor visitor = new MergeVisitor(mergeState, readerIndex);
@@ -630,4 +663,26 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
return candidate.getNumDirtyChunks() > 1024 ||
candidate.getNumDirtyChunks() * 100 > candidate.getNumChunks();
}
+
+ private static class CompressingStoredFieldsMergeSub extends DocIDMerger.Sub {
+ private final CompressingStoredFieldsReader reader;
+ private final int maxDoc;
+ int docID = -1;
+
+ public CompressingStoredFieldsMergeSub(CompressingStoredFieldsReader reader, MergeState.DocMap docMap, int maxDoc) {
+ super(docMap);
+ this.maxDoc = maxDoc;
+ this.reader = reader;
+ }
+
+ @Override
+ public int nextDoc() {
+ docID++;
+ if (docID == maxDoc) {
+ return NO_MORE_DOCS;
+ } else {
+ return docID;
+ }
+ }
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7d96f9f7/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java
index 10f8a69..fdfba5b 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java
@@ -176,7 +176,7 @@ public final class Lucene50StoredFieldsFormat extends StoredFieldsFormat {
@Override
public StoredFieldsWriter fieldsWriter(Directory directory, SegmentInfo si, IOContext context) throws IOException {
String previous = si.putAttribute(MODE_KEY, mode.name());
- if (previous != null) {
+ if (previous != null && previous.equals(mode.name()) == false) {
throw new IllegalStateException("found existing value for " + MODE_KEY + " for segment: " + si.name +
"old=" + previous + ", new=" + mode.name());
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7d96f9f7/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsWriter.java
index 1535c7d..41e029c 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60PointsWriter.java
@@ -132,13 +132,10 @@ public class Lucene60PointsWriter extends PointsWriter implements Closeable {
@Override
public void merge(MergeState mergeState) throws IOException {
- if (mergeState.needsIndexSort) {
- // TODO: can we gain back some optos even if index is sorted? E.g. if sort results in large chunks of contiguous docs from one sub
- // being copied over...?
- super.merge(mergeState);
- return;
- }
-
+ /**
+ * If indexSort is activated and some of the leaves are not sorted the next test will catch that and the non-optimized merge will run.
+ * If the readers are all sorted then it's safe to perform a bulk merge of the points.
+ **/
for(PointsReader reader : mergeState.pointsReaders) {
if (reader instanceof Lucene60PointsReader == false) {
// We can only bulk merge when all to-be-merged segments use our format:
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7d96f9f7/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java
index 03d9ff3..432285d 100644
--- a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java
@@ -22,7 +22,7 @@ import java.util.Iterator;
import java.util.NoSuchElementException;
import org.apache.lucene.codecs.DocValuesConsumer;
-import org.apache.lucene.store.DataInput;
+import org.apache.lucene.search.SortField;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
@@ -107,18 +107,43 @@ class BinaryDocValuesWriter extends DocValuesWriter {
@Override
public void finish(int maxDoc) {
+
+ }
+
+ @Override
+ Sorter.DocComparator getDocComparator(int numDoc, SortField sortField) throws IOException {
+ throw new IllegalArgumentException("It is forbidden to sort on a binary field");
}
@Override
- public void flush(SegmentWriteState state, DocValuesConsumer dvConsumer) throws IOException {
+ public void flush(SegmentWriteState state, Sorter.DocMap sortMap, DocValuesConsumer dvConsumer) throws IOException {
final int maxDoc = state.segmentInfo.maxDoc();
bytes.freeze(false);
final PackedLongValues lengths = this.lengths.build();
+
+ final long[] starts;
+ if (sortMap != null) {
+ starts = new long[maxDoc];
+ PackedLongValues.Iterator it = lengths.iterator();
+ long ptr = 0;
+ int doc = 0;
+ while (it.hasNext()) {
+ starts[doc++] = ptr;
+ ptr += it.next();
+ }
+ } else {
+ starts = null;
+ }
+
dvConsumer.addBinaryField(fieldInfo,
new Iterable<BytesRef>() {
@Override
public Iterator<BytesRef> iterator() {
- return new BytesIterator(maxDoc, lengths);
+ if (sortMap == null) {
+ return new BytesIterator(maxDoc, lengths);
+ } else {
+ return new SortingBytesIterator(maxDoc, lengths, sortMap, starts);
+ }
}
});
}
@@ -127,7 +152,7 @@ class BinaryDocValuesWriter extends DocValuesWriter {
private class BytesIterator implements Iterator<BytesRef> {
final BytesRefBuilder value = new BytesRefBuilder();
final PackedLongValues.Iterator lengthsIterator;
- final DataInput bytesIterator = bytes.getDataInput();
+ final PagedBytes.PagedBytesDataInput bytesIterator = (PagedBytes.PagedBytesDataInput) bytes.getDataInput();
final int size = (int) lengths.size();
final int maxDoc;
int upto;
@@ -149,15 +174,67 @@ class BinaryDocValuesWriter extends DocValuesWriter {
}
final BytesRef v;
if (upto < size) {
- int length = (int) lengthsIterator.next();
+ final int length = (int) lengthsIterator.next();
value.grow(length);
value.setLength(length);
- try {
- bytesIterator.readBytes(value.bytes(), 0, value.length());
- } catch (IOException ioe) {
- // Should never happen!
- throw new RuntimeException(ioe);
+ bytesIterator.readBytes(value.bytes(), 0, value.length());
+ if (docsWithField.get(upto)) {
+ v = value.get();
+ } else {
+ v = null;
}
+ } else {
+ v = null;
+ }
+ upto++;
+ return v;
+ }
+
+ @Override
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ }
+
+ // sort the values we have in ram according to the provided sort map
+ private class SortingBytesIterator implements Iterator<BytesRef> {
+ final BytesRefBuilder value = new BytesRefBuilder();
+ final PackedLongValues values;
+ final PackedLongValues.Iterator lengthsIterator;
+ final long[] starts;
+ final PagedBytes.PagedBytesDataInput bytesIterator = (PagedBytes.PagedBytesDataInput) bytes.getDataInput();
+ final Sorter.DocMap sortMap;
+ final int size = (int) lengths.size();
+ final int maxDoc;
+ int upto;
+
+ SortingBytesIterator(int maxDoc, PackedLongValues lengths, Sorter.DocMap sortMap, long[] starts) {
+ this.maxDoc = maxDoc;
+ this.lengthsIterator = lengths.iterator();
+ this.values = lengths;
+ this.sortMap = sortMap;
+ this.starts = starts;
+ }
+
+ @Override
+ public boolean hasNext() {
+ return upto < maxDoc;
+ }
+
+ @Override
+ public BytesRef next() {
+ if (!hasNext()) {
+ throw new NoSuchElementException();
+ }
+ final BytesRef v;
+ if (upto < size) {
+ int oldID = sortMap.newToOld(upto);
+ int length = (int) values.get(oldID);
+ long pos = starts[oldID];
+ bytesIterator.setPosition(pos);
+ value.grow(length);
+ value.setLength(length);
+ bytesIterator.readBytes(value.bytes(), 0, value.length());
if (docsWithField.get(upto)) {
v = value.get();
} else {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7d96f9f7/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
index d792111..29bf40d 100644
--- a/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
+++ b/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
@@ -18,9 +18,13 @@ package org.apache.lucene.index;
import java.io.IOException;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
import java.util.Map;
+import java.util.Set;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.codecs.DocValuesConsumer;
@@ -29,8 +33,9 @@ import org.apache.lucene.codecs.NormsConsumer;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PointsFormat;
import org.apache.lucene.codecs.PointsWriter;
-import org.apache.lucene.codecs.StoredFieldsWriter;
import org.apache.lucene.document.FieldType;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.ArrayUtil;
@@ -50,10 +55,8 @@ final class DefaultIndexingChain extends DocConsumer {
// Writes postings and term vectors:
final TermsHash termsHash;
-
- // lazy init:
- private StoredFieldsWriter storedFieldsWriter;
- private int lastStoredDocID;
+ // Writes stored fields
+ final StoredFieldsConsumer storedFieldsConsumer;
// NOTE: I tried using Hash Map<String,PerField>
// but it was ~2% slower on Wiki and Geonames with Java
@@ -67,54 +70,91 @@ final class DefaultIndexingChain extends DocConsumer {
// Holds fields seen in each document
private PerField[] fields = new PerField[1];
+ private final Set<String> finishedDocValues = new HashSet<>();
+
public DefaultIndexingChain(DocumentsWriterPerThread docWriter) throws IOException {
this.docWriter = docWriter;
this.fieldInfos = docWriter.getFieldInfosBuilder();
this.docState = docWriter.docState;
this.bytesUsed = docWriter.bytesUsed;
- TermsHash termVectorsWriter = new TermVectorsConsumer(docWriter);
+ final TermsHash termVectorsWriter;
+ if (docWriter.getSegmentInfo().getIndexSort() == null) {
+ storedFieldsConsumer = new StoredFieldsConsumer(docWriter);
+ termVectorsWriter = new TermVectorsConsumer(docWriter);
+ } else {
+ storedFieldsConsumer = new SortingStoredFieldsConsumer(docWriter);
+ termVectorsWriter = new SortingTermVectorsConsumer(docWriter);
+ }
termsHash = new FreqProxTermsWriter(docWriter, termVectorsWriter);
}
-
- // TODO: can we remove this lazy-init / make cleaner / do it another way...?
- private void initStoredFieldsWriter() throws IOException {
- if (storedFieldsWriter == null) {
- storedFieldsWriter = docWriter.codec.storedFieldsFormat().fieldsWriter(docWriter.directory, docWriter.getSegmentInfo(), IOContext.DEFAULT);
+
+ private Sorter.DocMap maybeSortSegment(SegmentWriteState state) throws IOException {
+ Sort indexSort = state.segmentInfo.getIndexSort();
+ if (indexSort == null) {
+ return null;
+ }
+
+ final List<Sorter.DocComparator> comparators = new ArrayList<>();
+ for (int i = 0; i < indexSort.getSort().length; i++) {
+ SortField sortField = indexSort.getSort()[i];
+ PerField perField = getPerField(sortField.getField());
+ if (perField != null && perField.docValuesWriter != null &&
+ finishedDocValues.contains(perField.fieldInfo.name) == false) {
+ perField.docValuesWriter.finish(state.segmentInfo.maxDoc());
+ Sorter.DocComparator cmp = perField.docValuesWriter.getDocComparator(state.segmentInfo.maxDoc(), sortField);
+ comparators.add(cmp);
+ finishedDocValues.add(perField.fieldInfo.name);
+ } else {
+ // safe to ignore, sort field with no values or already seen before
+ }
}
+ Sorter.DocComparator docComparator = new Sorter.DocComparator() {
+ @Override
+ public int compare(int docID1, int docID2) {
+ for (Sorter.DocComparator comparator : comparators) {
+ int comp = comparator.compare(docID1, docID2);
+ if (comp != 0) {
+ return comp;
+ }
+ }
+ return Integer.compare(docID1, docID2); // docid order tiebreak
+ }
+ };
+ Sorter sorter = new Sorter(indexSort);
+ // returns null if documents are already sorted
+ return sorter.sort(state.segmentInfo.maxDoc(), docComparator);
}
@Override
- public void flush(SegmentWriteState state) throws IOException, AbortingException {
+ public Sorter.DocMap flush(SegmentWriteState state) throws IOException, AbortingException {
// NOTE: caller (DocumentsWriterPerThread) handles
// aborting on any exception from this method
-
+ Sorter.DocMap sortMap = maybeSortSegment(state);
int maxDoc = state.segmentInfo.maxDoc();
long t0 = System.nanoTime();
- writeNorms(state);
+ writeNorms(state, sortMap);
if (docState.infoStream.isEnabled("IW")) {
docState.infoStream.message("IW", ((System.nanoTime()-t0)/1000000) + " msec to write norms");
}
t0 = System.nanoTime();
- writeDocValues(state);
+ writeDocValues(state, sortMap);
if (docState.infoStream.isEnabled("IW")) {
docState.infoStream.message("IW", ((System.nanoTime()-t0)/1000000) + " msec to write docValues");
}
t0 = System.nanoTime();
- writePoints(state);
+ writePoints(state, sortMap);
if (docState.infoStream.isEnabled("IW")) {
docState.infoStream.message("IW", ((System.nanoTime()-t0)/1000000) + " msec to write points");
}
// it's possible all docs hit non-aborting exceptions...
t0 = System.nanoTime();
- initStoredFieldsWriter();
- fillStoredFields(maxDoc);
- storedFieldsWriter.finish(state.fieldInfos, maxDoc);
- storedFieldsWriter.close();
+ storedFieldsConsumer.finish(maxDoc);
+ storedFieldsConsumer.flush(state, sortMap);
if (docState.infoStream.isEnabled("IW")) {
docState.infoStream.message("IW", ((System.nanoTime()-t0)/1000000) + " msec to finish stored fields");
}
@@ -131,7 +171,7 @@ final class DefaultIndexingChain extends DocConsumer {
}
}
- termsHash.flush(fieldsToFlush, state);
+ termsHash.flush(fieldsToFlush, state, sortMap);
if (docState.infoStream.isEnabled("IW")) {
docState.infoStream.message("IW", ((System.nanoTime()-t0)/1000000) + " msec to write postings and finish vectors");
}
@@ -145,10 +185,12 @@ final class DefaultIndexingChain extends DocConsumer {
if (docState.infoStream.isEnabled("IW")) {
docState.infoStream.message("IW", ((System.nanoTime()-t0)/1000000) + " msec to write fieldInfos");
}
+
+ return sortMap;
}
/** Writes all buffered points. */
- private void writePoints(SegmentWriteState state) throws IOException {
+ private void writePoints(SegmentWriteState state, Sorter.DocMap sortMap) throws IOException {
PointsWriter pointsWriter = null;
boolean success = false;
try {
@@ -169,7 +211,7 @@ final class DefaultIndexingChain extends DocConsumer {
pointsWriter = fmt.fieldsWriter(state);
}
- perField.pointValuesWriter.flush(state, pointsWriter);
+ perField.pointValuesWriter.flush(state, sortMap, pointsWriter);
perField.pointValuesWriter = null;
} else if (perField.fieldInfo.getPointDimensionCount() != 0) {
// BUG
@@ -192,7 +234,7 @@ final class DefaultIndexingChain extends DocConsumer {
}
/** Writes all buffered doc values (called from {@link #flush}). */
- private void writeDocValues(SegmentWriteState state) throws IOException {
+ private void writeDocValues(SegmentWriteState state, Sorter.DocMap sortMap) throws IOException {
int maxDoc = state.segmentInfo.maxDoc();
DocValuesConsumer dvConsumer = null;
boolean success = false;
@@ -211,8 +253,10 @@ final class DefaultIndexingChain extends DocConsumer {
dvConsumer = fmt.fieldsConsumer(state);
}
- perField.docValuesWriter.finish(maxDoc);
- perField.docValuesWriter.flush(state, dvConsumer);
+ if (finishedDocValues.contains(perField.fieldInfo.name) == false) {
+ perField.docValuesWriter.finish(maxDoc);
+ }
+ perField.docValuesWriter.flush(state, sortMap, dvConsumer);
perField.docValuesWriter = null;
} else if (perField.fieldInfo.getDocValuesType() != DocValuesType.NONE) {
// BUG
@@ -246,17 +290,7 @@ final class DefaultIndexingChain extends DocConsumer {
}
}
- /** Catch up for all docs before us that had no stored
- * fields, or hit non-aborting exceptions before writing
- * stored fields. */
- private void fillStoredFields(int docID) throws IOException, AbortingException {
- while (lastStoredDocID < docID) {
- startStoredFields();
- finishStoredFields();
- }
- }
-
- private void writeNorms(SegmentWriteState state) throws IOException {
+ private void writeNorms(SegmentWriteState state, Sorter.DocMap sortMap) throws IOException {
boolean success = false;
NormsConsumer normsConsumer = null;
try {
@@ -274,7 +308,7 @@ final class DefaultIndexingChain extends DocConsumer {
if (fi.omitsNorms() == false && fi.getIndexOptions() != IndexOptions.NONE) {
assert perField.norms != null: "field=" + fi.name;
perField.norms.finish(state.segmentInfo.maxDoc());
- perField.norms.flush(state, normsConsumer);
+ perField.norms.flush(state, sortMap, normsConsumer);
}
}
}
@@ -290,7 +324,7 @@ final class DefaultIndexingChain extends DocConsumer {
@Override
public void abort() {
- IOUtils.closeWhileHandlingException(storedFieldsWriter);
+ storedFieldsConsumer.abort();
try {
// E.g. close any open files in the term vectors writer:
@@ -326,21 +360,19 @@ final class DefaultIndexingChain extends DocConsumer {
/** Calls StoredFieldsWriter.startDocument, aborting the
* segment if it hits any exception. */
- private void startStoredFields() throws IOException, AbortingException {
+ private void startStoredFields(int docID) throws IOException, AbortingException {
try {
- initStoredFieldsWriter();
- storedFieldsWriter.startDocument();
+ storedFieldsConsumer.startDocument(docID);
} catch (Throwable th) {
throw AbortingException.wrap(th);
}
- lastStoredDocID++;
}
/** Calls StoredFieldsWriter.finishDocument, aborting the
* segment if it hits any exception. */
private void finishStoredFields() throws IOException, AbortingException {
try {
- storedFieldsWriter.finishDocument();
+ storedFieldsConsumer.finishDocument();
} catch (Throwable th) {
throw AbortingException.wrap(th);
}
@@ -364,8 +396,7 @@ final class DefaultIndexingChain extends DocConsumer {
termsHash.startDocument();
- fillStoredFields(docState.docID);
- startStoredFields();
+ startStoredFields(docState.docID);
boolean aborting = false;
try {
@@ -435,7 +466,7 @@ final class DefaultIndexingChain extends DocConsumer {
throw new IllegalArgumentException("stored field \"" + field.name() + "\" is too large (" + value.length() + " characters) to store");
}
try {
- storedFieldsWriter.writeField(fp.fieldInfo, field);
+ storedFieldsConsumer.writeField(fp.fieldInfo, field);
} catch (Throwable th) {
throw AbortingException.wrap(th);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7d96f9f7/lucene/core/src/java/org/apache/lucene/index/DocConsumer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/DocConsumer.java b/lucene/core/src/java/org/apache/lucene/index/DocConsumer.java
index c0c7ce1..36766f3 100644
--- a/lucene/core/src/java/org/apache/lucene/index/DocConsumer.java
+++ b/lucene/core/src/java/org/apache/lucene/index/DocConsumer.java
@@ -21,6 +21,6 @@ import java.io.IOException;
abstract class DocConsumer {
abstract void processDocument() throws IOException, AbortingException;
- abstract void flush(final SegmentWriteState state) throws IOException, AbortingException;
+ abstract Sorter.DocMap flush(final SegmentWriteState state) throws IOException, AbortingException;
abstract void abort();
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7d96f9f7/lucene/core/src/java/org/apache/lucene/index/DocValuesWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/DocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/DocValuesWriter.java
index 526a779..9dde817 100644
--- a/lucene/core/src/java/org/apache/lucene/index/DocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/DocValuesWriter.java
@@ -20,8 +20,10 @@ package org.apache.lucene.index;
import java.io.IOException;
import org.apache.lucene.codecs.DocValuesConsumer;
+import org.apache.lucene.search.SortField;
abstract class DocValuesWriter {
abstract void finish(int numDoc);
- abstract void flush(SegmentWriteState state, DocValuesConsumer consumer) throws IOException;
+ abstract void flush(SegmentWriteState state, Sorter.DocMap sortMap, DocValuesConsumer consumer) throws IOException;
+ abstract Sorter.DocComparator getDocComparator(int numDoc, SortField sortField) throws IOException;
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7d96f9f7/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java
index e72145c..c5c8839 100644
--- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java
+++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java
@@ -33,6 +33,7 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FlushInfo;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.TrackingDirectoryWrapper;
+import org.apache.lucene.util.Bits;
import org.apache.lucene.util.ByteBlockPool.Allocator;
import org.apache.lucene.util.ByteBlockPool.DirectTrackingAllocator;
import org.apache.lucene.util.Counter;
@@ -177,7 +178,7 @@ class DocumentsWriterPerThread {
assert numDocsInRAM == 0 : "num docs " + numDocsInRAM;
deleteSlice = deleteQueue.newSlice();
- segmentInfo = new SegmentInfo(directoryOrig, Version.LATEST, segmentName, -1, false, codec, Collections.emptyMap(), StringHelper.randomId(), new HashMap<>(), null);
+ segmentInfo = new SegmentInfo(directoryOrig, Version.LATEST, segmentName, -1, false, codec, Collections.emptyMap(), StringHelper.randomId(), new HashMap<>(), indexWriterConfig.getIndexSort());
assert numDocsInRAM == 0;
if (INFO_VERBOSE && infoStream.isEnabled("DWPT")) {
infoStream.message("DWPT", Thread.currentThread().getName() + " init seg=" + segmentName + " delQueue=" + deleteQueue);
@@ -438,9 +439,9 @@ class DocumentsWriterPerThread {
if (infoStream.isEnabled("DWPT")) {
infoStream.message("DWPT", "flush postings as segment " + flushState.segmentInfo.name + " numDocs=" + numDocsInRAM);
}
-
+ final Sorter.DocMap sortMap;
try {
- consumer.flush(flushState);
+ sortMap = consumer.flush(flushState);
pendingUpdates.terms.clear();
segmentInfo.setFiles(new HashSet<>(directory.getCreatedFiles()));
@@ -477,7 +478,7 @@ class DocumentsWriterPerThread {
FlushedSegment fs = new FlushedSegment(segmentInfoPerCommit, flushState.fieldInfos,
segmentDeletes, flushState.liveDocs, flushState.delCountOnFlush);
- sealFlushedSegment(fs);
+ sealFlushedSegment(fs, sortMap);
if (infoStream.isEnabled("DWPT")) {
infoStream.message("DWPT", "flush time " + ((System.nanoTime() - t0)/1000000.0) + " msec");
}
@@ -494,11 +495,23 @@ class DocumentsWriterPerThread {
public Set<String> pendingFilesToDelete() {
return filesToDelete;
}
+
+ private MutableBits sortLiveDocs(Bits liveDocs, Sorter.DocMap sortMap) throws IOException {
+ assert liveDocs != null && sortMap != null;
+ MutableBits sortedLiveDocs = codec.liveDocsFormat().newLiveDocs(liveDocs.length());
+ for (int i = 0; i < liveDocs.length(); i++) {
+ if (liveDocs.get(i) == false) {
+ sortedLiveDocs.clear(sortMap.oldToNew(i));
+ }
+ }
+ return sortedLiveDocs;
+ }
+
/**
* Seals the {@link SegmentInfo} for the new flushed segment and persists
* the deleted documents {@link MutableBits}.
*/
- void sealFlushedSegment(FlushedSegment flushedSegment) throws IOException {
+ void sealFlushedSegment(FlushedSegment flushedSegment, Sorter.DocMap sortMap) throws IOException {
assert flushedSegment != null;
SegmentCommitInfo newSegment = flushedSegment.segmentInfo;
@@ -548,7 +561,13 @@ class DocumentsWriterPerThread {
SegmentCommitInfo info = flushedSegment.segmentInfo;
Codec codec = info.info.getCodec();
- codec.liveDocsFormat().writeLiveDocs(flushedSegment.liveDocs, directory, info, delCount, context);
+ final MutableBits bits;
+ if (sortMap == null) {
+ bits = flushedSegment.liveDocs;
+ } else {
+ bits = sortLiveDocs(flushedSegment.liveDocs, sortMap);
+ }
+ codec.liveDocsFormat().writeLiveDocs(bits, directory, info, delCount, context);
newSegment.setDelCount(delCount);
newSegment.advanceDelGen();
}
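The sortLiveDocs helper above only remaps deletions: it allocates a fully-live bitset and
clears the new position of every deleted old doc. A toy version of the same idea over plain
boolean arrays (nothing below is Lucene API, just a sketch under the same oldToNew mapping):

    final class LiveDocsSortDemo {
      // Toy sketch of sortLiveDocs: a doc deleted at old ID i must be marked
      // deleted at its new, sorted position oldToNew[i].
      static boolean[] sortLiveDocs(boolean[] liveDocs, int[] oldToNew) {
        boolean[] sorted = new boolean[liveDocs.length];
        java.util.Arrays.fill(sorted, true); // start from "all docs live"
        for (int i = 0; i < liveDocs.length; i++) {
          if (liveDocs[i] == false) {
            sorted[oldToNew[i]] = false; // carry the deletion to the new position
          }
        }
        return sorted;
      }

      public static void main(String[] args) {
        boolean[] live = {true, false, true}; // old doc 1 is deleted
        int[] oldToNew = {1, 2, 0};           // old doc 1 moves to new doc 2
        System.out.println(java.util.Arrays.toString(sortLiveDocs(live, oldToNew)));
        // prints [true, true, false]
      }
    }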
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7d96f9f7/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
index efa1799..1ca2830 100644
--- a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
@@ -79,8 +79,8 @@ final class FreqProxTermsWriter extends TermsHash {
}
@Override
- public void flush(Map<String,TermsHashPerField> fieldsToFlush, final SegmentWriteState state) throws IOException {
- super.flush(fieldsToFlush, state);
+ public void flush(Map<String,TermsHashPerField> fieldsToFlush, final SegmentWriteState state, Sorter.DocMap sortMap) throws IOException {
+ super.flush(fieldsToFlush, state, sortMap);
// Gather all fields that saw any postings:
List<FreqProxTermsWriterPerField> allFields = new ArrayList<>();
@@ -98,8 +98,10 @@ final class FreqProxTermsWriter extends TermsHash {
CollectionUtil.introSort(allFields);
Fields fields = new FreqProxFields(allFields);
-
applyDeletes(state, fields);
+ if (sortMap != null) {
+ fields = new SortingLeafReader.SortingFields(fields, state.fieldInfos, sortMap);
+ }
FieldsConsumer consumer = state.segmentInfo.getCodec().postingsFormat().fieldsConsumer(state);
boolean success = false;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7d96f9f7/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
index d045e79..2159d60 100644
--- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
@@ -1032,15 +1032,19 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
}
}
- /** Confirms that the incoming index sort (if any) matches the existing index sort (if any). This is unfortunately just best effort,
- * because it could be the old index only has flushed segments. */
- private void validateIndexSort() {
+ /** Confirms that the incoming index sort (if any) matches the existing index sort (if any).
+ * This is unfortunately just best effort, because the old index could contain only unsorted flushed segments built
+ * before {@link Version#LUCENE_6_5_0} (segments are sorted at flush as of Lucene 6.5.0). */
+ private void validateIndexSort() throws CorruptIndexException {
Sort indexSort = config.getIndexSort();
if (indexSort != null) {
for(SegmentCommitInfo info : segmentInfos) {
Sort segmentIndexSort = info.info.getIndexSort();
if (segmentIndexSort != null && indexSort.equals(segmentIndexSort) == false) {
throw new IllegalArgumentException("cannot change previous indexSort=" + segmentIndexSort + " (from segment=" + info + ") to new indexSort=" + indexSort);
+ } else if (segmentIndexSort == null && info.info.getVersion().onOrAfter(Version.LUCENE_6_5_0)) {
+ // Flushed segments built before 6.5.0 were not sorted on flush, so only newer segments must carry the index sort
+ throw new CorruptIndexException("segment not sorted with indexSort=" + indexSort, info.info.toString());
}
}
}
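For reference, index sorting is enabled through IndexWriterConfig; with this change a
mismatching or missing sort is rejected up front instead of surfacing at merge time. A minimal
usage sketch (the field name "timestamp" is made up for illustration):

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.search.Sort;
    import org.apache.lucene.search.SortField;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.RAMDirectory;

    public class IndexSortDemo {
      public static void main(String[] args) throws Exception {
        Directory dir = new RAMDirectory();
        IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
        // With this patch, segments are sorted by this field at flush time,
        // not only at merge time. "timestamp" must be a numeric doc values field.
        iwc.setIndexSort(new Sort(new SortField("timestamp", SortField.Type.LONG)));
        try (IndexWriter writer = new IndexWriter(dir, iwc)) {
          // ... index documents carrying a NumericDocValuesField("timestamp", ...) ...
        }
      }
    }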
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7d96f9f7/lucene/core/src/java/org/apache/lucene/index/MergeState.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/MergeState.java b/lucene/core/src/java/org/apache/lucene/index/MergeState.java
index fdedf3e..a7c8307 100644
--- a/lucene/core/src/java/org/apache/lucene/index/MergeState.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MergeState.java
@@ -223,7 +223,10 @@ public class MergeState {
return originalReaders;
}
- // If an incoming reader is not sorted, because it was flushed by IW, we sort it here:
+ /** If an incoming reader is not sorted, because it was flushed by an IndexWriter older than {@link Version#LUCENE_6_5_0}
+ * or because unsorted segments were added from another index via {@link IndexWriter#addIndexes(CodecReader...)},
+ * we sort it here:
+ */
final Sorter sorter = new Sorter(indexSort);
List<CodecReader> readers = new ArrayList<>(originalReaders.size());
@@ -231,9 +234,6 @@ public class MergeState {
Sort segmentSort = leaf.getIndexSort();
if (segmentSort == null) {
- // TODO: fix IW to also sort when flushing? It's somewhat tricky because of stored fields and term vectors, which write "live"
- // to their index files on each indexed document:
-
// This segment was written by flush, so documents are not yet sorted, so we sort them now:
long t0 = System.nanoTime();
Sorter.DocMap sortDocMap = sorter.sort(leaf);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7d96f9f7/lucene/core/src/java/org/apache/lucene/index/NormValuesWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/NormValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/NormValuesWriter.java
index a9797a1..01ca877 100644
--- a/lucene/core/src/java/org/apache/lucene/index/NormValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/NormValuesWriter.java
@@ -64,7 +64,7 @@ class NormValuesWriter {
public void finish(int maxDoc) {
}
- public void flush(SegmentWriteState state, NormsConsumer normsConsumer) throws IOException {
+ public void flush(SegmentWriteState state, Sorter.DocMap sortMap, NormsConsumer normsConsumer) throws IOException {
final int maxDoc = state.segmentInfo.maxDoc();
final PackedLongValues values = pending.build();
@@ -73,7 +73,11 @@ class NormValuesWriter {
new Iterable<Number>() {
@Override
public Iterator<Number> iterator() {
- return new NumericIterator(maxDoc, values);
+ if (sortMap == null) {
+ return new NumericIterator(maxDoc, values);
+ } else {
+ return new SortingNumericIterator(maxDoc, values, sortMap);
+ }
}
});
}
@@ -84,13 +88,13 @@ class NormValuesWriter {
final int size;
final int maxDoc;
int upto;
-
+
NumericIterator(int maxDoc, PackedLongValues values) {
this.maxDoc = maxDoc;
this.iter = values.iterator();
this.size = (int) values.size();
}
-
+
@Override
public boolean hasNext() {
return upto < maxDoc;
@@ -116,5 +120,47 @@ class NormValuesWriter {
throw new UnsupportedOperationException();
}
}
+
+ // sort the values we have in ram according to the provided sort map
+ private static class SortingNumericIterator implements Iterator<Number> {
+ final PackedLongValues values;
+ final Sorter.DocMap sortMap;
+ final int size;
+ final int maxDoc;
+ int upto;
+
+ SortingNumericIterator(int maxDoc, PackedLongValues values, Sorter.DocMap sortMap) {
+ this.maxDoc = maxDoc;
+ this.values = values;
+ this.size = (int) values.size();
+ this.sortMap = sortMap;
+ }
+
+ @Override
+ public boolean hasNext() {
+ return upto < maxDoc;
+ }
+
+ @Override
+ public Number next() {
+ if (!hasNext()) {
+ throw new NoSuchElementException();
+ }
+ Long value;
+ if (upto < size) {
+ int oldUpto = sortMap.newToOld(upto);
+ value = values.get(oldUpto);
+ } else {
+ value = MISSING;
+ }
+ upto++;
+ return value;
+ }
+
+ @Override
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ }
}
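The new SortingNumericIterator walks documents in their new (sorted) order and pulls each
value from its old slot via newToOld, which is why it keeps the PackedLongValues for random
access instead of using the sequential iterator. The same idea over a plain long[], as an
illustration only:

    import java.util.Arrays;

    final class SortingValuesDemo {
      // Read buffered per-doc values in sorted (new) doc order: the value
      // for new doc d lives at old slot newToOld[d].
      static long[] inSortedOrder(long[] valuesByOldDoc, int[] newToOld) {
        long[] out = new long[valuesByOldDoc.length];
        for (int newDoc = 0; newDoc < out.length; newDoc++) {
          out[newDoc] = valuesByOldDoc[newToOld[newDoc]];
        }
        return out;
      }

      public static void main(String[] args) {
        long[] norms = {7, 3, 9};   // buffered in arrival (old) order
        int[] newToOld = {2, 0, 1}; // new doc 0 was old doc 2, etc.
        System.out.println(Arrays.toString(inSortedOrder(norms, newToOld)));
        // prints [9, 7, 3]
      }
    }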
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7d96f9f7/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java
index 917af66..a307afd 100644
--- a/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java
@@ -20,8 +20,12 @@ package org.apache.lucene.index;
import java.io.IOException;
import java.util.Iterator;
import java.util.NoSuchElementException;
+import java.util.function.IntPredicate;
+import java.util.function.IntToDoubleFunction;
+import java.util.function.IntToLongFunction;
import org.apache.lucene.codecs.DocValuesConsumer;
+import org.apache.lucene.search.SortField;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.RamUsageEstimator;
@@ -40,6 +44,8 @@ class NumericDocValuesWriter extends DocValuesWriter {
private FixedBitSet docsWithField;
private final FieldInfo fieldInfo;
+ PackedLongValues finalValues;
+
public NumericDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
pending = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
docsWithField = new FixedBitSet(64);
@@ -79,23 +85,99 @@ class NumericDocValuesWriter extends DocValuesWriter {
@Override
public void finish(int maxDoc) {
+ finalValues = pending.build();
}
- @Override
- public void flush(SegmentWriteState state, DocValuesConsumer dvConsumer) throws IOException {
+ @Override
+ public void flush(SegmentWriteState state, Sorter.DocMap sortMap, DocValuesConsumer dvConsumer) throws IOException {
final int maxDoc = state.segmentInfo.maxDoc();
- final PackedLongValues values = pending.build();
dvConsumer.addNumericField(fieldInfo,
new Iterable<Number>() {
@Override
public Iterator<Number> iterator() {
- return new NumericIterator(maxDoc, values, docsWithField);
+ if (sortMap == null) {
+ return new NumericIterator(maxDoc, finalValues, docsWithField);
+ } else {
+ return new SortingNumericIterator(maxDoc, finalValues, docsWithField, sortMap);
+ }
}
});
}
+ @Override
+ Sorter.DocComparator getDocComparator(int numDoc, SortField sortField) throws IOException {
+ return getDocComparator(sortField, sortField.getType(), (docID) -> docsWithField.get(docID), (docID) -> finalValues.get(docID));
+ }
+
+ static Sorter.DocComparator getDocComparator(SortField sortField, SortField.Type sortType, IntPredicate docsWithField, IntToLongFunction docValueFunction) {
+ final int reverseMul = sortField.getReverse() ? -1 : 1;
+ switch (sortType) {
+ case LONG: {
+ final long missingValue = sortField.getMissingValue() != null ? (Long) sortField.getMissingValue() : 0;
+ IntToLongFunction docValueOrMissing =
+ (docID) -> docsWithField.test(docID) ? docValueFunction.applyAsLong(docID) : missingValue;
+ return new Sorter.DocComparator() {
+ @Override
+ public int compare(int docID1, int docID2) {
+ final long value1 = docValueOrMissing.applyAsLong(docID1);
+ final long value2 = docValueOrMissing.applyAsLong(docID2);
+ return reverseMul * Long.compare(value1, value2);
+ }
+ };
+ }
+
+ case INT: {
+ final int missingValue = sortField.getMissingValue() != null ? (Integer) sortField.getMissingValue() : 0;
+ IntToLongFunction docValueOrMissing =
+ (docID) -> docsWithField.test(docID) ? docValueFunction.applyAsLong(docID) : missingValue;
+
+ return new Sorter.DocComparator() {
+ @Override
+ public int compare(int docID1, int docID2) {
+ final int value1 = (int) docValueOrMissing.applyAsLong(docID1);
+ final int value2 = (int) docValueOrMissing.applyAsLong(docID2);
+ return reverseMul * Integer.compare(value1, value2);
+ }
+ };
+ }
+
+ case DOUBLE: {
+ final double missingValue = sortField.getMissingValue() != null ? (Double) sortField.getMissingValue() : 0;
+ IntToDoubleFunction docValueOrMissing =
+ (docID) -> docsWithField.test(docID) ? Double.longBitsToDouble(docValueFunction.applyAsLong(docID)) : missingValue;
+
+ return new Sorter.DocComparator() {
+ @Override
+ public int compare(int docID1, int docID2) {
+ final double value1 = docValueOrMissing.applyAsDouble(docID1);
+ final double value2 = docValueOrMissing.applyAsDouble(docID2);
+ return reverseMul * Double.compare(value1, value2);
+ }
+ };
+ }
+
+ case FLOAT: {
+ final float missingValue = sortField.getMissingValue() != null ? (Float) sortField.getMissingValue() : 0;
+ IntToDoubleFunction docValueOrMissing =
+ (docID) -> docsWithField.test(docID) ? Float.intBitsToFloat((int) docValueFunction.applyAsLong(docID)) : missingValue;
+
+ return new Sorter.DocComparator() {
+ @Override
+ public int compare(int docID1, int docID2) {
+ final float value1 = (float) docValueOrMissing.applyAsDouble(docID1);
+ final float value2 = (float) docValueOrMissing.applyAsDouble(docID2);
+ return reverseMul * Float.compare(value1, value2);
+ }
+ };
+ }
+
+ default:
+ throw new IllegalArgumentException("unhandled SortField.getType()=" + sortField.getType());
+ }
+ }
+
// iterates over the values we have in ram
private static class NumericIterator implements Iterator<Number> {
final PackedLongValues.Iterator iter;
@@ -103,14 +185,14 @@ class NumericDocValuesWriter extends DocValuesWriter {
final int size;
final int maxDoc;
int upto;
-
+
NumericIterator(int maxDoc, PackedLongValues values, FixedBitSet docsWithFields) {
this.maxDoc = maxDoc;
this.iter = values.iterator();
this.size = (int) values.size();
this.docsWithField = docsWithFields;
}
-
+
@Override
public boolean hasNext() {
return upto < maxDoc;
@@ -141,4 +223,52 @@ class NumericDocValuesWriter extends DocValuesWriter {
throw new UnsupportedOperationException();
}
}
+
+ // sort the values we have in ram according to the provided sort map
+ private static class SortingNumericIterator implements Iterator<Number> {
+ final PackedLongValues values;
+ final FixedBitSet docsWithField;
+ final Sorter.DocMap sortMap;
+ final int size;
+ final int maxDoc;
+ int upto;
+
+ SortingNumericIterator(int maxDoc, PackedLongValues values, FixedBitSet docsWithFields, Sorter.DocMap sortMap) {
+ this.maxDoc = maxDoc;
+ this.values = values;
+ this.size = (int) values.size();
+ this.docsWithField = docsWithFields;
+ this.sortMap = sortMap;
+ }
+
+ @Override
+ public boolean hasNext() {
+ return upto < maxDoc;
+ }
+
+ @Override
+ public Number next() {
+ if (!hasNext()) {
+ throw new NoSuchElementException();
+ }
+ Long value;
+ if (upto < size) {
+ int old = sortMap.newToOld(upto);
+ if (docsWithField.get(old)) {
+ value = values.get(old);
+ } else {
+ value = null;
+ }
+ } else {
+ value = null;
+ }
+ upto++;
+ return value;
+ }
+
+ @Override
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ }
}
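getDocComparator above builds a comparator over old doc IDs, substituting the sort field's
missing value for docs without the field and applying reverseMul for descending sorts. A
stripped-down sketch of the LONG case (the DocComparator interface below stands in for
Lucene's package-private Sorter.DocComparator):

    import java.util.function.IntPredicate;
    import java.util.function.IntToLongFunction;

    final class LongComparatorDemo {
      interface DocComparator { int compare(int docID1, int docID2); }

      // Sketch of the LONG branch of getDocComparator: docs without the field
      // get missingValue, and reverseMul flips the order for descending sorts.
      static DocComparator longComparator(IntPredicate docsWithField,
                                          IntToLongFunction values,
                                          long missingValue,
                                          boolean reverse) {
        final int reverseMul = reverse ? -1 : 1;
        final IntToLongFunction valueOrMissing =
            docID -> docsWithField.test(docID) ? values.applyAsLong(docID) : missingValue;
        return (d1, d2) ->
            reverseMul * Long.compare(valueOrMissing.applyAsLong(d1),
                                      valueOrMissing.applyAsLong(d2));
      }

      public static void main(String[] args) {
        long[] vals = {10, 0, 5};
        boolean[] has = {true, false, true}; // doc 1 has no value
        DocComparator cmp = longComparator(d -> has[d], d -> vals[d], 0L, false);
        System.out.println(cmp.compare(0, 2)); // positive: doc 0 (10) sorts after doc 2 (5)
      }
    }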
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7d96f9f7/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java
index daf1f33..cc14bd2 100644
--- a/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java
@@ -69,9 +69,8 @@ class PointValuesWriter {
numPoints++;
}
- public void flush(SegmentWriteState state, PointsWriter writer) throws IOException {
- PointsReader reader = new MutablePointsReader() {
-
+ public void flush(SegmentWriteState state, Sorter.DocMap sortMap, PointsWriter writer) throws IOException {
+ MutablePointsReader points = new MutablePointsReader() {
final int[] ords = new int[numPoints];
{
for (int i = 0; i < numPoints; ++i) {
@@ -170,6 +169,109 @@ class PointValuesWriter {
}
};
- writer.writeField(fieldInfo, reader);
+ final MutablePointsReader values;
+ if (sortMap == null) {
+ values = points;
+ } else {
+ values = new MutableSortingPointReader(points, sortMap);
+ }
+
+ writer.writeField(fieldInfo, values);
+ }
+
+ static final class MutableSortingPointReader extends MutablePointsReader {
+
+ private final MutablePointsReader in;
+ private final Sorter.DocMap docMap;
+
+ public MutableSortingPointReader(final MutablePointsReader in, Sorter.DocMap docMap) {
+ this.in = in;
+ this.docMap = docMap;
+ }
+
+ @Override
+ public void intersect(String field, PointValues.IntersectVisitor visitor) throws IOException {
+ in.intersect(field, new PointValues.IntersectVisitor() {
+ @Override
+ public void visit(int docID) throws IOException {
+ visitor.visit(docMap.oldToNew(docID));
+ }
+
+ @Override
+ public void visit(int docID, byte[] packedValue) throws IOException {
+ visitor.visit(docMap.oldToNew(docID), packedValue);
+ }
+
+ @Override
+ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+ return visitor.compare(minPackedValue, maxPackedValue);
+ }
+ });
+ }
+
+ @Override
+ public byte[] getMinPackedValue(String field) throws IOException {
+ return in.getMinPackedValue(field);
+ }
+
+ @Override
+ public byte[] getMaxPackedValue(String field) throws IOException {
+ return in.getMaxPackedValue(field);
+ }
+
+ @Override
+ public int getNumDimensions(String field) throws IOException {
+ return in.getNumDimensions(field);
+ }
+
+ @Override
+ public int getBytesPerDimension(String field) throws IOException {
+ return in.getBytesPerDimension(field);
+ }
+
+ @Override
+ public long size(String field) {
+ return in.size(field);
+ }
+
+ @Override
+ public int getDocCount(String field) {
+ return in.getDocCount(field);
+ }
+
+ @Override
+ public void getValue(int i, BytesRef packedValue) {
+ in.getValue(i, packedValue);
+ }
+
+ @Override
+ public byte getByteAt(int i, int k) {
+ return in.getByteAt(i, k);
+ }
+
+ @Override
+ public int getDocID(int i) {
+ return docMap.oldToNew(in.getDocID(i));
+ }
+
+ @Override
+ public void swap(int i, int j) {
+ in.swap(i, j);
+ }
+
+ @Override
+ public void checkIntegrity() throws IOException {
+ in.checkIntegrity();
+ }
+
+ @Override
+ public void close() throws IOException {
+ in.close();
+ }
+
+ @Override
+ public long ramBytesUsed() {
+ return in.ramBytesUsed();
+ }
}
}
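MutableSortingPointReader leaves the point values themselves untouched and only rewrites
document IDs: getDocID and both intersect visit callbacks go through oldToNew, so the points
writer sees sorted doc IDs. The wrapping pattern in isolation (no Lucene types, a sketch only):

    final class DocIdRemapDemo {
      interface Visitor { void visit(int docID, byte[] packedValue); }

      // Wrap a visitor so every doc ID it sees is remapped old -> new, the same
      // trick MutableSortingPointReader.intersect plays on IntersectVisitor.
      static Visitor remap(Visitor delegate, int[] oldToNew) {
        return (docID, packedValue) -> delegate.visit(oldToNew[docID], packedValue);
      }

      public static void main(String[] args) {
        int[] oldToNew = {1, 2, 0};
        Visitor printer = (doc, value) -> System.out.println("doc=" + doc);
        Visitor sortedView = remap(printer, oldToNew);
        sortedView.visit(2, new byte[0]); // prints doc=0
      }
    }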
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7d96f9f7/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java
index 6517218..a66e6e5 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java
@@ -16,21 +16,22 @@
*/
package org.apache.lucene.index;
-import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
-
import java.io.IOException;
import java.util.Iterator;
import java.util.NoSuchElementException;
import org.apache.lucene.codecs.DocValuesConsumer;
+import org.apache.lucene.search.SortField;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
import org.apache.lucene.util.BytesRefHash;
+import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PackedLongValues;
+import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
+
/** Buffers up pending byte[] per doc, deref and sorting via
* int ord, then flushes when segment flushes. */
class SortedDocValuesWriter extends DocValuesWriter {
@@ -40,6 +41,10 @@ class SortedDocValuesWriter extends DocValuesWriter {
private long bytesUsed; // this currently only tracks differences in 'pending'
private final FieldInfo fieldInfo;
+ PackedLongValues finalOrds;
+ int[] finalSortedValues;
+ int[] finalOrdMap;
+
private static final int EMPTY_ORD = -1;
public SortedDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
@@ -80,6 +85,19 @@ class SortedDocValuesWriter extends DocValuesWriter {
pending.add(EMPTY_ORD);
}
updateBytesUsed();
+
+ assert pending.size() == maxDoc;
+ final int valueCount = hash.size();
+ if (finalOrds == null) {
+ finalOrds = pending.build();
+ finalSortedValues = hash.sort();
+ finalOrdMap = new int[valueCount];
+
+ for (int ord = 0; ord < valueCount; ord++) {
+ finalOrdMap[finalSortedValues[ord]] = ord;
+ }
+ }
+
}
private void addOneValue(BytesRef value) {
@@ -105,19 +123,9 @@ class SortedDocValuesWriter extends DocValuesWriter {
}
@Override
- public void flush(SegmentWriteState state, DocValuesConsumer dvConsumer) throws IOException {
+ public void flush(SegmentWriteState state, Sorter.DocMap sortMap, DocValuesConsumer dvConsumer) throws IOException {
final int maxDoc = state.segmentInfo.maxDoc();
-
- assert pending.size() == maxDoc;
final int valueCount = hash.size();
- final PackedLongValues ords = pending.build();
-
- final int[] sortedValues = hash.sort();
- final int[] ordMap = new int[valueCount];
-
- for(int ord=0;ord<valueCount;ord++) {
- ordMap[sortedValues[ord]] = ord;
- }
dvConsumer.addSortedField(fieldInfo,
@@ -125,7 +133,7 @@ class SortedDocValuesWriter extends DocValuesWriter {
new Iterable<BytesRef>() {
@Override
public Iterator<BytesRef> iterator() {
- return new ValuesIterator(sortedValues, valueCount, hash);
+ return new ValuesIterator(finalSortedValues, valueCount, hash);
}
},
@@ -133,11 +141,46 @@ class SortedDocValuesWriter extends DocValuesWriter {
new Iterable<Number>() {
@Override
public Iterator<Number> iterator() {
- return new OrdsIterator(ordMap, maxDoc, ords);
+ if (sortMap == null) {
+ return new OrdsIterator(finalOrdMap, maxDoc, finalOrds);
+ } else {
+ return new SortingOrdsIterator(finalOrdMap, maxDoc, finalOrds, sortMap);
+ }
}
});
}
+ @Override
+ Sorter.DocComparator getDocComparator(int numDoc, SortField sortField) throws IOException {
+ assert sortField.getType() == SortField.Type.STRING;
+ final int missingOrd;
+ if (sortField.getMissingValue() == SortField.STRING_LAST) {
+ missingOrd = Integer.MAX_VALUE;
+ } else {
+ missingOrd = Integer.MIN_VALUE;
+ }
+ final int reverseMul = sortField.getReverse() ? -1 : 1;
+ return new Sorter.DocComparator() {
+ @Override
+ public int compare(int docID1, int docID2) {
+ int ord1 = (int) finalOrds.get(docID1);
+ if (ord1 == -1) {
+ ord1 = missingOrd;
+ } else {
+ ord1 = finalOrdMap[ord1];
+ }
+
+ int ord2 = (int) finalOrds.get(docID2);
+ if (ord2 == -1) {
+ ord2 = missingOrd;
+ } else {
+ ord2 = finalOrdMap[ord2];
+ }
+ return reverseMul * Integer.compare(ord1, ord2);
+ }
+ };
+ }
+
// iterates over the unique values we have in ram
private static class ValuesIterator implements Iterator<BytesRef> {
final int sortedValues[];
@@ -172,21 +215,21 @@ class SortedDocValuesWriter extends DocValuesWriter {
throw new UnsupportedOperationException();
}
}
-
+
// iterates over the ords for each doc we have in ram
private static class OrdsIterator implements Iterator<Number> {
final PackedLongValues.Iterator iter;
final int ordMap[];
final int maxDoc;
int docUpto;
-
+
OrdsIterator(int ordMap[], int maxDoc, PackedLongValues ords) {
this.ordMap = ordMap;
this.maxDoc = maxDoc;
assert ords.size() == maxDoc;
this.iter = ords.iterator();
}
-
+
@Override
public boolean hasNext() {
return docUpto < maxDoc;
@@ -207,4 +250,42 @@ class SortedDocValuesWriter extends DocValuesWriter {
throw new UnsupportedOperationException();
}
}
+
+ // sort the ords we have in ram according to the provided sort map
+ private static class SortingOrdsIterator implements Iterator<Number> {
+ final PackedLongValues ords;
+ final int ordMap[];
+ final Sorter.DocMap sortMap;
+ final int maxDoc;
+ int docUpto;
+
+ SortingOrdsIterator(int ordMap[], int maxDoc, PackedLongValues ords, Sorter.DocMap sortMap) {
+ this.ordMap = ordMap;
+ this.maxDoc = maxDoc;
+ assert ords.size() == maxDoc;
+ this.ords = ords;
+ this.sortMap = sortMap;
+ }
+
+ @Override
+ public boolean hasNext() {
+ return docUpto < maxDoc;
+ }
+
+ @Override
+ public Number next() {
+ if (!hasNext()) {
+ throw new NoSuchElementException();
+ }
+ int oldUpto = sortMap.newToOld(docUpto);
+ int ord = (int) ords.get(oldUpto);
+ docUpto++;
+ return ord == -1 ? ord : ordMap[ord];
+ }
+
+ @Override
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ }
}
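The finalSortedValues/finalOrdMap pair built in finish() is the usual two-way mapping for
sorted doc values: hash.sort() yields hash ords arranged in BytesRef order, and ordMap inverts
that so a buffered hash ord can be translated to its final sorted ord. Worked in miniature,
with plain strings standing in for a BytesRefHash:

    import java.util.Arrays;
    import java.util.Comparator;

    final class OrdMapDemo {
      public static void main(String[] args) {
        // Values in hash-insertion order; hash ord == array index.
        String[] byHashOrd = {"banana", "apple", "cherry"};

        // hash.sort() analogue: hash ords arranged in value order.
        Integer[] sortedValues = {0, 1, 2};
        Arrays.sort(sortedValues, Comparator.comparing((Integer o) -> byHashOrd[o]));
        // sortedValues == [1, 0, 2]  (apple, banana, cherry)

        // ordMap inverts it: hash ord -> final sorted ord.
        int[] ordMap = new int[sortedValues.length];
        for (int ord = 0; ord < sortedValues.length; ord++) {
          ordMap[sortedValues[ord]] = ord;
        }
        System.out.println(Arrays.toString(ordMap)); // [1, 0, 2]
        System.out.println(ordMap[0]); // "banana" (hash ord 0) is sorted ord 1
      }
    }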
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7d96f9f7/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java
index 5ce9a91..d22ac04 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java
@@ -21,8 +21,11 @@ import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
import java.util.NoSuchElementException;
+import java.util.function.IntToLongFunction;
import org.apache.lucene.codecs.DocValuesConsumer;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.search.SortedNumericSortField;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.RamUsageEstimator;
@@ -39,6 +42,11 @@ class SortedNumericDocValuesWriter extends DocValuesWriter {
private int currentDoc;
private long currentValues[] = new long[8];
private int currentUpto = 0;
+ private int maxCount = 0;
+
+ PackedLongValues finalValues;
+ PackedLongValues finalValueCounts;
+ int[] valueStartPtrs;
public SortedNumericDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
this.fieldInfo = fieldInfo;
@@ -72,6 +80,7 @@ class SortedNumericDocValuesWriter extends DocValuesWriter {
}
// record the number of values for this doc
pendingCounts.add(currentUpto);
+ maxCount = Math.max(maxCount, currentUpto);
currentUpto = 0;
currentDoc++;
}
@@ -84,6 +93,40 @@ class SortedNumericDocValuesWriter extends DocValuesWriter {
for (int i = currentDoc; i < maxDoc; i++) {
pendingCounts.add(0); // no values
}
+
+ assert pendingCounts.size() == maxDoc;
+ finalValues = pending.build();
+ finalValueCounts = pendingCounts.build();
+ }
+
+ @Override
+ Sorter.DocComparator getDocComparator(int numDoc, SortField sortField) throws IOException {
+ SortedNumericSortField sf = (SortedNumericSortField) sortField;
+ valueStartPtrs = new int[numDoc];
+ int ptr = 0;
+ int doc = 0;
+ PackedLongValues.Iterator it = finalValueCounts.iterator();
+ while (it.hasNext()) {
+ valueStartPtrs[doc++] = ptr;
+ ptr += it.next();
+ }
+
+ final IntToLongFunction function = (docID) -> {
+ int count = (int) finalValueCounts.get(docID);
+ assert count > 0;
+ int start = valueStartPtrs[docID];
+ switch (sf.getSelector()) {
+ case MIN:
+ return finalValues.get(start);
+
+ case MAX:
+ return finalValues.get(start + count - 1);
+
+ default:
+ throw new IllegalStateException("unhandled SortedNumericSortField.getSelector()=" + sf.getSelector());
+ }
+ };
+ return NumericDocValuesWriter.getDocComparator(sf, sf.getNumericType(), (docID) -> finalValueCounts.get(docID) > 0, function);
}
private void addOneValue(long value) {
@@ -102,26 +145,41 @@ class SortedNumericDocValuesWriter extends DocValuesWriter {
}
@Override
- public void flush(SegmentWriteState state, DocValuesConsumer dvConsumer) throws IOException {
+ public void flush(SegmentWriteState state, Sorter.DocMap sortMap, DocValuesConsumer dvConsumer) throws IOException {
final int maxDoc = state.segmentInfo.maxDoc();
- assert pendingCounts.size() == maxDoc;
- final PackedLongValues values = pending.build();
- final PackedLongValues valueCounts = pendingCounts.build();
+
+ if (sortMap != null) {
+ valueStartPtrs = new int[maxDoc];
+ int ptr = 0;
+ int doc = 0;
+ PackedLongValues.Iterator it = finalValueCounts.iterator();
+ while (it.hasNext()) {
+ valueStartPtrs[doc++] = ptr;
+ ptr += it.next();
+ }
+ }
dvConsumer.addSortedNumericField(fieldInfo,
// doc -> valueCount
new Iterable<Number>() {
@Override
public Iterator<Number> iterator() {
- return new CountIterator(valueCounts);
+ if (sortMap == null) {
+ return new CountIterator(finalValueCounts);
+ } else {
+ return new SortingCountIterator(finalValueCounts, sortMap);
+ }
}
},
-
// values
new Iterable<Number>() {
@Override
public Iterator<Number> iterator() {
- return new ValuesIterator(values);
+ if (sortMap == null) {
+ return new ValuesIterator(finalValues);
+ } else {
+ return new SortingValuesIterator(finalValues, finalValueCounts, maxCount, sortMap, valueStartPtrs);
+ }
}
});
}
@@ -152,7 +210,64 @@ class SortedNumericDocValuesWriter extends DocValuesWriter {
throw new UnsupportedOperationException();
}
}
-
+
+ // sort the values we have in ram according to the provided sort map
+ private static class SortingValuesIterator implements Iterator<Number> {
+ final PackedLongValues values;
+ final PackedLongValues counts;
+ final Sorter.DocMap sortMap;
+ final int[] startPtrs;
+
+ final long numValues;
+ long valueUpto;
+ final long currentDoc[];
+ int currentDocSize;
+ int docUpto;
+ int currentLength;
+
+ private SortingValuesIterator(PackedLongValues values, PackedLongValues counts, int maxCount, Sorter.DocMap sortMap,
+ int[] startPtrs) {
+ this.values = values;
+ this.numValues = values.size();
+ this.counts = counts;
+ this.sortMap = sortMap;
+ this.startPtrs = startPtrs;
+ this.currentDoc = new long[maxCount];
+ }
+
+ @Override
+ public boolean hasNext() {
+ return valueUpto < numValues;
+ }
+
+ @Override
+ public Number next() {
+ if (!hasNext()) {
+ throw new NoSuchElementException();
+ }
+ while (currentDocSize == currentLength) {
+ currentDocSize = 0;
+ int oldUpto = sortMap.newToOld(docUpto);
+ currentLength = (int) counts.get(oldUpto);
+ int start = startPtrs[oldUpto];
+ for (int i = 0; i < currentLength; i++) {
+ currentDoc[i] = values.get(start+i);
+ }
+ docUpto++;
+ }
+ long value = currentDoc[currentDocSize];
+ currentDocSize++;
+ valueUpto++;
+ // TODO: make reusable Number
+ return value;
+ }
+
+ @Override
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ }
+
private static class CountIterator implements Iterator<Number> {
final PackedLongValues.Iterator iter;
@@ -178,4 +293,36 @@ class SortedNumericDocValuesWriter extends DocValuesWriter {
throw new UnsupportedOperationException();
}
}
+
+ private static class SortingCountIterator implements Iterator<Number> {
+ final PackedLongValues counts;
+ final Sorter.DocMap sortMap;
+ final int size;
+ int currentUpto;
+
+ SortingCountIterator(PackedLongValues valueCounts, Sorter.DocMap sortMap) {
+ this.counts = valueCounts;
+ this.sortMap = sortMap;
+ this.size = (int) valueCounts.size();
+ }
+
+ @Override
+ public boolean hasNext() {
+ return currentUpto < size;
+ }
+
+ @Override
+ public Number next() {
+ if (!hasNext()) {
+ throw new NoSuchElementException();
+ }
+ int oldUpto = sortMap.newToOld(currentUpto++);
+ return counts.get(oldUpto);
+ }
+
+ @Override
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ }
}
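Because a multi-valued field stores all values in one flat PackedLongValues, the comparator
needs random access per document; valueStartPtrs is just a prefix sum over the per-doc counts.
A small sketch of that layout and the MIN/MAX selection, assuming values within each doc are
ascending (as finishCurrentDoc guarantees by sorting them before appending):

    import java.util.Arrays;

    final class MultiValueSelectDemo {
      public static void main(String[] args) {
        // Flat storage: per-doc values concatenated, ascending within each doc.
        long[] values = {3, 7, 5, 2, 9};
        int[] counts = {2, 1, 2}; // doc0 -> {3,7}, doc1 -> {5}, doc2 -> {2,9}

        // Prefix sum = valueStartPtrs: where each doc's run begins.
        int[] starts = new int[counts.length];
        for (int doc = 1; doc < counts.length; doc++) {
          starts[doc] = starts[doc - 1] + counts[doc - 1];
        }
        System.out.println(Arrays.toString(starts)); // [0, 2, 3]

        int doc = 2;
        long min = values[starts[doc]];                   // MIN selector -> 2
        long max = values[starts[doc] + counts[doc] - 1]; // MAX selector -> 9
        System.out.println(min + " " + max);
      }
    }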
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7d96f9f7/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java
index 3f3beb3..6be3db2 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java
@@ -16,23 +16,25 @@
*/
package org.apache.lucene.index;
-import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
-
import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
import java.util.NoSuchElementException;
import org.apache.lucene.codecs.DocValuesConsumer;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.search.SortedSetSortField;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
import org.apache.lucene.util.BytesRefHash;
+import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PackedLongValues;
+import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
+
/** Buffers up pending byte[]s per doc, deref and sorting via
* int ord, then flushes when segment flushes. */
class SortedSetDocValuesWriter extends DocValuesWriter {
@@ -47,6 +49,12 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
private int currentUpto = 0;
private int maxCount = 0;
+ PackedLongValues finalOrds;
+ PackedLongValues finalOrdCounts;
+ int[] finalSortedValues;
+ int[] finalOrdMap;
+ int[] valueStartPtrs;
+
public SortedSetDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
this.fieldInfo = fieldInfo;
this.iwBytesUsed = iwBytesUsed;
@@ -112,6 +120,17 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
for (int i = currentDoc; i < maxDoc; i++) {
pendingCounts.add(0); // no values
}
+
+ assert pendingCounts.size() == maxDoc;
+ final int valueCount = hash.size();
+ finalOrds = pending.build();
+ finalOrdCounts = pendingCounts.build();
+
+ finalSortedValues = hash.sort();
+ finalOrdMap = new int[valueCount];
+ for (int ord = 0; ord < valueCount; ord++) {
+ finalOrdMap[finalSortedValues[ord]] = ord;
+ }
}
private void addOneValue(BytesRef value) {
@@ -144,19 +163,20 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
}
@Override
- public void flush(SegmentWriteState state, DocValuesConsumer dvConsumer) throws IOException {
+ public void flush(SegmentWriteState state, Sorter.DocMap sortMap, DocValuesConsumer dvConsumer) throws IOException {
final int maxDoc = state.segmentInfo.maxDoc();
final int maxCountPerDoc = maxCount;
- assert pendingCounts.size() == maxDoc;
- final int valueCount = hash.size();
- final PackedLongValues ords = pending.build();
- final PackedLongValues ordCounts = pendingCounts.build();
- final int[] sortedValues = hash.sort();
- final int[] ordMap = new int[valueCount];
-
- for(int ord=0;ord<valueCount;ord++) {
- ordMap[sortedValues[ord]] = ord;
+ final int valueCount = hash.size();
+ if (valueStartPtrs == null) {
+ valueStartPtrs = new int[maxDoc];
+ int ptr = 0;
+ int doc = 0;
+ PackedLongValues.Iterator it = finalOrdCounts.iterator();
+ while (it.hasNext()) {
+ valueStartPtrs[doc++] = ptr;
+ ptr += it.next();
+ }
}
dvConsumer.addSortedSetField(fieldInfo,
@@ -165,7 +185,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
new Iterable<BytesRef>() {
@Override
public Iterator<BytesRef> iterator() {
- return new ValuesIterator(sortedValues, valueCount, hash);
+ return new ValuesIterator(finalSortedValues, valueCount, hash);
}
},
@@ -173,7 +193,11 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
new Iterable<Number>() {
@Override
public Iterator<Number> iterator() {
- return new OrdCountIterator(maxDoc, ordCounts);
+ if (sortMap == null) {
+ return new OrdCountIterator(maxDoc, finalOrdCounts);
+ } else {
+ return new SortingOrdCountIterator(maxDoc, finalOrdCounts, sortMap);
+ }
}
},
@@ -181,11 +205,75 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
new Iterable<Number>() {
@Override
public Iterator<Number> iterator() {
- return new OrdsIterator(ordMap, maxCountPerDoc, ords, ordCounts);
+ if (sortMap == null) {
+ return new OrdsIterator(finalOrdMap, maxCountPerDoc, finalOrds, finalOrdCounts);
+ } else {
+ return new SortingOrdsIterator(finalOrdMap, maxCountPerDoc, finalOrds, finalOrdCounts, sortMap, valueStartPtrs);
+ }
}
});
}
+ @Override
+ Sorter.DocComparator getDocComparator(int numDoc, SortField sortField) throws IOException {
+ assert sortField instanceof SortedSetSortField;
+ SortedSetSortField sf = (SortedSetSortField) sortField;
+ valueStartPtrs = new int[numDoc];
+ int ptr = 0;
+ int doc = 0;
+ PackedLongValues.Iterator it = finalOrdCounts.iterator();
+ while (it.hasNext()) {
+ valueStartPtrs[doc++] = ptr;
+ ptr += it.next();
+ }
+
+ final int missingOrd;
+ if (sortField.getMissingValue() == SortField.STRING_LAST) {
+ missingOrd = Integer.MAX_VALUE;
+ } else {
+ missingOrd = Integer.MIN_VALUE;
+ }
+ // The ordinals within a document are not sorted, so we select the best ordinal per document
+ // (according to the sort field's selector) once up front and record the result in an array.
+ int[] bestOrds = new int[numDoc];
+ Arrays.fill(bestOrds, missingOrd);
+ for (int docID = 0; docID < numDoc; docID++) {
+ int count = (int) finalOrdCounts.get(docID);
+ if (count == 0) {
+ continue;
+ }
+ int start = valueStartPtrs[docID];
+ switch (sf.getSelector()) {
+ case MIN:
+ int min = Integer.MAX_VALUE;
+ for (int i = 0; i < count; i++) {
+ min = Math.min(finalOrdMap[(int) finalOrds.get(start + i)], min);
+ }
+ bestOrds[docID] = min;
+ break;
+
+ case MAX:
+ int max = 0;
+ for (int i = 0; i < count; i++) {
+ max = Math.max(finalOrdMap[(int) finalOrds.get(start + i)], max);
+ }
+ bestOrds[docID] = max;
+ break;
+
+ default:
+ throw new IllegalStateException("unhandled SortedSetSortField.getSelector()=" + sf.getSelector());
+ }
+ }
+
+ final int reverseMul = sortField.getReverse() ? -1 : 1;
+ return new Sorter.DocComparator() {
+ @Override
+ public int compare(int docID1, int docID2) {
+ return reverseMul * Integer.compare(bestOrds[docID1], bestOrds[docID2]);
+ }
+ };
+ }
+
// iterates over the unique values we have in ram
private static class ValuesIterator implements Iterator<BytesRef> {
final int sortedValues[];
@@ -224,9 +312,10 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
// iterates over the ords for each doc we have in ram
private static class OrdsIterator implements Iterator<Number> {
final PackedLongValues.Iterator iter;
- final PackedLongValues.Iterator counts;
+ final PackedLongValues.Iterator iterCounts;
final int ordMap[];
final long numOrds;
+
long ordUpto;
final int currentDoc[];
@@ -238,7 +327,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
this.ordMap = ordMap;
this.numOrds = ords.size();
this.iter = ords.iterator();
- this.counts = ordCounts.iterator();
+ this.iterCounts = ordCounts.iterator();
}
@Override
@@ -254,7 +343,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
while (currentUpto == currentLength) {
// refill next doc, and sort remapped ords within the doc.
currentUpto = 0;
- currentLength = (int) counts.next();
+ currentLength = (int) iterCounts.next();
for (int i = 0; i < currentLength; i++) {
currentDoc[i] = ordMap[(int) iter.next()];
}
@@ -272,8 +361,70 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
throw new UnsupportedOperationException();
}
}
+
+ // iterates over the ords for each doc we have in ram, in sorted (new) doc order
+ private static class SortingOrdsIterator implements Iterator<Number> {
+ final PackedLongValues ords;
+ final PackedLongValues ordCounts;
+ final int ordMap[];
+ final int starts[];
+ final long numOrds;
+ final Sorter.DocMap sortMap;
+
+ long ordUpto;
+
+ final int currentDoc[];
+ int docUpto;
+ int currentUpto;
+ int currentLength;
+
+ SortingOrdsIterator(int ordMap[], int maxCount, PackedLongValues ords, PackedLongValues ordCounts, Sorter.DocMap sortMap, int[] starts) {
+ this.currentDoc = new int[maxCount];
+ this.ordMap = ordMap;
+ this.numOrds = ords.size();
+ this.ords = ords;
+ this.ordCounts = ordCounts;
+ this.sortMap = sortMap;
+ this.starts = starts;
+ }
+
+ @Override
+ public boolean hasNext() {
+ return ordUpto < numOrds;
+ }
+
+ @Override
+ public Number next() {
+ if (!hasNext()) {
+ throw new NoSuchElementException();
+ }
+ while (currentUpto == currentLength) {
+ // refill next doc, and sort remapped ords within the doc.
+ currentUpto = 0;
+ int oldUpto = sortMap.newToOld(docUpto);
+ int start = starts[oldUpto];
+ currentLength = (int) ordCounts.get(oldUpto);
+ for (int i = 0; i < currentLength; i++) {
+ currentDoc[i] = ordMap[(int) ords.get(start+i)];
+ }
+ Arrays.sort(currentDoc, 0, currentLength);
+ docUpto++;
+ }
+ int ord = currentDoc[currentUpto];
+ currentUpto++;
+ ordUpto++;
+ // TODO: make reusable Number
+ return ord;
+ }
+
+ @Override
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ }
private static class OrdCountIterator implements Iterator<Number> {
+ final PackedLongValues counts;
final PackedLongValues.Iterator iter;
final int maxDoc;
int docUpto;
@@ -282,6 +433,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
this.maxDoc = maxDoc;
assert ordCounts.size() == maxDoc;
this.iter = ordCounts.iterator();
+ this.counts = ordCounts;
}
@Override
@@ -294,7 +446,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
if (!hasNext()) {
throw new NoSuchElementException();
}
- docUpto++;
+ docUpto ++;
// TODO: make reusable Number
return iter.next();
}
@@ -304,4 +456,39 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
throw new UnsupportedOperationException();
}
}
+
+ private static class SortingOrdCountIterator implements Iterator<Number> {
+ final PackedLongValues counts;
+ final Sorter.DocMap sortMap;
+ final int maxDoc;
+ int docUpto;
+
+ SortingOrdCountIterator(int maxDoc, PackedLongValues ordCounts, Sorter.DocMap sortMap) {
+ this.maxDoc = maxDoc;
+ assert ordCounts.size() == maxDoc;
+ this.counts = ordCounts;
+ this.sortMap = sortMap;
+ }
+
+ @Override
+ public boolean hasNext() {
+ return docUpto < maxDoc;
+ }
+
+ @Override
+ public Number next() {
+ if (!hasNext()) {
+ throw new NoSuchElementException();
+ }
+ int oldUpto = sortMap.newToOld(docUpto);
+ docUpto++;
+ // TODO: make reusable Number
+ return counts.get(oldUpto);
+ }
+
+ @Override
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ }
}
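The bestOrds precomputation above mirrors what SortedSetSelector does at search time: pick one
representative ordinal per document before comparing. For reference, the sort configuration
that exercises this code path (the field name "tags" is made up for illustration):

    import org.apache.lucene.search.Sort;
    import org.apache.lucene.search.SortedSetSelector;
    import org.apache.lucene.search.SortedSetSortField;

    public class SortedSetSortDemo {
      public static void main(String[] args) {
        // Index sort on a multi-valued string field: each document is keyed by
        // its smallest term; getDocComparator precomputes exactly this per doc.
        Sort sort = new Sort(
            new SortedSetSortField("tags", /*reverse=*/ false, SortedSetSelector.Type.MIN));
        System.out.println(sort);
      }
    }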
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7d96f9f7/lucene/core/src/java/org/apache/lucene/index/Sorter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/Sorter.java b/lucene/core/src/java/org/apache/lucene/index/Sorter.java
index 9ec472a..6472549 100644
--- a/lucene/core/src/java/org/apache/lucene/index/Sorter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/Sorter.java
@@ -139,7 +139,7 @@ final class Sorter {
}
/** Computes the old-to-new permutation over the given comparator. */
- private static Sorter.DocMap sort(final int maxDoc, DocComparator comparator) {
+ static Sorter.DocMap sort(final int maxDoc, DocComparator comparator) {
// check if the index is sorted
boolean sorted = true;
for (int i = 1; i < maxDoc; ++i) {
@@ -252,7 +252,7 @@ final class Sorter {
SortField fields[] = sort.getSort();
final int reverseMul[] = new int[fields.length];
final LeafFieldComparator comparators[] = new LeafFieldComparator[fields.length];
-
+
for (int i = 0; i < fields.length; i++) {
reverseMul[i] = fields[i].getReverse() ? -1 : 1;
comparators[i] = fields[i].getComparator(1, i).getLeafComparator(reader.getContext());
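Sorter.sort (now package-visible so flush can call it) first checks whether the comparator
already sees the docs in order and returns null in that case, which is how a flush avoids any
remapping work when documents happened to arrive pre-sorted. The fast path in isolation, as a
sketch only:

    final class SortedCheckDemo {
      interface DocComparator { int compare(int d1, int d2); }

      // Sketch of Sorter.sort's fast path: if adjacent docs are already in
      // order, no DocMap is needed and the caller skips remapping entirely.
      static boolean isSorted(int maxDoc, DocComparator cmp) {
        for (int i = 1; i < maxDoc; ++i) {
          if (cmp.compare(i - 1, i) > 0) {
            return false;
          }
        }
        return true;
      }

      public static void main(String[] args) {
        long[] vals = {1, 2, 2, 5};
        System.out.println(isSorted(vals.length,
            (a, b) -> Long.compare(vals[a], vals[b]))); // true
      }
    }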
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7d96f9f7/lucene/core/src/java/org/apache/lucene/index/SortingLeafReader.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingLeafReader.java b/lucene/core/src/java/org/apache/lucene/index/SortingLeafReader.java
index c1476d0..1b57594 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortingLeafReader.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortingLeafReader.java
@@ -36,15 +36,15 @@ import org.apache.lucene.util.automaton.CompiledAutomaton;
/**
* An {@link org.apache.lucene.index.LeafReader} which supports sorting documents by a given
- * {@link Sort}. This is package private and is only used by Lucene when it needs to merge
- * a newly flushed (unsorted) segment.
+ * {@link Sort}. This is package private and is only used by Lucene for backwards compatibility, when it needs to merge
+ * an unsorted flushed segment built by an older version (newly flushed segments are sorted since version 6.5.0).
*
* @lucene.experimental
*/
class SortingLeafReader extends FilterLeafReader {
- private static class SortingFields extends FilterFields {
+ static class SortingFields extends FilterFields {
private final Sorter.DocMap docMap;
private final FieldInfos infos;