You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by si...@apache.org on 2020/09/23 12:21:44 UTC
[lucene-solr] branch master updated: LUCENE-9539: Remove caches
from SortingCodecReader (#1909)
This is an automated email from the ASF dual-hosted git repository.
simonw pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new 17c285d LUCENE-9539: Remove caches from SortingCodecReader (#1909)
17c285d is described below
commit 17c285d61743da0c06735e06235b20bd5aac4e14
Author: Simon Willnauer <si...@apache.org>
AuthorDate: Wed Sep 23 14:21:28 2020 +0200
LUCENE-9539: Remove caches from SortingCodecReader (#1909)
SortingCodecReader keeps all doc values in memory that are loaded from this reader.
Yet, this reader should only be used for merging, which happens sequentially. This makes
caching doc values unnecessary.
Co-authored-by: Jim Ferenczi <ji...@elastic.co>
---
lucene/CHANGES.txt | 7 +
.../apache/lucene/index/BinaryDocValuesWriter.java | 12 +-
.../org/apache/lucene/index/NormValuesWriter.java | 2 +-
.../lucene/index/NumericDocValuesWriter.java | 14 +-
.../apache/lucene/index/SortingCodecReader.java | 166 ++++++++++-----------
.../lucene/index/TestSortingCodecReader.java | 8 +-
6 files changed, 107 insertions(+), 102 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 7389abf..8594ea0 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -274,6 +274,13 @@ Documentation
* LUCENE-9424: Add a performance warning to AttributeSource.captureState javadocs (Haoyu Zhai)
+Changes in Runtime Behavior
+---------------------
+
+* LUCENE-9539: SortingCodecReader now doesn't cache doc values fields anymore. Previously, SortingCodecReader
+ used to cache all doc values fields after they were loaded into memory. This reader should only be used
+ to sort segments after the fact using IndexWriter#addIndices. (Simon Willnauer)
+
Other
---------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java
index 785adff..fe9cfaf 100644
--- a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java
@@ -113,9 +113,9 @@ class BinaryDocValuesWriter extends DocValuesWriter<BinaryDocValues> {
if (finalLengths == null) {
finalLengths = this.lengths.build();
}
- final CachedBinaryDVs sorted;
+ final BinaryDVs sorted;
if (sortMap != null) {
- sorted = new CachedBinaryDVs(state.segmentInfo.maxDoc(), sortMap,
+ sorted = new BinaryDVs(state.segmentInfo.maxDoc(), sortMap,
new BufferedBinaryDocValues(finalLengths, maxLength, bytes.getDataInput(), docsWithField.iterator()));
} else {
sorted = null;
@@ -189,11 +189,11 @@ class BinaryDocValuesWriter extends DocValuesWriter<BinaryDocValues> {
}
static class SortingBinaryDocValues extends BinaryDocValues {
- private final CachedBinaryDVs dvs;
+ private final BinaryDVs dvs;
private final BytesRefBuilder spare = new BytesRefBuilder();
private int docID = -1;
- SortingBinaryDocValues(CachedBinaryDVs dvs) {
+ SortingBinaryDocValues(BinaryDVs dvs) {
this.dvs = dvs;
}
@@ -235,10 +235,10 @@ class BinaryDocValuesWriter extends DocValuesWriter<BinaryDocValues> {
}
}
- static final class CachedBinaryDVs {
+ static final class BinaryDVs {
final int[] offsets;
final BytesRefArray values;
- CachedBinaryDVs(int maxDoc, Sorter.DocMap sortMap, BinaryDocValues oldValues) throws IOException {
+ BinaryDVs(int maxDoc, Sorter.DocMap sortMap, BinaryDocValues oldValues) throws IOException {
offsets = new int[maxDoc];
values = new BytesRefArray(Counter.newCounter());
int offset = 1; // 0 means no values for this document
diff --git a/lucene/core/src/java/org/apache/lucene/index/NormValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/NormValuesWriter.java
index 2964352..1039880 100644
--- a/lucene/core/src/java/org/apache/lucene/index/NormValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/NormValuesWriter.java
@@ -70,7 +70,7 @@ class NormValuesWriter {
public void flush(SegmentWriteState state, Sorter.DocMap sortMap, NormsConsumer normsConsumer) throws IOException {
final PackedLongValues values = pending.build();
- final NumericDocValuesWriter.CachedNumericDVs sorted;
+ final NumericDocValuesWriter.NumericDVs sorted;
if (sortMap != null) {
sorted = NumericDocValuesWriter.sortDocValues(state.segmentInfo.maxDoc(), sortMap,
new BufferedNorms(values, docsWithField.iterator()));
diff --git a/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java
index a696bca..fcd0527 100644
--- a/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java
@@ -77,7 +77,7 @@ class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
return new BufferedNumericDocValues(finalValues, docsWithField.iterator());
}
- static CachedNumericDVs sortDocValues(int maxDoc, Sorter.DocMap sortMap, NumericDocValues oldDocValues) throws IOException {
+ static NumericDVs sortDocValues(int maxDoc, Sorter.DocMap sortMap, NumericDocValues oldDocValues) throws IOException {
FixedBitSet docsWithField = new FixedBitSet(maxDoc);
long[] values = new long[maxDoc];
while (true) {
@@ -89,7 +89,7 @@ class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
docsWithField.set(newDocID);
values[newDocID] = oldDocValues.longValue();
}
- return new CachedNumericDVs(values, docsWithField);
+ return new NumericDVs(values, docsWithField);
}
@Override
@@ -97,7 +97,7 @@ class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
if (finalValues == null) {
finalValues = pending.build();
}
- final CachedNumericDVs sorted;
+ final NumericDVs sorted;
if (sortMap != null) {
NumericDocValues oldValues = new BufferedNumericDocValues(finalValues, docsWithField.iterator());
sorted = sortDocValues(state.segmentInfo.maxDoc(), sortMap, oldValues);
@@ -169,11 +169,11 @@ class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
static class SortingNumericDocValues extends NumericDocValues {
- private final CachedNumericDVs dvs;
+ private final NumericDVs dvs;
private int docID = -1;
private long cost = -1;
- SortingNumericDocValues(CachedNumericDVs dvs) {
+ SortingNumericDocValues(NumericDVs dvs) {
this.dvs = dvs;
}
@@ -218,11 +218,11 @@ class NumericDocValuesWriter extends DocValuesWriter<NumericDocValues> {
}
}
- static class CachedNumericDVs {
+ static class NumericDVs {
private final long[] values;
private final BitSet docsWithField;
- CachedNumericDVs(long[] values, BitSet docsWithField) {
+ NumericDVs(long[] values, BitSet docsWithField) {
this.values = values;
this.docsWithField = docsWithField;
}
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java
index 8eb08c5..f5831fd 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java
@@ -30,8 +30,10 @@ import org.apache.lucene.codecs.PointsReader;
import org.apache.lucene.codecs.StoredFieldsReader;
import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.IOSupplier;
import org.apache.lucene.util.packed.PackedInts;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
@@ -41,21 +43,13 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
* {@link Sort}. This can be used to re-sort and index after it's been created by wrapping all
* readers of the index with this reader and adding it to a fresh IndexWriter via
* {@link IndexWriter#addIndexes(CodecReader...)}.
+ * NOTE: This reader should only be used for merging. Pulling fields from this reader might be very costly and memory
+ * intensive.
*
* @lucene.experimental
*/
public final class SortingCodecReader extends FilterCodecReader {
- private final Map<String, NumericDocValuesWriter.CachedNumericDVs> cachedNumericDVs = new HashMap<>();
-
- private final Map<String, BinaryDocValuesWriter.CachedBinaryDVs> cachedBinaryDVs = new HashMap<>();
-
- private final Map<String, int[]> cachedSortedDVs = new HashMap<>();
-
- private final Map<String, SortedSetDocValuesWriter.DocOrds> cachedSortedSetDVs = new HashMap<>();
-
- private final Map<String, SortedNumericDocValuesWriter.LongValues> cachedSortedNumericDVs = new HashMap<>();
-
private static class SortingBits implements Bits {
private final Bits in;
@@ -148,10 +142,6 @@ public final class SortingCodecReader extends FilterCodecReader {
}
}
-
-
-
-
/** Return a sorted view of <code>reader</code> according to the order
* defined by <code>sort</code>. If the reader is already sorted, this
* method might return the reader as-is. */
@@ -313,15 +303,13 @@ public final class SortingCodecReader extends FilterCodecReader {
};
}
- private final Map<String, NumericDocValuesWriter.CachedNumericDVs> cachedNorms = new HashMap<>();
-
@Override
public NormsProducer getNormsReader() {
final NormsProducer delegate = in.getNormsReader();
return new NormsProducer() {
@Override
public NumericDocValues getNorms(FieldInfo field) throws IOException {
- return produceNumericDocValues(field, delegate.getNorms(field), cachedNorms);
+ return new NumericDocValuesWriter.SortingNumericDocValues(getOrCreateNorms(field.name, () -> getNumericDocValues(delegate.getNorms(field))));
}
@Override
@@ -341,101 +329,48 @@ public final class SortingCodecReader extends FilterCodecReader {
};
}
- private NumericDocValues produceNumericDocValues(FieldInfo field, NumericDocValues oldNorms,
- Map<String, NumericDocValuesWriter.CachedNumericDVs> cachedNorms) throws IOException {
- NumericDocValuesWriter.CachedNumericDVs norms;
- synchronized (cachedNorms) {
- norms = cachedNorms.get(field);
- if (norms == null) {
- FixedBitSet docsWithField = new FixedBitSet(maxDoc());
- long[] values = new long[maxDoc()];
- while (true) {
- int docID = oldNorms.nextDoc();
- if (docID == NO_MORE_DOCS) {
- break;
- }
- int newDocID = docMap.oldToNew(docID);
- docsWithField.set(newDocID);
- values[newDocID] = oldNorms.longValue();
- }
- norms = new NumericDocValuesWriter.CachedNumericDVs(values, docsWithField);
- cachedNorms.put(field.name, norms);
- }
- }
- return new NumericDocValuesWriter.SortingNumericDocValues(norms);
- }
-
@Override
public DocValuesProducer getDocValuesReader() {
final DocValuesProducer delegate = in.getDocValuesReader();
return new DocValuesProducer() {
@Override
public NumericDocValues getNumeric(FieldInfo field) throws IOException {
- return produceNumericDocValues(field,delegate.getNumeric(field), cachedNumericDVs);
+ return new NumericDocValuesWriter.SortingNumericDocValues(getOrCreateDV(field.name, () -> getNumericDocValues(delegate.getNumeric(field))));
}
@Override
public BinaryDocValues getBinary(FieldInfo field) throws IOException {
- final BinaryDocValues oldDocValues = delegate.getBinary(field);
- BinaryDocValuesWriter.CachedBinaryDVs dvs;
- synchronized (cachedBinaryDVs) {
- dvs = cachedBinaryDVs.get(field);
- if (dvs == null) {
- dvs = new BinaryDocValuesWriter.CachedBinaryDVs(maxDoc(), docMap, oldDocValues);
- cachedBinaryDVs.put(field.name, dvs);
- }
- }
- return new BinaryDocValuesWriter.SortingBinaryDocValues(dvs);
+ return new BinaryDocValuesWriter.SortingBinaryDocValues(getOrCreateDV(field.name,
+ () -> new BinaryDocValuesWriter.BinaryDVs(maxDoc(), docMap, delegate.getBinary(field))));
}
@Override
public SortedDocValues getSorted(FieldInfo field) throws IOException {
SortedDocValues oldDocValues = delegate.getSorted(field);
- int[] ords;
- synchronized (cachedSortedDVs) {
- ords = cachedSortedDVs.get(field);
- if (ords == null) {
- ords = new int[maxDoc()];
- Arrays.fill(ords, -1);
- int docID;
- while ((docID = oldDocValues.nextDoc()) != NO_MORE_DOCS) {
- int newDocID = docMap.oldToNew(docID);
- ords[newDocID] = oldDocValues.ordValue();
- }
- cachedSortedDVs.put(field.name, ords);
+ return new SortedDocValuesWriter.SortingSortedDocValues(oldDocValues, getOrCreateDV(field.name, () -> {
+ int[] ords = new int[maxDoc()];
+ Arrays.fill(ords, -1);
+ int docID;
+ while ((docID = oldDocValues.nextDoc()) != NO_MORE_DOCS) {
+ int newDocID = docMap.oldToNew(docID);
+ ords[newDocID] = oldDocValues.ordValue();
}
- }
-
- return new SortedDocValuesWriter.SortingSortedDocValues(oldDocValues, ords);
+ return ords;
+ }));
}
@Override
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
final SortedNumericDocValues oldDocValues = delegate.getSortedNumeric(field);
- SortedNumericDocValuesWriter.LongValues values;
- synchronized (cachedSortedNumericDVs) {
- values = cachedSortedNumericDVs.get(field);
- if (values == null) {
- values = new SortedNumericDocValuesWriter.LongValues(maxDoc(), docMap, oldDocValues, PackedInts.FAST);
- cachedSortedNumericDVs.put(field.name, values);
- }
- }
-
- return new SortedNumericDocValuesWriter.SortingSortedNumericDocValues(oldDocValues, values);
+ return new SortedNumericDocValuesWriter.SortingSortedNumericDocValues(oldDocValues, getOrCreateDV(field.name, () ->
+ new SortedNumericDocValuesWriter.LongValues(maxDoc(), docMap, oldDocValues, PackedInts.FAST)));
}
@Override
public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
SortedSetDocValues oldDocValues = delegate.getSortedSet(field);
- SortedSetDocValuesWriter.DocOrds ords;
- synchronized (cachedSortedSetDVs) {
- ords = cachedSortedSetDVs.get(field);
- if (ords == null) {
- ords = new SortedSetDocValuesWriter.DocOrds(maxDoc(), docMap, oldDocValues, PackedInts.FASTEST);
- cachedSortedSetDVs.put(field.name, ords);
- }
- }
- return new SortedSetDocValuesWriter.SortingSortedSetDocValues(oldDocValues, ords);
+ return new SortedSetDocValuesWriter.SortingSortedSetDocValues(oldDocValues, getOrCreateDV(field.name, () ->
+ new SortedSetDocValuesWriter.DocOrds(maxDoc(), docMap, oldDocValues, PackedInts.FAST)));
}
@Override
@@ -455,6 +390,18 @@ public final class SortingCodecReader extends FilterCodecReader {
};
}
+ private NumericDocValuesWriter.NumericDVs getNumericDocValues(NumericDocValues oldNumerics) throws IOException {
+ FixedBitSet docsWithField = new FixedBitSet(maxDoc());
+ long[] values = new long[maxDoc()];
+ int docID;
+ while ((docID = oldNumerics.nextDoc()) != NO_MORE_DOCS) {
+ int newDocID = docMap.oldToNew(docID);
+ docsWithField.set(newDocID);
+ values[newDocID] = oldNumerics.longValue();
+ }
+ return new NumericDocValuesWriter.NumericDVs(values, docsWithField);
+ }
+
@Override
public TermVectorsReader getTermVectorsReader() {
return newTermVectorsReader(in.getTermVectorsReader());
@@ -510,4 +457,51 @@ public final class SortingCodecReader extends FilterCodecReader {
return metaData;
}
+ // we try to cache the last used DV or Norms instance since during merge
+ // this instance is used more than once. We could in addition to this single instance
+ // also cache the fields that are used for sorting since we do the work twice for these fields
+ private String cachedField;
+ private Object cachedObject;
+ private boolean cacheIsNorms;
+
+ private <T> T getOrCreateNorms(String field, IOSupplier<T> supplier) throws IOException {
+ return getOrCreate(field, true, supplier);
+ }
+
+ @SuppressWarnings("unchecked")
+ private synchronized <T> T getOrCreate(String field, boolean norms, IOSupplier<T> supplier) throws IOException {
+ if ((field.equals(cachedField) && cacheIsNorms == norms) == false) {
+ assert assertCreatedOnlyOnce(field, norms);
+ cachedObject = supplier.get();
+ cachedField = field;
+ cacheIsNorms = norms;
+ }
+ assert cachedObject != null;
+ return (T) cachedObject;
+ }
+
+ private final Map<String, Integer> cacheStats = new HashMap<>(); // only with assertions enabled
+ private boolean assertCreatedOnlyOnce(String field, boolean norms) {
+ assert Thread.holdsLock(this);
+ // this is mainly there to make sure we change anything in the way we merge we realize it early
+ Integer timesCached = cacheStats.compute(field + "N:" + norms, (s, i) -> i == null ? 1 : i.intValue() + 1);
+ if (timesCached > 1) {
+ assert norms == false :"[" + field + "] norms must not be cached twice";
+ boolean isSortField = false;
+ for (SortField sf : metaData.getSort().getSort()) {
+ if (field.equals(sf.getField())) {
+ isSortField = true;
+ break;
+ }
+ }
+ assert timesCached == 2 : "[" + field + "] must not be cached more than twice but was cached: "
+ + timesCached + " times isSortField: " + isSortField;
+ assert isSortField : "only sort fields should be cached twice but [" + field + "] is not a sort field";
+ }
+ return true;
+ }
+
+ private <T> T getOrCreateDV(String field, IOSupplier<T> supplier) throws IOException {
+ return getOrCreate(field, false, supplier);
+ }
}
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java b/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java
index c1ac197..fb54048 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java
@@ -115,14 +115,18 @@ public class TestSortingCodecReader extends LuceneTestCase {
for (int i = 0; i < numDocs; i++) {
int docId = docIds.get(i);
Document doc = new Document();
- doc.add(new NumericDocValuesField("foo", random().nextInt(20)));
doc.add(new StringField("id", Integer.toString(docId), Field.Store.YES));
doc.add(new LongPoint("id", docId));
- doc.add(new TextField("text_field", RandomStrings.randomRealisticUnicodeOfLength(random(), 25), Field.Store.YES));
+ String s = RandomStrings.randomRealisticUnicodeOfLength(random(), 25);
+ doc.add(new TextField("text_field", s, Field.Store.YES));
+ doc.add(new BinaryDocValuesField("text_field", new BytesRef(s)));
+ doc.add(new TextField("another_text_field", s, Field.Store.YES));
+ doc.add(new BinaryDocValuesField("another_text_field", new BytesRef(s)));
doc.add(new SortedNumericDocValuesField("sorted_numeric_dv", docId));
doc.add(new SortedDocValuesField("binary_sorted_dv", new BytesRef(Integer.toString(docId))));
doc.add(new BinaryDocValuesField("binary_dv", new BytesRef(Integer.toString(docId))));
doc.add(new SortedSetDocValuesField("sorted_set_dv", new BytesRef(Integer.toString(docId))));
+ doc.add(new NumericDocValuesField("foo", random().nextInt(20)));
FieldType ft = new FieldType(StringField.TYPE_NOT_STORED);
ft.setStoreTermVectors(true);