You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by si...@apache.org on 2020/09/22 13:24:06 UTC
[lucene-solr] branch branch_8x updated: LUCENE-9539: Use more
compact datastructures for sorting doc-values (#1908)
This is an automated email from the ASF dual-hosted git repository.
simonw pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/branch_8x by this push:
new 705faa3 LUCENE-9539: Use more compact datastructures for sorting doc-values (#1908)
705faa3 is described below
commit 705faa3b2c02f350c928babc731685e1bfaf1027
Author: Simon Willnauer <si...@apache.org>
AuthorDate: Tue Sep 22 15:10:53 2020 +0200
LUCENE-9539: Use more compact datastructures for sorting doc-values (#1908)
This change cuts over from object based data-structures to primitive / compressed data-structures.
---
lucene/CHANGES.txt | 3 +
.../apache/lucene/index/BinaryDocValuesWriter.java | 64 ++++++--------
.../lucene/index/SortedNumericDocValuesWriter.java | 82 ++++++++++--------
.../lucene/index/SortedSetDocValuesWriter.java | 98 +++++++++++-----------
.../apache/lucene/index/SortingCodecReader.java | 16 ++--
5 files changed, 135 insertions(+), 128 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 6bd927b..362ff57 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -66,6 +66,9 @@ Improvements
* LUCENE-9523: In query shapes over shape fields, skip points while traversing the
BKD tree when the relationship with the document is already known. (Ignacio Vera)
+* LUCENE-9539: Use more compact datastructures to represent sorted doc-values in memory when
+ sorting a segment before flush and in SortingCodecReader. (Simon Willnauer)
+
Optimizations
---------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java
index 775f325..785adff 100644
--- a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java
@@ -24,11 +24,10 @@ import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefArray;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.Counter;
-import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PackedLongValues;
@@ -58,7 +57,7 @@ class BinaryDocValuesWriter extends DocValuesWriter<BinaryDocValues> {
private PackedLongValues finalLengths;
- public BinaryDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
+ BinaryDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
this.fieldInfo = fieldInfo;
this.bytes = new PagedBytes(BLOCK_BITS);
this.bytesOut = bytes.getDataOutput();
@@ -100,21 +99,6 @@ class BinaryDocValuesWriter extends DocValuesWriter<BinaryDocValues> {
bytesUsed = newBytesUsed;
}
- static CachedBinaryDVs sortDocValues(int maxDoc, Sorter.DocMap sortMap, BinaryDocValues oldValues) throws IOException {
- FixedBitSet docsWithField = new FixedBitSet(maxDoc);
- BytesRef[] values = new BytesRef[maxDoc];
- while (true) {
- int docID = oldValues.nextDoc();
- if (docID == NO_MORE_DOCS) {
- break;
- }
- int newDocID = sortMap.oldToNew(docID);
- docsWithField.set(newDocID);
- values[newDocID] = BytesRef.deepCopyOf(oldValues.binaryValue());
- }
- return new CachedBinaryDVs(values, docsWithField);
- }
-
@Override
BinaryDocValues getDocValues() {
if (finalLengths == null) {
@@ -131,7 +115,7 @@ class BinaryDocValuesWriter extends DocValuesWriter<BinaryDocValues> {
}
final CachedBinaryDVs sorted;
if (sortMap != null) {
- sorted = sortDocValues(state.segmentInfo.maxDoc(), sortMap,
+ sorted = new CachedBinaryDVs(state.segmentInfo.maxDoc(), sortMap,
new BufferedBinaryDocValues(finalLengths, maxLength, bytes.getDataInput(), docsWithField.iterator()));
} else {
sorted = null;
@@ -206,8 +190,8 @@ class BinaryDocValuesWriter extends DocValuesWriter<BinaryDocValues> {
static class SortingBinaryDocValues extends BinaryDocValues {
private final CachedBinaryDVs dvs;
+ private final BytesRefBuilder spare = new BytesRefBuilder();
private int docID = -1;
- private long cost = -1;
SortingBinaryDocValues(CachedBinaryDVs dvs) {
this.dvs = dvs;
@@ -215,11 +199,12 @@ class BinaryDocValuesWriter extends DocValuesWriter<BinaryDocValues> {
@Override
public int nextDoc() {
- if (docID+1 == dvs.docsWithField.length()) {
- docID = NO_MORE_DOCS;
- } else {
- docID = dvs.docsWithField.nextSetBit(docID+1);
- }
+ do {
+ docID++;
+ if (docID == dvs.offsets.length) {
+ return docID = NO_MORE_DOCS;
+ }
+ } while (dvs.offsets[docID] <= 0);
return docID;
}
@@ -240,26 +225,29 @@ class BinaryDocValuesWriter extends DocValuesWriter<BinaryDocValues> {
@Override
public BytesRef binaryValue() {
- return dvs.values[docID];
+ dvs.values.get(spare, dvs.offsets[docID]-1);
+ return spare.get();
}
@Override
public long cost() {
- if (cost == -1) {
- cost = dvs.docsWithField.cardinality();
- }
- return cost;
+ return dvs.values.size();
}
}
- static class CachedBinaryDVs {
- // TODO: at least cutover to BytesRefArray here:
- private final BytesRef[] values;
- private final BitSet docsWithField;
-
- CachedBinaryDVs(BytesRef[] values, BitSet docsWithField) {
- this.values = values;
- this.docsWithField = docsWithField;
+ static final class CachedBinaryDVs {
+ final int[] offsets;
+ final BytesRefArray values;
+ CachedBinaryDVs(int maxDoc, Sorter.DocMap sortMap, BinaryDocValues oldValues) throws IOException {
+ offsets = new int[maxDoc];
+ values = new BytesRefArray(Counter.newCounter());
+ int offset = 1; // 0 means no values for this document
+ int docID;
+ while ((docID = oldValues.nextDoc()) != NO_MORE_DOCS) {
+ int newDocID = sortMap.oldToNew(docID);
+ values.append(oldValues.binaryValue());
+ offsets[newDocID] = offset++;
+ }
}
}
}
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java
index 0acf93b..e756278 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java
@@ -39,13 +39,13 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
private long bytesUsed; // this only tracks differences in 'pending' and 'pendingCounts'
private final FieldInfo fieldInfo;
private int currentDoc = -1;
- private long currentValues[] = new long[8];
+ private long[] currentValues = new long[8];
private int currentUpto = 0;
private PackedLongValues finalValues;
private PackedLongValues finalValuesCount;
- public SortedNumericDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
+ SortedNumericDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
this.fieldInfo = fieldInfo;
this.iwBytesUsed = iwBytesUsed;
pending = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
@@ -108,18 +108,28 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
return new BufferedSortedNumericDocValues(finalValues, finalValuesCount, docsWithField.iterator());
}
- static long[][] sortDocValues(int maxDoc, Sorter.DocMap sortMap, SortedNumericDocValues oldValues) throws IOException {
- long[][] values = new long[maxDoc][];
- int docID;
- while ((docID = oldValues.nextDoc()) != NO_MORE_DOCS) {
- int newDocID = sortMap.oldToNew(docID);
- long[] docValues = new long[oldValues.docValueCount()];
- for (int i = 0; i < docValues.length; i++) {
- docValues[i] = oldValues.nextValue();
+ static final class LongValues {
+ final long[] offsets;
+ final PackedLongValues values;
+ LongValues(int maxDoc, Sorter.DocMap sortMap, SortedNumericDocValues oldValues, float acceptableOverheadRatio) throws IOException {
+ offsets = new long[maxDoc];
+ PackedLongValues.Builder valuesBuiler = PackedLongValues.packedBuilder(acceptableOverheadRatio);
+ int docID;
+ long offsetIndex = 1; // 0 means the doc has no values
+ while ((docID = oldValues.nextDoc()) != NO_MORE_DOCS) {
+ int newDocID = sortMap.oldToNew(docID);
+ int numValues = oldValues.docValueCount();
+ valuesBuiler.add(numValues);
+ offsets[newDocID] = offsetIndex++;
+ for (int i = 0; i < numValues; i++) {
+ valuesBuiler.add(oldValues.nextValue());
+ offsetIndex++;
+ }
}
- values[newDocID] = docValues;
+ values = valuesBuiler.build();
}
- return values;
+
+
}
@Override
@@ -135,10 +145,10 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
valueCounts = finalValuesCount;
}
- final long[][] sorted;
+ final LongValues sorted;
if (sortMap != null) {
- sorted = sortDocValues(state.segmentInfo.maxDoc(), sortMap,
- new BufferedSortedNumericDocValues(values, valueCounts, docsWithField.iterator()));
+ sorted = new LongValues(state.segmentInfo.maxDoc(), sortMap,
+ new BufferedSortedNumericDocValues(values, valueCounts, docsWithField.iterator()), PackedInts.FASTEST);
} else {
sorted = null;
}
@@ -225,11 +235,13 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
static class SortingSortedNumericDocValues extends SortedNumericDocValues {
private final SortedNumericDocValues in;
- private final long[][] values;
+ private final LongValues values;
private int docID = -1;
- private int upto;
+ private long upto;
+ private int numValues = - 1;
+ private long limit;
- SortingSortedNumericDocValues(SortedNumericDocValues in, long[][] values) {
+ SortingSortedNumericDocValues(SortedNumericDocValues in, LongValues values) {
this.in = in;
this.values = values;
}
@@ -241,18 +253,15 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
@Override
public int nextDoc() {
- while (true) {
+ do {
docID++;
- if (docID == values.length) {
- docID = NO_MORE_DOCS;
- break;
+ if (docID >= values.offsets.length) {
+ return docID = NO_MORE_DOCS;
}
- if (values[docID] != null) {
- break;
- }
- // skip missing docs
- }
- upto = 0;
+ } while (values.offsets[docID] <= 0);
+ upto = values.offsets[docID];
+ numValues = Math.toIntExact(values.values.get(upto-1));
+ limit = upto + numValues;
return docID;
}
@@ -264,16 +273,23 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
@Override
public boolean advanceExact(int target) throws IOException {
docID = target;
- upto = 0;
- return values[docID] != null;
+ upto = values.offsets[docID];
+ if (values.offsets[docID] > 0) {
+ numValues = Math.toIntExact(values.values.get(upto-1));
+ limit = upto + numValues;
+ return true;
+ } else {
+ limit = upto;
+ }
+ return false;
}
@Override
public long nextValue() {
- if (upto == values[docID].length) {
+ if (upto == limit) {
throw new AssertionError();
} else {
- return values[docID][upto++];
+ return values.values.get(upto++);
}
}
@@ -284,7 +300,7 @@ class SortedNumericDocValuesWriter extends DocValuesWriter<SortedNumericDocValue
@Override
public int docValueCount() {
- return values[docID].length;
+ return numValues;
}
}
}
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java
index c444edd..de9fa88 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java
@@ -45,7 +45,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
private long bytesUsed; // this only tracks differences in 'pending' and 'pendingCounts'
private final FieldInfo fieldInfo;
private int currentDoc = -1;
- private int currentValues[] = new int[8];
+ private int[] currentValues = new int[8];
private int currentUpto;
private int maxCount;
@@ -55,7 +55,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
private int[] finalOrdMap;
- public SortedSetDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
+ SortedSetDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
this.fieldInfo = fieldInfo;
this.iwBytesUsed = iwBytesUsed;
hash = new BytesRefHash(
@@ -139,28 +139,6 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
bytesUsed = newBytesUsed;
}
- static long[][] sortDocValues(int maxDoc, Sorter.DocMap sortMap, SortedSetDocValues oldValues) throws IOException {
- long[][] ords = new long[maxDoc][];
- int docID;
- while ((docID = oldValues.nextDoc()) != NO_MORE_DOCS) {
- int newDocID = sortMap.oldToNew(docID);
- long[] docOrds = new long[1];
- int upto = 0;
- while (true) {
- long ord = oldValues.nextOrd();
- if (ord == NO_MORE_ORDS) {
- break;
- }
- if (upto == docOrds.length) {
- docOrds = ArrayUtil.grow(docOrds);
- }
- docOrds[upto++] = ord;
- }
- ords[newDocID] = ArrayUtil.copyOfSubArray(docOrds, 0, upto);
- }
- return ords;
- }
-
@Override
SortedSetDocValues getDocValues() {
if (finalOrds == null) {
@@ -203,12 +181,12 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
ordMap = finalOrdMap;
}
- final long[][] sorted;
+ final DocOrds docOrds;
if (sortMap != null) {
- sorted = sortDocValues(state.segmentInfo.maxDoc(), sortMap,
- new BufferedSortedSetDocValues(sortedValues, ordMap, hash, ords, ordCounts, maxCount, docsWithField.iterator()));
+ docOrds = new DocOrds(state.segmentInfo.maxDoc(), sortMap,
+ new BufferedSortedSetDocValues(sortedValues, ordMap, hash, ords, ordCounts, maxCount, docsWithField.iterator()), PackedInts.FASTEST);
} else {
- sorted = null;
+ docOrds = null;
}
dvConsumer.addSortedSetField(fieldInfo,
new EmptyDocValuesProducer() {
@@ -219,10 +197,10 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
}
final SortedSetDocValues buf =
new BufferedSortedSetDocValues(sortedValues, ordMap, hash, ords, ordCounts, maxCount, docsWithField.iterator());
- if (sorted == null) {
+ if (docOrds == null) {
return buf;
} else {
- return new SortingSortedSetDocValues(buf, sorted);
+ return new SortingSortedSetDocValues(buf, docOrds);
}
}
});
@@ -236,7 +214,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
final PackedLongValues.Iterator ordsIter;
final PackedLongValues.Iterator ordCountsIter;
final DocIdSetIterator docsWithField;
- final int currentDoc[];
+ final int[] currentDoc;
private int ordCount;
private int ordUpto;
@@ -311,11 +289,11 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
static class SortingSortedSetDocValues extends SortedSetDocValues {
private final SortedSetDocValues in;
- private final long[][] ords;
+ private final DocOrds ords;
private int docID = -1;
- private int ordUpto;
+ private long ordUpto;
- SortingSortedSetDocValues(SortedSetDocValues in, long[][] ords) {
+ SortingSortedSetDocValues(SortedSetDocValues in, DocOrds ords) {
this.in = in;
this.ords = ords;
}
@@ -327,18 +305,13 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
@Override
public int nextDoc() {
- while (true) {
+ do {
docID++;
- if (docID == ords.length) {
- docID = NO_MORE_DOCS;
- break;
+ if (docID == ords.offsets.length) {
+ return docID = NO_MORE_DOCS;
}
- if (ords[docID] != null) {
- break;
- }
- // skip missing docs
- }
- ordUpto = 0;
+ } while (ords.offsets[docID] <= 0);
+ ordUpto = ords.offsets[docID]-1;
return docID;
}
@@ -351,16 +324,16 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
public boolean advanceExact(int target) throws IOException {
// needed in IndexSorter#StringSorter
docID = target;
- ordUpto = 0;
- return ords[docID] != null;
+ ordUpto = ords.offsets[docID]-1;
+ return ords.offsets[docID] > 0;
}
-
@Override
public long nextOrd() {
- if (ordUpto == ords[docID].length) {
+ long ord = ords.ords.get(ordUpto++);
+ if (ord == 0) {
return NO_MORE_ORDS;
} else {
- return ords[docID][ordUpto++];
+ return ord - 1 ;
}
}
@@ -379,4 +352,31 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
return in.getValueCount();
}
}
+
+ static final class DocOrds {
+ final long[] offsets;
+ final PackedLongValues ords;
+
+ DocOrds(int maxDoc, Sorter.DocMap sortMap, SortedSetDocValues oldValues, float acceptableOverheadRatio) throws IOException {
+ offsets = new long[maxDoc];
+ PackedLongValues.Builder builder = PackedLongValues.packedBuilder(acceptableOverheadRatio);
+ long ordOffset = 1; // 0 marks docs with no values
+ int docID;
+ while ((docID = oldValues.nextDoc()) != NO_MORE_DOCS) {
+ int newDocID = sortMap.oldToNew(docID);
+ long startOffset = ordOffset;
+ long ord;
+ while ((ord = oldValues.nextOrd()) != NO_MORE_ORDS) {
+ builder.add(ord + 1);
+ ordOffset++;
+ }
+ if (startOffset != ordOffset) { // do we have any values?
+ offsets[newDocID] = startOffset;
+ builder.add(0); // 0 ord marks next value
+ ordOffset++;
+ }
+ }
+ ords = builder.build();
+ }
+ }
}
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java
index cc7739c..8eb08c5 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java
@@ -32,6 +32,7 @@ import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.search.Sort;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.packed.PackedInts;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
@@ -51,10 +52,9 @@ public final class SortingCodecReader extends FilterCodecReader {
private final Map<String, int[]> cachedSortedDVs = new HashMap<>();
- // TODO: pack long[][] into an int[] (offset) and long[] instead:
- private final Map<String, long[][]> cachedSortedSetDVs = new HashMap<>();
+ private final Map<String, SortedSetDocValuesWriter.DocOrds> cachedSortedSetDVs = new HashMap<>();
- private final Map<String, long[][]> cachedSortedNumericDVs = new HashMap<>();
+ private final Map<String, SortedNumericDocValuesWriter.LongValues> cachedSortedNumericDVs = new HashMap<>();
private static class SortingBits implements Bits {
@@ -381,7 +381,7 @@ public final class SortingCodecReader extends FilterCodecReader {
synchronized (cachedBinaryDVs) {
dvs = cachedBinaryDVs.get(field);
if (dvs == null) {
- dvs = BinaryDocValuesWriter.sortDocValues(maxDoc(), docMap, oldDocValues);
+ dvs = new BinaryDocValuesWriter.CachedBinaryDVs(maxDoc(), docMap, oldDocValues);
cachedBinaryDVs.put(field.name, dvs);
}
}
@@ -412,11 +412,11 @@ public final class SortingCodecReader extends FilterCodecReader {
@Override
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
final SortedNumericDocValues oldDocValues = delegate.getSortedNumeric(field);
- long[][] values;
+ SortedNumericDocValuesWriter.LongValues values;
synchronized (cachedSortedNumericDVs) {
values = cachedSortedNumericDVs.get(field);
if (values == null) {
- values = SortedNumericDocValuesWriter.sortDocValues(maxDoc(), docMap, oldDocValues);
+ values = new SortedNumericDocValuesWriter.LongValues(maxDoc(), docMap, oldDocValues, PackedInts.FAST);
cachedSortedNumericDVs.put(field.name, values);
}
}
@@ -427,11 +427,11 @@ public final class SortingCodecReader extends FilterCodecReader {
@Override
public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
SortedSetDocValues oldDocValues = delegate.getSortedSet(field);
- long[][] ords;
+ SortedSetDocValuesWriter.DocOrds ords;
synchronized (cachedSortedSetDVs) {
ords = cachedSortedSetDVs.get(field);
if (ords == null) {
- ords = SortedSetDocValuesWriter.sortDocValues(maxDoc(), docMap, oldDocValues);
+ ords = new SortedSetDocValuesWriter.DocOrds(maxDoc(), docMap, oldDocValues, PackedInts.FASTEST);
cachedSortedSetDVs.put(field.name, ords);
}
}