You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by br...@apache.org on 2019/12/02 14:19:13 UTC
[lucene-solr] branch branch_8x updated: LUCENE-9045: Do not use
TreeMap/TreeSet in BlockTree and PerFieldPostingsFormat.
This is an automated email from the ASF dual-hosted git repository.
broustant pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/branch_8x by this push:
new bb600e3 LUCENE-9045: Do not use TreeMap/TreeSet in BlockTree and PerFieldPostingsFormat.
bb600e3 is described below
commit bb600e363823577fe2cb9b880e76b223fcf28ae8
Author: Bruno Roustant <br...@salesforce.com>
AuthorDate: Mon Dec 2 11:41:56 2019 +0100
LUCENE-9045: Do not use TreeMap/TreeSet in BlockTree and PerFieldPostingsFormat.
Closes #1007
---
lucene/CHANGES.txt | 2 +
.../codecs/blocktree/BlockTreeTermsReader.java | 25 +++++----
.../codecs/perfield/PerFieldPostingsFormat.java | 65 +++++++++++++++-------
3 files changed, 63 insertions(+), 29 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 64e9f96..ca65b52 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -61,6 +61,8 @@ Optimizations
* LUCENE-9049: Remove FST cached root arcs now redundant with labels indexed by bitset.
This frees some on-heap FST space. (Jack Conradson via Bruno Roustant)
+* LUCENE-9045: Do not use TreeMap/TreeSet in BlockTree and PerFieldPostingsFormat. (Bruno Roustant)
+
Bug Fixes
* LUCENE-9001: Fix race condition in SetOnce. (Przemko Robakowski)
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java
index 3713452..0a0cd31 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java
@@ -22,10 +22,10 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
+import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
-import java.util.TreeMap;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsProducer;
@@ -146,7 +146,8 @@ public final class BlockTreeTermsReader extends FieldsProducer {
// produce DocsEnum on demand
final PostingsReaderBase postingsReader;
- private final TreeMap<String,FieldReader> fields = new TreeMap<>();
+ private final Map<String,FieldReader> fieldMap;
+ private final List<String> fieldList;
final String segment;
@@ -195,6 +196,7 @@ public final class BlockTreeTermsReader extends FieldsProducer {
if (numFields < 0) {
throw new CorruptIndexException("invalid numFields: " + numFields, termsIn);
}
+ fieldMap = new HashMap<>((int) (numFields / 0.75f) + 1);
for (int i = 0; i < numFields; ++i) {
final int field = termsIn.readVInt();
final long numTerms = termsIn.readVLong();
@@ -227,13 +229,16 @@ public final class BlockTreeTermsReader extends FieldsProducer {
}
final FSTLoadMode perFieldLoadMode = getLoadMode(state.readerAttributes, FST_MODE_KEY + "." + fieldInfo.name, fstLoadMode);
final long indexStartFP = indexIn.readVLong();
- FieldReader previous = fields.put(fieldInfo.name,
+ FieldReader previous = fieldMap.put(fieldInfo.name,
new FieldReader(this, fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount,
indexStartFP, longsSize, indexIn, minTerm, maxTerm, state.openedFromWriter, perFieldLoadMode));
if (previous != null) {
throw new CorruptIndexException("duplicate field: " + fieldInfo.name, termsIn);
}
}
+ List<String> fieldList = new ArrayList<>(fieldMap.keySet());
+ fieldList.sort(null);
+ this.fieldList = Collections.unmodifiableList(fieldList);
success = true;
} finally {
if (!success) {
@@ -289,24 +294,24 @@ public final class BlockTreeTermsReader extends FieldsProducer {
} finally {
// Clear so refs to terms index is GCable even if
// app hangs onto us:
- fields.clear();
+ fieldMap.clear();
}
}
@Override
public Iterator<String> iterator() {
- return Collections.unmodifiableSet(fields.keySet()).iterator();
+ return fieldList.iterator();
}
@Override
public Terms terms(String field) throws IOException {
assert field != null;
- return fields.get(field);
+ return fieldMap.get(field);
}
@Override
public int size() {
- return fields.size();
+ return fieldMap.size();
}
// for debugging
@@ -328,7 +333,7 @@ public final class BlockTreeTermsReader extends FieldsProducer {
@Override
public long ramBytesUsed() {
long sizeInBytes = postingsReader.ramBytesUsed();
- for(FieldReader reader : fields.values()) {
+ for(FieldReader reader : fieldMap.values()) {
sizeInBytes += reader.ramBytesUsed();
}
return sizeInBytes;
@@ -336,7 +341,7 @@ public final class BlockTreeTermsReader extends FieldsProducer {
@Override
public Collection<Accountable> getChildResources() {
- List<Accountable> resources = new ArrayList<>(Accountables.namedAccountables("field", fields));
+ List<Accountable> resources = new ArrayList<>(Accountables.namedAccountables("field", fieldMap));
resources.add(Accountables.namedAccountable("delegate", postingsReader));
return Collections.unmodifiableList(resources);
}
@@ -352,6 +357,6 @@ public final class BlockTreeTermsReader extends FieldsProducer {
@Override
public String toString() {
- return getClass().getSimpleName() + "(fields=" + fields.size() + ",delegate=" + postingsReader + ")";
+ return getClass().getSimpleName() + "(fields=" + fieldMap.size() + ",delegate=" + postingsReader + ")";
}
}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldPostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldPostingsFormat.java
index 81bbf72..52a8851 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldPostingsFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldPostingsFormat.java
@@ -24,6 +24,7 @@ import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.Iterator;
import java.util.List;
@@ -31,7 +32,6 @@ import java.util.Map;
import java.util.ServiceLoader;
import java.util.Set;
import java.util.TreeMap;
-import java.util.TreeSet;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
@@ -86,13 +86,41 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
/** Group of fields written by one PostingsFormat */
static class FieldsGroup {
- final Set<String> fields = new TreeSet<>();
- int suffix;
-
+ final List<String> fields;
+ final int suffix;
/** Custom SegmentWriteState for this group of fields,
* with the segmentSuffix uniqueified for this
* PostingsFormat */
- SegmentWriteState state;
+ final SegmentWriteState state;
+
+ private FieldsGroup(List<String> fields, int suffix, SegmentWriteState state) {
+ this.fields = fields;
+ this.suffix = suffix;
+ this.state = state;
+ }
+
+ static class Builder {
+ final Set<String> fields;
+ final int suffix;
+ final SegmentWriteState state;
+
+ Builder(int suffix, SegmentWriteState state) {
+ this.suffix = suffix;
+ this.state = state;
+ fields = new HashSet<>();
+ }
+
+ Builder addField(String field) {
+ fields.add(field);
+ return this;
+ }
+
+ FieldsGroup build() {
+ List<String> fieldList = new ArrayList<>(fields);
+ fieldList.sort(null);
+ return new FieldsGroup(fieldList, suffix, state);
+ }
+ }
};
static String getSuffix(String formatName, String suffix) {
@@ -178,9 +206,8 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
}
private Map<PostingsFormat, FieldsGroup> buildFieldsGroupMapping(Iterable<String> indexedFieldNames) {
- // Maps a PostingsFormat instance to the suffix it
- // should use
- Map<PostingsFormat,FieldsGroup> formatToGroups = new HashMap<>();
+ // Maps a PostingsFormat instance to the suffix it should use
+ Map<PostingsFormat,FieldsGroup.Builder> formatToGroupBuilders = new HashMap<>();
// Holds last suffix of each PostingFormat name
Map<String,Integer> suffixes = new HashMap<>();
@@ -196,10 +223,9 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
}
String formatName = format.getName();
- FieldsGroup group = formatToGroups.get(format);
- if (group == null) {
- // First time we are seeing this format; create a
- // new instance
+ FieldsGroup.Builder groupBuilder = formatToGroupBuilders.get(format);
+ if (groupBuilder == null) {
+ // First time we are seeing this format; create a new instance
// bump the suffix
Integer suffix = suffixes.get(formatName);
@@ -213,22 +239,23 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
String segmentSuffix = getFullSegmentSuffix(field,
writeState.segmentSuffix,
getSuffix(formatName, Integer.toString(suffix)));
- group = new FieldsGroup();
- group.state = new SegmentWriteState(writeState, segmentSuffix);
- group.suffix = suffix;
- formatToGroups.put(format, group);
+ groupBuilder = new FieldsGroup.Builder(suffix, new SegmentWriteState(writeState, segmentSuffix));
+ formatToGroupBuilders.put(format, groupBuilder);
} else {
// we've already seen this format, so just grab its suffix
if (!suffixes.containsKey(formatName)) {
- throw new IllegalStateException("no suffix for format name: " + formatName + ", expected: " + group.suffix);
+ throw new IllegalStateException("no suffix for format name: " + formatName + ", expected: " + groupBuilder.suffix);
}
}
- group.fields.add(field);
+ groupBuilder.addField(field);
fieldInfo.putAttribute(PER_FIELD_FORMAT_KEY, formatName);
- fieldInfo.putAttribute(PER_FIELD_SUFFIX_KEY, Integer.toString(group.suffix));
+ fieldInfo.putAttribute(PER_FIELD_SUFFIX_KEY, Integer.toString(groupBuilder.suffix));
}
+
+ Map<PostingsFormat,FieldsGroup> formatToGroups = new HashMap<>((int) (formatToGroupBuilders.size() / 0.75f) + 1);
+ formatToGroupBuilders.forEach((postingsFormat, builder) -> formatToGroups.put(postingsFormat, builder.build()));
return formatToGroups;
}