You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by br...@apache.org on 2019/12/02 14:19:13 UTC
[lucene-solr] branch branch_8x updated: LUCENE-9045: Do not use TreeMap/TreeSet in BlockTree and PerFieldPostingsFormat.

This is an automated email from the ASF dual-hosted git repository.

broustant pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/branch_8x by this push:
     new bb600e3  LUCENE-9045: Do not use TreeMap/TreeSet in BlockTree and PerFieldPostingsFormat.
bb600e3 is described below

commit bb600e363823577fe2cb9b880e76b223fcf28ae8
Author: Bruno Roustant <br...@salesforce.com>
AuthorDate: Mon Dec 2 11:41:56 2019 +0100

    LUCENE-9045: Do not use TreeMap/TreeSet in BlockTree and PerFieldPostingsFormat.
    
    Closes #1007
---
 lucene/CHANGES.txt                                 |  2 +
 .../codecs/blocktree/BlockTreeTermsReader.java     | 25 +++++----
 .../codecs/perfield/PerFieldPostingsFormat.java    | 65 +++++++++++++++-------
 3 files changed, 63 insertions(+), 29 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 64e9f96..ca65b52 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -61,6 +61,8 @@ Optimizations
 * LUCENE-9049: Remove FST cached root arcs now redundant with labels indexed by bitset.
   This frees some on-heap FST space. (Jack Conradson via Bruno Roustant)
 
+* LUCENE-9045: Do not use TreeMap/TreeSet in BlockTree and PerFieldPostingsFormat. (Bruno Roustant)
+
 Bug Fixes
 
 * LUCENE-9001: Fix race condition in SetOnce. (Przemko Robakowski)
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java
index 3713452..0a0cd31 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java
@@ -22,10 +22,10 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
-import java.util.TreeMap;
 
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.codecs.FieldsProducer;
@@ -146,7 +146,8 @@ public final class BlockTreeTermsReader extends FieldsProducer {
   // produce DocsEnum on demand
   final PostingsReaderBase postingsReader;
 
-  private final TreeMap<String,FieldReader> fields = new TreeMap<>();
+  private final Map<String,FieldReader> fieldMap;
+  private final List<String> fieldList;
 
   final String segment;
   
@@ -195,6 +196,7 @@ public final class BlockTreeTermsReader extends FieldsProducer {
       if (numFields < 0) {
         throw new CorruptIndexException("invalid numFields: " + numFields, termsIn);
       }
+      fieldMap = new HashMap<>((int) (numFields / 0.75f) + 1);
       for (int i = 0; i < numFields; ++i) {
         final int field = termsIn.readVInt();
         final long numTerms = termsIn.readVLong();
@@ -227,13 +229,16 @@ public final class BlockTreeTermsReader extends FieldsProducer {
         }
         final FSTLoadMode perFieldLoadMode = getLoadMode(state.readerAttributes, FST_MODE_KEY + "." + fieldInfo.name, fstLoadMode);
         final long indexStartFP = indexIn.readVLong();
-        FieldReader previous = fields.put(fieldInfo.name,
+        FieldReader previous = fieldMap.put(fieldInfo.name,
                                           new FieldReader(this, fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount,
                                                           indexStartFP, longsSize, indexIn, minTerm, maxTerm, state.openedFromWriter, perFieldLoadMode));
         if (previous != null) {
           throw new CorruptIndexException("duplicate field: " + fieldInfo.name, termsIn);
         }
       }
+      List<String> fieldList = new ArrayList<>(fieldMap.keySet());
+      fieldList.sort(null);
+      this.fieldList = Collections.unmodifiableList(fieldList);
       success = true;
     } finally {
       if (!success) {
@@ -289,24 +294,24 @@ public final class BlockTreeTermsReader extends FieldsProducer {
     } finally { 
       // Clear so refs to terms index is GCable even if
       // app hangs onto us:
-      fields.clear();
+      fieldMap.clear();
     }
   }
 
   @Override
   public Iterator<String> iterator() {
-    return Collections.unmodifiableSet(fields.keySet()).iterator();
+    return fieldList.iterator();
   }
 
   @Override
   public Terms terms(String field) throws IOException {
     assert field != null;
-    return fields.get(field);
+    return fieldMap.get(field);
   }
 
   @Override
   public int size() {
-    return fields.size();
+    return fieldMap.size();
   }
 
   // for debugging
@@ -328,7 +333,7 @@ public final class BlockTreeTermsReader extends FieldsProducer {
   @Override
   public long ramBytesUsed() {
     long sizeInBytes = postingsReader.ramBytesUsed();
-    for(FieldReader reader : fields.values()) {
+    for(FieldReader reader : fieldMap.values()) {
       sizeInBytes += reader.ramBytesUsed();
     }
     return sizeInBytes;
@@ -336,7 +341,7 @@ public final class BlockTreeTermsReader extends FieldsProducer {
 
   @Override
   public Collection<Accountable> getChildResources() {
-    List<Accountable> resources = new ArrayList<>(Accountables.namedAccountables("field", fields));
+    List<Accountable> resources = new ArrayList<>(Accountables.namedAccountables("field", fieldMap));
     resources.add(Accountables.namedAccountable("delegate", postingsReader));
     return Collections.unmodifiableList(resources);
   }
@@ -352,6 +357,6 @@ public final class BlockTreeTermsReader extends FieldsProducer {
 
   @Override
   public String toString() {
-    return getClass().getSimpleName() + "(fields=" + fields.size() + ",delegate=" + postingsReader + ")";
+    return getClass().getSimpleName() + "(fields=" + fieldMap.size() + ",delegate=" + postingsReader + ")";
   }
 }
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldPostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldPostingsFormat.java
index 81bbf72..52a8851 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldPostingsFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldPostingsFormat.java
@@ -24,6 +24,7 @@ import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.IdentityHashMap;
 import java.util.Iterator;
 import java.util.List;
@@ -31,7 +32,6 @@ import java.util.Map;
 import java.util.ServiceLoader;
 import java.util.Set;
 import java.util.TreeMap;
-import java.util.TreeSet;
 
 import org.apache.lucene.codecs.FieldsConsumer;
 import org.apache.lucene.codecs.FieldsProducer;
@@ -86,13 +86,41 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
 
   /** Group of fields written by one PostingsFormat */
   static class FieldsGroup {
-    final Set<String> fields = new TreeSet<>();
-    int suffix;
-
+    final List<String> fields;
+    final int suffix;
     /** Custom SegmentWriteState for this group of fields,
      *  with the segmentSuffix uniqueified for this
      *  PostingsFormat */
-    SegmentWriteState state;
+    final SegmentWriteState state;
+
+    private FieldsGroup(List<String> fields, int suffix, SegmentWriteState state) {
+      this.fields = fields;
+      this.suffix = suffix;
+      this.state = state;
+    }
+
+    static class Builder {
+      final Set<String> fields;
+      final int suffix;
+      final SegmentWriteState state;
+
+      Builder(int suffix, SegmentWriteState state) {
+        this.suffix = suffix;
+        this.state = state;
+        fields = new HashSet<>();
+      }
+
+      Builder addField(String field) {
+        fields.add(field);
+        return this;
+      }
+
+      FieldsGroup build() {
+        List<String> fieldList = new ArrayList<>(fields);
+        fieldList.sort(null);
+        return new FieldsGroup(fieldList, suffix, state);
+      }
+    }
   };
 
   static String getSuffix(String formatName, String suffix) {
@@ -178,9 +206,8 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
     }
 
     private Map<PostingsFormat, FieldsGroup> buildFieldsGroupMapping(Iterable<String> indexedFieldNames) {
-      // Maps a PostingsFormat instance to the suffix it
-      // should use
-      Map<PostingsFormat,FieldsGroup> formatToGroups = new HashMap<>();
+      // Maps a PostingsFormat instance to the suffix it should use
+      Map<PostingsFormat,FieldsGroup.Builder> formatToGroupBuilders = new HashMap<>();
 
       // Holds last suffix of each PostingFormat name
       Map<String,Integer> suffixes = new HashMap<>();
@@ -196,10 +223,9 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
         }
         String formatName = format.getName();
 
-        FieldsGroup group = formatToGroups.get(format);
-        if (group == null) {
-          // First time we are seeing this format; create a
-          // new instance
+        FieldsGroup.Builder groupBuilder = formatToGroupBuilders.get(format);
+        if (groupBuilder == null) {
+          // First time we are seeing this format; create a new instance
 
           // bump the suffix
           Integer suffix = suffixes.get(formatName);
@@ -213,22 +239,23 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
           String segmentSuffix = getFullSegmentSuffix(field,
                                                       writeState.segmentSuffix,
                                                       getSuffix(formatName, Integer.toString(suffix)));
-          group = new FieldsGroup();
-          group.state = new SegmentWriteState(writeState, segmentSuffix);
-          group.suffix = suffix;
-          formatToGroups.put(format, group);
+          groupBuilder = new FieldsGroup.Builder(suffix, new SegmentWriteState(writeState, segmentSuffix));
+          formatToGroupBuilders.put(format, groupBuilder);
         } else {
           // we've already seen this format, so just grab its suffix
           if (!suffixes.containsKey(formatName)) {
-            throw new IllegalStateException("no suffix for format name: " + formatName + ", expected: " + group.suffix);
+            throw new IllegalStateException("no suffix for format name: " + formatName + ", expected: " + groupBuilder.suffix);
           }
         }
 
-        group.fields.add(field);
+        groupBuilder.addField(field);
 
         fieldInfo.putAttribute(PER_FIELD_FORMAT_KEY, formatName);
-        fieldInfo.putAttribute(PER_FIELD_SUFFIX_KEY, Integer.toString(group.suffix));
+        fieldInfo.putAttribute(PER_FIELD_SUFFIX_KEY, Integer.toString(groupBuilder.suffix));
       }
+
+      Map<PostingsFormat,FieldsGroup> formatToGroups = new HashMap<>((int) (formatToGroupBuilders.size() / 0.75f) + 1);
+      formatToGroupBuilders.forEach((postingsFormat, builder) -> formatToGroups.put(postingsFormat, builder.build()));
       return formatToGroups;
     }