You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2018/02/13 11:34:27 UTC

[1/2] lucene-solr:master: LUCENE-8033: FieldInfos always use dense encoding

Repository: lucene-solr
Updated Branches:
  refs/heads/branch_7x 83befcbce -> c87b58cf7
  refs/heads/master 899966b48 -> 7d07fbee5


LUCENE-8033: FieldInfos always use dense encoding

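FieldInfos now always uses a plain array, indexed by field number (byNumber), to store its FieldInfo entries; the TreeMap previously used for sparse field numbers is removed.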

Signed-off-by: Adrien Grand <jp...@gmail.com>

Closes #320


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/7d07fbee
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/7d07fbee
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/7d07fbee

Branch: refs/heads/master
Commit: 7d07fbee5f991d97db2dd08bbc16348f6a83b465
Parents: 899966b
Author: Mayya Sharipova <ma...@elastic.co>
Authored: Fri Feb 2 17:10:24 2018 -0800
Committer: Adrien Grand <jp...@gmail.com>
Committed: Tue Feb 13 11:36:37 2018 +0100

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |  3 +
 .../org/apache/lucene/index/FieldInfos.java     | 57 ++++++------
 .../org/apache/lucene/index/TestFieldInfos.java | 92 ++++++++++++++++++++
 3 files changed, 120 insertions(+), 32 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7d07fbee/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 3749158..7e0033a 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -147,6 +147,9 @@ Improvements
 * LUCENE-8152: Improve consumption of doc-value iterators. (Horatiu Lazu via
   Adrien Grand)
 
+* LUCENE-8033: FieldInfos now always use a dense encoding. (Mayya Sharipova
+  via Adrien Grand)
+
 Bug Fixes
 
 * LUCENE-8077: Fixed bug in how CheckIndex verifies doc-value iterators.

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7d07fbee/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java
index 3e68693..9666fd9 100644
--- a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java
+++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java
@@ -25,8 +25,8 @@ import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.Set;
-import java.util.SortedMap;
-import java.util.TreeMap;
+import java.util.Arrays;
+import java.util.List;
 
 import org.apache.lucene.util.ArrayUtil;
 
@@ -45,8 +45,7 @@ public class FieldInfos implements Iterable<FieldInfo> {
   private final boolean hasPointValues;
   
   // used only by fieldInfo(int)
-  private final FieldInfo[] byNumberTable; // contiguous
-  private final SortedMap<Integer,FieldInfo> byNumberMap; // sparse
+  private final FieldInfo[] byNumber;
   
   private final HashMap<String,FieldInfo> byName = new HashMap<>();
   private final Collection<FieldInfo> values; // for an unmodifiable iterator
@@ -63,21 +62,28 @@ public class FieldInfos implements Iterable<FieldInfo> {
     boolean hasNorms = false;
     boolean hasDocValues = false;
     boolean hasPointValues = false;
-    
-    TreeMap<Integer, FieldInfo> byNumber = new TreeMap<>();
+
+    int size = 0; // number of elements in byNumberTemp, number of used array slots
+    FieldInfo[] byNumberTemp = new FieldInfo[10]; // initial array capacity of 10
     for (FieldInfo info : infos) {
       if (info.number < 0) {
         throw new IllegalArgumentException("illegal field number: " + info.number + " for field " + info.name);
       }
-      FieldInfo previous = byNumber.put(info.number, info);
+      size = info.number >= size ? info.number+1 : size;
+      if (info.number >= byNumberTemp.length){ //grow array
+        byNumberTemp = ArrayUtil.grow(byNumberTemp, info.number + 1);
+      }
+      FieldInfo previous = byNumberTemp[info.number];
       if (previous != null) {
         throw new IllegalArgumentException("duplicate field numbers: " + previous.name + " and " + info.name + " have: " + info.number);
       }
+      byNumberTemp[info.number] = info;
+
       previous = byName.put(info.name, info);
       if (previous != null) {
         throw new IllegalArgumentException("duplicate field names: " + previous.number + " and " + info.number + " have: " + info.name);
       }
-      
+
       hasVectors |= info.hasVectors();
       hasProx |= info.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
       hasFreq |= info.getIndexOptions() != IndexOptions.DOCS;
@@ -96,25 +102,16 @@ public class FieldInfos implements Iterable<FieldInfo> {
     this.hasNorms = hasNorms;
     this.hasDocValues = hasDocValues;
     this.hasPointValues = hasPointValues;
-    Integer max = byNumber.isEmpty() ? null : byNumber.lastKey();
-    
-    // Only usee TreeMap in the very sparse case (< 1/16th of the numbers are used),
-    // because TreeMap uses ~ 64 (32 bit JVM) or 120 (64 bit JVM w/o compressed oops)
-    // overall bytes per entry, but array uses 4 (32 bit JMV) or 8
-    // (64 bit JVM w/o compressed oops):
-    if (max != null && max < ArrayUtil.MAX_ARRAY_LENGTH && max < 16L*byNumber.size()) {
-      // Pull infos into an arraylist to avoid holding a reference to the TreeMap
-      values = Collections.unmodifiableCollection(new ArrayList<>(byNumber.values()));
-      byNumberMap = null;
-      byNumberTable = new FieldInfo[max+1];
-      for (Map.Entry<Integer,FieldInfo> entry : byNumber.entrySet()) {
-        byNumberTable[entry.getKey()] = entry.getValue();
+
+    List<FieldInfo> valuesTemp = new ArrayList<>();
+    byNumber = new FieldInfo[size];
+    for(int i=0; i<size; i++){
+      byNumber[i] = byNumberTemp[i];
+      if (byNumberTemp[i] != null) {
+        valuesTemp.add(byNumberTemp[i]);
       }
-    } else {
-      byNumberMap = byNumber;
-      values = Collections.unmodifiableCollection(byNumber.values());
-      byNumberTable = null;
     }
+    values = Collections.unmodifiableCollection(Arrays.asList(valuesTemp.toArray(new FieldInfo[0])));
   }
   
   /** Returns true if any fields have freqs */
@@ -192,14 +189,10 @@ public class FieldInfos implements Iterable<FieldInfo> {
     if (fieldNumber < 0) {
       throw new IllegalArgumentException("Illegal field number: " + fieldNumber);
     }
-    if (byNumberTable != null) {
-      if (fieldNumber >= byNumberTable.length) {
-        return null;
-      }
-      return byNumberTable[fieldNumber];
-    } else {
-      return byNumberMap.get(fieldNumber);
+    if (fieldNumber >= byNumber.length) {
+      return null;
     }
+    return byNumber[fieldNumber];
   }
 
   static final class FieldDimensions {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7d07fbee/lucene/core/src/test/org/apache/lucene/index/TestFieldInfos.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestFieldInfos.java b/lucene/core/src/test/org/apache/lucene/index/TestFieldInfos.java
new file mode 100644
index 0000000..308e11e
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/index/TestFieldInfos.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.index;
+
+
+import java.util.Iterator;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestFieldInfos extends LuceneTestCase {
+
+  public void testFieldInfos() throws Exception{
+    Directory dir = newDirectory();
+    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random()))
+        .setMergePolicy(NoMergePolicy.INSTANCE));
+
+    Document d1 = new Document();
+    for (int i = 0; i < 15; i++) {
+      d1.add(new StringField("f" + i, "v" + i, Field.Store.YES));
+    }
+    writer.addDocument(d1);
+    writer.commit();
+
+    Document d2 = new Document();
+    d2.add(new StringField("f0", "v0", Field.Store.YES));
+    d2.add(new StringField("f15", "v15", Field.Store.YES));
+    d2.add(new StringField("f16", "v16", Field.Store.YES));
+    writer.addDocument(d2);
+    writer.commit();
+
+    Document d3 = new Document();
+    writer.addDocument(d3);
+    writer.close();
+
+    SegmentInfos sis = SegmentInfos.readLatestCommit(dir);
+    assertEquals(3, sis.size());
+
+    FieldInfos fis1 = IndexWriter.readFieldInfos(sis.info(0));
+    FieldInfos fis2 = IndexWriter.readFieldInfos(sis.info(1));
+    FieldInfos fis3 = IndexWriter.readFieldInfos(sis.info(2));
+
+    // testing dense FieldInfos
+    Iterator<FieldInfo>  it = fis1.iterator();
+    int i = 0;
+    while(it.hasNext()) {
+      FieldInfo fi = it.next();
+      assertEquals(i, fi.number);
+      assertEquals("f" + i , fi.name);
+      assertEquals("f" + i, fis1.fieldInfo(i).name); //lookup by number
+      assertEquals("f" + i, fis1.fieldInfo("f" + i).name); //lookup by name
+      i++;
+    }
+
+    // testing sparse FieldInfos
+    assertEquals("f0", fis2.fieldInfo(0).name); //lookup by number
+    assertEquals("f0", fis2.fieldInfo("f0").name); //lookup by name
+    assertNull(fis2.fieldInfo(1));
+    assertNull(fis2.fieldInfo("f1"));
+    assertEquals("f15", fis2.fieldInfo(15).name);
+    assertEquals("f15", fis2.fieldInfo("f15").name);
+    assertEquals("f16", fis2.fieldInfo(16).name);
+    assertEquals("f16", fis2.fieldInfo("f16").name);
+
+    // testing empty FieldInfos
+    assertNull(fis3.fieldInfo(0)); //lookup by number
+    assertNull(fis3.fieldInfo("f0")); //lookup by name
+    assertEquals(0, fis3.size());
+    Iterator<FieldInfo> it3 = fis3.iterator();
+    assertFalse(it3.hasNext());
+    dir.close();
+  }
+
+}


[2/2] lucene-solr:branch_7x: LUCENE-8033: FieldInfos always use dense encoding

Posted by jp...@apache.org.
LUCENE-8033: FieldInfos always use dense encoding

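FieldInfos now always uses a plain array, indexed by field number (byNumber), to store its FieldInfo entries; the TreeMap previously used for sparse field numbers is removed.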

Signed-off-by: Adrien Grand <jp...@gmail.com>

Closes #320


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/c87b58cf
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/c87b58cf
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/c87b58cf

Branch: refs/heads/branch_7x
Commit: c87b58cf78062989d406592e6b816d86179c4171
Parents: 83befcb
Author: Mayya Sharipova <ma...@elastic.co>
Authored: Fri Feb 2 17:10:24 2018 -0800
Committer: Adrien Grand <jp...@gmail.com>
Committed: Tue Feb 13 11:55:51 2018 +0100

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |  3 +
 .../org/apache/lucene/index/FieldInfos.java     | 57 ++++++------
 .../org/apache/lucene/index/TestFieldInfos.java | 92 ++++++++++++++++++++
 3 files changed, 120 insertions(+), 32 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c87b58cf/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 71a3c06..550805b 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -66,6 +66,9 @@ Improvements
 * LUCENE-8152: Improve consumption of doc-value iterators. (Horatiu Lazu via
   Adrien Grand)
 
+* LUCENE-8033: FieldInfos now always use a dense encoding. (Mayya Sharipova
+  via Adrien Grand)
+
 Bug Fixes
 
 * LUCENE-8077: Fixed bug in how CheckIndex verifies doc-value iterators.

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c87b58cf/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java
index 3e68693..9666fd9 100644
--- a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java
+++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java
@@ -25,8 +25,8 @@ import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.Set;
-import java.util.SortedMap;
-import java.util.TreeMap;
+import java.util.Arrays;
+import java.util.List;
 
 import org.apache.lucene.util.ArrayUtil;
 
@@ -45,8 +45,7 @@ public class FieldInfos implements Iterable<FieldInfo> {
   private final boolean hasPointValues;
   
   // used only by fieldInfo(int)
-  private final FieldInfo[] byNumberTable; // contiguous
-  private final SortedMap<Integer,FieldInfo> byNumberMap; // sparse
+  private final FieldInfo[] byNumber;
   
   private final HashMap<String,FieldInfo> byName = new HashMap<>();
   private final Collection<FieldInfo> values; // for an unmodifiable iterator
@@ -63,21 +62,28 @@ public class FieldInfos implements Iterable<FieldInfo> {
     boolean hasNorms = false;
     boolean hasDocValues = false;
     boolean hasPointValues = false;
-    
-    TreeMap<Integer, FieldInfo> byNumber = new TreeMap<>();
+
+    int size = 0; // number of elements in byNumberTemp, number of used array slots
+    FieldInfo[] byNumberTemp = new FieldInfo[10]; // initial array capacity of 10
     for (FieldInfo info : infos) {
       if (info.number < 0) {
         throw new IllegalArgumentException("illegal field number: " + info.number + " for field " + info.name);
       }
-      FieldInfo previous = byNumber.put(info.number, info);
+      size = info.number >= size ? info.number+1 : size;
+      if (info.number >= byNumberTemp.length){ //grow array
+        byNumberTemp = ArrayUtil.grow(byNumberTemp, info.number + 1);
+      }
+      FieldInfo previous = byNumberTemp[info.number];
       if (previous != null) {
         throw new IllegalArgumentException("duplicate field numbers: " + previous.name + " and " + info.name + " have: " + info.number);
       }
+      byNumberTemp[info.number] = info;
+
       previous = byName.put(info.name, info);
       if (previous != null) {
         throw new IllegalArgumentException("duplicate field names: " + previous.number + " and " + info.number + " have: " + info.name);
       }
-      
+
       hasVectors |= info.hasVectors();
       hasProx |= info.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
       hasFreq |= info.getIndexOptions() != IndexOptions.DOCS;
@@ -96,25 +102,16 @@ public class FieldInfos implements Iterable<FieldInfo> {
     this.hasNorms = hasNorms;
     this.hasDocValues = hasDocValues;
     this.hasPointValues = hasPointValues;
-    Integer max = byNumber.isEmpty() ? null : byNumber.lastKey();
-    
-    // Only usee TreeMap in the very sparse case (< 1/16th of the numbers are used),
-    // because TreeMap uses ~ 64 (32 bit JVM) or 120 (64 bit JVM w/o compressed oops)
-    // overall bytes per entry, but array uses 4 (32 bit JMV) or 8
-    // (64 bit JVM w/o compressed oops):
-    if (max != null && max < ArrayUtil.MAX_ARRAY_LENGTH && max < 16L*byNumber.size()) {
-      // Pull infos into an arraylist to avoid holding a reference to the TreeMap
-      values = Collections.unmodifiableCollection(new ArrayList<>(byNumber.values()));
-      byNumberMap = null;
-      byNumberTable = new FieldInfo[max+1];
-      for (Map.Entry<Integer,FieldInfo> entry : byNumber.entrySet()) {
-        byNumberTable[entry.getKey()] = entry.getValue();
+
+    List<FieldInfo> valuesTemp = new ArrayList<>();
+    byNumber = new FieldInfo[size];
+    for(int i=0; i<size; i++){
+      byNumber[i] = byNumberTemp[i];
+      if (byNumberTemp[i] != null) {
+        valuesTemp.add(byNumberTemp[i]);
       }
-    } else {
-      byNumberMap = byNumber;
-      values = Collections.unmodifiableCollection(byNumber.values());
-      byNumberTable = null;
     }
+    values = Collections.unmodifiableCollection(Arrays.asList(valuesTemp.toArray(new FieldInfo[0])));
   }
   
   /** Returns true if any fields have freqs */
@@ -192,14 +189,10 @@ public class FieldInfos implements Iterable<FieldInfo> {
     if (fieldNumber < 0) {
       throw new IllegalArgumentException("Illegal field number: " + fieldNumber);
     }
-    if (byNumberTable != null) {
-      if (fieldNumber >= byNumberTable.length) {
-        return null;
-      }
-      return byNumberTable[fieldNumber];
-    } else {
-      return byNumberMap.get(fieldNumber);
+    if (fieldNumber >= byNumber.length) {
+      return null;
     }
+    return byNumber[fieldNumber];
   }
 
   static final class FieldDimensions {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c87b58cf/lucene/core/src/test/org/apache/lucene/index/TestFieldInfos.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestFieldInfos.java b/lucene/core/src/test/org/apache/lucene/index/TestFieldInfos.java
new file mode 100644
index 0000000..308e11e
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/index/TestFieldInfos.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.index;
+
+
+import java.util.Iterator;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestFieldInfos extends LuceneTestCase {
+
+  public void testFieldInfos() throws Exception{
+    Directory dir = newDirectory();
+    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random()))
+        .setMergePolicy(NoMergePolicy.INSTANCE));
+
+    Document d1 = new Document();
+    for (int i = 0; i < 15; i++) {
+      d1.add(new StringField("f" + i, "v" + i, Field.Store.YES));
+    }
+    writer.addDocument(d1);
+    writer.commit();
+
+    Document d2 = new Document();
+    d2.add(new StringField("f0", "v0", Field.Store.YES));
+    d2.add(new StringField("f15", "v15", Field.Store.YES));
+    d2.add(new StringField("f16", "v16", Field.Store.YES));
+    writer.addDocument(d2);
+    writer.commit();
+
+    Document d3 = new Document();
+    writer.addDocument(d3);
+    writer.close();
+
+    SegmentInfos sis = SegmentInfos.readLatestCommit(dir);
+    assertEquals(3, sis.size());
+
+    FieldInfos fis1 = IndexWriter.readFieldInfos(sis.info(0));
+    FieldInfos fis2 = IndexWriter.readFieldInfos(sis.info(1));
+    FieldInfos fis3 = IndexWriter.readFieldInfos(sis.info(2));
+
+    // testing dense FieldInfos
+    Iterator<FieldInfo>  it = fis1.iterator();
+    int i = 0;
+    while(it.hasNext()) {
+      FieldInfo fi = it.next();
+      assertEquals(i, fi.number);
+      assertEquals("f" + i , fi.name);
+      assertEquals("f" + i, fis1.fieldInfo(i).name); //lookup by number
+      assertEquals("f" + i, fis1.fieldInfo("f" + i).name); //lookup by name
+      i++;
+    }
+
+    // testing sparse FieldInfos
+    assertEquals("f0", fis2.fieldInfo(0).name); //lookup by number
+    assertEquals("f0", fis2.fieldInfo("f0").name); //lookup by name
+    assertNull(fis2.fieldInfo(1));
+    assertNull(fis2.fieldInfo("f1"));
+    assertEquals("f15", fis2.fieldInfo(15).name);
+    assertEquals("f15", fis2.fieldInfo("f15").name);
+    assertEquals("f16", fis2.fieldInfo(16).name);
+    assertEquals("f16", fis2.fieldInfo("f16").name);
+
+    // testing empty FieldInfos
+    assertNull(fis3.fieldInfo(0)); //lookup by number
+    assertNull(fis3.fieldInfo("f0")); //lookup by name
+    assertEquals(0, fis3.size());
+    Iterator<FieldInfo> it3 = fis3.iterator();
+    assertFalse(it3.hasNext());
+    dir.close();
+  }
+
+}