You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sh...@apache.org on 2017/03/23 14:40:53 UTC

[10/36] lucene-solr:feature/autoscaling: SOLR-10273: DocumentBuilder move longest field to last position

SOLR-10273: DocumentBuilder move longest field to last position


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/8100cbfd
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/8100cbfd
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/8100cbfd

Branch: refs/heads/feature/autoscaling
Commit: 8100cbfdf17024103df6d2e4380aff078f59f62f
Parents: 1ffd760
Author: David Smiley <ds...@apache.org>
Authored: Thu Mar 16 21:22:08 2017 -0400
Committer: Shalin Shekhar Mangar <sh...@apache.org>
Committed: Thu Mar 23 20:10:10 2017 +0530

----------------------------------------------------------------------
 solr/CHANGES.txt                                |  4 ++
 .../org/apache/solr/update/DocumentBuilder.java | 60 +++++++++++++++++-
 .../apache/solr/update/DocumentBuilderTest.java | 67 ++++++++++++++++++++
 3 files changed, 130 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8100cbfd/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 9a5299c..dfe8d93 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -301,6 +301,10 @@ Optimizations
 * SOLR-10143: PointFields will create IndexOrDocValuesQuery when a field is both, indexed=true and docValues=true
   (Tom�s Fern�ndez L�bbe)
 
+* SOLR-10273: The field with the longest value (if it exceeds 4K) is moved to be last in the Lucene Document in order
+  to benefit from stored field optimizations in Lucene that can avoid reading it when it's not needed.  If the field is
+  multi-valued, they all move together to the end to retain order. (David Smiley)
+
 Other Changes
 ----------------------
 * SOLR-9980: Expose configVersion in core admin status (Jessica Cheng Mallet via Tom�s Fern�ndez L�bbe)

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8100cbfd/solr/core/src/java/org/apache/solr/update/DocumentBuilder.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/update/DocumentBuilder.java b/solr/core/src/java/org/apache/solr/update/DocumentBuilder.java
index e3d2011..b97af3b 100644
--- a/solr/core/src/java/org/apache/solr/update/DocumentBuilder.java
+++ b/solr/core/src/java/org/apache/solr/update/DocumentBuilder.java
@@ -16,12 +16,15 @@
  */
 package org.apache.solr.update;
 
+import java.util.Iterator;
+import java.util.LinkedList;
 import java.util.List;
 import java.util.Set;
 
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.NumericDocValuesField;
 import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.util.BytesRef;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.common.SolrInputField;
@@ -33,10 +36,13 @@ import org.apache.solr.schema.SchemaField;
 import com.google.common.collect.Sets;
 
 /**
- *
+ * Builds a Lucene {@link Document} from a {@link SolrInputDocument}.
  */
 public class DocumentBuilder {
 
+  // accessible only for tests
+  static int MIN_LENGTH_TO_MOVE_LAST = Integer.getInteger("solr.docBuilder.minLengthToMoveLast", 4*1024); // internal setting
+
   /**
    * Add a field value to a given document.
    * @param doc Document that the field needs to be added to
@@ -227,6 +233,58 @@ public class DocumentBuilder {
         }
       }
     }
+
+    if (!forInPlaceUpdate) {
+      moveLargestFieldLast(out);
+    }
+    
     return out;
   }
+
+  /** Move the largest stored field last, because Lucene can avoid loading that one if it's not needed. */
+  private static void moveLargestFieldLast(Document doc) {
+    String largestField = null;
+    int largestFieldLen = -1;
+    boolean largestIsLast = true;
+    for (IndexableField field : doc) {
+      if (!field.fieldType().stored()) {
+        continue;
+      }
+      if (largestIsLast && !field.name().equals(largestField)) {
+        largestIsLast = false;
+      }
+      if (field.numericValue() != null) { // just ignore these as non-competitive (avoid toString'ing their number)
+        continue;
+      }
+      String strVal = field.stringValue();
+      if (strVal != null) {
+        if (strVal.length() > largestFieldLen) {
+          largestField = field.name();
+          largestFieldLen = strVal.length();
+          largestIsLast = true;
+        }
+      } else {
+        BytesRef bytesRef = field.binaryValue();
+        if (bytesRef != null && bytesRef.length > largestFieldLen) {
+          largestField = field.name();
+          largestFieldLen = bytesRef.length;
+          largestIsLast = true;
+        }
+      }
+    }
+    if (!largestIsLast && largestField != null && largestFieldLen > MIN_LENGTH_TO_MOVE_LAST) { // only bother if the value isn't tiny
+      LinkedList<IndexableField> addToEnd = new LinkedList<>();
+      Iterator<IndexableField> iterator = doc.iterator();
+      while (iterator.hasNext()) {
+        IndexableField field = iterator.next();
+        if (field.name().equals(largestField)) {
+          addToEnd.add(field);
+          iterator.remove(); // Document may not have "remove" but it's iterator allows mutation
+        }
+      }
+      for (IndexableField field : addToEnd) {
+        doc.add(field);
+      }
+    }
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8100cbfd/solr/core/src/test/org/apache/solr/update/DocumentBuilderTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/update/DocumentBuilderTest.java b/solr/core/src/test/org/apache/solr/update/DocumentBuilderTest.java
index 2a78d6b..03dd17c 100644
--- a/solr/core/src/test/org/apache/solr/update/DocumentBuilderTest.java
+++ b/solr/core/src/test/org/apache/solr/update/DocumentBuilderTest.java
@@ -16,7 +16,14 @@
  */
 package org.apache.solr.update;
 
+import java.util.Iterator;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.StreamSupport;
+
+import com.carrotsearch.randomizedtesting.generators.RandomStrings;
 import org.apache.lucene.document.Document;
+import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.util.TestUtil;
 import org.apache.solr.SolrTestCaseJ4;
 import org.apache.solr.common.SolrDocument;
@@ -25,6 +32,8 @@ import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.common.SolrInputField;
 import org.apache.solr.core.SolrCore;
 import org.apache.solr.schema.FieldType;
+import org.junit.After;
+import org.junit.AfterClass;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
@@ -33,12 +42,23 @@ import org.junit.Test;
  *
  */
 public class DocumentBuilderTest extends SolrTestCaseJ4 {
+  static final int save_min_len = DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST;
 
   @BeforeClass
   public static void beforeClass() throws Exception {
     initCore("solrconfig.xml", "schema.xml");
   }
 
+  @AfterClass
+  public static void afterClass() {
+    DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST = save_min_len;
+  }
+
+  @After
+  public void afterTest() {
+    DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST = save_min_len;
+  }
+
   @Test
   public void testBuildDocument() throws Exception 
   {
@@ -222,7 +242,54 @@ public class DocumentBuilderTest extends SolrTestCaseJ4 {
     sif2.setName("foo");
     assertFalse(assertSolrInputFieldEquals(sif1, sif2));
 
+  }
 
+  public void testMoveLargestLast() {
+    SolrInputDocument inDoc = new SolrInputDocument();
+    String TEXT_FLD = "text"; // not stored.  It won't be moved.  This value is the longest, however.
+    inDoc.addField(TEXT_FLD,
+        "NOT STORED|" + RandomStrings.randomAsciiOfLength(random(), 4 * DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST));
+
+    String CAT_FLD = "cat"; // stored, multiValued
+    inDoc.addField(CAT_FLD,
+        "STORED V1|");
+    //  pretty long value
+    inDoc.addField(CAT_FLD,
+        "STORED V2|" + RandomStrings.randomAsciiOfLength(random(), 2 * DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST));
+    inDoc.addField(CAT_FLD,
+        "STORED V3|" + RandomStrings.randomAsciiOfLength(random(), DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST));
+
+    String SUBJECT_FLD = "subject"; // stored.  This value is long, but not long enough.
+    inDoc.addField(SUBJECT_FLD,
+        "2ndplace|" + RandomStrings.randomAsciiOfLength(random(), DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST));
+
+    Document outDoc = DocumentBuilder.toDocument(inDoc, h.getCore().getLatestSchema());
+
+    // filter outDoc by stored fields; convert to list.
+    List<IndexableField> storedFields = StreamSupport.stream(outDoc.spliterator(), false)
+        .filter(f -> f.fieldType().stored()).collect(Collectors.toList());
+    // clip to last 3.  We expect these to be for CAT_FLD
+    storedFields = storedFields.subList(storedFields.size() - 3, storedFields.size());
+
+    Iterator<IndexableField> fieldIterator = storedFields.iterator();
+    IndexableField field;
+
+    // Test that we retained the particular value ordering, even though though the 2nd of three was longest
+
+    assertTrue(fieldIterator.hasNext());
+    field = fieldIterator.next();
+    assertEquals(CAT_FLD, field.name());
+    assertTrue(field.stringValue().startsWith("STORED V1|"));
+
+    assertTrue(fieldIterator.hasNext());
+    field = fieldIterator.next();
+    assertEquals(CAT_FLD, field.name());
+    assertTrue(field.stringValue().startsWith("STORED V2|"));
+
+    assertTrue(fieldIterator.hasNext());
+    field = fieldIterator.next();
+    assertEquals(CAT_FLD, field.name());
+    assertTrue(field.stringValue().startsWith("STORED V3|"));
   }
 
 }