You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ab...@apache.org on 2017/03/23 11:50:07 UTC
[20/46] lucene-solr:jira/solr-9959: SOLR-10273: DocumentBuilder move
longest field to last position
SOLR-10273: DocumentBuilder move longest field to last position
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/8fbd9f1e
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/8fbd9f1e
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/8fbd9f1e
Branch: refs/heads/jira/solr-9959
Commit: 8fbd9f1e403cc697f77d827cd1aa85876c8665ae
Parents: 4a55bc4
Author: David Smiley <ds...@apache.org>
Authored: Thu Mar 16 21:22:08 2017 -0400
Committer: David Smiley <ds...@apache.org>
Committed: Thu Mar 16 21:22:08 2017 -0400
----------------------------------------------------------------------
solr/CHANGES.txt | 4 ++
.../org/apache/solr/update/DocumentBuilder.java | 60 +++++++++++++++++-
.../apache/solr/update/DocumentBuilderTest.java | 67 ++++++++++++++++++++
3 files changed, 130 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fbd9f1e/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 9a5299c..dfe8d93 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -301,6 +301,10 @@ Optimizations
* SOLR-10143: PointFields will create IndexOrDocValuesQuery when a field is both, indexed=true and docValues=true
(Tom�s Fern�ndez L�bbe)
+* SOLR-10273: The field with the longest value (if it exceeds 4K) is moved to be last in the Lucene Document in order
+ to benefit from stored field optimizations in Lucene that can avoid reading it when it's not needed. If the field is
+ multi-valued, they all move together to the end to retain order. (David Smiley)
+
Other Changes
----------------------
* SOLR-9980: Expose configVersion in core admin status (Jessica Cheng Mallet via Tom�s Fern�ndez L�bbe)
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fbd9f1e/solr/core/src/java/org/apache/solr/update/DocumentBuilder.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/update/DocumentBuilder.java b/solr/core/src/java/org/apache/solr/update/DocumentBuilder.java
index e3d2011..b97af3b 100644
--- a/solr/core/src/java/org/apache/solr/update/DocumentBuilder.java
+++ b/solr/core/src/java/org/apache/solr/update/DocumentBuilder.java
@@ -16,12 +16,15 @@
*/
package org.apache.solr.update;
+import java.util.Iterator;
+import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
@@ -33,10 +36,13 @@ import org.apache.solr.schema.SchemaField;
import com.google.common.collect.Sets;
/**
- *
+ * Builds a Lucene {@link Document} from a {@link SolrInputDocument}.
*/
public class DocumentBuilder {
+ // accessible only for tests
+ static int MIN_LENGTH_TO_MOVE_LAST = Integer.getInteger("solr.docBuilder.minLengthToMoveLast", 4*1024); // internal setting
+
/**
* Add a field value to a given document.
* @param doc Document that the field needs to be added to
@@ -227,6 +233,58 @@ public class DocumentBuilder {
}
}
}
+
+ if (!forInPlaceUpdate) {
+ moveLargestFieldLast(out);
+ }
+
return out;
}
+
+ /** Move the largest stored field last, because Lucene can avoid loading that one if it's not needed. */
+ private static void moveLargestFieldLast(Document doc) {
+ String largestField = null;
+ int largestFieldLen = -1;
+ boolean largestIsLast = true;
+ for (IndexableField field : doc) {
+ if (!field.fieldType().stored()) {
+ continue;
+ }
+ if (largestIsLast && !field.name().equals(largestField)) {
+ largestIsLast = false;
+ }
+ if (field.numericValue() != null) { // just ignore these as non-competitive (avoid toString'ing their number)
+ continue;
+ }
+ String strVal = field.stringValue();
+ if (strVal != null) {
+ if (strVal.length() > largestFieldLen) {
+ largestField = field.name();
+ largestFieldLen = strVal.length();
+ largestIsLast = true;
+ }
+ } else {
+ BytesRef bytesRef = field.binaryValue();
+ if (bytesRef != null && bytesRef.length > largestFieldLen) {
+ largestField = field.name();
+ largestFieldLen = bytesRef.length;
+ largestIsLast = true;
+ }
+ }
+ }
+ if (!largestIsLast && largestField != null && largestFieldLen > MIN_LENGTH_TO_MOVE_LAST) { // only bother if the value isn't tiny
+ LinkedList<IndexableField> addToEnd = new LinkedList<>();
+ Iterator<IndexableField> iterator = doc.iterator();
+ while (iterator.hasNext()) {
+ IndexableField field = iterator.next();
+ if (field.name().equals(largestField)) {
+ addToEnd.add(field);
+ iterator.remove(); // Document may not have "remove" but it's iterator allows mutation
+ }
+ }
+ for (IndexableField field : addToEnd) {
+ doc.add(field);
+ }
+ }
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fbd9f1e/solr/core/src/test/org/apache/solr/update/DocumentBuilderTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/update/DocumentBuilderTest.java b/solr/core/src/test/org/apache/solr/update/DocumentBuilderTest.java
index 2a78d6b..03dd17c 100644
--- a/solr/core/src/test/org/apache/solr/update/DocumentBuilderTest.java
+++ b/solr/core/src/test/org/apache/solr/update/DocumentBuilderTest.java
@@ -16,7 +16,14 @@
*/
package org.apache.solr.update;
+import java.util.Iterator;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.StreamSupport;
+
+import com.carrotsearch.randomizedtesting.generators.RandomStrings;
import org.apache.lucene.document.Document;
+import org.apache.lucene.index.IndexableField;
import org.apache.lucene.util.TestUtil;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.SolrDocument;
@@ -25,6 +32,8 @@ import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.FieldType;
+import org.junit.After;
+import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
@@ -33,12 +42,23 @@ import org.junit.Test;
*
*/
public class DocumentBuilderTest extends SolrTestCaseJ4 {
+ static final int save_min_len = DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST;
@BeforeClass
public static void beforeClass() throws Exception {
initCore("solrconfig.xml", "schema.xml");
}
+ @AfterClass
+ public static void afterClass() {
+ DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST = save_min_len;
+ }
+
+ @After
+ public void afterTest() {
+ DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST = save_min_len;
+ }
+
@Test
public void testBuildDocument() throws Exception
{
@@ -222,7 +242,54 @@ public class DocumentBuilderTest extends SolrTestCaseJ4 {
sif2.setName("foo");
assertFalse(assertSolrInputFieldEquals(sif1, sif2));
+ }
+ public void testMoveLargestLast() {
+ SolrInputDocument inDoc = new SolrInputDocument();
+ String TEXT_FLD = "text"; // not stored. It won't be moved. This value is the longest, however.
+ inDoc.addField(TEXT_FLD,
+ "NOT STORED|" + RandomStrings.randomAsciiOfLength(random(), 4 * DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST));
+
+ String CAT_FLD = "cat"; // stored, multiValued
+ inDoc.addField(CAT_FLD,
+ "STORED V1|");
+ // pretty long value
+ inDoc.addField(CAT_FLD,
+ "STORED V2|" + RandomStrings.randomAsciiOfLength(random(), 2 * DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST));
+ inDoc.addField(CAT_FLD,
+ "STORED V3|" + RandomStrings.randomAsciiOfLength(random(), DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST));
+
+ String SUBJECT_FLD = "subject"; // stored. This value is long, but not long enough.
+ inDoc.addField(SUBJECT_FLD,
+ "2ndplace|" + RandomStrings.randomAsciiOfLength(random(), DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST));
+
+ Document outDoc = DocumentBuilder.toDocument(inDoc, h.getCore().getLatestSchema());
+
+ // filter outDoc by stored fields; convert to list.
+ List<IndexableField> storedFields = StreamSupport.stream(outDoc.spliterator(), false)
+ .filter(f -> f.fieldType().stored()).collect(Collectors.toList());
+ // clip to last 3. We expect these to be for CAT_FLD
+ storedFields = storedFields.subList(storedFields.size() - 3, storedFields.size());
+
+ Iterator<IndexableField> fieldIterator = storedFields.iterator();
+ IndexableField field;
+
+ // Test that we retained the particular value ordering, even though though the 2nd of three was longest
+
+ assertTrue(fieldIterator.hasNext());
+ field = fieldIterator.next();
+ assertEquals(CAT_FLD, field.name());
+ assertTrue(field.stringValue().startsWith("STORED V1|"));
+
+ assertTrue(fieldIterator.hasNext());
+ field = fieldIterator.next();
+ assertEquals(CAT_FLD, field.name());
+ assertTrue(field.stringValue().startsWith("STORED V2|"));
+
+ assertTrue(fieldIterator.hasNext());
+ field = fieldIterator.next();
+ assertEquals(CAT_FLD, field.name());
+ assertTrue(field.stringValue().startsWith("STORED V3|"));
}
}