You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2014/07/21 13:15:14 UTC

svn commit: r1612249 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/core/ lucene/core/src/java/org/apache/lucene/search/ lucene/core/src/test/org/apache/lucene/search/ lucene/misc/

Author: jpountz
Date: Mon Jul 21 11:15:14 2014
New Revision: 1612249

URL: http://svn.apache.org/r1612249
Log:
LUCENE-5835: TermValComparator can sort missing values last.

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_4x/lucene/core/   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/search/FieldComparator.java
    lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/search/SortField.java
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestSearchAfter.java
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestSortRandom.java   (contents, props changed)
    lucene/dev/branches/branch_4x/lucene/misc/   (props changed)

Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1612249&r1=1612248&r2=1612249&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Mon Jul 21 11:15:14 2014
@@ -36,6 +36,8 @@ New Features
   format, to include term ordinals in the index so the optional
   TermsEnum.ord() and TermsEnum.seekExact(long ord) APIs work.  (Mike
   McCandless)
+
+* LUCENE-5835: TermValComparator can sort missing values last. (Adrien Grand)
   
 API Changes
 

Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/search/FieldComparator.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/search/FieldComparator.java?rev=1612249&r1=1612248&r2=1612249&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/search/FieldComparator.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/search/FieldComparator.java Mon Jul 21 11:15:14 2014
@@ -906,6 +906,7 @@ public abstract class FieldComparator<T>
     /* Values for each slot.
        @lucene.internal */
     final BytesRef[] values;
+    private final BytesRef[] tempBRs;
 
     /* Which reader last copied a value into the slot. When
        we compare two slots, we just compare-by-ord if the
@@ -948,8 +949,6 @@ public abstract class FieldComparator<T>
     boolean topSameReader;
     int topOrd;
 
-    final BytesRef tempBR = new BytesRef();
-
     /** -1 if missing values are sorted first, 1 if they are
      *  sorted last */
     final int missingSortCmp;
@@ -968,6 +967,7 @@ public abstract class FieldComparator<T>
     public TermOrdValComparator(int numHits, String field, boolean sortMissingLast) {
       ords = new int[numHits];
       values = new BytesRef[numHits];
+      tempBRs = new BytesRef[numHits];
       readerGen = new int[numHits];
       this.field = field;
       if (sortMissingLast) {
@@ -1026,9 +1026,10 @@ public abstract class FieldComparator<T>
         values[slot] = null;
       } else {
         assert ord >= 0;
-        if (values[slot] == null) {
-          values[slot] = new BytesRef();
+        if (tempBRs[slot] == null) {
+          tempBRs[slot] = new BytesRef();
         }
+        values[slot] = tempBRs[slot];
         values[slot].copyBytes(termsIndex.lookupOrd(ord));
       }
       ords[slot] = ord;
@@ -1153,15 +1154,7 @@ public abstract class FieldComparator<T>
    *  comparisons are done using BytesRef.compareTo, which is
    *  slow for medium to large result sets but possibly
    *  very fast for very small results sets. */
-  // TODO: should we remove this?  who really uses it?
   public static final class TermValComparator extends FieldComparator<BytesRef> {
-
-    // sentinels, just used internally in this comparator
-    private static final byte[] MISSING_BYTES = new byte[0];
-    // TODO: this is seriously not good, we should nuke this comparator, or
-    // instead we should represent missing as null, or use missingValue from the user...
-    // but it was always this way...
-    private final BytesRef MISSING_BYTESREF = new BytesRef(MISSING_BYTES);
     
     private final BytesRef[] values;
     private final BytesRef[] tempBRs;
@@ -1170,14 +1163,16 @@ public abstract class FieldComparator<T>
     private final String field;
     private BytesRef bottom;
     private BytesRef topValue;
+    private final int missingSortCmp;
 
     // TODO: add missing first/last support here?
 
     /** Sole constructor. */
-    TermValComparator(int numHits, String field) {
+    public TermValComparator(int numHits, String field, boolean sortMissingLast) {
       values = new BytesRef[numHits];
       tempBRs = new BytesRef[numHits];
       this.field = field;
+      missingSortCmp = sortMissingLast ? 1 : -1;
     }
 
     @Override
@@ -1196,8 +1191,8 @@ public abstract class FieldComparator<T>
     @Override
     public void copy(int slot, int doc) {
       final BytesRef comparableBytes = getComparableBytes(doc, docTerms.get(doc));
-      if (comparableBytes == MISSING_BYTESREF) {
-        values[slot] = MISSING_BYTESREF;
+      if (comparableBytes == null) {
+        values[slot] = null;
       } else {
         if (tempBRs[slot] == null) {
           tempBRs[slot] = new BytesRef();
@@ -1207,10 +1202,32 @@ public abstract class FieldComparator<T>
       }
     }
 
+    /** Retrieves the BinaryDocValues for the field in this segment */
+    protected BinaryDocValues getBinaryDocValues(AtomicReaderContext context, String field) throws IOException {
+      return FieldCache.DEFAULT.getTerms(context.reader(), field, true);
+    }
+
+    /** Retrieves the set of documents that have a value in this segment */
+    protected Bits getDocsWithField(AtomicReaderContext context, String field) throws IOException {
+      return FieldCache.DEFAULT.getDocsWithField(context.reader(), field);
+    }
+
+    /** Check whether the given value represents <tt>null</tt>. This can be
+     *  useful if the {@link BinaryDocValues} returned by {@link #getBinaryDocValues}
+     *  use a special value as a sentinel. The default implementation checks
+     *  {@link #getDocsWithField}.
+     *  <p>NOTE: The null value can only be an EMPTY {@link BytesRef}. */
+    protected boolean isNull(int doc, BytesRef term) {
+      return docsWithField != null && docsWithField.get(doc) == false;
+    }
+
     @Override
     public FieldComparator<BytesRef> setNextReader(AtomicReaderContext context) throws IOException {
-      docTerms = FieldCache.DEFAULT.getTerms(context.reader(), field, true);
-      docsWithField = FieldCache.DEFAULT.getDocsWithField(context.reader(), field);
+      docTerms = getBinaryDocValues(context, field);
+      docsWithField = getDocsWithField(context, field);
+      if (docsWithField instanceof Bits.MatchAllBits) {
+        docsWithField = null;
+      }
       return this;
     }
     
@@ -1221,12 +1238,8 @@ public abstract class FieldComparator<T>
 
     @Override
     public void setTopValue(BytesRef value) {
-      if (value == null) {
-        throw new IllegalArgumentException("value cannot be null");
-      }
-      if (value.bytes == MISSING_BYTES) {
-        value = MISSING_BYTESREF;
-      }
+      // null is fine: it means the last doc of the prior
+      // search was missing this value
       topValue = value;
     }
 
@@ -1238,13 +1251,13 @@ public abstract class FieldComparator<T>
     @Override
     public int compareValues(BytesRef val1, BytesRef val2) {
       // missing always sorts first:
-      if (val1 == MISSING_BYTESREF) {
-        if (val2 == MISSING_BYTESREF) {
+      if (val1 == null) {
+        if (val2 == null) {
           return 0;
         }
-        return -1;
-      } else if (val2 == MISSING_BYTESREF) {
-        return 1;
+        return missingSortCmp;
+      } else if (val2 == null) {
+        return -missingSortCmp;
       }
       return val1.compareTo(val2);
     }
@@ -1257,11 +1270,11 @@ public abstract class FieldComparator<T>
 
     /**
      * Given a document and a term, return the term itself if it exists or
-     * {@link #MISSING_BYTESREF} otherwise.
+     * <tt>null</tt> otherwise.
      */
     private BytesRef getComparableBytes(int doc, BytesRef term) {
-      if (term.length == 0 && docsWithField.get(doc) == false) {
-        return MISSING_BYTESREF;
+      if (term.length == 0 && isNull(doc, term)) {
+        return null;
       }
       return term;
     }

Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/search/SortField.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/search/SortField.java?rev=1612249&r1=1612248&r2=1612249&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/search/SortField.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/search/SortField.java Mon Jul 21 11:15:14 2014
@@ -193,7 +193,7 @@ public class SortField {
     };
 
   public void setMissingValue(Object missingValue) {
-    if (type == Type.STRING) {
+    if (type == Type.STRING || type == Type.STRING_VAL) {
       if (missingValue != STRING_FIRST && missingValue != STRING_LAST) {
         throw new IllegalArgumentException("For STRING type, missing value must be either STRING_FIRST or STRING_LAST");
       }
@@ -426,8 +426,7 @@ public class SortField {
       return new FieldComparator.TermOrdValComparator(numHits, field, missingValue == STRING_LAST);
 
     case STRING_VAL:
-      // TODO: should we remove this?  who really uses it?
-      return new FieldComparator.TermValComparator(numHits, field);
+      return new FieldComparator.TermValComparator(numHits, field, missingValue == STRING_LAST);
 
     case REWRITEABLE:
       throw new IllegalStateException("SortField needs to be rewritten through Sort.rewrite(..) and SortField.rewrite(..)");

Modified: lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestSearchAfter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestSearchAfter.java?rev=1612249&r1=1612248&r2=1612249&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestSearchAfter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestSearchAfter.java Mon Jul 21 11:15:14 2014
@@ -108,6 +108,20 @@ public class TestSearchAfter extends Luc
       }
     }
 
+    // Also test missing first / last for the "string_val" sorts:
+    for(String field : new String[] {"sortedbytesdocvaluesval", "straightbytesdocvalues"}) {
+      for(int rev=0;rev<2;rev++) {
+        boolean reversed = rev == 0;
+        SortField sf = new SortField(field, SortField.Type.STRING_VAL, reversed);
+        sf.setMissingValue(SortField.STRING_FIRST);
+        allSortFields.add(sf);
+
+        sf = new SortField(field, SortField.Type.STRING_VAL, reversed);
+        sf.setMissingValue(SortField.STRING_LAST);
+        allSortFields.add(sf);
+      }
+    }
+
     int limit = allSortFields.size();
     for(int i=0;i<limit;i++) {
       SortField sf = allSortFields.get(i);

Modified: lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestSortRandom.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestSortRandom.java?rev=1612249&r1=1612248&r2=1612249&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestSortRandom.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestSortRandom.java Mon Jul 21 11:15:14 2014
@@ -41,12 +41,19 @@ import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.FixedBitSet;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.TestUtil;
-import org.apache.lucene.util.TestUtil;
 
 /** random sorting tests */
 public class TestSortRandom extends LuceneTestCase {
 
   public void testRandomStringSort() throws Exception {
+    testRandomStringSort(SortField.Type.STRING);
+  }
+
+  public void testRandomStringValSort() throws Exception {
+    testRandomStringSort(SortField.Type.STRING_VAL);
+  }
+
+  private void testRandomStringSort(SortField.Type type) throws Exception {
     Random random = new Random(random().nextLong());
 
     final int NUM_DOCS = atLeast(100);
@@ -135,7 +142,7 @@ public class TestSortRandom extends Luce
       final boolean sortMissingLast;
       final boolean missingIsNull;
       if (defaultCodecSupportsDocValues() && random.nextBoolean()) {
-        sf = new SortField("stringdv", SortField.Type.STRING, reverse);
+        sf = new SortField("stringdv", type, reverse);
         // Can only use sort missing if the DVFormat
         // supports docsWithField:
         sortMissingLast = defaultCodecSupportsDocsWithField() && random().nextBoolean();