You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2020/10/16 17:21:16 UTC

[lucene-solr] 01/03: LUCENE-9578: TermRangeQuery empty string lower bound edge case (#1976)

This is an automated email from the ASF dual-hosted git repository.

jpountz pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit 2c6a7d3efd60d697104cb8ab495fecc4554803c3
Author: Christoph Büscher <cb...@posteo.de>
AuthorDate: Fri Oct 16 19:01:35 2020 +0200

    LUCENE-9578: TermRangeQuery empty string lower bound edge case (#1976)
    
    Currently a TermRangeQuery with the empty String ("") as lower bound and
    includeLower=false leads internally constructs an Automaton that doesn't match
    anything. This is unexpected expecially for open upper bounds where any string
    should be considered to be "higher" than the empty string.
    
    This PR changes "Automata#makeBinaryInterval" so that for an empty string lower
    bound and an open upper bound, any String should match the query regardless or
    the includeLower flag.
---
 .../org/apache/lucene/util/automaton/Automata.java | 27 +++++++++++++---
 .../lucene/util/automaton/TestAutomaton.java       | 36 ++++++++++++++++++++++
 2 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java
index 294700b..deb7375 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java
@@ -85,7 +85,22 @@ final public class Automata {
     a.finishState();
     return a;
   }
-  
+
+  /**
+   * Returns a new (deterministic) automaton that accepts all binary terms except
+   * the empty string.
+   */
+  public static Automaton makeNonEmptyBinary() {
+    Automaton a = new Automaton();
+    int s1 = a.createState();
+    int s2 = a.createState();
+    a.setAccept(s2, true);
+    a.addTransition(s1, s2, 0, 255);
+    a.addTransition(s2, s2, 0, 255);
+    a.finishState();
+    return a;
+  }
+
   /**
    * Returns a new (deterministic) automaton that accepts any single codepoint.
    */
@@ -254,8 +269,12 @@ final public class Automata {
       cmp = min.compareTo(max);
     } else {
       cmp = -1;
-      if (min.length == 0 && minInclusive) {
-        return makeAnyBinary();
+      if (min.length == 0) {
+        if (minInclusive) {
+          return makeAnyBinary();
+        } else {
+          return makeNonEmptyBinary();
+        }
       }
     }
 
@@ -266,7 +285,7 @@ final public class Automata {
         return makeBinary(min);
       }
     } else if (cmp > 0) {
-      // max > min
+      // max < min
       return makeEmpty();
     }
 
diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java
index e4a09dc..2f521d7 100644
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java
@@ -1331,6 +1331,23 @@ public class TestAutomaton extends LuceneTestCase {
     assertFalse(Operations.run(a, intsRef("baq")));
     assertTrue(Operations.run(a, intsRef("bara")));
   }
+  
+  public void testMakeBinaryIntervalLowerBoundEmptyString() throws Exception {
+    Automaton a = Automata.makeBinaryInterval(new BytesRef(""), true, new BytesRef("bar"), true);
+    assertTrue(Operations.run(a, intsRef("")));
+    assertTrue(Operations.run(a, intsRef("a")));
+    assertTrue(Operations.run(a, intsRef("bar")));
+    assertFalse(Operations.run(a, intsRef("bara")));
+    assertFalse(Operations.run(a, intsRef("baz")));
+    
+    
+    a = Automata.makeBinaryInterval(new BytesRef(""), false, new BytesRef("bar"), true);
+    assertFalse(Operations.run(a, intsRef("")));
+    assertTrue(Operations.run(a, intsRef("a")));
+    assertTrue(Operations.run(a, intsRef("bar")));
+    assertFalse(Operations.run(a, intsRef("bara")));
+    assertFalse(Operations.run(a, intsRef("baz")));
+  }
 
   public void testMakeBinaryIntervalEqual() throws Exception {
     Automaton a = Automata.makeBinaryInterval(new BytesRef("bar"), true, new BytesRef("bar"), true);
@@ -1352,6 +1369,12 @@ public class TestAutomaton extends LuceneTestCase {
     assertFalse(Operations.run(a, intsRef("barfoop")));
   }
 
+  public void testMakeBinaryExceptEmpty() throws Exception {
+    Automaton a = Automata.makeNonEmptyBinary();
+    assertFalse(Operations.run(a, intsRef("")));
+    assertTrue(Operations.run(a, intsRef(TestUtil.randomRealisticUnicodeString(random(), 1, 10))));
+  }
+
   public void testMakeBinaryIntervalOpenMax() throws Exception {
     Automaton a = Automata.makeBinaryInterval(new BytesRef("bar"), true, null, true);
     assertFalse(Operations.run(a, intsRef("bam")));
@@ -1366,6 +1389,19 @@ public class TestAutomaton extends LuceneTestCase {
     assertTrue(Operations.run(a, intsRef("zzz")));
   }
 
+  public void testMakeBinaryIntervalOpenMaxZeroLengthMin() throws Exception {
+    // when including min, automaton should accept "a"
+    Automaton a = Automata.makeBinaryInterval(new BytesRef(""), true, null, true);
+    assertTrue(Operations.run(a, intsRef("")));
+    assertTrue(Operations.run(a, intsRef("a")));
+    assertTrue(Operations.run(a, intsRef("aaaaaa")));
+    // excluding min should still accept "a"
+    a = Automata.makeBinaryInterval(new BytesRef(""), false, null, true);
+    assertFalse(Operations.run(a, intsRef("")));
+    assertTrue(Operations.run(a, intsRef("a")));
+    assertTrue(Operations.run(a, intsRef("aaaaaa")));
+  }
+
   public void testMakeBinaryIntervalOpenMin() throws Exception {
     Automaton a = Automata.makeBinaryInterval(null, true, new BytesRef("foo"), true);
     assertFalse(Operations.run(a, intsRef("foz")));