You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ro...@apache.org on 2020/06/30 08:34:40 UTC

[lucene-solr] 01/02: LUCENE-9418: Fix ordered intervals over interleaved terms (#1618)

This is an automated email from the ASF dual-hosted git repository.

romseygeek pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit 3a42716cdb06ba650ccb2cbc9953c05c9a8a6abc
Author: Alan Woodward <ro...@apache.org>
AuthorDate: Tue Jun 30 09:13:55 2020 +0100

    LUCENE-9418: Fix ordered intervals over interleaved terms (#1618)
    
    Given the input text 'A B A C', an ordered interval 'A B C' will currently return an incorrect
    internal [2, 3] in addition to the correct [0, 3] interval. This is due to a bug in the ORDERED
    algorithm, where we assume that after the first interval is returned, the sub-intervals are
    always in-order. This assumption only holds during minimization, as minimizing an interval
    may move the earlier terms beyond the trailing terms.
    
    For example, after the initial [0, 3] interval is found above, the algorithm will attempt to
    minimize it by advancing A to [2,2]. Because this is still before C at [3,3], but after B at
    [1,1], we then try advancing B, leaving it at [Inf,Inf]. Minimization has failed, so we return
    the original interval of [0,3]. However, when we come to retrieve the next interval, our
    subintervals look like this: A[2,2], B[Inf,Inf], C[3,3] - the assumption that they are in order
    is broken. The algorithm sees that A is before B, assumes that therefore all subsequent
    subintervals are in order, and returns the new interval.
    
    This commit fixes things by changing the assumption of ordering to only hold during
    minimization. When first finding a candidate interval, the algorithm now checks that
    all sub-intervals appear in order.
---
 .../lucene/queries/intervals/OrderedIntervalsSource.java  |  4 +++-
 .../lucene/queries/intervals/TestIntervalQuery.java       | 10 +++++++++-
 .../apache/lucene/queries/intervals/TestIntervals.java    | 15 ++++++++++++++-
 3 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/OrderedIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/OrderedIntervalsSource.java
index 6331688..2b7d107 100644
--- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/OrderedIntervalsSource.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/OrderedIntervalsSource.java
@@ -133,12 +133,13 @@ class OrderedIntervalsSource extends ConjunctionIntervalsSource {
     public int nextInterval() throws IOException {
       start = end = slop = IntervalIterator.NO_MORE_INTERVALS;
       int lastStart = Integer.MAX_VALUE;
+      boolean minimizing = false;
       i = 1;
       while (true) {
         while (true) {
           if (subIterators.get(i - 1).end() >= lastStart)
             return start;
-          if (i == subIterators.size() || subIterators.get(i).start() > subIterators.get(i - 1).end())
+          if (i == subIterators.size() || (minimizing && subIterators.get(i).start() > subIterators.get(i - 1).end()))
             break;
           do {
             if (subIterators.get(i).end() >= lastStart || subIterators.get(i).nextInterval() == IntervalIterator.NO_MORE_INTERVALS)
@@ -160,6 +161,7 @@ class OrderedIntervalsSource extends ConjunctionIntervalsSource {
         i = 1;
         if (subIterators.get(0).nextInterval() == IntervalIterator.NO_MORE_INTERVALS)
           return start;
+        minimizing = true;
       }
     }
 
diff --git a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervalQuery.java b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervalQuery.java
index 097b4ab..2ca79b3 100644
--- a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervalQuery.java
+++ b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervalQuery.java
@@ -73,7 +73,8 @@ public class TestIntervalQuery extends LuceneTestCase {
       "coordinate genome mapping research",
       "coordinate genome research",
       "greater new york",
-      "x x x x x intend x x x message x x x message x x x addressed x x"
+      "x x x x x intend x x x message x x x message x x x addressed x x",
+      "issue with intervals queries from search engine. So it's a big issue for us as we need to do ordered searches. Thank you to help us concerning that issue"
   };
 
   private void checkHits(Query query, int[] results) throws IOException {
@@ -262,6 +263,13 @@ public class TestIntervalQuery extends LuceneTestCase {
     checkHits(q, new int[]{ 6, 7 });
   }
 
+  public void testOrderedWithGaps() throws IOException {
+    Query q = new IntervalQuery(field, Intervals.maxgaps(1, Intervals.ordered(
+            Intervals.term("issue"), Intervals.term("search"), Intervals.term("ordered")
+    )));
+    checkHits(q, new int[]{});
+  }
+
   public void testNestedOrInContainedBy() throws IOException {
     Query q = new IntervalQuery(field, Intervals.containedBy(
         Intervals.term("genome"),
diff --git a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java
index b230dd2..5444b66 100644
--- a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java
+++ b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java
@@ -73,7 +73,7 @@ public class TestIntervals extends LuceneTestCase {
   private static String field2_docs[] = {
       "In Xanadu did Kubla Khan a stately pleasure dome decree",
       "Where Alph the sacred river ran through caverns measureless to man",
-      "Down to a sunless sea",
+      "a b a c b a b c",
       "So thrice five miles of fertile ground",
       "Pease hot porridge porridge",
       "w1 w2 w3 w4 w1 w6 w3 w8 w4 w7 w1 w6"
@@ -525,6 +525,14 @@ public class TestIntervals extends LuceneTestCase {
     assertEquals(4, source.minExtent());
   }
 
+  public void testInterleavedOrdered() throws IOException {
+    IntervalsSource source = Intervals.ordered(Intervals.term("a"), Intervals.term("b"), Intervals.term("c"));
+    checkIntervals(source, "field2", 1, new int[][]{
+            {}, {}, { 0, 3, 5, 7 }, {}, {}, {}
+    });
+    assertGaps(source, 2, "field2", new int[]{ 1, 0 });
+  }
+
   public void testUnorderedDistinct() throws IOException {
     checkIntervals(Intervals.unorderedNoOverlaps(Intervals.term("pease"), Intervals.term("pease")),
         "field1", 3, new int[][]{
@@ -670,6 +678,11 @@ public class TestIntervals extends LuceneTestCase {
     assertMatch(mi, 0, 3, 0, 11);
 
     assertEquals(3, source.minExtent());
+    assertEquals(source, source);
+    assertEquals(source, Intervals.maxgaps(1,
+            Intervals.unordered(Intervals.term("w1"), Intervals.term("w3"), Intervals.term("w4"))));
+    assertNotEquals(source, Intervals.maxgaps(2,
+            Intervals.unordered(Intervals.term("w1"), Intervals.term("w3"), Intervals.term("w4"))));
 
   }