You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2021/12/09 16:19:29 UTC
[lucene] branch main updated: LUCENE-10229: Unify behaviour of match offsets for interval queries (#521)
This is an automated email from the ASF dual-hosted git repository.
dweiss pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/main by this push:
new 53099e0 LUCENE-10229: Unify behaviour of match offsets for interval queries (#521)
53099e0 is described below
commit 53099e01de2f41253c32f02a7d32e5fe59b2aadb
Author: Patrick Zhai <zh...@users.noreply.github.com>
AuthorDate: Thu Dec 9 08:19:18 2021 -0800
LUCENE-10229: Unify behaviour of match offsets for interval queries (#521)
---
lucene/CHANGES.txt | 3 +
.../matchhighlight/TestMatchHighlighter.java | 231 ++++++++++++++++++++-
.../matchhighlight/TestMatchRegionRetriever.java | 8 +-
.../intervals/ConjunctionIntervalsSource.java | 11 +-
.../intervals/ContainedByIntervalsSource.java | 8 +
.../apache/lucene/queries/intervals/Intervals.java | 5 +-
.../MinimumShouldMatchIntervalsSource.java | 5 +-
.../intervals/OverlappingIntervalsSource.java | 8 +
.../lucene/queries/intervals/TestIntervals.java | 42 ++--
9 files changed, 299 insertions(+), 22 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 0e05857..ccaa2f2 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -79,6 +79,9 @@ Improvements
* LUCENE-9538: Detect polygon self-intersections in the Tessellator. (Ignacio Vera)
* LUCENE-10275: Speed up MultiRangeQuery by using an interval tree. (Ignacio Vera)
+
+* LUCENE-10229: Unify behaviour of match offsets for interval queries on fields
+ with or without offsets enabled. (Patrick Zhai)
Optimizations
---------------------
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchHighlighter.java b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchHighlighter.java
index fa6eb1d..3ac4ec6 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchHighlighter.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchHighlighter.java
@@ -332,8 +332,6 @@ public class TestMatchHighlighter extends LuceneTestCase {
}
};
- // TODO: the highlights are different when the field is indexed with offsets. Weird.
- // String field = FLD_TEXT1;
String field = FLD_TEXT2;
new IndexBuilder(this::toField)
// Just one document and multiple interval queries.
@@ -541,6 +539,235 @@ public class TestMatchHighlighter extends LuceneTestCase {
});
}
+ /**
+ * Almost the same as the one above, make sure the fields indexed with offsets are also
+ * highlighted correctly
+ */
+ @Test
+ public void testIntervalFunctionsWithOffsetField() throws Exception {
+ Analyzer analyzer =
+ new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new StandardTokenizer();
+ TokenStream ts = tokenizer;
+ ts = new LowerCaseFilter(ts);
+ return new TokenStreamComponents(tokenizer, ts);
+ }
+ };
+
+ String field = FLD_TEXT1;
+ new IndexBuilder(this::toField)
+ // Just one document and multiple interval queries.
+ .doc(field, "The quick brown fox jumps over the lazy dog")
+ .build(
+ analyzer,
+ reader -> {
+ IndexSearcher searcher = new IndexSearcher(reader);
+ Sort sortOrder = Sort.INDEXORDER; // So that results are consistently ordered.
+
+ MatchHighlighter highlighter =
+ new MatchHighlighter(searcher, analyzer)
+ .appendFieldHighlighter(
+ FieldValueHighlighters.highlighted(
+ 80 * 3, 1, new PassageFormatter("...", ">", "<"), fld -> true))
+ .appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
+
+ StandardQueryParser qp = new StandardQueryParser(analyzer);
+
+ // Run all pairs of query-expected highlight.
+ List<String> errors = new ArrayList<>();
+ for (var queryHighlightPair :
+ new String[][] {
+ {
+ "fn:ordered(brown dog)",
+ "0. %s: The quick >brown fox jumps over the lazy dog<"
+ },
+ {
+ "fn:within(fn:or(lazy quick) 1 fn:or(dog fox))",
+ "0. %s: The quick brown fox jumps over the >lazy< dog"
+ },
+ {
+ "fn:containedBy(fox fn:ordered(brown fox dog))",
+ "0. %s: The quick brown >fox< jumps over the lazy dog"
+ },
+ {
+ "fn:atLeast(2 quick fox \"furry dog\")",
+ "0. %s: The >quick brown fox< jumps over the lazy dog"
+ },
+ {
+ "fn:maxgaps(0 fn:ordered(fn:or(quick lazy) fn:or(fox dog)))",
+ "0. %s: The quick brown fox jumps over the >lazy dog<"
+ },
+ {
+ "fn:maxgaps(1 fn:ordered(fn:or(quick lazy) fn:or(fox dog)))",
+ "0. %s: The >quick brown fox< jumps over the >lazy dog<"
+ },
+ {
+ "fn:maxwidth(2 fn:ordered(fn:or(quick lazy) fn:or(fox dog)))",
+ "0. %s: The quick brown fox jumps over the >lazy dog<"
+ },
+ {
+ "fn:maxwidth(3 fn:ordered(fn:or(quick lazy) fn:or(fox dog)))",
+ "0. %s: The >quick brown fox< jumps over the >lazy dog<"
+ },
+ {
+ "fn:or(quick \"fox\")",
+ "0. %s: The >quick< brown >fox< jumps over the lazy dog"
+ },
+ {"fn:or(\"quick fox\")"},
+ {
+ "fn:phrase(quick brown fox)",
+ "0. %s: The >quick brown fox< jumps over the lazy dog"
+ },
+ {"fn:wildcard(jump*)", "0. %s: The quick brown fox >jumps< over the lazy dog"},
+ {"fn:wildcard(br*n)", "0. %s: The quick >brown< fox jumps over the lazy dog"},
+ {"fn:or(dog fox)", "0. %s: The quick brown >fox< jumps over the lazy >dog<"},
+ {
+ "fn:phrase(fn:ordered(quick fox) jumps)",
+ "0. %s: The >quick brown fox jumps< over the lazy dog"
+ },
+ {
+ "fn:ordered(quick jumps dog)",
+ "0. %s: The >quick brown fox jumps over the lazy dog<"
+ },
+ {
+ "fn:ordered(quick fn:or(fox dog))",
+ "0. %s: The >quick brown fox< jumps over the lazy dog"
+ },
+ {
+ "fn:ordered(quick jumps fn:or(fox dog))",
+ "0. %s: The >quick brown fox jumps over the lazy dog<"
+ },
+ {
+ "fn:unordered(dog jumps quick)",
+ "0. %s: The >quick brown fox jumps over the lazy dog<"
+ },
+ {
+ "fn:unordered(fn:or(fox dog) quick)",
+ "0. %s: The >quick brown fox< jumps over the lazy dog"
+ },
+ {
+ "fn:unordered(fn:phrase(brown fox) fn:phrase(fox jumps))",
+ "0. %s: The quick >brown fox jumps< over the lazy dog"
+ },
+ {"fn:ordered(fn:phrase(brown fox) fn:phrase(fox jumps))"},
+ {"fn:unorderedNoOverlaps(fn:phrase(brown fox) fn:phrase(fox jumps))"},
+ {
+ "fn:before(fn:or(brown lazy) fox)",
+ "0. %s: The quick >brown< fox jumps over the lazy dog"
+ },
+ {
+ "fn:before(fn:or(brown lazy) fn:or(dog fox))",
+ "0. %s: The quick >brown< fox jumps over the >lazy< dog"
+ },
+ {
+ "fn:after(fn:or(brown lazy) fox)",
+ "0. %s: The quick brown fox jumps over the >lazy< dog"
+ },
+ {
+ "fn:after(fn:or(brown lazy) fn:or(dog fox))",
+ "0. %s: The quick brown fox jumps over the >lazy< dog"
+ },
+ {
+ "fn:within(fn:or(fox dog) 1 fn:or(quick lazy))",
+ "0. %s: The quick brown fox jumps over the lazy >dog<"
+ },
+ {
+ "fn:within(fn:or(fox dog) 2 fn:or(quick lazy))",
+ "0. %s: The quick brown >fox< jumps over the lazy >dog<"
+ },
+ {
+ "fn:notWithin(fn:or(fox dog) 1 fn:or(quick lazy))",
+ "0. %s: The quick brown >fox< jumps over the lazy dog"
+ },
+ {
+ "fn:containedBy(fn:or(fox dog) fn:ordered(quick lazy))",
+ "0. %s: The quick brown >fox< jumps over the lazy dog"
+ },
+ {
+ "fn:notContainedBy(fn:or(fox dog) fn:ordered(quick lazy))",
+ "0. %s: The quick brown fox jumps over the lazy >dog<"
+ },
+ {
+ "fn:containing(fn:atLeast(2 quick fox dog) jumps)",
+ "0. %s: The quick brown >fox jumps over the lazy dog<"
+ },
+ {
+ "fn:notContaining(fn:ordered(fn:or(the The) fn:or(fox dog)) brown)",
+ "0. %s: The quick brown fox jumps over >the lazy dog<"
+ },
+ {
+ "fn:overlapping(fn:phrase(brown fox) fn:phrase(fox jumps))",
+ "0. %s: The quick >brown fox< jumps over the lazy dog"
+ },
+ {
+ "fn:overlapping(fn:or(fox dog) fn:extend(lazy 2 2))",
+ "0. %s: The quick brown fox jumps over the lazy >dog<"
+ },
+ {
+ "fn:nonOverlapping(fn:phrase(brown fox) fn:phrase(lazy dog))",
+ "0. %s: The quick >brown fox< jumps over the lazy dog"
+ },
+ {
+ "fn:nonOverlapping(fn:or(fox dog) fn:extend(lazy 2 2))",
+ "0. %s: The quick brown >fox< jumps over the lazy dog"
+ },
+ {
+ "fn:atLeast(2 fn:unordered(furry dog) fn:unordered(brown dog) lazy quick)",
+ "0. %s: The >quick >brown fox jumps over the lazy<<> dog<"
+ },
+ /*
+ The test cases below do not work for fields enabled with offset yet:
+ mainly "extend"
+ TODO: Fix them!
+
+ {"fn:extend(fox 1 2)", "0. %s: The quick >brown fox jumps over< the lazy dog"},
+ {
+ "fn:extend(fn:or(dog fox) 2 0)",
+ "0. %s: The >quick brown fox< jumps over >the lazy dog<"
+ },
+ {
+ "fn:containedBy(fn:or(fox dog) fn:extend(lazy 3 3))",
+ "0. %s: The quick brown fox jumps over the lazy >dog<"
+ },
+ {
+ "fn:notContainedBy(fn:or(fox dog) fn:extend(lazy 3 3))",
+ "0. %s: The quick brown >fox< jumps over the lazy dog"
+ },
+ {
+ "fn:containing(fn:extend(fn:or(lazy brown) 1 1) fn:or(fox dog))",
+ "0. %s: The >quick brown fox< jumps over >the lazy dog<"
+ },
+ {
+ "fn:notContaining(fn:extend(fn:or(fox dog) 1 0) fn:or(brown yellow))",
+ "0. %s: The quick brown fox jumps over the >lazy dog<"
+ } */
+ }) {
+ assert queryHighlightPair.length >= 1;
+ String queryString = queryHighlightPair[0];
+ var query = qp.parse(queryString, field);
+ var expected =
+ Arrays.stream(queryHighlightPair)
+ .skip(1)
+ .map(v -> String.format(Locale.ROOT, v, field))
+ .toArray(String[]::new);
+
+ try {
+ assertHighlights(
+ toDocList(
+ highlighter.highlight(searcher.search(query, 10, sortOrder), query)),
+ expected);
+ } catch (AssertionError e) {
+ errors.add("MISMATCH: query: " + queryString + "\n" + e.getMessage());
+ }
+ }
+ if (errors.size() > 0) {
+ throw new AssertionError(String.join("\n\n", errors));
+ }
+ });
+ }
+
@Test
public void testCustomFieldHighlightHandling() throws Exception {
// Match highlighter is a showcase of individual components in this package, suitable
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchRegionRetriever.java b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchRegionRetriever.java
index 4d8d4f2..dac77c1 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchRegionRetriever.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchRegionRetriever.java
@@ -379,7 +379,7 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
Intervals.containedBy(
Intervals.term("foo"),
Intervals.unordered(Intervals.term("foo"), Intervals.term("bar"))))),
- containsInAnyOrder(fmt("2: (field_text_offs: '>bar baz foo< xyz')", field)));
+ containsInAnyOrder(fmt("2: (field_text_offs: 'bar baz >foo< xyz')", field)));
MatcherAssert.assertThat(
highlights(
@@ -387,9 +387,9 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
new IntervalQuery(
field,
Intervals.overlapping(
- Intervals.unordered(Intervals.term("foo"), Intervals.term("bar")),
- Intervals.term("foo")))),
- containsInAnyOrder(fmt("2: (field_text_offs: '>bar baz foo< xyz')", field)));
+ Intervals.term("foo"),
+ Intervals.unordered(Intervals.term("foo"), Intervals.term("bar"))))),
+ containsInAnyOrder(fmt("2: (field_text_offs: 'bar baz >foo< xyz')", field)));
});
}
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/ConjunctionIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/ConjunctionIntervalsSource.java
index 56cd0a6..90e15a0 100644
--- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/ConjunctionIntervalsSource.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/ConjunctionIntervalsSource.java
@@ -59,6 +59,15 @@ abstract class ConjunctionIntervalsSource extends IntervalsSource {
protected abstract IntervalIterator combine(List<IntervalIterator> iterators);
+ /**
+ * Create matches iterator from an advanced and validated interval iterator and a list of matches
+ * iterator of all the sub-sources
+ */
+ protected IntervalMatchesIterator createMatchesIterator(
+ IntervalIterator it, List<IntervalMatchesIterator> subs) {
+ return new ConjunctionMatchesIterator(it, subs);
+ }
+
@Override
public final IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc)
throws IOException {
@@ -81,6 +90,6 @@ abstract class ConjunctionIntervalsSource extends IntervalsSource {
if (it.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) {
return null;
}
- return new ConjunctionMatchesIterator(it, subs);
+ return createMatchesIterator(it, subs);
}
}
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/ContainedByIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/ContainedByIntervalsSource.java
index f8c50b1..d862114 100644
--- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/ContainedByIntervalsSource.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/ContainedByIntervalsSource.java
@@ -67,6 +67,14 @@ class ContainedByIntervalsSource extends ConjunctionIntervalsSource {
}
@Override
+ protected IntervalMatchesIterator createMatchesIterator(
+ IntervalIterator it, List<IntervalMatchesIterator> subs) {
+ assert subs.size() == 2;
+ // the only sub source we care is the "small" source
+ return new ConjunctionMatchesIterator(it, List.of(subs.get(0)));
+ }
+
+ @Override
public int minExtent() {
return small.minExtent();
}
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java
index 080194a..4c1ec48 100644
--- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java
@@ -275,7 +275,10 @@ public final class Intervals {
}
/**
- * Create an unordered {@link IntervalsSource}
+ * Create an unordered {@link IntervalsSource}. Note that if there are multiple intervals ends at
+ * the same position are eligible, only the narrowest one will be returned. For example if asking
+ * for <code>unordered(term("apple"), term("banana"))</code> on field of "apple wolf apple orange
+ * banana", only the "apple orange banana" will be returned.
*
* <p>Returns intervals in which all the subsources appear. The subsources may overlap
*
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/MinimumShouldMatchIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/MinimumShouldMatchIntervalsSource.java
index 6a1db86..ea3181e 100644
--- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/MinimumShouldMatchIntervalsSource.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/MinimumShouldMatchIntervalsSource.java
@@ -215,6 +215,7 @@ class MinimumShouldMatchIntervalsSource extends IntervalsSource {
@Override
public int nextInterval() throws IOException {
+ lead = null;
// first, find a matching interval beyond the current start
while (this.proximityQueue.size() == minShouldMatch
&& proximityQueue.top().start() == start) {
@@ -258,7 +259,9 @@ class MinimumShouldMatchIntervalsSource extends IntervalsSource {
Collection<IntervalIterator> getCurrentIterators() {
currentIterators.clear();
- currentIterators.add(lead);
+ if (lead != null) {
+ currentIterators.add(lead);
+ }
for (IntervalIterator it : this.proximityQueue) {
if (it.end() <= end) {
currentIterators.add(it);
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/OverlappingIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/OverlappingIntervalsSource.java
index bb8c5bb..81efb35 100644
--- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/OverlappingIntervalsSource.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/OverlappingIntervalsSource.java
@@ -63,6 +63,14 @@ class OverlappingIntervalsSource extends ConjunctionIntervalsSource {
}
@Override
+ protected IntervalMatchesIterator createMatchesIterator(
+ IntervalIterator it, List<IntervalMatchesIterator> subs) {
+ assert subs.size() == 2;
+ // the only sub source we care is the "real" source
+ return new ConjunctionMatchesIterator(it, List.of(subs.get(0)));
+ }
+
+ @Override
public int minExtent() {
return source.minExtent();
}
diff --git a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java
index 9afd84b..a48c6ba 100644
--- a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java
+++ b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java
@@ -621,17 +621,13 @@ public class TestIntervals extends LuceneTestCase {
checkIntervals(
source, "field1", 3, new int[][] {{}, {4, 4, 7, 7}, {1, 1, 7, 7}, {}, {4, 4}, {}});
MatchesIterator mi = getMatches(source, 1, "field1");
- assertMatch(mi, 4, 4, 20, 39);
+ assertMatch(mi, 4, 4, 26, 34);
MatchesIterator subs = mi.getSubMatches();
- assertMatch(subs, 3, 3, 20, 25);
assertMatch(subs, 4, 4, 26, 34);
- assertMatch(subs, 5, 5, 35, 39);
assertFalse(subs.next());
- assertMatch(mi, 7, 7, 41, 118);
+ assertMatch(mi, 7, 7, 47, 55);
subs = mi.getSubMatches();
- assertMatch(subs, 6, 6, 41, 46);
assertMatch(subs, 7, 7, 47, 55);
- assertMatch(subs, 21, 21, 114, 118);
assertFalse(subs.next());
assertFalse(mi.next());
assertEquals(1, source.minExtent());
@@ -788,6 +784,30 @@ public class TestIntervals extends LuceneTestCase {
assertEquals(3, source.minExtent());
}
+ public void testMinShouldMatch2() throws IOException {
+ IntervalsSource source =
+ Intervals.atLeast(
+ 2,
+ Intervals.unordered(Intervals.term("alph"), Intervals.term("ran")),
+ Intervals.term("where"),
+ Intervals.term("river"));
+ MatchesIterator mi = getMatches(source, 1, "field2");
+ assertMatch(mi, 0, 4, 0, 27); // contains "where" and "river"
+ MatchesIterator subs = mi.getSubMatches();
+ assertNotNull(subs);
+ assertMatch(subs, 0, 0, 0, 5);
+ assertMatch(subs, 4, 4, 22, 27);
+ assertFalse(subs.next());
+ assertMatch(mi, 1, 5, 6, 31); // contains "river" and unordered("alph", "run")
+ subs = mi.getSubMatches();
+ assertNotNull(subs);
+ assertMatch(subs, 1, 1, 6, 10);
+ assertMatch(subs, 4, 4, 22, 27);
+ assertMatch(subs, 5, 5, 28, 31);
+ assertFalse(subs.next());
+ assertFalse(mi.next());
+ }
+
public void testDegenerateMinShouldMatch() throws IOException {
IntervalsSource source =
Intervals.ordered(
@@ -851,11 +871,9 @@ public class TestIntervals extends LuceneTestCase {
checkIntervals(source, "field1", 3, new int[][] {{}, {7, 7}, {4, 4, 7, 7}, {}, {7, 7}, {}});
MatchesIterator mi = getMatches(source, 1, "field1");
- assertMatch(mi, 7, 7, 20, 55);
+ assertMatch(mi, 7, 7, 47, 55);
MatchesIterator sub = mi.getSubMatches();
assertNotNull(sub);
- assertMatch(sub, 3, 3, 20, 25);
- assertMatch(sub, 5, 5, 35, 39);
assertMatch(sub, 7, 7, 47, 55);
assertFalse(sub.next());
@@ -890,15 +908,13 @@ public class TestIntervals extends LuceneTestCase {
MatchesIterator mi = getMatches(source, 1, "field1");
assertNotNull(mi);
- assertMatch(mi, 2, 4, 15, 39);
+ assertMatch(mi, 2, 4, 15, 34);
MatchesIterator sub = mi.getSubMatches();
assertNotNull(sub);
assertMatch(sub, 2, 2, 15, 18);
- assertMatch(sub, 3, 3, 20, 25);
assertMatch(sub, 4, 4, 26, 34);
- assertMatch(sub, 5, 5, 35, 39);
assertFalse(sub.next());
- assertMatch(mi, 7, 17, 41, 118);
+ assertMatch(mi, 7, 17, 47, 99);
assertEquals(2, source.minExtent());
}