You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ro...@apache.org on 2020/04/29 09:32:17 UTC
[lucene-solr] branch branch_8x updated: LUCENE-9349: TermInSetQuery
should use consumeMatchingTerms in visit() (#1465)
This is an automated email from the ASF dual-hosted git repository.
romseygeek pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/branch_8x by this push:
new 4be69c8 LUCENE-9349: TermInSetQuery should use consumeMatchingTerms in visit() (#1465)
4be69c8 is described below
commit 4be69c8540b51f52355f9926cd84f3b3de985180
Author: Alan Woodward <ro...@apache.org>
AuthorDate: Wed Apr 29 10:19:05 2020 +0100
LUCENE-9349: TermInSetQuery should use consumeMatchingTerms in visit() (#1465)
TermInSetQuery currently iterates through all its prefix-encoded terms
in order to build an array to pass back to its visitor when visit() is called.
This seems like a waste, particularly when the visitor is not actually
consuming the terms (for example, when doing a clause-count check
before executing a search). This commit changes TermInSetQuery to use
consumeTermsMatching(), and also changes the signature of this method so
that it takes a BytesRunAutomaton supplier to allow for lazy instantiation.
---
lucene/CHANGES.txt | 5 +++
.../org/apache/lucene/search/IndexSearcher.java | 1 +
.../org/apache/lucene/search/QueryVisitor.java | 5 +--
.../org/apache/lucene/search/TermInSetQuery.java | 22 +++++++++---
.../lucene/util/automaton/CompiledAutomaton.java | 4 +--
.../apache/lucene/search/TermInSetQueryTest.java | 42 ++++++++++++++++++++++
.../search/uhighlight/MultiTermHighlighting.java | 5 +--
7 files changed, 74 insertions(+), 10 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 811c741..03ae2ce 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -23,6 +23,11 @@ API Changes
interface that offers the basic methods to acquire pending merges, run the merge and do accounting
around it. (Simon Willnauer)
+* LUCENE-9349: QueryVisitor.consumeTermsMatching() now takes a
+ Supplier<ByteRunAutomaton> to enable queries that build large automata to
+ provide them lazily. TermsInSetQuery switches to using this method
+ to report matching terms. (Alan Woodward)
+
New Features
---------------------
(No changes)
diff --git a/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java b/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java
index e993832..5c30e45 100644
--- a/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java
+++ b/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java
@@ -47,6 +47,7 @@ import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.ThreadInterruptedException;
+import org.apache.lucene.util.automaton.ByteRunAutomaton;
/** Implements search over a single IndexReader.
*
diff --git a/lucene/core/src/java/org/apache/lucene/search/QueryVisitor.java b/lucene/core/src/java/org/apache/lucene/search/QueryVisitor.java
index 15a0eaf..d79775d 100644
--- a/lucene/core/src/java/org/apache/lucene/search/QueryVisitor.java
+++ b/lucene/core/src/java/org/apache/lucene/search/QueryVisitor.java
@@ -19,6 +19,7 @@ package org.apache.lucene.search;
import java.util.Arrays;
import java.util.Set;
+import java.util.function.Supplier;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
@@ -43,11 +44,11 @@ public abstract class QueryVisitor {
*
* @param query the leaf query
* @param field the field queried against
- * @param automaton an automaton defining which terms match
+ * @param automaton a supplier for an automaton defining which terms match
*
* @lucene.experimental
*/
- public void consumeTermsMatching(Query query, String field, ByteRunAutomaton automaton) {
+ public void consumeTermsMatching(Query query, String field, Supplier<ByteRunAutomaton> automaton) {
visitLeaf(query); // default impl for backward compatibility
}
diff --git a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
index 9e5648b..e3b5dea 100644
--- a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
@@ -44,6 +44,11 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.DocIdSetBuilder;
import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.automaton.Automata;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.ByteRunAutomaton;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
+import org.apache.lucene.util.automaton.Operations;
/**
* Specialization for a disjunction over many terms that behaves like a
@@ -127,13 +132,22 @@ public class TermInSetQuery extends Query implements Accountable {
if (visitor.acceptField(field) == false) {
return;
}
- QueryVisitor v = visitor.getSubVisitor(Occur.SHOULD, this);
- List<Term> terms = new ArrayList<>();
+ if (termData.size() == 1) {
+ visitor.consumeTerms(this, new Term(field, termData.iterator().next()));
+ }
+ if (termData.size() > 1) {
+ visitor.consumeTermsMatching(this, field, this::asByteRunAutomaton);
+ }
+ }
+
+ private ByteRunAutomaton asByteRunAutomaton() {
TermIterator iterator = termData.iterator();
+ List<Automaton> automata = new ArrayList<>();
for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
- terms.add(new Term(field, BytesRef.deepCopyOf(term)));
+ automata.add(Automata.makeBinary(term));
}
- v.consumeTerms(this, terms.toArray(new Term[0]));
+ return new CompiledAutomaton(Operations.union(automata)).runAutomaton;
+
}
@Override
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
index 1c9a235..c14cd91 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
@@ -354,12 +354,12 @@ public class CompiledAutomaton implements Accountable {
if (visitor.acceptField(field)) {
switch (type) {
case NORMAL:
- visitor.consumeTermsMatching(parent, field, runAutomaton);
+ visitor.consumeTermsMatching(parent, field, () -> runAutomaton);
break;
case NONE:
break;
case ALL:
- visitor.consumeTermsMatching(parent, field, new ByteRunAutomaton(Automata.makeAnyString()));
+ visitor.consumeTermsMatching(parent, field, () -> new ByteRunAutomaton(Automata.makeAnyString()));
break;
case SINGLE:
visitor.consumeTerms(parent, new Term(field, term));
diff --git a/lucene/core/src/test/org/apache/lucene/search/TermInSetQueryTest.java b/lucene/core/src/test/org/apache/lucene/search/TermInSetQueryTest.java
index 9633ea4..fa525be 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TermInSetQueryTest.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TermInSetQueryTest.java
@@ -23,6 +23,7 @@ import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
+import java.util.function.Supplier;
import com.carrotsearch.randomizedtesting.generators.RandomStrings;
import org.apache.lucene.document.Document;
@@ -44,6 +45,7 @@ import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.RamUsageTester;
import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.automaton.ByteRunAutomaton;
public class TermInSetQueryTest extends LuceneTestCase {
@@ -297,4 +299,44 @@ public class TermInSetQueryTest extends LuceneTestCase {
// cached after two uses
assertTrue(policy.shouldCache(query));
}
+
+ public void testVisitor() {
+ // singleton reports back to consumeTerms()
+ TermInSetQuery singleton = new TermInSetQuery("field", new BytesRef("term1"));
+ singleton.visit(new QueryVisitor() {
+ @Override
+ public void consumeTerms(Query query, Term... terms) {
+ assertEquals(1, terms.length);
+ assertEquals(new Term("field", new BytesRef("term1")), terms[0]);
+ }
+
+ @Override
+ public void consumeTermsMatching(Query query, String field, Supplier<ByteRunAutomaton> automaton) {
+ fail("Singleton TermInSetQuery should not try to build ByteRunAutomaton");
+ }
+ });
+
+ // multiple values built into automaton
+ List<BytesRef> terms = new ArrayList<>();
+ for (int i = 0; i < 100; i++) {
+ terms.add(new BytesRef("term" + i));
+ }
+ TermInSetQuery t = new TermInSetQuery("field", terms);
+ t.visit(new QueryVisitor() {
+ @Override
+ public void consumeTerms(Query query, Term... terms) {
+ fail("TermInSetQuery with multiple terms should build automaton");
+ }
+
+ @Override
+ public void consumeTermsMatching(Query query, String field, Supplier<ByteRunAutomaton> automaton) {
+ ByteRunAutomaton a = automaton.get();
+ BytesRef test = new BytesRef("nonmatching");
+ assertFalse(a.run(test.bytes, test.offset, test.length));
+ for (BytesRef term : terms) {
+ assertTrue(a.run(term.bytes, term.offset, term.length));
+ }
+ }
+ });
+ }
}
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java
index ba8e85e..02bd672 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java
@@ -19,6 +19,7 @@ package org.apache.lucene.search.uhighlight;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Predicate;
+import java.util.function.Supplier;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.search.BooleanClause;
@@ -80,8 +81,8 @@ final class MultiTermHighlighting {
}
@Override
- public void consumeTermsMatching(Query query, String field, ByteRunAutomaton automaton) {
- runAutomata.add(LabelledCharArrayMatcher.wrap(query.toString(), automaton));
+ public void consumeTermsMatching(Query query, String field, Supplier<ByteRunAutomaton> automaton) {
+ runAutomata.add(LabelledCharArrayMatcher.wrap(query.toString(), automaton.get()));
}
}