You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2016/11/07 10:54:23 UTC
lucene-solr:branch_6x: LUCENE-6824: TermAutomatonQuery now rewrites to TermQuery, PhraseQuery or MultiPhraseQuery when the word automaton is simple
Repository: lucene-solr
Updated Branches:
refs/heads/branch_6x d06605191 -> c294d3f08
LUCENE-6824: TermAutomatonQuery now rewrites to TermQuery, PhraseQuery or MultiPhraseQuery when the word automaton is simple
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/c294d3f0
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/c294d3f0
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/c294d3f0
Branch: refs/heads/branch_6x
Commit: c294d3f08317eb9139f32bfbde1b27e7eb134653
Parents: d066051
Author: Mike McCandless <mi...@apache.org>
Authored: Mon Nov 7 05:53:26 2016 -0500
Committer: Mike McCandless <mi...@apache.org>
Committed: Mon Nov 7 05:53:53 2016 -0500
----------------------------------------------------------------------
lucene/CHANGES.txt | 6 +
.../org/apache/lucene/search/PhraseQuery.java | 2 +-
.../lucene/search/TermAutomatonQuery.java | 86 ++++++++-
.../lucene/search/TestTermAutomatonQuery.java | 193 ++++++++++++++++++-
4 files changed, 278 insertions(+), 9 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c294d3f0/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 0bfb1ad..4424613 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -12,6 +12,12 @@ Improvements
(Shinichiro Abe via Mike McCandless)
+Improvements
+
+* LUCENE-6824: TermAutomatonQuery now rewrites to TermQuery,
+ PhraseQuery or MultiPhraseQuery when the word automaton is simple
+ (Mike McCandless)
+
======================= Lucene 6.3.0 =======================
API Changes
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c294d3f0/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
index 64c0946..e8a66ec 100644
--- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
@@ -265,7 +265,7 @@ public class PhraseQuery extends Query {
* Returns the relative positions of terms in this phrase.
*/
public int[] getPositions() {
- return positions;
+ return positions;
}
@Override
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c294d3f0/lucene/sandbox/src/java/org/apache/lucene/search/TermAutomatonQuery.java
----------------------------------------------------------------------
diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/TermAutomatonQuery.java b/lucene/sandbox/src/java/org/apache/lucene/search/TermAutomatonQuery.java
index b3a7ba2..de63189 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/search/TermAutomatonQuery.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/search/TermAutomatonQuery.java
@@ -23,9 +23,10 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
-import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
@@ -34,6 +35,7 @@ import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.Transition;
@@ -183,6 +185,10 @@ public class TermAutomatonQuery extends Query {
det = Operations.removeDeadStates(Operations.determinize(automaton,
maxDeterminizedStates));
+
+ if (det.isAccept(0)) {
+ throw new IllegalStateException("cannot accept the empty string");
+ }
}
@Override
@@ -406,4 +412,82 @@ public class TermAutomatonQuery extends Query {
return null;
}
}
+
+ public Query rewrite(IndexReader reader) throws IOException {
+ if (Operations.isEmpty(det)) {
+ return new MatchNoDocsQuery();
+ }
+
+ IntsRef single = Operations.getSingleton(det);
+ if (single != null && single.length == 1) {
+ return new TermQuery(new Term(field, idToTerm.get(single.ints[single.offset])));
+ }
+
+ // TODO: can PhraseQuery really handle multiple terms at the same position? If so, why do we even have MultiPhraseQuery?
+
+ // Try for either PhraseQuery or MultiPhraseQuery, which only works when the automaton is a sausage:
+ MultiPhraseQuery.Builder mpq = new MultiPhraseQuery.Builder();
+ PhraseQuery.Builder pq = new PhraseQuery.Builder();
+
+ Transition t = new Transition();
+ int state = 0;
+ int pos = 0;
+ query:
+ while (true) {
+ int count = det.initTransition(state, t);
+ if (count == 0) {
+ if (det.isAccept(state) == false) {
+ mpq = null;
+ pq = null;
+ }
+ break;
+ } else if (det.isAccept(state)) {
+ mpq = null;
+ pq = null;
+ break;
+ }
+ int dest = -1;
+ List<Term> terms = new ArrayList<>();
+ boolean matchesAny = false;
+ for(int i=0;i<count;i++) {
+ det.getNextTransition(t);
+ if (i == 0) {
+ dest = t.dest;
+ } else if (dest != t.dest) {
+ mpq = null;
+ pq = null;
+ break query;
+ }
+
+ matchesAny |= anyTermID >= t.min && anyTermID <= t.max;
+
+ if (matchesAny == false) {
+ for(int termID=t.min;termID<=t.max;termID++) {
+ terms.add(new Term(field, idToTerm.get(termID)));
+ }
+ }
+ }
+ if (matchesAny == false) {
+ mpq.add(terms.toArray(new Term[terms.size()]), pos);
+ if (pq != null) {
+ if (terms.size() == 1) {
+ pq.add(terms.get(0), pos);
+ } else {
+ pq = null;
+ }
+ }
+ }
+ state = dest;
+ pos++;
+ }
+
+ if (pq != null) {
+ return pq.build();
+ } else if (mpq != null) {
+ return mpq.build();
+ }
+
+ // TODO: we could maybe also rewrite to union of PhraseQuery (pull all finite strings) if it's "worth it"?
+ return this;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c294d3f0/lucene/sandbox/src/test/org/apache/lucene/search/TestTermAutomatonQuery.java
----------------------------------------------------------------------
diff --git a/lucene/sandbox/src/test/org/apache/lucene/search/TestTermAutomatonQuery.java b/lucene/sandbox/src/test/org/apache/lucene/search/TestTermAutomatonQuery.java
index 1cc86ba..ccd376b 100644
--- a/lucene/sandbox/src/test/org/apache/lucene/search/TestTermAutomatonQuery.java
+++ b/lucene/sandbox/src/test/org/apache/lucene/search/TestTermAutomatonQuery.java
@@ -296,7 +296,6 @@ public class TestTermAutomatonQuery extends LuceneTestCase {
while (scorer instanceof AssertingScorer) {
scorer = ((AssertingScorer) scorer).getIn();
}
- assert scorer instanceof TermAutomatonScorer;
}
@Override
@@ -683,7 +682,7 @@ public class TestTermAutomatonQuery extends LuceneTestCase {
w.addDocument(doc);
doc = new Document();
- doc.add(newTextField("field", "comes here", Field.Store.NO));
+ doc.add(newTextField("field", "comes foo", Field.Store.NO));
w.addDocument(doc);
IndexReader r = w.getReader();
IndexSearcher s = newSearcher(r);
@@ -691,9 +690,11 @@ public class TestTermAutomatonQuery extends LuceneTestCase {
TermAutomatonQuery q = new TermAutomatonQuery("field");
int init = q.createState();
int s1 = q.createState();
+ int s2 = q.createState();
q.addTransition(init, s1, "here");
- q.addTransition(s1, init, "comes");
- q.setAccept(init, true);
+ q.addTransition(s1, s2, "comes");
+ q.addTransition(s2, s1, "here");
+ q.setAccept(s1, true);
q.finish();
assertEquals(1, s.search(q, 1).totalHits);
@@ -779,8 +780,186 @@ public class TestTermAutomatonQuery extends LuceneTestCase {
// System.out.println("DOT: " + q.toDot());
assertEquals(0, s.search(q, 1).totalHits);
- w.close();
- r.close();
- dir.close();
+ IOUtils.close(w, r, dir);
+ }
+
+ public void testEmptyString() throws Exception {
+ TermAutomatonQuery q = new TermAutomatonQuery("field");
+ int initState = q.createState();
+ q.setAccept(initState, true);
+ try {
+ q.finish();
+ fail("did not hit exc");
+ } catch (IllegalStateException ise) {
+ // expected
+ }
+ }
+
+ public void testRewriteNoMatch() throws Exception {
+ TermAutomatonQuery q = new TermAutomatonQuery("field");
+ int initState = q.createState();
+ q.finish();
+
+ Directory dir = newDirectory();
+ RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+ Document doc = new Document();
+ doc.add(newTextField("field", "x y z", Field.Store.NO));
+ w.addDocument(doc);
+
+ IndexReader r = w.getReader();
+ assertTrue(q.rewrite(r) instanceof MatchNoDocsQuery);
+ IOUtils.close(w, r, dir);
+ }
+
+ public void testRewriteTerm() throws Exception {
+ TermAutomatonQuery q = new TermAutomatonQuery("field");
+ int initState = q.createState();
+ int s1 = q.createState();
+ q.addTransition(initState, s1, "foo");
+ q.setAccept(s1, true);
+ q.finish();
+
+ Directory dir = newDirectory();
+ RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+ Document doc = new Document();
+ doc.add(newTextField("field", "x y z", Field.Store.NO));
+ w.addDocument(doc);
+
+ IndexReader r = w.getReader();
+ Query rewrite = q.rewrite(r);
+ assertTrue(rewrite instanceof TermQuery);
+ assertEquals(new Term("field", "foo"), ((TermQuery) rewrite).getTerm());
+ IOUtils.close(w, r, dir);
+ }
+
+ public void testRewriteSimplePhrase() throws Exception {
+ TermAutomatonQuery q = new TermAutomatonQuery("field");
+ int initState = q.createState();
+ int s1 = q.createState();
+ int s2 = q.createState();
+ q.addTransition(initState, s1, "foo");
+ q.addTransition(s1, s2, "bar");
+ q.setAccept(s2, true);
+ q.finish();
+
+ Directory dir = newDirectory();
+ RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+ Document doc = new Document();
+ doc.add(newTextField("field", "x y z", Field.Store.NO));
+ w.addDocument(doc);
+
+ IndexReader r = w.getReader();
+ Query rewrite = q.rewrite(r);
+ assertTrue(rewrite instanceof PhraseQuery);
+ Term[] terms = ((PhraseQuery) rewrite).getTerms();
+ assertEquals(new Term("field", "foo"), terms[0]);
+ assertEquals(new Term("field", "bar"), terms[1]);
+
+ int[] positions = ((PhraseQuery) rewrite).getPositions();
+ assertEquals(0, positions[0]);
+ assertEquals(1, positions[1]);
+
+ IOUtils.close(w, r, dir);
+ }
+
+ public void testRewritePhraseWithAny() throws Exception {
+ TermAutomatonQuery q = new TermAutomatonQuery("field");
+ int initState = q.createState();
+ int s1 = q.createState();
+ int s2 = q.createState();
+ int s3 = q.createState();
+ q.addTransition(initState, s1, "foo");
+ q.addAnyTransition(s1, s2);
+ q.addTransition(s2, s3, "bar");
+ q.setAccept(s3, true);
+ q.finish();
+
+ Directory dir = newDirectory();
+ RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+ Document doc = new Document();
+ doc.add(newTextField("field", "x y z", Field.Store.NO));
+ w.addDocument(doc);
+
+ IndexReader r = w.getReader();
+ Query rewrite = q.rewrite(r);
+ assertTrue(rewrite instanceof PhraseQuery);
+ Term[] terms = ((PhraseQuery) rewrite).getTerms();
+ assertEquals(new Term("field", "foo"), terms[0]);
+ assertEquals(new Term("field", "bar"), terms[1]);
+
+ int[] positions = ((PhraseQuery) rewrite).getPositions();
+ assertEquals(0, positions[0]);
+ assertEquals(2, positions[1]);
+
+ IOUtils.close(w, r, dir);
+ }
+
+ public void testRewriteSimpleMultiPhrase() throws Exception {
+ TermAutomatonQuery q = new TermAutomatonQuery("field");
+ int initState = q.createState();
+ int s1 = q.createState();
+ q.addTransition(initState, s1, "foo");
+ q.addTransition(initState, s1, "bar");
+ q.setAccept(s1, true);
+ q.finish();
+
+ Directory dir = newDirectory();
+ RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+ Document doc = new Document();
+ doc.add(newTextField("field", "x y z", Field.Store.NO));
+ w.addDocument(doc);
+
+ IndexReader r = w.getReader();
+ Query rewrite = q.rewrite(r);
+ assertTrue(rewrite instanceof MultiPhraseQuery);
+ Term[][] terms = ((MultiPhraseQuery) rewrite).getTermArrays();
+ assertEquals(1, terms.length);
+ assertEquals(2, terms[0].length);
+ assertEquals(new Term("field", "foo"), terms[0][0]);
+ assertEquals(new Term("field", "bar"), terms[0][1]);
+
+ int[] positions = ((MultiPhraseQuery) rewrite).getPositions();
+ assertEquals(1, positions.length);
+ assertEquals(0, positions[0]);
+
+ IOUtils.close(w, r, dir);
+ }
+
+ public void testRewriteMultiPhraseWithAny() throws Exception {
+ TermAutomatonQuery q = new TermAutomatonQuery("field");
+ int initState = q.createState();
+ int s1 = q.createState();
+ int s2 = q.createState();
+ int s3 = q.createState();
+ q.addTransition(initState, s1, "foo");
+ q.addTransition(initState, s1, "bar");
+ q.addAnyTransition(s1, s2);
+ q.addTransition(s2, s3, "baz");
+ q.setAccept(s3, true);
+ q.finish();
+
+ Directory dir = newDirectory();
+ RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+ Document doc = new Document();
+ doc.add(newTextField("field", "x y z", Field.Store.NO));
+ w.addDocument(doc);
+
+ IndexReader r = w.getReader();
+ Query rewrite = q.rewrite(r);
+ assertTrue(rewrite instanceof MultiPhraseQuery);
+ Term[][] terms = ((MultiPhraseQuery) rewrite).getTermArrays();
+ assertEquals(2, terms.length);
+ assertEquals(2, terms[0].length);
+ assertEquals(new Term("field", "foo"), terms[0][0]);
+ assertEquals(new Term("field", "bar"), terms[0][1]);
+ assertEquals(1, terms[1].length);
+ assertEquals(new Term("field", "baz"), terms[1][0]);
+
+ int[] positions = ((MultiPhraseQuery) rewrite).getPositions();
+ assertEquals(2, positions.length);
+ assertEquals(0, positions[0]);
+ assertEquals(2, positions[1]);
+
+ IOUtils.close(w, r, dir);
}
}