You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2015/03/27 09:22:55 UTC
svn commit: r1669522 - in /lucene/dev/trunk/lucene: ./
core/src/java/org/apache/lucene/search/ core/src/java/org/apache/lucene/util/
core/src/java/org/apache/lucene/util/automaton/
core/src/test/org/apache/lucene/index/ core/src/test/org/apache/lucene/...
Author: mikemccand
Date: Fri Mar 27 08:22:54 2015
New Revision: 1669522
URL: http://svn.apache.org/r1669522
Log:
LUCENE-6367: PrefixQuery now subclasses AutomatonQuery
Removed:
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/PrefixTermsEnum.java
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/PrefixQuery.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/StringHelper.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestPrefixQuery.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestWildcard.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java
lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/index/BasePostingsFormatTestCase.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1669522&r1=1669521&r2=1669522&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Fri Mar 27 08:22:54 2015
@@ -244,6 +244,11 @@ Changes in Runtime Behavior
* LUCENE-6298: SimpleQueryParser returns an empty query rather than
null, if e.g. the terms were all stopwords. (Lee Hinman via Robert Muir)
+* LUCENE-6367: PrefixQuery now subclasses AutomatonQuery, removing the
+ specialized PrefixTermsEnum. PrefixQuery now operates in binary
+ term space, meaning any binary term (not just valid UTF-8 terms)
+ is accepted. (Robert Muir, Mike McCandless)
+
======================= Lucene 5.0.0 =======================
New Features
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java?rev=1669522&r1=1669521&r2=1669522&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java Fri Mar 27 08:22:54 2015
@@ -78,10 +78,28 @@ public class AutomatonQuery extends Mult
* space but can process more complex automata.
*/
public AutomatonQuery(final Term term, Automaton automaton, int maxDeterminizedStates) {
+ this(term, automaton, maxDeterminizedStates, false);
+ }
+
+ /**
+ * Create a new AutomatonQuery from an {@link Automaton}.
+ *
+ * @param term Term containing field and possibly some pattern structure. The
+ * term text is ignored.
+ * @param automaton Automaton to run, terms that are accepted are considered a
+ * match.
+ * @param maxDeterminizedStates maximum number of states in the resulting
+ * automata. If the automata would need more than this many states
+ * TooComplexToDeterminizeException is thrown. Higher numbers require more
+ * space but can process more complex automata.
+ * @param isBinary if true, this automaton is already binary and
+ * will not go through the UTF32ToUTF8 conversion
+ */
+ public AutomatonQuery(final Term term, Automaton automaton, int maxDeterminizedStates, boolean isBinary) {
super(term.field());
this.term = term;
this.automaton = automaton;
- this.compiled = new CompiledAutomaton(automaton, null, true, maxDeterminizedStates);
+ this.compiled = new CompiledAutomaton(automaton, null, true, maxDeterminizedStates, isBinary);
}
@Override
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/PrefixQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/PrefixQuery.java?rev=1669522&r1=1669521&r2=1669522&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/PrefixQuery.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/PrefixQuery.java Fri Mar 27 08:22:54 2015
@@ -19,11 +19,13 @@ package org.apache.lucene.search;
import java.io.IOException;
-import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.ToStringUtils;
+import org.apache.lucene.util.automaton.Automaton;
/** A Query that matches documents containing terms with a specified prefix. A PrefixQuery
* is built by QueryParser for input like <code>app*</code>.
@@ -31,29 +33,38 @@ import org.apache.lucene.util.ToStringUt
* <p>This query uses the {@link
* MultiTermQuery#CONSTANT_SCORE_REWRITE}
* rewrite method. */
-public class PrefixQuery extends MultiTermQuery {
- private Term prefix;
+public class PrefixQuery extends AutomatonQuery {
/** Constructs a query for terms starting with <code>prefix</code>. */
public PrefixQuery(Term prefix) {
- super(prefix.field());
- this.prefix = prefix;
+ // It's OK to pass unlimited maxDeterminizedStates: the automaton is born small and determinized:
+ super(prefix, toAutomaton(prefix.bytes()), Integer.MAX_VALUE, true);
+ if (prefix == null) {
+ throw new NullPointerException("prefix cannot be null");
+ }
}
- /** Returns the prefix of this query. */
- public Term getPrefix() { return prefix; }
-
- @Override
- protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
- TermsEnum tenum = terms.iterator(null);
-
- if (prefix.bytes().length == 0) {
- // no prefix -- match all terms for this field:
- return tenum;
+ /** Build an automaton accepting all terms with the specified prefix. */
+ public static Automaton toAutomaton(BytesRef prefix) {
+ Automaton automaton = new Automaton();
+ int lastState = automaton.createState();
+ for(int i=0;i<prefix.length;i++) {
+ int state = automaton.createState();
+ automaton.addTransition(lastState, state, prefix.bytes[prefix.offset+i]&0xff);
+ lastState = state;
}
- return new PrefixTermsEnum(tenum, prefix.bytes());
+ automaton.setAccept(lastState, true);
+ automaton.addTransition(lastState, lastState, 0, 255);
+ automaton.finishState();
+ assert automaton.isDeterministic();
+ return automaton;
}
+ /** Returns the prefix of this query. */
+ public Term getPrefix() {
+ return term;
+ }
+
/** Prints a user-readable version of this query. */
@Override
public String toString(String field) {
@@ -62,7 +73,7 @@ public class PrefixQuery extends MultiTe
buffer.append(getField());
buffer.append(":");
}
- buffer.append(prefix.text());
+ buffer.append(term.text());
buffer.append('*');
buffer.append(ToStringUtils.boost(getBoost()));
return buffer.toString();
@@ -72,25 +83,23 @@ public class PrefixQuery extends MultiTe
public int hashCode() {
final int prime = 31;
int result = super.hashCode();
- result = prime * result + ((prefix == null) ? 0 : prefix.hashCode());
+ result = prime * result + term.hashCode();
return result;
}
@Override
public boolean equals(Object obj) {
- if (this == obj)
+ if (this == obj) {
return true;
- if (!super.equals(obj))
- return false;
- if (getClass() != obj.getClass())
+ }
+ if (!super.equals(obj)) {
return false;
+ }
+ // super.equals() ensures we are the same class
PrefixQuery other = (PrefixQuery) obj;
- if (prefix == null) {
- if (other.prefix != null)
- return false;
- } else if (!prefix.equals(other.prefix))
+ if (!term.equals(other.term)) {
return false;
+ }
return true;
}
-
}
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/StringHelper.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/StringHelper.java?rev=1669522&r1=1669521&r2=1669522&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/StringHelper.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/StringHelper.java Fri Mar 27 08:22:54 2015
@@ -393,4 +393,20 @@ public abstract class StringHelper {
return sb.toString();
}
}
+
+ /** Just converts each int in the incoming {@link IntsRef} to each byte
+ * in the returned {@link BytesRef}, throwing {@code IllegalArgumentException}
+ * if any int value is out of bounds for a byte. */
+ public static BytesRef intsRefToBytesRef(IntsRef ints) {
+ byte[] bytes = new byte[ints.length];
+ for(int i=0;i<ints.length;i++) {
+ int x = ints.ints[ints.offset+i];
+ if (x < 0 || x > 255) {
+ throw new IllegalArgumentException("int at pos=" + i + " with value=" + x + " is out-of-bounds for byte");
+ }
+ bytes[i] = (byte) x;
+ }
+
+ return new BytesRef(bytes);
+ }
}
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java?rev=1669522&r1=1669521&r2=1669522&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java Fri Mar 27 08:22:54 2015
@@ -347,6 +347,7 @@ public class Automaton implements Accoun
/** How many transitions this state has. */
public int getNumTransitions(int state) {
+ assert state >= 0;
int count = states[2*state+1];
if (count == -1) {
return 0;
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java?rev=1669522&r1=1669521&r2=1669522&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java Fri Mar 27 08:22:54 2015
@@ -24,9 +24,11 @@ import java.util.List;
import org.apache.lucene.index.SingleTermsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.search.PrefixTermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.StringHelper;
+import org.apache.lucene.util.UnicodeUtil;
/**
* Immutable class holding compiled details for a given
@@ -47,8 +49,6 @@ public class CompiledAutomaton {
ALL,
/** Automaton that accepts only a single fixed string. */
SINGLE,
- /** Automaton that matches all Strings with a constant prefix. */
- PREFIX,
/** Catch-all for any other automata. */
NORMAL
};
@@ -57,8 +57,7 @@ public class CompiledAutomaton {
public final AUTOMATON_TYPE type;
/**
- * For {@link AUTOMATON_TYPE#PREFIX}, this is the prefix term;
- * for {@link AUTOMATON_TYPE#SINGLE} this is the singleton term.
+ * For {@link AUTOMATON_TYPE#SINGLE} this is the singleton term.
*/
public final BytesRef term;
@@ -101,7 +100,7 @@ public class CompiledAutomaton {
* possibly expensive operations to determine if the automaton is one
* the cases in {@link CompiledAutomaton.AUTOMATON_TYPE}. */
public CompiledAutomaton(Automaton automaton, Boolean finite, boolean simplify) {
- this(automaton, finite, simplify, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
+ this(automaton, finite, simplify, Operations.DEFAULT_MAX_DETERMINIZED_STATES, false);
}
@@ -114,7 +113,7 @@ public class CompiledAutomaton {
* TooComplexToDeterminizeException.
*/
public CompiledAutomaton(Automaton automaton, Boolean finite, boolean simplify,
- int maxDeterminizedStates) {
+ int maxDeterminizedStates, boolean isBinary) {
if (automaton.getNumStates() == 0) {
automaton = new Automaton();
automaton.createState();
@@ -135,8 +134,18 @@ public class CompiledAutomaton {
this.automaton = null;
this.finite = null;
return;
+ }
+
+ boolean isTotal;
+
// NOTE: only approximate, because automaton may not be minimal:
- } else if (Operations.isTotal(automaton)) {
+ if (isBinary) {
+ isTotal = Operations.isTotal(automaton, 0, 0xff);
+ } else {
+ isTotal = Operations.isTotal(automaton);
+ }
+
+ if (isTotal) {
// matches all possible strings
type = AUTOMATON_TYPE.ALL;
term = null;
@@ -145,43 +154,27 @@ public class CompiledAutomaton {
this.automaton = null;
this.finite = null;
return;
- } else {
+ }
- automaton = Operations.determinize(automaton, maxDeterminizedStates);
+ automaton = Operations.determinize(automaton, maxDeterminizedStates);
- final String commonPrefix = Operations.getCommonPrefix(automaton);
- final String singleton;
+ IntsRef singleton = Operations.getSingleton(automaton);
+
+ if (singleton != null) {
+ // matches a fixed string
+ type = AUTOMATON_TYPE.SINGLE;
+ commonSuffixRef = null;
+ runAutomaton = null;
+ this.automaton = null;
+ this.finite = null;
- if (commonPrefix.length() > 0 && Operations.sameLanguage(automaton, Automata.makeString(commonPrefix))) {
- singleton = commonPrefix;
+ if (isBinary) {
+ term = StringHelper.intsRefToBytesRef(singleton);
} else {
- singleton = null;
+ term = new BytesRef(UnicodeUtil.newString(singleton.ints, singleton.offset, singleton.length));
}
- if (singleton != null) {
- // matches a fixed string
- type = AUTOMATON_TYPE.SINGLE;
- term = new BytesRef(singleton);
- commonSuffixRef = null;
- runAutomaton = null;
- this.automaton = null;
- this.finite = null;
- return;
- } else if (commonPrefix.length() > 0) {
- Automaton other = Operations.concatenate(Automata.makeString(commonPrefix), Automata.makeAnyString());
- other = Operations.determinize(other, maxDeterminizedStates);
- assert Operations.hasDeadStates(other) == false;
- if (Operations.sameLanguage(automaton, other)) {
- // matches a constant prefix
- type = AUTOMATON_TYPE.PREFIX;
- term = new BytesRef(commonPrefix);
- commonSuffixRef = null;
- runAutomaton = null;
- this.automaton = null;
- this.finite = null;
- return;
- }
- }
+ return;
}
}
@@ -194,14 +187,26 @@ public class CompiledAutomaton {
this.finite = finite;
}
- Automaton utf8 = new UTF32ToUTF8().convert(automaton);
+ Automaton binary;
+ if (isBinary) {
+ // Caller already built binary automaton themselves, e.g. PrefixQuery
+ // does this since it can be provided with a binary (not necessarily
+ // UTF8!) term:
+ binary = automaton;
+ } else {
+ // Incoming automaton is unicode, and we must convert to UTF8 to match what's in the index:
+ binary = new UTF32ToUTF8().convert(automaton);
+ }
+
if (this.finite) {
commonSuffixRef = null;
} else {
// NOTE: this is a very costly operation! We should test if it's really warranted in practice...
- commonSuffixRef = Operations.getCommonSuffixBytesRef(utf8, maxDeterminizedStates);
+ commonSuffixRef = Operations.getCommonSuffixBytesRef(binary, maxDeterminizedStates);
}
- runAutomaton = new ByteRunAutomaton(utf8, true, maxDeterminizedStates);
+
+ // This will determinize the binary automaton for us:
+ runAutomaton = new ByteRunAutomaton(binary, true, maxDeterminizedStates);
this.automaton = runAutomaton.automaton;
}
@@ -285,10 +290,6 @@ public class CompiledAutomaton {
return terms.iterator(null);
case SINGLE:
return new SingleTermsEnum(terms.iterator(null), term);
- case PREFIX:
- // TODO: this is very likely faster than .intersect,
- // but we should test and maybe cutover
- return new PrefixTermsEnum(terms.iterator(null), term);
case NORMAL:
return terms.intersect(this, null);
default:
@@ -410,7 +411,7 @@ public class CompiledAutomaton {
if (getClass() != obj.getClass()) return false;
CompiledAutomaton other = (CompiledAutomaton) obj;
if (type != other.type) return false;
- if (type == AUTOMATON_TYPE.SINGLE || type == AUTOMATON_TYPE.PREFIX) {
+ if (type == AUTOMATON_TYPE.SINGLE) {
if (!term.equals(other.term)) return false;
} else if (type == AUTOMATON_TYPE.NORMAL) {
if (!runAutomaton.equals(other.runAutomaton)) return false;
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java?rev=1669522&r1=1669521&r2=1669522&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java Fri Mar 27 08:22:54 2015
@@ -834,11 +834,20 @@ final public class Operations {
* Returns true if the given automaton accepts all strings. The automaton must be minimized.
*/
public static boolean isTotal(Automaton a) {
+ return isTotal(a, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT);
+ }
+
+ /**
+ * Returns true if the given automaton accepts all strings for the specified min/max
+ * range of the alphabet. The automaton must be minimized.
+ */
+ public static boolean isTotal(Automaton a, int minAlphabet, int maxAlphabet) {
if (a.isAccept(0) && a.getNumTransitions(0) == 1) {
Transition t = new Transition();
a.getTransition(0, 0, t);
- return t.dest == 0 && t.min == Character.MIN_CODE_POINT
- && t.max == Character.MAX_CODE_POINT;
+ return t.dest == 0
+ && t.min == minAlphabet
+ && t.max == maxAlphabet;
}
return false;
}
@@ -1112,6 +1121,37 @@ final public class Operations {
return builder.get();
}
+ /** If this automaton accepts a single input, return it. Else, return null.
+ * The automaton must be deterministic. */
+ public static IntsRef getSingleton(Automaton a) {
+ if (a.isDeterministic() == false) {
+ throw new IllegalArgumentException("input automaton must be deterministic");
+ }
+ IntsRefBuilder builder = new IntsRefBuilder();
+ HashSet<Integer> visited = new HashSet<>();
+ int s = 0;
+ boolean done;
+ Transition t = new Transition();
+ while (true) {
+ visited.add(s);
+ if (a.isAccept(s) == false) {
+ if (a.getNumTransitions(s) == 1) {
+ a.getTransition(s, 0, t);
+ if (t.min == t.max && !visited.contains(t.dest)) {
+ builder.append(t.min);
+ s = t.dest;
+ continue;
+ }
+ }
+ } else if (a.getNumTransitions(s) == 0) {
+ return builder.get();
+ }
+
+ // Automaton accepts more than one string:
+ return null;
+ }
+ }
+
/**
* Returns the longest BytesRef that is a suffix of all accepted strings.
* Worst case complexity: exponential in number of states (this calls
Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java?rev=1669522&r1=1669521&r2=1669522&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java Fri Mar 27 08:22:54 2015
@@ -260,7 +260,7 @@ public class TestTermsEnum extends Lucen
a = Automata.makeStringUnion(sortedAcceptTerms);
}
- final CompiledAutomaton c = new CompiledAutomaton(a, true, false, 1000000);
+ final CompiledAutomaton c = new CompiledAutomaton(a, true, false, 1000000, false);
final BytesRef[] acceptTermsArray = new BytesRef[acceptTerms.size()];
final Set<BytesRef> acceptTermsSet = new HashSet<>();
Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java?rev=1669522&r1=1669521&r2=1669522&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java Fri Mar 27 08:22:54 2015
@@ -195,7 +195,6 @@ public class TestAutomatonQuery extends
Automaton prefixAutomaton = Operations.concatenate(pfx, Automata.makeAnyString());
AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), prefixAutomaton);
Terms terms = MultiFields.getTerms(searcher.getIndexReader(), FN);
- assertTrue(aq.getTermsEnum(terms) instanceof PrefixTermsEnum);
assertEquals(3, automatonQueryNrHits(aq));
}
Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestPrefixQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestPrefixQuery.java?rev=1669522&r1=1669521&r2=1669522&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestPrefixQuery.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestPrefixQuery.java Fri Mar 27 08:22:54 2015
@@ -17,15 +17,29 @@ package org.apache.lucene.search;
* limitations under the License.
*/
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
-import org.apache.lucene.document.Document;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.StringHelper;
+import org.apache.lucene.util.TestUtil;
/**
* Tests {@link PrefixQuery} class.
@@ -57,11 +71,145 @@ public class TestPrefixQuery extends Luc
query = new PrefixQuery(new Term("category", ""));
Terms terms = MultiFields.getTerms(searcher.getIndexReader(), "category");
- assertFalse(query.getTermsEnum(terms) instanceof PrefixTermsEnum);
hits = searcher.search(query, 1000).scoreDocs;
assertEquals("everything", 3, hits.length);
writer.close();
reader.close();
directory.close();
}
+
+ public void testMatchAll() throws Exception {
+ Directory directory = newDirectory();
+
+ RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
+ Document doc = new Document();
+ doc.add(newStringField("field", "field", Field.Store.YES));
+ writer.addDocument(doc);
+
+ IndexReader reader = writer.getReader();
+
+ PrefixQuery query = new PrefixQuery(new Term("field", ""));
+ IndexSearcher searcher = newSearcher(reader);
+
+ assertEquals(1, searcher.search(query, 1000).totalHits);
+
+ Terms terms = MultiFields.getTerms(searcher.getIndexReader(), "field");
+ writer.close();
+ reader.close();
+ directory.close();
+ }
+
+ static final class BinaryTokenStream extends TokenStream {
+ private final ByteTermAttribute bytesAtt = addAttribute(ByteTermAttribute.class);
+ private boolean available = true;
+
+ public BinaryTokenStream(BytesRef bytes) {
+ bytesAtt.setBytesRef(bytes);
+ }
+
+ @Override
+ public boolean incrementToken() {
+ if (available) {
+ clearAttributes();
+ available = false;
+ return true;
+ }
+ return false;
+ }
+
+ @Override
+ public void reset() {
+ available = true;
+ }
+
+ public interface ByteTermAttribute extends TermToBytesRefAttribute {
+ public void setBytesRef(BytesRef bytes);
+ }
+
+ public static class ByteTermAttributeImpl extends AttributeImpl implements ByteTermAttribute,TermToBytesRefAttribute {
+ private BytesRef bytes;
+
+ @Override
+ public void fillBytesRef() {
+ // no-op: the bytes were already filled by our owner's incrementToken
+ }
+
+ @Override
+ public BytesRef getBytesRef() {
+ return bytes;
+ }
+
+ @Override
+ public void setBytesRef(BytesRef bytes) {
+ this.bytes = bytes;
+ }
+
+ @Override
+ public void clear() {}
+
+ @Override
+ public void copyTo(AttributeImpl target) {
+ ByteTermAttributeImpl other = (ByteTermAttributeImpl) target;
+ other.bytes = bytes;
+ }
+ }
+ }
+
+ /** Basically a StringField that accepts binary term. */
+ private static class BinaryField extends Field {
+
+ final static FieldType TYPE;
+ static {
+ TYPE = new FieldType(StringField.TYPE_NOT_STORED);
+ // Necessary so our custom tokenStream is used by Field.tokenStream:
+ TYPE.setTokenized(true);
+ TYPE.freeze();
+ }
+
+ public BinaryField(String name, BytesRef value) {
+ super(name, new BinaryTokenStream(value), TYPE);
+ }
+ }
+
+ public void testRandomBinaryPrefix() throws Exception {
+ Directory dir = newDirectory();
+ RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+
+ int numTerms = atLeast(10000);
+ Set<BytesRef> terms = new HashSet<>();
+ while (terms.size() < numTerms) {
+ byte[] bytes = new byte[TestUtil.nextInt(random(), 1, 10)];
+ random().nextBytes(bytes);
+ terms.add(new BytesRef(bytes));
+ }
+
+ List<BytesRef> termsList = new ArrayList<>(terms);
+ Collections.shuffle(termsList, random());
+ for(BytesRef term : termsList) {
+ Document doc = new Document();
+ doc.add(new BinaryField("field", term));
+ w.addDocument(doc);
+ }
+
+ IndexReader r = w.getReader();
+ IndexSearcher s = newSearcher(r);
+
+ int iters = atLeast(100);
+ for(int iter=0;iter<iters;iter++) {
+ byte[] bytes = new byte[random().nextInt(3)];
+ random().nextBytes(bytes);
+ BytesRef prefix = new BytesRef(bytes);
+ PrefixQuery q = new PrefixQuery(new Term("field", prefix));
+ int count = 0;
+ for(BytesRef term : termsList) {
+ if (StringHelper.startsWith(term, prefix)) {
+ count++;
+ }
+ }
+ assertEquals(count, s.search(q, 1).totalHits);
+ }
+ r.close();
+ w.close();
+ dir.close();
+ }
}
Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestWildcard.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestWildcard.java?rev=1669522&r1=1669521&r2=1669522&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestWildcard.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/search/TestWildcard.java Fri Mar 27 08:22:54 2015
@@ -127,11 +127,9 @@ public class TestWildcard
MultiTermQuery wq = new WildcardQuery(new Term("field", "prefix*"));
assertMatches(searcher, wq, 2);
Terms terms = MultiFields.getTerms(searcher.getIndexReader(), "field");
- assertTrue(wq.getTermsEnum(terms) instanceof PrefixTermsEnum);
wq = new WildcardQuery(new Term("field", "*"));
assertMatches(searcher, wq, 2);
- assertFalse(wq.getTermsEnum(terms) instanceof PrefixTermsEnum);
assertFalse(wq.getTermsEnum(terms).getClass().getSimpleName().contains("AutomatonTermsEnum"));
reader.close();
indexStore.close();
Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java?rev=1669522&r1=1669521&r2=1669522&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java Fri Mar 27 08:22:54 2015
@@ -1104,4 +1104,51 @@ public class TestAutomaton extends Lucen
throw ae;
}
}
+
+ private static IntsRef toIntsRef(String s) {
+ IntsRefBuilder b = new IntsRefBuilder();
+ for (int i = 0, cp = 0; i < s.length(); i += Character.charCount(cp)) {
+ cp = s.codePointAt(i);
+ b.append(cp);
+ }
+
+ return b.get();
+ }
+
+ public void testGetSingleton() {
+ int iters = atLeast(10000);
+ for(int iter=0;iter<iters;iter++) {
+ String s = TestUtil.randomRealisticUnicodeString(random());
+ Automaton a = Automata.makeString(s);
+ assertEquals(toIntsRef(s), Operations.getSingleton(a));
+ }
+ }
+
+ public void testGetSingletonEmptyString() {
+ Automaton a = new Automaton();
+ int s = a.createState();
+ a.setAccept(s, true);
+ a.finishState();
+ assertEquals(new IntsRef(), Operations.getSingleton(a));
+ }
+
+ public void testGetSingletonNothing() {
+ Automaton a = new Automaton();
+ a.createState();
+ a.finishState();
+ assertNull(Operations.getSingleton(a));
+ }
+
+ public void testGetSingletonTwo() {
+ Automaton a = new Automaton();
+ int s = a.createState();
+ int x = a.createState();
+ a.setAccept(x, true);
+ a.addTransition(s, x, 55);
+ int y = a.createState();
+ a.setAccept(y, true);
+ a.addTransition(s, y, 58);
+ a.finishState();
+ assertNull(Operations.getSingleton(a));
+ }
}
Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java?rev=1669522&r1=1669521&r2=1669522&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java Fri Mar 27 08:22:54 2015
@@ -38,7 +38,7 @@ public class TestCompiledAutomaton exten
}
Collections.sort(terms);
final Automaton a = DaciukMihovAutomatonBuilder.build(terms);
- return new CompiledAutomaton(a, true, false, maxDeterminizedStates);
+ return new CompiledAutomaton(a, true, false, maxDeterminizedStates, false);
}
private void testFloor(CompiledAutomaton c, String input, String expected) {
@@ -121,4 +121,43 @@ public class TestCompiledAutomaton exten
testFloor(c, "aa", null);
testFloor(c, "zzz", "goo");
}
+
+ // LUCENE-6367
+ public void testBinaryAll() throws Exception {
+ Automaton a = new Automaton();
+ int state = a.createState();
+ a.setAccept(state, true);
+ a.addTransition(state, state, 0, 0xff);
+ a.finishState();
+
+ CompiledAutomaton ca = new CompiledAutomaton(a, null, true, Integer.MAX_VALUE, true);
+ assertEquals(CompiledAutomaton.AUTOMATON_TYPE.ALL, ca.type);
+ }
+
+ // LUCENE-6367
+ public void testUnicodeAll() throws Exception {
+ Automaton a = new Automaton();
+ int state = a.createState();
+ a.setAccept(state, true);
+ a.addTransition(state, state, 0, Character.MAX_CODE_POINT);
+ a.finishState();
+
+ CompiledAutomaton ca = new CompiledAutomaton(a, null, true, Integer.MAX_VALUE, false);
+ assertEquals(CompiledAutomaton.AUTOMATON_TYPE.ALL, ca.type);
+ }
+
+ // LUCENE-6367
+ public void testBinarySingleton() throws Exception {
+ // This is just ascii so we can pretend it's binary:
+ Automaton a = Automata.makeString("foobar");
+ CompiledAutomaton ca = new CompiledAutomaton(a, null, true, Integer.MAX_VALUE, true);
+ assertEquals(CompiledAutomaton.AUTOMATON_TYPE.SINGLE, ca.type);
+ }
+
+ // LUCENE-6367
+ public void testUnicodeSingleton() throws Exception {
+ Automaton a = Automata.makeString(TestUtil.randomRealisticUnicodeString(random()));
+ CompiledAutomaton ca = new CompiledAutomaton(a, null, true, Integer.MAX_VALUE, false);
+ assertEquals(CompiledAutomaton.AUTOMATON_TYPE.SINGLE, ca.type);
+ }
}
Modified: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/index/BasePostingsFormatTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/index/BasePostingsFormatTestCase.java?rev=1669522&r1=1669521&r2=1669522&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/index/BasePostingsFormatTestCase.java (original)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/index/BasePostingsFormatTestCase.java Fri Mar 27 08:22:54 2015
@@ -1238,7 +1238,7 @@ public abstract class BasePostingsFormat
for(String field : fields.keySet()) {
while (true) {
Automaton a = AutomatonTestUtil.randomAutomaton(random());
- CompiledAutomaton ca = new CompiledAutomaton(a, null, true, Integer.MAX_VALUE);
+ CompiledAutomaton ca = new CompiledAutomaton(a, null, true, Integer.MAX_VALUE, false);
if (ca.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
// Keep retrying until we get an A that will really "use" the PF's intersect code:
continue;