You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2015/03/27 09:43:50 UTC

svn commit: r1669526 - in /lucene/dev/branches/branch_5x: ./ lucene/ lucene/core/ lucene/core/src/java/org/apache/lucene/search/ lucene/core/src/java/org/apache/lucene/util/ lucene/core/src/java/org/apache/lucene/util/automaton/ lucene/core/src/test/or...

Author: mikemccand
Date: Fri Mar 27 08:43:49 2015
New Revision: 1669526

URL: http://svn.apache.org/r1669526
Log:
LUCENE-6367: PrefixQuery now subclasses AutomatonQuery

Removed:
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/PrefixTermsEnum.java
Modified:
    lucene/dev/branches/branch_5x/   (props changed)
    lucene/dev/branches/branch_5x/lucene/   (props changed)
    lucene/dev/branches/branch_5x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_5x/lucene/core/   (props changed)
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/PrefixQuery.java
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/StringHelper.java
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java
    lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java
    lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java
    lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestPrefixQuery.java
    lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestWildcard.java
    lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java
    lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java
    lucene/dev/branches/branch_5x/lucene/test-framework/   (props changed)
    lucene/dev/branches/branch_5x/lucene/test-framework/src/java/org/apache/lucene/index/BasePostingsFormatTestCase.java

Modified: lucene/dev/branches/branch_5x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/CHANGES.txt?rev=1669526&r1=1669525&r2=1669526&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/lucene/CHANGES.txt Fri Mar 27 08:43:49 2015
@@ -212,6 +212,11 @@ Changes in Runtime Behavior
 * LUCENE-6298: SimpleQueryParser returns an empty query rather than
   null, if e.g. the terms were all stopwords. (Lee Hinman via Robert Muir)
 
+* LUCENE-6367: PrefixQuery now subclasses AutomatonQuery, removing the
+  specialized PrefixTermsEnum.  PrefixQuery now operates in binary
+  term space, meaning any binary term (not just valid UTF-8 terms)
+  are accepted.  (Robert Muir, Mike McCandless)
+
 ======================= Lucene 5.0.0 =======================
 
 New Features

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java?rev=1669526&r1=1669525&r2=1669526&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java Fri Mar 27 08:43:49 2015
@@ -78,10 +78,28 @@ public class AutomatonQuery extends Mult
    *   space but can process more complex automata.
    */
   public AutomatonQuery(final Term term, Automaton automaton, int maxDeterminizedStates) {
+    this(term, automaton, maxDeterminizedStates, false);
+  }
+
+  /**
+   * Create a new AutomatonQuery from an {@link Automaton}.
+   * 
+   * @param term Term containing field and possibly some pattern structure. The
+   *        term text is ignored.
+   * @param automaton Automaton to run, terms that are accepted are considered a
+   *        match.
+   * @param maxDeterminizedStates maximum number of states in the resulting
+   *   automata.  If the automata would need more than this many states
+   *   TooComplextToDeterminizeException is thrown.  Higher number require more
+   *   space but can process more complex automata.
+   * @param isBinary if true, this automaton is already binary and
+   *   will not go through the UTF32ToUTF8 conversion
+   */
+  public AutomatonQuery(final Term term, Automaton automaton, int maxDeterminizedStates, boolean isBinary) {
     super(term.field());
     this.term = term;
     this.automaton = automaton;
-    this.compiled = new CompiledAutomaton(automaton, null, true, maxDeterminizedStates);
+    this.compiled = new CompiledAutomaton(automaton, null, true, maxDeterminizedStates, isBinary);
   }
 
   @Override

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/PrefixQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/PrefixQuery.java?rev=1669526&r1=1669525&r2=1669526&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/PrefixQuery.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/search/PrefixQuery.java Fri Mar 27 08:43:49 2015
@@ -19,11 +19,13 @@ package org.apache.lucene.search;
 
 import java.io.IOException;
 
-import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.ToStringUtils;
+import org.apache.lucene.util.automaton.Automaton;
 
 /** A Query that matches documents containing terms with a specified prefix. A PrefixQuery
  * is built by QueryParser for input like <code>app*</code>.
@@ -31,29 +33,38 @@ import org.apache.lucene.util.ToStringUt
  * <p>This query uses the {@link
  * MultiTermQuery#CONSTANT_SCORE_REWRITE}
  * rewrite method. */
-public class PrefixQuery extends MultiTermQuery {
-  private Term prefix;
+public class PrefixQuery extends AutomatonQuery {
 
   /** Constructs a query for terms starting with <code>prefix</code>. */
   public PrefixQuery(Term prefix) {
-    super(prefix.field());
-    this.prefix = prefix;
+    // It's OK to pass unlimited maxDeterminizedStates: the automaton is born small and determinized:
+    super(prefix, toAutomaton(prefix.bytes()), Integer.MAX_VALUE, true);
+    if (prefix == null) {
+      throw new NullPointerException("prefix cannot be null");
+    }
   }
 
-  /** Returns the prefix of this query. */
-  public Term getPrefix() { return prefix; }
-  
-  @Override  
-  protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
-    TermsEnum tenum = terms.iterator(null);
-    
-    if (prefix.bytes().length == 0) {
-      // no prefix -- match all terms for this field:
-      return tenum;
+  /** Build an automaton accepting all terms with the specified prefix. */
+  public static Automaton toAutomaton(BytesRef prefix) {
+    Automaton automaton = new Automaton();
+    int lastState = automaton.createState();
+    for(int i=0;i<prefix.length;i++) {
+      int state = automaton.createState();
+      automaton.addTransition(lastState, state, prefix.bytes[prefix.offset+i]&0xff);
+      lastState = state;
     }
-    return new PrefixTermsEnum(tenum, prefix.bytes());
+    automaton.setAccept(lastState, true);
+    automaton.addTransition(lastState, lastState, 0, 255);
+    automaton.finishState();
+    assert automaton.isDeterministic();
+    return automaton;
   }
 
+  /** Returns the prefix of this query. */
+  public Term getPrefix() {
+    return term;
+  }
+  
   /** Prints a user-readable version of this query. */
   @Override
   public String toString(String field) {
@@ -62,7 +73,7 @@ public class PrefixQuery extends MultiTe
       buffer.append(getField());
       buffer.append(":");
     }
-    buffer.append(prefix.text());
+    buffer.append(term.text());
     buffer.append('*');
     buffer.append(ToStringUtils.boost(getBoost()));
     return buffer.toString();
@@ -72,25 +83,23 @@ public class PrefixQuery extends MultiTe
   public int hashCode() {
     final int prime = 31;
     int result = super.hashCode();
-    result = prime * result + ((prefix == null) ? 0 : prefix.hashCode());
+    result = prime * result + term.hashCode();
     return result;
   }
 
   @Override
   public boolean equals(Object obj) {
-    if (this == obj)
+    if (this == obj) {
       return true;
-    if (!super.equals(obj))
-      return false;
-    if (getClass() != obj.getClass())
+    }
+    if (!super.equals(obj)) {
       return false;
+    }
+    // super.equals() ensures we are the same class
     PrefixQuery other = (PrefixQuery) obj;
-    if (prefix == null) {
-      if (other.prefix != null)
-        return false;
-    } else if (!prefix.equals(other.prefix))
+    if (!term.equals(other.term)) {
       return false;
+    }
     return true;
   }
-
 }

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/StringHelper.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/StringHelper.java?rev=1669526&r1=1669525&r2=1669526&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/StringHelper.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/StringHelper.java Fri Mar 27 08:43:49 2015
@@ -393,4 +393,20 @@ public abstract class StringHelper {
       return sb.toString();
     }
   }
+  
+  /** Just converts each int in the incoming {@link IntsRef} to each byte
+   *  in the returned {@link BytesRef}, throwing {@code IllegalArgumentException}
+   *  if any int value is out of bounds for a byte. */
+  public static BytesRef intsRefToBytesRef(IntsRef ints) {
+    byte[] bytes = new byte[ints.length];
+    for(int i=0;i<ints.length;i++) {
+      int x = ints.ints[ints.offset+i];
+      if (x < 0 || x > 255) {
+        throw new IllegalArgumentException("int at pos=" + i + " with value=" + x + " is out-of-bounds for byte");
+      }
+      bytes[i] = (byte) x;
+    }
+
+    return new BytesRef(bytes);
+  }
 }

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java?rev=1669526&r1=1669525&r2=1669526&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java Fri Mar 27 08:43:49 2015
@@ -349,6 +349,7 @@ public class Automaton implements Accoun
   
   /** How many transitions this state has. */
   public int getNumTransitions(int state) {
+    assert state >= 0;
     int count = states[2*state+1];
     if (count == -1) {
       return 0;

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java?rev=1669526&r1=1669525&r2=1669526&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java Fri Mar 27 08:43:49 2015
@@ -24,9 +24,11 @@ import java.util.List;
 import org.apache.lucene.index.SingleTermsEnum;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.search.PrefixTermsEnum;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.StringHelper;
+import org.apache.lucene.util.UnicodeUtil;
 
 /**
  * Immutable class holding compiled details for a given
@@ -47,8 +49,6 @@ public class CompiledAutomaton {
     ALL, 
     /** Automaton that accepts only a single fixed string. */
     SINGLE, 
-    /** Automaton that matches all Strings with a constant prefix. */
-    PREFIX, 
     /** Catch-all for any other automata. */
     NORMAL
   };
@@ -57,8 +57,7 @@ public class CompiledAutomaton {
   public final AUTOMATON_TYPE type;
 
   /** 
-   * For {@link AUTOMATON_TYPE#PREFIX}, this is the prefix term; 
-   * for {@link AUTOMATON_TYPE#SINGLE} this is the singleton term.
+   * For {@link AUTOMATON_TYPE#SINGLE} this is the singleton term.
    */
   public final BytesRef term;
 
@@ -101,7 +100,7 @@ public class CompiledAutomaton {
    *  possibly expensive operations to determine if the automaton is one
    *  the cases in {@link CompiledAutomaton.AUTOMATON_TYPE}. */
   public CompiledAutomaton(Automaton automaton, Boolean finite, boolean simplify) {
-    this(automaton, finite, simplify, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
+    this(automaton, finite, simplify, Operations.DEFAULT_MAX_DETERMINIZED_STATES, false);
   }
 
 
@@ -114,7 +113,7 @@ public class CompiledAutomaton {
    *  TooComplexToDeterminizeException.
    */
   public CompiledAutomaton(Automaton automaton, Boolean finite, boolean simplify,
-      int maxDeterminizedStates) {
+                           int maxDeterminizedStates, boolean isBinary) {
     if (automaton.getNumStates() == 0) {
       automaton = new Automaton();
       automaton.createState();
@@ -135,8 +134,18 @@ public class CompiledAutomaton {
         this.automaton = null;
         this.finite = null;
         return;
+      }
+
+      boolean isTotal;
+
       // NOTE: only approximate, because automaton may not be minimal:
-      } else if (Operations.isTotal(automaton)) {
+      if (isBinary) {
+        isTotal = Operations.isTotal(automaton, 0, 0xff);
+      } else {
+        isTotal = Operations.isTotal(automaton);
+      }
+
+      if (isTotal) {
         // matches all possible strings
         type = AUTOMATON_TYPE.ALL;
         term = null;
@@ -145,43 +154,27 @@ public class CompiledAutomaton {
         this.automaton = null;
         this.finite = null;
         return;
-      } else {
+      }
 
-        automaton = Operations.determinize(automaton, maxDeterminizedStates);
+      automaton = Operations.determinize(automaton, maxDeterminizedStates);
 
-        final String commonPrefix = Operations.getCommonPrefix(automaton);
-        final String singleton;
+      IntsRef singleton = Operations.getSingleton(automaton);
+
+      if (singleton != null) {
+        // matches a fixed string
+        type = AUTOMATON_TYPE.SINGLE;
+        commonSuffixRef = null;
+        runAutomaton = null;
+        this.automaton = null;
+        this.finite = null;
 
-        if (commonPrefix.length() > 0 && Operations.sameLanguage(automaton, Automata.makeString(commonPrefix))) {
-          singleton = commonPrefix;
+        if (isBinary) {
+          term = StringHelper.intsRefToBytesRef(singleton);
         } else {
-          singleton = null;
+          term = new BytesRef(UnicodeUtil.newString(singleton.ints, singleton.offset, singleton.length));
         }
 
-        if (singleton != null) {
-          // matches a fixed string
-          type = AUTOMATON_TYPE.SINGLE;
-          term = new BytesRef(singleton);
-          commonSuffixRef = null;
-          runAutomaton = null;
-          this.automaton = null;
-          this.finite = null;
-          return;
-        } else if (commonPrefix.length() > 0) {
-          Automaton other = Operations.concatenate(Automata.makeString(commonPrefix), Automata.makeAnyString());
-          other = Operations.determinize(other, maxDeterminizedStates);
-          assert Operations.hasDeadStates(other) == false;
-          if (Operations.sameLanguage(automaton, other)) {
-            // matches a constant prefix
-            type = AUTOMATON_TYPE.PREFIX;
-            term = new BytesRef(commonPrefix);
-            commonSuffixRef = null;
-            runAutomaton = null;
-            this.automaton = null;
-            this.finite = null;
-            return;
-          }
-        }
+        return;
       }
     }
 
@@ -194,14 +187,26 @@ public class CompiledAutomaton {
       this.finite = finite;
     }
 
-    Automaton utf8 = new UTF32ToUTF8().convert(automaton);
+    Automaton binary;
+    if (isBinary) {
+      // Caller already built binary automaton themselves, e.g. PrefixQuery
+      // does this since it can be provided with a binary (not necessarily
+      // UTF8!) term:
+      binary = automaton;
+    } else {
+      // Incoming automaton is unicode, and we must convert to UTF8 to match what's in the index:
+      binary = new UTF32ToUTF8().convert(automaton);
+    }
+
     if (this.finite) {
       commonSuffixRef = null;
     } else {
       // NOTE: this is a very costly operation!  We should test if it's really warranted in practice...
-      commonSuffixRef = Operations.getCommonSuffixBytesRef(utf8, maxDeterminizedStates);
+      commonSuffixRef = Operations.getCommonSuffixBytesRef(binary, maxDeterminizedStates);
     }
-    runAutomaton = new ByteRunAutomaton(utf8, true, maxDeterminizedStates);
+
+    // This will determinize the binary automaton for us:
+    runAutomaton = new ByteRunAutomaton(binary, true, maxDeterminizedStates);
 
     this.automaton = runAutomaton.automaton;
   }
@@ -285,10 +290,6 @@ public class CompiledAutomaton {
       return terms.iterator(null);
     case SINGLE:
       return new SingleTermsEnum(terms.iterator(null), term);
-    case PREFIX:
-      // TODO: this is very likely faster than .intersect,
-      // but we should test and maybe cutover
-      return new PrefixTermsEnum(terms.iterator(null), term);
     case NORMAL:
       return terms.intersect(this, null);
     default:
@@ -410,7 +411,7 @@ public class CompiledAutomaton {
     if (getClass() != obj.getClass()) return false;
     CompiledAutomaton other = (CompiledAutomaton) obj;
     if (type != other.type) return false;
-    if (type == AUTOMATON_TYPE.SINGLE || type == AUTOMATON_TYPE.PREFIX) {
+    if (type == AUTOMATON_TYPE.SINGLE) {
       if (!term.equals(other.term)) return false;
     } else if (type == AUTOMATON_TYPE.NORMAL) {
       if (!runAutomaton.equals(other.runAutomaton)) return false;

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java?rev=1669526&r1=1669525&r2=1669526&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java Fri Mar 27 08:43:49 2015
@@ -834,11 +834,20 @@ final public class Operations {
    * Returns true if the given automaton accepts all strings.  The automaton must be minimized.
    */
   public static boolean isTotal(Automaton a) {
+    return isTotal(a, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT);
+  }
+
+  /**
+   * Returns true if the given automaton accepts all strings for the specified min/max
+   * range of the alphabet.  The automaton must be minimized.
+   */
+  public static boolean isTotal(Automaton a, int minAlphabet, int maxAlphabet) {
     if (a.isAccept(0) && a.getNumTransitions(0) == 1) {
       Transition t = new Transition();
       a.getTransition(0, 0, t);
-      return t.dest == 0 && t.min == Character.MIN_CODE_POINT
-          && t.max == Character.MAX_CODE_POINT;
+      return t.dest == 0
+        && t.min == minAlphabet
+        && t.max == maxAlphabet;
     }
     return false;
   }
@@ -1112,6 +1121,37 @@ final public class Operations {
     return builder.get();
   }
 
+  /** If this automaton accepts a single input, return it.  Else, return null.
+   *  The automaton must be deterministic. */
+  public static IntsRef getSingleton(Automaton a) {
+    if (a.isDeterministic() == false) {
+      throw new IllegalArgumentException("input automaton must be deterministic");
+    }
+    IntsRefBuilder builder = new IntsRefBuilder();
+    HashSet<Integer> visited = new HashSet<>();
+    int s = 0;
+    boolean done;
+    Transition t = new Transition();
+    while (true) {
+      visited.add(s);
+      if (a.isAccept(s) == false) {
+        if (a.getNumTransitions(s) == 1) {
+          a.getTransition(s, 0, t);
+          if (t.min == t.max && !visited.contains(t.dest)) {
+            builder.append(t.min);
+            s = t.dest;
+            continue;
+          }
+        }
+      } else if (a.getNumTransitions(s) == 0) {
+        return builder.get();
+      }
+
+      // Automaton accepts more than one string:
+      return null;
+    }
+  }
+
   /**
    * Returns the longest BytesRef that is a suffix of all accepted strings.
    * Worst case complexity: exponential in number of states (this calls

Modified: lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java?rev=1669526&r1=1669525&r2=1669526&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java Fri Mar 27 08:43:49 2015
@@ -260,7 +260,7 @@ public class TestTermsEnum extends Lucen
         a = Automata.makeStringUnion(sortedAcceptTerms);
       }
       
-      final CompiledAutomaton c = new CompiledAutomaton(a, true, false, 1000000);
+      final CompiledAutomaton c = new CompiledAutomaton(a, true, false, 1000000, false);
 
       final BytesRef[] acceptTermsArray = new BytesRef[acceptTerms.size()];
       final Set<BytesRef> acceptTermsSet = new HashSet<>();

Modified: lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java?rev=1669526&r1=1669525&r2=1669526&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestAutomatonQuery.java Fri Mar 27 08:43:49 2015
@@ -195,7 +195,6 @@ public class TestAutomatonQuery extends
     Automaton prefixAutomaton = Operations.concatenate(pfx, Automata.makeAnyString());
     AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), prefixAutomaton);
     Terms terms = MultiFields.getTerms(searcher.getIndexReader(), FN);
-    assertTrue(aq.getTermsEnum(terms) instanceof PrefixTermsEnum);
     assertEquals(3, automatonQueryNrHits(aq));
   }
   

Modified: lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestPrefixQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestPrefixQuery.java?rev=1669526&r1=1669525&r2=1669526&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestPrefixQuery.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestPrefixQuery.java Fri Mar 27 08:43:49 2015
@@ -17,15 +17,29 @@ package org.apache.lucene.search;
  * limitations under the License.
  */
 
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.StringField;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.MultiFields;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;
-import org.apache.lucene.document.Document;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.StringHelper;
+import org.apache.lucene.util.TestUtil;
 
 /**
  * Tests {@link PrefixQuery} class.
@@ -57,11 +71,145 @@ public class TestPrefixQuery extends Luc
 
     query = new PrefixQuery(new Term("category", ""));
     Terms terms = MultiFields.getTerms(searcher.getIndexReader(), "category");
-    assertFalse(query.getTermsEnum(terms) instanceof PrefixTermsEnum);
     hits = searcher.search(query, 1000).scoreDocs;
     assertEquals("everything", 3, hits.length);
     writer.close();
     reader.close();
     directory.close();
   }
+
+  public void testMatchAll() throws Exception {
+    Directory directory = newDirectory();
+
+    RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
+    Document doc = new Document();
+    doc.add(newStringField("field", "field", Field.Store.YES));
+    writer.addDocument(doc);
+
+    IndexReader reader = writer.getReader();
+
+    PrefixQuery query = new PrefixQuery(new Term("field", ""));
+    IndexSearcher searcher = newSearcher(reader);
+
+    assertEquals(1, searcher.search(query, 1000).totalHits);
+
+    Terms terms = MultiFields.getTerms(searcher.getIndexReader(), "field");
+    writer.close();
+    reader.close();
+    directory.close();
+  }
+
+  static final class BinaryTokenStream extends TokenStream {
+    private final ByteTermAttribute bytesAtt = addAttribute(ByteTermAttribute.class);
+    private boolean available = true;
+  
+    public BinaryTokenStream(BytesRef bytes) {
+      bytesAtt.setBytesRef(bytes);
+    }
+  
+    @Override
+    public boolean incrementToken() {
+      if (available) {
+        clearAttributes();
+        available = false;
+        return true;
+      }
+      return false;
+    }
+  
+    @Override
+    public void reset() {
+      available = true;
+    }
+  
+    public interface ByteTermAttribute extends TermToBytesRefAttribute {
+      public void setBytesRef(BytesRef bytes);
+    }
+  
+    public static class ByteTermAttributeImpl extends AttributeImpl implements ByteTermAttribute,TermToBytesRefAttribute {
+      private BytesRef bytes;
+    
+      @Override
+      public void fillBytesRef() {
+       // no-op: the bytes was already filled by our owner's incrementToken
+      }
+    
+      @Override
+      public BytesRef getBytesRef() {
+        return bytes;
+      }
+
+      @Override
+      public void setBytesRef(BytesRef bytes) {
+        this.bytes = bytes;
+      }
+   
+      @Override
+      public void clear() {}
+    
+      @Override
+      public void copyTo(AttributeImpl target) {
+        ByteTermAttributeImpl other = (ByteTermAttributeImpl) target;
+        other.bytes = bytes;
+      }
+    }
+  }
+
+  /** Basically a StringField that accepts binary term. */
+  private static class BinaryField extends Field {
+
+    final static FieldType TYPE;
+    static {
+      TYPE = new FieldType(StringField.TYPE_NOT_STORED);
+      // Necessary so our custom tokenStream is used by Field.tokenStream:
+      TYPE.setTokenized(true);
+      TYPE.freeze();
+    }
+
+    public BinaryField(String name, BytesRef value) {
+      super(name, new BinaryTokenStream(value), TYPE);
+    }
+  }
+
+  public void testRandomBinaryPrefix() throws Exception {
+    Directory dir = newDirectory();
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+
+    int numTerms = atLeast(10000);
+    Set<BytesRef> terms = new HashSet<>();
+    while (terms.size() < numTerms) {
+      byte[] bytes = new byte[TestUtil.nextInt(random(), 1, 10)];
+      random().nextBytes(bytes);
+      terms.add(new BytesRef(bytes));
+    }
+
+    List<BytesRef> termsList = new ArrayList<>(terms);  
+    Collections.shuffle(termsList, random());
+    for(BytesRef term : termsList) {
+      Document doc = new Document();
+      doc.add(new BinaryField("field", term));
+      w.addDocument(doc);
+    }
+
+    IndexReader r = w.getReader();
+    IndexSearcher s = newSearcher(r);
+
+    int iters = atLeast(100);   
+    for(int iter=0;iter<iters;iter++) {
+      byte[] bytes = new byte[random().nextInt(3)];
+      random().nextBytes(bytes);
+      BytesRef prefix = new BytesRef(bytes);
+      PrefixQuery q = new PrefixQuery(new Term("field", prefix));
+      int count = 0;
+      for(BytesRef term : termsList) {
+        if (StringHelper.startsWith(term, prefix)) {
+          count++;
+        }
+      }
+      assertEquals(count, s.search(q, 1).totalHits);
+    }
+    r.close();
+    w.close();
+    dir.close();
+  }
 }

Modified: lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestWildcard.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestWildcard.java?rev=1669526&r1=1669525&r2=1669526&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestWildcard.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/search/TestWildcard.java Fri Mar 27 08:43:49 2015
@@ -127,11 +127,9 @@ public class TestWildcard
     MultiTermQuery wq = new WildcardQuery(new Term("field", "prefix*"));
     assertMatches(searcher, wq, 2);
     Terms terms = MultiFields.getTerms(searcher.getIndexReader(), "field");
-    assertTrue(wq.getTermsEnum(terms) instanceof PrefixTermsEnum);
     
     wq = new WildcardQuery(new Term("field", "*"));
     assertMatches(searcher, wq, 2);
-    assertFalse(wq.getTermsEnum(terms) instanceof PrefixTermsEnum);
     assertFalse(wq.getTermsEnum(terms).getClass().getSimpleName().contains("AutomatonTermsEnum"));
     reader.close();
     indexStore.close();

Modified: lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java?rev=1669526&r1=1669525&r2=1669526&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestAutomaton.java Fri Mar 27 08:43:49 2015
@@ -1104,4 +1104,51 @@ public class TestAutomaton extends Lucen
       throw ae;
     }
   }
+
+  private static IntsRef toIntsRef(String s) {
+    IntsRefBuilder b = new IntsRefBuilder();
+    for (int i = 0, cp = 0; i < s.length(); i += Character.charCount(cp)) {
+      cp = s.codePointAt(i);
+      b.append(cp);
+    }
+
+    return b.get();
+  }
+
+  public void testGetSingleton() {
+    int iters = atLeast(10000);
+    for(int iter=0;iter<iters;iter++) {
+      String s = TestUtil.randomRealisticUnicodeString(random());
+      Automaton a = Automata.makeString(s);
+      assertEquals(toIntsRef(s), Operations.getSingleton(a));
+    }
+  }
+
+  public void testGetSingletonEmptyString() {
+    Automaton a = new Automaton();
+    int s = a.createState();
+    a.setAccept(s, true);
+    a.finishState();
+    assertEquals(new IntsRef(), Operations.getSingleton(a));
+  }
+
+  public void testGetSingletonNothing() {
+    Automaton a = new Automaton();
+    a.createState();
+    a.finishState();
+    assertNull(Operations.getSingleton(a));
+  }
+
+  public void testGetSingletonTwo() {
+    Automaton a = new Automaton();
+    int s = a.createState();
+    int x = a.createState();
+    a.setAccept(x, true);
+    a.addTransition(s, x, 55);
+    int y = a.createState();
+    a.setAccept(y, true);
+    a.addTransition(s, y, 58);
+    a.finishState();
+    assertNull(Operations.getSingleton(a));
+  }
 }

Modified: lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java?rev=1669526&r1=1669525&r2=1669526&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/automaton/TestCompiledAutomaton.java Fri Mar 27 08:43:49 2015
@@ -38,7 +38,7 @@ public class TestCompiledAutomaton exten
     }
     Collections.sort(terms);
     final Automaton a = DaciukMihovAutomatonBuilder.build(terms);
-    return new CompiledAutomaton(a, true, false, maxDeterminizedStates);
+    return new CompiledAutomaton(a, true, false, maxDeterminizedStates, false);
   }
 
   private void testFloor(CompiledAutomaton c, String input, String expected) {
@@ -121,4 +121,43 @@ public class TestCompiledAutomaton exten
     testFloor(c, "aa", null);
     testFloor(c, "zzz", "goo");
   }
+  
+  // LUCENE-6367
+  public void testBinaryAll() throws Exception {
+    Automaton a = new Automaton();
+    int state = a.createState();
+    a.setAccept(state, true);
+    a.addTransition(state, state, 0, 0xff);
+    a.finishState();
+
+    CompiledAutomaton ca = new CompiledAutomaton(a, null, true, Integer.MAX_VALUE, true);
+    assertEquals(CompiledAutomaton.AUTOMATON_TYPE.ALL, ca.type);
+  }
+
+  // LUCENE-6367
+  public void testUnicodeAll() throws Exception {
+    Automaton a = new Automaton();
+    int state = a.createState();
+    a.setAccept(state, true);
+    a.addTransition(state, state, 0, Character.MAX_CODE_POINT);
+    a.finishState();
+
+    CompiledAutomaton ca = new CompiledAutomaton(a, null, true, Integer.MAX_VALUE, false);
+    assertEquals(CompiledAutomaton.AUTOMATON_TYPE.ALL, ca.type);
+  }
+
+  // LUCENE-6367
+  public void testBinarySingleton() throws Exception {
+    // This is just ascii so we can pretend it's binary:
+    Automaton a = Automata.makeString("foobar");
+    CompiledAutomaton ca = new CompiledAutomaton(a, null, true, Integer.MAX_VALUE, true);
+    assertEquals(CompiledAutomaton.AUTOMATON_TYPE.SINGLE, ca.type);
+  }
+
+  // LUCENE-6367
+  public void testUnicodeSingleton() throws Exception {
+    Automaton a = Automata.makeString(TestUtil.randomRealisticUnicodeString(random()));
+    CompiledAutomaton ca = new CompiledAutomaton(a, null, true, Integer.MAX_VALUE, false);
+    assertEquals(CompiledAutomaton.AUTOMATON_TYPE.SINGLE, ca.type);
+  }
 }

Modified: lucene/dev/branches/branch_5x/lucene/test-framework/src/java/org/apache/lucene/index/BasePostingsFormatTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/test-framework/src/java/org/apache/lucene/index/BasePostingsFormatTestCase.java?rev=1669526&r1=1669525&r2=1669526&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/test-framework/src/java/org/apache/lucene/index/BasePostingsFormatTestCase.java (original)
+++ lucene/dev/branches/branch_5x/lucene/test-framework/src/java/org/apache/lucene/index/BasePostingsFormatTestCase.java Fri Mar 27 08:43:49 2015
@@ -1238,7 +1238,7 @@ public abstract class BasePostingsFormat
     for(String field : fields.keySet()) {
       while (true) {
         Automaton a = AutomatonTestUtil.randomAutomaton(random());
-        CompiledAutomaton ca = new CompiledAutomaton(a, null, true, Integer.MAX_VALUE);
+        CompiledAutomaton ca = new CompiledAutomaton(a, null, true, Integer.MAX_VALUE, false);
         if (ca.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
           // Keep retrying until we get an A that will really "use" the PF's intersect code:
           continue;