You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/11/05 22:22:05 UTC
svn commit: r1405963 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/suggest/ lucene/suggest/src/java/org/apache/lucene/search/suggest/ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/ lucene/suggest/src/java/org/apache/lucene...

Author: mikemccand
Date: Mon Nov  5 21:22:04 2012
New Revision: 1405963

URL: http://svn.apache.org/viewvc?rev=1405963&view=rev
Log:
LUCENE-4534: handle 0 byte values in lookup keys

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_4x/lucene/suggest/   (props changed)
    lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java
    lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
    lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java
    lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/TestTermFreqIterator.java
    lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java
    lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/WFSTCompletionTest.java

Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1405963&r1=1405962&r2=1405963&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Mon Nov  5 21:22:04 2012
@@ -106,6 +106,9 @@ Bug Fixes
 * LUCENE-4513: Fixed that deleted nested docs are scored into the
   parent doc when using ToParentBlockJoinQuery. (Martijn van Groningen)
 
+* LUCENE-4534: Fixed WFSTCompletionLookup and Analyzing/FuzzySuggester
+  to allow 0 byte values in the lookup keys.  (Mike McCandless)
+
 Optimizations
 
 * LUCENE-4512: Additional memory savings for CompressingStoredFieldsFormat.

Modified: lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java?rev=1405963&r1=1405962&r2=1405963&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java Mon Nov  5 21:22:04 2012
@@ -41,28 +41,33 @@ public class SortedTermFreqIteratorWrapp
   private File tempInput;
   private File tempSorted;
   private final ByteSequencesReader reader;
+  private final Comparator<BytesRef> comparator;
   private boolean done = false;
   
   private long weight;
   private final BytesRef scratch = new BytesRef();
-  private final Comparator<BytesRef> comparator;
   
-  /** 
-   * Calls {@link #SortedTermFreqIteratorWrapper(TermFreqIterator, Comparator, boolean) 
-   * SortedTermFreqIteratorWrapper(source, comparator, false)}
-   */
-  public SortedTermFreqIteratorWrapper(TermFreqIterator source, Comparator<BytesRef> comparator) throws IOException {
-    this(source, comparator, false);
+  /**
+   * Creates a new sorted wrapper, using {@link
+   * BytesRef#getUTF8SortedAsUnicodeComparator} for
+   * sorting. */
+  public SortedTermFreqIteratorWrapper(TermFreqIterator source) throws IOException {
+    this(source, BytesRef.getUTF8SortedAsUnicodeComparator());
   }
-  
+
   /**
-   * Creates a new sorted wrapper. if <code>compareRawBytes</code> is true, then
-   * only the bytes (not the weight) will be used for comparison.
+   * Creates a new sorted wrapper, sorting by BytesRef
+   * (ascending) then cost (ascending).
    */
-  public SortedTermFreqIteratorWrapper(TermFreqIterator source, Comparator<BytesRef> comparator, boolean compareRawBytes) throws IOException {
+  public SortedTermFreqIteratorWrapper(TermFreqIterator source, Comparator<BytesRef> comparator) throws IOException {
     this.source = source;
     this.comparator = comparator;
-    this.reader = sort(compareRawBytes ? comparator : new BytesOnlyComparator(this.comparator));
+    this.reader = sort();
+  }
+  
+  @Override
+  public Comparator<BytesRef> getComparator() {
+    return comparator;
   }
   
   @Override
@@ -90,16 +95,43 @@ public class SortedTermFreqIteratorWrapp
   }
   
   @Override
-  public Comparator<BytesRef> getComparator() {
-    return comparator;
-  }
-  
-  @Override
   public long weight() {
     return weight;
   }
+
+  /** Sortes by BytesRef (ascending) then cost (ascending). */
+  private final Comparator<BytesRef> tieBreakByCostComparator = new Comparator<BytesRef>() {
+
+    private final BytesRef leftScratch = new BytesRef();
+    private final BytesRef rightScratch = new BytesRef();
+    private final ByteArrayDataInput input = new ByteArrayDataInput();
+    
+    @Override
+    public int compare(BytesRef left, BytesRef right) {
+      // Make shallow copy in case decode changes the BytesRef:
+      leftScratch.bytes = left.bytes;
+      leftScratch.offset = left.offset;
+      leftScratch.length = left.length;
+      rightScratch.bytes = right.bytes;
+      rightScratch.offset = right.offset;
+      rightScratch.length = right.length;
+      long leftCost = decode(leftScratch, input);
+      long rightCost = decode(rightScratch, input);
+      int cmp = comparator.compare(leftScratch, rightScratch);
+      if (cmp != 0) {
+        return cmp;
+      }
+      if (leftCost < rightCost) {
+        return -1;
+      } else if (rightCost < leftCost) {
+        return 1;
+      } else {
+        return 0;
+      }
+    }
+  };
   
-  private Sort.ByteSequencesReader sort(Comparator<BytesRef> comparator) throws IOException {
+  private Sort.ByteSequencesReader sort() throws IOException {
     String prefix = getClass().getSimpleName();
     File directory = Sort.defaultTempDir();
     tempInput = File.createTempFile(prefix, ".input", directory);
@@ -116,7 +148,7 @@ public class SortedTermFreqIteratorWrapp
         encode(writer, output, buffer, spare, source.weight());
       }
       writer.close();
-      new Sort(comparator).sort(tempInput, tempSorted);
+      new Sort(tieBreakByCostComparator).sort(tempInput, tempSorted);
       ByteSequencesReader reader = new Sort.ByteSequencesReader(tempSorted);
       success = true;
       return reader;
@@ -131,7 +163,6 @@ public class SortedTermFreqIteratorWrapp
           close();
         }
       }
-      
     }
   }
   
@@ -145,31 +176,6 @@ public class SortedTermFreqIteratorWrapp
     }
   }
   
-  private final static class BytesOnlyComparator implements Comparator<BytesRef> {
-
-    final Comparator<BytesRef> other;
-    private final BytesRef leftScratch = new BytesRef();
-    private final BytesRef rightScratch = new BytesRef();
-    
-    public BytesOnlyComparator(Comparator<BytesRef> other) {
-      this.other = other;
-    }
-
-    @Override
-    public int compare(BytesRef left, BytesRef right) {
-      wrap(leftScratch, left);
-      wrap(rightScratch, right);
-      return other.compare(leftScratch, rightScratch);
-    }
-    
-    private void wrap(BytesRef wrapper, BytesRef source) {
-      wrapper.bytes = source.bytes;
-      wrapper.offset = source.offset;
-      wrapper.length = source.length - 8;
-      
-    }
-  }
-  
   /** encodes an entry (bytes+weight) to the provided writer */
   protected void encode(ByteSequencesWriter writer, ByteArrayDataOutput output, byte[] buffer, BytesRef spare, long weight) throws IOException {
     if (spare.length + 8 >= buffer.length) {
@@ -184,9 +190,8 @@ public class SortedTermFreqIteratorWrapp
   /** decodes the weight at the current position */
   protected long decode(BytesRef scratch, ByteArrayDataInput tmpInput) {
     tmpInput.reset(scratch.bytes);
-    tmpInput.skipBytes(scratch.length - 8); // suggestion + separator
-    scratch.length -= 8; // sep + long
+    tmpInput.skipBytes(scratch.length - 8); // suggestion
+    scratch.length -= 8; // long
     return tmpInput.readLong();
   }
-  
 }

Modified: lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java?rev=1405963&r1=1405962&r2=1405963&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java Mon Nov  5 21:22:04 2012
@@ -320,6 +320,56 @@ public class AnalyzingSuggester extends 
       return new TokenStreamToAutomaton();
     }
   }
+
+  private  Comparator<BytesRef> sortComparator = new Comparator<BytesRef>() {
+    private final ByteArrayDataInput readerA = new ByteArrayDataInput();
+    private final ByteArrayDataInput readerB = new ByteArrayDataInput();
+    private final BytesRef scratchA = new BytesRef();
+    private final BytesRef scratchB = new BytesRef();
+
+    @Override
+    public int compare(BytesRef a, BytesRef b) {
+
+      // First by analyzed form:
+      readerA.reset(a.bytes, a.offset, a.length);
+      scratchA.length = readerA.readShort();
+      scratchA.bytes = a.bytes;
+      scratchA.offset = readerA.getPosition();
+
+      readerB.reset(b.bytes, b.offset, b.length);
+      scratchB.bytes = b.bytes;
+      scratchB.length = readerB.readShort();
+      scratchB.offset = readerB.getPosition();
+
+      int cmp = scratchA.compareTo(scratchB);
+      if (cmp != 0) {
+        return cmp;
+      }
+
+      // Next by cost:
+      long aCost = readerA.readInt();
+      long bCost = readerB.readInt();
+
+      if (aCost < bCost) {
+        return -1;
+      } else if (aCost > bCost) {
+        return 1;
+      }
+
+      // Finally by surface form:
+      scratchA.offset = readerA.getPosition();
+      scratchA.length = a.length - scratchA.offset;
+      scratchB.offset = readerB.getPosition();
+      scratchB.length = b.length - scratchB.offset;
+
+      cmp = scratchA.compareTo(scratchB);
+      if (cmp != 0) {
+        return cmp;
+      }
+
+      return 0;
+    }
+  };
   
   @Override
   public void build(TermFreqIterator iterator) throws IOException {
@@ -350,42 +400,43 @@ public class AnalyzingSuggester extends 
           Util.toBytesRef(path, scratch);
           
           // length of the analyzed text (FST input)
+          if (scratch.length > Short.MAX_VALUE-2) {
+            throw new IllegalArgumentException("cannot handle analyzed forms > " + (Short.MAX_VALUE-2) + " in length (got " + scratch.length + ")");
+          }
           short analyzedLength = (short) scratch.length;
+
           // compute the required length:
-          // analyzed sequence + 12 (separator) + weight (4) + surface + analyzedLength (short)
-          int requiredLength = analyzedLength + 2 + 4 + surfaceForm.length + 2;
+          // analyzed sequence + weight (4) + surface + analyzedLength (short)
+          int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;
           
           buffer = ArrayUtil.grow(buffer, requiredLength);
           
           output.reset(buffer);
+
+          output.writeShort(analyzedLength);
+
           output.writeBytes(scratch.bytes, scratch.offset, scratch.length);
-          output.writeByte((byte)0); // separator: not used, just for sort order
-          output.writeByte((byte)0); // separator: not used, just for sort order
 
-          // NOTE: important that writeInt is big-endian,
-          // because this means we sort secondarily by
-          // cost ascending (= weight descending) so that
-          // when we discard too many surface forms for a
-          // single analyzed form we are discarding the
-          // least weight ones:
           output.writeInt(encodeWeight(iterator.weight()));
 
           output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
-          output.writeShort(analyzedLength);
+
+          assert output.getPosition() == requiredLength: output.getPosition() + " vs " + requiredLength;
+
           writer.write(buffer, 0, output.getPosition());
         }
       }
       writer.close();
 
       // Sort all input/output pairs (required by FST.Builder):
-      new Sort().sort(tempInput, tempSorted);
+      new Sort(sortComparator).sort(tempInput, tempSorted);
       reader = new Sort.ByteSequencesReader(tempSorted);
      
       PairOutputs<Long,BytesRef> outputs = new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(true), ByteSequenceOutputs.getSingleton());
       Builder<Pair<Long,BytesRef>> builder = new Builder<Pair<Long,BytesRef>>(FST.INPUT_TYPE.BYTE1, outputs);
 
       // Build FST:
-      BytesRef previous = null;
+      BytesRef previousAnalyzed = null;
       BytesRef analyzed = new BytesRef();
       BytesRef surface = new BytesRef();
       IntsRef scratchInts = new IntsRef();
@@ -394,24 +445,21 @@ public class AnalyzingSuggester extends 
       int dedup = 0;
       while (reader.read(scratch)) {
         input.reset(scratch.bytes, scratch.offset, scratch.length);
-        input.setPosition(input.length()-2);
         short analyzedLength = input.readShort();
-
-        analyzed.bytes = scratch.bytes;
-        analyzed.offset = scratch.offset;
+        analyzed.grow(analyzedLength+2);
+        input.readBytes(analyzed.bytes, 0, analyzedLength);
         analyzed.length = analyzedLength;
-        
-        input.setPosition(analyzedLength + 2); // analyzed sequence + separator
+
         long cost = input.readInt();
-   
+
         surface.bytes = scratch.bytes;
         surface.offset = input.getPosition();
-        surface.length = input.length() - input.getPosition() - 2;
-
-        if (previous == null) {
-          previous = new BytesRef();
-          previous.copyBytes(analyzed);
-        } else if (analyzed.equals(previous)) {
+        surface.length = scratch.length - surface.offset;
+        
+        if (previousAnalyzed == null) {
+          previousAnalyzed = new BytesRef();
+          previousAnalyzed.copyBytes(analyzed);
+        } else if (analyzed.equals(previousAnalyzed)) {
           dedup++;
           if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
             // More than maxSurfaceFormsPerAnalyzedForm
@@ -420,11 +468,9 @@ public class AnalyzingSuggester extends 
           }
         } else {
           dedup = 0;
-          previous.copyBytes(analyzed);
+          previousAnalyzed.copyBytes(analyzed);
         }
 
-        analyzed.grow(analyzed.length+2);
-
         // TODO: I think we can avoid the extra 2 bytes when
         // there is no dup (dedup==0), but we'd have to fix
         // the exactFirst logic ... which would be sort of
@@ -433,8 +479,8 @@ public class AnalyzingSuggester extends 
 
         // NOTE: must be byte 0 so we sort before whatever
         // is next
-        analyzed.bytes[analyzed.length] = 0;
-        analyzed.bytes[analyzed.length+1] = (byte) dedup;
+        analyzed.bytes[analyzed.offset+analyzed.length] = 0;
+        analyzed.bytes[analyzed.offset+analyzed.length+1] = (byte) dedup;
         analyzed.length += 2;
 
         Util.toIntsRef(analyzed, scratchInts);

Modified: lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java?rev=1405963&r1=1405962&r2=1405963&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java Mon Nov  5 21:22:04 2012
@@ -94,8 +94,7 @@ public class WFSTCompletionLookup extend
   @Override
   public void build(TermFreqIterator iterator) throws IOException {
     BytesRef scratch = new BytesRef();
-    TermFreqIterator iter = new WFSTTermFreqIteratorWrapper(iterator,
-        BytesRef.getUTF8SortedAsUnicodeComparator());
+    TermFreqIterator iter = new WFSTTermFreqIteratorWrapper(iterator);
     IntsRef scratchInts = new IntsRef();
     BytesRef previous = null;
     PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
@@ -247,28 +246,26 @@ public class WFSTCompletionLookup extend
   
   private final class WFSTTermFreqIteratorWrapper extends SortedTermFreqIteratorWrapper {
 
-    WFSTTermFreqIteratorWrapper(TermFreqIterator source,
-        Comparator<BytesRef> comparator) throws IOException {
-      super(source, comparator, true);
+    WFSTTermFreqIteratorWrapper(TermFreqIterator source) throws IOException {
+      super(source);
     }
 
     @Override
     protected void encode(ByteSequencesWriter writer, ByteArrayDataOutput output, byte[] buffer, BytesRef spare, long weight) throws IOException {
-      if (spare.length + 5 >= buffer.length) {
-        buffer = ArrayUtil.grow(buffer, spare.length + 5);
+      if (spare.length + 4 >= buffer.length) {
+        buffer = ArrayUtil.grow(buffer, spare.length + 4);
       }
       output.reset(buffer);
       output.writeBytes(spare.bytes, spare.offset, spare.length);
-      output.writeByte((byte)0); // separator: not used, just for sort order
       output.writeInt(encodeWeight(weight));
       writer.write(buffer, 0, output.getPosition());
     }
     
     @Override
     protected long decode(BytesRef scratch, ByteArrayDataInput tmpInput) {
-      tmpInput.reset(scratch.bytes);
-      tmpInput.skipBytes(scratch.length - 4); // suggestion + separator
-      scratch.length -= 5; // sep + long
+      scratch.length -= 4; // int
+      // skip suggestion:
+      tmpInput.reset(scratch.bytes, scratch.offset+scratch.length, 4);
       return tmpInput.readInt();
     }
   }

Modified: lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/TestTermFreqIterator.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/TestTermFreqIterator.java?rev=1405963&r1=1405962&r2=1405963&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/TestTermFreqIterator.java (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/TestTermFreqIterator.java Mon Nov  5 21:22:04 2012
@@ -80,50 +80,6 @@ public class TestTermFreqIterator extend
     assertEquals(sorted, actual);
   }
   
-  
-  public void testRaw() throws Exception {
-    int num = atLeast(10000);
-    
-    Comparator<BytesRef> comparator = BytesRef.getUTF8SortedAsUnicodeComparator();
-    BytesRefHash sorted = new BytesRefHash();
-    TermFreq[] unsorted = new TermFreq[num];
-    byte[] buffer = new byte[0];
-    ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
-
-    final Random random = new Random(random().nextLong());
-    for (int i = 0; i < num; i++) {
-      BytesRef spare;
-      long weight;
-      do {
-        spare = new BytesRef(_TestUtil.randomUnicodeString(random));
-        if (spare.length + 8 >= buffer.length) {
-          buffer = ArrayUtil.grow(buffer, spare.length + 8);
-        }
-        output.reset(buffer);
-        output.writeBytes(spare.bytes, spare.offset, spare.length);
-        weight = random.nextLong();
-        output.writeLong(weight);
-        
-      } while (sorted.add(new BytesRef(buffer, 0, output.getPosition())) < 0);
-      unsorted[i] = new TermFreq(spare, weight);
-    }
-    
-    // test the sorted iterator wrapper
-    TermFreqIterator wrapper = new SortedTermFreqIteratorWrapper(new TermFreqArrayIterator(unsorted), comparator, true);
-    int[] sort = sorted.sort(comparator);
-    int size = sorted.size();
-    BytesRef spare = new BytesRef();
-    for (int i = 0; i < size; i++) {
-      sorted.get(sort[i], spare);
-      spare.length -= 8; // sub the long value
-      assertEquals(spare, wrapper.next());
-      spare.offset = spare.offset + spare.length;
-      spare.length = 8;
-      assertEquals(asLong(spare), wrapper.weight());
-    }
-    assertNull(wrapper.next());
-  }
-  
   public static long asLong(BytesRef b) {
     return (((long) asIntInternal(b, b.offset) << 32) | asIntInternal(b,
         b.offset + 4) & 0xFFFFFFFFL);

Modified: lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java?rev=1405963&r1=1405962&r2=1405963&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java Mon Nov  5 21:22:04 2012
@@ -19,10 +19,10 @@ package org.apache.lucene.search.suggest
 
 import java.io.File;
 import java.io.FileInputStream;
-import java.io.InputStream;
 import java.io.FileOutputStream;
-import java.io.OutputStream;
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
 import java.io.Reader;
 import java.io.StringReader;
 import java.util.ArrayList;
@@ -39,6 +39,7 @@ import org.apache.lucene.analysis.Canned
 import org.apache.lucene.analysis.CannedBinaryTokenStream;
 import org.apache.lucene.analysis.CannedTokenStream;
 import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockBytesAttributeFactory;
 import org.apache.lucene.analysis.MockTokenFilter;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Token;
@@ -503,6 +504,8 @@ public class AnalyzingSuggesterTest exte
     private int numStopChars;
     private boolean preserveHoles;
 
+    private final MockBytesAttributeFactory factory = new MockBytesAttributeFactory();
+
     public MockTokenEatingAnalyzer(int numStopChars, boolean preserveHoles) {
       this.preserveHoles = preserveHoles;
       this.numStopChars = numStopChars;
@@ -510,7 +513,7 @@ public class AnalyzingSuggesterTest exte
 
     @Override
     public TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      MockTokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
+      MockTokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
       tokenizer.setEnableChecks(true);
       TokenStream next;
       if (numStopChars != 0) {
@@ -983,4 +986,49 @@ public class AnalyzingSuggesterTest exte
     assertEquals("b", results.get(1).key);
     assertEquals(5, results.get(1).value);
   }
+
+  public void test0ByteKeys() throws Exception {
+    final Analyzer a = new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
+        
+          return new TokenStreamComponents(tokenizer) {
+            int tokenStreamCounter = 0;
+            final TokenStream[] tokenStreams = new TokenStream[] {
+              new CannedBinaryTokenStream(new BinaryToken[] {
+                  token(new BytesRef(new byte[] {0x0, 0x0, 0x0})),
+                }),
+              new CannedBinaryTokenStream(new BinaryToken[] {
+                  token(new BytesRef(new byte[] {0x0, 0x0})),
+                }),
+              new CannedBinaryTokenStream(new BinaryToken[] {
+                  token(new BytesRef(new byte[] {0x0, 0x0, 0x0})),
+                }),
+              new CannedBinaryTokenStream(new BinaryToken[] {
+                  token(new BytesRef(new byte[] {0x0, 0x0})),
+                }),
+            };
+
+            @Override
+            public TokenStream getTokenStream() {
+              TokenStream result = tokenStreams[tokenStreamCounter];
+              tokenStreamCounter++;
+              return result;
+            }
+         
+            @Override
+            protected void setReader(final Reader reader) throws IOException {
+            }
+          };
+        }
+      };
+
+    AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, 0, 256, -1);
+
+    suggester.build(new TermFreqArrayIterator(new TermFreq[] {
+          new TermFreq("a a", 50),
+          new TermFreq("a b", 50),
+        }));
+  }
 }

Modified: lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/WFSTCompletionTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/WFSTCompletionTest.java?rev=1405963&r1=1405962&r2=1405963&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/WFSTCompletionTest.java (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/WFSTCompletionTest.java Mon Nov  5 21:22:04 2012
@@ -22,12 +22,13 @@ import java.util.*;
 import org.apache.lucene.search.suggest.Lookup.LookupResult;
 import org.apache.lucene.search.suggest.TermFreq;
 import org.apache.lucene.search.suggest.TermFreqArrayIterator;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util._TestUtil;
 
 public class WFSTCompletionTest extends LuceneTestCase {
   
-  public void test() throws Exception {
+  public void testBasic() throws Exception {
     TermFreq keys[] = new TermFreq[] {
         new TermFreq("foo", 50),
         new TermFreq("bar", 10),
@@ -194,4 +195,18 @@ public class WFSTCompletionTest extends 
       }
     }
   }
+
+  public void test0ByteKeys() throws Exception {
+    BytesRef key1 = new BytesRef(4);
+    key1.length = 4;
+    BytesRef key2 = new BytesRef(3);
+    key1.length = 3;
+
+    WFSTCompletionLookup suggester = new WFSTCompletionLookup(false);
+
+    suggester.build(new TermFreqArrayIterator(new TermFreq[] {
+          new TermFreq(key1, 50),
+          new TermFreq(key2, 50),
+        }));
+  }
 }