You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/11/05 22:22:05 UTC
svn commit: r1405963 - in /lucene/dev/branches/branch_4x: ./ lucene/
lucene/suggest/ lucene/suggest/src/java/org/apache/lucene/search/suggest/
lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/
lucene/suggest/src/java/org/apache/lucene...
Author: mikemccand
Date: Mon Nov 5 21:22:04 2012
New Revision: 1405963
URL: http://svn.apache.org/viewvc?rev=1405963&view=rev
Log:
LUCENE-4534: handle 0 byte values in lookup keys
Modified:
lucene/dev/branches/branch_4x/ (props changed)
lucene/dev/branches/branch_4x/lucene/ (props changed)
lucene/dev/branches/branch_4x/lucene/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_4x/lucene/suggest/ (props changed)
lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java
lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java
lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/TestTermFreqIterator.java
lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java
lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/WFSTCompletionTest.java
Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1405963&r1=1405962&r2=1405963&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Mon Nov 5 21:22:04 2012
@@ -106,6 +106,9 @@ Bug Fixes
* LUCENE-4513: Fixed that deleted nested docs are scored into the
parent doc when using ToParentBlockJoinQuery. (Martijn van Groningen)
+* LUCENE-4534: Fixed WFSTCompletionLookup and Analyzing/FuzzySuggester
+ to allow 0 byte values in the lookup keys. (Mike McCandless)
+
Optimizations
* LUCENE-4512: Additional memory savings for CompressingStoredFieldsFormat.
Modified: lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java?rev=1405963&r1=1405962&r2=1405963&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java Mon Nov 5 21:22:04 2012
@@ -41,28 +41,33 @@ public class SortedTermFreqIteratorWrapp
private File tempInput;
private File tempSorted;
private final ByteSequencesReader reader;
+ private final Comparator<BytesRef> comparator;
private boolean done = false;
private long weight;
private final BytesRef scratch = new BytesRef();
- private final Comparator<BytesRef> comparator;
- /**
- * Calls {@link #SortedTermFreqIteratorWrapper(TermFreqIterator, Comparator, boolean)
- * SortedTermFreqIteratorWrapper(source, comparator, false)}
- */
- public SortedTermFreqIteratorWrapper(TermFreqIterator source, Comparator<BytesRef> comparator) throws IOException {
- this(source, comparator, false);
+ /**
+ * Creates a new sorted wrapper, using {@link
+ * BytesRef#getUTF8SortedAsUnicodeComparator} for
+ * sorting. */
+ public SortedTermFreqIteratorWrapper(TermFreqIterator source) throws IOException {
+ this(source, BytesRef.getUTF8SortedAsUnicodeComparator());
}
-
+
/**
- * Creates a new sorted wrapper. if <code>compareRawBytes</code> is true, then
- * only the bytes (not the weight) will be used for comparison.
+ * Creates a new sorted wrapper, sorting by BytesRef
+ * (ascending) then cost (ascending).
*/
- public SortedTermFreqIteratorWrapper(TermFreqIterator source, Comparator<BytesRef> comparator, boolean compareRawBytes) throws IOException {
+ public SortedTermFreqIteratorWrapper(TermFreqIterator source, Comparator<BytesRef> comparator) throws IOException {
this.source = source;
this.comparator = comparator;
- this.reader = sort(compareRawBytes ? comparator : new BytesOnlyComparator(this.comparator));
+ this.reader = sort();
+ }
+
+ @Override
+ public Comparator<BytesRef> getComparator() {
+ return comparator;
}
@Override
@@ -90,16 +95,43 @@ public class SortedTermFreqIteratorWrapp
}
@Override
- public Comparator<BytesRef> getComparator() {
- return comparator;
- }
-
- @Override
public long weight() {
return weight;
}
+
+ /** Sorts by BytesRef (ascending) then cost (ascending). */
+ private final Comparator<BytesRef> tieBreakByCostComparator = new Comparator<BytesRef>() {
+
+ private final BytesRef leftScratch = new BytesRef();
+ private final BytesRef rightScratch = new BytesRef();
+ private final ByteArrayDataInput input = new ByteArrayDataInput();
+
+ @Override
+ public int compare(BytesRef left, BytesRef right) {
+ // decode() shortens the BytesRef it is handed, so operate on shallow
+ // copies (same backing array) and leave left/right untouched:
+ leftScratch.bytes = left.bytes;
+ leftScratch.offset = left.offset;
+ leftScratch.length = left.length;
+ rightScratch.bytes = right.bytes;
+ rightScratch.offset = right.offset;
+ rightScratch.length = right.length;
+ long leftCost = decode(leftScratch, input); // also trims the cost suffix from leftScratch.length
+ long rightCost = decode(rightScratch, input); // also trims the cost suffix from rightScratch.length
+ int cmp = comparator.compare(leftScratch, rightScratch); // term bytes only, using the comparator given to the constructor
+ if (cmp != 0) {
+ return cmp;
+ }
+ if (leftCost < rightCost) { // equal terms: lower cost sorts first
+ return -1;
+ } else if (rightCost < leftCost) {
+ return 1;
+ } else {
+ return 0;
+ }
+ }
+ };
- private Sort.ByteSequencesReader sort(Comparator<BytesRef> comparator) throws IOException {
+ private Sort.ByteSequencesReader sort() throws IOException {
String prefix = getClass().getSimpleName();
File directory = Sort.defaultTempDir();
tempInput = File.createTempFile(prefix, ".input", directory);
@@ -116,7 +148,7 @@ public class SortedTermFreqIteratorWrapp
encode(writer, output, buffer, spare, source.weight());
}
writer.close();
- new Sort(comparator).sort(tempInput, tempSorted);
+ new Sort(tieBreakByCostComparator).sort(tempInput, tempSorted);
ByteSequencesReader reader = new Sort.ByteSequencesReader(tempSorted);
success = true;
return reader;
@@ -131,7 +163,6 @@ public class SortedTermFreqIteratorWrapp
close();
}
}
-
}
}
@@ -145,31 +176,6 @@ public class SortedTermFreqIteratorWrapp
}
}
- private final static class BytesOnlyComparator implements Comparator<BytesRef> {
-
- final Comparator<BytesRef> other;
- private final BytesRef leftScratch = new BytesRef();
- private final BytesRef rightScratch = new BytesRef();
-
- public BytesOnlyComparator(Comparator<BytesRef> other) {
- this.other = other;
- }
-
- @Override
- public int compare(BytesRef left, BytesRef right) {
- wrap(leftScratch, left);
- wrap(rightScratch, right);
- return other.compare(leftScratch, rightScratch);
- }
-
- private void wrap(BytesRef wrapper, BytesRef source) {
- wrapper.bytes = source.bytes;
- wrapper.offset = source.offset;
- wrapper.length = source.length - 8;
-
- }
- }
-
/** encodes an entry (bytes+weight) to the provided writer */
protected void encode(ByteSequencesWriter writer, ByteArrayDataOutput output, byte[] buffer, BytesRef spare, long weight) throws IOException {
if (spare.length + 8 >= buffer.length) {
@@ -184,9 +190,8 @@ public class SortedTermFreqIteratorWrapp
/** decodes the weight at the current position */
protected long decode(BytesRef scratch, ByteArrayDataInput tmpInput) {
tmpInput.reset(scratch.bytes);
- tmpInput.skipBytes(scratch.length - 8); // suggestion + separator
- scratch.length -= 8; // sep + long
+ tmpInput.skipBytes(scratch.length - 8); // suggestion
+ scratch.length -= 8; // long
return tmpInput.readLong();
}
-
}
Modified: lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java?rev=1405963&r1=1405962&r2=1405963&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java Mon Nov 5 21:22:04 2012
@@ -320,6 +320,56 @@ public class AnalyzingSuggester extends
return new TokenStreamToAutomaton();
}
}
+
+ /** Sorts records encoded as [short analyzedLength][analyzed bytes][int cost][surface bytes]:
+ * first by analyzed form, then by cost (ascending), finally by surface form. */
+ private Comparator<BytesRef> sortComparator = new Comparator<BytesRef>() {
+ private final ByteArrayDataInput readerA = new ByteArrayDataInput();
+ private final ByteArrayDataInput readerB = new ByteArrayDataInput();
+ private final BytesRef scratchA = new BytesRef();
+ private final BytesRef scratchB = new BytesRef();
+
+ @Override
+ public int compare(BytesRef a, BytesRef b) {
+ // First by analyzed form:
+ readerA.reset(a.bytes, a.offset, a.length);
+ scratchA.length = readerA.readShort();
+ scratchA.bytes = a.bytes;
+ scratchA.offset = readerA.getPosition();
+
+ readerB.reset(b.bytes, b.offset, b.length);
+ scratchB.bytes = b.bytes;
+ scratchB.length = readerB.readShort();
+ scratchB.offset = readerB.getPosition();
+
+ int cmp = scratchA.compareTo(scratchB);
+ if (cmp != 0) {
+ return cmp;
+ }
+
+ // The readers are still positioned at the start of the analyzed bytes;
+ // step over them so the next read yields the cost, not analyzed-form data:
+ readerA.skipBytes(scratchA.length);
+ readerB.skipBytes(scratchB.length);
+
+ // Next by cost:
+ long aCost = readerA.readInt();
+ long bCost = readerB.readInt();
+ if (aCost < bCost) {
+ return -1;
+ } else if (aCost > bCost) {
+ return 1;
+ }
+
+ // Finally by surface form. getPosition() is absolute within the backing
+ // array, so the record ends at offset+length (not at length):
+ scratchA.offset = readerA.getPosition();
+ scratchA.length = a.offset + a.length - scratchA.offset;
+ scratchB.offset = readerB.getPosition();
+ scratchB.length = b.offset + b.length - scratchB.offset;
+ return scratchA.compareTo(scratchB);
+ }
+ };
@Override
public void build(TermFreqIterator iterator) throws IOException {
@@ -350,42 +400,43 @@ public class AnalyzingSuggester extends
Util.toBytesRef(path, scratch);
// length of the analyzed text (FST input)
+ if (scratch.length > Short.MAX_VALUE-2) {
+ throw new IllegalArgumentException("cannot handle analyzed forms > " + (Short.MAX_VALUE-2) + " in length (got " + scratch.length + ")");
+ }
short analyzedLength = (short) scratch.length;
+
// compute the required length:
- // analyzed sequence + 12 (separator) + weight (4) + surface + analyzedLength (short)
- int requiredLength = analyzedLength + 2 + 4 + surfaceForm.length + 2;
+ // analyzed sequence + weight (4) + surface + analyzedLength (short)
+ int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;
buffer = ArrayUtil.grow(buffer, requiredLength);
output.reset(buffer);
+
+ output.writeShort(analyzedLength);
+
output.writeBytes(scratch.bytes, scratch.offset, scratch.length);
- output.writeByte((byte)0); // separator: not used, just for sort order
- output.writeByte((byte)0); // separator: not used, just for sort order
- // NOTE: important that writeInt is big-endian,
- // because this means we sort secondarily by
- // cost ascending (= weight descending) so that
- // when we discard too many surface forms for a
- // single analyzed form we are discarding the
- // least weight ones:
output.writeInt(encodeWeight(iterator.weight()));
output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
- output.writeShort(analyzedLength);
+
+ assert output.getPosition() == requiredLength: output.getPosition() + " vs " + requiredLength;
+
writer.write(buffer, 0, output.getPosition());
}
}
writer.close();
// Sort all input/output pairs (required by FST.Builder):
- new Sort().sort(tempInput, tempSorted);
+ new Sort(sortComparator).sort(tempInput, tempSorted);
reader = new Sort.ByteSequencesReader(tempSorted);
PairOutputs<Long,BytesRef> outputs = new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(true), ByteSequenceOutputs.getSingleton());
Builder<Pair<Long,BytesRef>> builder = new Builder<Pair<Long,BytesRef>>(FST.INPUT_TYPE.BYTE1, outputs);
// Build FST:
- BytesRef previous = null;
+ BytesRef previousAnalyzed = null;
BytesRef analyzed = new BytesRef();
BytesRef surface = new BytesRef();
IntsRef scratchInts = new IntsRef();
@@ -394,24 +445,21 @@ public class AnalyzingSuggester extends
int dedup = 0;
while (reader.read(scratch)) {
input.reset(scratch.bytes, scratch.offset, scratch.length);
- input.setPosition(input.length()-2);
short analyzedLength = input.readShort();
-
- analyzed.bytes = scratch.bytes;
- analyzed.offset = scratch.offset;
+ analyzed.grow(analyzedLength+2);
+ input.readBytes(analyzed.bytes, 0, analyzedLength);
analyzed.length = analyzedLength;
-
- input.setPosition(analyzedLength + 2); // analyzed sequence + separator
+
long cost = input.readInt();
-
+
surface.bytes = scratch.bytes;
surface.offset = input.getPosition();
- surface.length = input.length() - input.getPosition() - 2;
-
- if (previous == null) {
- previous = new BytesRef();
- previous.copyBytes(analyzed);
- } else if (analyzed.equals(previous)) {
+ surface.length = scratch.length - surface.offset;
+
+ if (previousAnalyzed == null) {
+ previousAnalyzed = new BytesRef();
+ previousAnalyzed.copyBytes(analyzed);
+ } else if (analyzed.equals(previousAnalyzed)) {
dedup++;
if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
// More than maxSurfaceFormsPerAnalyzedForm
@@ -420,11 +468,9 @@ public class AnalyzingSuggester extends
}
} else {
dedup = 0;
- previous.copyBytes(analyzed);
+ previousAnalyzed.copyBytes(analyzed);
}
- analyzed.grow(analyzed.length+2);
-
// TODO: I think we can avoid the extra 2 bytes when
// there is no dup (dedup==0), but we'd have to fix
// the exactFirst logic ... which would be sort of
@@ -433,8 +479,8 @@ public class AnalyzingSuggester extends
// NOTE: must be byte 0 so we sort before whatever
// is next
- analyzed.bytes[analyzed.length] = 0;
- analyzed.bytes[analyzed.length+1] = (byte) dedup;
+ analyzed.bytes[analyzed.offset+analyzed.length] = 0;
+ analyzed.bytes[analyzed.offset+analyzed.length+1] = (byte) dedup;
analyzed.length += 2;
Util.toIntsRef(analyzed, scratchInts);
Modified: lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java?rev=1405963&r1=1405962&r2=1405963&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java Mon Nov 5 21:22:04 2012
@@ -94,8 +94,7 @@ public class WFSTCompletionLookup extend
@Override
public void build(TermFreqIterator iterator) throws IOException {
BytesRef scratch = new BytesRef();
- TermFreqIterator iter = new WFSTTermFreqIteratorWrapper(iterator,
- BytesRef.getUTF8SortedAsUnicodeComparator());
+ TermFreqIterator iter = new WFSTTermFreqIteratorWrapper(iterator);
IntsRef scratchInts = new IntsRef();
BytesRef previous = null;
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
@@ -247,28 +246,26 @@ public class WFSTCompletionLookup extend
private final class WFSTTermFreqIteratorWrapper extends SortedTermFreqIteratorWrapper {
- WFSTTermFreqIteratorWrapper(TermFreqIterator source,
- Comparator<BytesRef> comparator) throws IOException {
- super(source, comparator, true);
+ WFSTTermFreqIteratorWrapper(TermFreqIterator source) throws IOException {
+ super(source);
}
@Override
protected void encode(ByteSequencesWriter writer, ByteArrayDataOutput output, byte[] buffer, BytesRef spare, long weight) throws IOException {
- if (spare.length + 5 >= buffer.length) {
- buffer = ArrayUtil.grow(buffer, spare.length + 5);
+ if (spare.length + 4 >= buffer.length) {
+ buffer = ArrayUtil.grow(buffer, spare.length + 4);
}
output.reset(buffer);
output.writeBytes(spare.bytes, spare.offset, spare.length);
- output.writeByte((byte)0); // separator: not used, just for sort order
output.writeInt(encodeWeight(weight));
writer.write(buffer, 0, output.getPosition());
}
@Override
protected long decode(BytesRef scratch, ByteArrayDataInput tmpInput) {
- tmpInput.reset(scratch.bytes);
- tmpInput.skipBytes(scratch.length - 4); // suggestion + separator
- scratch.length -= 5; // sep + long
+ scratch.length -= 4; // int
+ // skip suggestion:
+ tmpInput.reset(scratch.bytes, scratch.offset+scratch.length, 4);
return tmpInput.readInt();
}
}
Modified: lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/TestTermFreqIterator.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/TestTermFreqIterator.java?rev=1405963&r1=1405962&r2=1405963&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/TestTermFreqIterator.java (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/TestTermFreqIterator.java Mon Nov 5 21:22:04 2012
@@ -80,50 +80,6 @@ public class TestTermFreqIterator extend
assertEquals(sorted, actual);
}
-
- public void testRaw() throws Exception {
- int num = atLeast(10000);
-
- Comparator<BytesRef> comparator = BytesRef.getUTF8SortedAsUnicodeComparator();
- BytesRefHash sorted = new BytesRefHash();
- TermFreq[] unsorted = new TermFreq[num];
- byte[] buffer = new byte[0];
- ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
-
- final Random random = new Random(random().nextLong());
- for (int i = 0; i < num; i++) {
- BytesRef spare;
- long weight;
- do {
- spare = new BytesRef(_TestUtil.randomUnicodeString(random));
- if (spare.length + 8 >= buffer.length) {
- buffer = ArrayUtil.grow(buffer, spare.length + 8);
- }
- output.reset(buffer);
- output.writeBytes(spare.bytes, spare.offset, spare.length);
- weight = random.nextLong();
- output.writeLong(weight);
-
- } while (sorted.add(new BytesRef(buffer, 0, output.getPosition())) < 0);
- unsorted[i] = new TermFreq(spare, weight);
- }
-
- // test the sorted iterator wrapper
- TermFreqIterator wrapper = new SortedTermFreqIteratorWrapper(new TermFreqArrayIterator(unsorted), comparator, true);
- int[] sort = sorted.sort(comparator);
- int size = sorted.size();
- BytesRef spare = new BytesRef();
- for (int i = 0; i < size; i++) {
- sorted.get(sort[i], spare);
- spare.length -= 8; // sub the long value
- assertEquals(spare, wrapper.next());
- spare.offset = spare.offset + spare.length;
- spare.length = 8;
- assertEquals(asLong(spare), wrapper.weight());
- }
- assertNull(wrapper.next());
- }
-
public static long asLong(BytesRef b) {
return (((long) asIntInternal(b, b.offset) << 32) | asIntInternal(b,
b.offset + 4) & 0xFFFFFFFFL);
Modified: lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java?rev=1405963&r1=1405962&r2=1405963&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java Mon Nov 5 21:22:04 2012
@@ -19,10 +19,10 @@ package org.apache.lucene.search.suggest
import java.io.File;
import java.io.FileInputStream;
-import java.io.InputStream;
import java.io.FileOutputStream;
-import java.io.OutputStream;
import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
@@ -39,6 +39,7 @@ import org.apache.lucene.analysis.Canned
import org.apache.lucene.analysis.CannedBinaryTokenStream;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockBytesAttributeFactory;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
@@ -503,6 +504,8 @@ public class AnalyzingSuggesterTest exte
private int numStopChars;
private boolean preserveHoles;
+ private final MockBytesAttributeFactory factory = new MockBytesAttributeFactory();
+
public MockTokenEatingAnalyzer(int numStopChars, boolean preserveHoles) {
this.preserveHoles = preserveHoles;
this.numStopChars = numStopChars;
@@ -510,7 +513,7 @@ public class AnalyzingSuggesterTest exte
@Override
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
- MockTokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
+ MockTokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
tokenizer.setEnableChecks(true);
TokenStream next;
if (numStopChars != 0) {
@@ -983,4 +986,49 @@ public class AnalyzingSuggesterTest exte
assertEquals("b", results.get(1).key);
assertEquals(5, results.get(1).value);
}
+
+ public void test0ByteKeys() throws Exception { // builds a suggester whose analyzed forms consist solely of 0 bytes
+ final Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
+
+ return new TokenStreamComponents(tokenizer) {
+ int tokenStreamCounter = 0; // index of the next canned stream to hand out
+ final TokenStream[] tokenStreams = new TokenStream[] {
+ new CannedBinaryTokenStream(new BinaryToken[] {
+ token(new BytesRef(new byte[] {0x0, 0x0, 0x0})),
+ }),
+ new CannedBinaryTokenStream(new BinaryToken[] {
+ token(new BytesRef(new byte[] {0x0, 0x0})),
+ }),
+ new CannedBinaryTokenStream(new BinaryToken[] {
+ token(new BytesRef(new byte[] {0x0, 0x0, 0x0})),
+ }),
+ new CannedBinaryTokenStream(new BinaryToken[] {
+ token(new BytesRef(new byte[] {0x0, 0x0})),
+ }),
+ };
+
+ @Override
+ public TokenStream getTokenStream() { // returns the canned streams in order, one per call; the tokenizer above is not consulted
+ TokenStream result = tokenStreams[tokenStreamCounter];
+ tokenStreamCounter++;
+ return result;
+ }
+
+ @Override
+ protected void setReader(final Reader reader) throws IOException { // intentionally empty: output comes from the canned streams
+ }
+ };
+ }
+ };
+
+ AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, 0, 256, -1);
+
+ suggester.build(new TermFreqArrayIterator(new TermFreq[] { // no assertions follow: the test passes if build() does not throw
+ new TermFreq("a a", 50),
+ new TermFreq("a b", 50),
+ }));
+ }
}
Modified: lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/WFSTCompletionTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/WFSTCompletionTest.java?rev=1405963&r1=1405962&r2=1405963&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/WFSTCompletionTest.java (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/WFSTCompletionTest.java Mon Nov 5 21:22:04 2012
@@ -22,12 +22,13 @@ import java.util.*;
import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.search.suggest.TermFreq;
import org.apache.lucene.search.suggest.TermFreqArrayIterator;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
public class WFSTCompletionTest extends LuceneTestCase {
- public void test() throws Exception {
+ public void testBasic() throws Exception {
TermFreq keys[] = new TermFreq[] {
new TermFreq("foo", 50),
new TermFreq("bar", 10),
@@ -194,4 +195,18 @@ public class WFSTCompletionTest extends
}
}
}
+
+ /** LUCENE-4534 regression: build() must accept keys consisting only of 0 bytes. */
+ public void test0ByteKeys() throws Exception {
+ BytesRef key1 = new BytesRef(4);
+ key1.length = 4;
+ BytesRef key2 = new BytesRef(3);
+ // key2 needs its own length; setting key1.length here would leave key2 empty:
+ key2.length = 3;
+ WFSTCompletionLookup suggester = new WFSTCompletionLookup(false);
+
+ suggester.build(new TermFreqArrayIterator(new TermFreq[] {
+ new TermFreq(key1, 50),
+ new TermFreq(key2, 50),
+ }));
+ }
}