You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/01/07 17:47:34 UTC
svn commit: r1228657 - in /lucene/dev/branches/branch_3x: ./ lucene/
lucene/contrib/ lucene/contrib/analyzers/
lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/
lucene/contrib/analyzers/common/src/test/org/apache/lucene/analy...
Author: mikemccand
Date: Sat Jan 7 16:47:34 2012
New Revision: 1228657
URL: http://svn.apache.org/viewvc?rev=1228657&view=rev
Log:
LUCENE-3668: if there's only 1 output for a synonym rule then set start/endOffset to match the full span of the input tokens
Modified:
lucene/dev/branches/branch_3x/ (props changed)
lucene/dev/branches/branch_3x/lucene/ (props changed)
lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/ (props changed)
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java
lucene/dev/branches/branch_3x/lucene/contrib/icu/ (props changed)
lucene/dev/branches/branch_3x/solr/ (props changed)
lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestSynonymFilterFactory.java
Modified: lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt?rev=1228657&r1=1228656&r2=1228657&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt Sat Jan 7 16:47:34 2012
@@ -53,6 +53,14 @@ Bug Fixes
* LUCENE-3609: Fix regression in BooleanFilter, introduced in Lucene 3.5,
to correctly handle minShouldMatch behaviour of previous versions.
(Shay Banon, Uwe Schindler)
+
+ * LUCENE-3668: For a multi-token synonym mapping to a single token,
+ SynonymFilter will now set the start offset of the synonym token to
+ the start offset of the first matched token, and the end offset of
+ the synonym token to the end offset of the last matched token.
+ This way if the synonym token is used for highlighting, it will
+ cover all tokens it had matched. (Koji Sekiguchi, Robert Muir,
+ Mike McCandless)
Documentation
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java?rev=1228657&r1=1228656&r2=1228657&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java Sat Jan 7 16:47:34 2012
@@ -153,12 +153,15 @@ public final class SynonymFilter extends
// Holds pending output synonyms for one future position:
private static class PendingOutputs {
CharsRef[] outputs;
+ int[] endOffsets;
int upto;
int count;
int posIncr = 1;
+ int lastEndOffset;
public PendingOutputs() {
outputs = new CharsRef[1];
+ endOffsets = new int[1];
}
public void reset() {
@@ -168,6 +171,7 @@ public final class SynonymFilter extends
public CharsRef pullNext() {
assert upto < count;
+ lastEndOffset = endOffsets[upto];
final CharsRef result = outputs[upto++];
posIncr = 0;
if (upto == count) {
@@ -176,16 +180,29 @@ public final class SynonymFilter extends
return result;
}
- public void add(char[] output, int offset, int len) {
+ public int getLastEndOffset() {
+ return lastEndOffset;
+ }
+
+ public void add(char[] output, int offset, int len, int endOffset) {
if (count == outputs.length) {
final CharsRef[] next = new CharsRef[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
System.arraycopy(outputs, 0, next, 0, count);
outputs = next;
}
+ if (count == endOffsets.length) {
+ final int[] next = new int[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_INT)];
+ System.arraycopy(endOffsets, 0, next, 0, count);
+ endOffsets = next;
+ }
if (outputs[count] == null) {
outputs[count] = new CharsRef();
}
outputs[count].copyChars(output, offset, len);
+ // endOffset can be -1, in which case we should simply
+ // use the endOffset of the input token, or X >= 0, in
+ // which case we use X as the endOffset for this output
+ endOffsets[count] = endOffset;
count++;
}
};
@@ -281,6 +298,7 @@ public final class SynonymFilter extends
// Holds the longest match we've seen so far:
BytesRef matchOutput = null;
int matchInputLength = 0;
+ int matchEndOffset = -1;
BytesRef pendingOutput = fst.outputs.getNoOutput();
fst.getFirstArc(scratchArc);
@@ -297,6 +315,8 @@ public final class SynonymFilter extends
final int bufferLen;
//System.out.println(" cycle nextRead=" + curNextRead + " nextWrite=" + nextWrite);
+ int inputEndOffset = 0;
+
if (curNextRead == nextWrite) {
// We used up our lookahead buffer of input tokens
@@ -317,6 +337,7 @@ public final class SynonymFilter extends
final PendingInput input = futureInputs[nextWrite];
input.startOffset = offsetAtt.startOffset();
input.endOffset = offsetAtt.endOffset();
+ inputEndOffset = input.endOffset;
//System.out.println(" new token=" + new String(buffer, 0, bufferLen));
if (nextRead != nextWrite) {
capture();
@@ -335,6 +356,7 @@ public final class SynonymFilter extends
// Still in our lookahead
buffer = futureInputs[curNextRead].term.chars;
bufferLen = futureInputs[curNextRead].term.length;
+ inputEndOffset = futureInputs[curNextRead].endOffset;
//System.out.println(" old token=" + new String(buffer, 0, bufferLen));
}
@@ -360,6 +382,7 @@ public final class SynonymFilter extends
if (scratchArc.isFinal()) {
matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
matchInputLength = tokenCount;
+ matchEndOffset = inputEndOffset;
//System.out.println(" found matchLength=" + matchInputLength + " output=" + matchOutput);
}
@@ -390,7 +413,7 @@ public final class SynonymFilter extends
if (matchOutput != null) {
//System.out.println(" add matchLength=" + matchInputLength + " output=" + matchOutput);
inputSkipCount = matchInputLength;
- addOutput(matchOutput, matchInputLength);
+ addOutput(matchOutput, matchInputLength, matchEndOffset);
} else if (nextRead != nextWrite) {
// Even though we had no match here, we set to 1
// because we need to skip current input token before
@@ -404,7 +427,7 @@ public final class SynonymFilter extends
}
// Interleaves all output tokens onto the futureOutputs:
- private void addOutput(BytesRef bytes, int matchInputLength) {
+ private void addOutput(BytesRef bytes, int matchInputLength, int matchEndOffset) {
bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);
final int code = bytesReader.readVInt();
@@ -425,7 +448,21 @@ public final class SynonymFilter extends
// Caller is not allowed to have empty string in
// the output:
assert outputLen > 0: "output contains empty string: " + scratchChars;
- futureOutputs[outputUpto].add(scratchChars.chars, lastStart, outputLen);
+ final int endOffset;
+ if (chIDX == chEnd && lastStart == scratchChars.offset) {
+ // This rule had a single output token, so, we set
+ // this output's endOffset to the current
+ // endOffset (ie, endOffset of the last input
+ // token it matched):
+ endOffset = matchEndOffset;
+ } else {
+ // This rule has more than one output token; we
+ // can't pick any particular endOffset for this
+ // case, so, we inherit the endOffset for the
+ // input token which this output overlaps:
+ endOffset = -1;
+ }
+ futureOutputs[outputUpto].add(scratchChars.chars, lastStart, outputLen, endOffset);
//System.out.println(" " + new String(scratchChars.chars, lastStart, outputLen) + " outputUpto=" + outputUpto);
lastStart = 1+chIDX;
//System.out.println(" slot=" + outputUpto + " keepOrig=" + keepOrig);
@@ -507,7 +544,11 @@ public final class SynonymFilter extends
clearAttributes();
termAtt.copyBuffer(output.chars, output.offset, output.length);
typeAtt.setType(TYPE_SYNONYM);
- offsetAtt.setOffset(input.startOffset, input.endOffset);
+ int endOffset = outputs.getLastEndOffset();
+ if (endOffset == -1) {
+ endOffset = input.endOffset;
+ }
+ offsetAtt.setOffset(input.startOffset, endOffset);
posIncrAtt.setPositionIncrement(posIncr);
if (outputs.count == 0) {
// Done with the buffered input and all outputs at
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java?rev=1228657&r1=1228656&r2=1228657&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java Sat Jan 7 16:47:34 2012
@@ -60,7 +60,12 @@ public class TestSynonymMapFilter extend
}
}
- // todo: we should probably refactor this guy to use/take analyzer,
+ // For the output string: separate positions with a space,
+ // and separate multiple tokens at each position with a
+ // /. If a token should have end offset != the input
+ // token's end offset then add :X to it:
+
+ // TODO: we should probably refactor this guy to use/take analyzer,
// the tests are a little messy
private void verify(String input, String output) throws Exception {
if (VERBOSE) {
@@ -74,7 +79,7 @@ public class TestSynonymMapFilter extend
while(tokensOut.incrementToken()) {
if (VERBOSE) {
- System.out.println(" incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement());
+ System.out.println(" incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement() + " startOff=" + offsetAtt.startOffset() + " endOff=" + offsetAtt.endOffset());
}
assertTrue(expectedUpto < expected.length);
@@ -86,16 +91,26 @@ public class TestSynonymMapFilter extend
if (atPos > 0) {
assertTrue(tokensOut.incrementToken());
if (VERBOSE) {
- System.out.println(" incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement());
+ System.out.println(" incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement() + " startOff=" + offsetAtt.startOffset() + " endOff=" + offsetAtt.endOffset());
}
}
- assertEquals(termAtt, expectedAtPos[atPos]);
+ final int colonIndex = expectedAtPos[atPos].indexOf(':');
+ final String expectedToken;
+ final int expectedEndOffset;
+ if (colonIndex != -1) {
+ expectedToken = expectedAtPos[atPos].substring(0, colonIndex);
+ expectedEndOffset = Integer.parseInt(expectedAtPos[atPos].substring(1+colonIndex));
+ } else {
+ expectedToken = expectedAtPos[atPos];
+ expectedEndOffset = endOffset;
+ }
+ assertEquals(expectedToken, termAtt.toString());
assertEquals(atPos == 0 ? 1 : 0,
posIncrAtt.getPositionIncrement());
// start offset of all tokens at same pos should be the
// same; end offset matches too unless overridden via :X
assertEquals(startOffset, offsetAtt.startOffset());
- assertEquals(endOffset, offsetAtt.endOffset());
+ assertEquals(expectedEndOffset, offsetAtt.endOffset());
}
}
tokensOut.end();
@@ -113,6 +128,7 @@ public class TestSynonymMapFilter extend
add("b c", "dog collar", true);
add("c d", "dog harness holder extras", true);
add("m c e", "dog barks loudly", false);
+ add("i j k", "feep", true);
add("e f", "foo bar", false);
add("e f", "baz bee", false);
@@ -149,6 +165,9 @@ public class TestSynonymMapFilter extend
// two outputs for same input
verify("e f", "foo/baz bar/bee");
+ // verify multi-word / single-output offsets:
+ verify("g i j k g", "g i/feep:7 j k g");
+
// mixed keepOrig true/false:
verify("a m c e x", "a/foo dog barks loudly x");
verify("c d m c e x", "c/dog d/harness holder/dog extras/barks loudly x");
@@ -242,6 +261,10 @@ public class TestSynonymMapFilter extend
} else {
outputs[matchIDX] = outputs[matchIDX] + "/" + synOutputs[synUpto++];
}
+ if (synOutputs.length == 1) {
+ // Add endOffset
+ outputs[matchIDX] = outputs[matchIDX] + ":" + ((inputIDX*2) + syn.in.length());
+ }
}
}
}
@@ -664,4 +687,24 @@ public class TestSynonymMapFilter extend
new String[] { "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" },
new int[] { 1, 0, 1, 1, 1, 0, 1 });
}
+
+ public void testMultiwordOffsets() throws Exception {
+ b = new SynonymMap.Builder(true);
+ final boolean keepOrig = true;
+ add("national hockey league", "nhl", keepOrig);
+ final SynonymMap map = b.build();
+ Analyzer a = new ReusableAnalyzerBase() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
+ }
+ };
+
+ assertAnalyzesTo(a, "national hockey league",
+ new String[] { "national", "nhl", "hockey", "league" },
+ new int[] { 0, 0, 9, 16 },
+ new int[] { 8, 22, 15, 22 },
+ new int[] { 1, 0, 1, 1 });
+ }
}
Modified: lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestSynonymFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestSynonymFilterFactory.java?rev=1228657&r1=1228656&r2=1228657&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestSynonymFilterFactory.java (original)
+++ lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestSynonymFilterFactory.java Sat Jan 7 16:47:34 2012
@@ -21,6 +21,7 @@ import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
+import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -65,6 +66,25 @@ public class TestSynonymFilterFactory ex
new int[] { 1, 0, 0, 0 });
}
+ /** test multiword offsets with the old impl
+ * @deprecated Remove this test in Lucene 5.0 */
+ @Deprecated
+ public void testMultiwordOffsetsOld() throws Exception {
+ SynonymFilterFactory factory = new SynonymFilterFactory();
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("luceneMatchVersion", Version.LUCENE_33.toString());
+ args.put("synonyms", "synonyms.txt");
+ factory.init(args);
+ factory.inform(new StringMockSolrResourceLoader("national hockey league, nhl"));
+ TokenStream ts = factory.create(new MockTokenizer(new StringReader("national hockey league"), MockTokenizer.WHITESPACE, false));
+ // Old impl gives every token, originals included, the offsets of the whole match (0-22):
+ assertTokenStreamContents(ts,
+ new String[] { "national", "nhl", "hockey", "league" },
+ new int[] { 0, 0, 0, 0 },
+ new int[] { 22, 22, 22, 22 },
+ new int[] { 1, 0, 1, 1 });
+ }
+
/** if the synonyms are completely empty, test that we still analyze correctly */
public void testEmptySynonyms() throws Exception {
SynonymFilterFactory factory = new SynonymFilterFactory();
@@ -85,7 +105,7 @@ public class TestSynonymFilterFactory ex
}
public List<String> getLines(String resource) throws IOException {
- return null;
+ return Arrays.asList(text.split("\n"));
}
public Object newInstance(String cname, String... subpackages) {