You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/01/07 17:47:34 UTC
svn commit: r1228657 - in /lucene/dev/branches/branch_3x: ./ lucene/ lucene/contrib/ lucene/contrib/analyzers/ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/ lucene/contrib/analyzers/common/src/test/org/apache/lucene/analy...

Author: mikemccand
Date: Sat Jan  7 16:47:34 2012
New Revision: 1228657

URL: http://svn.apache.org/viewvc?rev=1228657&view=rev
Log:
LUCENE-3668: if there's only 1 output for a synonym rule then set start/endOffset to match the full span of the input tokens

Modified:
    lucene/dev/branches/branch_3x/   (props changed)
    lucene/dev/branches/branch_3x/lucene/   (props changed)
    lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/   (props changed)
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java
    lucene/dev/branches/branch_3x/lucene/contrib/icu/   (props changed)
    lucene/dev/branches/branch_3x/solr/   (props changed)
    lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestSynonymFilterFactory.java

Modified: lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt?rev=1228657&r1=1228656&r2=1228657&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt Sat Jan  7 16:47:34 2012
@@ -53,6 +53,14 @@ Bug Fixes
  * LUCENE-3609: Fix regression in BooleanFilter, introduced in Lucene 3.5,
    to correctly handle minShouldMatch behaviour of previous versions.
    (Shay Banon, Uwe Schindler)
+
+ * LUCENE-3668: For a multi-token synonym mapping to a single token,
+   SynonymFilter will now set the start offset of the synonym token to
+   the start offset of the first matched token, and the end offset of
+   the synonym token to the end offset of the last matched token.
+   This way if the synonym token is used for highlighting, it will
+   cover all tokens it had matched.  (Koji Sekiguchi, Robert Muir,
+   Mike McCandless)
  
 Documentation
 

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java?rev=1228657&r1=1228656&r2=1228657&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java Sat Jan  7 16:47:34 2012
@@ -153,12 +153,15 @@ public final class SynonymFilter extends
   // Holds pending output synonyms for one future position:
   private static class PendingOutputs {
     CharsRef[] outputs;
+    int[] endOffsets;
     int upto;
     int count;
     int posIncr = 1;
+    int lastEndOffset;
 
     public PendingOutputs() {
       outputs = new CharsRef[1];
+      endOffsets = new int[1];
     }
 
     public void reset() {
@@ -168,6 +171,7 @@ public final class SynonymFilter extends
 
     public CharsRef pullNext() {
       assert upto < count;
+      lastEndOffset = endOffsets[upto];
       final CharsRef result = outputs[upto++];
       posIncr = 0;
       if (upto == count) {
@@ -176,16 +180,29 @@ public final class SynonymFilter extends
       return result;
     }
 
-    public void add(char[] output, int offset, int len) {
+    public int getLastEndOffset() {
+      return lastEndOffset;
+    }
+
+    public void add(char[] output, int offset, int len, int endOffset) {
       if (count == outputs.length) {
         final CharsRef[] next = new CharsRef[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
         System.arraycopy(outputs, 0, next, 0, count);
         outputs = next;
       }
+      if (count == endOffsets.length) {
+        final int[] next = new int[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_INT)];
+        System.arraycopy(endOffsets, 0, next, 0, count);
+        endOffsets = next;
+      }
       if (outputs[count] == null) {
         outputs[count] = new CharsRef();
       }
       outputs[count].copyChars(output, offset, len);
+      // endOffset can be -1, in which case we should simply
+      // use the endOffset of the input token, or X >= 0, in
+      // which case we use X as the endOffset for this output
+      endOffsets[count] = endOffset;
       count++;
     }
   };
@@ -281,6 +298,7 @@ public final class SynonymFilter extends
     // Holds the longest match we've seen so far:
     BytesRef matchOutput = null;
     int matchInputLength = 0;
+    int matchEndOffset = -1;
 
     BytesRef pendingOutput = fst.outputs.getNoOutput();
     fst.getFirstArc(scratchArc);
@@ -297,6 +315,8 @@ public final class SynonymFilter extends
       final int bufferLen;
       //System.out.println("  cycle nextRead=" + curNextRead + " nextWrite=" + nextWrite);
 
+      int inputEndOffset = 0;
+
       if (curNextRead == nextWrite) {
 
         // We used up our lookahead buffer of input tokens
@@ -317,6 +337,7 @@ public final class SynonymFilter extends
             final PendingInput input = futureInputs[nextWrite];
             input.startOffset = offsetAtt.startOffset();
             input.endOffset = offsetAtt.endOffset();
+            inputEndOffset = input.endOffset;
             //System.out.println("  new token=" + new String(buffer, 0, bufferLen));
             if (nextRead != nextWrite) {
               capture();
@@ -335,6 +356,7 @@ public final class SynonymFilter extends
         // Still in our lookahead
         buffer = futureInputs[curNextRead].term.chars;
         bufferLen = futureInputs[curNextRead].term.length;
+        inputEndOffset = futureInputs[curNextRead].endOffset;
         //System.out.println("  old token=" + new String(buffer, 0, bufferLen));
       }
 
@@ -360,6 +382,7 @@ public final class SynonymFilter extends
       if (scratchArc.isFinal()) {
         matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
         matchInputLength = tokenCount;
+        matchEndOffset = inputEndOffset;
         //System.out.println("  found matchLength=" + matchInputLength + " output=" + matchOutput);
       }
 
@@ -390,7 +413,7 @@ public final class SynonymFilter extends
     if (matchOutput != null) {
       //System.out.println("  add matchLength=" + matchInputLength + " output=" + matchOutput);
       inputSkipCount = matchInputLength;
-      addOutput(matchOutput, matchInputLength);
+      addOutput(matchOutput, matchInputLength, matchEndOffset);
     } else if (nextRead != nextWrite) {
       // Even though we had no match here, we set to 1
       // because we need to skip current input token before
@@ -404,7 +427,7 @@ public final class SynonymFilter extends
   }
 
   // Interleaves all output tokens onto the futureOutputs:
-  private void addOutput(BytesRef bytes, int matchInputLength) {
+  private void addOutput(BytesRef bytes, int matchInputLength, int matchEndOffset) {
     bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);
 
     final int code = bytesReader.readVInt();
@@ -425,7 +448,21 @@ public final class SynonymFilter extends
           // Caller is not allowed to have empty string in
           // the output:
           assert outputLen > 0: "output contains empty string: " + scratchChars;
-          futureOutputs[outputUpto].add(scratchChars.chars, lastStart, outputLen);
+          final int endOffset;
+          if (chIDX == chEnd && lastStart == scratchChars.offset) {
+            // This rule has a single output token, so we set
+            // this output's endOffset to the current
+            // endOffset (ie, endOffset of the last input
+            // token it matched):
+            endOffset = matchEndOffset;
+          } else {
+            // This rule has more than one output token; we
+            // can't pick any particular endOffset for this
+            // case, so, we inherit the endOffset for the
+            // input token which this output overlaps:
+            endOffset = -1;
+          }
+          futureOutputs[outputUpto].add(scratchChars.chars, lastStart, outputLen, endOffset);
           //System.out.println("      " + new String(scratchChars.chars, lastStart, outputLen) + " outputUpto=" + outputUpto);
           lastStart = 1+chIDX;
           //System.out.println("  slot=" + outputUpto + " keepOrig=" + keepOrig);
@@ -507,7 +544,11 @@ public final class SynonymFilter extends
           clearAttributes();
           termAtt.copyBuffer(output.chars, output.offset, output.length);
           typeAtt.setType(TYPE_SYNONYM);
-          offsetAtt.setOffset(input.startOffset, input.endOffset);
+          int endOffset = outputs.getLastEndOffset();
+          if (endOffset == -1) {
+            endOffset = input.endOffset;
+          }
+          offsetAtt.setOffset(input.startOffset, endOffset);
           posIncrAtt.setPositionIncrement(posIncr);
           if (outputs.count == 0) {
             // Done with the buffered input and all outputs at

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java?rev=1228657&r1=1228656&r2=1228657&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java Sat Jan  7 16:47:34 2012
@@ -60,7 +60,12 @@ public class TestSynonymMapFilter extend
     }
   }
 
-  // todo: we should probably refactor this guy to use/take analyzer,
+  // For the output string: separate positions with a space,
+  // and separate multiple tokens at each position with a
+  // /.  If a token should have end offset != the input
+  // token's end offset then add :X to it:
+
+  // TODO: we should probably refactor this guy to use/take analyzer,
   // the tests are a little messy
   private void verify(String input, String output) throws Exception {
     if (VERBOSE) {
@@ -74,7 +79,7 @@ public class TestSynonymMapFilter extend
     while(tokensOut.incrementToken()) {
 
       if (VERBOSE) {
-        System.out.println("  incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement());
+        System.out.println("  incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement() + " startOff=" + offsetAtt.startOffset() + " endOff=" + offsetAtt.endOffset());
       }
 
       assertTrue(expectedUpto < expected.length);
@@ -86,16 +91,26 @@ public class TestSynonymMapFilter extend
         if (atPos > 0) {
           assertTrue(tokensOut.incrementToken());
           if (VERBOSE) {
-            System.out.println("  incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement());
+            System.out.println("  incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement() + " startOff=" + offsetAtt.startOffset() + " endOff=" + offsetAtt.endOffset());
           }
         }
-        assertEquals(termAtt, expectedAtPos[atPos]);
+        final int colonIndex = expectedAtPos[atPos].indexOf(':');
+        final String expectedToken;
+        final int expectedEndOffset;
+        if (colonIndex != -1) {
+          expectedToken = expectedAtPos[atPos].substring(0, colonIndex);
+          expectedEndOffset = Integer.parseInt(expectedAtPos[atPos].substring(1+colonIndex));
+        } else {
+          expectedToken = expectedAtPos[atPos];
+          expectedEndOffset = endOffset;
+        }
+        assertEquals(expectedToken, termAtt.toString());
         assertEquals(atPos == 0 ? 1 : 0,
                      posIncrAtt.getPositionIncrement());
         // start/end offset of all tokens at same pos should
         // be the same:
         assertEquals(startOffset, offsetAtt.startOffset());
-        assertEquals(endOffset, offsetAtt.endOffset());
+        assertEquals(expectedEndOffset, offsetAtt.endOffset());
       }
     }
     tokensOut.end();
@@ -113,6 +128,7 @@ public class TestSynonymMapFilter extend
     add("b c", "dog collar", true);
     add("c d", "dog harness holder extras", true);
     add("m c e", "dog barks loudly", false);
+    add("i j k", "feep", true);
 
     add("e f", "foo bar", false);
     add("e f", "baz bee", false);
@@ -149,6 +165,9 @@ public class TestSynonymMapFilter extend
     // two outputs for same input
     verify("e f", "foo/baz bar/bee");
 
+    // verify multi-word / single-output offsets:
+    verify("g i j k g", "g i/feep:7 j k g");
+
     // mixed keepOrig true/false:
     verify("a m c e x", "a/foo dog barks loudly x");
     verify("c d m c e x", "c/dog d/harness holder/dog extras/barks loudly x");
@@ -242,6 +261,10 @@ public class TestSynonymMapFilter extend
           } else {
             outputs[matchIDX] = outputs[matchIDX] + "/" + synOutputs[synUpto++];
           }
+          if (synOutputs.length == 1) {
+            // Add endOffset
+            outputs[matchIDX] = outputs[matchIDX] + ":" + ((inputIDX*2) + syn.in.length());
+          }
         }
       }
     }
@@ -664,4 +687,24 @@ public class TestSynonymMapFilter extend
         new String[] { "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" },
         new int[] { 1, 0, 1, 1, 1, 0, 1 });
   }
+  
+  public void testMultiwordOffsets() throws Exception {
+    b = new SynonymMap.Builder(true);
+    final boolean keepOrig = true;
+    add("national hockey league", "nhl", keepOrig);
+    final SynonymMap map = b.build();
+    Analyzer a = new ReusableAnalyzerBase() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
+      }
+    };
+    
+    assertAnalyzesTo(a, "national hockey league",
+        new String[] { "national", "nhl", "hockey", "league" },
+        new int[] { 0, 0, 9, 16 },
+        new int[] { 8, 22, 15, 22 },
+        new int[] { 1, 0, 1, 1 });
+  }
 }

Modified: lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestSynonymFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestSynonymFilterFactory.java?rev=1228657&r1=1228656&r2=1228657&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestSynonymFilterFactory.java (original)
+++ lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestSynonymFilterFactory.java Sat Jan  7 16:47:34 2012
@@ -21,6 +21,7 @@ import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.StringReader;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -65,6 +66,25 @@ public class TestSynonymFilterFactory ex
         new int[] { 1, 0, 0, 0 });
   }
   
+  /** test multiword offsets with the old impl
+   * @deprecated Remove this test in Lucene 5.0 */
+  @Deprecated
+  public void testMultiwordOffsetsOld() throws Exception {
+    SynonymFilterFactory factory = new SynonymFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("luceneMatchVersion", Version.LUCENE_33.toString());
+    args.put("synonyms", "synonyms.txt");
+    factory.init(args);
+    factory.inform(new StringMockSolrResourceLoader("national hockey league, nhl"));
+    TokenStream ts = factory.create(new MockTokenizer(new StringReader("national hockey league"), MockTokenizer.WHITESPACE, false));
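+    // With luceneMatchVersion=LUCENE_33 (the old impl) every token is expected to carry offsets 0-22, i.e. the full input span: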
+    assertTokenStreamContents(ts, 
+        new String[] { "national", "nhl", "hockey", "league" },
+        new int[] { 0, 0, 0, 0 },
+        new int[] { 22, 22, 22, 22 },
+        new int[] { 1, 0, 1, 1 });
+  }
+  
   /** if the synonyms are completely empty, test that we still analyze correctly */
   public void testEmptySynonyms() throws Exception {
     SynonymFilterFactory factory = new SynonymFilterFactory();
@@ -85,7 +105,7 @@ public class TestSynonymFilterFactory ex
     }
 
     public List<String> getLines(String resource) throws IOException {
-      return null;
+      return Arrays.asList(text.split("\n"));
     }
 
     public Object newInstance(String cname, String... subpackages) {