You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2013/09/17 18:26:37 UTC

svn commit: r1524117 - in /lucene/dev/trunk/lucene: ./ analysis/common/src/java/org/apache/lucene/analysis/shingle/ analysis/common/src/test/org/apache/lucene/analysis/shingle/ test-framework/src/java/org/apache/lucene/analysis/

Author: mikemccand
Date: Tue Sep 17 16:26:36 2013
New Revision: 1524117

URL: http://svn.apache.org/r1524117
Log:
LUCENE-5180: ShingleFilter creates shingles from trailing holes

Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
    lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1524117&r1=1524116&r2=1524117&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Tue Sep 17 16:26:36 2013
@@ -38,6 +38,10 @@ New Features
 * SOLR-3359: Added analyzer attribute/property to SynonymFilterFactory.
   (Ryo Onodera via Koji Sekiguchi)
 
+* LUCENE-5180: ShingleFilter now creates shingles with trailing holes,
+  for example if a StopFilter had removed the last token.  (Mike
+  McCandless)
+
 Optimizations
 
 * LUCENE-4848: Use Java 7 NIO2-FileChannel instead of RandomAccessFile

Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java?rev=1524117&r1=1524116&r2=1524117&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java Tue Sep 17 16:26:36 2013
@@ -147,6 +147,12 @@ public final class ShingleFilter extends
    * true if no shingles have been output yet (for outputUnigramsIfNoShingles).
    */
   boolean noShingleOutput = true;
+
+  /**
+   * Holds the State after input.end() was called, so we can
+   * restore it in our end() impl.
+   */
+  private State endState;
   
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
@@ -279,7 +285,7 @@ public final class ShingleFilter extends
   }
 
   @Override
-  public final boolean incrementToken() throws IOException {
+  public boolean incrementToken() throws IOException {
     boolean tokenAvailable = false;
     int builtGramSize = 0;
     if (gramSize.atMinValue() || inputWindow.size() < gramSize.getValue()) {
@@ -364,39 +370,63 @@ public final class ShingleFilter extends
       }
       isNextInputStreamToken = false;
       newTarget.isFiller = false;
-    } else if (!exhausted && input.incrementToken()) {
-      if (null == target) {
-        newTarget = new InputWindowToken(cloneAttributes());
-      } else {
-        this.copyTo(target.attSource);
-      }
-      if (posIncrAtt.getPositionIncrement() > 1) {
-        // Each output shingle must contain at least one input token, 
-        // so no more than (maxShingleSize - 1) filler tokens will be inserted.
-        numFillerTokensToInsert 
-          = Math.min(posIncrAtt.getPositionIncrement() - 1, maxShingleSize - 1);
-        // Save the current token as the next input stream token
-        if (null == nextInputStreamToken) {
-          nextInputStreamToken = cloneAttributes();
+    } else if (!exhausted) {
+      if (input.incrementToken()) {
+        if (null == target) {
+          newTarget = new InputWindowToken(cloneAttributes());
+        } else {
+          this.copyTo(target.attSource);
+        }
+        if (posIncrAtt.getPositionIncrement() > 1) {
+          // Each output shingle must contain at least one input token, 
+          // so no more than (maxShingleSize - 1) filler tokens will be inserted.
+          numFillerTokensToInsert = Math.min(posIncrAtt.getPositionIncrement() - 1, maxShingleSize - 1);
+          // Save the current token as the next input stream token
+          if (null == nextInputStreamToken) {
+            nextInputStreamToken = cloneAttributes();
+          } else {
+            this.copyTo(nextInputStreamToken);
+          }
+          isNextInputStreamToken = true;
+          // A filler token occupies no space
+          newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
+          newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
+          newTarget.isFiller = true;
+          --numFillerTokensToInsert;
         } else {
-          this.copyTo(nextInputStreamToken);
+          newTarget.isFiller = false;
         }
-        isNextInputStreamToken = true;
-        // A filler token occupies no space
-        newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
-        newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
-        newTarget.isFiller = true;
-        --numFillerTokensToInsert;
       } else {
-        newTarget.isFiller = false;
+        exhausted = true;
+        input.end();
+        endState = captureState();
+        numFillerTokensToInsert = Math.min(posIncrAtt.getPositionIncrement(), maxShingleSize - 1);
+        if (numFillerTokensToInsert > 0) {
+          nextInputStreamToken = new AttributeSource(getAttributeFactory());
+          nextInputStreamToken.addAttribute(CharTermAttribute.class);
+          OffsetAttribute newOffsetAtt = nextInputStreamToken.addAttribute(OffsetAttribute.class);
+          newOffsetAtt.setOffset(offsetAtt.endOffset(), offsetAtt.endOffset());
+          // Recurse/loop just once:
+          return getNextToken(target);
+        } else {
+          newTarget = null;
+        }
       }
     } else {
       newTarget = null;
-      exhausted = true;
     }
     return newTarget;
   }
 
+  @Override
+  public void end() throws IOException {
+    if (!exhausted) {
+      super.end();
+    } else {
+      restoreState(endState);
+    }
+  }
+
   /**
    * <p>Fills {@link #inputWindow} with input stream tokens, if available, 
    * shifting to the right if the window was previously full.
@@ -445,6 +475,7 @@ public final class ShingleFilter extends
     isOutputHere = false;
     noShingleOutput = true;
     exhausted = false;
+    endState = null;
     if (outputUnigramsIfNoShingles && ! outputUnigrams) {
       // Fix up gramSize if minValue was reset for outputUnigramsIfNoShingles
       gramSize.minValue = minShingleSize;

Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java?rev=1524117&r1=1524116&r2=1524117&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java Tue Sep 17 16:26:36 2013
@@ -24,6 +24,7 @@ import java.util.Random;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CannedTokenStream;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
@@ -34,41 +35,6 @@ import org.apache.lucene.analysis.tokena
 
 public class ShingleFilterTest extends BaseTokenStreamTestCase {
 
-  public class TestTokenStream extends TokenStream {
-
-    protected int index = 0;
-    protected Token[] testToken;
-    
-    private CharTermAttribute termAtt;
-    private OffsetAttribute offsetAtt;
-    private PositionIncrementAttribute posIncrAtt;
-    private TypeAttribute typeAtt;
-
-    public TestTokenStream(Token[] testToken) {
-      super();
-      this.testToken = testToken;
-      this.termAtt = addAttribute(CharTermAttribute.class);
-      this.offsetAtt = addAttribute(OffsetAttribute.class);
-      this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
-      this.typeAtt = addAttribute(TypeAttribute.class);
-    }
-
-    @Override
-    public final boolean incrementToken() {
-      clearAttributes();
-      if (index < testToken.length) {
-        Token t = testToken[index++];
-        termAtt.copyBuffer(t.buffer(), 0, t.length());
-        offsetAtt.setOffset(t.startOffset(), t.endOffset());
-        posIncrAtt.setPositionIncrement(t.getPositionIncrement());
-        typeAtt.setType(TypeAttribute.DEFAULT_TYPE);
-        return true;
-      } else {
-        return false;
-      }
-    }
-  }
-
   public static final Token[] TEST_TOKEN = new Token[] {
       createToken("please", 0, 6),
       createToken("divide", 7, 13),
@@ -1066,7 +1032,7 @@ public class ShingleFilterTest extends B
                                    boolean outputUnigrams)
     throws IOException {
 
-    ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
+    ShingleFilter filter = new ShingleFilter(new CannedTokenStream(tokensToShingle), maxSize);
     filter.setOutputUnigrams(outputUnigrams);
     shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
   }
@@ -1076,7 +1042,7 @@ public class ShingleFilterTest extends B
                                    String[] types, boolean outputUnigrams)
     throws IOException {
     ShingleFilter filter 
-      = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
+      = new ShingleFilter(new CannedTokenStream(tokensToShingle), minSize, maxSize);
     filter.setOutputUnigrams(outputUnigrams);
     shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
   }
@@ -1087,7 +1053,7 @@ public class ShingleFilterTest extends B
                                    boolean outputUnigramsIfNoShingles)
     throws IOException {
     ShingleFilter filter 
-      = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
+      = new ShingleFilter(new CannedTokenStream(tokensToShingle), minSize, maxSize);
     filter.setOutputUnigrams(outputUnigrams);
     filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
     shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
@@ -1098,7 +1064,7 @@ public class ShingleFilterTest extends B
                                    String[] types, boolean outputUnigrams)
     throws IOException {
     ShingleFilter filter 
-      = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
+      = new ShingleFilter(new CannedTokenStream(tokensToShingle), minSize, maxSize);
     filter.setTokenSeparator(tokenSeparator);
     filter.setOutputUnigrams(outputUnigrams);
     shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
@@ -1170,4 +1136,63 @@ public class ShingleFilterTest extends B
     };
     checkOneTermReuse(a, "", "");
   }
+
+  public void testTrailingHole1() throws IOException {
+    // Analyzing "wizard of", where of is removed as a
+    // stopword leaving a trailing hole:
+    Token[] inputTokens = new Token[] {createToken("wizard", 0, 6)};
+    ShingleFilter filter = new ShingleFilter(new CannedTokenStream(1, 9, inputTokens), 2, 2);
+
+    assertTokenStreamContents(filter,
+                              new String[] {"wizard", "wizard _"},
+                              new int[] {0, 0},
+                              new int[] {6, 9},
+                              new int[] {1, 0},
+                              9);
+  }
+
+  public void testTrailingHole2() throws IOException {
+    // Analyzing "purple wizard of", where of is removed as a
+    // stopword leaving a trailing hole:
+    Token[] inputTokens = new Token[] {createToken("purple", 0, 6),
+                                       createToken("wizard", 7, 13)};
+    ShingleFilter filter = new ShingleFilter(new CannedTokenStream(1, 16, inputTokens), 2, 2);
+
+    assertTokenStreamContents(filter,
+                              new String[] {"purple", "purple wizard", "wizard", "wizard _"},
+                              new int[] {0, 0, 7, 7},
+                              new int[] {6, 13, 13, 16},
+                              new int[] {1, 0, 1, 0},
+                              16);
+  }
+
+  public void testTwoTrailingHoles() throws IOException {
+    // Analyzing "purple wizard of the", where of and the are removed as
+    // stopwords, leaving two trailing holes:
+    Token[] inputTokens = new Token[] {createToken("purple", 0, 6),
+                                       createToken("wizard", 7, 13)};
+    ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 2);
+
+    assertTokenStreamContents(filter,
+                              new String[] {"purple", "purple wizard", "wizard", "wizard _"},
+                              new int[] {0, 0, 7, 7},
+                              new int[] {6, 13, 13, 20},
+                              new int[] {1, 0, 1, 0},
+                              20);
+  }
+
+  public void testTwoTrailingHolesTriShingle() throws IOException {
+    // Analyzing "purple wizard of the", where of and the are removed as
+    // stopwords, leaving two trailing holes:
+    Token[] inputTokens = new Token[] {createToken("purple", 0, 6),
+                                       createToken("wizard", 7, 13)};
+    ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
+
+    assertTokenStreamContents(filter,
+                              new String[] {"purple", "purple wizard", "purple wizard _", "wizard", "wizard _", "wizard _ _"},
+                              new int[] {0, 0, 0, 7, 7, 7},
+                              new int[] {6, 13, 20, 13, 20, 20},
+                              new int[] {1, 0, 0, 1, 0, 0},
+                              20);
+  }
 }

Modified: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java?rev=1524117&r1=1524116&r2=1524117&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java (original)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java Tue Sep 17 16:26:36 2013
@@ -17,6 +17,8 @@ package org.apache.lucene.analysis;
  * limitations under the License.
  */
 
+import java.io.IOException;
+
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
@@ -34,9 +36,28 @@ public final class CannedTokenStream ext
   private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
-  
+  private final int finalOffset;
+  private final int finalPosInc;
+
   public CannedTokenStream(Token... tokens) {
     this.tokens = tokens;
+    finalOffset = 0;
+    finalPosInc = 0;
+  }
+
+  /** If you want trailing holes, pass a non-zero
+   *  finalPosInc. */
+  public CannedTokenStream(int finalPosInc, int finalOffset, Token... tokens) {
+    this.tokens = tokens;
+    this.finalOffset = finalOffset;
+    this.finalPosInc = finalPosInc;
+  }
+
+  @Override
+  public void end() throws IOException {
+    super.end();
+    posIncrAtt.setPositionIncrement(finalPosInc);
+    offsetAtt.setOffset(finalOffset, finalOffset);
   }
   
   @Override