You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2010/10/09 18:55:23 UTC

svn commit: r1006187 - in /lucene/dev/trunk/modules/analysis: ./ common/src/java/org/apache/lucene/analysis/shingle/ common/src/test/org/apache/lucene/analysis/shingle/

Author: sarowe
Date: Sat Oct  9 16:55:23 2010
New Revision: 1006187

URL: http://svn.apache.org/viewvc?rev=1006187&view=rev
Log:
LUCENE-1370: Added ShingleFilter option to output unigrams if no shingles can be generated.

Modified:
    lucene/dev/trunk/modules/analysis/CHANGES.txt
    lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java
    lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
    lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java
    lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java

Modified: lucene/dev/trunk/modules/analysis/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/CHANGES.txt?rev=1006187&r1=1006186&r2=1006187&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/CHANGES.txt (original)
+++ lucene/dev/trunk/modules/analysis/CHANGES.txt Sat Oct  9 16:55:23 2010
@@ -15,6 +15,9 @@ API Changes
    RFCs.  ClassicTokenizer/Analyzer retains the old StandardTokenizer/Analyzer
    behavior.  (Steven Rowe, Robert Muir, Uwe Schindler)
 
+ * LUCENE-1370: Added ShingleFilter option to output unigrams if no shingles
+   can be generated. (Chris Harris via Steven Rowe)
+   
 New Features
    
  * LUCENE-2413: Consolidated Solr analysis components into common. 

Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java?rev=1006187&r1=1006186&r2=1006187&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java Sat Oct  9 16:55:23 2010
@@ -38,6 +38,7 @@ public final class ShingleAnalyzerWrappe
   private int minShingleSize = ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE;
   private String tokenSeparator = ShingleFilter.TOKEN_SEPARATOR;
   private boolean outputUnigrams = true;
+  private boolean outputUnigramsIfNoShingles = false;
 
   public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) {
     super();
@@ -147,6 +148,24 @@ public final class ShingleAnalyzerWrappe
   public void setOutputUnigrams(boolean outputUnigrams) {
     this.outputUnigrams = outputUnigrams;
   }
+  
+  public boolean isOutputUnigramsIfNoShingles() {
+    return outputUnigramsIfNoShingles;
+  }
+  
+  /**
+   * <p>Shall we override the behavior of outputUnigrams==false for those
+   * times when no shingles are available (because there are fewer than
+   * minShingleSize tokens in the input stream)? (default: false.)
+   * <p>Note that if outputUnigrams==true, then unigrams are always output,
+   * regardless of whether any shingles are available.
+   *
+   * @param outputUnigramsIfNoShingles Whether or not to output unigrams
+   *  when no shingles are available.
+   */
+  public void setOutputUnigramsIfNoShingles(boolean outputUnigramsIfNoShingles) {
+    this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
+  }
 
   @Override
   public TokenStream tokenStream(String fieldName, Reader reader) {
@@ -161,6 +180,7 @@ public final class ShingleAnalyzerWrappe
     filter.setMaxShingleSize(maxShingleSize);
     filter.setTokenSeparator(tokenSeparator);
     filter.setOutputUnigrams(outputUnigrams);
+    filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
     return filter;
   }
   
@@ -192,6 +212,7 @@ public final class ShingleAnalyzerWrappe
     streams.shingle.setMinShingleSize(minShingleSize);
     streams.shingle.setTokenSeparator(tokenSeparator);
     streams.shingle.setOutputUnigrams(outputUnigrams);
+    streams.shingle.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
     return streams.shingle;
   }
 }

Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java?rev=1006187&r1=1006186&r2=1006187&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java Sat Oct  9 16:55:23 2010
@@ -103,6 +103,11 @@ public final class ShingleFilter extends
   private boolean outputUnigrams = true;
 
   /**
+   * By default, we don't override behavior of outputUnigrams.
+   */
+  private boolean outputUnigramsIfNoShingles = false;
+ 
+  /**
    * maximum shingle size (number of tokens)
    */
   private int maxShingleSize;
@@ -136,6 +141,11 @@ public final class ShingleFilter extends
    * position.
    */
   private boolean isOutputHere = false;
+
+  /**
+   * true if no shingles have been output yet (for outputUnigramsIfNoShingles).
+   */
+  boolean noShingleOutput = true;
   
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
@@ -212,6 +222,20 @@ public final class ShingleFilter extends
   }
 
   /**
+   * <p>Shall we override the behavior of outputUnigrams==false for those
+   * times when no shingles are available (because there are fewer than
+   * minShingleSize tokens in the input stream)? (default: false.)
+   * <p>Note that if outputUnigrams==true, then unigrams are always output,
+   * regardless of whether any shingles are available.
+   *
+   * @param outputUnigramsIfNoShingles Whether or not to output unigrams
+   * when no shingles are available.
+   */
+  public void setOutputUnigramsIfNoShingles(boolean outputUnigramsIfNoShingles) {
+    this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
+  }
+
+  /**
    * Set the max shingle size (default: 2)
    *
    * @param maxShingleSize max size of output shingles
@@ -292,6 +316,7 @@ public final class ShingleFilter extends
         termAtt.setEmpty().append(gramBuilder);
         if (gramSize.getValue() > 1) {
           typeAtt.setType(tokenType);
+          noShingleOutput = false;
         }
         offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
         isOutputHere = true;
@@ -395,6 +420,10 @@ public final class ShingleFilter extends
         }
       }
     }
+    if (outputUnigramsIfNoShingles && noShingleOutput 
+        && gramSize.minValue > 1 && inputWindow.size() < minShingleSize) {
+      gramSize.minValue = 1;
+    }
     gramSize.reset();
     isOutputHere = false;
   }
@@ -406,6 +435,11 @@ public final class ShingleFilter extends
     inputWindow.clear();
     numFillerTokensToInsert = 0;
     isOutputHere = false;
+    noShingleOutput = true;    
+    if (outputUnigramsIfNoShingles && ! outputUnigrams) {
+      // Fix up gramSize if minValue was reset for outputUnigramsIfNoShingles
+      gramSize.minValue = minShingleSize;
+    }
   }
 
 

Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java?rev=1006187&r1=1006186&r2=1006187&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java Sat Oct  9 16:55:23 2010
@@ -359,4 +359,16 @@ public class ShingleAnalyzerWrapperTest 
                           new int[] { 13, 18, 27 },
                           new int[] {  1,  1,  1 });
   }
+  
+  public void testOutputUnigramsIfNoShinglesSingleToken() throws Exception {
+    ShingleAnalyzerWrapper analyzer
+      = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
+    analyzer.setOutputUnigrams(false);
+    analyzer.setOutputUnigramsIfNoShingles(true);
+    assertAnalyzesToReuse(analyzer, "please",
+                          new String[] { "please" },
+                          new int[] { 0 },
+                          new int[] { 6 },
+                          new int[] { 1 });
+  }
 }

Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java?rev=1006187&r1=1006186&r2=1006187&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java Sat Oct  9 16:55:23 2010
@@ -73,6 +73,14 @@ public class ShingleFilterTest extends B
       createToken("shingles", 33, 39),
   };
 
+  public static final int[] UNIGRAM_ONLY_POSITION_INCREMENTS = new int[] {
+    1, 1, 1, 1, 1, 1
+  };
+
+  public static final String[] UNIGRAM_ONLY_TYPES = new String[] {
+    "word", "word", "word", "word", "word", "word"
+  };
+
   public static Token[] testTokenWithHoles;
 
   public static final Token[] BI_GRAM_TOKENS = new Token[] {
@@ -1018,15 +1026,44 @@ public class ShingleFilterTest extends B
       new int[]{1,0,1,0,1,0,1}
     );
   }
-  
+
+  public void testOutputUnigramsIfNoShinglesSingleTokenCase() throws IOException {
+    // Single token input with outputUnigrams==false is the primary case where
+    // enabling this option should alter program behavior.
+    this.shingleFilterTest(2, 2, TEST_SINGLE_TOKEN, SINGLE_TOKEN,
+                           SINGLE_TOKEN_INCREMENTS, SINGLE_TOKEN_TYPES,
+                           false, true);
+  }
+ 
+  public void testOutputUnigramsIfNoShinglesWithSimpleBigram() throws IOException {
+    // Here we expect the same result as with testBiGramFilter().
+    this.shingleFilterTest(2, 2, TEST_TOKEN, BI_GRAM_TOKENS,
+                           BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES,
+                           true, true);
+  }
+
+  public void testOutputUnigramsIfNoShinglesWithSimpleUnigramlessBigram() throws IOException {
+    // Here we expect the same result as with testBiGramFilterWithoutUnigrams().
+    this.shingleFilterTest(2, 2, TEST_TOKEN, BI_GRAM_TOKENS_WITHOUT_UNIGRAMS,
+                           BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS, BI_GRAM_TYPES_WITHOUT_UNIGRAMS,
+                           false, true);
+  }
+
+  public void testOutputUnigramsIfNoShinglesWithMultipleInputTokens() throws IOException {
+    // Test when the minimum shingle size is greater than the number of input tokens
+    this.shingleFilterTest(7, 7, TEST_TOKEN, TEST_TOKEN, 
+                           UNIGRAM_ONLY_POSITION_INCREMENTS, UNIGRAM_ONLY_TYPES,
+                           false, true);
+  }
+
   protected void shingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare,
                                    int[] positionIncrements, String[] types,
                                    boolean outputUnigrams)
     throws IOException {
 
     ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
-    shingleFilterTestCommon
-      (filter, tokensToCompare, positionIncrements, types, outputUnigrams);
+    filter.setOutputUnigrams(outputUnigrams);
+    shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
   }
 
   protected void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle, 
@@ -1035,8 +1072,20 @@ public class ShingleFilterTest extends B
     throws IOException {
     ShingleFilter filter 
       = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
-    shingleFilterTestCommon
-      (filter, tokensToCompare, positionIncrements, types, outputUnigrams);
+    filter.setOutputUnigrams(outputUnigrams);
+    shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
+  }
+
+  protected void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle, 
+                                   Token[] tokensToCompare, int[] positionIncrements,
+                                   String[] types, boolean outputUnigrams, 
+                                   boolean outputUnigramsIfNoShingles)
+    throws IOException {
+    ShingleFilter filter 
+      = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
+    filter.setOutputUnigrams(outputUnigrams);
+    filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
+    shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
   }
 
   protected void shingleFilterTest(String tokenSeparator, int minSize, int maxSize, Token[] tokensToShingle, 
@@ -1046,18 +1095,15 @@ public class ShingleFilterTest extends B
     ShingleFilter filter 
       = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
     filter.setTokenSeparator(tokenSeparator);
-    shingleFilterTestCommon
-      (filter, tokensToCompare, positionIncrements, types, outputUnigrams);
+    filter.setOutputUnigrams(outputUnigrams);
+    shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
   }
 
   protected void shingleFilterTestCommon(ShingleFilter filter,
                                          Token[] tokensToCompare,
                                          int[] positionIncrements,
-                                         String[] types, boolean outputUnigrams)
+                                         String[] types)
     throws IOException {
-
-    filter.setOutputUnigrams(outputUnigrams);
-
     String text[] = new String[tokensToCompare.length];
     int startOffsets[] = new int[tokensToCompare.length];
     int endOffsets[] = new int[tokensToCompare.length];