You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by si...@apache.org on 2010/01/10 19:06:20 UTC

svn commit: r897672 - in /lucene/java/trunk/contrib: CHANGES.txt analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java

Author: simonw
Date: Sun Jan 10 18:06:19 2010
New Revision: 897672

URL: http://svn.apache.org/viewvc?rev=897672&view=rev
Log:
LUCENE-2199: ShingleFilter skipped over tri-gram shingles if outputUnigram was set to false

Modified:
    lucene/java/trunk/contrib/CHANGES.txt
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java

Modified: lucene/java/trunk/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=897672&r1=897671&r2=897672&view=diff
==============================================================================
--- lucene/java/trunk/contrib/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/CHANGES.txt Sun Jan 10 18:06:19 2010
@@ -18,6 +18,9 @@
 
 Bug fixes
 
+ * LUCENE-2199: ShingleFilter skipped over tri-gram shingles if outputUnigram
+   was set to false. (Simon Willnauer)
+
  * LUCENE-2068: Fixed ReverseStringFilter which was not aware of supplementary
    characters. During reverse the filter created unpaired surrogates, which
    will be replaced by U+FFFD by the indexer, but not at query time. The filter

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java?rev=897672&r1=897671&r2=897672&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java Sun Jan 10 18:06:19 2010
@@ -182,7 +182,7 @@
           shingleBufferPosition++;
           return true;
         }
-      } else {
+      } else if (shingleBufferPosition % this.maxShingleSize == 0){
         shingleBufferPosition++;
       }
   
@@ -197,7 +197,7 @@
           termBuffer = termAtt.resizeTermBuffer(termLength);
         buf.getChars(0, termLength, termBuffer, 0);
         termAtt.setTermLength(termLength);
-        if ((! outputUnigrams) && shingleBufferPosition == 1) {
+        if ((! outputUnigrams) && shingleBufferPosition % this.maxShingleSize == 1) {
           posIncrAtt.setPositionIncrement(1);
         } else {
           posIncrAtt.setPositionIncrement(0);

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java?rev=897672&r1=897671&r2=897672&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java Sun Jan 10 18:06:19 2010
@@ -200,6 +200,93 @@
     "word", "shingle",
     "word"
   };
+  
+  public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS = new Token[] {
+    createToken("please divide", 0, 13),
+    createToken("please divide this", 0, 18),
+    createToken("divide this", 7, 18),
+    createToken("divide this sentence", 7, 27),
+    createToken("this sentence", 14, 27),
+    createToken("this sentence into", 14, 32),
+    createToken("sentence into", 19, 32),
+    createToken("sentence into shingles", 19, 39),
+    createToken("into shingles", 28, 39),
+  };
+
+  public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS = new int[] {
+    1, 0, 1, 0, 1, 0, 1, 0, 1
+  };
+  
+  public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS = new String[] {
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle",
+  };
+  
+  public static final Token[] FOUR_GRAM_TOKENS = new Token[] {
+    createToken("please", 0, 6),
+    createToken("please divide", 0, 13),
+    createToken("please divide this", 0, 18),
+    createToken("please divide this sentence", 0, 27),
+    createToken("divide", 7, 13),
+    createToken("divide this", 7, 18),
+    createToken("divide this sentence", 7, 27),
+    createToken("divide this sentence into", 7, 32),
+    createToken("this", 14, 18),
+    createToken("this sentence", 14, 27),
+    createToken("this sentence into", 14, 32),
+    createToken("this sentence into shingles", 14, 39),
+    createToken("sentence", 19, 27),
+    createToken("sentence into", 19, 32),
+    createToken("sentence into shingles", 19, 39),
+    createToken("into", 28, 32),
+    createToken("into shingles", 28, 39),
+    createToken("shingles", 33, 39)
+  };
+
+  public static final int[] FOUR_GRAM_POSITION_INCREMENTS = new int[] {
+    1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1
+  };
+
+  public static final String[] FOUR_GRAM_TYPES = new String[] {
+    "word", "shingle", "shingle", "shingle",
+    "word", "shingle", "shingle", "shingle",
+    "word", "shingle", "shingle", "shingle",
+    "word", "shingle", "shingle",
+    "word", "shingle",
+    "word"
+  };
+  
+  public static final Token[] FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS = new Token[] {
+    createToken("please divide", 0, 13),
+    createToken("please divide this", 0, 18),
+    createToken("please divide this sentence", 0, 27),
+    createToken("divide this", 7, 18),
+    createToken("divide this sentence", 7, 27),
+    createToken("divide this sentence into", 7, 32),
+    createToken("this sentence", 14, 27),
+    createToken("this sentence into", 14, 32),
+    createToken("this sentence into shingles", 14, 39),
+    createToken("sentence into", 19, 32),
+    createToken("sentence into shingles", 19, 39),
+    createToken("into shingles", 28, 39),
+  };
+
+  public static final int[] FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS = new int[] {
+    1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
+  };
+  
+  public static final String[] FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS = new String[] {
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle", "shingle",
+
+  };
 
 
   @Override
@@ -272,8 +359,25 @@
                            TRI_GRAM_POSITION_INCREMENTS, TRI_GRAM_TYPES,
                            true);
   }
-
-
+  
+  public void testTriGramFilterWithoutUnigrams() throws IOException {
+    this.shingleFilterTest(3, TEST_TOKEN, TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS,
+                           TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS, TRI_GRAM_TYPES_WITHOUT_UNIGRAMS,
+                           false);
+  }
+  
+  public void testFourGramFilter() throws IOException {
+    this.shingleFilterTest(4, TEST_TOKEN, FOUR_GRAM_TOKENS,
+        FOUR_GRAM_POSITION_INCREMENTS, FOUR_GRAM_TYPES,
+                           true);
+  }
+  
+  public void testFourGramFilterWithoutUnigrams() throws IOException {
+    this.shingleFilterTest(4, TEST_TOKEN, FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS,
+        FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS,
+        FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS, false);
+  }
+  
   
   public void testReset() throws Exception {
     Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence"));