You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2013/05/18 11:21:46 UTC

svn commit: r1484075 - in /lucene/dev/trunk/lucene/analysis/common/src: java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java

Author: jpountz
Date: Sat May 18 09:21:46 2013
New Revision: 1484075

URL: http://svn.apache.org/r1484075
Log:
Fix EdgeNGramTokenFilter to correctly handle graph inputs.

Modified:
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java

Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java?rev=1484075&r1=1484074&r2=1484075&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java Sat May 18 09:21:46 2013
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.TokenS
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 import org.apache.lucene.util.Version;
 
 /**
@@ -43,11 +44,13 @@ public final class EdgeNGramTokenFilter 
   private int tokStart;
   private int tokEnd; // only used if the length changed before this filter
   private int savePosIncr;
+  private int savePosLen;
   private boolean isFirstToken = true;
   
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
 
   /**
    * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
@@ -88,7 +91,8 @@ public final class EdgeNGramTokenFilter 
           curGramSize = minGram;
           tokStart = offsetAtt.startOffset();
           tokEnd = offsetAtt.endOffset();
-          savePosIncr = posIncrAtt.getPositionIncrement();
+          savePosIncr += posIncrAtt.getPositionIncrement();
+          savePosLen = posLenAtt.getPositionLength();
         }
       }
       if (curGramSize <= maxGram) {         // if we have hit the end of our n-gram size range, quit
@@ -98,13 +102,12 @@ public final class EdgeNGramTokenFilter 
           offsetAtt.setOffset(tokStart, tokEnd);
           // first ngram gets increment, others don't
           if (curGramSize == minGram) {
-            //  Leave the first token position increment at the cleared-attribute value of 1
-            if ( ! isFirstToken) {
-              posIncrAtt.setPositionIncrement(savePosIncr);
-            }
+            posIncrAtt.setPositionIncrement(savePosIncr);
+            savePosIncr = 0;
           } else {
             posIncrAtt.setPositionIncrement(0);
           }
+          posLenAtt.setPositionLength(savePosLen);
           termAtt.copyBuffer(curTermBuffer, 0, curGramSize);
           curGramSize++;
           isFirstToken = false;
@@ -120,5 +123,6 @@ public final class EdgeNGramTokenFilter 
     super.reset();
     curTermBuffer = null;
     isFirstToken = true;
+    savePosIncr = 0;
   }
 }

Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java?rev=1484075&r1=1484074&r2=1484075&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java Sat May 18 09:21:46 2013
@@ -29,8 +29,11 @@ import org.apache.lucene.analysis.TokenF
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.core.LetterTokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.util.Version;
 
 /**
  * Tests {@link EdgeNGramTokenFilter} for correctness.
@@ -187,4 +190,19 @@ public class EdgeNGramTokenFilterTest ex
     };
     checkAnalysisConsistency(random, a, random.nextBoolean(), "");
   }
+
+  public void testGraphs() throws IOException {
+    TokenStream tk = new LetterTokenizer(Version.LUCENE_44, new StringReader("abc d efgh ij klmno p q"));
+    tk = new ShingleFilter(tk);
+    tk = new EdgeNGramTokenFilter(Version.LUCENE_44, tk, 7, 10);
+    tk.reset();
+    assertTokenStreamContents(tk,
+        new String[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" },
+        new int[]    { 6,11,11,14 },
+        new int[]    { 13,19,19,21 },
+        new int[]    { 3,1,0,1 },
+        new int[]    { 2,2,2,2 },
+        23
+    );
+  }
 }