You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by us...@apache.org on 2010/02/14 22:33:13 UTC

svn commit: r910078 - in /lucene/java/trunk/contrib: ./ analyzers/common/src/java/org/apache/lucene/analysis/ngram/ analyzers/common/src/test/org/apache/lucene/analysis/ngram/

Author: uschindler
Date: Sun Feb 14 21:33:12 2010
New Revision: 910078

URL: http://svn.apache.org/viewvc?rev=910078&view=rev
Log:
LUCENE-2266: Fixed offset calculations in NGramTokenFilter and EdgeNGramTokenFilter

Modified:
    lucene/java/trunk/contrib/CHANGES.txt
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java

Modified: lucene/java/trunk/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=910078&r1=910077&r2=910078&view=diff
==============================================================================
--- lucene/java/trunk/contrib/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/CHANGES.txt Sun Feb 14 21:33:12 2010
@@ -153,6 +153,9 @@
    CJKTokenizer, ChineseTokenizer, SmartChinese SentenceTokenizer, 
    and WikipediaTokenizer.  (Koji Sekiguchi, Robert Muir)
    
+ * LUCENE-2266: Fixed offset calculations in NGramTokenFilter and 
+   EdgeNGramTokenFilter.  (Joe Calderon, Robert Muir via Uwe Schindler)
+   
 API Changes
 
  * LUCENE-2108: Add SpellChecker.close, to close the underlying

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java?rev=910078&r1=910077&r2=910078&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java Sun Feb 14 21:33:12 2010
@@ -70,6 +70,7 @@
   private char[] curTermBuffer;
   private int curTermLength;
   private int curGramSize;
+  private int tokStart;
   
   private final TermAttribute termAtt;
   private final OffsetAttribute offsetAtt;
@@ -126,6 +127,7 @@
           curTermBuffer = (char[]) termAtt.termBuffer().clone();
           curTermLength = termAtt.termLength();
           curGramSize = minGram;
+          tokStart = offsetAtt.startOffset();
         }
       }
       if (curGramSize <= maxGram) {
@@ -135,7 +137,7 @@
           int start = side == Side.FRONT ? 0 : curTermLength - curGramSize;
           int end = start + curGramSize;
           clearAttributes();
-          offsetAtt.setOffset(start, end);
+          offsetAtt.setOffset(tokStart + start, tokStart + end);
           termAtt.setTermBuffer(curTermBuffer, start, curGramSize);
           curGramSize++;
           return true;

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java?rev=910078&r1=910077&r2=910078&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java Sun Feb 14 21:33:12 2010
@@ -37,6 +37,7 @@
   private int curTermLength;
   private int curGramSize;
   private int curPos;
+  private int tokStart;
   
   private TermAttribute termAtt;
   private OffsetAttribute offsetAtt;
@@ -82,13 +83,14 @@
           curTermLength = termAtt.termLength();
           curGramSize = minGram;
           curPos = 0;
+          tokStart = offsetAtt.startOffset();
         }
       }
       while (curGramSize <= maxGram) {
         while (curPos+curGramSize <= curTermLength) {     // while there is input
           clearAttributes();
           termAtt.setTermBuffer(curTermBuffer, curPos, curGramSize);
-          offsetAtt.setOffset(curPos, curPos+curGramSize);
+          offsetAtt.setOffset(tokStart + curPos, tokStart + curPos + curGramSize);
           curPos++;
           return true;
         }

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java?rev=910078&r1=910077&r2=910078&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java Sun Feb 14 21:33:12 2010
@@ -94,7 +94,7 @@
   public void testSmallTokenInStream() throws Exception {
     input = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("abc de fgh"));
     EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 3, 3);
-    assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,0}, new int[]{3,3});
+    assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
   }
   
   public void testReset() throws Exception {

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java?rev=910078&r1=910077&r2=910078&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java Sun Feb 14 21:33:12 2010
@@ -83,7 +83,7 @@
     public void testSmallTokenInStream() throws Exception {
       input = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("abc de fgh"));
       NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3);
-      assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,0}, new int[]{3,3});
+      assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
     }
     
     public void testReset() throws Exception {