You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2013/05/28 09:21:48 UTC

svn commit: r1486788 - in /lucene/dev/trunk/lucene: CHANGES.txt analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java

Author: jpountz
Date: Tue May 28 07:21:48 2013
New Revision: 1486788

URL: http://svn.apache.org/r1486788
Log:
LUCENE-5018: Don't update offsets in CompoundWordTokenFilterBase.

Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1486788&r1=1486787&r2=1486788&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Tue May 28 07:21:48 2013
@@ -82,6 +82,10 @@ Changes in backwards compatibility polic
   QueryParser.setAutoGeneratePhraseQueries(false) (for simple cases) or to
   override QueryParser.newFieldQuery. (Adrien Grand, Steve Rowe)
 
+* LUCENE-5018: CompoundWordTokenFilterBase and its children
+  DictionaryCompoundWordTokenFilter and HyphenationCompoundWordTokenFilter don't
+  update offsets anymore. (Adrien Grand)
+
 Bug Fixes
 
 * LUCENE-4997: Internal test framework's tests are sensitive to previous 

Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java?rev=1486788&r1=1486787&r2=1486788&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java Tue May 28 07:21:48 2013
@@ -19,7 +19,6 @@ package org.apache.lucene.analysis.compo
 
 import java.io.IOException;
 import java.util.LinkedList;
-import java.util.Set;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
@@ -41,6 +40,7 @@ import org.apache.lucene.util.Version;
  * <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
  * supplementary characters in strings and char arrays provided as compound word
  * dictionaries.
+ * <li>As of 4.4, {@link CompoundWordTokenFilterBase} doesn't update offsets.
  * </ul>
  */
 public abstract class CompoundWordTokenFilterBase extends TokenFilter {
@@ -58,7 +58,8 @@ public abstract class CompoundWordTokenF
    * The default for maximal length of subwords that get propagated to the output of this filter
    */
   public static final int DEFAULT_MAX_SUBWORD_SIZE = 15;
-  
+
+  protected final Version matchVersion;
   protected final CharArraySet dictionary;
   protected final LinkedList<CompoundToken> tokens;
   protected final int minWordSize;
@@ -82,7 +83,7 @@ public abstract class CompoundWordTokenF
 
   protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
     super(input);
-    
+    this.matchVersion = matchVersion;
     this.tokens=new LinkedList<CompoundToken>();
     if (minWordSize < 0) {
       throw new IllegalArgumentException("minWordSize cannot be negative");
@@ -156,7 +157,8 @@ public abstract class CompoundWordTokenF
       int startOff = CompoundWordTokenFilterBase.this.offsetAtt.startOffset();
       int endOff = CompoundWordTokenFilterBase.this.offsetAtt.endOffset();
       
-      if (endOff - startOff != CompoundWordTokenFilterBase.this.termAtt.length()) {
+      if (matchVersion.onOrAfter(Version.LUCENE_44) ||
+          endOff - startOff != CompoundWordTokenFilterBase.this.termAtt.length()) {
         // if length by start + end offsets doesn't match the term text then assume
         // this is a synonym and don't adjust the offsets.
         this.startOffset = startOff;

Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java?rev=1486788&r1=1486787&r2=1486788&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java Tue May 28 07:21:48 2013
@@ -151,12 +151,12 @@ public class TestCompoundWordTokenFilter
         "fiol", "fodral", "Basfiolsfodralmakaregesäll", "Bas", "fiol",
         "fodral", "makare", "gesäll", "Skomakare", "Sko", "makare",
         "Vindrutetorkare", "Vind", "rute", "torkare", "Vindrutetorkarblad",
-        "Vind", "rute", "blad", "abba" }, new int[] { 0, 0, 3, 8, 8, 11, 17,
-        17, 20, 24, 24, 28, 33, 33, 39, 44, 44, 49, 54, 54, 58, 62, 69, 69, 72,
-        77, 84, 84, 87, 92, 98, 104, 111, 111, 114, 121, 121, 125, 129, 137,
-        137, 141, 151, 156 }, new int[] { 7, 3, 7, 16, 11, 16, 23, 20, 23, 32,
-        28, 32, 43, 39, 43, 53, 49, 53, 68, 58, 62, 68, 83, 72, 76, 83, 110,
-        87, 91, 98, 104, 110, 120, 114, 120, 136, 125, 129, 136, 155, 141, 145,
+        "Vind", "rute", "blad", "abba" }, new int[] { 0, 0, 0, 8, 8, 8, 17,
+        17, 17, 24, 24, 24, 33, 33, 33, 44, 44, 44, 54, 54, 54, 54, 69, 69, 69,
+        69, 84, 84, 84, 84, 84, 84, 111, 111, 111, 121, 121, 121, 121, 137,
+        137, 137, 137, 156 }, new int[] { 7, 7, 7, 16, 16, 16, 23, 23, 23, 32,
+        32, 32, 43, 43, 43, 53, 53, 53, 68, 68, 68, 68, 83, 83, 83, 83, 110,
+        110, 110, 110, 110, 110, 120, 120, 120, 136, 136, 136, 136, 155, 155, 155,
         155, 160 }, new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
         0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
         0, 0, 0, 1 });
@@ -174,8 +174,8 @@ public class TestCompoundWordTokenFilter
         CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, true);
 
     assertTokenStreamContents(tf, new String[] { "Basfiolsfodralmakaregesäll", "Bas",
-        "fiolsfodral", "fodral", "makare", "gesäll" }, new int[] { 0, 0, 3, 8,
-        14, 20 }, new int[] { 26, 3, 14, 14, 20, 26 }, new int[] { 1, 0, 0, 0,
+        "fiolsfodral", "fodral", "makare", "gesäll" }, new int[] { 0, 0, 0, 0,
+        0, 0 }, new int[] { 26, 26, 26, 26, 26, 26 }, new int[] { 1, 0, 0, 0,
         0, 0 });
   }
 
@@ -194,8 +194,8 @@ public class TestCompoundWordTokenFilter
 
     assertTokenStreamContents(tf,
       new String[] { "abcdef", "ab", "cd", "ef" },
-      new int[] { 0, 0, 2, 4},
-      new int[] { 6, 2, 4, 6},
+      new int[] { 0, 0, 0, 0},
+      new int[] { 6, 6, 6, 6},
       new int[] { 1, 0, 0, 0}
       );
   }
@@ -216,8 +216,8 @@ public class TestCompoundWordTokenFilter
   // since "d" is shorter than the minimum subword size, it should not be added to the token stream
     assertTokenStreamContents(tf,
       new String[] { "abcdefg", "abc", "efg" },
-      new int[] { 0, 0, 4},
-      new int[] { 7, 3, 7},
+      new int[] { 0, 0, 0},
+      new int[] { 7, 7, 7},
       new int[] { 1, 0, 0}
       );
   }