You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2014/03/18 19:12:17 UTC
svn commit: r1578993 - in /lucene/dev/trunk: lucene/
lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/
lucene/analysis/common/src/test/org/apache/lucene/analysis/core/
lucene/analysis/common/src/test/org/apache/lucene/analysis/m...
Author: rmuir
Date: Tue Mar 18 18:12:16 2014
New Revision: 1578993
URL: http://svn.apache.org/r1578993
Log:
LUCENE-5111: Fix WordDelimiterFilter offsets
Added:
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/Lucene47WordDelimiterFilter.java (contents, props changed)
- copied, changed from r1578876, lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLucene47WordDelimiterFilter.java (contents, props changed)
- copied, changed from r1578886, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestWordDelimiterFilterFactory.java
lucene/dev/trunk/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1578993&r1=1578992&r2=1578993&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Tue Mar 18 18:12:16 2014
@@ -215,6 +215,8 @@ Bug fixes
indexed shapes within 1/2 maxDistErr from the edge of the query shape. This meant
searching for a point by the same point as a query rarely worked. (David Smiley)
+* LUCENE-5111: Fix WordDelimiterFilter to return offsets in correct order. (Robert Muir)
+
Test Framework
* LUCENE-5449: Rename _TestUtil and _TestHelper to remove the leading _.
Copied: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/Lucene47WordDelimiterFilter.java (from r1578876, lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java)
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/Lucene47WordDelimiterFilter.java?p2=lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/Lucene47WordDelimiterFilter.java&p1=lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java&r1=1578876&r2=1578993&rev=1578993&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/Lucene47WordDelimiterFilter.java Tue Mar 18 18:12:16 2014
@@ -32,55 +32,10 @@ import org.apache.lucene.util.RamUsageEs
import java.io.IOException;
/**
- * Splits words into subwords and performs optional transformations on subword
- * groups. Words are split into subwords with the following rules:
- * <ul>
- * <li>split on intra-word delimiters (by default, all non alpha-numeric
- * characters): <code>"Wi-Fi"</code> → <code>"Wi", "Fi"</code></li>
- * <li>split on case transitions: <code>"PowerShot"</code> →
- * <code>"Power", "Shot"</code></li>
- * <li>split on letter-number transitions: <code>"SD500"</code> →
- * <code>"SD", "500"</code></li>
- * <li>leading and trailing intra-word delimiters on each subword are ignored:
- * <code>"//hello---there, 'dude'"</code> →
- * <code>"hello", "there", "dude"</code></li>
- * <li>trailing "'s" are removed for each subword: <code>"O'Neil's"</code>
- * → <code>"O", "Neil"</code>
- * <ul>
- * <li>Note: this step isn't performed in a separate filter because of possible
- * subword combinations.</li>
- * </ul>
- * </li>
- * </ul>
- *
- * The <b>combinations</b> parameter affects how subwords are combined:
- * <ul>
- * <li>combinations="0" causes no subword combinations: <code>"PowerShot"</code>
- * → <code>0:"Power", 1:"Shot"</code> (0 and 1 are the token positions)</li>
- * <li>combinations="1" means that in addition to the subwords, maximum runs of
- * non-numeric subwords are catenated and produced at the same position of the
- * last subword in the run:
- * <ul>
- * <li><code>"PowerShot"</code> →
- * <code>0:"Power", 1:"Shot" 1:"PowerShot"</code></li>
- * <li><code>"A's+B's&C's"</code> → <code>0:"A", 1:"B", 2:"C", 2:"ABC"</code>
- * </li>
- * <li><code>"Super-Duper-XL500-42-AutoCoder!"</code> →
- * <code>0:"Super", 1:"Duper", 2:"XL", 2:"SuperDuperXL", 3:"500" 4:"42", 5:"Auto", 6:"Coder", 6:"AutoCoder"</code>
- * </li>
- * </ul>
- * </li>
- * </ul>
- * One use for {@link WordDelimiterFilter} is to help match words with different
- * subword delimiters. For example, if the source text contained "wi-fi" one may
- * want "wifi" "WiFi" "wi-fi" "wi+fi" queries to all match. One way of doing so
- * is to specify combinations="1" in the analyzer used for indexing, and
- * combinations="0" (the default) in the analyzer used for querying. Given that
- * the current {@link StandardTokenizer} immediately removes many intra-word
- * delimiters, it is recommended that this filter be used after a tokenizer that
- * does not do this (such as {@link WhitespaceTokenizer}).
+ * Old Broken version of {@link WordDelimiterFilter}
*/
-public final class WordDelimiterFilter extends TokenFilter {
+@Deprecated
+public final class Lucene47WordDelimiterFilter extends TokenFilter {
public static final int LOWER = 0x01;
public static final int UPPER = 0x02;
@@ -202,7 +157,7 @@ public final class WordDelimiterFilter e
* @param configurationFlags Flags configuring the filter
* @param protWords If not null is the set of tokens to protect from being delimited
*/
- public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int configurationFlags, CharArraySet protWords) {
+ public Lucene47WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int configurationFlags, CharArraySet protWords) {
super(in);
this.flags = configurationFlags;
this.protWords = protWords;
@@ -218,7 +173,7 @@ public final class WordDelimiterFilter e
* @param configurationFlags Flags configuring the filter
* @param protWords If not null is the set of tokens to protect from being delimited
*/
- public WordDelimiterFilter(TokenStream in, int configurationFlags, CharArraySet protWords) {
+ public Lucene47WordDelimiterFilter(TokenStream in, int configurationFlags, CharArraySet protWords) {
this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, configurationFlags, protWords);
}
Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java?rev=1578993&r1=1578992&r2=1578993&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java Tue Mar 18 18:12:16 2014
@@ -27,9 +27,13 @@ import org.apache.lucene.analysis.tokena
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.Version;
import java.io.IOException;
+import java.util.Arrays;
/**
* Splits words into subwords and performs optional transformations on subword
@@ -202,8 +206,11 @@ public final class WordDelimiterFilter e
* @param configurationFlags Flags configuring the filter
* @param protWords If not null is the set of tokens to protect from being delimited
*/
- public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int configurationFlags, CharArraySet protWords) {
+ public WordDelimiterFilter(Version matchVersion, TokenStream in, byte[] charTypeTable, int configurationFlags, CharArraySet protWords) {
super(in);
+ if (!matchVersion.onOrAfter(Version.LUCENE_48)) {
+ throw new IllegalArgumentException("This class only works with Lucene 4.8+. To emulate the old (broken) behavior of WordDelimiterFilter, use Lucene47WordDelimiterFilter");
+ }
this.flags = configurationFlags;
this.protWords = protWords;
this.iterator = new WordDelimiterIterator(
@@ -218,8 +225,8 @@ public final class WordDelimiterFilter e
* @param configurationFlags Flags configuring the filter
* @param protWords If not null is the set of tokens to protect from being delimited
*/
- public WordDelimiterFilter(TokenStream in, int configurationFlags, CharArraySet protWords) {
- this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, configurationFlags, protWords);
+ public WordDelimiterFilter(Version matchVersion, TokenStream in, int configurationFlags, CharArraySet protWords) {
+ this(matchVersion, in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, configurationFlags, protWords);
}
@Override
@@ -244,6 +251,7 @@ public final class WordDelimiterFilter e
(protWords != null && protWords.contains(termBuffer, 0, termLength))) {
posIncAttribute.setPositionIncrement(accumPosInc);
accumPosInc = 0;
+ first = false;
return true;
}
@@ -265,6 +273,7 @@ public final class WordDelimiterFilter e
if (has(PRESERVE_ORIGINAL)) {
posIncAttribute.setPositionIncrement(accumPosInc);
accumPosInc = 0;
+ first = false;
return true;
}
}
@@ -273,7 +282,8 @@ public final class WordDelimiterFilter e
if (iterator.end == WordDelimiterIterator.DONE) {
if (!concat.isEmpty()) {
if (flushConcatenation(concat)) {
- return true;
+ buffer();
+ continue;
}
}
@@ -281,12 +291,28 @@ public final class WordDelimiterFilter e
// only if we haven't output this same combo above!
if (concatAll.subwordCount > lastConcatCount) {
concatAll.writeAndClear();
- return true;
+ buffer();
+ continue;
}
concatAll.clear();
}
+ if (bufferedPos < bufferedLen) {
+ if (bufferedPos == 0) {
+ sorter.sort(0, bufferedLen);
+ }
+ clearAttributes();
+ restoreState(buffered[bufferedPos++]);
+ if (first && posIncAttribute.getPositionIncrement() == 0) {
+ // can easily happen with strange combinations (e.g. not outputting numbers, but concat-all)
+ posIncAttribute.setPositionIncrement(1);
+ }
+ first = false;
+ return true;
+ }
+
// no saved concatenations, on to the next input word
+ bufferedPos = bufferedLen = 0;
hasSavedState = false;
continue;
}
@@ -295,6 +321,7 @@ public final class WordDelimiterFilter e
if (iterator.isSingleWord()) {
generatePart(true);
iterator.next();
+ first = false;
return true;
}
@@ -304,7 +331,8 @@ public final class WordDelimiterFilter e
if (!concat.isEmpty() && (concat.type & wordType) == 0) {
if (flushConcatenation(concat)) {
hasOutputToken = false;
- return true;
+ buffer();
+ continue;
}
hasOutputToken = false;
}
@@ -325,28 +353,74 @@ public final class WordDelimiterFilter e
// if we should output the word or number part
if (shouldGenerateParts(wordType)) {
generatePart(false);
- iterator.next();
- return true;
+ buffer();
}
iterator.next();
}
}
- /**
- * {@inheritDoc}
- */
@Override
public void reset() throws IOException {
super.reset();
hasSavedState = false;
concat.clear();
concatAll.clear();
- accumPosInc = 0;
+ accumPosInc = bufferedPos = bufferedLen = 0;
+ first = true;
}
// ================================================= Helper Methods ================================================
+
+ private AttributeSource.State buffered[] = new AttributeSource.State[8];
+ private int startOff[] = new int[8];
+ private int posInc[] = new int[8];
+ private int bufferedLen = 0;
+ private int bufferedPos = 0;
+ private boolean first;
+
+ private class OffsetSorter extends InPlaceMergeSorter {
+ @Override
+ protected int compare(int i, int j) {
+ int cmp = Integer.compare(startOff[i], startOff[j]);
+ if (cmp == 0) {
+ cmp = Integer.compare(posInc[j], posInc[i]);
+ }
+ return cmp;
+ }
+
+ @Override
+ protected void swap(int i, int j) {
+ AttributeSource.State tmp = buffered[i];
+ buffered[i] = buffered[j];
+ buffered[j] = tmp;
+
+ int tmp2 = startOff[i];
+ startOff[i] = startOff[j];
+ startOff[j] = tmp2;
+
+ tmp2 = posInc[i];
+ posInc[i] = posInc[j];
+ posInc[j] = tmp2;
+ }
+ }
+
+ final OffsetSorter sorter = new OffsetSorter();
+
+ private void buffer() {
+ if (bufferedLen == buffered.length) {
+ int newSize = ArrayUtil.oversize(bufferedLen+1, 8);
+ buffered = Arrays.copyOf(buffered, newSize);
+ startOff = Arrays.copyOf(startOff, newSize);
+ posInc = Arrays.copyOf(posInc, newSize);
+ }
+ startOff[bufferedLen] = offsetAttribute.startOffset();
+ posInc[bufferedLen] = posIncAttribute.getPositionIncrement();
+ buffered[bufferedLen] = captureState();
+ bufferedLen++;
+ }
+
/**
* Saves the existing attribute states
*/
Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java?rev=1578993&r1=1578992&r2=1578993&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java Tue Mar 18 18:12:16 2014
@@ -17,11 +17,13 @@ package org.apache.lucene.analysis.misce
* limitations under the License.
*/
+import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.apache.lucene.util.Version;
import java.util.ArrayList;
import java.util.List;
@@ -61,6 +63,7 @@ public class WordDelimiterFilterFactory
/** Creates a new WordDelimiterFilterFactory */
public WordDelimiterFilterFactory(Map<String, String> args) {
super(args);
+ assureMatchVersion();
int flags = 0;
if (getInt(args, "generateWordParts", 1) != 0) {
flags |= GENERATE_WORD_PARTS;
@@ -114,9 +117,14 @@ public class WordDelimiterFilterFactory
}
@Override
- public WordDelimiterFilter create(TokenStream input) {
- return new WordDelimiterFilter(input, typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable,
+ public TokenFilter create(TokenStream input) {
+ if (luceneMatchVersion.onOrAfter(Version.LUCENE_48)) {
+ return new WordDelimiterFilter(luceneMatchVersion, input, typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable,
flags, protectedWords);
+ } else {
+ return new Lucene47WordDelimiterFilter(input, typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable,
+ flags, protectedWords);
+ }
}
// source => type
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java?rev=1578993&r1=1578992&r2=1578993&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java Tue Mar 18 18:12:16 2014
@@ -144,9 +144,7 @@ public class TestRandomChains extends Ba
CachingTokenFilter.class,
// Not broken: we forcefully add this, so we shouldn't
// also randomly pick it:
- ValidatingTokenFilter.class,
- // broken!
- WordDelimiterFilter.class)) {
+ ValidatingTokenFilter.class)) {
for (Constructor<?> ctor : c.getConstructors()) {
brokenConstructors.put(ctor, ALWAYS);
}
@@ -175,7 +173,9 @@ public class TestRandomChains extends Ba
// TODO: LUCENE-4983
CommonGramsFilter.class,
// TODO: doesn't handle graph inputs
- CommonGramsQueryFilter.class)) {
+ CommonGramsQueryFilter.class,
+ // TODO: probably doesnt handle graph inputs, too afraid to try
+ WordDelimiterFilter.class)) {
for (Constructor<?> ctor : c.getConstructors()) {
brokenOffsetsConstructors.put(ctor, ALWAYS);
}
@@ -893,7 +893,26 @@ public class TestRandomChains extends Ba
}
public void testRandomChains() throws Throwable {
- int numIterations = atLeast(10);
+ int numIterations = atLeast(20);
+ Random random = random();
+ for (int i = 0; i < numIterations; i++) {
+ MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong());
+ if (VERBOSE) {
+ System.out.println("Creating random analyzer:" + a);
+ }
+ try {
+ checkRandomData(random, a, 500*RANDOM_MULTIPLIER, 20, false,
+ false /* We already validate our own offsets... */);
+ } catch (Throwable e) {
+ System.err.println("Exception from random analyzer: " + a);
+ throw e;
+ }
+ }
+ }
+
+ // we might regret this decision...
+ public void testRandomChainsWithLargeStrings() throws Throwable {
+ int numIterations = atLeast(20);
Random random = random();
for (int i = 0; i < numIterations; i++) {
MockRandomAnalyzer a = new MockRandomAnalyzer(random.nextLong());
@@ -901,7 +920,7 @@ public class TestRandomChains extends Ba
System.out.println("Creating random analyzer:" + a);
}
try {
- checkRandomData(random, a, 200, 20, false,
+ checkRandomData(random, a, 50*RANDOM_MULTIPLIER, 256, false,
false /* We already validate our own offsets... */);
} catch (Throwable e) {
System.err.println("Exception from random analyzer: " + a);
Copied: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLucene47WordDelimiterFilter.java (from r1578886, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java)
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLucene47WordDelimiterFilter.java?p2=lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLucene47WordDelimiterFilter.java&p1=lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java&r1=1578886&r2=1578993&rev=1578993&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLucene47WordDelimiterFilter.java Tue Mar 18 18:12:16 2014
@@ -37,7 +37,8 @@ import static org.apache.lucene.analysis
* TODO: should explicitly test things like protWords and not rely on
* the factory tests in Solr.
*/
-public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
+@Deprecated
+public class TestLucene47WordDelimiterFilter extends BaseTokenStreamTestCase {
/***
public void testPerformance() throws IOException {
@@ -62,7 +63,7 @@ public class TestWordDelimiterFilter ext
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
// test that subwords and catenated subwords have
// the correct offsets.
- WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("foo-bar", 5, 12)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ TokenFilter wdf = new Lucene47WordDelimiterFilter(new SingleTokenTokenStream(new Token("foo-bar", 5, 12)), DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "foo", "bar", "foobar" },
@@ -70,7 +71,7 @@ public class TestWordDelimiterFilter ext
new int[] { 8, 12, 12 },
null, null, null, null, false);
- wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("foo-bar", 5, 6)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ wdf = new Lucene47WordDelimiterFilter(new SingleTokenTokenStream(new Token("foo-bar", 5, 6)), DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "foo", "bar", "foobar" },
@@ -82,7 +83,7 @@ public class TestWordDelimiterFilter ext
@Test
public void testOffsetChange() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("übelkeit)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ TokenFilter wdf = new Lucene47WordDelimiterFilter(new SingleTokenTokenStream(new Token("übelkeit)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "übelkeit" },
@@ -93,7 +94,7 @@ public class TestWordDelimiterFilter ext
@Test
public void testOffsetChange2() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("(übelkeit", 7, 17)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ TokenFilter wdf = new Lucene47WordDelimiterFilter(new SingleTokenTokenStream(new Token("(übelkeit", 7, 17)), DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "übelkeit" },
@@ -104,7 +105,7 @@ public class TestWordDelimiterFilter ext
@Test
public void testOffsetChange3() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("(übelkeit", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ TokenFilter wdf = new Lucene47WordDelimiterFilter(new SingleTokenTokenStream(new Token("(übelkeit", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "übelkeit" },
@@ -115,7 +116,7 @@ public class TestWordDelimiterFilter ext
@Test
public void testOffsetChange4() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("(foo,bar)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ TokenFilter wdf = new Lucene47WordDelimiterFilter(new SingleTokenTokenStream(new Token("(foo,bar)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "foo", "bar", "foobar"},
@@ -126,7 +127,7 @@ public class TestWordDelimiterFilter ext
public void doSplit(final String input, String... output) throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterFilter wdf = new WordDelimiterFilter(keywordMockTokenizer(input),
+ TokenFilter wdf = new Lucene47WordDelimiterFilter(keywordMockTokenizer(input),
WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf, output);
@@ -170,7 +171,7 @@ public class TestWordDelimiterFilter ext
public void doSplitPossessive(int stemPossessive, final String input, final String... output) throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS;
flags |= (stemPossessive == 1) ? STEM_ENGLISH_POSSESSIVE : 0;
- WordDelimiterFilter wdf = new WordDelimiterFilter(keywordMockTokenizer(input), flags, null);
+ TokenFilter wdf = new Lucene47WordDelimiterFilter(keywordMockTokenizer(input), flags, null);
assertTokenStreamContents(wdf, output);
}
@@ -217,7 +218,7 @@ public class TestWordDelimiterFilter ext
@Override
public TokenStreamComponents createComponents(String field) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
- return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(
+ return new TokenStreamComponents(tokenizer, new Lucene47WordDelimiterFilter(
tokenizer,
flags, protWords));
}
@@ -254,7 +255,7 @@ public class TestWordDelimiterFilter ext
@Override
public TokenStreamComponents createComponents(String field) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
- return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(
+ return new TokenStreamComponents(tokenizer, new Lucene47WordDelimiterFilter(
new LargePosIncTokenFilter(tokenizer),
flags, protWords));
}
@@ -301,7 +302,7 @@ public class TestWordDelimiterFilter ext
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
StopFilter filter = new StopFilter(TEST_VERSION_CURRENT,
tokenizer, StandardAnalyzer.STOP_WORDS_SET);
- return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(filter, flags, protWords));
+ return new TokenStreamComponents(tokenizer, new Lucene47WordDelimiterFilter(filter, flags, protWords));
}
};
@@ -342,7 +343,7 @@ public class TestWordDelimiterFilter ext
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
- return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
+ return new TokenStreamComponents(tokenizer, new Lucene47WordDelimiterFilter(tokenizer, flags, protectedWords));
}
};
checkRandomData(random(), a, 200, 20, false, false);
@@ -364,7 +365,7 @@ public class TestWordDelimiterFilter ext
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new KeywordTokenizer();
- return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
+ return new TokenStreamComponents(tokenizer, new Lucene47WordDelimiterFilter(tokenizer, flags, protectedWords));
}
};
// depending upon options, this thing may or may not preserve the empty term
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java?rev=1578993&r1=1578992&r2=1578993&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java Tue Mar 18 18:12:16 2014
@@ -62,27 +62,25 @@ public class TestWordDelimiterFilter ext
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
// test that subwords and catenated subwords have
// the correct offsets.
- WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("foo-bar", 5, 12)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ WordDelimiterFilter wdf = new WordDelimiterFilter(TEST_VERSION_CURRENT, new SingleTokenTokenStream(new Token("foo-bar", 5, 12)), DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
- new String[] { "foo", "bar", "foobar" },
- new int[] { 5, 9, 5 },
- new int[] { 8, 12, 12 },
- null, null, null, null, false);
+ new String[] { "foo", "foobar", "bar" },
+ new int[] { 5, 5, 9 },
+ new int[] { 8, 12, 12 });
- wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("foo-bar", 5, 6)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ wdf = new WordDelimiterFilter(TEST_VERSION_CURRENT, new SingleTokenTokenStream(new Token("foo-bar", 5, 6)), DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "foo", "bar", "foobar" },
new int[] { 5, 5, 5 },
- new int[] { 6, 6, 6 },
- null, null, null, null, false);
+ new int[] { 6, 6, 6 });
}
@Test
public void testOffsetChange() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("übelkeit)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ WordDelimiterFilter wdf = new WordDelimiterFilter(TEST_VERSION_CURRENT, new SingleTokenTokenStream(new Token("übelkeit)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "übelkeit" },
@@ -93,7 +91,7 @@ public class TestWordDelimiterFilter ext
@Test
public void testOffsetChange2() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("(übelkeit", 7, 17)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ WordDelimiterFilter wdf = new WordDelimiterFilter(TEST_VERSION_CURRENT, new SingleTokenTokenStream(new Token("(übelkeit", 7, 17)), DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "übelkeit" },
@@ -104,7 +102,7 @@ public class TestWordDelimiterFilter ext
@Test
public void testOffsetChange3() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("(übelkeit", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ WordDelimiterFilter wdf = new WordDelimiterFilter(TEST_VERSION_CURRENT, new SingleTokenTokenStream(new Token("(übelkeit", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "übelkeit" },
@@ -115,18 +113,17 @@ public class TestWordDelimiterFilter ext
@Test
public void testOffsetChange4() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterFilter wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("(foo,bar)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ WordDelimiterFilter wdf = new WordDelimiterFilter(TEST_VERSION_CURRENT, new SingleTokenTokenStream(new Token("(foo,bar)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
- new String[] { "foo", "bar", "foobar"},
- new int[] { 8, 12, 8 },
- new int[] { 11, 15, 15 },
- null, null, null, null, false);
+ new String[] { "foo", "foobar", "bar"},
+ new int[] { 8, 8, 12 },
+ new int[] { 11, 15, 15 });
}
public void doSplit(final String input, String... output) throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterFilter wdf = new WordDelimiterFilter(keywordMockTokenizer(input),
+ WordDelimiterFilter wdf = new WordDelimiterFilter(TEST_VERSION_CURRENT, keywordMockTokenizer(input),
WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf, output);
@@ -170,7 +167,7 @@ public class TestWordDelimiterFilter ext
public void doSplitPossessive(int stemPossessive, final String input, final String... output) throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS;
flags |= (stemPossessive == 1) ? STEM_ENGLISH_POSSESSIVE : 0;
- WordDelimiterFilter wdf = new WordDelimiterFilter(keywordMockTokenizer(input), flags, null);
+ WordDelimiterFilter wdf = new WordDelimiterFilter(TEST_VERSION_CURRENT, keywordMockTokenizer(input), flags, null);
assertTokenStreamContents(wdf, output);
}
@@ -217,7 +214,7 @@ public class TestWordDelimiterFilter ext
@Override
public TokenStreamComponents createComponents(String field) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
- return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(
+ return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(TEST_VERSION_CURRENT,
tokenizer,
flags, protWords));
}
@@ -227,34 +224,25 @@ public class TestWordDelimiterFilter ext
assertAnalyzesTo(a, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
new int[] { 0, 9 },
new int[] { 6, 13 },
- null,
- new int[] { 1, 1 },
- null,
- false);
+ new int[] { 1, 1 });
/* only in this case, posInc of 2 ?! */
- assertAnalyzesTo(a, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
- new int[] { 0, 9, 12, 9 },
+ assertAnalyzesTo(a, "LUCENE / solR", new String[] { "LUCENE", "sol", "solR", "R" },
+ new int[] { 0, 9, 9, 12 },
new int[] { 6, 12, 13, 13 },
- null,
- new int[] { 1, 1, 1, 0 },
- null,
- false);
+ new int[] { 1, 1, 0, 1 });
assertAnalyzesTo(a, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
new int[] { 0, 9, 15 },
new int[] { 6, 14, 19 },
- null,
- new int[] { 1, 1, 1 },
- null,
- false);
+ new int[] { 1, 1, 1 });
/* analyzer that will consume tokens with large position increments */
Analyzer a2 = new Analyzer() {
@Override
public TokenStreamComponents createComponents(String field) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
- return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(
+ return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(TEST_VERSION_CURRENT,
new LargePosIncTokenFilter(tokenizer),
flags, protWords));
}
@@ -264,36 +252,24 @@ public class TestWordDelimiterFilter ext
assertAnalyzesTo(a2, "LUCENE largegap SOLR", new String[] { "LUCENE", "largegap", "SOLR" },
new int[] { 0, 7, 16 },
new int[] { 6, 15, 20 },
- null,
- new int[] { 1, 10, 1 },
- null,
- false);
+ new int[] { 1, 10, 1 });
/* the "/" had a position increment of 10, where did it go?!?!! */
assertAnalyzesTo(a2, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
new int[] { 0, 9 },
new int[] { 6, 13 },
- null,
- new int[] { 1, 11 },
- null,
- false);
+ new int[] { 1, 11 });
/* in this case, the increment of 10 from the "/" is carried over */
- assertAnalyzesTo(a2, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
- new int[] { 0, 9, 12, 9 },
+ assertAnalyzesTo(a2, "LUCENE / solR", new String[] { "LUCENE", "sol", "solR", "R" },
+ new int[] { 0, 9, 9, 12 },
new int[] { 6, 12, 13, 13 },
- null,
- new int[] { 1, 11, 1, 0 },
- null,
- false);
+ new int[] { 1, 11, 0, 1 });
assertAnalyzesTo(a2, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
new int[] { 0, 9, 15 },
new int[] { 6, 14, 19 },
- null,
- new int[] { 1, 11, 1 },
- null,
- false);
+ new int[] { 1, 11, 1 });
Analyzer a3 = new Analyzer() {
@Override
@@ -301,28 +277,62 @@ public class TestWordDelimiterFilter ext
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
StopFilter filter = new StopFilter(TEST_VERSION_CURRENT,
tokenizer, StandardAnalyzer.STOP_WORDS_SET);
- return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(filter, flags, protWords));
+ return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(TEST_VERSION_CURRENT, filter, flags, protWords));
}
};
assertAnalyzesTo(a3, "lucene.solr",
- new String[] { "lucene", "solr", "lucenesolr" },
- new int[] { 0, 7, 0 },
+ new String[] { "lucene", "lucenesolr", "solr" },
+ new int[] { 0, 0, 7 },
new int[] { 6, 11, 11 },
- null,
- new int[] { 1, 1, 0 },
- null,
- false);
+ new int[] { 1, 0, 1 });
/* the stopword should add a gap here */
assertAnalyzesTo(a3, "the lucene.solr",
- new String[] { "lucene", "solr", "lucenesolr" },
- new int[] { 4, 11, 4 },
+ new String[] { "lucene", "lucenesolr", "solr" },
+ new int[] { 4, 4, 11 },
new int[] { 10, 15, 15 },
- null,
- new int[] { 2, 1, 0 },
- null,
- false);
+ new int[] { 2, 0, 1 });
+ }
+
+ /** concat numbers + words + all */
+ public void testLotsOfConcatenating() throws Exception {
+ final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
+
+ /* analyzer that uses whitespace + wdf */
+ Analyzer a = new Analyzer() {
+ @Override
+ public TokenStreamComponents createComponents(String field) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(TEST_VERSION_CURRENT, tokenizer, flags, null));
+ }
+ };
+
+ assertAnalyzesTo(a, "abc-def-123-456",
+ new String[] { "abc", "abcdef", "abcdef123456", "def", "123", "123456", "456" },
+ new int[] { 0, 0, 0, 4, 8, 8, 12 },
+ new int[] { 3, 7, 15, 7, 11, 15, 15 },
+ new int[] { 1, 0, 0, 1, 1, 0, 1 });
+ }
+
+ /** concat numbers + words + all + preserve original */
+ public void testLotsOfConcatenating2() throws Exception {
+ final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
+
+ /* analyzer that uses whitespace + wdf */
+ Analyzer a = new Analyzer() {
+ @Override
+ public TokenStreamComponents createComponents(String field) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(TEST_VERSION_CURRENT, tokenizer, flags, null));
+ }
+ };
+
+ assertAnalyzesTo(a, "abc-def-123-456",
+ new String[] { "abc-def-123-456", "abc", "abcdef", "abcdef123456", "def", "123", "123456", "456" },
+ new int[] { 0, 0, 0, 0, 4, 8, 8, 12 },
+ new int[] { 15, 3, 7, 15, 7, 11, 15, 15 },
+ new int[] { 1, 0, 0, 0, 1, 1, 0, 1 });
}
/** blast some random strings through the analyzer */
@@ -342,10 +352,34 @@ public class TestWordDelimiterFilter ext
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
- return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
+ return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(TEST_VERSION_CURRENT, tokenizer, flags, protectedWords));
+ }
+ };
+ checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
+ }
+ }
+
+ /** blast some enormous random strings through the analyzer */
+ public void testRandomHugeStrings() throws Exception {
+ int numIterations = atLeast(5);
+ for (int i = 0; i < numIterations; i++) {
+ final int flags = random().nextInt(512);
+ final CharArraySet protectedWords;
+ if (random().nextBoolean()) {
+ protectedWords = new CharArraySet(TEST_VERSION_CURRENT, new HashSet<>(Arrays.asList("a", "b", "cd")), false);
+ } else {
+ protectedWords = null;
+ }
+
+ Analyzer a = new Analyzer() {
+
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(TEST_VERSION_CURRENT, tokenizer, flags, protectedWords));
}
};
- checkRandomData(random(), a, 200, 20, false, false);
+ checkRandomData(random(), a, 100*RANDOM_MULTIPLIER, 8192);
}
}
@@ -364,7 +398,7 @@ public class TestWordDelimiterFilter ext
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new KeywordTokenizer();
- return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
+ return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(TEST_VERSION_CURRENT, tokenizer, flags, protectedWords));
}
};
// depending upon options, this thing may or may not preserve the empty term
Modified: lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestWordDelimiterFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestWordDelimiterFilterFactory.java?rev=1578993&r1=1578992&r2=1578993&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestWordDelimiterFilterFactory.java (original)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestWordDelimiterFilterFactory.java Tue Mar 18 18:12:16 2014
@@ -200,6 +200,7 @@ public class TestWordDelimiterFilterFact
String testText = "I borrowed $5,400.00 at 25% interest-rate";
ResourceLoader loader = new SolrResourceLoader("solr/collection1");
Map<String,String> args = new HashMap<>();
+ args.put("luceneMatchVersion", TEST_VERSION_CURRENT.toString());
args.put("generateWordParts", "1");
args.put("generateNumberParts", "1");
args.put("catenateWords", "1");
@@ -213,16 +214,17 @@ public class TestWordDelimiterFilterFact
TokenStream ts = factoryDefault.create(whitespaceMockTokenizer(testText));
BaseTokenStreamTestCase.assertTokenStreamContents(ts,
- new String[] { "I", "borrowed", "5", "400", "00", "540000", "at", "25", "interest", "rate", "interestrate" });
+ new String[] { "I", "borrowed", "5", "540000", "400", "00", "at", "25", "interest", "interestrate", "rate" });
ts = factoryDefault.create(whitespaceMockTokenizer("foo\u200Dbar"));
BaseTokenStreamTestCase.assertTokenStreamContents(ts,
- new String[] { "foo", "bar", "foobar" });
+ new String[] { "foo", "foobar", "bar" });
/* custom behavior */
args = new HashMap<>();
// use a custom type mapping
+ args.put("luceneMatchVersion", TEST_VERSION_CURRENT.toString());
args.put("generateWordParts", "1");
args.put("generateNumberParts", "1");
args.put("catenateWords", "1");
@@ -235,7 +237,7 @@ public class TestWordDelimiterFilterFact
ts = factoryCustom.create(whitespaceMockTokenizer(testText));
BaseTokenStreamTestCase.assertTokenStreamContents(ts,
- new String[] { "I", "borrowed", "$5,400.00", "at", "25%", "interest", "rate", "interestrate" });
+ new String[] { "I", "borrowed", "$5,400.00", "at", "25%", "interest", "interestrate", "rate" });
/* test custom behavior with a char > 0x7F, because we had to make a larger byte[] */
ts = factoryCustom.create(whitespaceMockTokenizer("foo\u200Dbar"));
Modified: lucene/dev/trunk/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java?rev=1578993&r1=1578992&r2=1578993&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java (original)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java Tue Mar 18 18:12:16 2014
@@ -365,8 +365,8 @@ public class FieldAnalysisRequestHandler
assertEquals(6, tokenList.size());
assertToken(tokenList.get(0), new TokenInfo("hi", null, "word", 0, 2, 1, new int[]{1,1}, null, false));
assertToken(tokenList.get(1), new TokenInfo("3456", null, "word", 4, 8, 2, new int[]{2,2}, null, false));
- assertToken(tokenList.get(2), new TokenInfo("12", null, "word", 9, 11, 3, new int[]{2,3}, null, false));
- assertToken(tokenList.get(3), new TokenInfo("345612", null, "word", 4, 11, 3, new int[]{2,3}, null, false));
+ assertToken(tokenList.get(2), new TokenInfo("345612", null, "word", 4, 11, 2, new int[]{2,2}, null, false));
+ assertToken(tokenList.get(3), new TokenInfo("12", null, "word", 9, 11, 3, new int[]{2,3}, null, false));
assertToken(tokenList.get(4), new TokenInfo("a", null, "word", 12, 13, 4, new int[]{3,4}, null, false));
assertToken(tokenList.get(5), new TokenInfo("Test", null, "word", 14, 18, 5, new int[]{4,5}, null, false));
tokenList = indexPart.get("org.apache.lucene.analysis.core.LowerCaseFilter");
@@ -374,8 +374,8 @@ public class FieldAnalysisRequestHandler
assertEquals(6, tokenList.size());
assertToken(tokenList.get(0), new TokenInfo("hi", null, "word", 0, 2, 1, new int[]{1,1,1}, null, false));
assertToken(tokenList.get(1), new TokenInfo("3456", null, "word", 4, 8, 2, new int[]{2,2,2}, null, false));
- assertToken(tokenList.get(2), new TokenInfo("12", null, "word", 9, 11, 3, new int[]{2,3,3}, null, false));
- assertToken(tokenList.get(3), new TokenInfo("345612", null, "word", 4, 11, 3, new int[]{2,3,3}, null, false));
+ assertToken(tokenList.get(2), new TokenInfo("345612", null, "word", 4, 11, 2, new int[]{2,2,2}, null, false));
+ assertToken(tokenList.get(3), new TokenInfo("12", null, "word", 9, 11, 3, new int[]{2,3,3}, null, false));
assertToken(tokenList.get(4), new TokenInfo("a", null, "word", 12, 13, 4, new int[]{3,4,4}, null, false));
assertToken(tokenList.get(5), new TokenInfo("test", null, "word", 14, 18, 5, new int[]{4,5,5}, null, false));
}