Posted to commits@lucene.apache.org by rm...@apache.org on 2015/08/14 15:12:57 UTC

svn commit: r1695898 - in /lucene/dev/trunk/lucene: ./ analysis/common/src/java/org/apache/lucene/analysis/ar/ analysis/common/src/java/org/apache/lucene/analysis/ckb/ analysis/common/src/java/org/apache/lucene/analysis/core/ analysis/common/src/java/o...

Author: rmuir
Date: Fri Aug 14 13:12:56 2015
New Revision: 1695898

URL: http://svn.apache.org/r1695898
Log:
LUCENE-6737: Add DecimalDigitFilter which folds unicode digits to basic latin

Added:
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java   (with props)
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilterFactory.java   (with props)
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java   (with props)
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilterFactory.java   (with props)
Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
    lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniAnalyzer.java
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniStemFilter.java
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1695898&r1=1695897&r2=1695898&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Fri Aug 14 13:12:56 2015
@@ -57,6 +57,9 @@ New Features
 * LUCENE-6724: Add utility APIs to GeoHashUtils to compute neighbor
   geohash cells (Nick Knize via Mike McCandless).
 
+* LUCENE-6737: Add DecimalDigitFilter which folds unicode digits to basic latin.
+  (Robert Muir)
+
 Optimizations
 
 * LUCENE-6708: TopFieldCollector does not compute the score several times on the

Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java?rev=1695898&r1=1695897&r2=1695898&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java Fri Aug 14 13:12:56 2015
@@ -21,6 +21,7 @@ import java.io.IOException;
 import java.io.Reader;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.core.DecimalDigitFilter;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
 import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
@@ -29,6 +30,7 @@ import org.apache.lucene.analysis.util.C
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.util.Version;
 
 /**
  * {@link Analyzer} for Arabic. 
@@ -124,7 +126,7 @@ public final class ArabicAnalyzer extend
    * 
    * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
    *         built from an {@link StandardTokenizer} filtered with
-   *         {@link LowerCaseFilter}, {@link StopFilter},
+   *         {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link StopFilter},
    *         {@link ArabicNormalizationFilter}, {@link SetKeywordMarkerFilter}
    *         if a stem exclusion set is provided and {@link ArabicStemFilter}.
    */
@@ -132,6 +134,9 @@ public final class ArabicAnalyzer extend
   protected TokenStreamComponents createComponents(String fieldName) {
     final Tokenizer source = new StandardTokenizer();
     TokenStream result = new LowerCaseFilter(source);
+    if (getVersion().onOrAfter(Version.LUCENE_5_4_0)) {
+      result = new DecimalDigitFilter(result);
+    }
     // the order here is important: the stopword list is not normalized!
     result = new StopFilter(result, stopwords);
     // TODO maybe we should make ArabicNormalization filter also KeywordAttribute aware?!
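
For reference, each analyzer in this commit guards the new filter behind
getVersion().onOrAfter(Version.LUCENE_5_4_0), so indexes analyzed with an older
version keep their previous behavior. A minimal opt-out sketch (not part of the
commit; assumes lucene-analyzers-common 5.4 on the classpath), pinning the analyzer
to 5.3 the same way the back-compat tests below do:

    import org.apache.lucene.analysis.ar.ArabicAnalyzer;
    import org.apache.lucene.util.Version;

    public class BackCompatSketch {
      public static void main(String[] args) throws Exception {
        ArabicAnalyzer folding = new ArabicAnalyzer();   // 5.4+ default: folds ١٢٣٤ to 1234
        ArabicAnalyzer legacy = new ArabicAnalyzer();
        legacy.setVersion(Version.LUCENE_5_3_0);         // pre-5.4 behavior: digits left as-is
        folding.close();
        legacy.close();
      }
    }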

Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java?rev=1695898&r1=1695897&r2=1695898&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java Fri Aug 14 13:12:56 2015
@@ -22,6 +22,7 @@ import java.io.Reader;
 import java.nio.charset.StandardCharsets;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.core.DecimalDigitFilter;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
 import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
@@ -33,6 +34,7 @@ import org.apache.lucene.analysis.util.C
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
 import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.Version;
 
 /**
  * {@link Analyzer} for Sorani Kurdish.
@@ -108,7 +110,7 @@ public final class SoraniAnalyzer extend
    *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
    *         built from an {@link StandardTokenizer} filtered with
    *         {@link StandardFilter}, {@link SoraniNormalizationFilter}, 
-   *         {@link LowerCaseFilter}, {@link StopFilter}
+   *         {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link StopFilter}
    *         , {@link SetKeywordMarkerFilter} if a stem exclusion set is
    *         provided and {@link SoraniStemFilter}.
    */
@@ -118,6 +120,9 @@ public final class SoraniAnalyzer extend
     TokenStream result = new StandardFilter(source);
     result = new SoraniNormalizationFilter(result);
     result = new LowerCaseFilter(result);
+    if (getVersion().onOrAfter(Version.LUCENE_5_4_0)) {
+      result = new DecimalDigitFilter(result);
+    }
     result = new StopFilter(result, stopwords);
     if(!stemExclusionSet.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);

Added: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java?rev=1695898&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java Fri Aug 14 13:12:56 2015
@@ -0,0 +1,66 @@
+package org.apache.lucene.analysis.core;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.util.StemmerUtil;
+
+/**
+ * Folds all Unicode digits in {@code [:General_Category=Decimal_Number:]}
+ * to Basic Latin digits ({@code 0-9}). 
+ */
+public final class DecimalDigitFilter extends TokenFilter {
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+  /** 
+   * Creates a new DecimalDigitFilter over {@code input}
+   */
+  public DecimalDigitFilter(TokenStream input) {
+    super(input);
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      char buffer[] = termAtt.buffer();
+      int length = termAtt.length();
+      
+      for (int i = 0; i < length; i++) {
+        int ch = Character.codePointAt(buffer, i, length);
+        // look for digits outside of basic latin
+        if (ch > 0x7F && Character.isDigit(ch)) {
+          // replace with equivalent basic latin digit
+          buffer[i] = (char) ('0' + Character.getNumericValue(ch));
+          // if the original was supplementary, shrink the string
+          if (ch > 0xFFFF) {
+            length = StemmerUtil.delete(buffer, ++i, length);
+            termAtt.setLength(length);
+          }
+        }
+      }
+      
+      return true;
+    } else {
+      return false;
+    }
+  }
+}
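
A minimal usage sketch for the new filter (not part of the commit; assumes
lucene-core and lucene-analyzers-common 5.4 on the classpath, and the field name
"f" is arbitrary). The filter sits behind any tokenizer, rewrites each non-ASCII
decimal digit in the term buffer to its '0'-'9' equivalent, and shortens the term
when a supplementary-plane digit collapses into a single char:

    import java.io.IOException;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.core.DecimalDigitFilter;
    import org.apache.lucene.analysis.standard.StandardTokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class DecimalDigitFilterSketch {
      public static void main(String[] args) throws IOException {
        Analyzer analyzer = new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new StandardTokenizer();
            // fold any Unicode decimal digits to '0'-'9' right after tokenizing
            return new TokenStreamComponents(source, new DecimalDigitFilter(source));
          }
        };
        try (TokenStream ts = analyzer.tokenStream("f", "order ١٢٣٤ confirmed")) {
          CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
          ts.reset();
          while (ts.incrementToken()) {
            System.out.println(term.toString());   // prints: order, 1234, confirmed
          }
          ts.end();
        }
        analyzer.close();
      }
    }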

Added: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilterFactory.java?rev=1695898&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilterFactory.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilterFactory.java Fri Aug 14 13:12:56 2015
@@ -0,0 +1,56 @@
+package org.apache.lucene.analysis.core;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
+import org.apache.lucene.analysis.util.MultiTermAwareComponent;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Factory for {@link DecimalDigitFilter}. 
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_lwrcase" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.DecimalDigitFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ */
+public class DecimalDigitFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
+  
+  /** Creates a new DecimalDigitFilterFactory */
+  public DecimalDigitFilterFactory(Map<String,String> args) {
+    super(args);
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  @Override
+  public DecimalDigitFilter create(TokenStream input) {
+    return new DecimalDigitFilter(input);
+  }
+
+  @Override
+  public AbstractAnalysisFactory getMultiTermComponent() {
+    return this;
+  }
+}
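
A short programmatic sketch for the factory (not part of the commit): it accepts no
parameters, so an empty map is passed, and because it implements
MultiTermAwareComponent, analysis chains that honor multi-term analysis (e.g. Solr
wildcard or prefix queries) can apply the same folding at query time:

    import java.util.HashMap;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.DecimalDigitFilterFactory;

    public class DecimalDigitFactorySketch {
      /** Wraps any upstream token stream with the digit-folding filter. */
      static TokenStream foldDigits(TokenStream in) {
        DecimalDigitFilterFactory factory =
            new DecimalDigitFilterFactory(new HashMap<String,String>());
        return factory.create(in);
      }
    }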

Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java?rev=1695898&r1=1695897&r2=1695898&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java Fri Aug 14 13:12:56 2015
@@ -24,11 +24,13 @@ import org.apache.lucene.analysis.Analyz
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
+import org.apache.lucene.analysis.core.DecimalDigitFilter;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+import org.apache.lucene.util.Version;
 
 /**
  * {@link Analyzer} for Persian.
@@ -107,13 +109,16 @@ public final class PersianAnalyzer exten
    * 
    * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
    *         built from a {@link StandardTokenizer} filtered with
-   *         {@link LowerCaseFilter}, {@link ArabicNormalizationFilter},
+   *         {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link ArabicNormalizationFilter},
    *         {@link PersianNormalizationFilter} and Persian Stop words
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
     final Tokenizer source = new StandardTokenizer();
     TokenStream result = new LowerCaseFilter(source);
+    if (getVersion().onOrAfter(Version.LUCENE_5_4_0)) {
+      result = new DecimalDigitFilter(result);
+    }
     result = new ArabicNormalizationFilter(result);
     /* additional persian-specific normalization */
     result = new PersianNormalizationFilter(result);

Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java?rev=1695898&r1=1695897&r2=1695898&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java Fri Aug 14 13:12:56 2015
@@ -26,9 +26,11 @@ import org.apache.lucene.analysis.util.C
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.DecimalDigitFilter;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
 import org.apache.lucene.analysis.in.IndicNormalizationFilter;
+import org.apache.lucene.util.Version;
 
 /**
  * Analyzer for Hindi.
@@ -106,7 +108,7 @@ public final class HindiAnalyzer extends
    * 
    * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
    *         built from a {@link StandardTokenizer} filtered with
-   *         {@link LowerCaseFilter}, {@link IndicNormalizationFilter},
+   *         {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link IndicNormalizationFilter},
    *         {@link HindiNormalizationFilter}, {@link SetKeywordMarkerFilter}
    *         if a stem exclusion set is provided, {@link HindiStemFilter}, and
    *         Hindi Stop words
@@ -115,6 +117,9 @@ public final class HindiAnalyzer extends
   protected TokenStreamComponents createComponents(String fieldName) {
     final Tokenizer source = new StandardTokenizer();
     TokenStream result = new LowerCaseFilter(source);
+    if (getVersion().onOrAfter(Version.LUCENE_5_4_0)) {
+      result = new DecimalDigitFilter(result);
+    }
     if (!stemExclusionSet.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new IndicNormalizationFilter(result);

Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java?rev=1695898&r1=1695897&r2=1695898&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java Fri Aug 14 13:12:56 2015
@@ -22,10 +22,12 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.DecimalDigitFilter;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+import org.apache.lucene.util.Version;
 
 /**
  * {@link Analyzer} for Thai language. It uses {@link java.text.BreakIterator} to break words.
@@ -90,12 +92,15 @@ public final class ThaiAnalyzer extends
    * 
    * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
    *         built from a {@link ThaiTokenizer} filtered with
-   *         {@link LowerCaseFilter} and {@link StopFilter}
+   *         {@link LowerCaseFilter}, {@link DecimalDigitFilter} and {@link StopFilter}
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
     final Tokenizer source = new ThaiTokenizer();
     TokenStream result = new LowerCaseFilter(source);
+    if (getVersion().onOrAfter(Version.LUCENE_5_4_0)) {
+      result = new DecimalDigitFilter(result);
+    }
     result = new StopFilter(result, stopwords);
     return new TokenStreamComponents(source, result);
   }

Modified: lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory?rev=1695898&r1=1695897&r2=1695898&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory Fri Aug 14 13:12:56 2015
@@ -26,6 +26,7 @@ org.apache.lucene.analysis.commongrams.C
 org.apache.lucene.analysis.commongrams.CommonGramsQueryFilterFactory
 org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilterFactory
 org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilterFactory
+org.apache.lucene.analysis.core.DecimalDigitFilterFactory
 org.apache.lucene.analysis.core.LowerCaseFilterFactory
 org.apache.lucene.analysis.core.StopFilterFactory
 org.apache.lucene.analysis.core.TypeTokenFilterFactory
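
Registering the class here is what lets the analysis SPI load it by short name; the
"DecimalDigit" lookup in the factory test below resolves through this file. A sketch
of the equivalent direct lookup (assuming TokenFilterFactory.forName as the entry
point, which is how these service entries are typically consumed):

    import java.util.HashMap;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.util.TokenFilterFactory;

    public class SpiLookupSketch {
      /** Looks the filter factory up by its SPI name and applies it. */
      static TokenStream foldDigits(TokenStream in) {
        TokenFilterFactory factory =
            TokenFilterFactory.forName("DecimalDigit", new HashMap<String,String>());
        return factory.create(in);
      }
    }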

Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java?rev=1695898&r1=1695897&r2=1695898&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java Fri Aug 14 13:12:56 2015
@@ -21,6 +21,7 @@ import java.io.IOException;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.Version;
 
 /**
  * Test the Arabic Analyzer
@@ -100,6 +101,27 @@ public class TestArabicAnalyzer extends
     a.close();
   }
   
+  /**
+   * test we fold digits to latin-1
+   */
+  public void testDigits() throws Exception {
+    ArabicAnalyzer a = new ArabicAnalyzer();
+    checkOneTerm(a, "١٢٣٤", "1234");
+    a.close();
+  }
+  
+  /**
+   * test that we don't fold digits for back compat behavior
+   * @deprecated remove this test in lucene 7
+   */
+  @Deprecated
+  public void testDigitsBackCompat() throws Exception {
+    ArabicAnalyzer a = new ArabicAnalyzer();
+    a.setVersion(Version.LUCENE_5_3_0);
+    checkOneTerm(a, "١٢٣٤", "١٢٣٤");
+    a.close();
+  }
+  
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     ArabicAnalyzer a = new ArabicAnalyzer();

Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniAnalyzer.java?rev=1695898&r1=1695897&r2=1695898&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniAnalyzer.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniAnalyzer.java Fri Aug 14 13:12:56 2015
@@ -22,6 +22,7 @@ import java.io.IOException;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.Version;
 
 /**
  * Test the Sorani analyzer
@@ -63,6 +64,28 @@ public class TestSoraniAnalyzer extends
     a.close();
   }
   
+  /**
+   * test we fold digits to latin-1
+   * (these are somewhat rare, but generally a few % of digits still)
+   */
+  public void testDigits() throws Exception {
+    SoraniAnalyzer a = new SoraniAnalyzer();
+    checkOneTerm(a, "١٢٣٤", "1234");
+    a.close();
+  }
+  
+  /**
+   * test that we don't fold digits for back compat behavior
+   * @deprecated remove this test in lucene 7
+   */
+  @Deprecated
+  public void testDigitsBackCompat() throws Exception {
+    SoraniAnalyzer a = new SoraniAnalyzer();
+    a.setVersion(Version.LUCENE_5_3_0);
+    checkOneTerm(a, "١٢٣٤", "١٢٣٤");
+    a.close();
+  }
+  
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     Analyzer a = new SoraniAnalyzer();

Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniStemFilter.java?rev=1695898&r1=1695897&r2=1695898&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniStemFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniStemFilter.java Fri Aug 14 13:12:56 2015
@@ -23,6 +23,8 @@ import java.io.IOException;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 
@@ -107,6 +109,18 @@ public class TestSoraniStemFilter extend
   /** test against a basic vocabulary file */
   public void testVocabulary() throws Exception {
     // top 8k words or so: freq > 1000
+    
+    // just normalization+stem, we are testing that the stemming doesn't break.
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
+        TokenStream stream = new SoraniNormalizationFilter(tokenizer);
+        stream = new SoraniStemFilter(stream);
+        return new TokenStreamComponents(tokenizer, stream);
+      }
+    };
     assertVocabulary(a, getDataPath("ckbtestdata.zip"), "testdata.txt");
+    a.close();
   }
 }

Added: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java?rev=1695898&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java Fri Aug 14 13:12:56 2015
@@ -0,0 +1,106 @@
+package org.apache.lucene.analysis.core;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.util.TestUtil;
+
+/**
+ * Tests for {@link DecimalDigitFilter}
+ */
+public class TestDecimalDigitFilter extends BaseTokenStreamTestCase {
+  private Analyzer tokenized;
+  private Analyzer keyword;
+  
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    tokenized = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new DecimalDigitFilter(tokenizer));
+      }
+    };
+    keyword = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new KeywordTokenizer();
+        return new TokenStreamComponents(tokenizer, new DecimalDigitFilter(tokenizer));
+      }
+    };
+  }
+  
+  @Override
+  public void tearDown() throws Exception {
+    tokenized.close();
+    keyword.close();
+    super.tearDown();
+  }
+
+  /**
+   * test that digits are normalized
+   */
+  public void testSimple() throws Exception {
+    checkOneTerm(tokenized, "١٢٣٤", "1234");
+  }
+  
+  /**
+   * test all digits in different locations of strings.
+   */
+  public void testRandom() throws Exception {
+    for (int codepoint = Character.MIN_CODE_POINT; codepoint < Character.MAX_CODE_POINT; codepoint++) {
+      if (Character.isDigit(codepoint)) {
+        // add some a-z before/after the string
+        String prefix = TestUtil.randomSimpleString(random());
+        String suffix = TestUtil.randomSimpleString(random());
+        
+        StringBuilder expected = new StringBuilder();
+        expected.append(prefix);
+        int value = Character.getNumericValue(codepoint);
+        assert value >= 0 && value <= 9;
+        expected.append(Integer.toString(value));
+        expected.append(suffix);
+        
+        StringBuilder actual = new StringBuilder();
+        actual.append(prefix);
+        actual.appendCodePoint(codepoint);
+        actual.append(suffix);
+        
+        checkOneTerm(keyword, actual.toString(), expected.toString());
+      }
+    }
+  }
+  
+  /**
+   * check the filter is a no-op for the empty string term
+   */
+  public void testEmptyTerm() throws Exception {
+    checkOneTerm(keyword, "", "");
+  }
+  
+  /** 
+   * blast some random strings through the filter
+   */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random(), tokenized, 1000*RANDOM_MULTIPLIER);
+  }
+}

Added: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilterFactory.java?rev=1695898&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilterFactory.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilterFactory.java Fri Aug 14 13:12:56 2015
@@ -0,0 +1,50 @@
+package org.apache.lucene.analysis.core;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
+
+/**
+ * Simple tests to ensure the digit normalization factory is working.
+ */
+public class TestDecimalDigitFilterFactory extends BaseTokenStreamFactoryTestCase {
+  
+  /**
+   * Ensure the filter actually normalizes digits.
+   */
+  public void testNormalization() throws Exception {
+    Reader reader = new StringReader("١٢٣٤");
+    TokenStream stream = whitespaceMockTokenizer(reader);
+    stream = tokenFilterFactory("DecimalDigit").create(stream);
+    assertTokenStreamContents(stream, new String[] { "1234" });
+  }
+  
+  /** Test that bogus arguments result in exception */
+  public void testBogusArguments() throws Exception {
+    try {
+      tokenFilterFactory("DecimalDigit", "bogusArg", "bogusValue");
+      fail();
+    } catch (IllegalArgumentException expected) {
+      assertTrue(expected.getMessage().contains("Unknown parameters"));
+    }
+  }
+}
\ No newline at end of file

Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java?rev=1695898&r1=1695897&r2=1695898&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java Fri Aug 14 13:12:56 2015
@@ -20,6 +20,7 @@ package org.apache.lucene.analysis.fa;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.Version;
 
 /**
  * Test the Persian Analyzer
@@ -228,6 +229,27 @@ public class TestPersianAnalyzer extends
     a.close();
   }
   
+  /**
+   * test we fold digits to latin-1
+   */
+  public void testDigits() throws Exception {
+    PersianAnalyzer a = new PersianAnalyzer();
+    checkOneTerm(a, "Û±Û²Û³Û´", "1234");
+    a.close();
+  }
+  
+  /**
+   * test that we don't fold digits for back compat behavior
+   * @deprecated remove this test in lucene 7
+   */
+  @Deprecated
+  public void testDigitsBackCompat() throws Exception {
+    PersianAnalyzer a = new PersianAnalyzer();
+    a.setVersion(Version.LUCENE_5_3_0);
+    checkOneTerm(a, "Û±Û²Û³Û´", "Û±Û²Û³Û´");
+    a.close();
+  }
+  
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     PersianAnalyzer a = new PersianAnalyzer();

Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java?rev=1695898&r1=1695897&r2=1695898&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java Fri Aug 14 13:12:56 2015
@@ -3,6 +3,7 @@ package org.apache.lucene.analysis.hi;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.Version;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -47,6 +48,27 @@ public class TestHindiAnalyzer extends B
     a.close();
   }
   
+  /**
+   * test we fold digits to latin-1
+   */
+  public void testDigits() throws Exception {
+    HindiAnalyzer a = new HindiAnalyzer();
+    checkOneTerm(a, "१२३४", "1234");
+    a.close();
+  }
+  
+  /**
+   * test that we don't fold digits for back compat behavior
+   * @deprecated remove this test in lucene 7
+   */
+  @Deprecated
+  public void testDigitsBackCompat() throws Exception {
+    HindiAnalyzer a = new HindiAnalyzer();
+    a.setVersion(Version.LUCENE_5_3_0);
+    checkOneTerm(a, "१२३४", "१२३४");
+    a.close();
+  }
+  
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     Analyzer analyzer = new HindiAnalyzer();

Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java?rev=1695898&r1=1695897&r2=1695898&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java Fri Aug 14 13:12:56 2015
@@ -23,6 +23,7 @@ import org.apache.lucene.analysis.TokenS
 import org.apache.lucene.analysis.core.StopAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
 import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.Version;
 
 /**
  * Test case for ThaiAnalyzer, modified from TestFrenchAnalyzer
@@ -122,6 +123,27 @@ public class TestThaiAnalyzer extends Ba
     analyzer.close();
   }
   
+  /**
+   * test we fold digits to latin-1
+   */
+  public void testDigits() throws Exception {
+    ThaiAnalyzer a = new ThaiAnalyzer();
+    checkOneTerm(a, "๑๒๓๔", "1234");
+    a.close();
+  }
+  
+  /**
+   * test that we don't fold digits for back compat behavior
+   * @deprecated remove this test in lucene 7
+   */
+  @Deprecated
+  public void testDigitsBackCompat() throws Exception {
+    ThaiAnalyzer a = new ThaiAnalyzer();
+    a.setVersion(Version.LUCENE_5_3_0);
+    checkOneTerm(a, "๑๒๓๔", "๑๒๓๔");
+    a.close();
+  }
+  
   public void testTwoSentences() throws Exception {
     Analyzer analyzer = new ThaiAnalyzer(CharArraySet.EMPTY_SET);
     assertAnalyzesTo(analyzer, "This is a test. การที่ได้ต้องแสดงว่างานดี",