You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2015/08/14 15:42:09 UTC

svn commit: r1695908 - in /lucene/dev/branches/branch_5x: ./ lucene/ lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/ lucene/analysis/common/src/java/org/apache/lucene/analys...

Author: rmuir
Date: Fri Aug 14 13:42:08 2015
New Revision: 1695908

URL: http://svn.apache.org/r1695908
Log:
LUCENE-6737: Add DecimalDigitFilter which folds unicode digits to basic latin

Added:
    lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java
      - copied unchanged from r1695898, lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java
    lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilterFactory.java
      - copied unchanged from r1695898, lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilterFactory.java
    lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java
      - copied unchanged from r1695898, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java
    lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilterFactory.java
      - copied unchanged from r1695898, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilterFactory.java
Modified:
    lucene/dev/branches/branch_5x/   (props changed)
    lucene/dev/branches/branch_5x/lucene/   (props changed)
    lucene/dev/branches/branch_5x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
    lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java
    lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
    lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java
    lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
    lucene/dev/branches/branch_5x/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
    lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
    lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniAnalyzer.java
    lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniStemFilter.java
    lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java
    lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java
    lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java

Modified: lucene/dev/branches/branch_5x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/CHANGES.txt?rev=1695908&r1=1695907&r2=1695908&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/lucene/CHANGES.txt Fri Aug 14 13:42:08 2015
@@ -14,6 +14,9 @@ New Features
 * LUCENE-6724: Add utility APIs to GeoHashUtils to compute neighbor
   geohash cells (Nick Knize via Mike McCandless).
 
+* LUCENE-6737: Add DecimalDigitFilter which folds unicode digits to basic latin.
+  (Robert Muir)
+
 Optimizations
 
 * LUCENE-6708: TopFieldCollector does not compute the score several times on the

Modified: lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java?rev=1695908&r1=1695907&r2=1695908&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java Fri Aug 14 13:42:08 2015
@@ -21,6 +21,7 @@ import java.io.IOException;
 import java.io.Reader;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.core.DecimalDigitFilter;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
 import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
@@ -126,7 +127,7 @@ public final class ArabicAnalyzer extend
    * 
    * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
    *         built from an {@link StandardTokenizer} filtered with
-   *         {@link LowerCaseFilter}, {@link StopFilter},
+   *         {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link StopFilter},
    *         {@link ArabicNormalizationFilter}, {@link SetKeywordMarkerFilter}
    *         if a stem exclusion set is provided and {@link ArabicStemFilter}.
    */
@@ -139,6 +140,9 @@ public final class ArabicAnalyzer extend
       source = new StandardTokenizer40();
     }
     TokenStream result = new LowerCaseFilter(source);
+    if (getVersion().onOrAfter(Version.LUCENE_5_4_0)) {
+      result = new DecimalDigitFilter(result);
+    }
     // the order here is important: the stopword list is not normalized!
     result = new StopFilter(result, stopwords);
     // TODO maybe we should make ArabicNormalization filter also KeywordAttribute aware?!

Modified: lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java?rev=1695908&r1=1695907&r2=1695908&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java Fri Aug 14 13:42:08 2015
@@ -22,6 +22,7 @@ import java.io.Reader;
 import java.nio.charset.StandardCharsets;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.core.DecimalDigitFilter;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
 import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
@@ -110,7 +111,7 @@ public final class SoraniAnalyzer extend
    *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
    *         built from an {@link StandardTokenizer} filtered with
    *         {@link StandardFilter}, {@link SoraniNormalizationFilter}, 
-   *         {@link LowerCaseFilter}, {@link StopFilter}
+   *         {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link StopFilter}
    *         , {@link SetKeywordMarkerFilter} if a stem exclusion set is
    *         provided and {@link SoraniStemFilter}.
    */
@@ -125,6 +126,9 @@ public final class SoraniAnalyzer extend
     TokenStream result = new StandardFilter(source);
     result = new SoraniNormalizationFilter(result);
     result = new LowerCaseFilter(result);
+    if (getVersion().onOrAfter(Version.LUCENE_5_4_0)) {
+      result = new DecimalDigitFilter(result);
+    }
     result = new StopFilter(result, stopwords);
     if(!stemExclusionSet.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);

Modified: lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java?rev=1695908&r1=1695907&r2=1695908&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java Fri Aug 14 13:42:08 2015
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyz
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
+import org.apache.lucene.analysis.core.DecimalDigitFilter;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
@@ -109,7 +110,7 @@ public final class PersianAnalyzer exten
    * 
    * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
    *         built from a {@link StandardTokenizer} filtered with
-   *         {@link LowerCaseFilter}, {@link ArabicNormalizationFilter},
+   *         {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link ArabicNormalizationFilter},
    *         {@link PersianNormalizationFilter} and Persian Stop words
    */
   @Override
@@ -121,6 +122,9 @@ public final class PersianAnalyzer exten
       source = new StandardTokenizer40();
     }
     TokenStream result = new LowerCaseFilter(source);
+    if (getVersion().onOrAfter(Version.LUCENE_5_4_0)) {
+      result = new DecimalDigitFilter(result);
+    }
     result = new ArabicNormalizationFilter(result);
     /* additional persian-specific normalization */
     result = new PersianNormalizationFilter(result);

Modified: lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java?rev=1695908&r1=1695907&r2=1695908&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java Fri Aug 14 13:42:08 2015
@@ -27,6 +27,7 @@ import org.apache.lucene.analysis.util.C
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.DecimalDigitFilter;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
 import org.apache.lucene.analysis.in.IndicNormalizationFilter;
@@ -108,7 +109,7 @@ public final class HindiAnalyzer extends
    * 
    * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
    *         built from a {@link StandardTokenizer} filtered with
-   *         {@link LowerCaseFilter}, {@link IndicNormalizationFilter},
+   *         {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link IndicNormalizationFilter},
    *         {@link HindiNormalizationFilter}, {@link SetKeywordMarkerFilter}
    *         if a stem exclusion set is provided, {@link HindiStemFilter}, and
    *         Hindi Stop words
@@ -122,6 +123,9 @@ public final class HindiAnalyzer extends
       source = new StandardTokenizer40();
     }
     TokenStream result = new LowerCaseFilter(source);
+    if (getVersion().onOrAfter(Version.LUCENE_5_4_0)) {
+      result = new DecimalDigitFilter(result);
+    }
     if (!stemExclusionSet.isEmpty())
       result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     result = new IndicNormalizationFilter(result);

Modified: lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java?rev=1695908&r1=1695907&r2=1695908&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java Fri Aug 14 13:42:08 2015
@@ -22,6 +22,7 @@ import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.DecimalDigitFilter;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
 import org.apache.lucene.analysis.standard.StandardFilter;
@@ -93,15 +94,17 @@ public final class ThaiAnalyzer extends
    * used to tokenize all the text in the provided {@link Reader}.
    * 
    * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
-   *         built from a {@link StandardTokenizer} filtered with
-   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link ThaiWordFilter}, and
-   *         {@link StopFilter}
+   *         built from a {@link ThaiTokenizer} filtered with
+   *         {@link LowerCaseFilter}, {@link DecimalDigitFilter} and {@link StopFilter}
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
     if (getVersion().onOrAfter(Version.LUCENE_4_8_0)) {
       final Tokenizer source = new ThaiTokenizer();
       TokenStream result = new LowerCaseFilter(source);
+      if (getVersion().onOrAfter(Version.LUCENE_5_4_0)) {
+        result = new DecimalDigitFilter(result);
+      }
       result = new StopFilter(result, stopwords);
       return new TokenStreamComponents(source, result);
     } else {

Modified: lucene/dev/branches/branch_5x/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory?rev=1695908&r1=1695907&r2=1695908&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory Fri Aug 14 13:42:08 2015
@@ -26,6 +26,7 @@ org.apache.lucene.analysis.commongrams.C
 org.apache.lucene.analysis.commongrams.CommonGramsQueryFilterFactory
 org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilterFactory
 org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilterFactory
+org.apache.lucene.analysis.core.DecimalDigitFilterFactory
 org.apache.lucene.analysis.core.LowerCaseFilterFactory
 org.apache.lucene.analysis.core.StopFilterFactory
 org.apache.lucene.analysis.core.TypeTokenFilterFactory

Modified: lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java?rev=1695908&r1=1695907&r2=1695908&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java Fri Aug 14 13:42:08 2015
@@ -101,6 +101,27 @@ public class TestArabicAnalyzer extends
     a.close();
   }
   
+  /**
+   * test we fold digits to latin-1
+   */
+  public void testDigits() throws Exception {
+    ArabicAnalyzer a = new ArabicAnalyzer();
+    checkOneTerm(a, "١٢٣٤", "1234");
+    a.close();
+  }
+  
+  /**
+   * test that we don't fold digits for back compat behavior
+   * @deprecated remove this test in lucene 7
+   */
+  @Deprecated
+  public void testDigitsBackCompat() throws Exception {
+    ArabicAnalyzer a = new ArabicAnalyzer();
+    a.setVersion(Version.LUCENE_5_3_0);
+    checkOneTerm(a, "١٢٣٤", "١٢٣٤");
+    a.close();
+  }
+  
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     ArabicAnalyzer a = new ArabicAnalyzer();

Modified: lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniAnalyzer.java?rev=1695908&r1=1695907&r2=1695908&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniAnalyzer.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniAnalyzer.java Fri Aug 14 13:42:08 2015
@@ -64,6 +64,28 @@ public class TestSoraniAnalyzer extends
     a.close();
   }
   
+  /**
+   * test we fold digits to latin-1
+   * (these are somewhat rare, but generally a few % of digits still)
+   */
+  public void testDigits() throws Exception {
+    SoraniAnalyzer a = new SoraniAnalyzer();
+    checkOneTerm(a, "١٢٣٤", "1234");
+    a.close();
+  }
+  
+  /**
+   * test that we don't fold digits for back compat behavior
+   * @deprecated remove this test in lucene 7
+   */
+  @Deprecated
+  public void testDigitsBackCompat() throws Exception {
+    SoraniAnalyzer a = new SoraniAnalyzer();
+    a.setVersion(Version.LUCENE_5_3_0);
+    checkOneTerm(a, "١٢٣٤", "١٢٣٤");
+    a.close();
+  }
+  
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     Analyzer a = new SoraniAnalyzer();

Modified: lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniStemFilter.java?rev=1695908&r1=1695907&r2=1695908&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniStemFilter.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniStemFilter.java Fri Aug 14 13:42:08 2015
@@ -23,6 +23,8 @@ import java.io.IOException;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 
@@ -107,6 +109,18 @@ public class TestSoraniStemFilter extend
   /** test against a basic vocabulary file */
   public void testVocabulary() throws Exception {
     // top 8k words or so: freq > 1000
+    
+    // just normalization+stem, we are testing that the stemming doesn't break.
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
+        TokenStream stream = new SoraniNormalizationFilter(tokenizer);
+        stream = new SoraniStemFilter(stream);
+        return new TokenStreamComponents(tokenizer, stream);
+      }
+    };
     assertVocabulary(a, getDataPath("ckbtestdata.zip"), "testdata.txt");
+    a.close();
   }
 }

Modified: lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java?rev=1695908&r1=1695907&r2=1695908&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java Fri Aug 14 13:42:08 2015
@@ -231,6 +231,27 @@ public class TestPersianAnalyzer extends
     a.close();
   }
   
+  /**
+   * test we fold digits to latin-1
+   */
+  public void testDigits() throws Exception {
+    PersianAnalyzer a = new PersianAnalyzer();
+    checkOneTerm(a, "Û±Û²Û³Û´", "1234");
+    a.close();
+  }
+  
+  /**
+   * test that we don't fold digits for back compat behavior
+   * @deprecated remove this test in lucene 7
+   */
+  @Deprecated
+  public void testDigitsBackCompat() throws Exception {
+    PersianAnalyzer a = new PersianAnalyzer();
+    a.setVersion(Version.LUCENE_5_3_0);
+    checkOneTerm(a, "Û±Û²Û³Û´", "Û±Û²Û³Û´");
+    a.close();
+  }
+  
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     PersianAnalyzer a = new PersianAnalyzer();

Modified: lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java?rev=1695908&r1=1695907&r2=1695908&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java Fri Aug 14 13:42:08 2015
@@ -50,6 +50,27 @@ public class TestHindiAnalyzer extends B
     a.close();
   }
   
+  /**
+   * test we fold digits to latin-1
+   */
+  public void testDigits() throws Exception {
+    HindiAnalyzer a = new HindiAnalyzer();
+    checkOneTerm(a, "१२३४", "1234");
+    a.close();
+  }
+  
+  /**
+   * test that we don't fold digits for back compat behavior
+   * @deprecated remove this test in lucene 7
+   */
+  @Deprecated
+  public void testDigitsBackCompat() throws Exception {
+    HindiAnalyzer a = new HindiAnalyzer();
+    a.setVersion(Version.LUCENE_5_3_0);
+    checkOneTerm(a, "१२३४", "१२३४");
+    a.close();
+  }
+  
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     Analyzer analyzer = new HindiAnalyzer();

Modified: lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java?rev=1695908&r1=1695907&r2=1695908&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java Fri Aug 14 13:42:08 2015
@@ -123,6 +123,27 @@ public class TestThaiAnalyzer extends Ba
     analyzer.close();
   }
   
+  /**
+   * test we fold digits to latin-1
+   */
+  public void testDigits() throws Exception {
+    ThaiAnalyzer a = new ThaiAnalyzer();
+    checkOneTerm(a, "๑๒๓๔", "1234");
+    a.close();
+  }
+  
+  /**
+   * test that we don't fold digits for back compat behavior
+   * @deprecated remove this test in lucene 7
+   */
+  @Deprecated
+  public void testDigitsBackCompat() throws Exception {
+    ThaiAnalyzer a = new ThaiAnalyzer();
+    a.setVersion(Version.LUCENE_5_3_0);
+    checkOneTerm(a, "๑๒๓๔", "๑๒๓๔");
+    a.close();
+  }
+  
   public void testTwoSentences() throws Exception {
     Analyzer analyzer = new ThaiAnalyzer(CharArraySet.EMPTY_SET);
     assertAnalyzesTo(analyzer, "This is a test. การที่ได้ต้องแสดงว่างานดี",