You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2015/08/14 15:42:09 UTC
svn commit: r1695908 - in /lucene/dev/branches/branch_5x: ./ lucene/
lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/
lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/
lucene/analysis/common/src/java/org/apache/lucene/analys...
Author: rmuir
Date: Fri Aug 14 13:42:08 2015
New Revision: 1695908
URL: http://svn.apache.org/r1695908
Log:
LUCENE-6737: Add DecimalDigitFilter which folds unicode digits to basic latin
Added:
lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java
- copied unchanged from r1695898, lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java
lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilterFactory.java
- copied unchanged from r1695898, lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilterFactory.java
lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java
- copied unchanged from r1695898, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java
lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilterFactory.java
- copied unchanged from r1695898, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilterFactory.java
Modified:
lucene/dev/branches/branch_5x/ (props changed)
lucene/dev/branches/branch_5x/lucene/ (props changed)
lucene/dev/branches/branch_5x/lucene/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java
lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java
lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
lucene/dev/branches/branch_5x/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniAnalyzer.java
lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniStemFilter.java
lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java
lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java
lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
Modified: lucene/dev/branches/branch_5x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/CHANGES.txt?rev=1695908&r1=1695907&r2=1695908&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/lucene/CHANGES.txt Fri Aug 14 13:42:08 2015
@@ -14,6 +14,9 @@ New Features
* LUCENE-6724: Add utility APIs to GeoHashUtils to compute neighbor
geohash cells (Nick Knize via Mike McCandless).
+* LUCENE-6737: Add DecimalDigitFilter which folds unicode digits to basic latin.
+ (Robert Muir)
+
Optimizations
* LUCENE-6708: TopFieldCollector does not compute the score several times on the
Modified: lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java?rev=1695908&r1=1695907&r2=1695908&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java Fri Aug 14 13:42:08 2015
@@ -21,6 +21,7 @@ import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.core.DecimalDigitFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
@@ -126,7 +127,7 @@ public final class ArabicAnalyzer extend
*
* @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
- * {@link LowerCaseFilter}, {@link StopFilter},
+ * {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link StopFilter},
* {@link ArabicNormalizationFilter}, {@link SetKeywordMarkerFilter}
* if a stem exclusion set is provided and {@link ArabicStemFilter}.
*/
@@ -139,6 +140,9 @@ public final class ArabicAnalyzer extend
source = new StandardTokenizer40();
}
TokenStream result = new LowerCaseFilter(source);
+ if (getVersion().onOrAfter(Version.LUCENE_5_4_0)) {
+ result = new DecimalDigitFilter(result);
+ }
// the order here is important: the stopword list is not normalized!
result = new StopFilter(result, stopwords);
// TODO maybe we should make ArabicNormalization filter also KeywordAttribute aware?!
Modified: lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java?rev=1695908&r1=1695907&r2=1695908&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java Fri Aug 14 13:42:08 2015
@@ -22,6 +22,7 @@ import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.core.DecimalDigitFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
@@ -110,7 +111,7 @@ public final class SoraniAnalyzer extend
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link SoraniNormalizationFilter},
- * {@link LowerCaseFilter}, {@link StopFilter}
+ * {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link StopFilter}
* , {@link SetKeywordMarkerFilter} if a stem exclusion set is
* provided and {@link SoraniStemFilter}.
*/
@@ -125,6 +126,9 @@ public final class SoraniAnalyzer extend
TokenStream result = new StandardFilter(source);
result = new SoraniNormalizationFilter(result);
result = new LowerCaseFilter(result);
+ if (getVersion().onOrAfter(Version.LUCENE_5_4_0)) {
+ result = new DecimalDigitFilter(result);
+ }
result = new StopFilter(result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
Modified: lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java?rev=1695908&r1=1695907&r2=1695908&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java Fri Aug 14 13:42:08 2015
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyz
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
+import org.apache.lucene.analysis.core.DecimalDigitFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
@@ -109,7 +110,7 @@ public final class PersianAnalyzer exten
*
* @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from a {@link StandardTokenizer} filtered with
- * {@link LowerCaseFilter}, {@link ArabicNormalizationFilter},
+ * {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link ArabicNormalizationFilter},
* {@link PersianNormalizationFilter} and Persian Stop words
*/
@Override
@@ -121,6 +122,9 @@ public final class PersianAnalyzer exten
source = new StandardTokenizer40();
}
TokenStream result = new LowerCaseFilter(source);
+ if (getVersion().onOrAfter(Version.LUCENE_5_4_0)) {
+ result = new DecimalDigitFilter(result);
+ }
result = new ArabicNormalizationFilter(result);
/* additional persian-specific normalization */
result = new PersianNormalizationFilter(result);
Modified: lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java?rev=1695908&r1=1695907&r2=1695908&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java Fri Aug 14 13:42:08 2015
@@ -27,6 +27,7 @@ import org.apache.lucene.analysis.util.C
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.DecimalDigitFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
@@ -108,7 +109,7 @@ public final class HindiAnalyzer extends
*
* @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from a {@link StandardTokenizer} filtered with
- * {@link LowerCaseFilter}, {@link IndicNormalizationFilter},
+ * {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link IndicNormalizationFilter},
* {@link HindiNormalizationFilter}, {@link SetKeywordMarkerFilter}
* if a stem exclusion set is provided, {@link HindiStemFilter}, and
* Hindi Stop words
@@ -122,6 +123,9 @@ public final class HindiAnalyzer extends
source = new StandardTokenizer40();
}
TokenStream result = new LowerCaseFilter(source);
+ if (getVersion().onOrAfter(Version.LUCENE_5_4_0)) {
+ result = new DecimalDigitFilter(result);
+ }
if (!stemExclusionSet.isEmpty())
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new IndicNormalizationFilter(result);
Modified: lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java?rev=1695908&r1=1695907&r2=1695908&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java Fri Aug 14 13:42:08 2015
@@ -22,6 +22,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.DecimalDigitFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
@@ -93,15 +94,17 @@ public final class ThaiAnalyzer extends
* used to tokenize all the text in the provided {@link Reader}.
*
* @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
- * built from a {@link StandardTokenizer} filtered with
- * {@link StandardFilter}, {@link LowerCaseFilter}, {@link ThaiWordFilter}, and
- * {@link StopFilter}
+ * built from a {@link ThaiTokenizer} filtered with
+ * {@link LowerCaseFilter}, {@link DecimalDigitFilter} and {@link StopFilter}
*/
@Override
protected TokenStreamComponents createComponents(String fieldName) {
if (getVersion().onOrAfter(Version.LUCENE_4_8_0)) {
final Tokenizer source = new ThaiTokenizer();
TokenStream result = new LowerCaseFilter(source);
+ if (getVersion().onOrAfter(Version.LUCENE_5_4_0)) {
+ result = new DecimalDigitFilter(result);
+ }
result = new StopFilter(result, stopwords);
return new TokenStreamComponents(source, result);
} else {
Modified: lucene/dev/branches/branch_5x/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory?rev=1695908&r1=1695907&r2=1695908&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory Fri Aug 14 13:42:08 2015
@@ -26,6 +26,7 @@ org.apache.lucene.analysis.commongrams.C
org.apache.lucene.analysis.commongrams.CommonGramsQueryFilterFactory
org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilterFactory
org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilterFactory
+org.apache.lucene.analysis.core.DecimalDigitFilterFactory
org.apache.lucene.analysis.core.LowerCaseFilterFactory
org.apache.lucene.analysis.core.StopFilterFactory
org.apache.lucene.analysis.core.TypeTokenFilterFactory
Modified: lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java?rev=1695908&r1=1695907&r2=1695908&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java Fri Aug 14 13:42:08 2015
@@ -101,6 +101,27 @@ public class TestArabicAnalyzer extends
a.close();
}
+ /**
+ * test we fold digits to latin-1
+ */
+ public void testDigits() throws Exception {
+ ArabicAnalyzer a = new ArabicAnalyzer();
+ checkOneTerm(a, "١٢٣٤", "1234");
+ a.close();
+ }
+
+ /**
+ * test that we don't fold digits for back compat behavior
+ * @deprecated remove this test in lucene 7
+ */
+ @Deprecated
+ public void testDigitsBackCompat() throws Exception {
+ ArabicAnalyzer a = new ArabicAnalyzer();
+ a.setVersion(Version.LUCENE_5_3_0);
+ checkOneTerm(a, "١٢٣٤", "١٢٣٤");
+ a.close();
+ }
+
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
ArabicAnalyzer a = new ArabicAnalyzer();
Modified: lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniAnalyzer.java?rev=1695908&r1=1695907&r2=1695908&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniAnalyzer.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniAnalyzer.java Fri Aug 14 13:42:08 2015
@@ -64,6 +64,28 @@ public class TestSoraniAnalyzer extends
a.close();
}
+ /**
+ * test we fold digits to latin-1
+ * (these are somewhat rare, but generally a few % of digits still)
+ */
+ public void testDigits() throws Exception {
+ SoraniAnalyzer a = new SoraniAnalyzer();
+ checkOneTerm(a, "١٢٣٤", "1234");
+ a.close();
+ }
+
+ /**
+ * test that we don't fold digits for back compat behavior
+ * @deprecated remove this test in lucene 7
+ */
+ @Deprecated
+ public void testDigitsBackCompat() throws Exception {
+ SoraniAnalyzer a = new SoraniAnalyzer();
+ a.setVersion(Version.LUCENE_5_3_0);
+ checkOneTerm(a, "١٢٣٤", "١٢٣٤");
+ a.close();
+ }
+
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new SoraniAnalyzer();
Modified: lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniStemFilter.java?rev=1695908&r1=1695907&r2=1695908&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniStemFilter.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniStemFilter.java Fri Aug 14 13:42:08 2015
@@ -23,6 +23,8 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
@@ -107,6 +109,18 @@ public class TestSoraniStemFilter extend
/** test against a basic vocabulary file */
public void testVocabulary() throws Exception {
// top 8k words or so: freq > 1000
+
+ // just normalization+stem, we are testing that the stemming doesn't break.
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
+ TokenStream stream = new SoraniNormalizationFilter(tokenizer);
+ stream = new SoraniStemFilter(stream);
+ return new TokenStreamComponents(tokenizer, stream);
+ }
+ };
assertVocabulary(a, getDataPath("ckbtestdata.zip"), "testdata.txt");
+ a.close();
}
}
Modified: lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java?rev=1695908&r1=1695907&r2=1695908&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java Fri Aug 14 13:42:08 2015
@@ -231,6 +231,27 @@ public class TestPersianAnalyzer extends
a.close();
}
+ /**
+ * test we fold digits to latin-1
+ */
+ public void testDigits() throws Exception {
+ PersianAnalyzer a = new PersianAnalyzer();
+ checkOneTerm(a, "۱۲۳۴", "1234");
+ a.close();
+ }
+
+ /**
+ * test that we don't fold digits for back compat behavior
+ * @deprecated remove this test in lucene 7
+ */
+ @Deprecated
+ public void testDigitsBackCompat() throws Exception {
+ PersianAnalyzer a = new PersianAnalyzer();
+ a.setVersion(Version.LUCENE_5_3_0);
+ checkOneTerm(a, "۱۲۳۴", "۱۲۳۴");
+ a.close();
+ }
+
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
PersianAnalyzer a = new PersianAnalyzer();
Modified: lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java?rev=1695908&r1=1695907&r2=1695908&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java Fri Aug 14 13:42:08 2015
@@ -50,6 +50,27 @@ public class TestHindiAnalyzer extends B
a.close();
}
+ /**
+ * test we fold digits to latin-1
+ */
+ public void testDigits() throws Exception {
+ HindiAnalyzer a = new HindiAnalyzer();
+ checkOneTerm(a, "१२३४", "1234");
+ a.close();
+ }
+
+ /**
+ * test that we don't fold digits for back compat behavior
+ * @deprecated remove this test in lucene 7
+ */
+ @Deprecated
+ public void testDigitsBackCompat() throws Exception {
+ HindiAnalyzer a = new HindiAnalyzer();
+ a.setVersion(Version.LUCENE_5_3_0);
+ checkOneTerm(a, "१२३४", "१२३४");
+ a.close();
+ }
+
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer analyzer = new HindiAnalyzer();
Modified: lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java?rev=1695908&r1=1695907&r2=1695908&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java Fri Aug 14 13:42:08 2015
@@ -123,6 +123,27 @@ public class TestThaiAnalyzer extends Ba
analyzer.close();
}
+ /**
+ * test we fold digits to latin-1
+ */
+ public void testDigits() throws Exception {
+ ThaiAnalyzer a = new ThaiAnalyzer();
+ checkOneTerm(a, "๑๒๓๔", "1234");
+ a.close();
+ }
+
+ /**
+ * test that we don't fold digits for back compat behavior
+ * @deprecated remove this test in lucene 7
+ */
+ @Deprecated
+ public void testDigitsBackCompat() throws Exception {
+ ThaiAnalyzer a = new ThaiAnalyzer();
+ a.setVersion(Version.LUCENE_5_3_0);
+ checkOneTerm(a, "๑๒๓๔", "๑๒๓๔");
+ a.close();
+ }
+
public void testTwoSentences() throws Exception {
Analyzer analyzer = new ThaiAnalyzer(CharArraySet.EMPTY_SET);
assertAnalyzesTo(analyzer, "This is a test. การที่ได้ต้องแสดงว่างานดี",