You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2015/08/14 15:12:57 UTC
svn commit: r1695898 - in /lucene/dev/trunk/lucene: ./
analysis/common/src/java/org/apache/lucene/analysis/ar/
analysis/common/src/java/org/apache/lucene/analysis/ckb/
analysis/common/src/java/org/apache/lucene/analysis/core/
analysis/common/src/java/o...
Author: rmuir
Date: Fri Aug 14 13:12:56 2015
New Revision: 1695898
URL: http://svn.apache.org/r1695898
Log:
LUCENE-6737: Add DecimalDigitFilter which folds unicode digits to basic latin
Added:
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java (with props)
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilterFactory.java (with props)
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java (with props)
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilterFactory.java (with props)
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniAnalyzer.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniStemFilter.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1695898&r1=1695897&r2=1695898&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Fri Aug 14 13:12:56 2015
@@ -57,6 +57,9 @@ New Features
* LUCENE-6724: Add utility APIs to GeoHashUtils to compute neighbor
geohash cells (Nick Knize via Mike McCandless).
+* LUCENE-6737: Add DecimalDigitFilter which folds unicode digits to basic latin.
+ (Robert Muir)
+
Optimizations
* LUCENE-6708: TopFieldCollector does not compute the score several times on the
Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java?rev=1695898&r1=1695897&r2=1695898&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java Fri Aug 14 13:12:56 2015
@@ -21,6 +21,7 @@ import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.core.DecimalDigitFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
@@ -29,6 +30,7 @@ import org.apache.lucene.analysis.util.C
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.util.Version;
/**
* {@link Analyzer} for Arabic.
@@ -124,7 +126,7 @@ public final class ArabicAnalyzer extend
*
* @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
- * {@link LowerCaseFilter}, {@link StopFilter},
+ * {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link StopFilter},
* {@link ArabicNormalizationFilter}, {@link SetKeywordMarkerFilter}
* if a stem exclusion set is provided and {@link ArabicStemFilter}.
*/
@@ -132,6 +134,9 @@ public final class ArabicAnalyzer extend
protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer source = new StandardTokenizer();
TokenStream result = new LowerCaseFilter(source);
+ if (getVersion().onOrAfter(Version.LUCENE_5_4_0)) {
+ result = new DecimalDigitFilter(result);
+ }
// the order here is important: the stopword list is not normalized!
result = new StopFilter(result, stopwords);
// TODO maybe we should make ArabicNormalization filter also KeywordAttribute aware?!
Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java?rev=1695898&r1=1695897&r2=1695898&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java Fri Aug 14 13:12:56 2015
@@ -22,6 +22,7 @@ import java.io.Reader;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.core.DecimalDigitFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
@@ -33,6 +34,7 @@ import org.apache.lucene.analysis.util.C
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.Version;
/**
* {@link Analyzer} for Sorani Kurdish.
@@ -108,7 +110,7 @@ public final class SoraniAnalyzer extend
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link SoraniNormalizationFilter},
- * {@link LowerCaseFilter}, {@link StopFilter}
+ * {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link StopFilter}
* , {@link SetKeywordMarkerFilter} if a stem exclusion set is
* provided and {@link SoraniStemFilter}.
*/
@@ -118,6 +120,9 @@ public final class SoraniAnalyzer extend
TokenStream result = new StandardFilter(source);
result = new SoraniNormalizationFilter(result);
result = new LowerCaseFilter(result);
+ if (getVersion().onOrAfter(Version.LUCENE_5_4_0)) {
+ result = new DecimalDigitFilter(result);
+ }
result = new StopFilter(result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
Added: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java?rev=1695898&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java Fri Aug 14 13:12:56 2015
@@ -0,0 +1,66 @@
+package org.apache.lucene.analysis.core;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.util.StemmerUtil;
+
+/**
+ * Folds all Unicode digits in {@code [:General_Category=Decimal_Number:]}
+ * to Basic Latin digits ({@code 0-9}), rewriting each term's buffer in place.
+ */
+public final class DecimalDigitFilter extends TokenFilter {
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+  /**
+   * Creates a new DecimalDigitFilter over {@code input}
+   */
+  public DecimalDigitFilter(TokenStream input) {
+    super(input);
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      char[] buffer = termAtt.buffer();
+      int length = termAtt.length();
+
+      for (int i = 0; i < length; i++) {
+        int ch = Character.codePointAt(buffer, i, length);
+        // look for digits outside of basic latin (ASCII digits need no folding)
+        if (ch > 0x7F && Character.isDigit(ch)) {
+          // replace with equivalent basic latin digit; for a surrogate pair this overwrites the high surrogate
+          buffer[i] = (char) ('0' + Character.getNumericValue(ch));
+          // if the original was supplementary, shrink the string by dropping the leftover low surrogate
+          if (ch > 0xFFFF) {
+            length = StemmerUtil.delete(buffer, i + 1, length); // i must not advance here: the loop's i++ then lands on the char shifted into i + 1
+            termAtt.setLength(length);
+          }
+        }
+      }
+
+      return true;
+    } else {
+      return false;
+    }
+  }
+}
Added: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilterFactory.java?rev=1695898&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilterFactory.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilterFactory.java Fri Aug 14 13:12:56 2015
@@ -0,0 +1,56 @@
+package org.apache.lucene.analysis.core;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
+import org.apache.lucene.analysis.util.MultiTermAwareComponent;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Factory for {@link DecimalDigitFilter}.
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_lwrcase" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.DecimalDigitFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ */
+public class DecimalDigitFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
+
+ /** Creates a new DecimalDigitFilterFactory; throws IllegalArgumentException if {@code args} is still non-empty after the superclass constructor has run */
+ public DecimalDigitFilterFactory(Map<String,String> args) {
+ super(args);
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
+ }
+
+ @Override
+ public DecimalDigitFilter create(TokenStream input) {
+ return new DecimalDigitFilter(input);
+ }
+
+ @Override
+ public AbstractAnalysisFactory getMultiTermComponent() {
+ return this; // this same factory instance serves as the multi-term component
+ }
+}
Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java?rev=1695898&r1=1695897&r2=1695898&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java Fri Aug 14 13:12:56 2015
@@ -24,11 +24,13 @@ import org.apache.lucene.analysis.Analyz
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
+import org.apache.lucene.analysis.core.DecimalDigitFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+import org.apache.lucene.util.Version;
/**
* {@link Analyzer} for Persian.
@@ -107,13 +109,16 @@ public final class PersianAnalyzer exten
*
* @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from a {@link StandardTokenizer} filtered with
- * {@link LowerCaseFilter}, {@link ArabicNormalizationFilter},
+ * {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link ArabicNormalizationFilter},
* {@link PersianNormalizationFilter} and Persian Stop words
*/
@Override
protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer source = new StandardTokenizer();
TokenStream result = new LowerCaseFilter(source);
+ if (getVersion().onOrAfter(Version.LUCENE_5_4_0)) {
+ result = new DecimalDigitFilter(result);
+ }
result = new ArabicNormalizationFilter(result);
/* additional persian-specific normalization */
result = new PersianNormalizationFilter(result);
Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java?rev=1695898&r1=1695897&r2=1695898&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java Fri Aug 14 13:12:56 2015
@@ -26,9 +26,11 @@ import org.apache.lucene.analysis.util.C
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.DecimalDigitFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
+import org.apache.lucene.util.Version;
/**
* Analyzer for Hindi.
@@ -106,7 +108,7 @@ public final class HindiAnalyzer extends
*
* @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from a {@link StandardTokenizer} filtered with
- * {@link LowerCaseFilter}, {@link IndicNormalizationFilter},
+ * {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link IndicNormalizationFilter},
* {@link HindiNormalizationFilter}, {@link SetKeywordMarkerFilter}
* if a stem exclusion set is provided, {@link HindiStemFilter}, and
* Hindi Stop words
@@ -115,6 +117,9 @@ public final class HindiAnalyzer extends
protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer source = new StandardTokenizer();
TokenStream result = new LowerCaseFilter(source);
+ if (getVersion().onOrAfter(Version.LUCENE_5_4_0)) {
+ result = new DecimalDigitFilter(result);
+ }
if (!stemExclusionSet.isEmpty())
result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new IndicNormalizationFilter(result);
Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java?rev=1695898&r1=1695897&r2=1695898&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java Fri Aug 14 13:12:56 2015
@@ -22,10 +22,12 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.DecimalDigitFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+import org.apache.lucene.util.Version;
/**
* {@link Analyzer} for Thai language. It uses {@link java.text.BreakIterator} to break words.
@@ -90,12 +92,15 @@ public final class ThaiAnalyzer extends
*
* @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from a {@link ThaiTokenizer} filtered with
- * {@link LowerCaseFilter} and {@link StopFilter}
+ * {@link LowerCaseFilter}, {@link DecimalDigitFilter} and {@link StopFilter}
*/
@Override
protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer source = new ThaiTokenizer();
TokenStream result = new LowerCaseFilter(source);
+ if (getVersion().onOrAfter(Version.LUCENE_5_4_0)) { // digit folding is only applied from version 5.4.0 on
+ result = new DecimalDigitFilter(result); // sits after lowercasing and before stop-word removal
+ }
result = new StopFilter(result, stopwords);
return new TokenStreamComponents(source, result);
}
Modified: lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory?rev=1695898&r1=1695897&r2=1695898&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory Fri Aug 14 13:12:56 2015
@@ -26,6 +26,7 @@ org.apache.lucene.analysis.commongrams.C
org.apache.lucene.analysis.commongrams.CommonGramsQueryFilterFactory
org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilterFactory
org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilterFactory
+org.apache.lucene.analysis.core.DecimalDigitFilterFactory
org.apache.lucene.analysis.core.LowerCaseFilterFactory
org.apache.lucene.analysis.core.StopFilterFactory
org.apache.lucene.analysis.core.TypeTokenFilterFactory
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java?rev=1695898&r1=1695897&r2=1695898&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java Fri Aug 14 13:12:56 2015
@@ -21,6 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.Version;
/**
* Test the Arabic Analyzer
@@ -100,6 +101,27 @@ public class TestArabicAnalyzer extends
a.close();
}
+ /**
+ * test we fold digits to basic latin (0-9)
+ */
+ public void testDigits() throws Exception {
+ ArabicAnalyzer a = new ArabicAnalyzer();
+ checkOneTerm(a, "١٢٣٤", "1234");
+ a.close();
+ }
+
+ /**
+ * test that digits pass through unfolded when the version is set to LUCENE_5_3_0 (back compat behavior)
+ * @deprecated remove this test in lucene 7
+ */
+ @Deprecated
+ public void testDigitsBackCompat() throws Exception {
+ ArabicAnalyzer a = new ArabicAnalyzer();
+ a.setVersion(Version.LUCENE_5_3_0);
+ checkOneTerm(a, "١٢٣٤", "١٢٣٤");
+ a.close();
+ }
+
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
ArabicAnalyzer a = new ArabicAnalyzer();
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniAnalyzer.java?rev=1695898&r1=1695897&r2=1695898&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniAnalyzer.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniAnalyzer.java Fri Aug 14 13:12:56 2015
@@ -22,6 +22,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.Version;
/**
* Test the Sorani analyzer
@@ -63,6 +64,28 @@ public class TestSoraniAnalyzer extends
a.close();
}
+ /**
+ * test we fold digits to basic latin (0-9)
+ * (these are somewhat rare, but generally a few % of digits still)
+ */
+ public void testDigits() throws Exception {
+ SoraniAnalyzer a = new SoraniAnalyzer();
+ checkOneTerm(a, "١٢٣٤", "1234");
+ a.close();
+ }
+
+ /**
+ * test that digits pass through unfolded when the version is set to LUCENE_5_3_0 (back compat behavior)
+ * @deprecated remove this test in lucene 7
+ */
+ @Deprecated
+ public void testDigitsBackCompat() throws Exception {
+ SoraniAnalyzer a = new SoraniAnalyzer();
+ a.setVersion(Version.LUCENE_5_3_0);
+ checkOneTerm(a, "١٢٣٤", "١٢٣٤");
+ a.close();
+ }
+
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new SoraniAnalyzer();
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniStemFilter.java?rev=1695898&r1=1695897&r2=1695898&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniStemFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ckb/TestSoraniStemFilter.java Fri Aug 14 13:12:56 2015
@@ -23,6 +23,8 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
@@ -107,6 +109,18 @@ public class TestSoraniStemFilter extend
/** test stemming against a basic vocabulary file (testdata.txt inside ckbtestdata.zip) */
public void testVocabulary() throws Exception {
// top 8k words or so: freq > 1000
+
+ // just normalization+stem (built locally, not the full SoraniAnalyzer), we are testing that the stemming doesn't break.
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
+ TokenStream stream = new SoraniNormalizationFilter(tokenizer);
+ stream = new SoraniStemFilter(stream);
+ return new TokenStreamComponents(tokenizer, stream);
+ }
+ };
assertVocabulary(a, getDataPath("ckbtestdata.zip"), "testdata.txt");
+ a.close();
}
}
Added: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java?rev=1695898&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java Fri Aug 14 13:12:56 2015
@@ -0,0 +1,106 @@
+package org.apache.lucene.analysis.core;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.util.TestUtil;
+
+/**
+ * Tests for {@link DecimalDigitFilter}
+ */
+public class TestDecimalDigitFilter extends BaseTokenStreamTestCase {
+ private Analyzer tokenized; // whitespace MockTokenizer followed by DecimalDigitFilter
+ private Analyzer keyword; // KeywordTokenizer followed by DecimalDigitFilter
+
+ @Override
+ public void setUp() throws Exception {
+ super.setUp();
+ tokenized = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, new DecimalDigitFilter(tokenizer));
+ }
+ };
+ keyword = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new KeywordTokenizer();
+ return new TokenStreamComponents(tokenizer, new DecimalDigitFilter(tokenizer));
+ }
+ };
+ }
+
+ @Override
+ public void tearDown() throws Exception {
+ tokenized.close();
+ keyword.close();
+ super.tearDown();
+ }
+
+ /**
+ * test that a term made only of non-latin digits is normalized to 1234
+ */
+ public void testSimple() throws Exception {
+ checkOneTerm(tokenized, "١٢٣٤", "1234");
+ }
+
+ /**
+ * test every digit code point, one at a time, between a random prefix and suffix (adjacent digits are not covered here).
+ */
+ public void testRandom() throws Exception {
+ for (int codepoint = Character.MIN_CODE_POINT; codepoint < Character.MAX_CODE_POINT; codepoint++) {
+ if (Character.isDigit(codepoint)) {
+ // add some a-z before/after the string
+ String prefix = TestUtil.randomSimpleString(random());
+ String suffix = TestUtil.randomSimpleString(random());
+
+ StringBuilder expected = new StringBuilder();
+ expected.append(prefix);
+ int value = Character.getNumericValue(codepoint);
+ assert value >= 0 && value <= 9;
+ expected.append(Integer.toString(value));
+ expected.append(suffix);
+
+ StringBuilder actual = new StringBuilder(); // despite the name, this is the input text handed to the analyzer
+ actual.append(prefix);
+ actual.appendCodePoint(codepoint);
+ actual.append(suffix);
+
+ checkOneTerm(keyword, actual.toString(), expected.toString());
+ }
+ }
+ }
+
+ /**
+ * check the filter is a no-op for the empty string term
+ */
+ public void testEmptyTerm() throws Exception {
+ checkOneTerm(keyword, "", "");
+ }
+
+ /**
+ * blast some random strings through the filter
+ */
+ public void testRandomStrings() throws Exception {
+ checkRandomData(random(), tokenized, 1000*RANDOM_MULTIPLIER);
+ }
+}
Added: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilterFactory.java?rev=1695898&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilterFactory.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilterFactory.java Fri Aug 14 13:12:56 2015
@@ -0,0 +1,50 @@
+package org.apache.lucene.analysis.core;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
+
+/**
+ * Simple tests to ensure the digit normalization factory is working.
+ */
+public class TestDecimalDigitFilterFactory extends BaseTokenStreamFactoryTestCase {
+
+ /**
+ * Ensure the filter obtained through the "DecimalDigit" factory name actually normalizes digits.
+ */
+ public void testNormalization() throws Exception {
+ Reader reader = new StringReader("١٢٣٤");
+ TokenStream stream = whitespaceMockTokenizer(reader);
+ stream = tokenFilterFactory("DecimalDigit").create(stream);
+ assertTokenStreamContents(stream, new String[] { "1234" });
+ }
+
+ /** Test that bogus arguments result in an IllegalArgumentException mentioning "Unknown parameters" */
+ public void testBogusArguments() throws Exception {
+ try {
+ tokenFilterFactory("DecimalDigit", "bogusArg", "bogusValue");
+ fail(); // reaching this line means no exception was thrown
+ } catch (IllegalArgumentException expected) {
+ assertTrue(expected.getMessage().contains("Unknown parameters"));
+ }
+ }
+}
\ No newline at end of file
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java?rev=1695898&r1=1695897&r2=1695898&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java Fri Aug 14 13:12:56 2015
@@ -20,6 +20,7 @@ package org.apache.lucene.analysis.fa;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.Version;
/**
* Test the Persian Analyzer
@@ -228,6 +229,27 @@ public class TestPersianAnalyzer extends
a.close();
}
+ /**
+ * test we fold digits to latin-1
+ */
+ public void testDigits() throws Exception {
+ PersianAnalyzer a = new PersianAnalyzer();
+ checkOneTerm(a, "۱۲۳۴", "1234");
+ a.close();
+ }
+
+ /**
+ * test that we don't fold digits for back compat behavior
+ * @deprecated remove this test in lucene 7
+ */
+ @Deprecated
+ public void testDigitsBackCompat() throws Exception {
+ PersianAnalyzer a = new PersianAnalyzer();
+ a.setVersion(Version.LUCENE_5_3_0);
+ checkOneTerm(a, "۱۲۳۴", "۱۲۳۴");
+ a.close();
+ }
+
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
PersianAnalyzer a = new PersianAnalyzer();
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java?rev=1695898&r1=1695897&r2=1695898&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiAnalyzer.java Fri Aug 14 13:12:56 2015
@@ -3,6 +3,7 @@ package org.apache.lucene.analysis.hi;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.Version;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -47,6 +48,27 @@ public class TestHindiAnalyzer extends B
a.close();
}
+ /**
+ * test we fold digits to latin-1
+ */
+ public void testDigits() throws Exception {
+ HindiAnalyzer a = new HindiAnalyzer();
+ checkOneTerm(a, "१२३४", "1234");
+ a.close();
+ }
+
+ /**
+ * test that we don't fold digits for back compat behavior
+ * @deprecated remove this test in lucene 7
+ */
+ @Deprecated
+ public void testDigitsBackCompat() throws Exception {
+ HindiAnalyzer a = new HindiAnalyzer();
+ a.setVersion(Version.LUCENE_5_3_0);
+ checkOneTerm(a, "१२३४", "१२३४");
+ a.close();
+ }
+
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer analyzer = new HindiAnalyzer();
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java?rev=1695898&r1=1695897&r2=1695898&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiAnalyzer.java Fri Aug 14 13:12:56 2015
@@ -23,6 +23,7 @@ import org.apache.lucene.analysis.TokenS
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.Version;
/**
* Test case for ThaiAnalyzer, modified from TestFrenchAnalyzer
@@ -122,6 +123,27 @@ public class TestThaiAnalyzer extends Ba
analyzer.close();
}
+ /**
+ * test we fold digits to latin-1
+ */
+ public void testDigits() throws Exception {
+ ThaiAnalyzer a = new ThaiAnalyzer();
+ checkOneTerm(a, "๑๒๓๔", "1234");
+ a.close();
+ }
+
+ /**
+ * test that we don't fold digits for back compat behavior
+ * @deprecated remove this test in lucene 7
+ */
+ @Deprecated
+ public void testDigitsBackCompat() throws Exception {
+ ThaiAnalyzer a = new ThaiAnalyzer();
+ a.setVersion(Version.LUCENE_5_3_0);
+ checkOneTerm(a, "๑๒๓๔", "๑๒๓๔");
+ a.close();
+ }
+
public void testTwoSentences() throws Exception {
Analyzer analyzer = new ThaiAnalyzer(CharArraySet.EMPTY_SET);
assertAnalyzesTo(analyzer, "This is a test. การที่ได้ต้องแสดงว่างานดี",