You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2018/01/16 20:48:56 UTC
lucene-solr:branch_7x: LUCENE-8129: allow passing filtered unicode sets to ICUFoldingFilter
Repository: lucene-solr
Updated Branches:
refs/heads/branch_7x 1c6cc20eb -> d3d1b6b96
LUCENE-8129: allow passing filtered unicode sets to ICUFoldingFilter
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/d3d1b6b9
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/d3d1b6b9
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/d3d1b6b9
Branch: refs/heads/branch_7x
Commit: d3d1b6b96ef24885a8b25e5d08bd47666597dea4
Parents: 1c6cc20
Author: Rob Muir <ro...@ntrepidcorp.com>
Authored: Tue Jan 16 12:41:31 2018 -0800
Committer: Rob Muir <ro...@ntrepidcorp.com>
Committed: Tue Jan 16 12:48:08 2018 -0800
----------------------------------------------------------------------
lucene/CHANGES.txt | 3 +++
.../lucene/analysis/icu/ICUFoldingFilter.java | 28 +++++++++++++++-----
.../analysis/icu/ICUFoldingFilterFactory.java | 20 ++++++++++++--
.../icu/TestICUFoldingFilterFactory.java | 21 +++++++++++++--
4 files changed, 62 insertions(+), 10 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d3d1b6b9/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 484a1ff..941a58d 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -47,6 +47,9 @@ Improvements
* LUCENE-8125: ICUTokenizer support for emoji/emoji sequence tokens. (Robert Muir)
+* LUCENE-8129: A Unicode set filter can now be specified when using ICUFoldingFilter.
+ (Ere Maijala)
+
Bug Fixes
* LUCENE-8077: Fixed bug in how CheckIndex verifies doc-value iterators.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d3d1b6b9/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilter.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilter.java
index 0895b47..9c3770c 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilter.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilter.java
@@ -59,18 +59,34 @@ import com.ibm.icu.text.Normalizer2;
* All foldings, case folding, and normalization mappings are applied recursively
* to ensure a fully folded and normalized result.
* </p>
+ * <p>
+ * A normalizer with additional settings such as a filter that lists characters not
+ * to be normalized can be passed in the constructor.
+ * </p>
*/
public final class ICUFoldingFilter extends ICUNormalizer2Filter {
- // TODO: if the wrong version of the ICU jar is used, loading these data files may give a strange error.
- // maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html
- private static final Normalizer2 normalizer = Normalizer2.getInstance(
- ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"),
- "utr30", Normalizer2.Mode.COMPOSE);
-
+ /**
+ * A normalizer for search term folding to Unicode text,
+ * applying foldings from UTR#30 Character Foldings.
+ */
+ public static final Normalizer2 NORMALIZER = Normalizer2.getInstance(
+ // TODO: if the wrong version of the ICU jar is used, loading these data files may give a strange error.
+ // maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html
+ ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"),
+ "utr30", Normalizer2.Mode.COMPOSE);
+
/**
* Create a new ICUFoldingFilter on the specified input
*/
public ICUFoldingFilter(TokenStream input) {
+ super(input, NORMALIZER);
+ }
+
+ /**
+ * Create a new ICUFoldingFilter on the specified input with the specified
+ * normalizer
+ */
+ public ICUFoldingFilter(TokenStream input, Normalizer2 normalizer) {
super(input, normalizer);
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d3d1b6b9/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilterFactory.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilterFactory.java
index 036874a..1065cbf 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilterFactory.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilterFactory.java
@@ -25,7 +25,11 @@ import org.apache.lucene.analysis.util.AbstractAnalysisFactory; // javadocs
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
-/**
+import com.ibm.icu.text.FilteredNormalizer2;
+import com.ibm.icu.text.Normalizer2;
+import com.ibm.icu.text.UnicodeSet;
+
+/**
* Factory for {@link ICUFoldingFilter}.
* <pre class="prettyprint">
* <fieldType name="text_folded" class="solr.TextField" positionIncrementGap="100">
@@ -37,18 +41,30 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
* @since 3.1.0
*/
public class ICUFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
+ private final Normalizer2 normalizer;
/** Creates a new ICUFoldingFilterFactory */
public ICUFoldingFilterFactory(Map<String,String> args) {
super(args);
+
+ Normalizer2 normalizer = ICUFoldingFilter.NORMALIZER;
+ String filter = get(args, "filter");
+ if (filter != null) {
+ UnicodeSet set = new UnicodeSet(filter);
+ if (!set.isEmpty()) {
+ set.freeze();
+ normalizer = new FilteredNormalizer2(normalizer, set);
+ }
+ }
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
+ this.normalizer = normalizer;
}
@Override
public TokenStream create(TokenStream input) {
- return new ICUFoldingFilter(input);
+ return new ICUFoldingFilter(input, normalizer);
}
@Override
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d3d1b6b9/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilterFactory.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilterFactory.java
index 3782216..3e3c523 100644
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilterFactory.java
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilterFactory.java
@@ -26,7 +26,7 @@ import org.apache.lucene.analysis.TokenStream;
/** basic tests for {@link ICUFoldingFilterFactory} */
public class TestICUFoldingFilterFactory extends BaseTokenStreamTestCase {
-
+
/** basic tests to ensure the folding is working */
public void test() throws Exception {
Reader reader = new StringReader("Résumé");
@@ -35,7 +35,24 @@ public class TestICUFoldingFilterFactory extends BaseTokenStreamTestCase {
stream = factory.create(stream);
assertTokenStreamContents(stream, new String[] { "resume" });
}
-
+
+ /** test to ensure the filter parameter is working */
+ public void testFilter() throws Exception {
+ HashMap<String,String> args = new HashMap<String,String>();
+ args.put("filter", "[^ö]");
+ ICUFoldingFilterFactory factory = new ICUFoldingFilterFactory(args);
+
+ Reader reader = new StringReader("Résumé");
+ TokenStream stream = whitespaceMockTokenizer(reader);
+ stream = factory.create(stream);
+ assertTokenStreamContents(stream, new String[] { "resume" });
+
+ reader = new StringReader("Fönster");
+ stream = whitespaceMockTokenizer(reader);
+ stream = factory.create(stream);
+ assertTokenStreamContents(stream, new String[] { "fönster" });
+ }
+
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {