You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by da...@apache.org on 2018/01/23 10:30:56 UTC

[26/41] lucene-solr:jira/solr-11702: LUCENE-8129: allow passing filtered unicode sets to ICUFoldingFilter

LUCENE-8129: allow passing filtered unicode sets to ICUFoldingFilter


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/6781a0d2
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/6781a0d2
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/6781a0d2

Branch: refs/heads/jira/solr-11702
Commit: 6781a0d2d3113e4f423bf717e9c8f781374265ca
Parents: a6b5c5b
Author: Rob Muir <ro...@ntrepidcorp.com>
Authored: Tue Jan 16 12:41:31 2018 -0800
Committer: Rob Muir <ro...@ntrepidcorp.com>
Committed: Tue Jan 16 12:45:17 2018 -0800

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |  3 +++
 .../lucene/analysis/icu/ICUFoldingFilter.java   | 28 +++++++++++++++-----
 .../analysis/icu/ICUFoldingFilterFactory.java   | 20 ++++++++++++--
 .../icu/TestICUFoldingFilterFactory.java        | 21 +++++++++++++--
 4 files changed, 62 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6781a0d2/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 435a461..038285e 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -125,6 +125,9 @@ Improvements
 
 * LUCENE-8125: ICUTokenizer support for emoji/emoji sequence tokens. (Robert Muir)
 
+* LUCENE-8129: A Unicode set filter can now be specified when using ICUFoldingFilter.
+  (Ere Maijala)
+
 Bug Fixes
 
 * LUCENE-8077: Fixed bug in how CheckIndex verifies doc-value iterators.

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6781a0d2/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilter.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilter.java
index 0895b47..9c3770c 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilter.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilter.java
@@ -59,18 +59,34 @@ import com.ibm.icu.text.Normalizer2;
  * All foldings, case folding, and normalization mappings are applied recursively
  * to ensure a fully folded and normalized result.
  * </p>
+ * <p>
+ * A custom normalizer, for example one wrapped in a filter that restricts which
+ * characters are folded, can be passed to the constructor.
+ * </p>
  */
 public final class ICUFoldingFilter extends ICUNormalizer2Filter {
-  // TODO: if the wrong version of the ICU jar is used, loading these data files may give a strange error.
-  // maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html
-  private static final Normalizer2 normalizer =  Normalizer2.getInstance(
-      ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"), 
-      "utr30", Normalizer2.Mode.COMPOSE);
-  
+  /**
+   * A normalizer that applies search term folding to Unicode text,
+   * using the foldings from UTR#30 Character Foldings.
+   */
+  public static final Normalizer2 NORMALIZER = Normalizer2.getInstance(
+    // TODO: if the wrong version of the ICU jar is used, loading these data files may give a strange error.
+    // maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html
+    ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"),
+    "utr30", Normalizer2.Mode.COMPOSE);
+
   /**
    * Create a new ICUFoldingFilter on the specified input
    */
   public ICUFoldingFilter(TokenStream input) {
+    super(input, NORMALIZER);
+  }
+
+  /**
+   * Create a new ICUFoldingFilter on the specified input with the specified
+   * normalizer
+   */
+  public ICUFoldingFilter(TokenStream input, Normalizer2 normalizer) {
     super(input, normalizer);
   }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6781a0d2/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilterFactory.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilterFactory.java
index 036874a..1065cbf 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilterFactory.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilterFactory.java
@@ -25,7 +25,11 @@ import org.apache.lucene.analysis.util.AbstractAnalysisFactory; // javadocs
 import org.apache.lucene.analysis.util.MultiTermAwareComponent;
 import org.apache.lucene.analysis.util.TokenFilterFactory;
 
-/** 
+import com.ibm.icu.text.FilteredNormalizer2;
+import com.ibm.icu.text.Normalizer2;
+import com.ibm.icu.text.UnicodeSet;
+
+/**
  * Factory for {@link ICUFoldingFilter}.
  * <pre class="prettyprint">
  * &lt;fieldType name="text_folded" class="solr.TextField" positionIncrementGap="100"&gt;
@@ -37,18 +41,30 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
  * @since 3.1.0
  */
 public class ICUFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
+  private final Normalizer2 normalizer;
 
   /** Creates a new ICUFoldingFilterFactory */
   public ICUFoldingFilterFactory(Map<String,String> args) {
     super(args);
+
+    Normalizer2 normalizer = ICUFoldingFilter.NORMALIZER;
+    String filter = get(args, "filter");
+    if (filter != null) {
+      UnicodeSet set = new UnicodeSet(filter);
+      if (!set.isEmpty()) {
+        set.freeze();
+        normalizer = new FilteredNormalizer2(normalizer, set);
+      }
+    }
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
+    this.normalizer = normalizer;
   }
 
   @Override
   public TokenStream create(TokenStream input) {
-    return new ICUFoldingFilter(input);
+    return new ICUFoldingFilter(input, normalizer);
   }
 
   @Override

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6781a0d2/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilterFactory.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilterFactory.java
index 3782216..3e3c523 100644
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilterFactory.java
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilterFactory.java
@@ -26,7 +26,7 @@ import org.apache.lucene.analysis.TokenStream;
 
 /** basic tests for {@link ICUFoldingFilterFactory} */
 public class TestICUFoldingFilterFactory extends BaseTokenStreamTestCase {
-  
+
   /** basic tests to ensure the folding is working */
   public void test() throws Exception {
     Reader reader = new StringReader("Résumé");
@@ -35,7 +35,24 @@ public class TestICUFoldingFilterFactory extends BaseTokenStreamTestCase {
     stream = factory.create(stream);
     assertTokenStreamContents(stream, new String[] { "resume" });
   }
-  
+
+  /** test to ensure the filter parameter is working */
+  public void testFilter() throws Exception {
+    HashMap<String,String> args = new HashMap<String,String>();
+    args.put("filter", "[^ö]");
+    ICUFoldingFilterFactory factory = new ICUFoldingFilterFactory(args);
+
+    Reader reader = new StringReader("Résumé");
+    TokenStream stream = whitespaceMockTokenizer(reader);
+    stream = factory.create(stream);
+    assertTokenStreamContents(stream, new String[] { "resume" });
+
+    reader = new StringReader("Fönster");
+    stream = whitespaceMockTokenizer(reader);
+    stream = factory.create(stream);
+    assertTokenStreamContents(stream, new String[] { "fönster" });
+  }
+
   /** Test that bogus arguments result in exception */
   public void testBogusArguments() throws Exception {
     IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {