You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by kr...@apache.org on 2016/11/18 16:42:40 UTC

[15/20] lucene-solr:jira/solr-8593: LUCENE-7536: ASCIIFoldingFilterFactory.getMultiTermComponent can emit two tokens.

LUCENE-7536: ASCIIFoldingFilterFactory.getMultiTermComponent can emit two tokens.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/6d540b9d
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/6d540b9d
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/6d540b9d

Branch: refs/heads/jira/solr-8593
Commit: 6d540b9d7a1a9b944bacb348c7ea681705e462e4
Parents: 8938c9f
Author: Adrien Grand <jp...@gmail.com>
Authored: Fri Nov 18 10:06:16 2016 +0100
Committer: Adrien Grand <jp...@gmail.com>
Committed: Fri Nov 18 10:07:09 2016 +0100

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |  3 ++
 .../ASCIIFoldingFilterFactory.java              | 17 +++++-
 .../TestAsciiFoldingFilterFactory.java          | 54 ++++++++++++++++++++
 3 files changed, 72 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6d540b9d/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 051c326..dfbf318 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -76,6 +76,9 @@ Bug Fixes
 * LUCENE-7533: Classic query parser: disallow autoGeneratePhraseQueries=true
   when splitOnWhitespace=false (and vice-versa). (Steve Rowe)
 
+* LUCENE-7536: ASCIIFoldingFilterFactory used to return an illegal multi-term
+  component when preserveOriginal was set to true. (Adrien Grand)
+
 Improvements
 
 * LUCENE-6824: TermAutomatonQuery now rewrites to TermQuery,

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6d540b9d/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilterFactory.java
index 60dddff..4e64abe 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilterFactory.java
@@ -17,6 +17,7 @@
 package org.apache.lucene.analysis.miscellaneous;
 
 
+import java.util.HashMap;
 import java.util.Map;
 
 import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
@@ -36,12 +37,14 @@ import org.apache.lucene.analysis.TokenStream;
  * &lt;/fieldType&gt;</pre>
  */
 public class ASCIIFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
+  private static final String PRESERVE_ORIGINAL = "preserveOriginal";
+
   private final boolean preserveOriginal;
   
   /** Creates a new ASCIIFoldingFilterFactory */
   public ASCIIFoldingFilterFactory(Map<String,String> args) {
     super(args);
-    preserveOriginal = getBoolean(args, "preserveOriginal", false);
+    preserveOriginal = getBoolean(args, PRESERVE_ORIGINAL, false);
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
@@ -54,7 +57,17 @@ public class ASCIIFoldingFilterFactory extends TokenFilterFactory implements Mul
 
   @Override
   public AbstractAnalysisFactory getMultiTermComponent() {
-    return this;
+    if (preserveOriginal) {
+      // The main use-case for using preserveOriginal is to match regardless of
+      // accents but to give better scores to exact matches. Since most multi-term
+      // queries return constant scores anyway, the multi-term component only
+      // emits the folded token
+      Map<String, String> args = new HashMap<>(getOriginalArgs());
+      args.remove(PRESERVE_ORIGINAL);
+      return new ASCIIFoldingFilterFactory(args);
+    } else {
+      return this;
+    }
   }
 }
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6d540b9d/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestAsciiFoldingFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestAsciiFoldingFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestAsciiFoldingFilterFactory.java
new file mode 100644
index 0000000..87d8760
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestAsciiFoldingFilterFactory.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.HashMap;
+
+import org.apache.lucene.analysis.CannedTokenStream;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
+import org.apache.lucene.analysis.util.MultiTermAwareComponent;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+public class TestAsciiFoldingFilterFactory extends BaseTokenStreamFactoryTestCase {
+
+  public void testMultiTermAnalysis() throws IOException {
+    TokenFilterFactory factory = new ASCIIFoldingFilterFactory(Collections.emptyMap());
+    TokenStream stream = new CannedTokenStream(new Token("Été", 0, 3));
+    stream = factory.create(stream);
+    assertTokenStreamContents(stream, new String[] { "Ete" });
+
+    factory = (TokenFilterFactory) ((MultiTermAwareComponent) factory).getMultiTermComponent();
+    stream = new CannedTokenStream(new Token("Été", 0, 3));
+    stream = factory.create(stream);
+    assertTokenStreamContents(stream, new String[] { "Ete" });
+
+    factory = new ASCIIFoldingFilterFactory(new HashMap<>(Collections.singletonMap("preserveOriginal", "true")));
+    stream = new CannedTokenStream(new Token("Été", 0, 3));
+    stream = factory.create(stream);
+    assertTokenStreamContents(stream, new String[] { "Ete", "Été" });
+
+    factory = (TokenFilterFactory) ((MultiTermAwareComponent) factory).getMultiTermComponent();
+    stream = new CannedTokenStream(new Token("Été", 0, 3));
+    stream = factory.create(stream);
+    assertTokenStreamContents(stream, new String[] { "Ete" });
+  }
+
+}