You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by kr...@apache.org on 2016/11/18 16:42:40 UTC
[15/20] lucene-solr:jira/solr-8593: LUCENE-7536:
ASCIIFoldingFilterFactory.getMultiTermComponent can emit two tokens.
LUCENE-7536: ASCIIFoldingFilterFactory.getMultiTermComponent can emit two tokens.
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/6d540b9d
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/6d540b9d
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/6d540b9d
Branch: refs/heads/jira/solr-8593
Commit: 6d540b9d7a1a9b944bacb348c7ea681705e462e4
Parents: 8938c9f
Author: Adrien Grand <jp...@gmail.com>
Authored: Fri Nov 18 10:06:16 2016 +0100
Committer: Adrien Grand <jp...@gmail.com>
Committed: Fri Nov 18 10:07:09 2016 +0100
----------------------------------------------------------------------
lucene/CHANGES.txt | 3 ++
.../ASCIIFoldingFilterFactory.java | 17 +++++-
.../TestAsciiFoldingFilterFactory.java | 54 ++++++++++++++++++++
3 files changed, 72 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6d540b9d/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 051c326..dfbf318 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -76,6 +76,9 @@ Bug Fixes
* LUCENE-7533: Classic query parser: disallow autoGeneratePhraseQueries=true
when splitOnWhitespace=false (and vice-versa). (Steve Rowe)
+* LUCENE-7536: ASCIIFoldingFilterFactory used to return an illegal multi-term
+ component when preserveOriginal was set to true. (Adrien Grand)
+
Improvements
* LUCENE-6824: TermAutomatonQuery now rewrites to TermQuery,
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6d540b9d/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilterFactory.java
index 60dddff..4e64abe 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilterFactory.java
@@ -17,6 +17,7 @@
package org.apache.lucene.analysis.miscellaneous;
+import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
@@ -36,12 +37,14 @@ import org.apache.lucene.analysis.TokenStream;
* </fieldType></pre>
*/
public class ASCIIFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
+ private static final String PRESERVE_ORIGINAL = "preserveOriginal";
+
private final boolean preserveOriginal;
/** Creates a new ASCIIFoldingFilterFactory */
public ASCIIFoldingFilterFactory(Map<String,String> args) {
super(args);
- preserveOriginal = getBoolean(args, "preserveOriginal", false);
+ preserveOriginal = getBoolean(args, PRESERVE_ORIGINAL, false);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@@ -54,7 +57,17 @@ public class ASCIIFoldingFilterFactory extends TokenFilterFactory implements Mul
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
- return this;
+ if (preserveOriginal) {
+ // The main use-case for using preserveOriginal is to match regardless of
+ // diacritics but to give better scores to exact matches. Since most multi-term
+ // queries return constant scores anyway, the multi-term component only
+ // emits the folded token
+ Map<String, String> args = new HashMap<>(getOriginalArgs());
+ args.remove(PRESERVE_ORIGINAL);
+ return new ASCIIFoldingFilterFactory(args);
+ } else {
+ return this;
+ }
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6d540b9d/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestAsciiFoldingFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestAsciiFoldingFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestAsciiFoldingFilterFactory.java
new file mode 100644
index 0000000..87d8760
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestAsciiFoldingFilterFactory.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.HashMap;
+
+import org.apache.lucene.analysis.CannedTokenStream;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
+import org.apache.lucene.analysis.util.MultiTermAwareComponent;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+public class TestAsciiFoldingFilterFactory extends BaseTokenStreamFactoryTestCase {
+
+ public void testMultiTermAnalysis() throws IOException {
+ TokenFilterFactory factory = new ASCIIFoldingFilterFactory(Collections.emptyMap());
+ TokenStream stream = new CannedTokenStream(new Token("Été", 0, 3));
+ stream = factory.create(stream);
+ assertTokenStreamContents(stream, new String[] { "Ete" });
+
+ factory = (TokenFilterFactory) ((MultiTermAwareComponent) factory).getMultiTermComponent();
+ stream = new CannedTokenStream(new Token("Été", 0, 3));
+ stream = factory.create(stream);
+ assertTokenStreamContents(stream, new String[] { "Ete" });
+
+ factory = new ASCIIFoldingFilterFactory(new HashMap<>(Collections.singletonMap("preserveOriginal", "true")));
+ stream = new CannedTokenStream(new Token("Été", 0, 3));
+ stream = factory.create(stream);
+ assertTokenStreamContents(stream, new String[] { "Ete", "Été" });
+
+ factory = (TokenFilterFactory) ((MultiTermAwareComponent) factory).getMultiTermComponent();
+ stream = new CannedTokenStream(new Token("Été", 0, 3));
+ stream = factory.create(stream);
+ assertTokenStreamContents(stream, new String[] { "Ete" });
+ }
+
+}