You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2018/05/28 14:52:10 UTC
[2/5] lucene-solr:master: LUCENE-8186: LowerCaseTokenizerFactory now lowercases text in multi-term queries.
LUCENE-8186: LowerCaseTokenizerFactory now lowercases text in multi-term queries.
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/1971ef31
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/1971ef31
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/1971ef31
Branch: refs/heads/master
Commit: 1971ef310906239d88602444ae6b74081648f3e4
Parents: 78ca82e
Author: Adrien Grand <jp...@gmail.com>
Authored: Mon May 28 16:20:49 2018 +0200
Committer: Adrien Grand <jp...@gmail.com>
Committed: Mon May 28 16:20:49 2018 +0200
----------------------------------------------------------------------
lucene/CHANGES.txt | 3 +++
.../apache/lucene/analysis/custom/CustomAnalyzer.java | 6 ++++++
.../lucene/analysis/custom/TestCustomAnalyzer.java | 12 ++++++++++--
3 files changed, 19 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1971ef31/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index b643af1..d3f89ba 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -248,6 +248,9 @@ Bug Fixes
* LUCENE-8325: Fixed the smartcn tokenizer to not split UTF-16 surrogate pairs.
(chengpohi via Jim Ferenczi)
+* LUCENE-8186: LowerCaseTokenizerFactory now lowercases text in multi-term
+ queries. (Tim Allison via Adrien Grand)
+
Other
* LUCENE-8301: Update randomizedtesting to 2.6.0. (Dawid Weiss)
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1971ef31/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java
index 19e207f..f60c6a2 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java
@@ -162,6 +162,12 @@ public final class CustomAnalyzer extends Analyzer {
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = in;
+ // tokenizers can return a tokenfilter if the tokenizer does normalization,
+ // although this is really bogus/abstraction violation...
+ if (tokenizer instanceof MultiTermAwareComponent) {
+ TokenFilterFactory filter = (TokenFilterFactory) ((MultiTermAwareComponent) tokenizer).getMultiTermComponent();
+ result = filter.create(result);
+ }
for (TokenFilterFactory filter : tokenFilters) {
if (filter instanceof MultiTermAwareComponent) {
filter = (TokenFilterFactory) ((MultiTermAwareComponent) filter).getMultiTermComponent();
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1971ef31/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java
index 01ad75c..1fa59d1 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java
@@ -31,9 +31,9 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory;
import org.apache.lucene.analysis.charfilter.MappingCharFilterFactory;
-import org.apache.lucene.analysis.core.KeywordTokenizerFactory;
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
+import org.apache.lucene.analysis.core.LowerCaseTokenizerFactory;
import org.apache.lucene.analysis.core.StopFilterFactory;
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory;
@@ -432,7 +432,7 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
- return new KeywordTokenizerFactory(getOriginalArgs());
+ return new DummyTokenFilterFactory(Collections.emptyMap());
}
}
@@ -500,6 +500,14 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
.build();
assertEquals(new BytesRef("e f c"), analyzer.normalize("dummy", "a b c"));
}
+
+ /** test normalize where the TokenizerFactory returns a filter to normalize the text */
+ public void testNormalizationWithLowerCaseTokenizer() throws IOException {
+ CustomAnalyzer analyzer1 = CustomAnalyzer.builder()
+ .withTokenizer(LowerCaseTokenizerFactory.class, Collections.emptyMap())
+ .build();
+ assertEquals(new BytesRef("abc"), analyzer1.normalize("dummy", "ABC"));
+ }
public void testConditions() throws IOException {
CustomAnalyzer analyzer = CustomAnalyzer.builder()