You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2016/07/12 16:03:18 UTC
[3/3] lucene-solr:branch_6x: LUCENE-7355: Add Analyzer#normalize()
and use it in query parsers.
LUCENE-7355: Add Analyzer#normalize() and use it in query parsers.
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/7c2e7a0f
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/7c2e7a0f
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/7c2e7a0f
Branch: refs/heads/branch_6x
Commit: 7c2e7a0fb80a5bf733cf710aee6cbf01d02629eb
Parents: ccd3bc8
Author: Adrien Grand <jp...@gmail.com>
Authored: Tue Jun 28 18:23:11 2016 +0200
Committer: Adrien Grand <jp...@gmail.com>
Committed: Tue Jul 12 18:01:44 2016 +0200
----------------------------------------------------------------------
lucene/CHANGES.txt | 7 +
.../lucene/analysis/ar/ArabicAnalyzer.java | 8 ++
.../lucene/analysis/bg/BulgarianAnalyzer.java | 7 +
.../lucene/analysis/br/BrazilianAnalyzer.java | 7 +
.../lucene/analysis/ca/CatalanAnalyzer.java | 8 ++
.../apache/lucene/analysis/cjk/CJKAnalyzer.java | 7 +
.../lucene/analysis/ckb/SoraniAnalyzer.java | 9 ++
.../lucene/analysis/core/SimpleAnalyzer.java | 6 +
.../lucene/analysis/core/StopAnalyzer.java | 6 +
.../lucene/analysis/custom/CustomAnalyzer.java | 28 +++-
.../lucene/analysis/cz/CzechAnalyzer.java | 7 +
.../lucene/analysis/da/DanishAnalyzer.java | 7 +
.../lucene/analysis/de/GermanAnalyzer.java | 8 ++
.../lucene/analysis/el/GreekAnalyzer.java | 7 +
.../lucene/analysis/en/EnglishAnalyzer.java | 7 +
.../lucene/analysis/es/SpanishAnalyzer.java | 7 +
.../lucene/analysis/eu/BasqueAnalyzer.java | 7 +
.../lucene/analysis/fa/PersianAnalyzer.java | 14 +-
.../lucene/analysis/fi/FinnishAnalyzer.java | 7 +
.../lucene/analysis/fr/FrenchAnalyzer.java | 8 ++
.../lucene/analysis/ga/IrishAnalyzer.java | 8 ++
.../lucene/analysis/gl/GalicianAnalyzer.java | 7 +
.../lucene/analysis/hi/HindiAnalyzer.java | 11 ++
.../lucene/analysis/hu/HungarianAnalyzer.java | 7 +
.../lucene/analysis/hy/ArmenianAnalyzer.java | 7 +
.../lucene/analysis/id/IndonesianAnalyzer.java | 7 +
.../lucene/analysis/it/ItalianAnalyzer.java | 8 ++
.../lucene/analysis/lt/LithuanianAnalyzer.java | 7 +
.../lucene/analysis/lv/LatvianAnalyzer.java | 7 +
.../lucene/analysis/nl/DutchAnalyzer.java | 7 +
.../lucene/analysis/no/NorwegianAnalyzer.java | 7 +
.../lucene/analysis/pt/PortugueseAnalyzer.java | 7 +
.../lucene/analysis/ro/RomanianAnalyzer.java | 7 +
.../lucene/analysis/ru/RussianAnalyzer.java | 7 +
.../analysis/standard/ClassicAnalyzer.java | 5 +
.../standard/UAX29URLEmailAnalyzer.java | 5 +
.../lucene/analysis/sv/SwedishAnalyzer.java | 7 +
.../apache/lucene/analysis/th/ThaiAnalyzer.java | 7 +
.../lucene/analysis/tr/TurkishAnalyzer.java | 7 +
.../lucene/collation/CollationKeyAnalyzer.java | 7 +
.../core/TestAllAnalyzersHaveFactories.java | 2 +
.../lucene/analysis/core/TestAnalyzers.java | 4 +
.../lucene/analysis/core/TestRandomChains.java | 10 +-
.../analysis/custom/TestCustomAnalyzer.java | 143 +++++++++++++++++++
.../lucene/analysis/ja/JapaneseAnalyzer.java | 7 +
.../analysis/morfologik/MorfologikAnalyzer.java | 6 +
.../analysis/cn/smart/SmartChineseAnalyzer.java | 6 +
.../lucene/analysis/pl/PolishAnalyzer.java | 7 +
.../org/apache/lucene/analysis/Analyzer.java | 135 ++++++++++++++++-
.../analysis/standard/StandardAnalyzer.java | 7 +
.../analysis/standard/TestStandardAnalyzer.java | 6 +
.../analyzing/AnalyzingQueryParser.java | 127 ++++++----------
.../queryparser/classic/QueryParserBase.java | 35 +----
.../queryparser/simple/SimpleQueryParser.java | 9 +-
.../analyzing/TestAnalyzingQueryParser.java | 45 +++---
.../queryparser/util/QueryParserTestBase.java | 4 +
.../analysis/BaseTokenStreamTestCase.java | 5 +-
.../apache/lucene/analysis/MockAnalyzer.java | 11 +-
.../lucene/analysis/MockBytesAnalyzer.java | 7 +
.../lucene/analysis/MockLowerCaseFilter.java | 40 ++++++
.../apache/solr/analysis/TokenizerChain.java | 28 +++-
61 files changed, 808 insertions(+), 150 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 573dd4a..83d1782 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -26,6 +26,9 @@ New Features
methods Directory.rename and Directory.syncMetaData instead (Robert Muir,
Uwe Schindler, Mike McCandless)
+* LUCENE-7355: Added Analyzer#normalize(), which only applies normalization to
+ an input string. (Adrien Grand)
+
Bug Fixes
* LUCENE-6662: Fixed potential resource leaks. (Rishabh Patel via Adrien Grand)
@@ -77,6 +80,10 @@ Improvements
* LUCENE-7276: MatchNoDocsQuery now includes an optional reason for
why it was used (Jim Ferenczi via Mike McCandless)
+* LUCENE-7355: AnalyzingQueryParser now only applies the subset of the analysis
+ chain that is about normalization for range/fuzzy/wildcard queries.
+ (Adrien Grand)
+
Optimizations
* LUCENE-7330, LUCENE-7339: Speed up conjunction queries. (Adrien Grand)
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
index 71da32d..c68399e 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
@@ -146,5 +146,13 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
}
return new TokenStreamComponents(source, new ArabicStemFilter(result));
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new LowerCaseFilter(in);
+ result = new DecimalDigitFilter(result);
+ result = new ArabicNormalizationFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
index 9cb0657..06c7eea 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
@@ -126,4 +126,11 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
result = new BulgarianStemFilter(result);
return new TokenStreamComponents(source, result);
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new StandardFilter(in);
+ result = new LowerCaseFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
index 5dd0cbc..ad1af92 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
@@ -127,5 +127,12 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
result = new SetKeywordMarkerFilter(result, excltable);
return new TokenStreamComponents(source, new BrazilianStemFilter(result));
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new StandardFilter(in);
+ result = new LowerCaseFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java
index 739b61a..56f36e1 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java
@@ -130,4 +130,12 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
result = new SnowballFilter(result, new CatalanStemmer());
return new TokenStreamComponents(source, result);
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new StandardFilter(in);
+ result = new ElisionFilter(result, DEFAULT_ARTICLES);
+ result = new LowerCaseFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java
index d500ff9..d4214a1 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java
@@ -92,4 +92,11 @@ public final class CJKAnalyzer extends StopwordAnalyzerBase {
result = new CJKBigramFilter(result);
return new TokenStreamComponents(source, new StopFilter(result, stopwords));
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new CJKWidthFilter(in);
+ result = new LowerCaseFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java
index 5fd1bec..7819c66 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java
@@ -129,4 +129,13 @@ public final class SoraniAnalyzer extends StopwordAnalyzerBase {
result = new SoraniStemFilter(result);
return new TokenStreamComponents(source, result);
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new StandardFilter(in);
+ result = new SoraniNormalizationFilter(result);
+ result = new LowerCaseFilter(result);
+ result = new DecimalDigitFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/SimpleAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/SimpleAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/SimpleAnalyzer.java
index d0fdcf6..6e0f2f0 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/SimpleAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/SimpleAnalyzer.java
@@ -19,6 +19,7 @@ package org.apache.lucene.analysis.core;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.TokenStream;
/** An {@link Analyzer} that filters {@link LetterTokenizer}
* with {@link LowerCaseFilter}
@@ -35,4 +36,9 @@ public final class SimpleAnalyzer extends Analyzer {
protected TokenStreamComponents createComponents(final String fieldName) {
return new TokenStreamComponents(new LowerCaseTokenizer());
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ return new LowerCaseFilter(in);
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java
index 3fa4982..7d7f532 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java
@@ -25,6 +25,7 @@ import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
@@ -79,5 +80,10 @@ public final class StopAnalyzer extends StopwordAnalyzerBase {
final Tokenizer source = new LowerCaseTokenizer();
return new TokenStreamComponents(source, new StopFilter(source, stopwords));
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ return new LowerCaseFilter(in);
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java
index f2ed01f..b2de5e8 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java
@@ -37,6 +37,7 @@ import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.lucene.analysis.util.FilesystemResourceLoader;
+import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
@@ -118,15 +119,38 @@ public final class CustomAnalyzer extends Analyzer {
}
@Override
+ protected Reader initReaderForNormalization(String fieldName, Reader reader) {
+ for (CharFilterFactory charFilter : charFilters) {
+ if (charFilter instanceof MultiTermAwareComponent) {
+ charFilter = (CharFilterFactory) ((MultiTermAwareComponent) charFilter).getMultiTermComponent();
+ reader = charFilter.create(reader);
+ }
+ }
+ return reader;
+ }
+
+ @Override
protected TokenStreamComponents createComponents(String fieldName) {
- final Tokenizer tk = tokenizer.create();
+ final Tokenizer tk = tokenizer.create(attributeFactory());
TokenStream ts = tk;
for (final TokenFilterFactory filter : tokenFilters) {
ts = filter.create(ts);
}
return new TokenStreamComponents(tk, ts);
}
-
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = in;
+ for (TokenFilterFactory filter : tokenFilters) {
+ if (filter instanceof MultiTermAwareComponent) {
+ filter = (TokenFilterFactory) ((MultiTermAwareComponent) filter).getMultiTermComponent();
+ result = filter.create(in);
+ }
+ }
+ return result;
+ }
+
@Override
public int getPositionIncrementGap(String fieldName) {
// use default from Analyzer base class if null
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
index 9777179..fbb9efa 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
@@ -125,5 +125,12 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
result = new CzechStemFilter(result);
return new TokenStreamComponents(source, result);
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new StandardFilter(in);
+ result = new LowerCaseFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java
index f9c316d..ccbd9d1 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java
@@ -124,4 +124,11 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
result = new SnowballFilter(result, new DanishStemmer());
return new TokenStreamComponents(source, result);
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new StandardFilter(in);
+ result = new LowerCaseFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
index 790fc48..8a39945 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
@@ -139,4 +139,12 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
result = new GermanLightStemFilter(result);
return new TokenStreamComponents(source, result);
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new StandardFilter(in);
+ result = new LowerCaseFilter(result);
+ result = new GermanNormalizationFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
index c85b6ec..bd09d25 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
@@ -104,4 +104,11 @@ public final class GreekAnalyzer extends StopwordAnalyzerBase {
result = new GreekStemFilter(result);
return new TokenStreamComponents(source, result);
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new StandardFilter(in);
+ result = new GreekLowerCaseFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java
index 16dc0c5..94ba43a 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java
@@ -107,4 +107,11 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase {
result = new PorterStemFilter(result);
return new TokenStreamComponents(source, result);
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new StandardFilter(in);
+ result = new LowerCaseFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java
index ab5b6c3..3b21cdd 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java
@@ -123,4 +123,11 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
result = new SpanishLightStemFilter(result);
return new TokenStreamComponents(source, result);
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new StandardFilter(in);
+ result = new LowerCaseFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/eu/BasqueAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/eu/BasqueAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/eu/BasqueAnalyzer.java
index cff2da0..4bc1ba7 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/eu/BasqueAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/eu/BasqueAnalyzer.java
@@ -121,4 +121,11 @@ public final class BasqueAnalyzer extends StopwordAnalyzerBase {
result = new SnowballFilter(result, new BasqueStemmer());
return new TokenStreamComponents(source, result);
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new StandardFilter(in);
+ result = new LowerCaseFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
index 2515d1e..0d6b80c 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
@@ -29,6 +29,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
import org.apache.lucene.analysis.core.DecimalDigitFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
@@ -128,7 +129,18 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
*/
return new TokenStreamComponents(source, new StopFilter(result, stopwords));
}
-
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new StandardFilter(in);
+ result = new LowerCaseFilter(result);
+ result = new DecimalDigitFilter(result);
+ result = new ArabicNormalizationFilter(result);
+ /* additional persian-specific normalization */
+ result = new PersianNormalizationFilter(result);
+ return result;
+ }
+
/**
* Wraps the Reader with {@link PersianCharFilter}
*/
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java
index 6b00101..69cc537 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java
@@ -124,4 +124,11 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
result = new SnowballFilter(result, new FinnishStemmer());
return new TokenStreamComponents(source, result);
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new StandardFilter(in);
+ result = new LowerCaseFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
index 5f90246..2e072be 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
@@ -144,5 +144,13 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
result = new FrenchLightStemFilter(result);
return new TokenStreamComponents(source, result);
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new StandardFilter(in);
+ result = new ElisionFilter(result, DEFAULT_ARTICLES);
+ result = new LowerCaseFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java
index 1ca3455..3ae366d 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java
@@ -141,4 +141,12 @@ public final class IrishAnalyzer extends StopwordAnalyzerBase {
result = new SnowballFilter(result, new IrishStemmer());
return new TokenStreamComponents(source, result);
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new StandardFilter(in);
+ result = new ElisionFilter(result, DEFAULT_ARTICLES);
+ result = new IrishLowerCaseFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java
index 372a6ec..4f70596 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java
@@ -122,4 +122,11 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
result = new GalicianStemFilter(result);
return new TokenStreamComponents(source, result);
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new StandardFilter(in);
+ result = new LowerCaseFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java
index 1b57129..84bfd7a 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java
@@ -29,6 +29,7 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.DecimalDigitFilter;
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
@@ -128,4 +129,14 @@ public final class HindiAnalyzer extends StopwordAnalyzerBase {
result = new HindiStemFilter(result);
return new TokenStreamComponents(source, result);
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new StandardFilter(in);
+ result = new LowerCaseFilter(result);
+ result = new DecimalDigitFilter(result);
+ result = new IndicNormalizationFilter(result);
+ result = new HindiNormalizationFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java
index 0615bdc..e980f5a 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java
@@ -124,4 +124,11 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
result = new SnowballFilter(result, new HungarianStemmer());
return new TokenStreamComponents(source, result);
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new StandardFilter(in);
+ result = new LowerCaseFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/hy/ArmenianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hy/ArmenianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hy/ArmenianAnalyzer.java
index 8c04639..95506e1 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hy/ArmenianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hy/ArmenianAnalyzer.java
@@ -121,4 +121,11 @@ public final class ArmenianAnalyzer extends StopwordAnalyzerBase {
result = new SnowballFilter(result, new ArmenianStemmer());
return new TokenStreamComponents(source, result);
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new StandardFilter(in);
+ result = new LowerCaseFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java
index fc9b4d2..9804bea 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java
@@ -119,4 +119,11 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
}
return new TokenStreamComponents(source, new IndonesianStemFilter(result));
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new StandardFilter(in);
+ result = new LowerCaseFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
index a18aa5d..32f4e30 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
@@ -133,4 +133,12 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
result = new ItalianLightStemFilter(result);
return new TokenStreamComponents(source, result);
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new StandardFilter(in);
+ result = new ElisionFilter(result, DEFAULT_ARTICLES);
+ result = new LowerCaseFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/lt/LithuanianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/lt/LithuanianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/lt/LithuanianAnalyzer.java
index 5e24cf9..4eccc51 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/lt/LithuanianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/lt/LithuanianAnalyzer.java
@@ -121,4 +121,11 @@ public final class LithuanianAnalyzer extends StopwordAnalyzerBase {
result = new SnowballFilter(result, new LithuanianStemmer());
return new TokenStreamComponents(source, result);
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new StandardFilter(in);
+ result = new LowerCaseFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java
index 0a016af..1b08b3b 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java
@@ -122,4 +122,11 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
result = new LatvianStemFilter(result);
return new TokenStreamComponents(source, result);
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new StandardFilter(in);
+ result = new LowerCaseFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
index 0391425..900d9c6 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
@@ -159,4 +159,11 @@ public final class DutchAnalyzer extends Analyzer {
result = new SnowballFilter(result, new org.tartarus.snowball.ext.DutchStemmer());
return new TokenStreamComponents(source, result);
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new StandardFilter(in);
+ result = new LowerCaseFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java
index c413793..3570ad4 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java
@@ -124,5 +124,12 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
result = new SnowballFilter(result, new NorwegianStemmer());
return new TokenStreamComponents(source, result);
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new StandardFilter(in);
+ result = new LowerCaseFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java
index 769e142..8f54803 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java
@@ -123,4 +123,11 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
result = new PortugueseLightStemFilter(result);
return new TokenStreamComponents(source, result);
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new StandardFilter(in);
+ result = new LowerCaseFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java
index 06ff999..1b74184 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java
@@ -126,4 +126,11 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase {
result = new SnowballFilter(result, new RomanianStemmer());
return new TokenStreamComponents(source, result);
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new StandardFilter(in);
+ result = new LowerCaseFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
index dfe8ef3..76bf495 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
@@ -121,4 +121,11 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase {
result = new SnowballFilter(result, new org.tartarus.snowball.ext.RussianStemmer());
return new TokenStreamComponents(source, result);
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new StandardFilter(in);
+ result = new LowerCaseFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java
index dc6c118..ef2ef7e 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java
@@ -100,4 +100,9 @@ public final class ClassicAnalyzer extends StopwordAnalyzerBase {
}
};
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ return new LowerCaseFilter(in);
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java
index 9994884..fe71b7e 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java
@@ -97,4 +97,9 @@ public final class UAX29URLEmailAnalyzer extends StopwordAnalyzerBase {
}
};
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ return new LowerCaseFilter(in);
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java
index fd2aa2e..3896d3e 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java
@@ -124,4 +124,11 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {
result = new SnowballFilter(result, new SwedishStemmer());
return new TokenStreamComponents(source, result);
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new StandardFilter(in);
+ result = new LowerCaseFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
index 9543c5c..6ab7ba1 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
@@ -104,4 +104,11 @@ public final class ThaiAnalyzer extends StopwordAnalyzerBase {
result = new StopFilter(result, stopwords);
return new TokenStreamComponents(source, result);
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new LowerCaseFilter(in);
+ result = new DecimalDigitFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java
index a21495f..719e434 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java
@@ -127,4 +127,11 @@ public final class TurkishAnalyzer extends StopwordAnalyzerBase {
result = new SnowballFilter(result, new TurkishStemmer());
return new TokenStreamComponents(source, result);
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new StandardFilter(in);
+ result = new TurkishLowerCaseFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java
index f7b15f6..ea98731 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java
@@ -20,6 +20,8 @@ package org.apache.lucene.collation;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.util.AttributeFactory;
+
import java.text.Collator;
/**
@@ -83,6 +85,11 @@ public final class CollationKeyAnalyzer extends Analyzer {
}
@Override
+ protected AttributeFactory attributeFactory() {
+ return factory;
+ }
+
+ @Override
protected TokenStreamComponents createComponents(String fieldName) {
KeywordTokenizer tokenizer = new KeywordTokenizer(factory, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
return new TokenStreamComponents(tokenizer, tokenizer);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAllAnalyzersHaveFactories.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAllAnalyzersHaveFactories.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAllAnalyzersHaveFactories.java
index d826a60..7099566 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAllAnalyzersHaveFactories.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAllAnalyzersHaveFactories.java
@@ -35,6 +35,7 @@ import org.apache.lucene.analysis.MockCharFilter;
import org.apache.lucene.analysis.MockFixedLengthPayloadFilter;
import org.apache.lucene.analysis.MockGraphTokenFilter;
import org.apache.lucene.analysis.MockHoleInjectingTokenFilter;
+import org.apache.lucene.analysis.MockLowerCaseFilter;
import org.apache.lucene.analysis.MockRandomLookaheadTokenFilter;
import org.apache.lucene.analysis.MockSynonymFilter;
import org.apache.lucene.analysis.MockTokenFilter;
@@ -75,6 +76,7 @@ public class TestAllAnalyzersHaveFactories extends LuceneTestCase {
MockFixedLengthPayloadFilter.class,
MockGraphTokenFilter.class,
MockHoleInjectingTokenFilter.class,
+ MockLowerCaseFilter.class,
MockRandomLookaheadTokenFilter.class,
MockSynonymFilter.class,
MockTokenFilter.class,
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java
index 8f7f2cd..6d514d1 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java
@@ -52,6 +52,7 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
new String[] { "b" });
assertAnalyzesTo(a, "\"QUOTED\" word",
new String[] { "quoted", "word" });
+ assertEquals(new BytesRef("\"\\à3[]()! cz@"), a.normalize("dummy", "\"\\À3[]()! Cz@"));
a.close();
}
@@ -73,6 +74,7 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
new String[] { "2B" });
assertAnalyzesTo(a, "\"QUOTED\" word",
new String[] { "\"QUOTED\"", "word" });
+ assertEquals(new BytesRef("\"\\À3[]()! Cz@"), a.normalize("dummy", "\"\\À3[]()! Cz@"));
a.close();
}
@@ -82,6 +84,8 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
new String[] { "foo", "bar", "foo", "bar" });
assertAnalyzesTo(a, "foo a bar such FOO THESE BAR",
new String[] { "foo", "bar", "foo", "bar" });
+ assertEquals(new BytesRef("\"\\à3[]()! cz@"), a.normalize("dummy", "\"\\À3[]()! Cz@"));
+ assertEquals(new BytesRef("the"), a.normalize("dummy", "the"));
a.close();
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
index 4effc79..25ca7a3 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
@@ -928,6 +928,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
System.out.println("Creating random analyzer:" + a);
}
try {
+ checkNormalize(a);
checkRandomData(random, a, 500*RANDOM_MULTIPLIER, 20, false,
false /* We already validate our own offsets... */);
} catch (Throwable e) {
@@ -937,7 +938,14 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
}
}
}
-
+
+ public void checkNormalize(Analyzer a) {
+ // normalization should not modify characters that may be used for wildcards
+ // or regular expressions
+ String s = "([0-9]+)?*";
+ assertEquals(s, a.normalize("dummy", s).utf8ToString());
+ }
+
// we might regret this decision...
public void testRandomChainsWithLargeStrings() throws Throwable {
int numIterations = TEST_NIGHTLY ? atLeast(20) : 3;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java
index af11927..60633e4 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java
@@ -17,6 +17,8 @@
package org.apache.lucene.analysis.custom;
+import java.io.IOException;
+import java.io.Reader;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.HashMap;
@@ -24,16 +26,25 @@ import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory;
+import org.apache.lucene.analysis.core.KeywordTokenizerFactory;
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
+import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.core.StopFilterFactory;
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory;
import org.apache.lucene.analysis.standard.ClassicTokenizerFactory;
import org.apache.lucene.analysis.standard.StandardTokenizerFactory;
+import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.CharFilterFactory;
+import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
+import org.apache.lucene.util.AttributeFactory;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SetOnce.AlreadySetException;
import org.apache.lucene.util.Version;
@@ -336,4 +347,136 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
});
}
+ private static class DummyCharFilter extends CharFilter {
+
+ private final char match, repl;
+
+ public DummyCharFilter(Reader input, char match, char repl) {
+ super(input);
+ this.match = match;
+ this.repl = repl;
+ }
+
+ @Override
+ protected int correct(int currentOff) {
+ return currentOff;
+ }
+
+ @Override
+ public int read(char[] cbuf, int off, int len) throws IOException {
+ final int read = input.read(cbuf, off, len);
+ for (int i = 0; i < read; ++i) {
+ if (cbuf[off+i] == match) {
+ cbuf[off+i] = repl;
+ }
+ }
+ return read;
+ }
+
+ }
+
+ public static class DummyCharFilterFactory extends CharFilterFactory {
+
+ private final char match, repl;
+
+ public DummyCharFilterFactory(Map<String,String> args) {
+ this(args, '0', '1');
+ }
+
+ DummyCharFilterFactory(Map<String,String> args, char match, char repl) {
+ super(args);
+ this.match = match;
+ this.repl = repl;
+ }
+
+ @Override
+ public Reader create(Reader input) {
+ return new DummyCharFilter(input, match, repl);
+ }
+
+ }
+
+ public static class DummyMultiTermAwareCharFilterFactory extends DummyCharFilterFactory implements MultiTermAwareComponent {
+
+ public DummyMultiTermAwareCharFilterFactory(Map<String,String> args) {
+ super(args);
+ }
+
+ @Override
+ public AbstractAnalysisFactory getMultiTermComponent() {
+ return new DummyCharFilterFactory(Collections.emptyMap(), '0', '2');
+ }
+
+ }
+
+ public static class DummyTokenizerFactory extends TokenizerFactory {
+
+ public DummyTokenizerFactory(Map<String,String> args) {
+ super(args);
+ }
+
+ @Override
+ public Tokenizer create(AttributeFactory factory) {
+ return new LowerCaseTokenizer(factory);
+ }
+
+ }
+
+ public static class DummyMultiTermAwareTokenizerFactory extends DummyTokenizerFactory implements MultiTermAwareComponent {
+
+ public DummyMultiTermAwareTokenizerFactory(Map<String,String> args) {
+ super(args);
+ }
+
+ @Override
+ public AbstractAnalysisFactory getMultiTermComponent() {
+ return new KeywordTokenizerFactory(getOriginalArgs());
+ }
+
+ }
+
+ public static class DummyTokenFilterFactory extends TokenFilterFactory {
+
+ public DummyTokenFilterFactory(Map<String,String> args) {
+ super(args);
+ }
+
+ @Override
+ public TokenStream create(TokenStream input) {
+ return input;
+ }
+
+ }
+
+ public static class DummyMultiTermAwareTokenFilterFactory extends DummyTokenFilterFactory implements MultiTermAwareComponent {
+
+ public DummyMultiTermAwareTokenFilterFactory(Map<String,String> args) {
+ super(args);
+ }
+
+ @Override
+ public AbstractAnalysisFactory getMultiTermComponent() {
+ return new ASCIIFoldingFilterFactory(Collections.emptyMap());
+ }
+
+ }
+
+ public void testNormalization() throws IOException {
+ CustomAnalyzer analyzer1 = CustomAnalyzer.builder()
+ // none of these components are multi-term aware so they should not be applied
+ .withTokenizer(DummyTokenizerFactory.class, Collections.emptyMap())
+ .addCharFilter(DummyCharFilterFactory.class, Collections.emptyMap())
+ .addTokenFilter(DummyTokenFilterFactory.class, Collections.emptyMap())
+ .build();
+ assertEquals(new BytesRef("0À"), analyzer1.normalize("dummy", "0À"));
+
+ CustomAnalyzer analyzer2 = CustomAnalyzer.builder()
+ // these components are multi-term aware so they should be applied
+ .withTokenizer(DummyMultiTermAwareTokenizerFactory.class, Collections.emptyMap())
+ .addCharFilter(DummyMultiTermAwareCharFilterFactory.class, Collections.emptyMap())
+ .addTokenFilter(DummyMultiTermAwareTokenFilterFactory.class, Collections.emptyMap())
+ .build();
+ assertEquals(new BytesRef("2A"), analyzer2.normalize("dummy", "0À"));
+ }
+
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java
index 46d40b1..06e119e 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java
@@ -94,4 +94,11 @@ public class JapaneseAnalyzer extends StopwordAnalyzerBase {
stream = new LowerCaseFilter(stream);
return new TokenStreamComponents(tokenizer, stream);
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new CJKWidthFilter(in);
+ result = new LowerCaseFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java
index 091acfd..0caca35 100644
--- a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java
+++ b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java
@@ -23,6 +23,7 @@ import morfologik.stemming.Dictionary;
import morfologik.stemming.polish.PolishStemmer;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
@@ -69,4 +70,9 @@ public class MorfologikAnalyzer extends Analyzer {
src,
new MorfologikFilter(new StandardFilter(src), dictionary));
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ return new StandardFilter(in);
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java b/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java
index 5f0347b..f604d4b 100644
--- a/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java
+++ b/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java
@@ -22,6 +22,7 @@ import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
@@ -139,4 +140,9 @@ public final class SmartChineseAnalyzer extends Analyzer {
}
return new TokenStreamComponents(tokenizer, result);
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ return new LowerCaseFilter(in);
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java b/lucene/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java
index 6ed4fda..2d3ef4c 100644
--- a/lucene/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java
+++ b/lucene/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java
@@ -146,4 +146,11 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase {
result = new StempelFilter(result, new StempelStemmer(stemTable));
return new TokenStreamComponents(source, result);
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new StandardFilter(in);
+ result = new LowerCaseFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/core/src/java/org/apache/lucene/analysis/Analyzer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/Analyzer.java b/lucene/core/src/java/org/apache/lucene/analysis/Analyzer.java
index cce740d..0d60d24 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/Analyzer.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/Analyzer.java
@@ -18,11 +18,18 @@ package org.apache.lucene.analysis;
import java.io.Closeable;
+import java.io.IOException;
import java.io.Reader;
+import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.store.AlreadyClosedException;
+import org.apache.lucene.util.AttributeFactory;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CloseableThreadLocal;
import org.apache.lucene.util.Version;
@@ -44,6 +51,12 @@ import org.apache.lucene.util.Version;
* filter = new BarFilter(filter);
* return new TokenStreamComponents(source, filter);
* }
+ * {@literal @Override}
+ * protected TokenStream normalize(TokenStream in) {
+ * // Assuming FooFilter is about normalization and BarFilter is about
+ * // stemming, only FooFilter should be applied
+ * return new FooFilter(in);
+ * }
* };
* </pre>
* For more examples, see the {@link org.apache.lucene.analysis Analysis package documentation}.
@@ -108,6 +121,15 @@ public abstract class Analyzer implements Closeable {
protected abstract TokenStreamComponents createComponents(String fieldName);
/**
+ * Wrap the given {@link TokenStream} in order to apply normalization filters.
+ * The default implementation returns the {@link TokenStream} as-is. This is
+ * used by {@link #normalize(String, String)}.
+ */
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ return in;
+ }
+
+ /**
* Returns a TokenStream suitable for <code>fieldName</code>, tokenizing
* the contents of <code>reader</code>.
* <p>
@@ -181,7 +203,65 @@ public abstract class Analyzer implements Closeable {
components.reusableStringReader = strReader;
return components.getTokenStream();
}
-
+
+ /**
+ * Normalize a string down to the representation that it would have in the
+ * index.
+ * <p>
+ * This is typically used by query parsers in order to generate a query on
+ * a given term, without tokenizing or stemming, which are undesirable if
+ * the string to analyze is a partial word (eg. in case of a wildcard or
+ * fuzzy query).
+ * <p>
+ * This method uses {@link #initReaderForNormalization(String, Reader)} in
+ * order to apply necessary character-level normalization and then
+ * {@link #normalize(String, TokenStream)} in order to apply the normalizing
+ * token filters.
+ */
+ public final BytesRef normalize(final String fieldName, final String text) {
+ try {
+ // apply char filters
+ final String filteredText;
+ try (Reader reader = new StringReader(text)) {
+ Reader filterReader = initReaderForNormalization(fieldName, reader);
+ char[] buffer = new char[64];
+ StringBuilder builder = new StringBuilder();
+ for (;;) {
+ final int read = filterReader.read(buffer, 0, buffer.length);
+ if (read == -1) {
+ break;
+ }
+ builder.append(buffer, 0, read);
+ }
+ filteredText = builder.toString();
+ } catch (IOException e) {
+ throw new IllegalStateException("Normalization threw an unexpected exception", e);
+ }
+
+ final AttributeFactory attributeFactory = attributeFactory();
+ try (TokenStream ts = normalize(fieldName,
+ new StringTokenStream(attributeFactory, filteredText, text.length()))) {
+ final TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
+ ts.reset();
+ if (ts.incrementToken() == false) {
+ throw new IllegalStateException("The normalization token stream is "
+ + "expected to produce exactly 1 token, but got 0 for analyzer "
+ + this + " and input \"" + text + "\"");
+ }
+ final BytesRef term = BytesRef.deepCopyOf(termAtt.getBytesRef());
+ if (ts.incrementToken()) {
+ throw new IllegalStateException("The normalization token stream is "
+ + "expected to produce exactly 1 token, but got 2+ for analyzer "
+ + this + " and input \"" + text + "\"");
+ }
+ ts.end();
+ return term;
+ }
+ } catch (IOException e) {
+ throw new IllegalStateException("Normalization threw an unexpected exception", e);
+ }
+ }
+
/**
* Override this if you want to add a CharFilter chain.
* <p>
@@ -196,6 +276,22 @@ public abstract class Analyzer implements Closeable {
return reader;
}
+ /** Wrap the given {@link Reader} with {@link CharFilter}s that make sense
+ * for normalization. This is typically a subset of the {@link CharFilter}s
+ * that are applied in {@link #initReader(String, Reader)}. This is used by
+ * {@link #normalize(String, String)}. */
+ protected Reader initReaderForNormalization(String fieldName, Reader reader) {
+ return reader;
+ }
+
+ /** Return the {@link AttributeFactory} to be used for
+ * {@link #tokenStream analysis} and
+ * {@link #normalize(String, String) normalization}. The default
+ * implementation returns {@link AttributeFactory#DEFAULT_ATTRIBUTE_FACTORY}. */
+ protected AttributeFactory attributeFactory() {
+ return AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
+ }
+
/**
* Invoked before indexing a IndexableField instance if
* terms have already been added to that field. This allows custom
@@ -435,4 +531,41 @@ public abstract class Analyzer implements Closeable {
}
};
+ private static final class StringTokenStream extends TokenStream {
+
+ private final String value;
+ private final int length;
+ private boolean used = true;
+ private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+ private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
+
+ StringTokenStream(AttributeFactory attributeFactory, String value, int length) {
+ super(attributeFactory);
+ this.value = value;
+ this.length = length;
+ }
+
+ @Override
+ public void reset() {
+ used = false;
+ }
+
+ @Override
+ public boolean incrementToken() {
+ if (used) {
+ return false;
+ }
+ clearAttributes();
+ termAttribute.append(value);
+ offsetAttribute.setOffset(0, length);
+ used = true;
+ return true;
+ }
+
+ @Override
+ public void end() throws IOException {
+ super.end();
+ offsetAttribute.setOffset(length, length);
+ }
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
index 251017d..fb57573 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
@@ -112,4 +112,11 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
}
};
}
+
+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = new StandardFilter(in);
+ result = new LowerCaseFilter(result);
+ return result;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java b/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
index 6c6ddc8..2cc9274 100644
--- a/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
+++ b/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
@@ -27,6 +27,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockGraphTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TestUtil;
public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
@@ -387,4 +388,9 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
checkRandomData(random, analyzer, 100*RANDOM_MULTIPLIER, 8192);
analyzer.close();
}
+
+ public void testNormalize() {
+ Analyzer a = new StandardAnalyzer();
+ assertEquals(new BytesRef("\"\\�3[]()! cz@"), a.normalize("dummy", "\"\\�3[]()! Cz@"));
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/queryparser/src/java/org/apache/lucene/queryparser/analyzing/AnalyzingQueryParser.java
----------------------------------------------------------------------
diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/analyzing/AnalyzingQueryParser.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/analyzing/AnalyzingQueryParser.java
index 49690fe..1fab24f 100644
--- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/analyzing/AnalyzingQueryParser.java
+++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/analyzing/AnalyzingQueryParser.java
@@ -16,15 +16,15 @@
*/
package org.apache.lucene.queryparser.analyzing;
-import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.Query;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
/**
* Overrides Lucene's default QueryParser so that Fuzzy-, Prefix-, Range-, and WildcardQuerys
@@ -39,7 +39,7 @@ import org.apache.lucene.search.Query;
*/
public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic.QueryParser {
// gobble escaped chars or find a wildcard character
- private final Pattern wildcardPattern = Pattern.compile("(\\.)|([?*]+)");
+ private static final Pattern WILDCARD_PATTERN = Pattern.compile("(\\\\.)|([?*]+)");
public AnalyzingQueryParser(String field, Analyzer analyzer) {
super(field, analyzer);
setAnalyzeRangeTerms(true);
@@ -65,42 +65,41 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic.
*/
@Override
protected Query getWildcardQuery(String field, String termStr) throws ParseException {
-
- if (termStr == null){
- //can't imagine this would ever happen
- throw new ParseException("Passed null value as term to getWildcardQuery");
- }
- if ( ! getAllowLeadingWildcard() && (termStr.startsWith("*") || termStr.startsWith("?"))) {
- throw new ParseException("'*' or '?' not allowed as first character in WildcardQuery"
- + " unless getAllowLeadingWildcard() returns true");
+ if ("*".equals(field)) {
+ if ("*".equals(termStr)) return newMatchAllDocsQuery();
}
-
- Matcher wildcardMatcher = wildcardPattern.matcher(termStr);
- StringBuilder sb = new StringBuilder();
+ if (getAllowLeadingWildcard() == false && (termStr.startsWith("*") || termStr.startsWith("?")))
+ throw new ParseException("'*' or '?' not allowed as first character in WildcardQuery");
+
+ Term t = new Term(field, analyzeWildcard(field, termStr));
+ return newWildcardQuery(t);
+ }
+
+ private BytesRef analyzeWildcard(String field, String termStr) {
+ // best effort to not pass the wildcard characters and escaped characters through #normalize
+ Matcher wildcardMatcher = WILDCARD_PATTERN.matcher(termStr);
+ BytesRefBuilder sb = new BytesRefBuilder();
int last = 0;
-
+
while (wildcardMatcher.find()){
- // continue if escaped char
- if (wildcardMatcher.group(1) != null){
- continue;
- }
-
- if (wildcardMatcher.start() > 0){
+ if (wildcardMatcher.start() > 0) {
String chunk = termStr.substring(last, wildcardMatcher.start());
- String analyzed = analyzeSingleChunk(field, termStr, chunk);
- sb.append(analyzed);
+ BytesRef normalized = getAnalyzer().normalize(field, chunk);
+ sb.append(normalized);
}
- //append the wildcard character
- sb.append(wildcardMatcher.group(2));
-
+ //append the matched group - without normalizing
+ sb.append(new BytesRef(wildcardMatcher.group()));
+
last = wildcardMatcher.end();
}
if (last < termStr.length()){
- sb.append(analyzeSingleChunk(field, termStr, termStr.substring(last)));
+ String chunk = termStr.substring(last);
+ BytesRef normalized = getAnalyzer().normalize(field, chunk);
+ sb.append(normalized);
}
- return super.getWildcardQuery(field, sb.toString());
+ return sb.toBytesRef();
}
-
+
/**
* Called when parser parses an input term
* that uses prefix notation; that is, contains a single '*' wildcard
@@ -121,8 +120,14 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic.
*/
@Override
protected Query getPrefixQuery(String field, String termStr) throws ParseException {
- String analyzed = analyzeSingleChunk(field, termStr, termStr);
- return super.getPrefixQuery(field, analyzed);
+ if (!getAllowLeadingWildcard() && termStr.startsWith("*"))
+ throw new ParseException("'*' not allowed as first character in PrefixQuery");
+ if (getLowercaseExpandedTerms()) {
+ termStr = termStr.toLowerCase(getLocale());
+ }
+ BytesRef term = getAnalyzer().normalize(field, termStr);
+ Term t = new Term(field, term);
+ return newPrefixQuery(t);
}
/**
@@ -142,61 +147,9 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic.
protected Query getFuzzyQuery(String field, String termStr, float minSimilarity)
throws ParseException {
- String analyzed = analyzeSingleChunk(field, termStr, termStr);
- return super.getFuzzyQuery(field, analyzed, minSimilarity);
+ BytesRef term = getAnalyzer().normalize(field, termStr);
+ Term t = new Term(field, term);
+ return newFuzzyQuery(t, minSimilarity, getFuzzyPrefixLength());
}
- /**
- * Returns the analyzed form for the given chunk
- *
- * If the analyzer produces more than one output token from the given chunk,
- * a ParseException is thrown.
- *
- * @param field The target field
- * @param termStr The full term from which the given chunk is excerpted
- * @param chunk The portion of the given termStr to be analyzed
- * @return The result of analyzing the given chunk
- * @throws ParseException when analysis returns other than one output token
- */
- protected String analyzeSingleChunk(String field, String termStr, String chunk) throws ParseException{
- String analyzed = null;
- try (TokenStream stream = getAnalyzer().tokenStream(field, chunk)) {
- stream.reset();
- CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
- // get first and hopefully only output token
- if (stream.incrementToken()) {
- analyzed = termAtt.toString();
-
- // try to increment again, there should only be one output token
- StringBuilder multipleOutputs = null;
- while (stream.incrementToken()) {
- if (null == multipleOutputs) {
- multipleOutputs = new StringBuilder();
- multipleOutputs.append('"');
- multipleOutputs.append(analyzed);
- multipleOutputs.append('"');
- }
- multipleOutputs.append(',');
- multipleOutputs.append('"');
- multipleOutputs.append(termAtt.toString());
- multipleOutputs.append('"');
- }
- stream.end();
- if (null != multipleOutputs) {
- throw new ParseException(
- String.format(getLocale(),
- "Analyzer created multiple terms for \"%s\": %s", chunk, multipleOutputs.toString()));
- }
- } else {
- // nothing returned by analyzer. Was it a stop word and the user accidentally
- // used an analyzer with stop words?
- stream.end();
- throw new ParseException(String.format(getLocale(), "Analyzer returned nothing for \"%s\"", chunk));
- }
- } catch (IOException e){
- throw new ParseException(
- String.format(getLocale(), "IO error while trying to analyze single term: \"%s\"", termStr));
- }
- return analyzed;
- }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java
----------------------------------------------------------------------
diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java
index cdfa477..8b0866f 100644
--- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java
+++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java
@@ -16,14 +16,11 @@
*/
package org.apache.lucene.queryparser.classic;
-import java.io.IOException;
import java.io.StringReader;
import java.text.DateFormat;
import java.util.*;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser.Operator;
@@ -41,9 +38,6 @@ import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZ
* and acts to separate the majority of the Java code from the .jj grammar file.
*/
public abstract class QueryParserBase extends QueryBuilder implements CommonQueryParserConfiguration {
-
- /** Do not catch this exception in your code, it means you are using methods that you should no longer use. */
- public static class MethodRemovedUseAnother extends Throwable {}
static final int CONJ_NONE = 0;
static final int CONJ_AND = 1;
@@ -640,31 +634,6 @@ public abstract class QueryParserBase extends QueryBuilder implements CommonQuer
return new FuzzyQuery(term,numEdits,prefixLength);
}
- // TODO: Should this be protected instead?
- private BytesRef analyzeMultitermTerm(String field, String part) {
- return analyzeMultitermTerm(field, part, getAnalyzer());
- }
-
- protected BytesRef analyzeMultitermTerm(String field, String part, Analyzer analyzerIn) {
- if (analyzerIn == null) analyzerIn = getAnalyzer();
-
- try (TokenStream source = analyzerIn.tokenStream(field, part)) {
- source.reset();
-
- TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
-
- if (!source.incrementToken())
- throw new IllegalArgumentException("analyzer returned no terms for multiTerm term: " + part);
- BytesRef bytes = BytesRef.deepCopyOf(termAtt.getBytesRef());
- if (source.incrementToken())
- throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + part);
- source.end();
- return bytes;
- } catch (IOException e) {
- throw new RuntimeException("Error analyzing multiTerm term: " + part, e);
- }
- }
-
/**
* Builds a new {@link TermRangeQuery} instance
* @param field Field
@@ -681,13 +650,13 @@ public abstract class QueryParserBase extends QueryBuilder implements CommonQuer
if (part1 == null) {
start = null;
} else {
- start = analyzeRangeTerms ? analyzeMultitermTerm(field, part1) : new BytesRef(part1);
+ start = analyzeRangeTerms ? getAnalyzer().normalize(field, part1) : new BytesRef(part1);
}
if (part2 == null) {
end = null;
} else {
- end = analyzeRangeTerms ? analyzeMultitermTerm(field, part2) : new BytesRef(part2);
+ end = analyzeRangeTerms ? getAnalyzer().normalize(field, part2) : new BytesRef(part2);
}
final TermRangeQuery query = new TermRangeQuery(field, start, end, startInclusive, endInclusive);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/queryparser/src/java/org/apache/lucene/queryparser/simple/SimpleQueryParser.java
----------------------------------------------------------------------
diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/simple/SimpleQueryParser.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/simple/SimpleQueryParser.java
index 45a24f7..316d97d 100644
--- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/simple/SimpleQueryParser.java
+++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/simple/SimpleQueryParser.java
@@ -26,6 +26,7 @@ import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.QueryBuilder;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
@@ -551,7 +552,9 @@ public class SimpleQueryParser extends QueryBuilder {
BooleanQuery.Builder bq = new BooleanQuery.Builder();
bq.setDisableCoord(true);
for (Map.Entry<String,Float> entry : weights.entrySet()) {
- Query q = new FuzzyQuery(new Term(entry.getKey(), text), fuzziness);
+ final String fieldName = entry.getKey();
+ final BytesRef term = getAnalyzer().normalize(fieldName, text);
+ Query q = new FuzzyQuery(new Term(fieldName, term), fuzziness);
float boost = entry.getValue();
if (boost != 1f) {
q = new BoostQuery(q, boost);
@@ -587,7 +590,9 @@ public class SimpleQueryParser extends QueryBuilder {
BooleanQuery.Builder bq = new BooleanQuery.Builder();
bq.setDisableCoord(true);
for (Map.Entry<String,Float> entry : weights.entrySet()) {
- Query q = new PrefixQuery(new Term(entry.getKey(), text));
+ final String fieldName = entry.getKey();
+ final BytesRef term = getAnalyzer().normalize(fieldName, text);
+ Query q = new PrefixQuery(new Term(fieldName, term));
float boost = entry.getValue();
if (boost != 1f) {
q = new BoostQuery(q, boost);