Posted to commits@lucene.apache.org by da...@apache.org on 2018/09/20 23:59:31 UTC
[25/29] lucene-solr:jira/http2: LUCENE-8498: Remove LowerCaseTokenizer
LUCENE-8498: Remove LowerCaseTokenizer
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/c0d29759
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/c0d29759
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/c0d29759
Branch: refs/heads/jira/http2
Commit: c0d2975970d3de8f5056a20504dec1431d455ab1
Parents: 52bdcf6
Author: Alan Woodward <ro...@apache.org>
Authored: Sat Sep 15 16:56:27 2018 +0100
Committer: Alan Woodward <ro...@apache.org>
Committed: Thu Sep 20 11:57:05 2018 +0100
----------------------------------------------------------------------
lucene/CHANGES.txt | 3 +
lucene/MIGRATE.txt | 11 ++
.../analysis/core/LowerCaseTokenizer.java | 72 -----------
.../core/LowerCaseTokenizerFactory.java | 75 ------------
.../lucene/analysis/core/SimpleAnalyzer.java | 4 +-
.../lucene/analysis/core/StopAnalyzer.java | 6 +-
.../lucene/analysis/util/CharTokenizer.java | 86 +------------
...apache.lucene.analysis.util.TokenizerFactory | 1 -
.../analysis/br/TestBrazilianAnalyzer.java | 7 +-
.../lucene/analysis/core/TestAnalyzers.java | 8 --
.../analysis/custom/TestCustomAnalyzer.java | 13 +-
.../lucene/analysis/de/TestGermanAnalyzer.java | 8 +-
.../standard/TestStandardFactories.java | 16 ---
.../analysis/util/TestCharTokenizers.java | 122 +++----------------
.../extraction/solr/collection1/conf/schema.xml | 15 ---
.../collection1/conf/schema-copyfield-test.xml | 14 ---
.../solr/collection1/conf/schema-folding.xml | 3 +-
.../solr/collection1/conf/schema-hash.xml | 13 --
.../collection1/conf/schema-required-fields.xml | 14 ---
.../solr/collection1/conf/schema-rest.xml | 9 +-
.../solr/collection1/conf/schema-sql.xml | 13 --
.../collection1/conf/schema-tokenizer-test.xml | 11 +-
.../test-files/solr/collection1/conf/schema.xml | 15 +--
.../solr/collection1/conf/schema12.xml | 9 +-
.../solr/collection1/conf/schema15.xml | 14 ---
.../solr/collection1/conf/schemasurround.xml | 14 ---
.../schema/TestFieldCollectionResource.java | 10 +-
.../solr/rest/schema/TestFieldTypeResource.java | 3 +-
.../solr/util/TestMaxTokenLenTokenizer.java | 20 +--
.../solrj/solr/collection1/conf/schema-sql.xml | 13 --
.../solrj/solr/collection1/conf/schema.xml | 13 --
.../solr/configsets/streaming/conf/schema.xml | 11 --
32 files changed, 78 insertions(+), 568 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c0d29759/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index bd8c616..70badd8 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -81,6 +81,9 @@ API Changes
* LUCENE-8352: TokenStreamComponents is now final, and can take a Consumer<Reader>
in its constructor (Mark Harwood, Alan Woodward, Adrien Grand)
+* LUCENE-8498: LowerCaseTokenizer has been removed, and CharTokenizer no longer
+ takes a normalizer function. (Alan Woodward)
+
Changes in Runtime Behavior
* LUCENE-8333: Switch MoreLikeThis.setMaxDocFreqPct to use maxDoc instead of
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c0d29759/lucene/MIGRATE.txt
----------------------------------------------------------------------
diff --git a/lucene/MIGRATE.txt b/lucene/MIGRATE.txt
index 6008956..1b56b64 100644
--- a/lucene/MIGRATE.txt
+++ b/lucene/MIGRATE.txt
@@ -129,3 +129,14 @@ Most code should just require recompilation, though possibly requiring some adde
Instead of overriding TokenStreamComponents#setReader() to customise analyzer
initialisation, you should now pass a Consumer<Reader> instance to the
TokenStreamComponents constructor.
+
+## LowerCaseTokenizer and LowerCaseTokenizerFactory have been removed ##
+
+LowerCaseTokenizer and its factory combined tokenization and filtering in a way
+that broke token normalization, so they have been removed. Instead, use a
+LetterTokenizer followed by a LowerCaseFilter.
+
+## CharTokenizer no longer takes a normalizer function ##
+
+CharTokenizer now only performs tokenization. To perform any kind of filtering,
+use a TokenFilter chain as you would with any other Tokenizer.
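
For reference, a minimal sketch of the replacement chain described in the MIGRATE.txt
entry above. The class name and sample input are illustrative, not part of this commit:

    import java.io.StringReader;
    import org.apache.lucene.analysis.LowerCaseFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.core.LetterTokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class LowerCaseTokenizerMigration {
      public static void main(String[] args) throws Exception {
        // Before (removed): Tokenizer tok = new LowerCaseTokenizer();
        // After: split on letters first, then lowercase in a separate filter stage.
        Tokenizer tokenizer = new LetterTokenizer();
        tokenizer.setReader(new StringReader("Fischen TRINKEN"));
        try (TokenStream stream = new LowerCaseFilter(tokenizer)) {
          CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
          stream.reset();
          while (stream.incrementToken()) {
            System.out.println(term); // fischen, trinken
          }
          stream.end();
        }
      }
    }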
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c0d29759/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizer.java
deleted file mode 100644
index 26b8747..0000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizer.java
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.core;
-
-
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.util.AttributeFactory;
-
-/**
- * LowerCaseTokenizer performs the function of LetterTokenizer
- * and LowerCaseFilter together. It divides text at non-letters and converts
- * them to lower case. While it is functionally equivalent to the combination
- * of LetterTokenizer and LowerCaseFilter, there is a performance advantage
- * to doing the two tasks at once, hence this (redundant) implementation.
- * <P>
- * Note: this does a decent job for most European languages, but does a terrible
- * job for some Asian languages, where words are not separated by spaces.
- * </p>
- */
-public final class LowerCaseTokenizer extends LetterTokenizer {
-
- /**
- * Construct a new LowerCaseTokenizer.
- */
- public LowerCaseTokenizer() {
- }
-
- /**
- * Construct a new LowerCaseTokenizer using a given
- * {@link org.apache.lucene.util.AttributeFactory}.
- *
- * @param factory
- * the attribute factory to use for this {@link Tokenizer}
- */
- public LowerCaseTokenizer(AttributeFactory factory) {
- super(factory);
- }
-
- /**
- * Construct a new LowerCaseTokenizer using a given
- * {@link org.apache.lucene.util.AttributeFactory}.
- *
- * @param factory the attribute factory to use for this {@link Tokenizer}
- * @param maxTokenLen maximum token length the tokenizer will emit.
- * Must be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024)
- * @throws IllegalArgumentException if maxTokenLen is invalid.
- */
- public LowerCaseTokenizer(AttributeFactory factory, int maxTokenLen) {
- super(factory, maxTokenLen);
- }
-
- /** Converts char to lower case
- * {@link Character#toLowerCase(int)}.*/
- @Override
- protected int normalize(int c) {
- return Character.toLowerCase(c);
- }
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c0d29759/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizerFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizerFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizerFactory.java
deleted file mode 100644
index 44e2742..0000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizerFactory.java
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.core;
-
-
-import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
-import org.apache.lucene.analysis.util.CharTokenizer;
-import org.apache.lucene.analysis.util.MultiTermAwareComponent;
-import org.apache.lucene.analysis.util.TokenizerFactory;
-import org.apache.lucene.util.AttributeFactory;
-
-import java.util.HashMap;
-import java.util.Map;
-
-import static org.apache.lucene.analysis.standard.StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT;
-
-/**
- * Factory for {@link LowerCaseTokenizer}.
- * <pre class="prettyprint">
- * <fieldType name="text_lwrcase" class="solr.TextField" positionIncrementGap="100">
- * <analyzer>
- * <tokenizer class="solr.LowerCaseTokenizerFactory" maxTokenLen="256"/>
- * </analyzer>
- * </fieldType></pre>
- * <p>
- * Options:
- * <ul>
- * <li>maxTokenLen: max token length, should be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024).
- * It is rare to need to change this
- * else {@link CharTokenizer}::DEFAULT_MAX_WORD_LEN</li>
- * </ul>
- */
-public class LowerCaseTokenizerFactory extends TokenizerFactory implements MultiTermAwareComponent {
- private final int maxTokenLen;
-
- /**
- * Creates a new LowerCaseTokenizerFactory
- */
- public LowerCaseTokenizerFactory(Map<String, String> args) {
- super(args);
- maxTokenLen = getInt(args, "maxTokenLen", CharTokenizer.DEFAULT_MAX_WORD_LEN);
- if (maxTokenLen > MAX_TOKEN_LENGTH_LIMIT || maxTokenLen <= 0) {
- throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + maxTokenLen);
- }
- if (!args.isEmpty()) {
- throw new IllegalArgumentException("Unknown parameters: " + args);
- }
- }
-
- @Override
- public LowerCaseTokenizer create(AttributeFactory factory) {
- return new LowerCaseTokenizer(factory, maxTokenLen);
- }
-
- @Override
- public AbstractAnalysisFactory getMultiTermComponent() {
- Map<String,String> map = new HashMap<>(getOriginalArgs());
- map.remove("maxTokenLen"); //removing "maxTokenLen" argument for LowerCaseFilterFactory init
- return new LowerCaseFilterFactory(map);
- }
-}
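
With the factory removed, Solr schemas (see the schema changes later in this diff) pair
solr.LetterTokenizerFactory with solr.LowerCaseFilterFactory. At the Lucene level, a
hedged equivalent uses CustomAnalyzer with the SPI names "letter" and "lowercase"; the
class and method names here are illustrative. LowerCaseFilterFactory is multi-term
aware, which is what the removed getMultiTermComponent() delegated to:

    import java.io.IOException;
    import org.apache.lucene.analysis.custom.CustomAnalyzer;

    public class LowerCaseFactoryMigration {
      // "letter" resolves LetterTokenizerFactory, "lowercase" LowerCaseFilterFactory.
      static CustomAnalyzer build() throws IOException {
        return CustomAnalyzer.builder()
            .withTokenizer("letter")
            .addTokenFilter("lowercase")
            .build();
      }
    }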
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c0d29759/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/SimpleAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/SimpleAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/SimpleAnalyzer.java
index 6e0f2f0..3fcb92c 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/SimpleAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/SimpleAnalyzer.java
@@ -20,6 +20,7 @@ package org.apache.lucene.analysis.core;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
/** An {@link Analyzer} that filters {@link LetterTokenizer}
* with {@link LowerCaseFilter}
@@ -34,7 +35,8 @@ public final class SimpleAnalyzer extends Analyzer {
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
- return new TokenStreamComponents(new LowerCaseTokenizer());
+ Tokenizer tokenizer = new LetterTokenizer();
+ return new TokenStreamComponents(tokenizer, new LowerCaseFilter(tokenizer));
}
@Override
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c0d29759/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java
index cf7ecdd..dde74c0 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java
@@ -60,13 +60,13 @@ public final class StopAnalyzer extends StopwordAnalyzerBase {
* used to tokenize all the text in the provided {@link Reader}.
*
* @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
- * built from a {@link LowerCaseTokenizer} filtered with
+ * built from a {@link LetterTokenizer} filtered with
* {@link StopFilter}
*/
@Override
protected TokenStreamComponents createComponents(String fieldName) {
- final Tokenizer source = new LowerCaseTokenizer();
- return new TokenStreamComponents(source, new StopFilter(source, stopwords));
+ final Tokenizer source = new LetterTokenizer();
+ return new TokenStreamComponents(source, new StopFilter(new LowerCaseFilter(source), stopwords));
}
@Override
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c0d29759/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java
index ff9d6ff..092d25d 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java
@@ -20,14 +20,11 @@ package org.apache.lucene.analysis.util;
import java.io.IOException;
import java.util.Objects;
import java.util.function.IntPredicate;
-import java.util.function.IntUnaryOperator;
-import org.apache.lucene.analysis.CharacterUtils.CharacterBuffer;
import org.apache.lucene.analysis.CharacterUtils;
-import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.CharacterUtils.CharacterBuffer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
-import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
@@ -107,48 +104,12 @@ public abstract class CharTokenizer extends Tokenizer {
* </pre>
*/
public static CharTokenizer fromTokenCharPredicate(AttributeFactory factory, final IntPredicate tokenCharPredicate) {
- return fromTokenCharPredicate(factory, tokenCharPredicate, IntUnaryOperator.identity());
- }
-
- /**
- * Creates a new instance of CharTokenizer using a custom predicate, supplied as method reference or lambda expression.
- * The predicate should return {@code true} for all valid token characters.
- * This factory also takes a function to normalize chars, e.g., lowercasing them, supplied as method reference or lambda expression.
- * <p>
- * This factory is intended to be used with lambdas or method references. E.g., an elegant way
- * to create an instance which behaves exactly as {@link LowerCaseTokenizer} is:
- * <pre class="prettyprint lang-java">
- * Tokenizer tok = CharTokenizer.fromTokenCharPredicate(Character::isLetter, Character::toLowerCase);
- * </pre>
- */
- public static CharTokenizer fromTokenCharPredicate(final IntPredicate tokenCharPredicate, final IntUnaryOperator normalizer) {
- return fromTokenCharPredicate(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, tokenCharPredicate, normalizer);
- }
-
- /**
- * Creates a new instance of CharTokenizer with the supplied attribute factory using a custom predicate, supplied as method reference or lambda expression.
- * The predicate should return {@code true} for all valid token characters.
- * This factory also takes a function to normalize chars, e.g., lowercasing them, supplied as method reference or lambda expression.
- * <p>
- * This factory is intended to be used with lambdas or method references. E.g., an elegant way
- * to create an instance which behaves exactly as {@link LowerCaseTokenizer} is:
- * <pre class="prettyprint lang-java">
- * Tokenizer tok = CharTokenizer.fromTokenCharPredicate(factory, Character::isLetter, Character::toLowerCase);
- * </pre>
- */
- public static CharTokenizer fromTokenCharPredicate(AttributeFactory factory, final IntPredicate tokenCharPredicate, final IntUnaryOperator normalizer) {
Objects.requireNonNull(tokenCharPredicate, "predicate must not be null.");
- Objects.requireNonNull(normalizer, "normalizer must not be null");
return new CharTokenizer(factory) {
@Override
protected boolean isTokenChar(int c) {
return tokenCharPredicate.test(c);
}
-
- @Override
- protected int normalize(int c) {
- return normalizer.applyAsInt(c);
- }
};
}
@@ -167,7 +128,7 @@ public abstract class CharTokenizer extends Tokenizer {
public static CharTokenizer fromSeparatorCharPredicate(final IntPredicate separatorCharPredicate) {
return fromSeparatorCharPredicate(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, separatorCharPredicate);
}
-
+
/**
* Creates a new instance of CharTokenizer with the supplied attribute factory using a custom predicate, supplied as method reference or lambda expression.
* The predicate should return {@code true} for all valid token separator characters.
@@ -179,37 +140,7 @@ public abstract class CharTokenizer extends Tokenizer {
* </pre>
*/
public static CharTokenizer fromSeparatorCharPredicate(AttributeFactory factory, final IntPredicate separatorCharPredicate) {
- return fromSeparatorCharPredicate(factory, separatorCharPredicate, IntUnaryOperator.identity());
- }
-
- /**
- * Creates a new instance of CharTokenizer using a custom predicate, supplied as method reference or lambda expression.
- * The predicate should return {@code true} for all valid token separator characters.
- * This factory also takes a function to normalize chars, e.g., lowercasing them, supplied as method reference or lambda expression.
- * <p>
- * This factory is intended to be used with lambdas or method references. E.g., an elegant way
- * to create an instance which behaves exactly as the combination {@link WhitespaceTokenizer} and {@link LowerCaseFilter} is:
- * <pre class="prettyprint lang-java">
- * Tokenizer tok = CharTokenizer.fromSeparatorCharPredicate(Character::isWhitespace, Character::toLowerCase);
- * </pre>
- */
- public static CharTokenizer fromSeparatorCharPredicate(final IntPredicate separatorCharPredicate, final IntUnaryOperator normalizer) {
- return fromSeparatorCharPredicate(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, separatorCharPredicate, normalizer);
- }
-
- /**
- * Creates a new instance of CharTokenizer with the supplied attribute factory using a custom predicate.
- * The predicate should return {@code true} for all valid token separator characters.
- * This factory also takes a function to normalize chars, e.g., lowercasing them, supplied as method reference or lambda expression.
- * <p>
- * This factory is intended to be used with lambdas or method references. E.g., an elegant way
- * to create an instance which behaves exactly as {@link WhitespaceTokenizer} and {@link LowerCaseFilter} is:
- * <pre class="prettyprint lang-java">
- * Tokenizer tok = CharTokenizer.fromSeparatorCharPredicate(factory, Character::isWhitespace, Character::toLowerCase);
- * </pre>
- */
- public static CharTokenizer fromSeparatorCharPredicate(AttributeFactory factory, final IntPredicate separatorCharPredicate, final IntUnaryOperator normalizer) {
- return fromTokenCharPredicate(factory, separatorCharPredicate.negate(), normalizer);
+ return fromTokenCharPredicate(factory, separatorCharPredicate.negate());
}
private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;
@@ -230,15 +161,6 @@ public abstract class CharTokenizer extends Tokenizer {
*/
protected abstract boolean isTokenChar(int c);
- /**
- * Called on each token character to normalize it before it is added to the
- * token. The default implementation does nothing. Subclasses may use this to,
- * e.g., lowercase tokens.
- */
- protected int normalize(int c) {
- return c;
- }
-
@Override
public final boolean incrementToken() throws IOException {
clearAttributes();
@@ -276,7 +198,7 @@ public abstract class CharTokenizer extends Tokenizer {
buffer = termAtt.resizeBuffer(2+length); // make sure a supplementary fits in the buffer
}
end += charCount;
- length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized
+ length += Character.toChars(c, buffer, length); // buffer it
if (length >= maxTokenLen) { // buffer overflow! make sure to check for >= surrogate pair could break == test
break;
}
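
The fromTokenCharPredicate/fromSeparatorCharPredicate overloads that took an
IntUnaryOperator normalizer are gone; the predicate-only factories remain, and any
normalization moves into the filter chain. A minimal sketch (class and method names
are illustrative):

    import org.apache.lucene.analysis.LowerCaseFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.util.CharTokenizer;

    public class PredicateTokenizerMigration {
      // Old (removed):
      //   CharTokenizer.fromTokenCharPredicate(Character::isLetter, Character::toLowerCase)
      // New: predicate-only tokenizer; lowercasing is a separate TokenFilter.
      static TokenStream lowercasedLetterStream() {
        Tokenizer tokenizer = CharTokenizer.fromTokenCharPredicate(Character::isLetter);
        return new LowerCaseFilter(tokenizer);
      }
    }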
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c0d29759/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory
index 4b37eb8..e8bceff 100644
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory
@@ -15,7 +15,6 @@
org.apache.lucene.analysis.core.KeywordTokenizerFactory
org.apache.lucene.analysis.core.LetterTokenizerFactory
-org.apache.lucene.analysis.core.LowerCaseTokenizerFactory
org.apache.lucene.analysis.core.WhitespaceTokenizerFactory
org.apache.lucene.analysis.ngram.EdgeNGramTokenizerFactory
org.apache.lucene.analysis.ngram.NGramTokenizerFactory
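
With the SPI entry gone, lookups of the "lowercase" tokenizer name now fail; resolve
the letter tokenizer instead and lowercase in the filter chain. A sketch of the lookup
(class name illustrative; "letter" is the SPI name derived from LetterTokenizerFactory):

    import java.util.HashMap;
    import org.apache.lucene.analysis.util.TokenizerFactory;

    public class SpiLookupMigration {
      public static void main(String[] args) {
        // "lowercase" is no longer registered, so this would now throw
        // IllegalArgumentException:
        //   TokenizerFactory.forName("lowercase", new HashMap<>());
        TokenizerFactory letter = TokenizerFactory.forName("letter", new HashMap<>());
        System.out.println(letter.getClass().getSimpleName()); // LetterTokenizerFactory
      }
    }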
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c0d29759/lucene/analysis/common/src/test/org/apache/lucene/analysis/br/TestBrazilianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/br/TestBrazilianAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/br/TestBrazilianAnalyzer.java
index 550a62a..5096ee8 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/br/TestBrazilianAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/br/TestBrazilianAnalyzer.java
@@ -25,7 +25,8 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
-import org.apache.lucene.analysis.core.LowerCaseTokenizer;
+import org.apache.lucene.analysis.core.LetterTokenizer;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
/**
@@ -147,9 +148,9 @@ public class TestBrazilianAnalyzer extends BaseTokenStreamTestCase {
public void testWithKeywordAttribute() throws IOException {
CharArraySet set = new CharArraySet(1, true);
set.add("Brasília");
- Tokenizer tokenizer = new LowerCaseTokenizer();
+ Tokenizer tokenizer = new LetterTokenizer();
tokenizer.setReader(new StringReader("Brasília Brasilia"));
- BrazilianStemFilter filter = new BrazilianStemFilter(new SetKeywordMarkerFilter(tokenizer, set));
+ BrazilianStemFilter filter = new BrazilianStemFilter(new SetKeywordMarkerFilter(new LowerCaseFilter(tokenizer), set));
assertTokenStreamContents(filter, new String[] { "brasília", "brasil" });
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c0d29759/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java
index b7fc18b..8133b7a 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java
@@ -216,14 +216,6 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
int length = highSurEndingLower.length();
assertEquals('\ud801', termBuffer[length - 1]);
}
-
- public void testLowerCaseTokenizer() throws IOException {
- StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
- LowerCaseTokenizer tokenizer = new LowerCaseTokenizer();
- tokenizer.setReader(reader);
- assertTokenStreamContents(tokenizer, new String[] { "tokenizer",
- "\ud801\udc44test" });
- }
public void testWhitespaceTokenizer() throws IOException {
StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c0d29759/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java
index 1fa59d1..a4e1ac5 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java
@@ -31,9 +31,8 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory;
import org.apache.lucene.analysis.charfilter.MappingCharFilterFactory;
+import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
-import org.apache.lucene.analysis.core.LowerCaseTokenizer;
-import org.apache.lucene.analysis.core.LowerCaseTokenizerFactory;
import org.apache.lucene.analysis.core.StopFilterFactory;
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory;
@@ -419,7 +418,7 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
@Override
public Tokenizer create(AttributeFactory factory) {
- return new LowerCaseTokenizer(factory);
+ return new LetterTokenizer(factory);
}
}
@@ -500,14 +499,6 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
.build();
assertEquals(new BytesRef("e f c"), analyzer.normalize("dummy", "a b c"));
}
-
- /** test normalize where the TokenizerFactory returns a filter to normalize the text */
- public void testNormalizationWithLowerCaseTokenizer() throws IOException {
- CustomAnalyzer analyzer1 = CustomAnalyzer.builder()
- .withTokenizer(LowerCaseTokenizerFactory.class, Collections.emptyMap())
- .build();
- assertEquals(new BytesRef("abc"), analyzer1.normalize("dummy", "ABC"));
- }
public void testConditions() throws IOException {
CustomAnalyzer analyzer = CustomAnalyzer.builder()
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c0d29759/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java
index 4c52c0e..3d8be31 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java
@@ -23,7 +23,9 @@ import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
-import org.apache.lucene.analysis.core.LowerCaseTokenizer;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
@@ -38,10 +40,10 @@ public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
public void testWithKeywordAttribute() throws IOException {
CharArraySet set = new CharArraySet( 1, true);
set.add("fischen");
- final LowerCaseTokenizer in = new LowerCaseTokenizer();
+ final Tokenizer in = new LetterTokenizer();
in.setReader(new StringReader("Fischen Trinken"));
GermanStemFilter filter = new GermanStemFilter(
- new SetKeywordMarkerFilter(in, set));
+ new SetKeywordMarkerFilter(new LowerCaseFilter(in), set));
assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c0d29759/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestStandardFactories.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestStandardFactories.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestStandardFactories.java
index 00bc7c6..3f3d5c2 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestStandardFactories.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestStandardFactories.java
@@ -126,17 +126,6 @@ public class TestStandardFactories extends BaseTokenStreamFactoryTestCase {
}
/**
- * Test LowerCaseTokenizerFactory
- */
- public void testLowerCaseTokenizer() throws Exception {
- Reader reader = new StringReader("What's this thing do?");
- Tokenizer stream = tokenizerFactory("LowerCase").create(newAttributeFactory());
- stream.setReader(reader);
- assertTokenStreamContents(stream,
- new String[] { "what", "s", "this", "thing", "do" });
- }
-
- /**
* Ensure the ASCIIFoldingFilterFactory works
*/
public void testASCIIFolding() throws Exception {
@@ -169,11 +158,6 @@ public class TestStandardFactories extends BaseTokenStreamFactoryTestCase {
assertTrue(expected.getMessage().contains("Unknown parameters"));
expected = expectThrows(IllegalArgumentException.class, () -> {
- tokenizerFactory("LowerCase", "bogusArg", "bogusValue");
- });
- assertTrue(expected.getMessage().contains("Unknown parameters"));
-
- expected = expectThrows(IllegalArgumentException.class, () -> {
tokenFilterFactory("ASCIIFolding", "bogusArg", "bogusValue");
});
assertTrue(expected.getMessage().contains("Unknown parameters"));
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c0d29759/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java
index 4596608..2fcda4f 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java
@@ -21,16 +21,12 @@ import java.io.IOException;
import java.io.StringReader;
import java.util.Locale;
-import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
-import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.util.TestUtil;
/**
@@ -54,9 +50,9 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
}
// internal buffer size is 1024 make sure we have a surrogate pair right at the border
builder.insert(1023, "\ud801\udc1c");
- Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory());
+ Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory());
tokenizer.setReader(new StringReader(builder.toString()));
- assertTokenStreamContents(tokenizer, builder.toString().toLowerCase(Locale.ROOT).split(" "));
+ assertTokenStreamContents(new LowerCaseFilter(tokenizer), builder.toString().toLowerCase(Locale.ROOT).split(" "));
}
/*
@@ -72,9 +68,9 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
builder.append("a");
}
builder.append("\ud801\udc1cabc");
- Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory());
+ Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory());
tokenizer.setReader(new StringReader(builder.toString()));
- assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT)});
+ assertTokenStreamContents(new LowerCaseFilter(tokenizer), new String[] {builder.toString().toLowerCase(Locale.ROOT)});
}
}
@@ -87,9 +83,9 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
for (int i = 0; i < 255; i++) {
builder.append("A");
}
- Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory());
+ Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory());
tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
- assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)});
+ assertTokenStreamContents(new LowerCaseFilter(tokenizer), new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)});
}
/*
@@ -101,14 +97,14 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
for (int i = 0; i < 100; i++) {
builder.append("A");
}
- Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory(), 100);
+ Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory(), 100);
// Tricky, passing two copies of the string to the reader....
tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
- assertTokenStreamContents(tokenizer, new String[]{builder.toString().toLowerCase(Locale.ROOT),
+ assertTokenStreamContents(new LowerCaseFilter(tokenizer), new String[]{builder.toString().toLowerCase(Locale.ROOT),
builder.toString().toLowerCase(Locale.ROOT) });
Exception e = expectThrows(IllegalArgumentException.class, () ->
- new LowerCaseTokenizer(newAttributeFactory(), -1));
+ new LetterTokenizer(newAttributeFactory(), -1));
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: -1", e.getMessage());
tokenizer = new LetterTokenizer(newAttributeFactory(), 100);
@@ -134,16 +130,16 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
}
e = expectThrows(IllegalArgumentException.class, () ->
- new LowerCaseTokenizer(newAttributeFactory(), 0));
+ new LetterTokenizer(newAttributeFactory(), 0));
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
e = expectThrows(IllegalArgumentException.class, () ->
- new LowerCaseTokenizer(newAttributeFactory(), 10_000_000));
+ new LetterTokenizer(newAttributeFactory(), 10_000_000));
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 10000000", e.getMessage());
- tokenizer = new LowerCaseTokenizer(newAttributeFactory(), 4800);
+ tokenizer = new LetterTokenizer(newAttributeFactory(), 4800);
tokenizer.setReader(new StringReader(builder.toString()));
- assertTokenStreamContents(tokenizer, new String[]{builder.toString().toLowerCase(Locale.ROOT)});
+ assertTokenStreamContents(new LowerCaseFilter(tokenizer), new String[]{builder.toString().toLowerCase(Locale.ROOT)});
e = expectThrows(IllegalArgumentException.class, () ->
@@ -195,87 +191,9 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
builder.append("A");
}
builder.append("\ud801\udc1c");
- Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory());
+ Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory());
tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
- assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)});
- }
-
- // LUCENE-3642: normalize SMP->BMP and check that offsets are correct
- public void testCrossPlaneNormalization() throws IOException {
- Analyzer analyzer = new Analyzer() {
- @Override
- protected TokenStreamComponents createComponents(String fieldName) {
- Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory()) {
- @Override
- protected int normalize(int c) {
- if (c > 0xffff) {
- return 'δ';
- } else {
- return c;
- }
- }
- };
- return new TokenStreamComponents(tokenizer, tokenizer);
- }
- };
- int num = 1000 * RANDOM_MULTIPLIER;
- for (int i = 0; i < num; i++) {
- String s = TestUtil.randomUnicodeString(random());
- try (TokenStream ts = analyzer.tokenStream("foo", s)) {
- ts.reset();
- OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
- while (ts.incrementToken()) {
- String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
- for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) {
- cp = highlightedText.codePointAt(j);
- assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp));
- }
- }
- ts.end();
- }
- }
- // just for fun
- checkRandomData(random(), analyzer, num);
- analyzer.close();
- }
-
- // LUCENE-3642: normalize BMP->SMP and check that offsets are correct
- public void testCrossPlaneNormalization2() throws IOException {
- Analyzer analyzer = new Analyzer() {
- @Override
- protected TokenStreamComponents createComponents(String fieldName) {
- Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory()) {
- @Override
- protected int normalize(int c) {
- if (c <= 0xffff) {
- return 0x1043C;
- } else {
- return c;
- }
- }
- };
- return new TokenStreamComponents(tokenizer, tokenizer);
- }
- };
- int num = 1000 * RANDOM_MULTIPLIER;
- for (int i = 0; i < num; i++) {
- String s = TestUtil.randomUnicodeString(random());
- try (TokenStream ts = analyzer.tokenStream("foo", s)) {
- ts.reset();
- OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
- while (ts.incrementToken()) {
- String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
- for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) {
- cp = highlightedText.codePointAt(j);
- assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp));
- }
- }
- ts.end();
- }
- }
- // just for fun
- checkRandomData(random(), analyzer, num);
- analyzer.close();
+ assertTokenStreamContents(new LowerCaseFilter(tokenizer), new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)});
}
public void testDefinitionUsingMethodReference1() throws Exception {
@@ -287,16 +205,16 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
public void testDefinitionUsingMethodReference2() throws Exception {
final StringReader reader = new StringReader("Tokenizer(Test)");
- final Tokenizer tokenizer = CharTokenizer.fromTokenCharPredicate(Character::isLetter, Character::toUpperCase);
+ final Tokenizer tokenizer = CharTokenizer.fromTokenCharPredicate(Character::isLetter);
tokenizer.setReader(reader);
- assertTokenStreamContents(tokenizer, new String[] { "TOKENIZER", "TEST" });
+ assertTokenStreamContents(tokenizer, new String[] { "Tokenizer", "Test" });
}
public void testDefinitionUsingLambda() throws Exception {
final StringReader reader = new StringReader("Tokenizer\u00A0Test Foo");
- final Tokenizer tokenizer = CharTokenizer.fromSeparatorCharPredicate(c -> c == '\u00A0' || Character.isWhitespace(c), Character::toLowerCase);
+ final Tokenizer tokenizer = CharTokenizer.fromSeparatorCharPredicate(c -> c == '\u00A0' || Character.isWhitespace(c));
tokenizer.setReader(reader);
- assertTokenStreamContents(tokenizer, new String[] { "tokenizer", "test", "foo" });
+ assertTokenStreamContents(tokenizer, new String[] { "Tokenizer", "Test", "Foo" });
}
}
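
The two deleted cross-plane tests above exercised the normalize() hook that this commit
removes from CharTokenizer. Per-code-point rewriting now belongs in a TokenFilter; below
is a hedged sketch reproducing the SMP-to-BMP mapping from the deleted
testCrossPlaneNormalization. Note that a filter rewrites only the term text, so offsets
keep pointing at the original input:

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    /** Replaces every supplementary-plane code point with 'δ', mirroring the
     *  normalize() override in the deleted test. Illustrative only. */
    final class CrossPlaneMappingFilter extends TokenFilter {
      private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

      CrossPlaneMappingFilter(TokenStream input) {
        super(input);
      }

      @Override
      public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
          return false;
        }
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < termAtt.length(); ) {
          int cp = Character.codePointAt(termAtt.buffer(), i, termAtt.length());
          sb.appendCodePoint(cp > 0xffff ? 'δ' : cp);
          i += Character.charCount(cp);
        }
        termAtt.setEmpty().append(sb);
        return true;
      }
    }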
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c0d29759/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/schema.xml
----------------------------------------------------------------------
diff --git a/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/schema.xml b/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/schema.xml
index 3dbd6aa..475c333 100644
--- a/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/schema.xml
+++ b/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/schema.xml
@@ -126,20 +126,7 @@
<analyzer class="org.apache.lucene.analysis.core.WhitespaceAnalyzer"/>
</fieldType>
- <fieldType name="teststop" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.LowerCaseTokenizerFactory"/>
- <filter class="solr.ClassicFilterFactory"/>
- <filter class="solr.StopFilterFactory" words="stopwords.txt"/>
- </analyzer>
- </fieldType>
-
<!-- fieldTypes in this section isolate tokenizers and tokenfilters for testing -->
- <fieldType name="lowertok" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.LowerCaseTokenizerFactory"/>
- </analyzer>
- </fieldType>
<fieldType name="keywordtok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
@@ -386,8 +373,6 @@
<field name="test_hlt_off" type="highlittext" indexed="true"/>
<!-- fields to test individual tokenizers and tokenfilters -->
- <field name="teststop" type="teststop" indexed="true" stored="true"/>
- <field name="lowertok" type="lowertok" indexed="true" stored="true"/>
<field name="keywordtok" type="keywordtok" indexed="true" stored="true"/>
<field name="standardtok" type="standardtok" indexed="true" stored="true"/>
<field name="HTMLstandardtok" type="HTMLstandardtok" indexed="true" stored="true"/>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c0d29759/solr/core/src/test-files/solr/collection1/conf/schema-copyfield-test.xml
----------------------------------------------------------------------
diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-copyfield-test.xml b/solr/core/src/test-files/solr/collection1/conf/schema-copyfield-test.xml
index f36751e..20dc97a 100644
--- a/solr/core/src/test-files/solr/collection1/conf/schema-copyfield-test.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/schema-copyfield-test.xml
@@ -90,19 +90,7 @@
<analyzer class="org.apache.lucene.analysis.core.WhitespaceAnalyzer"/>
</fieldType>
- <fieldType name="teststop" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.LowerCaseTokenizerFactory"/>
- <filter class="solr.StopFilterFactory" words="stopwords.txt"/>
- </analyzer>
- </fieldType>
-
<!-- fieldTypes in this section isolate tokenizers and tokenfilters for testing -->
- <fieldType name="lowertok" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.LowerCaseTokenizerFactory"/>
- </analyzer>
- </fieldType>
<fieldType name="keywordtok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
@@ -347,8 +335,6 @@
<field name="test_hlt_off" type="highlittext" indexed="true"/>
<!-- fields to test individual tokenizers and tokenfilters -->
- <field name="teststop" type="teststop" indexed="true" stored="true"/>
- <field name="lowertok" type="lowertok" indexed="true" stored="true"/>
<field name="keywordtok" type="keywordtok" indexed="true" stored="true"/>
<field name="standardtok" type="standardtok" indexed="true" stored="true"/>
<field name="HTMLstandardtok" type="HTMLstandardtok" indexed="true" stored="true"/>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c0d29759/solr/core/src/test-files/solr/collection1/conf/schema-folding.xml
----------------------------------------------------------------------
diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-folding.xml b/solr/core/src/test-files/solr/collection1/conf/schema-folding.xml
index 1d20b80..0b13a57 100644
--- a/solr/core/src/test-files/solr/collection1/conf/schema-folding.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/schema-folding.xml
@@ -81,7 +81,8 @@
<fieldType name="text_lower_token" class="solr.TextField">
<analyzer>
- <tokenizer class="solr.LowerCaseTokenizerFactory"/>
+ <tokenizer class="solr.LetterTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
</analyzer>
</fieldType>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c0d29759/solr/core/src/test-files/solr/collection1/conf/schema-hash.xml
----------------------------------------------------------------------
diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-hash.xml b/solr/core/src/test-files/solr/collection1/conf/schema-hash.xml
index 3e8aa15..c2d6b39 100644
--- a/solr/core/src/test-files/solr/collection1/conf/schema-hash.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/schema-hash.xml
@@ -139,18 +139,7 @@
<analyzer class="org.apache.lucene.analysis.core.WhitespaceAnalyzer"/>
</fieldtype>
- <fieldtype name="teststop" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.LowerCaseTokenizerFactory"/>
- </analyzer>
- </fieldtype>
-
<!-- fieldtypes in this section isolate tokenizers and tokenfilters for testing -->
- <fieldtype name="lowertok" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.LowerCaseTokenizerFactory"/>
- </analyzer>
- </fieldtype>
<fieldtype name="keywordtok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
@@ -484,8 +473,6 @@
termPositions="true" termOffsets="true"/>
<!-- fields to test individual tokenizers and tokenfilters -->
- <field name="teststop" type="teststop" indexed="true" stored="true"/>
- <field name="lowertok" type="lowertok" indexed="true" stored="true"/>
<field name="keywordtok" type="keywordtok" indexed="true" stored="true"/>
<field name="standardtok" type="standardtok" indexed="true" stored="true"/>
<field name="HTMLstandardtok" type="HTMLstandardtok" indexed="true" stored="true"/>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c0d29759/solr/core/src/test-files/solr/collection1/conf/schema-required-fields.xml
----------------------------------------------------------------------
diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-required-fields.xml b/solr/core/src/test-files/solr/collection1/conf/schema-required-fields.xml
index 4210d5b..0ac0c04 100644
--- a/solr/core/src/test-files/solr/collection1/conf/schema-required-fields.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/schema-required-fields.xml
@@ -73,19 +73,7 @@
<analyzer class="org.apache.lucene.analysis.core.WhitespaceAnalyzer"/>
</fieldType>
- <fieldType name="teststop" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.LowerCaseTokenizerFactory"/>
- <filter class="solr.StopFilterFactory" words="stopwords.txt"/>
- </analyzer>
- </fieldType>
-
<!-- fieldTypes in this section isolate tokenizers and tokenfilters for testing -->
- <fieldType name="lowertok" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.LowerCaseTokenizerFactory"/>
- </analyzer>
- </fieldType>
<fieldType name="keywordtok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
@@ -331,8 +319,6 @@
<field name="test_hlt_off" type="highlittext" indexed="true"/>
<!-- fields to test individual tokenizers and tokenfilters -->
- <field name="teststop" type="teststop" indexed="true" stored="true"/>
- <field name="lowertok" type="lowertok" indexed="true" stored="true"/>
<field name="keywordtok" type="keywordtok" indexed="true" stored="true"/>
<field name="standardtok" type="standardtok" indexed="true" stored="true"/>
<field name="HTMLstandardtok" type="HTMLstandardtok" indexed="true" stored="true"/>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c0d29759/solr/core/src/test-files/solr/collection1/conf/schema-rest.xml
----------------------------------------------------------------------
diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-rest.xml b/solr/core/src/test-files/solr/collection1/conf/schema-rest.xml
index 2a04356..46b735c 100644
--- a/solr/core/src/test-files/solr/collection1/conf/schema-rest.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/schema-rest.xml
@@ -199,17 +199,13 @@
<fieldType name="teststop" class="solr.TextField">
<analyzer>
- <tokenizer class="solr.LowerCaseTokenizerFactory"/>
+ <tokenizer class="solr.LetterTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt"/>
</analyzer>
</fieldType>
<!-- fieldTypes in this section isolate tokenizers and tokenfilters for testing -->
- <fieldType name="lowertok" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.LowerCaseTokenizerFactory"/>
- </analyzer>
- </fieldType>
<fieldType name="keywordtok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
@@ -604,7 +600,6 @@
<!-- fields to test individual tokenizers and tokenfilters -->
<field name="teststop" type="teststop" indexed="true" stored="true"/>
- <field name="lowertok" type="lowertok" indexed="true" stored="true"/>
<field name="keywordtok" type="keywordtok" indexed="true" stored="true"/>
<field name="standardtok" type="standardtok" indexed="true" stored="true"/>
<field name="HTMLstandardtok" type="HTMLstandardtok" indexed="true" stored="true"/>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c0d29759/solr/core/src/test-files/solr/collection1/conf/schema-sql.xml
----------------------------------------------------------------------
diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-sql.xml b/solr/core/src/test-files/solr/collection1/conf/schema-sql.xml
index 40bbe5a..03d9d7e 100644
--- a/solr/core/src/test-files/solr/collection1/conf/schema-sql.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/schema-sql.xml
@@ -149,18 +149,7 @@
<analyzer class="org.apache.lucene.analysis.core.WhitespaceAnalyzer"/>
</fieldtype>
- <fieldtype name="teststop" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.LowerCaseTokenizerFactory"/>
- </analyzer>
- </fieldtype>
-
<!-- fieldtypes in this section isolate tokenizers and tokenfilters for testing -->
- <fieldtype name="lowertok" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.LowerCaseTokenizerFactory"/>
- </analyzer>
- </fieldtype>
<fieldtype name="keywordtok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
@@ -501,8 +490,6 @@
termPositions="true" termOffsets="true"/>
<!-- fields to test individual tokenizers and tokenfilters -->
- <field name="teststop" type="teststop" indexed="true" stored="true"/>
- <field name="lowertok" type="lowertok" indexed="true" stored="true"/>
<field name="keywordtok" type="keywordtok" indexed="true" stored="true"/>
<field name="standardtok" type="standardtok" indexed="true" stored="true"/>
<field name="HTMLstandardtok" type="HTMLstandardtok" indexed="true" stored="true"/>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c0d29759/solr/core/src/test-files/solr/collection1/conf/schema-tokenizer-test.xml
----------------------------------------------------------------------
diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-tokenizer-test.xml b/solr/core/src/test-files/solr/collection1/conf/schema-tokenizer-test.xml
index 6c33504..5613c66 100644
--- a/solr/core/src/test-files/solr/collection1/conf/schema-tokenizer-test.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/schema-tokenizer-test.xml
@@ -43,7 +43,8 @@ more concise example.
<fieldType name="lowerCasefieldType" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
<analyzer type="index">
- <tokenizer class="solr.LowerCaseTokenizerFactory" maxTokenLen="3" />
+ <tokenizer class="solr.LetterTokenizerFactory" maxTokenLen="3" />
+ <filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
@@ -85,12 +86,6 @@ more concise example.
</analyzer>
</fieldType>
- <fieldType name="lowerCase0fieldType" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
- <analyzer>
- <tokenizer class="solr.LowerCaseTokenizerFactory" maxTokenLen="3" />
- </analyzer>
- </fieldType>
-
<fieldType name="whiteSp0fieldType" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory" maxTokenLen="3" />
@@ -112,13 +107,11 @@ more concise example.
<field name="id" type="string" indexed="true" stored="true" multiValued="false" required="true"/>
<field name="letter" type="letterfieldType" indexed="true" stored="true"/>
- <field name="lowerCase" type="lowerCasefieldType" indexed="true" stored="true"/>
<field name="whiteSpace" type="whiteSpfieldType" indexed="true" stored="true"/>
<field name="unicodeWhiteSpace" type="uniWhiteSpfieldType" indexed="true" stored="true"/>
<field name="keyword" type="keywordfieldType" indexed="true" stored="true"/>
<field name="letter0" type="letter0fieldType" indexed="true" stored="true"/>
- <field name="lowerCase0" type="lowerCase0fieldType" indexed="true" stored="true"/>
<field name="whiteSpace0" type="whiteSp0fieldType" indexed="true" stored="true"/>
<field name="unicodeWhiteSpace0" type="uniWhiteSp0fieldType" indexed="true" stored="true"/>
<field name="keyword0" type="keyword0fieldType" indexed="true" stored="true"/>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c0d29759/solr/core/src/test-files/solr/collection1/conf/schema.xml
----------------------------------------------------------------------
diff --git a/solr/core/src/test-files/solr/collection1/conf/schema.xml b/solr/core/src/test-files/solr/collection1/conf/schema.xml
index b1a261b..b61bbb1 100644
--- a/solr/core/src/test-files/solr/collection1/conf/schema.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/schema.xml
@@ -142,20 +142,17 @@
<fieldType name="teststop" class="solr.TextField">
<analyzer type="index">
- <tokenizer class="solr.LowerCaseTokenizerFactory"/>
+ <tokenizer class="solr.LetterTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt"/>
</analyzer>
<analyzer type="query">
- <tokenizer class="solr.LowerCaseTokenizerFactory"/>
+ <tokenizer class="solr.LetterTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<!-- fieldTypes in this section isolate tokenizers and tokenfilters for testing -->
- <fieldType name="lowertok" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.LowerCaseTokenizerFactory"/>
- </analyzer>
- </fieldType>
<fieldType name="keywordtok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
@@ -574,7 +571,7 @@
<!-- fields to test individual tokenizers and tokenfilters -->
<field name="teststop" type="teststop" indexed="true" stored="true"/>
- <field name="lowertok" type="lowertok" indexed="true" stored="true"/>
+ <field name="lowertok" type="lowerfilt" indexed="true" stored="true"/>
<field name="keywordtok" type="keywordtok" indexed="true" stored="true"/>
<field name="standardtok" type="standardtok" indexed="true" stored="true"/>
<field name="HTMLstandardtok" type="HTMLstandardtok" indexed="true" stored="true"/>
@@ -636,7 +633,7 @@
<field name="store" type="location" indexed="true" stored="true" omitNorms="false"/>
- <field name="lower" type="lowertok" indexed="false" stored="true" multiValued="true"/>
+ <field name="lower" type="lowerfilt" indexed="false" stored="true" multiValued="true"/>
<field name="_route_" type="string" indexed="true" stored="true" multiValued="false"/>
<field name="payloadDelimited" type="payloadDelimited"/>
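The rewritten teststop type above is the canonical LUCENE-8498 migration: LowerCaseTokenizerFactory
becomes LetterTokenizerFactory followed by LowerCaseFilterFactory, with the stop filter left
unchanged. A rough programmatic equivalent (class name illustrative; assumes stopwords.txt is
reachable through the builder's default classpath resource loader):

    import java.io.IOException;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.custom.CustomAnalyzer;

    public class TestStopReplacement {
      // letter -> lowercase -> stop mirrors the updated <analyzer> chain;
      // the stop filter loads stopwords.txt via the classpath resource loader.
      public static Analyzer build() throws IOException {
        return CustomAnalyzer.builder()
            .withTokenizer("letter")
            .addTokenFilter("lowercase")
            .addTokenFilter("stop", "words", "stopwords.txt")
            .build();
      }
    }
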
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c0d29759/solr/core/src/test-files/solr/collection1/conf/schema12.xml
----------------------------------------------------------------------
diff --git a/solr/core/src/test-files/solr/collection1/conf/schema12.xml b/solr/core/src/test-files/solr/collection1/conf/schema12.xml
index 8947676..e4c3ad2 100644
--- a/solr/core/src/test-files/solr/collection1/conf/schema12.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/schema12.xml
@@ -234,17 +234,13 @@
<fieldType name="teststop" class="solr.TextField">
<analyzer>
- <tokenizer class="solr.LowerCaseTokenizerFactory"/>
+ <tokenizer class="solr.LetterTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt"/>
</analyzer>
</fieldType>
<!-- fieldTypes in this section isolate tokenizers and tokenfilters for testing -->
- <fieldType name="lowertok" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.LowerCaseTokenizerFactory"/>
- </analyzer>
- </fieldType>
<fieldType name="keywordtok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
@@ -591,7 +587,6 @@
<!-- fields to test individual tokenizers and tokenfilters -->
<field name="teststop" type="teststop" indexed="true" stored="true"/>
- <field name="lowertok" type="lowertok" indexed="true" stored="true"/>
<field name="keywordtok" type="keywordtok" indexed="true" stored="true"/>
<field name="standardtok" type="standardtok" indexed="true" stored="true"/>
<field name="HTMLstandardtok" type="HTMLstandardtok" indexed="true" stored="true"/>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c0d29759/solr/core/src/test-files/solr/collection1/conf/schema15.xml
----------------------------------------------------------------------
diff --git a/solr/core/src/test-files/solr/collection1/conf/schema15.xml b/solr/core/src/test-files/solr/collection1/conf/schema15.xml
index 80d19e9..361344f 100644
--- a/solr/core/src/test-files/solr/collection1/conf/schema15.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/schema15.xml
@@ -163,19 +163,7 @@
<analyzer class="org.apache.lucene.analysis.core.WhitespaceAnalyzer"/>
</fieldType>
- <fieldType name="teststop" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.LowerCaseTokenizerFactory"/>
- <filter class="solr.StopFilterFactory" words="stopwords.txt"/>
- </analyzer>
- </fieldType>
-
<!-- fieldTypes in this section isolate tokenizers and tokenfilters for testing -->
- <fieldType name="lowertok" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.LowerCaseTokenizerFactory"/>
- </analyzer>
- </fieldType>
<fieldType name="keywordtok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
@@ -505,8 +493,6 @@
<field name="test_hlt_off" type="highlittext" indexed="true"/>
<!-- fields to test individual tokenizers and tokenfilters -->
- <field name="teststop" type="teststop" indexed="true" stored="true"/>
- <field name="lowertok" type="lowertok" indexed="true" stored="true"/>
<field name="keywordtok" type="keywordtok" indexed="true" stored="true"/>
<field name="standardtok" type="standardtok" indexed="true" stored="true"/>
<field name="HTMLstandardtok" type="HTMLstandardtok" indexed="true" stored="true"/>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c0d29759/solr/core/src/test-files/solr/collection1/conf/schemasurround.xml
----------------------------------------------------------------------
diff --git a/solr/core/src/test-files/solr/collection1/conf/schemasurround.xml b/solr/core/src/test-files/solr/collection1/conf/schemasurround.xml
index 213acc7..93b11ed 100644
--- a/solr/core/src/test-files/solr/collection1/conf/schemasurround.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/schemasurround.xml
@@ -164,19 +164,7 @@
<analyzer class="org.apache.lucene.analysis.core.WhitespaceAnalyzer"/>
</fieldType>
- <fieldType name="teststop" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.LowerCaseTokenizerFactory"/>
- <filter class="solr.StopFilterFactory" words="stopwords.txt"/>
- </analyzer>
- </fieldType>
-
<!-- fieldTypes in this section isolate tokenizers and tokenfilters for testing -->
- <fieldType name="lowertok" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.LowerCaseTokenizerFactory"/>
- </analyzer>
- </fieldType>
<fieldType name="keywordtok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
@@ -517,8 +505,6 @@
<field name="test_hlt_off" type="highlittext" indexed="true"/>
<!-- fields to test individual tokenizers and tokenfilters -->
- <field name="teststop" type="teststop" indexed="true" stored="true"/>
- <field name="lowertok" type="lowertok" indexed="true" stored="true"/>
<field name="keywordtok" type="keywordtok" indexed="true" stored="true"/>
<field name="standardtok" type="standardtok" indexed="true" stored="true"/>
<field name="HTMLstandardtok" type="HTMLstandardtok" indexed="true" stored="true"/>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c0d29759/solr/core/src/test/org/apache/solr/rest/schema/TestFieldCollectionResource.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/rest/schema/TestFieldCollectionResource.java b/solr/core/src/test/org/apache/solr/rest/schema/TestFieldCollectionResource.java
index 31fa9f5..bdd3cd2 100644
--- a/solr/core/src/test/org/apache/solr/rest/schema/TestFieldCollectionResource.java
+++ b/solr/core/src/test/org/apache/solr/rest/schema/TestFieldCollectionResource.java
@@ -77,11 +77,11 @@ public class TestFieldCollectionResource extends SolrRestletTestBase {
"/fields/[0]/name=='HTMLstandardtok'",
"/fields/[1]/name=='HTMLwhitetok'",
"/fields/[2]/name=='_version_'",
- "/fields/[108]/name=='*_d'",
- "/fields/[107]/name=='*_f'",
- "/fields/[106]/name=='*_b'",
- "/fields/[105]/name=='*_t'",
- "/fields/[104]/name=='*_l'"
+ "/fields/[107]/name=='*_d'",
+ "/fields/[106]/name=='*_f'",
+ "/fields/[105]/name=='*_b'",
+ "/fields/[104]/name=='*_t'",
+ "/fields/[103]/name=='*_l'"
);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c0d29759/solr/core/src/test/org/apache/solr/rest/schema/TestFieldTypeResource.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/rest/schema/TestFieldTypeResource.java b/solr/core/src/test/org/apache/solr/rest/schema/TestFieldTypeResource.java
index ea19af0..08a3f1b 100644
--- a/solr/core/src/test/org/apache/solr/rest/schema/TestFieldTypeResource.java
+++ b/solr/core/src/test/org/apache/solr/rest/schema/TestFieldTypeResource.java
@@ -81,7 +81,8 @@ public class TestFieldTypeResource extends SolrRestletTestBase {
"count(/response/lst[@name='fieldType']/*) = 3",
"/response/lst[@name='fieldType']/str[@name='name'] = 'teststop'",
"/response/lst[@name='fieldType']/str[@name='class'] = 'solr.TextField'",
- "/response/lst[@name='fieldType']/lst[@name='analyzer']/lst[@name='tokenizer']/str[@name='class'] = 'solr.LowerCaseTokenizerFactory'",
+ "/response/lst[@name='fieldType']/lst[@name='analyzer']/lst[@name='tokenizer']/str[@name='class'] = 'solr.LetterTokenizerFactory'",
+ "/response/lst[@name='fieldType']/lst[@name='analyzer']/arr[@name='filters']/lst/str[@name='class'][.='solr.LowerCaseFilterFactory']",
"/response/lst[@name='fieldType']/lst[@name='analyzer']/arr[@name='filters']/lst/str[@name='class'][.='solr.StopFilterFactory']",
"/response/lst[@name='fieldType']/lst[@name='analyzer']/arr[@name='filters']/lst/str[@name='words'][.='stopwords.txt']"
);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c0d29759/solr/core/src/test/org/apache/solr/util/TestMaxTokenLenTokenizer.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/util/TestMaxTokenLenTokenizer.java b/solr/core/src/test/org/apache/solr/util/TestMaxTokenLenTokenizer.java
index c7e0dc3..f66c03e 100644
--- a/solr/core/src/test/org/apache/solr/util/TestMaxTokenLenTokenizer.java
+++ b/solr/core/src/test/org/apache/solr/util/TestMaxTokenLenTokenizer.java
@@ -22,7 +22,6 @@ import org.junit.BeforeClass;
/**
* Tests for:
- * {@link org.apache.lucene.analysis.core.LowerCaseTokenizerFactory}
* {@link org.apache.lucene.analysis.core.LetterTokenizerFactory}
* {@link org.apache.lucene.analysis.core.KeywordTokenizerFactory}
* {@link org.apache.lucene.analysis.core.WhitespaceTokenizerFactory}
@@ -44,25 +43,18 @@ public class TestMaxTokenLenTokenizer extends SolrTestCaseJ4 {
// using fields with definitions, different tokenizer factories respectively at index time and standard tokenizer at query time.
updateJ("{\"add\":{\"doc\": {\"id\":1,\"letter\":\"letter\"}},\"commit\":{}}",null);
- updateJ("{\"add\":{\"doc\": {\"id\":2,\"lowerCase\":\"lowerCase\"}},\"commit\":{}}",null);
updateJ("{\"add\":{\"doc\": {\"id\":3,\"whiteSpace\":\"whiteSpace in\"}},\"commit\":{}}",null);
updateJ("{\"add\":{\"doc\": {\"id\":4,\"unicodeWhiteSpace\":\"unicode in\"}},\"commit\":{}}",null);
updateJ("{\"add\":{\"doc\": {\"id\":5,\"keyword\":\"keyword\"}},\"commit\":{}}",null);
assertU(commit());
- assertQ("Check the total number of docs", req("q","*:*"), "//result[@numFound=5]");
+ assertQ("Check the total number of docs", req("q","*:*"), "//result[@numFound=4]");
//Tokens generated for "letter": "let" "ter" "letter" , maxTokenLen=3
assertQ("Check the total number of docs", req("q","letter:let"), "//result[@numFound=1]");
assertQ("Check the total number of docs", req("q","letter:lett"), "//result[@numFound=0]");
- //Tokens generated for "lowerCase": "low" "erC" "ase" "lowerCase" , maxTokenLen=3
- assertQ("Check the total number of docs", req("q","lowerCase:low"), "//result[@numFound=1]");
- assertQ("Check the total number of docs", req("q","lowerCase:l"), "//result[@numFound=0]");
- assertQ("Check the total number of docs", req("q","lowerCase:lo"), "//result[@numFound=0]");
- assertQ("Check the total number of docs", req("q","lowerCase:lower"), "//result[@numFound=0]");
-
//Tokens generated for "whiteSpace in": "whi" "teS" "pac" "e" "in" "whiteSpace" , maxTokenLen=3
assertQ("Check the total number of docs", req("q","whiteSpace:whi"), "//result[@numFound=1]");
assertQ("Check the total number of docs", req("q","whiteSpace:teS"), "//result[@numFound=1]");
@@ -88,14 +80,13 @@ public class TestMaxTokenLenTokenizer extends SolrTestCaseJ4 {
// using fields with definitions, same tokenizers both at index and query time.
updateJ("{\"add\":{\"doc\": {\"id\":1,\"letter0\":\"letter\"}},\"commit\":{}}",null);
- updateJ("{\"add\":{\"doc\": {\"id\":2,\"lowerCase0\":\"lowerCase\"}},\"commit\":{}}",null);
updateJ("{\"add\":{\"doc\": {\"id\":3,\"whiteSpace0\":\"whiteSpace in\"}},\"commit\":{}}",null);
updateJ("{\"add\":{\"doc\": {\"id\":4,\"unicodeWhiteSpace0\":\"unicode in\"}},\"commit\":{}}",null);
updateJ("{\"add\":{\"doc\": {\"id\":5,\"keyword0\":\"keyword\"}},\"commit\":{}}",null);
assertU(commit());
- assertQ("Check the total number of docs", req("q","*:*"), "//result[@numFound=5]");
+ assertQ("Check the total number of docs", req("q","*:*"), "//result[@numFound=4]");
//Tokens generated for "letter": "let" "ter" "letter" , maxTokenLen=3
// Anything that matches the first three letters should be found when maxLen=3
@@ -104,13 +95,6 @@ public class TestMaxTokenLenTokenizer extends SolrTestCaseJ4 {
assertQ("Check the total number of docs", req("q","letter0:lett"), "//result[@numFound=1]");
assertQ("Check the total number of docs", req("q","letter0:letXYZ"), "//result[@numFound=1]");
- //Tokens generated for "lowerCase": "low" "erC" "ase" "lowerCase" , maxTokenLen=3
- // Anything that matches the first three letters should be found when maxLen=3
- assertQ("Check the total number of docs", req("q","lowerCase0:low"), "//result[@numFound=1]");
- assertQ("Check the total number of docs", req("q","lowerCase0:l"), "//result[@numFound=0]");
- assertQ("Check the total number of docs", req("q","lowerCase0:lo"), "//result[@numFound=0]");
- assertQ("Check the total number of docs", req("q","lowerCase0:lowerXYZ"), "//result[@numFound=1]");
-
//Tokens generated for "whiteSpace in": "whi" "teS" "pac" "e" "in" "whiteSpace" , maxTokenLen=3
// Anything that matches the first three letters should be found when maxLen=3
assertQ("Check the total number of docs", req("q","whiteSpace0:h"), "//result[@numFound=0]");
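The surviving assertions above all rely on the same chunking rule: with maxTokenLen=3 a
CharTokenizer emits at most three characters per token, so "letter" is indexed as "let" and
"ter". A standalone sketch of that behaviour, assuming LetterTokenizer's
(AttributeFactory, maxTokenLen) constructor that accompanies the maxTokenLen factory parameter:

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.core.LetterTokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.util.AttributeFactory;

    public class MaxTokenLenSketch {
      public static void main(String[] args) throws IOException {
        // maxTokenLen=3: "letter" comes out as "let", then "ter".
        LetterTokenizer tok =
            new LetterTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, 3);
        tok.setReader(new StringReader("letter"));
        CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
        tok.reset();
        while (tok.incrementToken()) {
          System.out.println(term.toString());
        }
        tok.end();
        tok.close();
      }
    }

This is also why "letter:let" matches above while "letter:lett" does not: no four-character
token is ever written to the index.
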
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c0d29759/solr/solrj/src/test-files/solrj/solr/collection1/conf/schema-sql.xml
----------------------------------------------------------------------
diff --git a/solr/solrj/src/test-files/solrj/solr/collection1/conf/schema-sql.xml b/solr/solrj/src/test-files/solrj/solr/collection1/conf/schema-sql.xml
index 3a1f328..974893c 100644
--- a/solr/solrj/src/test-files/solrj/solr/collection1/conf/schema-sql.xml
+++ b/solr/solrj/src/test-files/solrj/solr/collection1/conf/schema-sql.xml
@@ -141,18 +141,7 @@
<analyzer class="org.apache.lucene.analysis.core.WhitespaceAnalyzer"/>
</fieldtype>
- <fieldtype name="teststop" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.LowerCaseTokenizerFactory"/>
- </analyzer>
- </fieldtype>
-
<!-- fieldtypes in this section isolate tokenizers and tokenfilters for testing -->
- <fieldtype name="lowertok" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.LowerCaseTokenizerFactory"/>
- </analyzer>
- </fieldtype>
<fieldtype name="keywordtok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
@@ -493,8 +482,6 @@
termPositions="true" termOffsets="true"/>
<!-- fields to test individual tokenizers and tokenfilters -->
- <field name="teststop" type="teststop" indexed="true" stored="true"/>
- <field name="lowertok" type="lowertok" indexed="true" stored="true"/>
<field name="keywordtok" type="keywordtok" indexed="true" stored="true"/>
<field name="standardtok" type="standardtok" indexed="true" stored="true"/>
<field name="HTMLstandardtok" type="HTMLstandardtok" indexed="true" stored="true"/>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c0d29759/solr/solrj/src/test-files/solrj/solr/collection1/conf/schema.xml
----------------------------------------------------------------------
diff --git a/solr/solrj/src/test-files/solrj/solr/collection1/conf/schema.xml b/solr/solrj/src/test-files/solrj/solr/collection1/conf/schema.xml
index 02b5053..079a35f 100644
--- a/solr/solrj/src/test-files/solrj/solr/collection1/conf/schema.xml
+++ b/solr/solrj/src/test-files/solrj/solr/collection1/conf/schema.xml
@@ -116,18 +116,7 @@
<analyzer class="org.apache.lucene.analysis.core.WhitespaceAnalyzer"/>
</fieldType>
- <fieldType name="teststop" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.LowerCaseTokenizerFactory"/>
- </analyzer>
- </fieldType>
-
<!-- fieldTypes in this section isolate tokenizers and tokenfilters for testing -->
- <fieldType name="lowertok" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.LowerCaseTokenizerFactory"/>
- </analyzer>
- </fieldType>
<fieldType name="keywordtok" class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
@@ -461,8 +450,6 @@
termPositions="true" termOffsets="true"/>
<!-- fields to test individual tokenizers and tokenfilters -->
- <field name="teststop" type="teststop" indexed="true" stored="true"/>
- <field name="lowertok" type="lowertok" indexed="true" stored="true"/>
<field name="keywordtok" type="keywordtok" indexed="true" stored="true"/>
<field name="standardtok" type="standardtok" indexed="true" stored="true"/>
<field name="HTMLstandardtok" type="HTMLstandardtok" indexed="true" stored="true"/>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c0d29759/solr/solrj/src/test-files/solrj/solr/configsets/streaming/conf/schema.xml
----------------------------------------------------------------------
diff --git a/solr/solrj/src/test-files/solrj/solr/configsets/streaming/conf/schema.xml b/solr/solrj/src/test-files/solrj/solr/configsets/streaming/conf/schema.xml
index aa96296..6cd4f91 100644
--- a/solr/solrj/src/test-files/solrj/solr/configsets/streaming/conf/schema.xml
+++ b/solr/solrj/src/test-files/solrj/solr/configsets/streaming/conf/schema.xml
@@ -137,16 +137,7 @@
<analyzer class="org.apache.lucene.analysis.core.WhitespaceAnalyzer"/>
</fieldtype>
- <fieldtype name="teststop" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.LowerCaseTokenizerFactory"/>
- </analyzer>
- </fieldtype>
-
<!-- fieldtypes in this section isolate tokenizers and tokenfilters for testing -->
- <fieldtype name="lowertok" class="solr.TextField">
- <analyzer><tokenizer class="solr.LowerCaseTokenizerFactory"/></analyzer>
- </fieldtype>
<fieldtype name="keywordtok" class="solr.TextField">
<analyzer><tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/></analyzer>
</fieldtype>
@@ -479,8 +470,6 @@
termPositions="true" termOffsets="true"/>
<!-- fields to test individual tokenizers and tokenfilters -->
- <field name="teststop" type="teststop" indexed="true" stored="true"/>
- <field name="lowertok" type="lowertok" indexed="true" stored="true"/>
<field name="keywordtok" type="keywordtok" indexed="true" stored="true"/>
<field name="standardtok" type="standardtok" indexed="true" stored="true"/>
<field name="HTMLstandardtok" type="HTMLstandardtok" indexed="true" stored="true"/>