You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by er...@apache.org on 2017/05/28 22:38:07 UTC
lucene-solr:master: LUCENE-7705: Allow CharTokenizer-derived
tokenizers and KeywordTokenizer to configure the max token length
Repository: lucene-solr
Updated Branches:
refs/heads/master bc973ecdc -> 906679adc
LUCENE-7705: Allow CharTokenizer-derived tokenizers and KeywordTokenizer to configure the max token length
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/906679ad
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/906679ad
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/906679ad
Branch: refs/heads/master
Commit: 906679adc80f0fad1e5c311b03023c7bd95633d7
Parents: bc973ec
Author: Erick Erickson <er...@apache.org>
Authored: Sun May 28 15:18:48 2017 -0700
Committer: Erick Erickson <er...@apache.org>
Committed: Sun May 28 15:18:48 2017 -0700
----------------------------------------------------------------------
.../lucene/analysis/core/KeywordTokenizer.java | 10 +-
.../analysis/core/KeywordTokenizerFactory.java | 19 ++-
.../lucene/analysis/core/LetterTokenizer.java | 14 ++
.../analysis/core/LetterTokenizerFactory.java | 19 ++-
.../analysis/core/LowerCaseTokenizer.java | 13 ++
.../core/LowerCaseTokenizerFactory.java | 37 +++--
.../core/UnicodeWhitespaceTokenizer.java | 13 ++
.../analysis/core/WhitespaceTokenizer.java | 13 ++
.../core/WhitespaceTokenizerFactory.java | 18 ++-
.../lucene/analysis/util/CharTokenizer.java | 27 +++-
.../analysis/core/TestKeywordTokenizer.java | 88 +++++++++++
.../core/TestUnicodeWhitespaceTokenizer.java | 51 +++++++
.../analysis/util/TestCharTokenizers.java | 95 ++++++++++++
solr/CHANGES.txt | 3 +
.../collection1/conf/schema-tokenizer-test.xml | 150 +++++++++++++++++++
.../solr/util/TestMaxTokenLenTokenizer.java | 135 +++++++++++++++++
16 files changed, 680 insertions(+), 25 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/906679ad/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java
index 209ecee..eb08eea 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java
@@ -24,6 +24,8 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeFactory;
+import static org.apache.lucene.analysis.standard.StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT;
+
/**
* Emits the entire input as a single token.
*/
@@ -41,16 +43,16 @@ public final class KeywordTokenizer extends Tokenizer {
}
public KeywordTokenizer(int bufferSize) {
- if (bufferSize <= 0) {
- throw new IllegalArgumentException("bufferSize must be > 0");
+ if (bufferSize > MAX_TOKEN_LENGTH_LIMIT || bufferSize <= 0) {
+ throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + bufferSize);
}
termAtt.resizeBuffer(bufferSize);
}
public KeywordTokenizer(AttributeFactory factory, int bufferSize) {
super(factory);
- if (bufferSize <= 0) {
- throw new IllegalArgumentException("bufferSize must be > 0");
+ if (bufferSize > MAX_TOKEN_LENGTH_LIMIT || bufferSize <= 0) {
+ throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + bufferSize);
}
termAtt.resizeBuffer(bufferSize);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/906679ad/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizerFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizerFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizerFactory.java
index 3654f67..86f65d6 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizerFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizerFactory.java
@@ -16,26 +16,39 @@
*/
package org.apache.lucene.analysis.core;
-
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
import java.util.Map;
+import static org.apache.lucene.analysis.standard.StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT;
+
/**
* Factory for {@link KeywordTokenizer}.
* <pre class="prettyprint">
* <fieldType name="text_keyword" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
- * <tokenizer class="solr.KeywordTokenizerFactory"/>
+ * <tokenizer class="solr.KeywordTokenizerFactory" maxTokenLen="256"/>
* </analyzer>
* </fieldType></pre>
+ *
+ * Options:
+ * <ul>
+ * <li>maxTokenLen: max token length, should be greater than 0 and less than
+ * MAX_TOKEN_LENGTH_LIMIT (1024*1024). It is rare to need to change this
+ * else {@link KeywordTokenizer}::DEFAULT_BUFFER_SIZE</li>
+ * </ul>
*/
public class KeywordTokenizerFactory extends TokenizerFactory {
+ private final int maxTokenLen;
/** Creates a new KeywordTokenizerFactory */
public KeywordTokenizerFactory(Map<String,String> args) {
super(args);
+ maxTokenLen = getInt(args, "maxTokenLen", KeywordTokenizer.DEFAULT_BUFFER_SIZE);
+ if (maxTokenLen > MAX_TOKEN_LENGTH_LIMIT || maxTokenLen <= 0) {
+ throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + maxTokenLen);
+ }
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@@ -43,6 +56,6 @@ public class KeywordTokenizerFactory extends TokenizerFactory {
@Override
public KeywordTokenizer create(AttributeFactory factory) {
- return new KeywordTokenizer(factory, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
+ return new KeywordTokenizer(factory, maxTokenLen);
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/906679ad/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizer.java
index df41b37..8fb7d0e 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizer.java
@@ -50,6 +50,20 @@ public class LetterTokenizer extends CharTokenizer {
super(factory);
}
+ /**
+ * Construct a new LetterTokenizer using a given
+ * {@link org.apache.lucene.util.AttributeFactory}.
+ *
+ * @param factory the attribute factory to use for this {@link Tokenizer}
+ * @param maxTokenLen maximum token length the tokenizer will emit.
+ * Must be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024)
+ * @throws IllegalArgumentException if maxTokenLen is invalid.
+
+ */
+ public LetterTokenizer(AttributeFactory factory, int maxTokenLen) {
+ super(factory, maxTokenLen);
+ }
+
/** Collects only characters which satisfy
* {@link Character#isLetter(int)}.*/
@Override
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/906679ad/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizerFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizerFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizerFactory.java
index 828d6cf..41ada68 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizerFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizerFactory.java
@@ -17,25 +17,40 @@
package org.apache.lucene.analysis.core;
+import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
import java.util.Map;
+import static org.apache.lucene.analysis.standard.StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT;
+
/**
* Factory for {@link LetterTokenizer}.
* <pre class="prettyprint">
* <fieldType name="text_letter" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
- * <tokenizer class="solr.LetterTokenizerFactory"/>
+ * <tokenizer class="solr.LetterTokenizerFactory" maxTokenLen="256"/>
* </analyzer>
* </fieldType></pre>
+ *
+ * Options:
+ * <ul>
+ * <li>maxTokenLen: max token length, must be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024).
+ * It is rare to need to change this
+ * else {@link CharTokenizer}::DEFAULT_MAX_TOKEN_LEN</li>
+ * </ul>
*/
public class LetterTokenizerFactory extends TokenizerFactory {
+ private final int maxTokenLen;
/** Creates a new LetterTokenizerFactory */
public LetterTokenizerFactory(Map<String,String> args) {
super(args);
+ maxTokenLen = getInt(args, "maxTokenLen", CharTokenizer.DEFAULT_MAX_WORD_LEN);
+ if (maxTokenLen > MAX_TOKEN_LENGTH_LIMIT || maxTokenLen <= 0) {
+ throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + maxTokenLen);
+ }
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@@ -43,6 +58,6 @@ public class LetterTokenizerFactory extends TokenizerFactory {
@Override
public LetterTokenizer create(AttributeFactory factory) {
- return new LetterTokenizer(factory);
+ return new LetterTokenizer(factory, maxTokenLen);
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/906679ad/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizer.java
index 982d356..26b8747 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizer.java
@@ -50,6 +50,19 @@ public final class LowerCaseTokenizer extends LetterTokenizer {
super(factory);
}
+ /**
+ * Construct a new LowerCaseTokenizer using a given
+ * {@link org.apache.lucene.util.AttributeFactory}.
+ *
+ * @param factory the attribute factory to use for this {@link Tokenizer}
+ * @param maxTokenLen maximum token length the tokenizer will emit.
+ * Must be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024)
+ * @throws IllegalArgumentException if maxTokenLen is invalid.
+ */
+ public LowerCaseTokenizer(AttributeFactory factory, int maxTokenLen) {
+ super(factory, maxTokenLen);
+ }
+
/** Converts char to lower case
* {@link Character#toLowerCase(int)}.*/
@Override
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/906679ad/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizerFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizerFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizerFactory.java
index 3e29161..a3e06c7 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizerFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizerFactory.java
@@ -18,6 +18,7 @@ package org.apache.lucene.analysis.core;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
+import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
@@ -25,20 +26,36 @@ import org.apache.lucene.util.AttributeFactory;
import java.util.HashMap;
import java.util.Map;
+import static org.apache.lucene.analysis.standard.StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT;
+
/**
- * Factory for {@link LowerCaseTokenizer}.
+ * Factory for {@link LowerCaseTokenizer}.
* <pre class="prettyprint">
* <fieldType name="text_lwrcase" class="solr.TextField" positionIncrementGap="100">
- * <analyzer>
- * <tokenizer class="solr.LowerCaseTokenizerFactory"/>
- * </analyzer>
+ * <analyzer>
+ * <tokenizer class="solr.LowerCaseTokenizerFactory" maxTokenLen="256"/>
+ * </analyzer>
* </fieldType></pre>
+ * <p>
+ * Options:
+ * <ul>
+ * <li>maxTokenLen: max token length, should be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024).
+ * It is rare to need to change this
+ * else {@link CharTokenizer}::DEFAULT_MAX_WORD_LEN</li>
+ * </ul>
*/
public class LowerCaseTokenizerFactory extends TokenizerFactory implements MultiTermAwareComponent {
-
- /** Creates a new LowerCaseTokenizerFactory */
- public LowerCaseTokenizerFactory(Map<String,String> args) {
+ private final int maxTokenLen;
+
+ /**
+ * Creates a new LowerCaseTokenizerFactory
+ */
+ public LowerCaseTokenizerFactory(Map<String, String> args) {
super(args);
+ maxTokenLen = getInt(args, "maxTokenLen", CharTokenizer.DEFAULT_MAX_WORD_LEN);
+ if (maxTokenLen > MAX_TOKEN_LENGTH_LIMIT || maxTokenLen <= 0) {
+ throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + maxTokenLen);
+ }
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@@ -46,11 +63,13 @@ public class LowerCaseTokenizerFactory extends TokenizerFactory implements Multi
@Override
public LowerCaseTokenizer create(AttributeFactory factory) {
- return new LowerCaseTokenizer(factory);
+ return new LowerCaseTokenizer(factory, maxTokenLen);
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
- return new LowerCaseFilterFactory(new HashMap<>(getOriginalArgs()));
+ Map map = new HashMap<>(getOriginalArgs());
+ map.remove("maxTokenLen"); //removing "maxTokenLen" argument for LowerCaseFilterFactory init
+ return new LowerCaseFilterFactory(map);
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/906679ad/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UnicodeWhitespaceTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UnicodeWhitespaceTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UnicodeWhitespaceTokenizer.java
index 5e4313f..00c181f 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UnicodeWhitespaceTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UnicodeWhitespaceTokenizer.java
@@ -47,6 +47,19 @@ public final class UnicodeWhitespaceTokenizer extends CharTokenizer {
public UnicodeWhitespaceTokenizer(AttributeFactory factory) {
super(factory);
}
+
+ /**
+ * Construct a new UnicodeWhitespaceTokenizer using a given
+ * {@link org.apache.lucene.util.AttributeFactory}.
+ *
+ * @param factory the attribute factory to use for this {@link Tokenizer}
+ * @param maxTokenLen maximum token length the tokenizer will emit.
+ * Must be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024)
+ * @throws IllegalArgumentException if maxTokenLen is invalid.
+ */
+ public UnicodeWhitespaceTokenizer(AttributeFactory factory, int maxTokenLen) {
+ super(factory, maxTokenLen);
+ }
/** Collects only characters which do not satisfy Unicode's WHITESPACE property. */
@Override
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/906679ad/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizer.java
index 70f2d62..0655227 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizer.java
@@ -46,6 +46,19 @@ public final class WhitespaceTokenizer extends CharTokenizer {
public WhitespaceTokenizer(AttributeFactory factory) {
super(factory);
}
+
+ /**
+ * Construct a new WhitespaceTokenizer using a given
+ * {@link org.apache.lucene.util.AttributeFactory}.
+ *
+ * @param factory the attribute factory to use for this {@link Tokenizer}
+ * @param maxTokenLen maximum token length the tokenizer will emit.
+ * Must be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024)
+ * @throws IllegalArgumentException if maxTokenLen is invalid.
+ */
+ public WhitespaceTokenizer(AttributeFactory factory, int maxTokenLen) {
+ super(factory, maxTokenLen);
+ }
/** Collects only characters which do not satisfy
* {@link Character#isWhitespace(int)}.*/
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/906679ad/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizerFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizerFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizerFactory.java
index fd38b63..29e9ed5 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizerFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizerFactory.java
@@ -22,15 +22,18 @@ import java.util.Collection;
import java.util.Map;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
+import static org.apache.lucene.analysis.standard.StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT;
+
/**
* Factory for {@link WhitespaceTokenizer}.
* <pre class="prettyprint">
* <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
- * <tokenizer class="solr.WhitespaceTokenizerFactory" rule="unicode"/>
+ * <tokenizer class="solr.WhitespaceTokenizerFactory" rule="unicode" maxTokenLen="256"/>
* </analyzer>
* </fieldType></pre>
*
@@ -38,6 +41,9 @@ import org.apache.lucene.util.AttributeFactory;
* <ul>
* <li>rule: either "java" for {@link WhitespaceTokenizer}
* or "unicode" for {@link UnicodeWhitespaceTokenizer}</li>
+ * <li>maxTokenLen: max token length, should be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024).
+ * It is rare to need to change this
+ * else {@link CharTokenizer}::DEFAULT_MAX_TOKEN_LEN</li>
* </ul>
*/
public class WhitespaceTokenizerFactory extends TokenizerFactory {
@@ -46,13 +52,17 @@ public class WhitespaceTokenizerFactory extends TokenizerFactory {
private static final Collection<String> RULE_NAMES = Arrays.asList(RULE_JAVA, RULE_UNICODE);
private final String rule;
+ private final int maxTokenLen;
/** Creates a new WhitespaceTokenizerFactory */
public WhitespaceTokenizerFactory(Map<String,String> args) {
super(args);
rule = get(args, "rule", RULE_NAMES, RULE_JAVA);
-
+ maxTokenLen = getInt(args, "maxTokenLen", CharTokenizer.DEFAULT_MAX_WORD_LEN);
+ if (maxTokenLen > MAX_TOKEN_LENGTH_LIMIT || maxTokenLen <= 0) {
+ throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + maxTokenLen);
+ }
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@@ -62,9 +72,9 @@ public class WhitespaceTokenizerFactory extends TokenizerFactory {
public Tokenizer create(AttributeFactory factory) {
switch (rule) {
case RULE_JAVA:
- return new WhitespaceTokenizer(factory);
+ return new WhitespaceTokenizer(factory, maxTokenLen);
case RULE_UNICODE:
- return new UnicodeWhitespaceTokenizer(factory);
+ return new UnicodeWhitespaceTokenizer(factory, maxTokenLen);
default:
throw new AssertionError();
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/906679ad/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java
index 13289be..ff9d6ff 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java
@@ -33,6 +33,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.AttributeFactory;
+import static org.apache.lucene.analysis.standard.StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT;
+
/**
* An abstract base class for simple, character-oriented tokenizers.
* <p>
@@ -50,6 +52,7 @@ public abstract class CharTokenizer extends Tokenizer {
* Creates a new {@link CharTokenizer} instance
*/
public CharTokenizer() {
+ this.maxTokenLen = DEFAULT_MAX_WORD_LEN;
}
/**
@@ -60,6 +63,23 @@ public abstract class CharTokenizer extends Tokenizer {
*/
public CharTokenizer(AttributeFactory factory) {
super(factory);
+ this.maxTokenLen = DEFAULT_MAX_WORD_LEN;
+ }
+
+ /**
+ * Creates a new {@link CharTokenizer} instance
+ *
+ * @param factory the attribute factory to use for this {@link Tokenizer}
+ * @param maxTokenLen maximum token length the tokenizer will emit.
+ * Must be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024)
+ * @throws IllegalArgumentException if maxTokenLen is invalid.
+ */
+ public CharTokenizer(AttributeFactory factory, int maxTokenLen) {
+ super(factory);
+ if (maxTokenLen > MAX_TOKEN_LENGTH_LIMIT || maxTokenLen <= 0) {
+ throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + maxTokenLen);
+ }
+ this.maxTokenLen = maxTokenLen;
}
/**
@@ -193,9 +213,10 @@ public abstract class CharTokenizer extends Tokenizer {
}
private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;
- private static final int MAX_WORD_LEN = 255;
+ public static final int DEFAULT_MAX_WORD_LEN = 255;
private static final int IO_BUFFER_SIZE = 4096;
-
+ private final int maxTokenLen;
+
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
@@ -256,7 +277,7 @@ public abstract class CharTokenizer extends Tokenizer {
}
end += charCount;
length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized
- if (length >= MAX_WORD_LEN) { // buffer overflow! make sure to check for >= surrogate pair could break == test
+ if (length >= maxTokenLen) { // buffer overflow! make sure to check for >= surrogate pair could break == test
break;
}
} else if (length > 0) { // at non-Letter w/ chars
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/906679ad/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestKeywordTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestKeywordTokenizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestKeywordTokenizer.java
new file mode 100644
index 0000000..3f03a00
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestKeywordTokenizer.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.core;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.util.AttributeFactory;
+
+public class TestKeywordTokenizer extends BaseTokenStreamTestCase {
+
+ public void testSimple() throws IOException {
+ StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
+ KeywordTokenizer tokenizer = new KeywordTokenizer();
+ tokenizer.setReader(reader);
+ assertTokenStreamContents(tokenizer, new String[]{"Tokenizer \ud801\udc1ctest"});
+ }
+
+ public void testFactory() {
+ Map<String, String> args = new HashMap<>();
+ KeywordTokenizerFactory factory = new KeywordTokenizerFactory(args);
+ AttributeFactory attributeFactory = newAttributeFactory();
+ Tokenizer tokenizer = factory.create(attributeFactory);
+ assertEquals(KeywordTokenizer.class, tokenizer.getClass());
+ }
+
+ private Map<String, String> makeArgs(String... args) {
+ Map<String, String> ret = new HashMap<>();
+ for (int idx = 0; idx < args.length; idx += 2) {
+ ret.put(args[idx], args[idx + 1]);
+ }
+ return ret;
+ }
+
+ public void testParamsFactory() throws IOException {
+ // negative maxTokenLen
+ IllegalArgumentException iae = expectThrows(IllegalArgumentException.class, () ->
+ new KeywordTokenizerFactory(makeArgs("maxTokenLen", "-1")));
+ assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: -1", iae.getMessage());
+
+ // zero maxTokenLen
+ iae = expectThrows(IllegalArgumentException.class, () ->
+ new KeywordTokenizerFactory(makeArgs("maxTokenLen", "0")));
+ assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", iae.getMessage());
+
+ // Added random param, should throw illegal error
+ iae = expectThrows(IllegalArgumentException.class, () ->
+ new KeywordTokenizerFactory(makeArgs("maxTokenLen", "255", "randomParam", "rValue")));
+ assertEquals("Unknown parameters: {randomParam=rValue}", iae.getMessage());
+
+ // tokeniser will never split, no matter what is passed,
+ // but the buffer will not be more than length of the token
+
+ KeywordTokenizerFactory factory = new KeywordTokenizerFactory(makeArgs("maxTokenLen", "5"));
+ AttributeFactory attributeFactory = newAttributeFactory();
+ Tokenizer tokenizer = factory.create(attributeFactory);
+ StringReader reader = new StringReader("Tokenizertest");
+ tokenizer.setReader(reader);
+ assertTokenStreamContents(tokenizer, new String[]{"Tokenizertest"});
+
+ // tokeniser will never split, no matter what is passed,
+ // but the buffer will not be more than length of the token
+ factory = new KeywordTokenizerFactory(makeArgs("maxTokenLen", "2"));
+ attributeFactory = newAttributeFactory();
+ tokenizer = factory.create(attributeFactory);
+ reader = new StringReader("Tokenizer\u00A0test");
+ tokenizer.setReader(reader);
+ assertTokenStreamContents(tokenizer, new String[]{"Tokenizer\u00A0test"});
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/906679ad/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestUnicodeWhitespaceTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestUnicodeWhitespaceTokenizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestUnicodeWhitespaceTokenizer.java
index acdb670..16089e9 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestUnicodeWhitespaceTokenizer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestUnicodeWhitespaceTokenizer.java
@@ -54,4 +54,55 @@ public class TestUnicodeWhitespaceTokenizer extends BaseTokenStreamTestCase {
assertEquals(UnicodeWhitespaceTokenizer.class, tokenizer.getClass());
}
+ private Map<String, String> makeArgs(String... args) {
+ Map<String, String> ret = new HashMap<>();
+ for (int idx = 0; idx < args.length; idx += 2) {
+ ret.put(args[idx], args[idx + 1]);
+ }
+ return ret;
+ }
+
+ public void testParamsFactory() throws IOException {
+
+
+ // negative maxTokenLen
+ IllegalArgumentException iae = expectThrows(IllegalArgumentException.class, () ->
+ new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "-1")));
+ assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: -1", iae.getMessage());
+
+ // zero maxTokenLen
+ iae = expectThrows(IllegalArgumentException.class, () ->
+ new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "0")));
+ assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", iae.getMessage());
+
+ // Added random param, should throw illegal error
+ iae = expectThrows(IllegalArgumentException.class, () ->
+ new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "255", "randomParam", "rValue")));
+ assertEquals("Unknown parameters: {randomParam=rValue}", iae.getMessage());
+
+ // tokeniser will split at 5, Token | izer, no matter what happens
+ WhitespaceTokenizerFactory factory = new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "5"));
+ AttributeFactory attributeFactory = newAttributeFactory();
+ Tokenizer tokenizer = factory.create(attributeFactory);
+ StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
+ tokenizer.setReader(reader);
+ assertTokenStreamContents(tokenizer, new String[]{"Token", "izer", "\ud801\udc1ctes", "t"});
+
+ // tokeniser will split at 2, To | ke | ni | ze | r, no matter what happens
+ factory = new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "2"));
+ attributeFactory = newAttributeFactory();
+ tokenizer = factory.create(attributeFactory);
+ reader = new StringReader("Tokenizer\u00A0test");
+ tokenizer.setReader(reader);
+ assertTokenStreamContents(tokenizer, new String[]{"To", "ke", "ni", "ze", "r", "te", "st"});
+
+ // tokeniser will split at 10, no matter what happens,
+ // but tokens' length are less than that
+ factory = new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "10"));
+ attributeFactory = newAttributeFactory();
+ tokenizer = factory.create(attributeFactory);
+ reader = new StringReader("Tokenizer\u00A0test");
+ tokenizer.setReader(reader);
+ assertTokenStreamContents(tokenizer, new String[]{"Tokenizer", "test"});
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/906679ad/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java
index 783fc3e..4596608 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java
@@ -25,8 +25,10 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.TestUtil;
@@ -89,6 +91,99 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)});
}
+
+ /*
+ * tests the max word length passed as parameter - tokenizer will split at the passed position char no matter what happens
+ */
+ public void testCustomMaxTokenLength() throws IOException {
+
+ StringBuilder builder = new StringBuilder();
+ for (int i = 0; i < 100; i++) {
+ builder.append("A");
+ }
+ Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory(), 100);
+ // Tricky, passing two copies of the string to the reader....
+ tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
+ assertTokenStreamContents(tokenizer, new String[]{builder.toString().toLowerCase(Locale.ROOT),
+ builder.toString().toLowerCase(Locale.ROOT) });
+
+ Exception e = expectThrows(IllegalArgumentException.class, () ->
+ new LowerCaseTokenizer(newAttributeFactory(), -1));
+ assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: -1", e.getMessage());
+
+ tokenizer = new LetterTokenizer(newAttributeFactory(), 100);
+ tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
+ assertTokenStreamContents(tokenizer, new String[]{builder.toString(), builder.toString()});
+
+
+ // Let's test that we can get a token longer than 255 through.
+ builder.setLength(0);
+ for (int i = 0; i < 500; i++) {
+ builder.append("Z");
+ }
+ tokenizer = new LetterTokenizer(newAttributeFactory(), 500);
+ tokenizer.setReader(new StringReader(builder.toString()));
+ assertTokenStreamContents(tokenizer, new String[]{builder.toString()});
+
+
+ // Just to be sure what is happening here, token lengths of zero make no sense,
+ // Let's try the edge cases, token > I/O buffer (4096)
+ builder.setLength(0);
+ for (int i = 0; i < 600; i++) {
+ builder.append("aUrOkIjq"); // 600 * 8 = 4800 chars.
+ }
+
+ e = expectThrows(IllegalArgumentException.class, () ->
+ new LowerCaseTokenizer(newAttributeFactory(), 0));
+ assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
+
+ e = expectThrows(IllegalArgumentException.class, () ->
+ new LowerCaseTokenizer(newAttributeFactory(), 10_000_000));
+ assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 10000000", e.getMessage());
+
+ tokenizer = new LowerCaseTokenizer(newAttributeFactory(), 4800);
+ tokenizer.setReader(new StringReader(builder.toString()));
+ assertTokenStreamContents(tokenizer, new String[]{builder.toString().toLowerCase(Locale.ROOT)});
+
+
+ e = expectThrows(IllegalArgumentException.class, () ->
+ new KeywordTokenizer(newAttributeFactory(), 0));
+ assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
+
+ e = expectThrows(IllegalArgumentException.class, () ->
+ new KeywordTokenizer(newAttributeFactory(), 10_000_000));
+ assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 10000000", e.getMessage());
+
+
+ tokenizer = new KeywordTokenizer(newAttributeFactory(), 4800);
+ tokenizer.setReader(new StringReader(builder.toString()));
+ assertTokenStreamContents(tokenizer, new String[]{builder.toString()});
+
+ e = expectThrows(IllegalArgumentException.class, () ->
+ new LetterTokenizer(newAttributeFactory(), 0));
+ assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
+
+ e = expectThrows(IllegalArgumentException.class, () ->
+ new LetterTokenizer(newAttributeFactory(), 2_000_000));
+ assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 2000000", e.getMessage());
+
+ tokenizer = new LetterTokenizer(newAttributeFactory(), 4800);
+ tokenizer.setReader(new StringReader(builder.toString()));
+ assertTokenStreamContents(tokenizer, new String[]{builder.toString()});
+
+ e = expectThrows(IllegalArgumentException.class, () ->
+ new WhitespaceTokenizer(newAttributeFactory(), 0));
+ assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
+
+ e = expectThrows(IllegalArgumentException.class, () ->
+ new WhitespaceTokenizer(newAttributeFactory(), 3_000_000));
+ assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 3000000", e.getMessage());
+
+ tokenizer = new WhitespaceTokenizer(newAttributeFactory(), 4800);
+ tokenizer.setReader(new StringReader(builder.toString()));
+ assertTokenStreamContents(tokenizer, new String[]{builder.toString()});
+
+ }
/*
* tests the max word length of 255 with a surrogate pair at position 255
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/906679ad/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index d4e6eac..c413cf8 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -260,6 +260,9 @@ Other Changes
* SOLR-10438: Assign explicit useDocValuesAsStored values to all points field types in
schema-point.xml/TestPointFields. (hossman, Steve Rowe)
+
+* LUCENE-7705: Allow CharTokenizer-derived tokenizers and KeywordTokenizer to configure the max token length.
+ (Amrit Sarkar via Erick Erickson)
* SOLR-10659: Remove ResponseBuilder.getSortSpec use in SearchGroupShardResponseProcessor.
(Judith Silverman via Christine Poerschke)
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/906679ad/solr/core/src/test-files/solr/collection1/conf/schema-tokenizer-test.xml
----------------------------------------------------------------------
diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-tokenizer-test.xml b/solr/core/src/test-files/solr/collection1/conf/schema-tokenizer-test.xml
new file mode 100644
index 0000000..f3d3196
--- /dev/null
+++ b/solr/core/src/test-files/solr/collection1/conf/schema-tokenizer-test.xml
@@ -0,0 +1,150 @@
+<?xml version="1.0" ?>
+<!--
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements. See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to You under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<!-- The Solr schema file. This file should be named "schema.xml" and
+should be located where the classloader for the Solr webapp can find it.
+
+This schema is used for testing, and as such has everything and the
+kitchen sink thrown in. See example/solr/conf/schema.xml for a
+more concise example.
+
+-->
+
+<schema name="test" version="1.0">
+
+ <!-- field type definitions... note that the "name" attribute is
+ just a label to be used by field definitions. The "class"
+ attribute and any other attributes determine the real type and
+ behavior of the fieldType.
+ -->
+
+ <!--
+ Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types.
+ -->
+ <fieldType name="int" class="solr.TrieIntField" precisionStep="0" positionIncrementGap="0"/>
+ <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" positionIncrementGap="0"/>
+ <fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/>
+ <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" positionIncrementGap="0"/>
+
+ <!--
+ Numeric field types that index each value at various levels of precision
+ to accelerate range queries when the number of values between the range
+ endpoints is large. See the javadoc for LegacyNumericRangeQuery for internal
+ implementation details.
+ -->
+
+ <!-- Separate analyzers for index and query time -->
+
+ <fieldType name="letterfieldType" class="solr.TextField" positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.LetterTokenizerFactory" maxTokenLen="3" />
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <fieldType name="lowerCasefieldType" class="solr.TextField" positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.LowerCaseTokenizerFactory" maxTokenLen="3" />
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <fieldType name="whiteSpfieldType" class="solr.TextField" positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.WhitespaceTokenizerFactory" maxTokenLen="3" />
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <fieldType name="uniWhiteSpfieldType" class="solr.TextField" positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.WhitespaceTokenizerFactory" rule="unicode" maxTokenLen="3" />
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <fieldType name="keywordfieldType" class="solr.TextField" positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.KeywordTokenizerFactory" maxTokenLen="3" />
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <!-- Same analyzers for both index and query time -->
+
+ <fieldType name="letter0fieldType" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.LetterTokenizerFactory" maxTokenLen="3" />
+ </analyzer>
+ </fieldType>
+
+ <fieldType name="lowerCase0fieldType" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.LowerCaseTokenizerFactory" maxTokenLen="3" />
+ </analyzer>
+ </fieldType>
+
+ <fieldType name="whiteSp0fieldType" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.WhitespaceTokenizerFactory" maxTokenLen="3" />
+ </analyzer>
+ </fieldType>
+
+ <fieldType name="uniWhiteSp0fieldType" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.WhitespaceTokenizerFactory" rule="unicode" maxTokenLen="3" />
+ </analyzer>
+ </fieldType>
+
+ <fieldType name="keyword0fieldType" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.KeywordTokenizerFactory" maxTokenLen="3" />
+ </analyzer>
+ </fieldType>
+
+ <field name="id" type="int" indexed="true" stored="true" multiValued="false" required="true"/>
+
+ <field name="letter" type="letterfieldType" indexed="true" stored="true"/>
+ <field name="lowerCase" type="lowerCasefieldType" indexed="true" stored="true"/>
+ <field name="whiteSpace" type="whiteSpfieldType" indexed="true" stored="true"/>
+ <field name="unicodeWhiteSpace" type="uniWhiteSpfieldType" indexed="true" stored="true"/>
+ <field name="keyword" type="keywordfieldType" indexed="true" stored="true"/>
+
+ <field name="letter0" type="letter0fieldType" indexed="true" stored="true"/>
+ <field name="lowerCase0" type="lowerCase0fieldType" indexed="true" stored="true"/>
+ <field name="whiteSpace0" type="whiteSp0fieldType" indexed="true" stored="true"/>
+ <field name="unicodeWhiteSpace0" type="uniWhiteSp0fieldType" indexed="true" stored="true"/>
+ <field name="keyword0" type="keyword0fieldType" indexed="true" stored="true"/>
+
+ <field name="_version_" type="long" indexed="true" stored="true" multiValued="false"/>
+
+
+ <uniqueKey>id</uniqueKey>
+
+
+</schema>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/906679ad/solr/core/src/test/org/apache/solr/util/TestMaxTokenLenTokenizer.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/util/TestMaxTokenLenTokenizer.java b/solr/core/src/test/org/apache/solr/util/TestMaxTokenLenTokenizer.java
new file mode 100644
index 0000000..c7e0dc3
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/util/TestMaxTokenLenTokenizer.java
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.util;
+
+import org.apache.solr.SolrTestCaseJ4;
+import org.junit.BeforeClass;
+
+/**
+ * Tests for:
+ * {@link org.apache.lucene.analysis.core.LowerCaseTokenizerFactory}
+ * {@link org.apache.lucene.analysis.core.LetterTokenizerFactory}
+ * {@link org.apache.lucene.analysis.core.KeywordTokenizerFactory}
+ * {@link org.apache.lucene.analysis.core.WhitespaceTokenizerFactory}
+ */
+
+public class TestMaxTokenLenTokenizer extends SolrTestCaseJ4 {
+ /* field names are used in accordance with the solrconfig and schema supplied */
+ private static final String ID = "id";
+
+ @BeforeClass
+ public static void beforeClass() throws Exception {
+ initCore("solrconfig-update-processor-chains.xml", "schema-tokenizer-test.xml");
+ }
+
+ public void testSingleFieldDiffAnalyzers() throws Exception {
+
+ clearIndex();
+
+ // using fields with definitions, different tokenizer factories respectively at index time and standard tokenizer at query time.
+
+ updateJ("{\"add\":{\"doc\": {\"id\":1,\"letter\":\"letter\"}},\"commit\":{}}",null);
+ updateJ("{\"add\":{\"doc\": {\"id\":2,\"lowerCase\":\"lowerCase\"}},\"commit\":{}}",null);
+ updateJ("{\"add\":{\"doc\": {\"id\":3,\"whiteSpace\":\"whiteSpace in\"}},\"commit\":{}}",null);
+ updateJ("{\"add\":{\"doc\": {\"id\":4,\"unicodeWhiteSpace\":\"unicode in\"}},\"commit\":{}}",null);
+ updateJ("{\"add\":{\"doc\": {\"id\":5,\"keyword\":\"keyword\"}},\"commit\":{}}",null);
+
+ assertU(commit());
+
+ assertQ("Check the total number of docs", req("q","*:*"), "//result[@numFound=5]");
+
+ //Tokens generated for "letter": "let" "ter" , maxTokenLen=3
+ assertQ("Check the total number of docs", req("q","letter:let"), "//result[@numFound=1]");
+ assertQ("Check the total number of docs", req("q","letter:lett"), "//result[@numFound=0]");
+
+ //Tokens generated for "lowerCase": "low" "erC" "ase" , maxTokenLen=3
+ assertQ("Check the total number of docs", req("q","lowerCase:low"), "//result[@numFound=1]");
+ assertQ("Check the total number of docs", req("q","lowerCase:l"), "//result[@numFound=0]");
+ assertQ("Check the total number of docs", req("q","lowerCase:lo"), "//result[@numFound=0]");
+ assertQ("Check the total number of docs", req("q","lowerCase:lower"), "//result[@numFound=0]");
+
+ //Tokens generated for "whiteSpace in": "whi" "teS" "pac" "e" "in" , maxTokenLen=3
+ assertQ("Check the total number of docs", req("q","whiteSpace:whi"), "//result[@numFound=1]");
+ assertQ("Check the total number of docs", req("q","whiteSpace:teS"), "//result[@numFound=1]");
+ assertQ("Check the total number of docs", req("q","whiteSpace:in"), "//result[@numFound=1]");
+ assertQ("Check the total number of docs", req("q","whiteSpace:white"), "//result[@numFound=0]");
+
+ //Tokens generated for "unicode in": "uni" "cod" "e" "in" , maxTokenLen=3
+ assertQ("Check the total number of docs", req("q","unicodeWhiteSpace:uni"), "//result[@numFound=1]");
+ assertQ("Check the total number of docs", req("q","unicodeWhiteSpace:cod"), "//result[@numFound=1]");
+ assertQ("Check the total number of docs", req("q","unicodeWhiteSpace:e"), "//result[@numFound=1]");
+ assertQ("Check the total number of docs", req("q","unicodeWhiteSpace:unico"), "//result[@numFound=0]");
+
+ //Tokens generated for "keyword": "keyword" , maxTokenLen=3
+ assertQ("Check the total number of docs", req("q","keyword:keyword"), "//result[@numFound=1]");
+ assertQ("Check the total number of docs", req("q","keyword:key"), "//result[@numFound=0]");
+
+ }
+
+ public void testSingleFieldSameAnalyzers() throws Exception {
+
+ clearIndex();
+
+ // using fields with definitions, same tokenizers both at index and query time.
+
+ updateJ("{\"add\":{\"doc\": {\"id\":1,\"letter0\":\"letter\"}},\"commit\":{}}",null);
+ updateJ("{\"add\":{\"doc\": {\"id\":2,\"lowerCase0\":\"lowerCase\"}},\"commit\":{}}",null);
+ updateJ("{\"add\":{\"doc\": {\"id\":3,\"whiteSpace0\":\"whiteSpace in\"}},\"commit\":{}}",null);
+ updateJ("{\"add\":{\"doc\": {\"id\":4,\"unicodeWhiteSpace0\":\"unicode in\"}},\"commit\":{}}",null);
+ updateJ("{\"add\":{\"doc\": {\"id\":5,\"keyword0\":\"keyword\"}},\"commit\":{}}",null);
+
+ assertU(commit());
+
+ assertQ("Check the total number of docs", req("q","*:*"), "//result[@numFound=5]");
+
+ //Tokens generated for "letter": "let" "ter" , maxTokenLen=3
+ // Anything that matches the first three letters should be found when maxLen=3
+ assertQ("Check the total number of docs", req("q","letter0:l"), "//result[@numFound=0]");
+ assertQ("Check the total number of docs", req("q","letter0:let"), "//result[@numFound=1]");
+ assertQ("Check the total number of docs", req("q","letter0:lett"), "//result[@numFound=1]");
+ assertQ("Check the total number of docs", req("q","letter0:letXYZ"), "//result[@numFound=1]");
+
+ //Tokens generated for "lowerCase": "low" "erC" "ase" , maxTokenLen=3
+ // Anything that matches the first three letters should be found when maxLen=3
+ assertQ("Check the total number of docs", req("q","lowerCase0:low"), "//result[@numFound=1]");
+ assertQ("Check the total number of docs", req("q","lowerCase0:l"), "//result[@numFound=0]");
+ assertQ("Check the total number of docs", req("q","lowerCase0:lo"), "//result[@numFound=0]");
+ assertQ("Check the total number of docs", req("q","lowerCase0:lowerXYZ"), "//result[@numFound=1]");
+
+ //Tokens generated for "whiteSpace in": "whi" "teS" "pac" "e" "in" , maxTokenLen=3
+ // Anything that matches the first three letters should be found when maxLen=3
+ assertQ("Check the total number of docs", req("q","whiteSpace0:h"), "//result[@numFound=0]");
+ assertQ("Check the total number of docs", req("q","whiteSpace0:whi"), "//result[@numFound=1]");
+ assertQ("Check the total number of docs", req("q","whiteSpace0:teS"), "//result[@numFound=1]");
+ assertQ("Check the total number of docs", req("q","whiteSpace0:in"), "//result[@numFound=1]");
+ assertQ("Check the total number of docs", req("q","whiteSpace0:whiteZKY"), "//result[@numFound=1]");
+
+ //Tokens generated for "unicode in": "uni" "cod" "e" "in" , maxTokenLen=3
+ // Anything that matches the first three letters should be found when maxLen=3
+ assertQ("Check the total number of docs", req("q","unicodeWhiteSpace0:u"), "//result[@numFound=0]");
+ assertQ("Check the total number of docs", req("q","unicodeWhiteSpace0:uni"), "//result[@numFound=1]");
+ assertQ("Check the total number of docs", req("q","unicodeWhiteSpace0:cod"), "//result[@numFound=1]");
+ assertQ("Check the total number of docs", req("q","unicodeWhiteSpace0:e"), "//result[@numFound=1]");
+ assertQ("Check the total number of docs", req("q","unicodeWhiteSpace0:unicoVBRT"), "//result[@numFound=1]");
+
+ //Tokens generated for "keyword": "keyword" , maxTokenLen=3
+ assertQ("Check the total number of docs", req("q","keyword0:keyword"), "//result[@numFound=1]");
+ assertQ("Check the total number of docs", req("q","keyword0:key"), "//result[@numFound=0]");
+
+ }
+}