You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2015/11/04 23:44:18 UTC
svn commit: r1712682 - in /lucene/dev/trunk/lucene: CHANGES.txt
analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java
analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java
Author: uschindler
Date: Wed Nov 4 22:44:17 2015
New Revision: 1712682
URL: http://svn.apache.org/viewvc?rev=1712682&view=rev
Log:
LUCENE-6879: Allow to define custom CharTokenizer instances without subclassing using Java 8 lambdas or method references
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1712682&r1=1712681&r2=1712682&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Wed Nov 4 22:44:17 2015
@@ -46,6 +46,9 @@ New Features
* LUCENE-6861: Create Lucene60Codec, supporting dimensional values.
(Mike McCandless)
+* LUCENE-6879: Allow to define custom CharTokenizer instances without
+ subclassing using Java 8 lambdas or method references. (Uwe Schindler)
+
API Changes
* LUCENE-3312: The API of oal.document was restructured to
Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java?rev=1712682&r1=1712681&r2=1712682&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java Wed Nov 4 22:44:17 2015
@@ -18,8 +18,14 @@ package org.apache.lucene.analysis.util;
*/
import java.io.IOException;
+import java.util.function.IntPredicate;
+import java.util.function.IntUnaryOperator;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.LetterTokenizer;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.LowerCaseTokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeFactory;
@@ -27,8 +33,16 @@ import org.apache.lucene.analysis.util.C
import org.apache.lucene.analysis.util.CharacterUtils.CharacterBuffer;
/**
- * An abstract base class for simple, character-oriented tokenizers.
- **/
+ * An abstract base class for simple, character-oriented tokenizers.
+ * <p>
+ * The base class also provides factories to create instances of
+ * {@code CharTokenizer} using Java 8 lambdas or method references.
+ * It is possible to create an instance which behaves exactly like
+ * {@link LetterTokenizer}:
+ * <pre class="prettyprint lang-java">
+ * Tokenizer tok = CharTokenizer.fromTokenCharPredicate(Character::isLetter);
+ * </pre>
+ */
public abstract class CharTokenizer extends Tokenizer {
/**
@@ -47,6 +61,134 @@ public abstract class CharTokenizer exte
super(factory);
}
+ /**
+ * Creates a new instance of CharTokenizer using a custom predicate, supplied as method reference or lambda expression.
+ * The predicate should return {@code true} for all valid token characters.
+ * <p>
+ * This factory is intended to be used with lambdas or method references. E.g., an elegant way
+ * to create an instance which behaves exactly as {@link LetterTokenizer} is:
+ * <pre class="prettyprint lang-java">
+ * Tokenizer tok = CharTokenizer.fromTokenCharPredicate(Character::isLetter);
+ * </pre>
+ */
+ public static CharTokenizer fromTokenCharPredicate(final IntPredicate tokenCharPredicate) {
+ return fromTokenCharPredicate(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, tokenCharPredicate);
+ }
+
+ /**
+ * Creates a new instance of CharTokenizer with the supplied attribute factory using a custom predicate, supplied as method reference or lambda expression.
+ * The predicate should return {@code true} for all valid token characters.
+ * <p>
+ * This factory is intended to be used with lambdas or method references. E.g., an elegant way
+ * to create an instance which behaves exactly as {@link LetterTokenizer} is:
+ * <pre class="prettyprint lang-java">
+ * Tokenizer tok = CharTokenizer.fromTokenCharPredicate(factory, Character::isLetter);
+ * </pre>
+ */
+ public static CharTokenizer fromTokenCharPredicate(AttributeFactory factory, final IntPredicate tokenCharPredicate) {
+ return fromTokenCharPredicate(factory, tokenCharPredicate, IntUnaryOperator.identity());
+ }
+
+ /**
+ * Creates a new instance of CharTokenizer using a custom predicate, supplied as method reference or lambda expression.
+ * The predicate should return {@code true} for all valid token characters.
+ * This factory also takes a function to normalize chars, e.g., lowercasing them, supplied as method reference or lambda expression.
+ * <p>
+ * This factory is intended to be used with lambdas or method references. E.g., an elegant way
+ * to create an instance which behaves exactly as {@link LowerCaseTokenizer} is:
+ * <pre class="prettyprint lang-java">
+ * Tokenizer tok = CharTokenizer.fromTokenCharPredicate(Character::isLetter, Character::toLowerCase);
+ * </pre>
+ */
+ public static CharTokenizer fromTokenCharPredicate(final IntPredicate tokenCharPredicate, final IntUnaryOperator normalizer) {
+ return fromTokenCharPredicate(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, tokenCharPredicate, normalizer);
+ }
+
+ /**
+ * Creates a new instance of CharTokenizer with the supplied attribute factory using a custom predicate, supplied as method reference or lambda expression.
+ * The predicate should return {@code true} for all valid token characters.
+ * This factory also takes a function to normalize chars, e.g., lowercasing them, supplied as method reference or lambda expression.
+ * <p>
+ * This factory is intended to be used with lambdas or method references. E.g., an elegant way
+ * to create an instance which behaves exactly as {@link LowerCaseTokenizer} is:
+ * <pre class="prettyprint lang-java">
+ * Tokenizer tok = CharTokenizer.fromTokenCharPredicate(factory, Character::isLetter, Character::toLowerCase);
+ * </pre>
+ */
+ public static CharTokenizer fromTokenCharPredicate(AttributeFactory factory, final IntPredicate tokenCharPredicate, final IntUnaryOperator normalizer) {
+ return new CharTokenizer(factory) {
+ @Override
+ protected boolean isTokenChar(int c) {
+ return tokenCharPredicate.test(c);
+ }
+
+ @Override
+ protected int normalize(int c) {
+ return normalizer.applyAsInt(c);
+ }
+ };
+ }
+
+ /**
+ * Creates a new instance of CharTokenizer using a custom predicate, supplied as method reference or lambda expression.
+ * The predicate should return {@code true} for all valid token separator characters.
+ * This method is provided for convenience to easily use predicates that are negated
+ * (they match the separator characters, not the token characters).
+ * <p>
+ * This factory is intended to be used with lambdas or method references. E.g., an elegant way
+ * to create an instance which behaves exactly as {@link WhitespaceTokenizer} is:
+ * <pre class="prettyprint lang-java">
+ * Tokenizer tok = CharTokenizer.fromSeparatorCharPredicate(Character::isWhitespace);
+ * </pre>
+ */
+ public static CharTokenizer fromSeparatorCharPredicate(final IntPredicate separatorCharPredicate) {
+ return fromSeparatorCharPredicate(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, separatorCharPredicate);
+ }
+
+ /**
+ * Creates a new instance of CharTokenizer with the supplied attribute factory using a custom predicate, supplied as method reference or lambda expression.
+ * The predicate should return {@code true} for all valid token separator characters.
+ * <p>
+ * This factory is intended to be used with lambdas or method references. E.g., an elegant way
+ * to create an instance which behaves exactly as {@link WhitespaceTokenizer} is:
+ * <pre class="prettyprint lang-java">
+ * Tokenizer tok = CharTokenizer.fromSeparatorCharPredicate(factory, Character::isWhitespace);
+ * </pre>
+ */
+ public static CharTokenizer fromSeparatorCharPredicate(AttributeFactory factory, final IntPredicate separatorCharPredicate) {
+ return fromSeparatorCharPredicate(factory, separatorCharPredicate, IntUnaryOperator.identity());
+ }
+
+ /**
+ * Creates a new instance of CharTokenizer using a custom predicate, supplied as method reference or lambda expression.
+ * The predicate should return {@code true} for all valid token separator characters.
+ * This factory also takes a function to normalize chars, e.g., lowercasing them, supplied as method reference or lambda expression.
+ * <p>
+ * This factory is intended to be used with lambdas or method references. E.g., an elegant way
+ * to create an instance which behaves exactly as the combination {@link WhitespaceTokenizer} and {@link LowerCaseFilter} is:
+ * <pre class="prettyprint lang-java">
+ * Tokenizer tok = CharTokenizer.fromSeparatorCharPredicate(Character::isWhitespace, Character::toLowerCase);
+ * </pre>
+ */
+ public static CharTokenizer fromSeparatorCharPredicate(final IntPredicate separatorCharPredicate, final IntUnaryOperator normalizer) {
+ return fromSeparatorCharPredicate(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, separatorCharPredicate, normalizer);
+ }
+
+ /**
+ * Creates a new instance of CharTokenizer with the supplied attribute factory using a custom predicate.
+ * The predicate should return {@code true} for all valid token separator characters.
+ * This factory also takes a function to normalize chars, e.g., lowercasing them, supplied as method reference or lambda expression.
+ * <p>
+ * This factory is intended to be used with lambdas or method references. E.g., an elegant way
+ * to create an instance which behaves exactly as {@link WhitespaceTokenizer} and {@link LowerCaseFilter} is:
+ * <pre class="prettyprint lang-java">
+ * Tokenizer tok = CharTokenizer.fromSeparatorCharPredicate(factory, Character::isWhitespace, Character::toLowerCase);
+ * </pre>
+ */
+ public static CharTokenizer fromSeparatorCharPredicate(AttributeFactory factory, final IntPredicate separatorCharPredicate, final IntUnaryOperator normalizer) {
+ return fromTokenCharPredicate(factory, separatorCharPredicate.negate(), normalizer);
+ }
+
private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;
private static final int MAX_WORD_LEN = 255;
private static final int IO_BUFFER_SIZE = 4096;
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java?rev=1712682&r1=1712681&r2=1712682&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java Wed Nov 4 22:44:17 2015
@@ -182,4 +182,26 @@ public class TestCharTokenizers extends
checkRandomData(random(), analyzer, num);
analyzer.close();
}
+
+ public void testDefinitionUsingMethodReference1() throws Exception {
+ final StringReader reader = new StringReader("Tokenizer Test");
+ final Tokenizer tokenizer = CharTokenizer.fromSeparatorCharPredicate(Character::isWhitespace);
+ tokenizer.setReader(reader);
+ assertTokenStreamContents(tokenizer, new String[] { "Tokenizer", "Test" });
+ }
+
+ public void testDefinitionUsingMethodReference2() throws Exception {
+ final StringReader reader = new StringReader("Tokenizer(Test)");
+ final Tokenizer tokenizer = CharTokenizer.fromTokenCharPredicate(Character::isLetter, Character::toUpperCase);
+ tokenizer.setReader(reader);
+ assertTokenStreamContents(tokenizer, new String[] { "TOKENIZER", "TEST" });
+ }
+
+ public void testDefinitionUsingLambda() throws Exception {
+ final StringReader reader = new StringReader("Tokenizer\u00A0Test Foo");
+ final Tokenizer tokenizer = CharTokenizer.fromSeparatorCharPredicate(c -> c == '\u00A0' || Character.isWhitespace(c), Character::toLowerCase);
+ tokenizer.setReader(reader);
+ assertTokenStreamContents(tokenizer, new String[] { "tokenizer", "test", "foo" });
+ }
+
}