You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2015/11/04 23:44:18 UTC
svn commit: r1712682 - in /lucene/dev/trunk/lucene: CHANGES.txt analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java

Author: uschindler
Date: Wed Nov  4 22:44:17 2015
New Revision: 1712682

URL: http://svn.apache.org/viewvc?rev=1712682&view=rev
Log:
LUCENE-6879: Allow to define custom CharTokenizer instances without subclassing using Java 8 lambdas or method references

Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1712682&r1=1712681&r2=1712682&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Wed Nov  4 22:44:17 2015
@@ -46,6 +46,9 @@ New Features
 * LUCENE-6861: Create Lucene60Codec, supporting dimensional values.
   (Mike McCandless)
 
+* LUCENE-6879: Allow defining custom CharTokenizer instances without
+  subclassing, using Java 8 lambdas or method references. (Uwe Schindler)
+
 API Changes
 
 * LUCENE-3312: The API of oal.document was restructured to

Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java?rev=1712682&r1=1712681&r2=1712682&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java Wed Nov  4 22:44:17 2015
@@ -18,8 +18,14 @@ package org.apache.lucene.analysis.util;
  */
 
 import java.io.IOException;
+import java.util.function.IntPredicate;
+import java.util.function.IntUnaryOperator;
 
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.LetterTokenizer;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.LowerCaseTokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.util.AttributeFactory;
@@ -27,8 +33,16 @@ import org.apache.lucene.analysis.util.C
 import org.apache.lucene.analysis.util.CharacterUtils.CharacterBuffer;
 
 /**
- * An abstract base class for simple, character-oriented tokenizers. 
- **/
+ * An abstract base class for simple, character-oriented tokenizers.
+ * <p>
+ * The base class also provides factories to create instances of
+ * {@code CharTokenizer} using Java 8 lambdas or method references.
+ * It is possible to create an instance which behaves exactly like
+ * {@link LetterTokenizer}:
+ * <pre class="prettyprint lang-java">
+ * Tokenizer tok = CharTokenizer.fromTokenCharPredicate(Character::isLetter);
+ * </pre>
+ */
 public abstract class CharTokenizer extends Tokenizer {
   
   /**
@@ -47,6 +61,134 @@ public abstract class CharTokenizer exte
     super(factory);
   }
   
+  /**
+   * Creates a new instance of CharTokenizer using a custom predicate, supplied as method reference or lambda expression.
+   * The predicate should return {@code true} for all valid token characters.
+   * <p>
+   * This factory is intended to be used with lambdas or method references. E.g., an elegant way
+   * to create an instance which behaves exactly as {@link LetterTokenizer} is:
+   * <pre class="prettyprint lang-java">
+   * Tokenizer tok = CharTokenizer.fromTokenCharPredicate(Character::isLetter);
+   * </pre>
+   */
+  public static CharTokenizer fromTokenCharPredicate(final IntPredicate tokenCharPredicate) {
+    return fromTokenCharPredicate(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, tokenCharPredicate);
+  }
+  
+  /**
+   * Creates a new instance of CharTokenizer with the supplied attribute factory using a custom predicate, supplied as method reference or lambda expression.
+   * The predicate should return {@code true} for all valid token characters.
+   * <p>
+   * This factory is intended to be used with lambdas or method references. E.g., an elegant way
+   * to create an instance which behaves exactly as {@link LetterTokenizer} is:
+   * <pre class="prettyprint lang-java">
+   * Tokenizer tok = CharTokenizer.fromTokenCharPredicate(factory, Character::isLetter);
+   * </pre>
+   */
+  public static CharTokenizer fromTokenCharPredicate(AttributeFactory factory, final IntPredicate tokenCharPredicate) {
+    return fromTokenCharPredicate(factory, tokenCharPredicate, IntUnaryOperator.identity());
+  }
+  
+  /**
+   * Creates a new instance of CharTokenizer using a custom predicate, supplied as method reference or lambda expression.
+   * The predicate should return {@code true} for all valid token characters.
+   * This factory also takes a function to normalize chars, e.g., lowercasing them, supplied as method reference or lambda expression.
+   * <p>
+   * This factory is intended to be used with lambdas or method references. E.g., an elegant way
+   * to create an instance which behaves exactly as {@link LowerCaseTokenizer} is:
+   * <pre class="prettyprint lang-java">
+   * Tokenizer tok = CharTokenizer.fromTokenCharPredicate(Character::isLetter, Character::toLowerCase);
+   * </pre>
+   */
+  public static CharTokenizer fromTokenCharPredicate(final IntPredicate tokenCharPredicate, final IntUnaryOperator normalizer) {
+    return fromTokenCharPredicate(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, tokenCharPredicate, normalizer);
+  }
+  
+  /**
+   * Creates a new instance of CharTokenizer with the supplied attribute factory using a custom predicate, supplied as method reference or lambda expression.
+   * The predicate should return {@code true} for all valid token characters.
+   * This factory also takes a function to normalize chars, e.g., lowercasing them, supplied as method reference or lambda expression.
+   * <p>
+   * This factory is intended to be used with lambdas or method references. E.g., an elegant way
+   * to create an instance which behaves exactly as {@link LowerCaseTokenizer} is:
+   * <pre class="prettyprint lang-java">
+   * Tokenizer tok = CharTokenizer.fromTokenCharPredicate(factory, Character::isLetter, Character::toLowerCase);
+   * </pre>
+   */
+  public static CharTokenizer fromTokenCharPredicate(AttributeFactory factory, final IntPredicate tokenCharPredicate, final IntUnaryOperator normalizer) {
+    return new CharTokenizer(factory) {
+      @Override
+      protected boolean isTokenChar(int c) {
+        return tokenCharPredicate.test(c);
+      }
+
+      @Override
+      protected int normalize(int c) {
+        return normalizer.applyAsInt(c);
+      }
+    };
+  }
+  
+  /**
+   * Creates a new instance of CharTokenizer using a custom predicate, supplied as method reference or lambda expression.
+   * The predicate should return {@code true} for all valid token separator characters.
+   * This method is provided for convenience to easily use predicates that are negated
+   * (they match the separator characters, not the token characters).
+   * <p>
+   * This factory is intended to be used with lambdas or method references. E.g., an elegant way
+   * to create an instance which behaves exactly as {@link WhitespaceTokenizer} is:
+   * <pre class="prettyprint lang-java">
+   * Tokenizer tok = CharTokenizer.fromSeparatorCharPredicate(Character::isWhitespace);
+   * </pre>
+   */
+  public static CharTokenizer fromSeparatorCharPredicate(final IntPredicate separatorCharPredicate) {
+    return fromSeparatorCharPredicate(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, separatorCharPredicate);
+  }
+  
+  /**
+   * Creates a new instance of CharTokenizer with the supplied attribute factory using a custom predicate, supplied as method reference or lambda expression.
+   * The predicate should return {@code true} for all valid token separator characters.
+   * <p>
+   * This factory is intended to be used with lambdas or method references. E.g., an elegant way
+   * to create an instance which behaves exactly as {@link WhitespaceTokenizer} is:
+   * <pre class="prettyprint lang-java">
+   * Tokenizer tok = CharTokenizer.fromSeparatorCharPredicate(factory, Character::isWhitespace);
+   * </pre>
+   */
+  public static CharTokenizer fromSeparatorCharPredicate(AttributeFactory factory, final IntPredicate separatorCharPredicate) {
+    return fromSeparatorCharPredicate(factory, separatorCharPredicate, IntUnaryOperator.identity());
+  }
+  
+  /**
+   * Creates a new instance of CharTokenizer using a custom predicate, supplied as method reference or lambda expression.
+   * The predicate should return {@code true} for all valid token separator characters.
+   * This factory also takes a function to normalize chars, e.g., lowercasing them, supplied as method reference or lambda expression.
+   * <p>
+   * This factory is intended to be used with lambdas or method references. E.g., an elegant way
+   * to create an instance which behaves exactly as the combination of {@link WhitespaceTokenizer} and {@link LowerCaseFilter} is:
+   * <pre class="prettyprint lang-java">
+   * Tokenizer tok = CharTokenizer.fromSeparatorCharPredicate(Character::isWhitespace, Character::toLowerCase);
+   * </pre>
+   */
+  public static CharTokenizer fromSeparatorCharPredicate(final IntPredicate separatorCharPredicate, final IntUnaryOperator normalizer) {
+    return fromSeparatorCharPredicate(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, separatorCharPredicate, normalizer);
+  }
+  
+  /**
+   * Creates a new instance of CharTokenizer with the supplied attribute factory using a custom predicate, supplied as method reference or lambda expression.
+   * The predicate should return {@code true} for all valid token separator characters.
+   * This factory also takes a function to normalize chars, e.g., lowercasing them, supplied as method reference or lambda expression.
+   * <p>
+   * This factory is intended to be used with lambdas or method references. E.g., an elegant way
+   * to create an instance which behaves exactly as the combination of {@link WhitespaceTokenizer} and {@link LowerCaseFilter} is:
+   * <pre class="prettyprint lang-java">
+   * Tokenizer tok = CharTokenizer.fromSeparatorCharPredicate(factory, Character::isWhitespace, Character::toLowerCase);
+   * </pre>
+   */
+  public static CharTokenizer fromSeparatorCharPredicate(AttributeFactory factory, final IntPredicate separatorCharPredicate, final IntUnaryOperator normalizer) {
+    return fromTokenCharPredicate(factory, separatorCharPredicate.negate(), normalizer);
+  }
+  
   private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;
   private static final int MAX_WORD_LEN = 255;
   private static final int IO_BUFFER_SIZE = 4096;

Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java?rev=1712682&r1=1712681&r2=1712682&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java Wed Nov  4 22:44:17 2015
@@ -182,4 +182,26 @@ public class TestCharTokenizers extends
     checkRandomData(random(), analyzer, num);
     analyzer.close();
   }
+  
+  public void testDefinitionUsingMethodReference1() throws Exception {
+    final StringReader reader = new StringReader("Tokenizer Test");
+    final Tokenizer tokenizer = CharTokenizer.fromSeparatorCharPredicate(Character::isWhitespace);
+    tokenizer.setReader(reader);
+    assertTokenStreamContents(tokenizer, new String[] { "Tokenizer", "Test" });
+  }
+  
+  public void testDefinitionUsingMethodReference2() throws Exception {
+    final StringReader reader = new StringReader("Tokenizer(Test)");
+    final Tokenizer tokenizer = CharTokenizer.fromTokenCharPredicate(Character::isLetter, Character::toUpperCase);
+    tokenizer.setReader(reader);
+    assertTokenStreamContents(tokenizer, new String[] { "TOKENIZER", "TEST" });
+  }
+  
+  public void testDefinitionUsingLambda() throws Exception {
+    final StringReader reader = new StringReader("Tokenizer\u00A0Test Foo");
+    final Tokenizer tokenizer = CharTokenizer.fromSeparatorCharPredicate(c -> c == '\u00A0' || Character.isWhitespace(c), Character::toLowerCase);
+    tokenizer.setReader(reader);
+    assertTokenStreamContents(tokenizer, new String[] { "tokenizer", "test", "foo" });
+  }
+  
 }