You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by ot...@apache.org on 2007/03/01 15:22:58 UTC
svn commit: r513344 - in /lucene/java/trunk/contrib/analyzers/src:
java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java
Author: otis
Date: Thu Mar 1 06:22:57 2007
New Revision: 513344
URL: http://svn.apache.org/viewvc?view=rev&rev=513344
Log:
- LUCENE-759: Made the tokenizer capable of creating n-grams of varying sizes - from min to max characters per n-gram. Patch from Adam Hiatt.
Modified:
lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java?view=diff&rev=513344&r1=513343&r2=513344
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java Thu Mar 1 06:22:57 2007
@@ -24,75 +24,123 @@
import java.io.Reader;
/**
- * Tokenizes the input into n-grams of the given size.
+ * Tokenizes the input from an edge into n-grams of given size(s).
* @author Otis Gospodnetic
+ * @author Adam Hiatt
*/
public class EdgeNGramTokenizer extends Tokenizer {
- // which side to get the n-gram from
- // TODO: switch to using this enum when we move to 1.5+
-// public enum Side {
-// FRONT (),
-// BACK ();
-// }
+ public static final Side DEFAULT_SIDE = Side.FRONT;
+ public static final int DEFAULT_MAX_GRAM_SIZE = 1;
+ public static final int DEFAULT_MIN_GRAM_SIZE = 1;
+
+ // Replace this with an enum when the Java 1.5 upgrade is made, the impl will be simplified
/** Specifies which side of the input the n-gram should be generated from */
public static class Side {
+ private String label;
+
/** Get the n-gram from the front of the input */
public static Side FRONT = new Side("front");
+
/** Get the n-gram from the end of the input */
public static Side BACK = new Side("back");
- private Side(String label) {}
+
+ // Private ctor
+ private Side(String label) { this.label = label; }
+
+
+ public String getLabel() { return label; }
+
+ // Get the appropriate Side from a string
+ public static Side getSide(String sideName) {
+ if (FRONT.getLabel().equals(sideName)) {
+ return FRONT;
+ }
+ else if (BACK.getLabel().equals(sideName)) {
+ return BACK;
+ }
+ return null;
+ }
}
+
+ private int minGram;
+ private int maxGram;
private int gramSize;
private Side side;
+ private boolean started = false;
private int inLen;
private String inStr;
- private boolean started = false;
+
/**
- * Creates EdgeNGramTokenizer that can generate an n-gram of the given size.
+ * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+ *
* @param input Reader holding the input to be tokenized
- * @param side the {@link Side} from which to chop off an n-gram
- * @param gramSize the size of the n-gram to generate
+ * @param side the {@link Side} from which to chop off an n-gram
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
*/
- public EdgeNGramTokenizer(Reader input, Side side, int gramSize) {
+ public EdgeNGramTokenizer(Reader input, Side side, int minGram, int maxGram) {
super(input);
- if (gramSize < 1) {
- throw new IllegalArgumentException("gramSize must be greater than zero");
+
+ if (side == null) {
+ throw new IllegalArgumentException("sideLabel must be either front or back");
+ }
+
+ if (minGram < 1) {
+ throw new IllegalArgumentException("minGram must be greater than zero");
}
- this.gramSize = gramSize;
+
+ if (minGram > maxGram) {
+ throw new IllegalArgumentException("minGram must not be greater than maxGram");
+ }
+
+ this.minGram = minGram;
+ this.maxGram = maxGram;
this.side = side;
}
- public EdgeNGramTokenizer(Reader input, String side, int gramSize) {
-
+ /**
+ * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+ *
+ * @param input Reader holding the input to be tokenized
+ * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ */
+ public EdgeNGramTokenizer(Reader input, String sideLabel, int minGram, int maxGram) {
+ this(input, Side.getSide(sideLabel), minGram, maxGram);
}
/** Returns the next token in the stream, or null at EOS. */
public final Token next() throws IOException {
- // if we already returned the edge n-gram, we are done
- if (started)
- return null;
+ // if we are just starting, read the whole input
if (!started) {
started = true;
char[] chars = new char[1024];
input.read(chars);
- inStr = new String(chars).trim(); // remove any trailing empty strings
+ inStr = new String(chars).trim(); // remove any trailing empty strings
inLen = inStr.length();
+ gramSize = minGram;
+ }
+
+ // if the remaining input is too short, we can't generate any n-grams
+ if (gramSize > inLen) {
+ return null;
}
- // if the input is too short, we can't generate any n-grams
- if (gramSize > inLen)
+
+ // if we have hit the end of our n-gram size range, quit
+ if (gramSize > maxGram) {
return null;
- if (side == Side.FRONT)
- return new Token(inStr.substring(0, gramSize), 0, gramSize);
- else
- return new Token(inStr.substring(inLen-gramSize), inLen-gramSize, inLen);
- }
+ }
+
+ Token tok;
+ if (side == Side.FRONT) {
+ tok = new Token(inStr.substring(0, gramSize), 0, gramSize);
+ }
+ else {
+ tok = new Token(inStr.substring(inLen-gramSize), inLen-gramSize, inLen);
+ }
- static Side side(String label) {
- if (label == null || label.trim().length() == 0)
- throw new IllegalArgumentException("Label must be either 'front' or 'back'");
- if (label.equals("front"))
- return Side.FRONT;
- else
- return Side.BACK;
+ gramSize++;
+ return tok;
}
}
Modified: lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java?view=diff&rev=513344&r1=513343&r2=513344
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java Thu Mar 1 06:22:57 2007
@@ -28,54 +28,90 @@
* @author Otis Gospodnetic
*/
public class EdgeNGramTokenizerTest extends TestCase {
- private StringReader input;
-
- public void setUp() {
- input = new StringReader("abcde");
- }
+ private StringReader input;
- public void testInvalidInput() throws Exception {
- boolean gotException = false;
- try {
- new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 0);
- } catch (IllegalArgumentException e) {
- gotException = true;
- }
- assertTrue(gotException);
- }
+ public void setUp() {
+ input = new StringReader("abcde");
+ }
- public void testInvalidInput2() throws Exception {
- boolean gotException = false;
- try {
- new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, -1);
- } catch (IllegalArgumentException e) {
- gotException = true;
- }
- assertTrue(gotException);
+ public void testInvalidInput() throws Exception {
+ boolean gotException = false;
+ try {
+ new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 0, 0);
+ } catch (IllegalArgumentException e) {
+ gotException = true;
}
+ assertTrue(gotException);
+ }
- public void testFrontUnigram() throws Exception {
- EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1);
- Token token = null;
- token = tokenizer.next();
- assertEquals("(a,0,1)", token.toString());
- token = tokenizer.next();
- assertNull(token);
+ public void testInvalidInput2() throws Exception {
+ boolean gotException = false;
+ try {
+ new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 2, 1);
+ } catch (IllegalArgumentException e) {
+ gotException = true;
}
+ assertTrue(gotException);
+ }
- public void testBackUnigram() throws Exception {
- EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1);
- Token token = null;
- token = tokenizer.next();
- assertEquals("(e,4,5)", token.toString());
- token = tokenizer.next();
- assertNull(token);
+ public void testInvalidInput3() throws Exception {
+ boolean gotException = false;
+ try {
+ new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, -1, 2);
+ } catch (IllegalArgumentException e) {
+ gotException = true;
}
+ assertTrue(gotException);
+ }
- public void testOversizedNgrams() throws Exception {
- EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 6);
- Token token = null;
- token = tokenizer.next();
- assertNull(token);
- }
+ public void testFrontUnigram() throws Exception {
+ EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1, 1);
+ Token token = null;
+ token = tokenizer.next();
+ assertEquals("(a,0,1)", token.toString());
+ token = tokenizer.next();
+ assertNull(token);
+ }
+
+ public void testBackUnigram() throws Exception {
+ EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1, 1);
+ Token token = null;
+ token = tokenizer.next();
+ assertEquals("(e,4,5)", token.toString());
+ token = tokenizer.next();
+ assertNull(token);
+ }
+
+ public void testOversizedNgrams() throws Exception {
+ EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 6, 6);
+ Token token = null;
+ token = tokenizer.next();
+ assertNull(token);
+ }
+
+ public void testFrontRangeOfNgrams() throws Exception {
+ EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1, 3);
+ Token token = null;
+ token = tokenizer.next();
+ assertEquals("(a,0,1)", token.toString());
+ token = tokenizer.next();
+ assertEquals("(ab,0,2)", token.toString());
+ token = tokenizer.next();
+ assertEquals("(abc,0,3)", token.toString());
+ token = tokenizer.next();
+ assertNull(token);
+ }
+
+ public void testBackRangeOfNgrams() throws Exception {
+ EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1, 3);
+ Token token = null;
+ token = tokenizer.next();
+ assertEquals("(e,4,5)", token.toString());
+ token = tokenizer.next();
+ assertEquals("(de,3,5)", token.toString());
+ token = tokenizer.next();
+ assertEquals("(cde,2,5)", token.toString());
+ token = tokenizer.next();
+ assertNull(token);
+ }
}