You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by ot...@apache.org on 2006/12/23 00:43:18 UTC
svn commit: r489802 - in /lucene/java/trunk: ./
contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/
contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/
Author: otis
Date: Fri Dec 22 15:43:17 2006
New Revision: 489802
URL: http://svn.apache.org/viewvc?view=rev&rev=489802
Log:
- LUCENE-759: New n-gram-capable tokenizers and their unit tests.
Added:
lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/
lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/
lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java
lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
Modified:
lucene/java/trunk/CHANGES.txt
Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?view=diff&rev=489802&r1=489801&r2=489802
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Fri Dec 22 15:43:17 2006
@@ -91,6 +91,10 @@
and lives in contrib/miscellaneous.
(Chris Hostetter, Otis Gospodnetic)
+12. LUCENE-759: Added NGramTokenizer and EdgeNGramTokenizer classes and
+ their passing unit tests.
+ (Otis Gospodnetic)
+
API Changes
1. LUCENE-438: Remove "final" from Token, implement Cloneable, allow
Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java?view=auto&rev=489802
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java Fri Dec 22 15:43:17 2006
@@ -0,0 +1,148 @@
+package org.apache.lucene.analysis.ngram;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Tokenizer;
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * Tokenizes the input into a single n-gram of the given size, taken from
+ * either the front or the back edge of the input.
+ * <p>
+ * Only the first 1024 characters of the input are read, and leading and
+ * trailing whitespace is trimmed before the n-gram is cut.
+ * @author Otis Gospodnetic
+ */
+public class EdgeNGramTokenizer extends Tokenizer {
+  /** Maximum number of characters read from the input. */
+  private static final int BUFFER_SIZE = 1024;
+
+  // TODO: switch to a real enum when we move to 1.5+
+  /** Identifies the side of the input from which the n-gram is taken. */
+  public static class Side {
+    /** Take the n-gram from the beginning of the input. */
+    public static final Side FRONT = new Side("front");
+    /** Take the n-gram from the end of the input. */
+    public static final Side BACK = new Side("back");
+
+    private final String label;
+
+    private Side(String label) {
+      this.label = label;
+    }
+
+    /** Returns the label of this side, either "front" or "back". */
+    public String toString() {
+      return label;
+    }
+  }
+
+  private int gramSize;
+  private Side side;
+  private int inLen;
+  private String inStr;
+  private boolean started = false;
+
+  /**
+   * Creates EdgeNGramTokenizer that generates a single n-gram of the given size.
+   * @param input Reader holding the input to be tokenized
+   * @param side the {@link Side} from which to chop off an n-gram
+   * @param gramSize the n-gram size to generate
+   * @throws IllegalArgumentException if side is null or gramSize is less than one
+   */
+  public EdgeNGramTokenizer(Reader input, Side side, int gramSize) {
+    super(input);
+    if (side == null) {
+      throw new IllegalArgumentException("side must not be null");
+    }
+    if (gramSize < 1) {
+      throw new IllegalArgumentException("gramSize must be greater than zero");
+    }
+    this.gramSize = gramSize;
+    this.side = side;
+  }
+
+  /**
+   * Creates EdgeNGramTokenizer that generates a single n-gram of the given size.
+   * @param input Reader holding the input to be tokenized
+   * @param side the label of the side, either "front" or "back"
+   * @param gramSize the n-gram size to generate
+   * @throws IllegalArgumentException if side is not a known label or gramSize is less than one
+   */
+  public EdgeNGramTokenizer(Reader input, String side, int gramSize) {
+    // delegate so that input, side and gramSize are actually initialized
+    this(input, side(side), gramSize);
+  }
+
+  /**
+   * Returns the edge n-gram on the first call and null on every later call.
+   * Null is also returned when the input is shorter than the n-gram size.
+   */
+  public final Token next() throws IOException {
+    // the single edge n-gram is only ever produced by the first call
+    if (started) {
+      return null;
+    }
+    started = true;
+    inStr = readInput().trim();
+    inLen = inStr.length();
+    // if the input is too short, we can't generate any n-grams
+    if (gramSize > inLen) {
+      return null;
+    }
+    if (side == Side.FRONT) {
+      return new Token(inStr.substring(0, gramSize), 0, gramSize);
+    }
+    return new Token(inStr.substring(inLen - gramSize), inLen - gramSize, inLen);
+  }
+
+  /**
+   * Reads up to BUFFER_SIZE characters from the input. The read is repeated
+   * because a single call may return fewer characters than are available.
+   */
+  private String readInput() throws IOException {
+    char[] chars = new char[BUFFER_SIZE];
+    int total = 0;
+    while (total < chars.length) {
+      int count = input.read(chars, total, chars.length - total);
+      if (count == -1) { // end of input
+        break;
+      }
+      total += count;
+    }
+    return new String(chars, 0, total);
+  }
+
+  /**
+   * Returns the Side for the given label.
+   * @param label either "front" or "back"
+   * @throws IllegalArgumentException if label is null or any other value
+   */
+  static Side side(String label) {
+    if ("front".equals(label)) {
+      return Side.FRONT;
+    }
+    if ("back".equals(label)) {
+      return Side.BACK;
+    }
+    throw new IllegalArgumentException("Label must be either 'front' or 'back'");
+  }
+}
Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java?view=auto&rev=489802
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java Fri Dec 22 15:43:17 2006
@@ -0,0 +1,122 @@
+package org.apache.lucene.analysis.ngram;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Tokenizer;
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * Tokenizes the input into n-grams of given size(s).
+ * <p>
+ * All n-grams of the smallest size are returned first, followed by the
+ * n-grams of each larger size up to the maximum. Only the first 1024
+ * characters of the input are read, and leading and trailing whitespace is
+ * trimmed before the n-grams are cut.
+ * @author Otis Gospodnetic
+ */
+public class NGramTokenizer extends Tokenizer {
+  public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
+  public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
+
+  /** Maximum number of characters read from the input. */
+  private static final int BUFFER_SIZE = 1024;
+
+  private int minGram, maxGram;
+  private int gramSize;
+  private int pos = 0;
+  private int inLen;
+  private String inStr;
+  private boolean started = false;
+
+  /**
+   * Creates NGramTokenizer with given min and max n-grams.
+   * @param input Reader holding the input to be tokenized
+   * @param minGram the smallest n-gram to generate
+   * @param maxGram the largest n-gram to generate
+   * @throws IllegalArgumentException if minGram is less than one or greater than maxGram
+   */
+  public NGramTokenizer(Reader input, int minGram, int maxGram) {
+    super(input);
+    if (minGram < 1) {
+      throw new IllegalArgumentException("minGram must be greater than zero");
+    }
+    if (minGram > maxGram) {
+      throw new IllegalArgumentException("minGram must not be greater than maxGram");
+    }
+    this.minGram = minGram;
+    this.maxGram = maxGram;
+  }
+
+  /**
+   * Creates NGramTokenizer with default min and max n-grams.
+   * @param input Reader holding the input to be tokenized
+   */
+  public NGramTokenizer(Reader input) {
+    this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
+  }
+
+  /** Returns the next token in the stream, or null at EOS. */
+  public final Token next() throws IOException {
+    if (!started) {
+      started = true;
+      gramSize = minGram;
+      inStr = readInput().trim();
+      inLen = inStr.length();
+    }
+
+    // once every size has been produced, stay at EOS on repeated calls;
+    // without this check a call after EOS would emit (maxGram+1)-grams
+    if (gramSize > maxGram) {
+      return null;
+    }
+    if (pos + gramSize > inLen) { // if we hit the end of the string
+      pos = 0; // reset to beginning of string
+      gramSize++; // increase n-gram size
+      if (gramSize > maxGram) { // we are done
+        return null;
+      }
+      if (gramSize > inLen) { // the input is too short for this size
+        return null;
+      }
+    }
+    String gram = inStr.substring(pos, pos + gramSize);
+    int oldPos = pos;
+    pos++;
+    return new Token(gram, oldPos, oldPos + gramSize);
+  }
+
+  /**
+   * Reads up to BUFFER_SIZE characters from the input. The read is repeated
+   * because a single call may return fewer characters than are available.
+   */
+  private String readInput() throws IOException {
+    char[] chars = new char[BUFFER_SIZE];
+    int total = 0;
+    while (total < chars.length) {
+      int count = input.read(chars, total, chars.length - total);
+      if (count == -1) { // end of input
+        break;
+      }
+      total += count;
+    }
+    return new String(chars, 0, total);
+  }
+}
Added: lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java?view=auto&rev=489802
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java Fri Dec 22 15:43:17 2006
@@ -0,0 +1,74 @@
+package org.apache.lucene.analysis.ngram;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Token;
+
+import java.io.StringReader;
+
+import junit.framework.TestCase;
+
+/**
+ * Verifies that {@link EdgeNGramTokenizer} produces the expected edge n-grams.
+ * @author Otis Gospodnetic
+ */
+public class EdgeNGramTokenizerTest extends TestCase {
+  private StringReader input;
+
+  public void setUp() {
+    input = new StringReader("abcde");
+  }
+
+  public void testInvalidInput() throws Exception {
+    assertTrue(rejectsGramSize(0));
+  }
+
+  public void testInvalidInput2() throws Exception {
+    assertTrue(rejectsGramSize(-1));
+  }
+
+  public void testFrontUnigram() throws Exception {
+    assertSingleToken(EdgeNGramTokenizer.Side.FRONT, "(a,0,1)");
+  }
+
+  public void testBackUnigram() throws Exception {
+    assertSingleToken(EdgeNGramTokenizer.Side.BACK, "(e,4,5)");
+  }
+
+  public void testOversizedNgrams() throws Exception {
+    EdgeNGramTokenizer oversized = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 6);
+    assertNull(oversized.next());
+  }
+
+  /** Reports whether constructing a front tokenizer with the given size is refused. */
+  private boolean rejectsGramSize(int gramSize) {
+    try {
+      new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, gramSize);
+      return false;
+    } catch (IllegalArgumentException expected) {
+      return true;
+    }
+  }
+
+  /** Checks that a unigram tokenizer yields exactly the expected token and then EOS. */
+  private void assertSingleToken(EdgeNGramTokenizer.Side side, String expected) throws Exception {
+    EdgeNGramTokenizer unigrams = new EdgeNGramTokenizer(input, side, 1);
+    assertEquals(expected, unigrams.next().toString());
+    assertNull(unigrams.next());
+  }
+}
Added: lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java?view=auto&rev=489802
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java Fri Dec 22 15:43:17 2006
@@ -0,0 +1,107 @@
+package org.apache.lucene.analysis.ngram;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Token;
+
+import java.io.StringReader;
+import java.util.ArrayList;
+
+import junit.framework.TestCase;
+
+/**
+ * Verifies that {@link NGramTokenizer} produces the expected n-grams in order.
+ * @author Otis Gospodnetic
+ */
+public class NGramTokenizerTest extends TestCase {
+  private StringReader input;
+  private ArrayList tokens = new ArrayList();
+
+  public void setUp() {
+    input = new StringReader("abcde");
+  }
+
+  public void testInvalidInput() throws Exception {
+    assertTrue(rejectsSizes(2, 1));
+  }
+
+  public void testInvalidInput2() throws Exception {
+    assertTrue(rejectsSizes(0, 1));
+  }
+
+  public void testUnigrams() throws Exception {
+    collectTokens(new NGramTokenizer(input, 1, 1));
+
+    assertEquals(5, tokens.size());
+    assertEquals(expected(new String[] {
+      "(a,0,1)", "(b,1,2)", "(c,2,3)", "(d,3,4)", "(e,4,5)"
+    }), tokens);
+  }
+
+  public void testBigrams() throws Exception {
+    collectTokens(new NGramTokenizer(input, 2, 2));
+
+    assertEquals(4, tokens.size());
+    assertEquals(expected(new String[] {
+      "(ab,0,2)", "(bc,1,3)", "(cd,2,4)", "(de,3,5)"
+    }), tokens);
+  }
+
+  public void testNgrams() throws Exception {
+    collectTokens(new NGramTokenizer(input, 1, 3));
+
+    assertEquals(12, tokens.size());
+    assertEquals(expected(new String[] {
+      "(a,0,1)", "(b,1,2)", "(c,2,3)", "(d,3,4)", "(e,4,5)",
+      "(ab,0,2)", "(bc,1,3)", "(cd,2,4)", "(de,3,5)",
+      "(abc,0,3)", "(bcd,1,4)", "(cde,2,5)"
+    }), tokens);
+  }
+
+  public void testOversizedNgrams() throws Exception {
+    collectTokens(new NGramTokenizer(input, 6, 7));
+
+    assertTrue(tokens.isEmpty());
+  }
+
+  /** Reports whether constructing a tokenizer with the given sizes is refused. */
+  private boolean rejectsSizes(int minGram, int maxGram) {
+    try {
+      new NGramTokenizer(input, minGram, maxGram);
+      return false;
+    } catch (IllegalArgumentException expected) {
+      return true;
+    }
+  }
+
+  /** Drains the tokenizer, appending the string form of every token to {@link #tokens}. */
+  private void collectTokens(NGramTokenizer tokenizer) throws Exception {
+    for (Token token = tokenizer.next(); token != null; token = tokenizer.next()) {
+      tokens.add(token.toString());
+    }
+  }
+
+  /** Builds the list of expected token strings from the given array. */
+  private ArrayList expected(String[] items) {
+    ArrayList list = new ArrayList();
+    for (int i = 0; i < items.length; i++) {
+      list.add(items[i]);
+    }
+    return list;
+  }
+}