You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by mi...@apache.org on 2009/06/16 18:38:39 UTC
svn commit: r785287 - in /lucene/java/trunk/contrib/analyzers/src:
java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
test/org/apache/lucene/analysis/cjk/
test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java
Author: mikemccand
Date: Tue Jun 16 16:38:39 2009
New Revision: 785287
URL: http://svn.apache.org/viewvc?rev=785287&view=rev
Log:
LUCENE-973: add test case for CJKAnalyzer; fix trailing empty string bug
Added:
lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/cjk/
lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java (with props)
Modified:
lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java?rev=785287&r1=785286&r2=785287&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java Tue Jun 16 16:38:39 2009
@@ -37,7 +37,18 @@
*/
public final class CJKTokenizer extends Tokenizer {
//~ Static fields/initializers ---------------------------------------------
-
+ /** Word token type */
+ static final int WORD_TYPE = 0;
+
+ /** Single byte token type */
+ static final int SINGLE_TOKEN_TYPE = 1;
+
+ /** Double byte token type */
+ static final int DOUBLE_TOKEN_TYPE = 2;
+
+ /** Names for token types */
+ static final String[] TOKEN_TYPE_NAMES = { "word", "single", "double" };
+
/** Max word length */
private static final int MAX_WORD_LEN = 255;
@@ -68,7 +79,7 @@
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
/** word type: single=>ASCII double=>non-ASCII word=>default */
- private String tokenType = "word";
+ private int tokenType = WORD_TYPE;
/**
* tag: previous character is a cached double-byte character "C1C2C3C4"
@@ -105,12 +116,15 @@
public final Token next(final Token reusableToken) throws java.io.IOException {
/** how many character(s) has been stored in buffer */
assert reusableToken != null;
- int length = 0;
- /** the position used to create Token */
- int start = offset;
+ while(true) { // loop until we find a non-empty token
+
+ int length = 0;
+
+ /** the position used to create Token */
+ int start = offset;
- while (true) {
+ while (true) { // loop until we've found a full token
/** current character */
char c;
@@ -150,7 +164,7 @@
if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
int i = (int) c;
if (i >= 65281 && i <= 65374) {
- /** convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */
+ // convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN
i = i - 65248;
c = (char) i;
}
@@ -165,19 +179,17 @@
// ^--: the current character begin to token the ASCII
// letter
start = offset - 1;
- } else if (tokenType == "double") {
+ } else if (tokenType == DOUBLE_TOKEN_TYPE) {
// "javaC1C2C3C4linux" <br>
// ^--: the previous non-ASCII
// : the current character
offset--;
bufferIndex--;
- tokenType = "single";
if (preIsTokened == true) {
// there is only one non-ASCII has been stored
length = 0;
preIsTokened = false;
-
break;
} else {
break;
@@ -186,7 +198,7 @@
// store the LowerCase(c) in the buffer
buffer[length++] = Character.toLowerCase(c);
- tokenType = "single";
+ tokenType = SINGLE_TOKEN_TYPE;
// break the procedure if buffer overflowed!
if (length == MAX_WORD_LEN) {
@@ -206,9 +218,9 @@
if (length == 0) {
start = offset - 1;
buffer[length++] = c;
- tokenType = "double";
+ tokenType = DOUBLE_TOKEN_TYPE;
} else {
- if (tokenType == "single") {
+ if (tokenType == SINGLE_TOKEN_TYPE) {
offset--;
bufferIndex--;
@@ -216,7 +228,7 @@
break;
} else {
buffer[length++] = c;
- tokenType = "double";
+ tokenType = DOUBLE_TOKEN_TYPE;
if (length == 2) {
offset--;
@@ -238,7 +250,16 @@
}
}
}
+
+ if (length > 0) {
+ return reusableToken.reinit
+ (buffer, 0, length, start, start+length, TOKEN_TYPE_NAMES[tokenType]);
+ } else if (dataLen == -1) {
+ return null;
+ }
- return reusableToken.reinit(buffer, 0, length, start, start+length, tokenType);
+ // Cycle back and try for the next token (don't
+ // return an empty string)
+ }
}
}
Added: lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java?rev=785287&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java Tue Jun 16 16:38:39 2009
@@ -0,0 +1,189 @@
+package org.apache.lucene.analysis.cjk;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import junit.framework.TestCase;
+import org.apache.lucene.analysis.Token;
+
+/**
+ * Tests that {@link CJKTokenizer} emits the expected terms, offsets and token
+ * types for CJK, ASCII and mixed input.
+ */
+public class TestCJKTokenizer extends TestCase{
+
+  /**
+   * Builds an expected token to compare against the tokenizer's output.
+   *
+   * @param termText expected term text
+   * @param start expected start offset
+   * @param end expected end offset
+   * @param type index into {@link CJKTokenizer#TOKEN_TYPE_NAMES}
+   */
+  public Token newToken(String termText, int start, int end, int type) {
+    Token token = new Token(start, end);
+    token.setTermBuffer(termText);
+    token.setType(CJKTokenizer.TOKEN_TYPE_NAMES[type]);
+    return token;
+  }
+
+  /**
+   * Tokenizes str and fails unless the tokenizer produces exactly out_tokens:
+   * the same number of tokens, each with the same term, start offset, end
+   * offset and type. The input and the matched terms are echoed to System.out.
+   *
+   * @param str text to tokenize
+   * @param out_tokens expected tokens, in order
+   * @throws IOException if the tokenizer fails to read the input
+   */
+  public void checkCJKToken(final String str, final Token[] out_tokens) throws IOException {
+    CJKTokenizer tokenizer = new CJKTokenizer(new StringReader(str));
+    int i = 0;
+    System.out.println("string[" + str + "]");
+    System.out.print("tokens[");
+    final Token reusableToken = new Token();
+    for (Token token = tokenizer.next(reusableToken) ;
+         token != null ;
+         token = tokenizer.next(reusableToken) ) {
+      // A surplus token must fail the test cleanly instead of indexing past
+      // the end of out_tokens.
+      if (i >= out_tokens.length) {
+        fail("unexpected extra token: " + token.term() + " (start: " + token.startOffset()
+            + " end: " + token.endOffset() + " type: " + token.type() + ")");
+      }
+      if (token.term().equals(out_tokens[i].term())
+          && token.startOffset() == out_tokens[i].startOffset()
+          && token.endOffset() == out_tokens[i].endOffset()
+          && token.type().equals(out_tokens[i].type()) ) {
+        System.out.print( token.term() + " ");
+      }
+      else {
+        fail(token.term() + " (start: " + token.startOffset()
+            + " end: " + token.endOffset() + " type: " + token.type() + ") != "
+            + out_tokens[i].term() + " (start: " + out_tokens[i].startOffset()
+            + " end: " + out_tokens[i].endOffset()
+            + " type: " + out_tokens[i].type() + ")");
+      }
+      ++i;
+    }
+    System.out.println("]" + System.getProperty("line.separator"));
+    // The loop ends as soon as the tokenizer returns null, so a tokenizer that
+    // emits too few tokens would otherwise pass; require every expected token.
+    assertEquals("token count for [" + str + "]", out_tokens.length, i);
+  }
+
+  /** A run of ten CJK characters yields nine overlapping two-character tokens. */
+  public void testJa1() throws IOException {
+    String str = "\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341";
+
+    Token[] out_tokens = {
+      newToken("\u4e00\u4e8c", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u4e8c\u4e09", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u4e09\u56db", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u56db\u4e94", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u4e94\u516d", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u516d\u4e03", 5, 7, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u4e03\u516b", 6, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u516b\u4e5d", 7, 9, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u4e5d\u5341", 8,10, CJKTokenizer.DOUBLE_TOKEN_TYPE)
+    };
+    checkCJKToken(str, out_tokens);
+  }
+
+  /** Space-separated CJK runs: pairs never span a space; a lone character is its own token. */
+  public void testJa2() throws IOException {
+    String str = "\u4e00 \u4e8c\u4e09\u56db \u4e94\u516d\u4e03\u516b\u4e5d \u5341";
+
+    Token[] out_tokens = {
+      newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u4e8c\u4e09", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u4e09\u56db", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u4e94\u516d", 6, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u516d\u4e03", 7, 9, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u4e03\u516b", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u516b\u4e5d", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u5341", 12,13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
+    };
+    checkCJKToken(str, out_tokens);
+  }
+
+  /** Space-separated ASCII words each come back as one "single" token. */
+  public void testC() throws IOException {
+    String str = "abc defgh ijklmn opqrstu vwxy z";
+
+    Token[] out_tokens = {
+      newToken("abc", 0, 3, CJKTokenizer.SINGLE_TOKEN_TYPE),
+      newToken("defgh", 4, 9, CJKTokenizer.SINGLE_TOKEN_TYPE),
+      newToken("ijklmn", 10, 16, CJKTokenizer.SINGLE_TOKEN_TYPE),
+      newToken("opqrstu", 17, 24, CJKTokenizer.SINGLE_TOKEN_TYPE),
+      newToken("vwxy", 25, 29, CJKTokenizer.SINGLE_TOKEN_TYPE),
+      newToken("z", 30, 31, CJKTokenizer.SINGLE_TOKEN_TYPE),
+    };
+    checkCJKToken(str, out_tokens);
+  }
+
+  /** An ASCII word between two kana runs is its own token; pairs do not cross into it. */
+  public void testMix() throws IOException {
+    String str = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053";
+
+    Token[] out_tokens = {
+      newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("abc", 5, 8, CJKTokenizer.SINGLE_TOKEN_TYPE),
+      newToken("\u304b\u304d", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u304d\u304f", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u304f\u3051", 10,12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u3051\u3053", 11,13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
+    };
+    checkCJKToken(str, out_tokens);
+  }
+
+  /** Isolated kana and ASCII characters between script switches become one-character tokens. */
+  public void testMix2() throws IOException {
+    String str = "\u3042\u3044\u3046\u3048\u304aab\u3093c\u304b\u304d\u304f\u3051 \u3053";
+
+    Token[] out_tokens = {
+      newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("ab", 5, 7, CJKTokenizer.SINGLE_TOKEN_TYPE),
+      newToken("\u3093", 7, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("c", 8, 9, CJKTokenizer.SINGLE_TOKEN_TYPE),
+      newToken("\u304b\u304d", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u304d\u304f", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u304f\u3051", 11,13, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u3053", 14,15, CJKTokenizer.DOUBLE_TOKEN_TYPE)
+    };
+    checkCJKToken(str, out_tokens);
+  }
+
+  /** Input consisting of a single CJK character yields exactly one one-character token. */
+  public void testSingleChar() throws IOException {
+    String str = "\u4e00";
+
+    Token[] out_tokens = {
+      newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+    };
+    checkCJKToken(str, out_tokens);
+  }
+}
Propchange: lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java
------------------------------------------------------------------------------
svn:eol-style = native