You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by mi...@apache.org on 2009/06/16 18:38:39 UTC

svn commit: r785287 - in /lucene/java/trunk/contrib/analyzers/src: java/org/apache/lucene/analysis/cjk/CJKTokenizer.java test/org/apache/lucene/analysis/cjk/ test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java

Author: mikemccand
Date: Tue Jun 16 16:38:39 2009
New Revision: 785287

URL: http://svn.apache.org/viewvc?rev=785287&view=rev
Log:
LUCENE-973: add test case for CJKAnalyzer; fix trailing empty string bug

Added:
    lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/cjk/
    lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java   (with props)
Modified:
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java

Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java?rev=785287&r1=785286&r2=785287&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java Tue Jun 16 16:38:39 2009
@@ -37,7 +37,18 @@
  */
 public final class CJKTokenizer extends Tokenizer {
     //~ Static fields/initializers ---------------------------------------------
-
+    /** Word token type */
+    static final int WORD_TYPE = 0;
+  
+    /** Single byte token type */
+    static final int SINGLE_TOKEN_TYPE = 1;
+
+    /** Double byte token type */
+    static final int DOUBLE_TOKEN_TYPE = 2;
+  
+    /** Names for token types */
+    static final String[] TOKEN_TYPE_NAMES = { "word", "single", "double" };
+  
     /** Max word length */
     private static final int MAX_WORD_LEN = 255;
 
@@ -68,7 +79,7 @@
     private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
 
     /** word type: single=>ASCII  double=>non-ASCII word=>default */
-    private String tokenType = "word";
+    private int tokenType = WORD_TYPE;
 
     /**
      * tag: previous character is a cached double-byte character  "C1C2C3C4"
@@ -105,12 +116,15 @@
     public final Token next(final Token reusableToken) throws java.io.IOException {
         /** how many character(s) has been stored in buffer */
         assert reusableToken != null;
-        int length = 0;
 
-        /** the position used to create Token */
-        int start = offset;
+        while(true) { // loop until we find a non-empty token
+
+          int length = 0;
+
+          /** the position used to create Token */
+          int start = offset;
 
-        while (true) {
+          while (true) { // loop until we've found a full token
             /** current character */
             char c;
 
@@ -150,7 +164,7 @@
                 if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
                   int i = (int) c;
                   if (i >= 65281 && i <= 65374) {
-                    /** convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */
+                    // convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN
                     i = i - 65248;
                     c = (char) i;
                   }
@@ -165,19 +179,17 @@
                         //      ^--: the current character begin to token the ASCII
                         // letter
                         start = offset - 1;
-                    } else if (tokenType == "double") {
+                    } else if (tokenType == DOUBLE_TOKEN_TYPE) {
                         // "javaC1C2C3C4linux" <br>
                         //              ^--: the previous non-ASCII
                         // : the current character
                         offset--;
                         bufferIndex--;
-                        tokenType = "single";
 
                         if (preIsTokened == true) {
                             // there is only one non-ASCII has been stored
                             length = 0;
                             preIsTokened = false;
-
                             break;
                         } else {
                             break;
@@ -186,7 +198,7 @@
 
                     // store the LowerCase(c) in the buffer
                     buffer[length++] = Character.toLowerCase(c);
-                    tokenType = "single";
+                    tokenType = SINGLE_TOKEN_TYPE;
 
                     // break the procedure if buffer overflowed!
                     if (length == MAX_WORD_LEN) {
@@ -206,9 +218,9 @@
                     if (length == 0) {
                         start = offset - 1;
                         buffer[length++] = c;
-                        tokenType = "double";
+                        tokenType = DOUBLE_TOKEN_TYPE;
                     } else {
-                        if (tokenType == "single") {
+                      if (tokenType == SINGLE_TOKEN_TYPE) {
                             offset--;
                             bufferIndex--;
 
@@ -216,7 +228,7 @@
                             break;
                         } else {
                             buffer[length++] = c;
-                            tokenType = "double";
+                            tokenType = DOUBLE_TOKEN_TYPE;
 
                             if (length == 2) {
                                 offset--;
@@ -238,7 +250,16 @@
                 }
             }
         }
+      
+        if (length > 0) {
+            return reusableToken.reinit
+                (buffer, 0, length, start, start+length, TOKEN_TYPE_NAMES[tokenType]);
+        } else if (dataLen == -1) {
+          return null;
+        }
 
-        return reusableToken.reinit(buffer, 0, length, start, start+length, tokenType);
+        // Cycle back and try for the next token (don't
+        // return an empty string)
+      }
     }
 }

Added: lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java?rev=785287&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java Tue Jun 16 16:38:39 2009
@@ -0,0 +1,187 @@
+package org.apache.lucene.analysis.cjk;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import junit.framework.TestCase;
+import org.apache.lucene.analysis.Token;
+
+
+/**
+ * Tests that {@link CJKTokenizer} produces the expected terms, offsets and
+ * token types for CJK, Latin and mixed input.
+ */
+public class TestCJKTokenizer extends TestCase{
+
+  /**
+   * Builds an expected token for comparison against tokenizer output.
+   *
+   * @param termText expected term text
+   * @param start expected start offset
+   * @param end expected end offset
+   * @param type index into {@link CJKTokenizer#TOKEN_TYPE_NAMES}
+   * @return a new token carrying the given term, offsets and type name
+   */
+  public Token newToken(String termText, int start, int end, int type) {
+    Token token = new Token(start, end);
+    token.setTermBuffer(termText);
+    token.setType(CJKTokenizer.TOKEN_TYPE_NAMES[type]);
+    return token;
+  }
+
+  /** Renders a token as "term (start: s end: e type: t)" for failure messages. */
+  private static String describe(Token token) {
+    return token.term() + " (start: " + token.startOffset()
+        + " end: " + token.endOffset() + " type: " + token.type() + ")";
+  }
+
+  /**
+   * Tokenizes <code>str</code> and fails unless the tokenizer produces exactly
+   * the tokens in <code>out_tokens</code>, in order, with matching term text,
+   * offsets and type. Both missing and surplus tokens are reported as
+   * failures.
+   *
+   * @param str text to tokenize
+   * @param out_tokens expected tokens, in order
+   * @throws IOException if the tokenizer fails to read its input
+   */
+  public void checkCJKToken(final String str, final Token[] out_tokens) throws IOException {
+    CJKTokenizer tokenizer = new CJKTokenizer(new StringReader(str));
+    final Token reusableToken = new Token();
+    for (int i = 0; i < out_tokens.length; i++) {
+      final Token expected = out_tokens[i];
+      final Token token = tokenizer.next(reusableToken);
+      if (token == null) {
+        // The tokenizer ran dry early; a loop driven by its output would
+        // silently skip the remaining expectations.
+        fail("string[" + str + "]: missing token " + i + ": " + describe(expected));
+      }
+      final boolean matches = token.term().equals(expected.term())
+          && token.startOffset() == expected.startOffset()
+          && token.endOffset() == expected.endOffset()
+          && token.type().equals(expected.type());
+      if (!matches) {
+        fail(describe(token) + " != " + describe(expected));
+      }
+    }
+    final Token extra = tokenizer.next(reusableToken);
+    if (extra != null) {
+      fail("string[" + str + "]: unexpected extra token: " + describe(extra));
+    }
+  }
+
+  /** An unbroken run of ten CJK characters yields overlapping two-character tokens. */
+  public void testJa1() throws IOException {
+    String str = "\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341";
+
+    Token[] out_tokens = {
+      newToken("\u4e00\u4e8c", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u4e8c\u4e09", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u4e09\u56db", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u56db\u4e94", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u4e94\u516d", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u516d\u4e03", 5, 7, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u4e03\u516b", 6, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u516b\u4e5d", 7, 9, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u4e5d\u5341", 8,10, CJKTokenizer.DOUBLE_TOKEN_TYPE)
+    };
+    checkCJKToken(str, out_tokens);
+  }
+
+  /** Space-separated CJK runs: pairs do not span a space; a lone character is its own token. */
+  public void testJa2() throws IOException {
+    String str = "\u4e00 \u4e8c\u4e09\u56db \u4e94\u516d\u4e03\u516b\u4e5d \u5341";
+
+    Token[] out_tokens = {
+      newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u4e8c\u4e09", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u4e09\u56db", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u4e94\u516d", 6, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u516d\u4e03", 7, 9, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u4e03\u516b", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u516b\u4e5d", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u5341", 12,13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
+    };
+    checkCJKToken(str, out_tokens);
+  }
+
+  /** Space-separated ASCII words each become one token of the "single" type. */
+  public void testC() throws IOException {
+    String str = "abc defgh ijklmn opqrstu vwxy z";
+
+    Token[] out_tokens = {
+      newToken("abc", 0, 3, CJKTokenizer.SINGLE_TOKEN_TYPE),
+      newToken("defgh", 4, 9, CJKTokenizer.SINGLE_TOKEN_TYPE),
+      newToken("ijklmn", 10, 16, CJKTokenizer.SINGLE_TOKEN_TYPE),
+      newToken("opqrstu", 17, 24, CJKTokenizer.SINGLE_TOKEN_TYPE),
+      newToken("vwxy", 25, 29, CJKTokenizer.SINGLE_TOKEN_TYPE),
+      newToken("z", 30, 31, CJKTokenizer.SINGLE_TOKEN_TYPE),
+    };
+    checkCJKToken(str, out_tokens);
+  }
+
+  /** An ASCII word embedded between two CJK runs is emitted as one "single" token. */
+  public void testMix() throws IOException {
+    String str = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053";
+
+    Token[] out_tokens = {
+      newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("abc", 5, 8, CJKTokenizer.SINGLE_TOKEN_TYPE),
+      newToken("\u304b\u304d", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u304d\u304f", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u304f\u3051", 10,12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u3051\u3053", 11,13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
+    };
+    checkCJKToken(str, out_tokens);
+  }
+
+  /** A lone CJK character between ASCII letters, and one after a space, are single-character tokens. */
+  public void testMix2() throws IOException {
+    String str = "\u3042\u3044\u3046\u3048\u304aab\u3093c\u304b\u304d\u304f\u3051 \u3053";
+
+    Token[] out_tokens = {
+      newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("ab", 5, 7, CJKTokenizer.SINGLE_TOKEN_TYPE),
+      newToken("\u3093", 7, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("c", 8, 9, CJKTokenizer.SINGLE_TOKEN_TYPE),
+      newToken("\u304b\u304d", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u304d\u304f", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u304f\u3051", 11,13, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u3053", 14,15, CJKTokenizer.DOUBLE_TOKEN_TYPE)
+    };
+    checkCJKToken(str, out_tokens);
+  }
+
+  /** Input consisting of one CJK character yields exactly one single-character token. */
+  public void testSingleChar() throws IOException {
+    String str = "\u4e00";
+
+    Token[] out_tokens = {
+      newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+    };
+    checkCJKToken(str, out_tokens);
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java
------------------------------------------------------------------------------
    svn:eol-style = native