You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/10 20:06:25 UTC

svn commit: r1229689 - /lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java

Author: rmuir
Date: Tue Jan 10 19:06:25 2012
New Revision: 1229689

URL: http://svn.apache.org/viewvc?rev=1229689&view=rev
Log:
LUCENE-3305: add tests for supplementary characters

Modified:
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java?rev=1229689&r1=1229688&r2=1229689&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java Tue Jan 10 19:06:25 2012
@@ -17,6 +17,7 @@ package org.apache.lucene.analysis.kurom
  * limitations under the License.
  */
 
+import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
 
@@ -24,6 +25,8 @@ import org.apache.lucene.analysis.Analyz
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util._TestUtil;
 
 public class TestKuromojiTokenizer extends BaseTokenStreamTestCase {
@@ -107,6 +110,25 @@ public class TestKuromojiTokenizer exten
     }
   }
   
+  /** simple test for supplementary characters */
+  public void testSurrogates() throws IOException {
+    assertAnalyzesTo(analyzer, "𩬅艱鍟䇹愯瀛",
+      new String[] { "𩬅", "艱", "鍟", "䇹", "愯", "瀛" });
+  }
+  
+  /** random test ensuring we don't ever split supplementaries */
+  public void testSurrogates2() throws IOException {
+    for (int i = 0; i < 10000; i++) {
+      String s = _TestUtil.randomUnicodeString(random, 100);
+      TokenStream ts = analyzer.tokenStream("foo", new StringReader(s));
+      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
+      ts.reset();
+      while (ts.incrementToken()) {
+        assertTrue(UnicodeUtil.validUTF16String(termAtt));
+      }
+    }
+  }
+  
   // note: test is kinda silly since kuromoji emits punctuation tokens.
   // but, when/if we filter these out it will be useful.
   public void testEnd() throws Exception {