You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/10 20:06:25 UTC
svn commit: r1229689 -
/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java
Author: rmuir
Date: Tue Jan 10 19:06:25 2012
New Revision: 1229689
URL: http://svn.apache.org/viewvc?rev=1229689&view=rev
Log:
LUCENE-3305: add tests for supplementary characters
Modified:
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java?rev=1229689&r1=1229688&r2=1229689&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java Tue Jan 10 19:06:25 2012
@@ -17,6 +17,7 @@ package org.apache.lucene.analysis.kurom
* limitations under the License.
*/
+import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
@@ -24,6 +25,8 @@ import org.apache.lucene.analysis.Analyz
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util._TestUtil;
public class TestKuromojiTokenizer extends BaseTokenStreamTestCase {
@@ -107,6 +110,25 @@ public class TestKuromojiTokenizer exten
}
}
+ /**
+  * Simple test for supplementary characters.
+  *
+  * <p>The input starts with U+29B05, a code point outside the Basic Multilingual Plane
+  * (encoded as the surrogate pair {@code \uD866\uDF05} in UTF-16), followed by five BMP
+  * ideographs. The expected output is one token per code point, i.e. the supplementary
+  * character must come out as a single two-{@code char} token and not be split.
+  *
+  * @throws IOException declared because the token stream assertions may perform I/O
+  */
+ public void testSurrogates() throws IOException {
+   assertAnalyzesTo(analyzer, "𩬅艱鍟䇹愯瀛",
+       new String[] { "𩬅", "艱", "鍟", "䇹", "愯", "瀛" });
+ }
+
+ /**
+  * Random test ensuring we don't ever split supplementaries.
+  *
+  * <p>Runs 10000 iterations; each one analyzes a random Unicode string produced by
+  * {@code _TestUtil.randomUnicodeString(random, 100)} and asserts that every emitted term
+  * passes {@link UnicodeUtil#validUTF16String(CharSequence)}.
+  *
+  * @throws IOException if the token stream fails while being consumed or closed
+  */
+ public void testSurrogates2() throws IOException {
+   for (int i = 0; i < 10000; i++) {
+     String s = _TestUtil.randomUnicodeString(random, 100);
+     TokenStream ts = analyzer.tokenStream("foo", new StringReader(s));
+     try {
+       CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
+       ts.reset();
+       while (ts.incrementToken()) {
+         // include the random input so a failure can be reproduced
+         assertTrue("term is not valid UTF-16 for input: " + s,
+             UnicodeUtil.validUTF16String(termAtt));
+       }
+       // finish the consumer workflow: end() after the last incrementToken() ...
+       ts.end();
+     } finally {
+       // ... and always release the stream, even when an assertion fails
+       ts.close();
+     }
+   }
+ }
+
// note: test is kinda silly since kuromoji emits punctuation tokens.
// but, when/if we filter these out it will be useful.
public void testEnd() throws Exception {