You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucy.apache.org by nw...@apache.org on 2011/12/06 01:42:12 UTC

[lucy-commits] svn commit: r1210723 - /incubator/lucy/branches/LUCY-196-uax-tokenizer/core/Lucy/Test/Analysis/TestStandardTokenizer.c

Author: nwellnhof
Date: Tue Dec  6 00:42:11 2011
New Revision: 1210723

URL: http://svn.apache.org/viewvc?rev=1210723&view=rev
Log:
Make sure to test 4-byte UTF-8 characters

Modified:
    incubator/lucy/branches/LUCY-196-uax-tokenizer/core/Lucy/Test/Analysis/TestStandardTokenizer.c

Modified: incubator/lucy/branches/LUCY-196-uax-tokenizer/core/Lucy/Test/Analysis/TestStandardTokenizer.c
URL: http://svn.apache.org/viewvc/incubator/lucy/branches/LUCY-196-uax-tokenizer/core/Lucy/Test/Analysis/TestStandardTokenizer.c?rev=1210723&r1=1210722&r2=1210723&view=diff
==============================================================================
--- incubator/lucy/branches/LUCY-196-uax-tokenizer/core/Lucy/Test/Analysis/TestStandardTokenizer.c (original)
+++ incubator/lucy/branches/LUCY-196-uax-tokenizer/core/Lucy/Test/Analysis/TestStandardTokenizer.c Tue Dec  6 00:42:11 2011
@@ -46,9 +46,10 @@ test_tokenizer(TestBatch *batch) {
         ":"
         "1,02\xC2\xADZ4.38"
         "\xE0\xB8\x81\xC2\xAD\xC2\xAD"
-        "\xE0\xB8\x82"
+        "\xF0\xA0\x80\x80"
+        "a"
         "/",
-        33);
+        35);
     VArray *got = StandardTokenizer_Split(tokenizer, (CharBuf*)word);
     CharBuf *token = (CharBuf*)VA_Fetch(got, 0);
     TEST_TRUE(batch,
@@ -72,7 +73,13 @@ test_tokenizer(TestBatch *batch) {
     TEST_TRUE(batch,
               token
               && CB_Is_A(token, CHARBUF)
-              && CB_Equals_Str(token, "\xE0\xB8\x82", 3),
+              && CB_Equals_Str(token, "\xF0\xA0\x80\x80", 4),
+              "Token: %s", CB_Get_Ptr8(token));
+    token = (CharBuf*)VA_Fetch(got, 4);
+    TEST_TRUE(batch,
+              token
+              && CB_Is_A(token, CHARBUF)
+              && CB_Equals_Str(token, "a", 1),
               "Token: %s", CB_Get_Ptr8(token));
     DECREF(got);
     DECREF(tokenizer);
@@ -80,7 +87,7 @@ test_tokenizer(TestBatch *batch) {
 
 void
 TestStandardTokenizer_run_tests() {
-    TestBatch *batch = TestBatch_new(5);
+    TestBatch *batch = TestBatch_new(6);
 
     TestBatch_Plan(batch);