You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucy.apache.org by nw...@apache.org on 2016/08/06 14:42:20 UTC

[1/4] lucy git commit: Use CharBuf to unescape JSON strings

Repository: lucy
Updated Branches:
  refs/heads/master 0a41b29b1 -> 7071a277c


Use CharBuf to unescape JSON strings

Also use a stricter custom decoder for hex escapes. (strtol accepts
leading whitespace, a plus or minus sign, and an optional "0x" prefix.)


Project: http://git-wip-us.apache.org/repos/asf/lucy/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/3cc03972
Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/3cc03972
Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/3cc03972

Branch: refs/heads/master
Commit: 3cc039724fa1862aa71333911f2bfc5c84b50f3e
Parents: 0a41b29
Author: Nick Wellnhofer <we...@aevum.de>
Authored: Tue Aug 2 16:58:43 2016 +0200
Committer: Nick Wellnhofer <we...@aevum.de>
Committed: Tue Aug 2 17:07:30 2016 +0200

----------------------------------------------------------------------
 core/Lucy/Util/Json.c          | 94 +++++++++++++++++++++----------------
 test/Lucy/Test/Util/TestJson.c |  3 +-
 2 files changed, 56 insertions(+), 41 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucy/blob/3cc03972/core/Lucy/Util/Json.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Util/Json.c b/core/Lucy/Util/Json.c
index 6c7473d..f3e7a219 100644
--- a/core/Lucy/Util/Json.c
+++ b/core/Lucy/Util/Json.c
@@ -578,86 +578,100 @@ S_parse_string(const char **json_ptr, const char *limit) {
 
 static String*
 S_unescape_text(const char *top, const char *end) {
-    // The unescaped string will never be longer than the escaped string
-    // because only a \u escape can theoretically be too long and
-    // StrHelp_encode_utf8_char guards against sequences over 4 bytes.
-    // Therefore we can allocate once and not worry about reallocating.
     size_t cap = (size_t)(end - top) + 1;
-    char *target_buf = (char*)MALLOCATE(cap);
-    size_t target_size = 0;
-    for (const char *text = top; text < end; text++) {
+    CharBuf *cb = CB_new(cap);
+    const char *chunk = top;
+    const char *text  = top;
+
+    while (text < end) {
         if (*text != '\\') {
-            target_buf[target_size++] = *text;
+            text++;
         }
         else {
+            if (!StrHelp_utf8_valid(chunk, (size_t)(text - chunk))) {
+                DECREF(cb);
+                String *mess = MAKE_MESS("Bad UTF-8 in JSON");
+                Err_set_error(Err_new(mess));
+                return NULL;
+            }
+            CB_Cat_Trusted_Utf8(cb, chunk, (size_t)(text - chunk));
+
             // Process escape.
             text++;
             switch (*text) {
                 case '"':
-                    target_buf[target_size++] = '"';
-                    break;
                 case '\\':
-                    target_buf[target_size++] = '\\';
-                    break;
                 case '/':
-                    target_buf[target_size++] = '/';
+                    CB_Cat_Trusted_Utf8(cb, text, 1);
                     break;
                 case 'b':
-                    target_buf[target_size++] = '\b';
+                    CB_Cat_Trusted_Utf8(cb, "\b", 1);
                     break;
                 case 'f':
-                    target_buf[target_size++] = '\f';
+                    CB_Cat_Trusted_Utf8(cb, "\f", 1);
                     break;
                 case 'n':
-                    target_buf[target_size++] = '\n';
+                    CB_Cat_Trusted_Utf8(cb, "\n", 1);
                     break;
                 case 'r':
-                    target_buf[target_size++] = '\r';
+                    CB_Cat_Trusted_Utf8(cb, "\r", 1);
                     break;
                 case 't':
-                    target_buf[target_size++] = '\t';
+                    CB_Cat_Trusted_Utf8(cb, "\t", 1);
                     break;
                 case 'u': {
-                        // Copy into a temp buffer because strtol will overrun
-                        // into adjacent text data for e.g. "\uAAAA1".
-                        char temp[5] = { 0, 0, 0, 0, 0 };
-                        memcpy(temp, text + 1, 4);
-                        text += 4;
-                        char *num_end;
-                        long code_point = strtol(temp, &num_end, 16);
-                        char *temp_ptr = temp;
-                        if (num_end != temp_ptr + 4 || code_point < 0) {
-                            FREEMEM(target_buf);
-                            SET_ERROR("Invalid \\u escape", text - 5, end);
-                            return NULL;
+                        int32_t code_point = 0;
+                        for (int i = 1; i < 5; i++) {
+                            char c = text[i];
+                            int32_t digit = 0;
+                            if (c >= '0' && c <= '9') {
+                                digit = c - '0';
+                            }
+                            else if (c >= 'a' && c <= 'f') {
+                                digit = c - 'a' + 10;
+                            }
+                            else if (c >= 'A' && c <= 'F') {
+                                digit = c - 'A' + 10;
+                            }
+                            else {
+                                DECREF(cb);
+                                SET_ERROR("Invalid \\u escape", text - 1, end);
+                                return NULL;
+                            }
+                            code_point = code_point * 16 + digit;
                         }
                         if (code_point >= 0xD800 && code_point <= 0xDFFF) {
-                            FREEMEM(target_buf);
+                            DECREF(cb);
                             SET_ERROR("Surrogate pairs not supported",
-                                      text - 5, end);
+                                      text - 1, end);
                             return NULL;
                         }
-                        target_size += StrHelp_encode_utf8_char((int32_t)code_point,
-                                                                target_buf + target_size);
+                        CB_Cat_Char(cb, code_point);
+                        text += 4;
                     }
                     break;
                 default:
-                    FREEMEM(target_buf);
+                    DECREF(cb);
                     SET_ERROR("Illegal escape", text - 1, end);
                     return NULL;
             }
+
+            text++;
+            chunk = text;
         }
     }
 
-    // NULL-terminate, sanity check, then return the escaped string.
-    target_buf[target_size] = '\0';
-    if (!StrHelp_utf8_valid(target_buf, target_size)) {
-        FREEMEM(target_buf);
+    if (!StrHelp_utf8_valid(chunk, (size_t)(text - chunk))) {
+        DECREF(cb);
         String *mess = MAKE_MESS("Bad UTF-8 in JSON");
         Err_set_error(Err_new(mess));
         return NULL;
     }
-    return Str_new_steal_trusted_utf8(target_buf, target_size);
+    CB_Cat_Trusted_Utf8(cb, chunk, (size_t)(text - chunk));
+
+    String *retval = CB_Yield_String(cb);
+    DECREF(cb);
+    return retval;
 }
 
 static CFISH_INLINE bool

http://git-wip-us.apache.org/repos/asf/lucy/blob/3cc03972/test/Lucy/Test/Util/TestJson.c
----------------------------------------------------------------------
diff --git a/test/Lucy/Test/Util/TestJson.c b/test/Lucy/Test/Util/TestJson.c
index c67e292..78fa959 100644
--- a/test/Lucy/Test/Util/TestJson.c
+++ b/test/Lucy/Test/Util/TestJson.c
@@ -275,6 +275,7 @@ test_syntax_errors(TestBatchRunner *runner) {
     S_verify_bad_syntax(runner, "+1.0 ", "float with prepended plus");
     S_verify_bad_syntax(runner, "\"\\g\"", "invalid char escape");
     S_verify_bad_syntax(runner, "\"\\uAAAZ\"", "invalid \\u escape");
+    S_verify_bad_syntax(runner, "\"\\uAAA\"", "invalid \\u escape");
 }
 
 static void
@@ -342,7 +343,7 @@ void
 TestJson_Run_IMP(TestJson *self, TestBatchRunner *runner) {
     uint32_t num_tests = 105;
 #ifndef LUCY_VALGRIND
-    num_tests += 28; // FIXME: syntax errors leak memory.
+    num_tests += 30; // FIXME: syntax errors leak memory.
 #endif
     TestBatchRunner_Plan(runner, (TestBatch*)self, num_tests);
 


[2/4] lucy git commit: Move some functions from StrHelp to Str

Posted by nw...@apache.org.
Move some functions from StrHelp to Str


Project: http://git-wip-us.apache.org/repos/asf/lucy/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/f257a45e
Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/f257a45e
Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/f257a45e

Branch: refs/heads/master
Commit: f257a45e0c73cc5875b8f6e3a66ba4eff2e84ae5
Parents: 3cc0397
Author: Nick Wellnhofer <we...@aevum.de>
Authored: Tue Aug 2 18:54:24 2016 +0200
Committer: Nick Wellnhofer <we...@aevum.de>
Committed: Tue Aug 2 18:54:24 2016 +0200

----------------------------------------------------------------------
 core/Lucy/Highlight/Highlighter.c         | 10 +++++-----
 core/Lucy/Plan/TextType.c                 |  4 ++--
 core/Lucy/Search/QueryParser/QueryLexer.c |  4 ++--
 core/Lucy/Util/Freezer.c                  |  2 +-
 core/Lucy/Util/Json.c                     |  6 +++---
 5 files changed, 13 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucy/blob/f257a45e/core/Lucy/Highlight/Highlighter.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Highlight/Highlighter.c b/core/Lucy/Highlight/Highlighter.c
index 6db17dd..62e2828 100644
--- a/core/Lucy/Highlight/Highlighter.c
+++ b/core/Lucy/Highlight/Highlighter.c
@@ -235,7 +235,7 @@ S_find_starting_boundary(StringIterator *top, uint32_t max_skip,
             return true;
         }
 
-        if (StrHelp_is_whitespace(code_point)) {
+        if (Str_is_whitespace(code_point)) {
             if (word == NULL) { word = StrIter_Clone(top); }
         }
         else {
@@ -260,7 +260,7 @@ S_find_starting_boundary(StringIterator *top, uint32_t max_skip,
             break;
         }
 
-        if (word == NULL && StrHelp_is_whitespace(code_point)) {
+        if (word == NULL && Str_is_whitespace(code_point)) {
             word = StrIter_Clone(iter);
             word_offset = i + 1;
         }
@@ -304,7 +304,7 @@ S_find_ending_boundary(StringIterator *tail, uint32_t max_skip,
             DECREF(iter);
             return true;
         }
-    } while (StrHelp_is_whitespace(code_point));
+    } while (Str_is_whitespace(code_point));
 
     // Keep track of the first word boundary.
     StringIterator *word = NULL;
@@ -325,7 +325,7 @@ S_find_ending_boundary(StringIterator *tail, uint32_t max_skip,
             return true;
         }
 
-        if (StrHelp_is_whitespace(code_point)) {
+        if (Str_is_whitespace(code_point)) {
             if (word == NULL) {
                 word = StrIter_Clone(iter);
                 word_offset = i + 1;
@@ -348,7 +348,7 @@ S_find_ending_boundary(StringIterator *tail, uint32_t max_skip,
 
         // Strip whitespace and punctuation that collides with an ellipsis.
         while (STR_OOB != (code_point = StrIter_Prev(tail))) {
-            if (!StrHelp_is_whitespace(code_point)
+            if (!Str_is_whitespace(code_point)
                 && code_point != '.'
                 && code_point != ','
                 && code_point != ';'

http://git-wip-us.apache.org/repos/asf/lucy/blob/f257a45e/core/Lucy/Plan/TextType.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Plan/TextType.c b/core/Lucy/Plan/TextType.c
index 5a3e323..a0e2759 100644
--- a/core/Lucy/Plan/TextType.c
+++ b/core/Lucy/Plan/TextType.c
@@ -160,7 +160,7 @@ TextTermStepper_Read_Key_Frame_IMP(TextTermStepper *self,
     // Set the value text.
     InStream_Read_Bytes(instream, ptr, text_len);
     BB_Set_Size(ivars->bytebuf, text_len);
-    if (!StrHelp_utf8_valid(ptr, text_len)) {
+    if (!Str_utf8_valid(ptr, text_len)) {
         THROW(ERR, "Invalid UTF-8 sequence in '%o' at byte %i64",
               InStream_Get_Filename(instream),
               InStream_Tell(instream) - text_len);
@@ -193,7 +193,7 @@ TextTermStepper_Read_Delta_IMP(TextTermStepper *self, InStream *instream) {
     // Set the value text.
     InStream_Read_Bytes(instream, ptr + text_overlap, finish_chars_len);
     BB_Set_Size(ivars->bytebuf, total_text_len);
-    if (!StrHelp_utf8_valid(ptr, total_text_len)) {
+    if (!Str_utf8_valid(ptr, total_text_len)) {
         THROW(ERR, "Invalid UTF-8 sequence in '%o' at byte %i64",
               InStream_Get_Filename(instream),
               InStream_Tell(instream) - finish_chars_len);

http://git-wip-us.apache.org/repos/asf/lucy/blob/f257a45e/core/Lucy/Search/QueryParser/QueryLexer.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Search/QueryParser/QueryLexer.c b/core/Lucy/Search/QueryParser/QueryLexer.c
index 87200b8..48a57d9 100644
--- a/core/Lucy/Search/QueryParser/QueryLexer.c
+++ b/core/Lucy/Search/QueryParser/QueryLexer.c
@@ -173,7 +173,7 @@ S_consume_keyword(StringIterator *iter, const char *keyword,
         DECREF(temp);
         return NULL;
     }
-    if (StrHelp_is_whitespace(lookahead)
+    if (Str_is_whitespace(lookahead)
         || lookahead == '"'
         || lookahead == '('
         || lookahead == ')'
@@ -257,7 +257,7 @@ S_consume_text(StringIterator *iter) {
         else if (code_point == STR_OOB) {
             break;
         }
-        else if (StrHelp_is_whitespace(code_point)
+        else if (Str_is_whitespace(code_point)
             || code_point == '"'
             || code_point == '('
             || code_point == ')'

http://git-wip-us.apache.org/repos/asf/lucy/blob/f257a45e/core/Lucy/Util/Freezer.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Util/Freezer.c b/core/Lucy/Util/Freezer.c
index 26039d2..f8034f2 100644
--- a/core/Lucy/Util/Freezer.c
+++ b/core/Lucy/Util/Freezer.c
@@ -196,7 +196,7 @@ Freezer_deserialize_string(String *string, InStream *instream) {
     char *buf = (char*)MALLOCATE(size + 1);
     InStream_Read_Bytes(instream, buf, size);
     buf[size] = '\0';
-    if (!StrHelp_utf8_valid(buf, size)) {
+    if (!Str_utf8_valid(buf, size)) {
         THROW(ERR, "Attempt to deserialize invalid UTF-8");
     }
     return Str_init_steal_trusted_utf8(string, buf, size);

http://git-wip-us.apache.org/repos/asf/lucy/blob/f257a45e/core/Lucy/Util/Json.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Util/Json.c b/core/Lucy/Util/Json.c
index f3e7a219..a601f98 100644
--- a/core/Lucy/Util/Json.c
+++ b/core/Lucy/Util/Json.c
@@ -567,7 +567,7 @@ S_parse_string(const char **json_ptr, const char *limit) {
     else {
         // Optimize common case where there are no escapes.
         size_t len = (size_t)(end - top);
-        if (!StrHelp_utf8_valid(top, len)) {
+        if (!Str_utf8_valid(top, len)) {
             String *mess = MAKE_MESS("Bad UTF-8 in JSON");
             Err_set_error(Err_new(mess));
             return NULL;
@@ -588,7 +588,7 @@ S_unescape_text(const char *top, const char *end) {
             text++;
         }
         else {
-            if (!StrHelp_utf8_valid(chunk, (size_t)(text - chunk))) {
+            if (!Str_utf8_valid(chunk, (size_t)(text - chunk))) {
                 DECREF(cb);
                 String *mess = MAKE_MESS("Bad UTF-8 in JSON");
                 Err_set_error(Err_new(mess));
@@ -661,7 +661,7 @@ S_unescape_text(const char *top, const char *end) {
         }
     }
 
-    if (!StrHelp_utf8_valid(chunk, (size_t)(text - chunk))) {
+    if (!Str_utf8_valid(chunk, (size_t)(text - chunk))) {
         DECREF(cb);
         String *mess = MAKE_MESS("Bad UTF-8 in JSON");
         Err_set_error(Err_new(mess));


[4/4] lucy git commit: Merge branch 'string-helpers'

Posted by nw...@apache.org.
Merge branch 'string-helpers'

See CLOWNFISH-76.


Project: http://git-wip-us.apache.org/repos/asf/lucy/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/7071a277
Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/7071a277
Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/7071a277

Branch: refs/heads/master
Commit: 7071a277cfc2be26fb5eb4e46b5fecbc715eecaf
Parents: 0a41b29 aa48a9e
Author: Nick Wellnhofer <we...@aevum.de>
Authored: Sat Aug 6 16:41:19 2016 +0200
Committer: Nick Wellnhofer <we...@aevum.de>
Committed: Sat Aug 6 16:41:19 2016 +0200

----------------------------------------------------------------------
 c/src/Lucy/Analysis/RegexTokenizer.c      |   2 +-
 core/Lucy/Analysis/StandardTokenizer.c    |   1 +
 core/Lucy/Highlight/Highlighter.c         |  10 +--
 core/Lucy/Index/HighlightWriter.c         |   1 +
 core/Lucy/Index/IndexManager.c            |   2 +-
 core/Lucy/Index/Indexer.c                 |   1 +
 core/Lucy/Index/PolyReader.c              |   1 -
 core/Lucy/Index/Posting/RawPosting.c      |   1 -
 core/Lucy/Index/Segment.c                 |   2 +-
 core/Lucy/Index/Snapshot.c                |   2 +-
 core/Lucy/Index/TermInfo.c                |   1 -
 core/Lucy/Index/TermStepper.c             |   1 -
 core/Lucy/Plan/TextType.c                 |   6 +-
 core/Lucy/Search/QueryParser/QueryLexer.c |   4 +-
 core/Lucy/Store/CompoundFileReader.c      |   1 -
 core/Lucy/Util/Freezer.c                  |   2 +-
 core/Lucy/Util/IndexFileNames.c           |   1 -
 core/Lucy/Util/Json.c                     |  99 ++++++++++++++----------
 core/Lucy/Util/StringHelper.c             |  83 ++++++++++++++++++++
 core/Lucy/Util/StringHelper.cfh           |  59 ++++++++++++++
 core/Lucy/Util/ToolSet.h                  |   1 -
 go/cfext/lucy.c                           |   1 -
 go/lucy/lucy.go                           |   4 +-
 perl/buildlib/Lucy/Build/Binding/Util.pm  |  94 ++++++++++++++++++++++
 perl/lib/Lucy.pm                          |  17 ++++
 perl/lib/Lucy/Util/StringHelper.pm        |  25 ++++++
 perl/lib/LucyX/Index/ZlibDocReader.pm     |   2 +-
 perl/lib/LucyX/Index/ZlibDocWriter.pm     |   2 +-
 perl/t/105-folder.t                       |   2 +-
 perl/t/601-queryparser.t                  |   2 +-
 perl/t/binding/101-simple_io.t            |   2 +-
 perl/t/core/032-string_helper.t           |  25 ++++++
 perl/xs/Lucy/Analysis/RegexTokenizer.c    |   6 +-
 perl/xs/Lucy/Index/Inverter.c             |   1 -
 test/Lucy/Test.c                          |   2 +
 test/Lucy/Test/Util/TestJson.c            |   3 +-
 test/Lucy/Test/Util/TestStringHelper.c    | 103 +++++++++++++++++++++++++
 test/Lucy/Test/Util/TestStringHelper.cfh  |  29 +++++++
 38 files changed, 524 insertions(+), 77 deletions(-)
----------------------------------------------------------------------



[3/4] lucy git commit: Move StringHelper from Clownfish to Lucy

Posted by nw...@apache.org.
Move StringHelper from Clownfish to Lucy


Project: http://git-wip-us.apache.org/repos/asf/lucy/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/aa48a9e6
Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/aa48a9e6
Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/aa48a9e6

Branch: refs/heads/master
Commit: aa48a9e68c2dd51a7c4391cab1bcd5f77bb39d8a
Parents: f257a45
Author: Nick Wellnhofer <we...@aevum.de>
Authored: Tue Aug 2 19:33:01 2016 +0200
Committer: Nick Wellnhofer <we...@aevum.de>
Committed: Tue Aug 2 19:57:01 2016 +0200

----------------------------------------------------------------------
 c/src/Lucy/Analysis/RegexTokenizer.c     |   2 +-
 core/Lucy/Analysis/StandardTokenizer.c   |   1 +
 core/Lucy/Index/HighlightWriter.c        |   1 +
 core/Lucy/Index/IndexManager.c           |   2 +-
 core/Lucy/Index/Indexer.c                |   1 +
 core/Lucy/Index/PolyReader.c             |   1 -
 core/Lucy/Index/Posting/RawPosting.c     |   1 -
 core/Lucy/Index/Segment.c                |   2 +-
 core/Lucy/Index/Snapshot.c               |   2 +-
 core/Lucy/Index/TermInfo.c               |   1 -
 core/Lucy/Index/TermStepper.c            |   1 -
 core/Lucy/Plan/TextType.c                |   2 +-
 core/Lucy/Store/CompoundFileReader.c     |   1 -
 core/Lucy/Util/IndexFileNames.c          |   1 -
 core/Lucy/Util/Json.c                    |   3 +-
 core/Lucy/Util/StringHelper.c            |  83 +++++++++++++++++++++
 core/Lucy/Util/StringHelper.cfh          |  59 +++++++++++++++
 core/Lucy/Util/ToolSet.h                 |   1 -
 go/cfext/lucy.c                          |   1 -
 go/lucy/lucy.go                          |   4 +-
 perl/buildlib/Lucy/Build/Binding/Util.pm |  94 +++++++++++++++++++++++
 perl/lib/Lucy.pm                         |  17 +++++
 perl/lib/Lucy/Util/StringHelper.pm       |  25 +++++++
 perl/lib/LucyX/Index/ZlibDocReader.pm    |   2 +-
 perl/lib/LucyX/Index/ZlibDocWriter.pm    |   2 +-
 perl/t/105-folder.t                      |   2 +-
 perl/t/601-queryparser.t                 |   2 +-
 perl/t/binding/101-simple_io.t           |   2 +-
 perl/t/core/032-string_helper.t          |  25 +++++++
 perl/xs/Lucy/Analysis/RegexTokenizer.c   |   6 +-
 perl/xs/Lucy/Index/Inverter.c            |   1 -
 test/Lucy/Test.c                         |   2 +
 test/Lucy/Test/Util/TestStringHelper.c   | 103 ++++++++++++++++++++++++++
 test/Lucy/Test/Util/TestStringHelper.cfh |  29 ++++++++
 34 files changed, 457 insertions(+), 25 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/c/src/Lucy/Analysis/RegexTokenizer.c
----------------------------------------------------------------------
diff --git a/c/src/Lucy/Analysis/RegexTokenizer.c b/c/src/Lucy/Analysis/RegexTokenizer.c
index d47b3ea..9d534ff 100644
--- a/c/src/Lucy/Analysis/RegexTokenizer.c
+++ b/c/src/Lucy/Analysis/RegexTokenizer.c
@@ -26,9 +26,9 @@
 #include "Clownfish/String.h"
 #include "Clownfish/Err.h"
 #include "Clownfish/Util/Memory.h"
-#include "Clownfish/Util/StringHelper.h"
 #include "Lucy/Analysis/Token.h"
 #include "Lucy/Analysis/Inversion.h"
+#include "Lucy/Util/StringHelper.h"
 
 #if defined(CHY_HAS_PCRE_H)
 

http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Analysis/StandardTokenizer.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Analysis/StandardTokenizer.c b/core/Lucy/Analysis/StandardTokenizer.c
index 012d428..b44e481 100644
--- a/core/Lucy/Analysis/StandardTokenizer.c
+++ b/core/Lucy/Analysis/StandardTokenizer.c
@@ -21,6 +21,7 @@
 #include "Lucy/Analysis/StandardTokenizer.h"
 #include "Lucy/Analysis/Token.h"
 #include "Lucy/Analysis/Inversion.h"
+#include "Lucy/Util/StringHelper.h"
 
 /*
  * We use a modified version of the Word_Break property defined in UAX #29.

http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Index/HighlightWriter.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Index/HighlightWriter.c b/core/Lucy/Index/HighlightWriter.c
index 64b1a4c..b38a251 100644
--- a/core/Lucy/Index/HighlightWriter.c
+++ b/core/Lucy/Index/HighlightWriter.c
@@ -38,6 +38,7 @@
 #include "Lucy/Store/InStream.h"
 #include "Lucy/Util/Freezer.h"
 #include "Lucy/Util/NumberUtils.h"
+#include "Lucy/Util/StringHelper.h"
 
 static OutStream*
 S_lazy_init(HighlightWriter *self);

http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Index/IndexManager.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Index/IndexManager.c b/core/Lucy/Index/IndexManager.c
index 8bc80fe..348717a 100644
--- a/core/Lucy/Index/IndexManager.c
+++ b/core/Lucy/Index/IndexManager.c
@@ -29,7 +29,7 @@
 #include "Lucy/Store/LockFactory.h"
 #include "Lucy/Util/IndexFileNames.h"
 #include "Lucy/Util/Json.h"
-#include "Clownfish/Util/StringHelper.h"
+#include "Lucy/Util/StringHelper.h"
 
 #include <stdlib.h>
 

http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Index/Indexer.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Index/Indexer.c b/core/Lucy/Index/Indexer.c
index 977cf38..16b915b 100644
--- a/core/Lucy/Index/Indexer.c
+++ b/core/Lucy/Index/Indexer.c
@@ -42,6 +42,7 @@
 #include "Lucy/Util/Freezer.h"
 #include "Lucy/Util/IndexFileNames.h"
 #include "Lucy/Util/Json.h"
+#include "Lucy/Util/StringHelper.h"
 
 int32_t Indexer_CREATE   = 0x00000001;
 int32_t Indexer_TRUNCATE = 0x00000002;

http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Index/PolyReader.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Index/PolyReader.c b/core/Lucy/Index/PolyReader.c
index 63d23b5..e843bcd 100644
--- a/core/Lucy/Index/PolyReader.c
+++ b/core/Lucy/Index/PolyReader.c
@@ -33,7 +33,6 @@
 #include "Lucy/Util/Json.h"
 #include "Lucy/Util/Freezer.h"
 #include "Lucy/Util/IndexFileNames.h"
-#include "Clownfish/Util/StringHelper.h"
 
 // Obtain/release read locks and commit locks.
 static bool

http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Index/Posting/RawPosting.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Index/Posting/RawPosting.c b/core/Lucy/Index/Posting/RawPosting.c
index db2e5b7..a4b9e5d 100644
--- a/core/Lucy/Index/Posting/RawPosting.c
+++ b/core/Lucy/Index/Posting/RawPosting.c
@@ -28,7 +28,6 @@
 #include "Lucy/Index/TermInfo.h"
 #include "Lucy/Plan/Schema.h"
 #include "Lucy/Store/OutStream.h"
-#include "Clownfish/Util/StringHelper.h"
 
 RawPosting*
 RawPost_new(void *pre_allocated_memory, int32_t doc_id, uint32_t freq,

http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Index/Segment.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Index/Segment.c b/core/Lucy/Index/Segment.c
index 8507d4d..a3312a8 100644
--- a/core/Lucy/Index/Segment.c
+++ b/core/Lucy/Index/Segment.c
@@ -23,7 +23,7 @@
 #include "Clownfish/Num.h"
 #include "Lucy/Store/Folder.h"
 #include "Lucy/Util/Json.h"
-#include "Clownfish/Util/StringHelper.h"
+#include "Lucy/Util/StringHelper.h"
 #include "Lucy/Util/IndexFileNames.h"
 
 Segment*

http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Index/Snapshot.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Index/Snapshot.c b/core/Lucy/Index/Snapshot.c
index c1005b2..fe2a89e 100644
--- a/core/Lucy/Index/Snapshot.c
+++ b/core/Lucy/Index/Snapshot.c
@@ -21,7 +21,7 @@
 #include "Clownfish/Boolean.h"
 #include "Lucy/Index/Segment.h"
 #include "Lucy/Store/Folder.h"
-#include "Clownfish/Util/StringHelper.h"
+#include "Lucy/Util/StringHelper.h"
 #include "Lucy/Util/IndexFileNames.h"
 #include "Lucy/Util/Json.h"
 

http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Index/TermInfo.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Index/TermInfo.c b/core/Lucy/Index/TermInfo.c
index 697d604..34c1689 100644
--- a/core/Lucy/Index/TermInfo.c
+++ b/core/Lucy/Index/TermInfo.c
@@ -18,7 +18,6 @@
 #include "Lucy/Util/ToolSet.h"
 
 #include "Lucy/Index/TermInfo.h"
-#include "Clownfish/Util/StringHelper.h"
 
 TermInfo*
 TInfo_new(int32_t doc_freq) {

http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Index/TermStepper.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Index/TermStepper.c b/core/Lucy/Index/TermStepper.c
index 26cb876..6691184 100644
--- a/core/Lucy/Index/TermStepper.c
+++ b/core/Lucy/Index/TermStepper.c
@@ -21,7 +21,6 @@
 #include "Lucy/Plan/Schema.h"
 #include "Lucy/Store/InStream.h"
 #include "Lucy/Store/OutStream.h"
-#include "Clownfish/Util/StringHelper.h"
 
 TermStepper*
 TermStepper_init(TermStepper *self) {

http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Plan/TextType.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Plan/TextType.c b/core/Lucy/Plan/TextType.c
index a0e2759..12fb5ca 100644
--- a/core/Lucy/Plan/TextType.c
+++ b/core/Lucy/Plan/TextType.c
@@ -21,8 +21,8 @@
 #include "Lucy/Plan/TextType.h"
 #include "Lucy/Store/InStream.h"
 #include "Lucy/Store/OutStream.h"
+#include "Lucy/Util/StringHelper.h"
 #include "Clownfish/ByteBuf.h"
-#include "Clownfish/Util/StringHelper.h"
 
 TermStepper*
 TextType_Make_Term_Stepper_IMP(TextType *self) {

http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Store/CompoundFileReader.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Store/CompoundFileReader.c b/core/Lucy/Store/CompoundFileReader.c
index 06b79a3..ad25272 100644
--- a/core/Lucy/Store/CompoundFileReader.c
+++ b/core/Lucy/Store/CompoundFileReader.c
@@ -26,7 +26,6 @@
 #include "Lucy/Store/InStream.h"
 #include "Lucy/Util/IndexFileNames.h"
 #include "Lucy/Util/Json.h"
-#include "Clownfish/Util/StringHelper.h"
 
 CompoundFileReader*
 CFReader_open(Folder *folder) {

http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Util/IndexFileNames.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Util/IndexFileNames.c b/core/Lucy/Util/IndexFileNames.c
index 550995e..a1a063b 100644
--- a/core/Lucy/Util/IndexFileNames.c
+++ b/core/Lucy/Util/IndexFileNames.c
@@ -20,7 +20,6 @@
 #include "Lucy/Util/IndexFileNames.h"
 #include "Lucy/Store/DirHandle.h"
 #include "Lucy/Store/Folder.h"
-#include "Clownfish/Util/StringHelper.h"
 
 String*
 IxFileNames_latest_snapshot(Folder *folder) {

http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Util/Json.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Util/Json.c b/core/Lucy/Util/Json.c
index a601f98..46a0b89 100644
--- a/core/Lucy/Util/Json.c
+++ b/core/Lucy/Util/Json.c
@@ -24,11 +24,12 @@
 #include "Clownfish/Boolean.h"
 #include "Clownfish/CharBuf.h"
 #include "Clownfish/Num.h"
+#include "Clownfish/Util/Memory.h"
 #include "Lucy/Store/Folder.h"
 #include "Lucy/Store/InStream.h"
 #include "Lucy/Store/OutStream.h"
-#include "Clownfish/Util/Memory.h"
 #include "Lucy/Util/Json/JsonParser.h"
+#include "Lucy/Util/StringHelper.h"
 
 /* Routines generated by Lemon. */
 void*

http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Util/StringHelper.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Util/StringHelper.c b/core/Lucy/Util/StringHelper.c
new file mode 100644
index 0000000..2331901
--- /dev/null
+++ b/core/Lucy/Util/StringHelper.c
@@ -0,0 +1,83 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <string.h>
+
+#define C_LUCY_STRINGHELPER
+#define LUCY_USE_SHORT_NAMES
+
+#include "Lucy/Util/StringHelper.h"
+
+const uint8_t lucy_StrHelp_UTF8_COUNT[] = {
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+size_t
+StrHelp_overlap(const char *a, const char *b, size_t a_len,  size_t b_len) {
+    size_t i;
+    const size_t len = a_len <= b_len ? a_len : b_len;
+
+    for (i = 0; i < len; i++) {
+        if (*a++ != *b++) { break; }
+    }
+    return i;
+}
+
+static const char base36_chars[] = "0123456789abcdefghijklmnopqrstuvwxyz";
+
+size_t
+StrHelp_to_base36(uint64_t num, void *buffer) {
+    char  my_buf[StrHelp_MAX_BASE36_BYTES];
+    char *buf = my_buf + StrHelp_MAX_BASE36_BYTES - 1;
+    char *end = buf;
+
+    // Null terminate.
+    *buf = '\0';
+
+    // Convert to base 36 characters.
+    do {
+        *(--buf) = base36_chars[num % 36];
+        num /= 36;
+    } while (num > 0);
+
+    size_t size = (size_t)(end - buf);
+    memcpy(buffer, buf, size + 1);
+    return size;
+}
+
+const char*
+StrHelp_back_utf8_char(const char *ptr, const char *start) {
+    while (ptr > start) {  // Test first: a pointer below `start` is UB.
+        if ((*--ptr & 0xC0) != 0x80) { return ptr; }
+    }
+    return NULL;
+}
+

http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Util/StringHelper.cfh
----------------------------------------------------------------------
diff --git a/core/Lucy/Util/StringHelper.cfh b/core/Lucy/Util/StringHelper.cfh
new file mode 100644
index 0000000..f78f0a8
--- /dev/null
+++ b/core/Lucy/Util/StringHelper.cfh
@@ -0,0 +1,59 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+parcel Lucy;
+
+inert class Lucy::Util::StringHelper nickname StrHelp {
+
+    /* A table mapping the leading byte of a UTF-8 sequence to the sequence's
+     * length in bytes; continuation bytes (0x80-0xBF) and 0xF5-0xFF map to 0.
+     */
+    inert const uint8_t[] UTF8_COUNT;
+
+    /** Return the number of bytes that two strings have in common.
+     */
+    inert size_t
+    overlap(const char *a, const char *b, size_t a_len,  size_t b_len);
+
+    /** Encode a NUL-terminated string representation of a value in base 36
+     * into `buffer`.
+     *
+     * @param value The number to be encoded.
+     * @param buffer A buffer at least MAX_BASE36_BYTES bytes long.
+     * @return the number of digits encoded (not including the terminating
+     * NUL).
+     */
+    inert size_t
+    to_base36(uint64_t value, void *buffer);
+
+    /** Return a pointer to the nearest non-continuation byte before `utf8`.
+     * If no such byte exists at or after `start`, return NULL.
+     */
+    inert nullable const char*
+    back_utf8_char(const char *utf8, const char *start);
+}
+
+__C__
+/** The maximum number of bytes encoded by to_base36(), including the
+ * terminating NUL.
+ */
+#define lucy_StrHelp_MAX_BASE36_BYTES 14
+#ifdef LUCY_USE_SHORT_NAMES
+  #define StrHelp_MAX_BASE36_BYTES lucy_StrHelp_MAX_BASE36_BYTES
+#endif
+__END_C__
+
+

http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Util/ToolSet.h
----------------------------------------------------------------------
diff --git a/core/Lucy/Util/ToolSet.h b/core/Lucy/Util/ToolSet.h
index 2d7a7a9..5fbc2be 100644
--- a/core/Lucy/Util/ToolSet.h
+++ b/core/Lucy/Util/ToolSet.h
@@ -45,7 +45,6 @@ extern "C" {
 #include "Clownfish/Vector.h"
 #include "Clownfish/Class.h"
 #include "Clownfish/Util/Memory.h"
-#include "Clownfish/Util/StringHelper.h"
 
 #ifdef __cplusplus
 }

http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/go/cfext/lucy.c
----------------------------------------------------------------------
diff --git a/go/cfext/lucy.c b/go/cfext/lucy.c
index fc2df3b..85778fb 100644
--- a/go/cfext/lucy.c
+++ b/go/cfext/lucy.c
@@ -44,7 +44,6 @@
 #include "Clownfish/Vector.h"
 #include "Clownfish/Class.h"
 #include "Clownfish/Util/Memory.h"
-#include "Clownfish/Util/StringHelper.h"
 #include "Lucy/Analysis/Token.h"
 #include "Lucy/Analysis/Inversion.h"
 #include "Lucy/Document/HitDoc.h"

http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/go/lucy/lucy.go
----------------------------------------------------------------------
diff --git a/go/lucy/lucy.go b/go/lucy/lucy.go
index 4a4fc71..3c7f06c 100644
--- a/go/lucy/lucy.go
+++ b/go/lucy/lucy.go
@@ -41,7 +41,6 @@ package lucy
 #include "Clownfish/HashIterator.h"
 #include "Clownfish/Vector.h"
 #include "Clownfish/Err.h"
-#include "Clownfish/Util/StringHelper.h"
 #include "Lucy/Analysis/Analyzer.h"
 #include "Lucy/Analysis/Inversion.h"
 #include "Lucy/Analysis/Token.h"
@@ -55,6 +54,7 @@ package lucy
 #include "Lucy/Store/OutStream.h"
 #include "Lucy/Object/I32Array.h"
 #include "Lucy/Util/Freezer.h"
+#include "Lucy/Util/StringHelper.h"
 
 extern lucy_RegexTokenizer*
 GOLUCY_RegexTokenizer_init(lucy_RegexTokenizer *self, cfish_String *pattern);
@@ -154,7 +154,7 @@ S_count_code_points(const char *string, size_t len) {
     size_t i = 0;
 
     while (i < len) {
-        i += cfish_StrHelp_UTF8_COUNT[(uint8_t)(string[i])];
+        i += lucy_StrHelp_UTF8_COUNT[(uint8_t)(string[i])];
         ++num_code_points;
     }
 

http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/perl/buildlib/Lucy/Build/Binding/Util.pm
----------------------------------------------------------------------
diff --git a/perl/buildlib/Lucy/Build/Binding/Util.pm b/perl/buildlib/Lucy/Build/Binding/Util.pm
index 2b09173..10efaa6 100644
--- a/perl/buildlib/Lucy/Build/Binding/Util.pm
+++ b/perl/buildlib/Lucy/Build/Binding/Util.pm
@@ -25,6 +25,7 @@ sub bind_all {
     $class->bind_freezer;
     $class->bind_indexfilenames;
     $class->bind_sortexternal;
+    $class->bind_stringhelper;
 }
 
 sub bind_debug {
@@ -180,4 +181,97 @@ sub bind_sortexternal {
     Clownfish::CFC::Binding::Perl::Class->register($binding);
 }
 
+sub bind_stringhelper {
+    my $xs_code = <<'END_XS_CODE';
+MODULE = Lucy   PACKAGE = Lucy::Util::StringHelper
+
+=for comment
+
+Turn an SV's UTF8 flag on.  Equivalent to Encode::_utf8_on, but we don't have
+to load Encode.
+
+=cut
+
+void
+utf8_flag_on(sv)
+    SV *sv;
+PPCODE:
+    SvUTF8_on(sv);
+
+=for comment
+
+Turn an SV's UTF8 flag off.
+
+=cut
+
+void
+utf8_flag_off(sv)
+    SV *sv;
+PPCODE:
+    SvUTF8_off(sv);
+
+SV*
+to_base36(num)
+    uint64_t num;
+CODE:
+{
+    char base36[lucy_StrHelp_MAX_BASE36_BYTES];
+    size_t size = lucy_StrHelp_to_base36(num, &base36);
+    RETVAL = newSVpvn(base36, size);
+}
+OUTPUT: RETVAL
+
+=for comment
+
+Upgrade an SV to UTF-8, converting Latin1 if necessary. Equivalent to
+utf8::upgrade().
+
+=cut
+
+void
+utf8ify(sv)
+    SV *sv;
+PPCODE:
+    sv_utf8_upgrade(sv);
+
+bool
+utf8_valid(sv)
+    SV *sv;
+CODE:
+{
+    STRLEN len;
+    char *ptr = SvPV(sv, len);
+    RETVAL = cfish_Str_utf8_valid(ptr, len);
+}
+OUTPUT: RETVAL
+
+=for comment
+
+Concatenate one scalar onto the end of the other, ignoring UTF-8 status of the
+second scalar.  This is necessary because $not_utf8 . $utf8 results in a
+scalar which has been infected by the UTF-8 flag of the second argument.
+
+=cut
+
+void
+cat_bytes(sv, catted)
+    SV *sv;
+    SV *catted;
+PPCODE:
+{
+    STRLEN len;
+    char *ptr = SvPV(catted, len);
+    if (SvUTF8(sv)) { CFISH_THROW(CFISH_ERR, "Can't cat_bytes onto a UTF-8 SV"); }
+    sv_catpvn(sv, ptr, len);
+}
+END_XS_CODE
+
+    my $binding = Clownfish::CFC::Binding::Perl::Class->new(
+        class_name => "Lucy::Util::StringHelper",
+    );
+    $binding->append_xs($xs_code);
+
+    Clownfish::CFC::Binding::Perl::Class->register($binding);
+}
+
 1;

http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/perl/lib/Lucy.pm
----------------------------------------------------------------------
diff --git a/perl/lib/Lucy.pm b/perl/lib/Lucy.pm
index 4c812ce..cdc2b54 100644
--- a/perl/lib/Lucy.pm
+++ b/perl/lib/Lucy.pm
@@ -305,6 +305,23 @@ BEGIN {
     }
 }
 
+{
+    package Lucy::Util::StringHelper;
+    our $VERSION = '0.005000';
+    $VERSION = eval $VERSION;
+    BEGIN {
+        push our @ISA, 'Exporter';
+        our @EXPORT_OK = qw(
+            utf8_flag_on
+            utf8_flag_off
+            to_base36
+            utf8ify
+            utf8_valid
+            cat_bytes
+        );
+    }
+}
+
 1;
 
 __END__

http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/perl/lib/Lucy/Util/StringHelper.pm
----------------------------------------------------------------------
diff --git a/perl/lib/Lucy/Util/StringHelper.pm b/perl/lib/Lucy/Util/StringHelper.pm
new file mode 100644
index 0000000..098855f
--- /dev/null
+++ b/perl/lib/Lucy/Util/StringHelper.pm
@@ -0,0 +1,25 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+package Lucy::Util::StringHelper;
+use Lucy;
+our $VERSION = '0.005000';
+$VERSION = eval $VERSION;
+
+1;
+
+__END__
+
+

http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/perl/lib/LucyX/Index/ZlibDocReader.pm
----------------------------------------------------------------------
diff --git a/perl/lib/LucyX/Index/ZlibDocReader.pm b/perl/lib/LucyX/Index/ZlibDocReader.pm
index 45ff9fc..6727e01 100644
--- a/perl/lib/LucyX/Index/ZlibDocReader.pm
+++ b/perl/lib/LucyX/Index/ZlibDocReader.pm
@@ -20,7 +20,7 @@ package LucyX::Index::ZlibDocReader;
 use base qw( Lucy::Index::DocReader );
 our $VERSION = '0.005000';
 $VERSION = eval $VERSION;
-use Clownfish::Util::StringHelper qw( utf8_valid utf8_flag_on );
+use Lucy::Util::StringHelper qw( utf8_valid utf8_flag_on );
 use Compress::Zlib qw( uncompress );
 use Carp;
 

http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/perl/lib/LucyX/Index/ZlibDocWriter.pm
----------------------------------------------------------------------
diff --git a/perl/lib/LucyX/Index/ZlibDocWriter.pm b/perl/lib/LucyX/Index/ZlibDocWriter.pm
index 884dee0..2104f25 100644
--- a/perl/lib/LucyX/Index/ZlibDocWriter.pm
+++ b/perl/lib/LucyX/Index/ZlibDocWriter.pm
@@ -20,7 +20,7 @@ use base qw( Lucy::Index::DataWriter );
 use Carp;
 use Scalar::Util qw( blessed );
 use Compress::Zlib qw( compress );
-use Clownfish::Util::StringHelper qw( cat_bytes );
+use Lucy::Util::StringHelper qw( cat_bytes );
 use Clownfish;
 use bytes;
 no bytes;

http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/perl/t/105-folder.t
----------------------------------------------------------------------
diff --git a/perl/t/105-folder.t b/perl/t/105-folder.t
index ef94518..735c162 100644
--- a/perl/t/105-folder.t
+++ b/perl/t/105-folder.t
@@ -21,7 +21,7 @@ use Test::More tests => 25;
 use File::Spec::Functions qw( catfile );
 use Fcntl;
 use Lucy::Test::TestUtils qw( init_test_index_loc );
-use Clownfish::Util::StringHelper qw( to_base36 );
+use Lucy::Util::StringHelper qw( to_base36 );
 
 my $fs_index_loc = init_test_index_loc();
 my $fs_folder    = Lucy::Store::FSFolder->new( path => $fs_index_loc, );

http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/perl/t/601-queryparser.t
----------------------------------------------------------------------
diff --git a/perl/t/601-queryparser.t b/perl/t/601-queryparser.t
index 5c2275c..6e29bcd 100644
--- a/perl/t/601-queryparser.t
+++ b/perl/t/601-queryparser.t
@@ -77,7 +77,7 @@ sub make_req_opt_query { shift; MyReqOptQuery->new(@_) }
 
 package main;
 use Test::More tests => 224;
-use Clownfish::Util::StringHelper qw( utf8_flag_on utf8ify );
+use Lucy::Util::StringHelper qw( utf8_flag_on utf8ify );
 use Lucy::Test::TestUtils qw( create_index );
 
 my $folder       = Lucy::Store::RAMFolder->new;

http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/perl/t/binding/101-simple_io.t
----------------------------------------------------------------------
diff --git a/perl/t/binding/101-simple_io.t b/perl/t/binding/101-simple_io.t
index 85db57a..e2a03e9 100644
--- a/perl/t/binding/101-simple_io.t
+++ b/perl/t/binding/101-simple_io.t
@@ -19,7 +19,7 @@ use lib 'buildlib';
 
 use Test::More tests => 28;
 use Lucy::Test::TestUtils qw( utf8_test_strings );
-use Clownfish::Util::StringHelper qw( utf8ify utf8_flag_off );
+use Lucy::Util::StringHelper qw( utf8ify utf8_flag_off );
 use bytes;
 no bytes;
 

http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/perl/t/core/032-string_helper.t
----------------------------------------------------------------------
diff --git a/perl/t/core/032-string_helper.t b/perl/t/core/032-string_helper.t
new file mode 100644
index 0000000..b8ddea6
--- /dev/null
+++ b/perl/t/core/032-string_helper.t
@@ -0,0 +1,25 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+use strict;
+use warnings;
+
+use Lucy::Test;
+my $success = Lucy::Test::run_tests(
+    "Lucy::Test::Util::TestStringHelper"
+);
+
+exit($success ? 0 : 1);
+

http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/perl/xs/Lucy/Analysis/RegexTokenizer.c
----------------------------------------------------------------------
diff --git a/perl/xs/Lucy/Analysis/RegexTokenizer.c b/perl/xs/Lucy/Analysis/RegexTokenizer.c
index 408acf1..a9ee264 100644
--- a/perl/xs/Lucy/Analysis/RegexTokenizer.c
+++ b/perl/xs/Lucy/Analysis/RegexTokenizer.c
@@ -22,8 +22,8 @@
 #include "Lucy/Analysis/RegexTokenizer.h"
 #include "Lucy/Analysis/Token.h"
 #include "Lucy/Analysis/Inversion.h"
+#include "Lucy/Util/StringHelper.h"
 #include "Clownfish/Util/Memory.h"
-#include "Clownfish/Util/StringHelper.h"
 
 static SV*
 S_compile_token_re(pTHX_ cfish_String *pattern);
@@ -154,14 +154,14 @@ LUCY_RegexTokenizer_Tokenize_Utf8_IMP(lucy_RegexTokenizer *self,
 
         // Get start and end offsets in Unicode code points.
         for (; string_arg < start_ptr; num_code_points++) {
-            string_arg += cfish_StrHelp_UTF8_COUNT[(uint8_t)(*string_arg)];
+            string_arg += lucy_StrHelp_UTF8_COUNT[(uint8_t)(*string_arg)];
             if (string_arg > string_end) {
                 THROW(CFISH_ERR, "scanned past end of '%s'", string_beg);
             }
         }
         start = num_code_points;
         for (; string_arg < end_ptr; num_code_points++) {
-            string_arg += cfish_StrHelp_UTF8_COUNT[(uint8_t)(*string_arg)];
+            string_arg += lucy_StrHelp_UTF8_COUNT[(uint8_t)(*string_arg)];
             if (string_arg > string_end) {
                 THROW(CFISH_ERR, "scanned past end of '%s'", string_beg);
             }

http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/perl/xs/Lucy/Index/Inverter.c
----------------------------------------------------------------------
diff --git a/perl/xs/Lucy/Index/Inverter.c b/perl/xs/Lucy/Index/Inverter.c
index 28c8e90..2dfbd17 100644
--- a/perl/xs/Lucy/Index/Inverter.c
+++ b/perl/xs/Lucy/Index/Inverter.c
@@ -27,7 +27,6 @@
 #include "Lucy/Plan/NumericType.h"
 #include "Lucy/Plan/Schema.h"
 #include "Lucy/Plan/TextType.h"
-#include "Clownfish/Util/StringHelper.h"
 
 static lucy_InverterEntry*
 S_fetch_entry(pTHX_ lucy_Inverter *self, HE *hash_entry) {

http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/test/Lucy/Test.c
----------------------------------------------------------------------
diff --git a/test/Lucy/Test.c b/test/Lucy/Test.c
index 0edda50..943c5ab 100644
--- a/test/Lucy/Test.c
+++ b/test/Lucy/Test.c
@@ -85,6 +85,7 @@
 #include "Lucy/Test/Util/TestNumberUtils.h"
 #include "Lucy/Test/Util/TestPriorityQueue.h"
 #include "Lucy/Test/Util/TestSortExternal.h"
+#include "Lucy/Test/Util/TestStringHelper.h"
 
 TestSuite*
 Test_create_test_suite() {
@@ -95,6 +96,7 @@ Test_create_test_suite() {
     TestSuite_Add_Batch(suite, (TestBatch*)TestSortExternal_new());
     TestSuite_Add_Batch(suite, (TestBatch*)TestMemPool_new());
     TestSuite_Add_Batch(suite, (TestBatch*)TestNumUtil_new());
+    TestSuite_Add_Batch(suite, (TestBatch*)TestStrHelp_new());
     TestSuite_Add_Batch(suite, (TestBatch*)TestIxFileNames_new());
     TestSuite_Add_Batch(suite, (TestBatch*)TestJson_new());
     TestSuite_Add_Batch(suite, (TestBatch*)TestFreezer_new());

http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/test/Lucy/Test/Util/TestStringHelper.c
----------------------------------------------------------------------
diff --git a/test/Lucy/Test/Util/TestStringHelper.c b/test/Lucy/Test/Util/TestStringHelper.c
new file mode 100644
index 0000000..0cf44e8
--- /dev/null
+++ b/test/Lucy/Test/Util/TestStringHelper.c
@@ -0,0 +1,103 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define CFISH_USE_SHORT_NAMES
+#define LUCY_USE_SHORT_NAMES
+#define TESTLUCY_USE_SHORT_NAMES
+
+#include "Lucy/Test/Util/TestStringHelper.h"
+#include "Lucy/Util/StringHelper.h"
+
+#include "Clownfish/Class.h"
+#include "Clownfish/String.h"
+#include "Clownfish/TestHarness/TestBatchRunner.h"
+
+TestStringHelper*
+TestStrHelp_new() {
+    return (TestStringHelper*)Class_Make_Obj(TESTSTRINGHELPER);
+}
+
+static void
+test_overlap(TestBatchRunner *runner) {
+    size_t result;
+    result = StrHelp_overlap("", "", 0, 0);
+    TEST_UINT_EQ(runner, result, 0, "two empty strings");
+    result = StrHelp_overlap("", "foo", 0, 3);
+    TEST_UINT_EQ(runner, result, 0, "first string is empty");
+    result = StrHelp_overlap("foo", "", 3, 0);
+    TEST_UINT_EQ(runner, result, 0, "second string is empty");
+    result = StrHelp_overlap("foo", "foo", 3, 3);
+    TEST_UINT_EQ(runner, result, 3, "equal strings");
+    result = StrHelp_overlap("foo bar", "foo", 7, 3);
+    TEST_UINT_EQ(runner, result, 3, "first string is longer");
+    result = StrHelp_overlap("foo", "foo bar", 3, 7);
+    TEST_UINT_EQ(runner, result, 3, "second string is longer");
+    result = StrHelp_overlap("bar", "baz", 3, 3);
+    TEST_UINT_EQ(runner, result, 2, "different byte");
+}
+
+
+static void
+test_to_base36(TestBatchRunner *runner) {
+    char buffer[StrHelp_MAX_BASE36_BYTES];
+    StrHelp_to_base36(UINT64_MAX, buffer);
+    TEST_STR_EQ(runner, "3w5e11264sgsf", buffer, "base36 UINT64_MAX");
+    StrHelp_to_base36(1, buffer);
+    TEST_STR_EQ(runner, "1", buffer, "base36 1");
+    TEST_INT_EQ(runner, buffer[1], 0, "base36 NULL termination");
+}
+
+static void
+test_back_utf8_char(TestBatchRunner *runner) {
+    char buffer[4];
+    char *buf = buffer + 1;
+    uint32_t len = Str_encode_utf8_char(0x263A, buffer);
+    char *end = buffer + len;
+    TEST_TRUE(runner, StrHelp_back_utf8_char(end, buffer) == buffer,
+              "back_utf8_char");
+    TEST_TRUE(runner, StrHelp_back_utf8_char(end, buf) == NULL,
+              "back_utf8_char returns NULL rather than back up beyond start");
+    TEST_TRUE(runner, StrHelp_back_utf8_char(buffer, buffer) == NULL,
+              "back_utf8_char returns NULL when end == start");
+
+    int32_t code_point;
+    for (code_point = 0; code_point <= 0x10FFFF; code_point++) {
+        uint32_t size = Str_encode_utf8_char(code_point, buffer);
+        char *start = buffer;
+        char *end   = start + size;
+
+        if (StrHelp_back_utf8_char(end, start) != start) {
+            break;
+        }
+    }
+    if (code_point == 0x110000) {
+        PASS(runner, "back_utf8_char works for code points 0 - 0x10FFFF");
+    }
+    else {
+        FAIL(runner, "Failed back_utf8_char at 0x%.1X", (unsigned)code_point);
+    }
+}
+
+void
+TestStrHelp_Run_IMP(TestStringHelper *self, TestBatchRunner *runner) {
+    TestBatchRunner_Plan(runner, (TestBatch*)self, 14);
+    test_overlap(runner);
+    test_to_base36(runner);
+    test_back_utf8_char(runner);
+}
+
+
+

http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/test/Lucy/Test/Util/TestStringHelper.cfh
----------------------------------------------------------------------
diff --git a/test/Lucy/Test/Util/TestStringHelper.cfh b/test/Lucy/Test/Util/TestStringHelper.cfh
new file mode 100644
index 0000000..d0be8ec
--- /dev/null
+++ b/test/Lucy/Test/Util/TestStringHelper.cfh
@@ -0,0 +1,29 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+parcel TestLucy;
+
+class Lucy::Test::Util::TestStringHelper nickname TestStrHelp
+    inherits Clownfish::TestHarness::TestBatch {
+
+    inert incremented TestStringHelper*
+    new();
+
+    void
+    Run(TestStringHelper *self, TestBatchRunner *runner);
+}
+
+