You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucy.apache.org by nw...@apache.org on 2016/08/06 14:29:40 UTC
[4/6] lucy-clownfish git commit: Move some functions from StrHelp to
Str
Move some functions from StrHelp to Str
- utf8_valid
- validate_utf8
- is_whitespace
- encode_utf8_char
Project: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/commit/ed2010ca
Tree: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/tree/ed2010ca
Diff: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/diff/ed2010ca
Branch: refs/heads/master
Commit: ed2010caec0af7ccdcaff76a05cdb516166a6ad4
Parents: 64a1000
Author: Nick Wellnhofer <we...@aevum.de>
Authored: Tue Aug 2 18:46:35 2016 +0200
Committer: Nick Wellnhofer <we...@aevum.de>
Committed: Tue Aug 2 19:05:14 2016 +0200
----------------------------------------------------------------------
runtime/core/Clownfish/CharBuf.c | 3 +-
runtime/core/Clownfish/String.c | 180 +++++++++++-
runtime/core/Clownfish/String.cfh | 33 +++
runtime/core/Clownfish/Util/StringHelper.c | 176 -----------
runtime/core/Clownfish/Util/StringHelper.cfh | 34 ---
.../perl/buildlib/Clownfish/Build/Binding.pm | 2 +-
runtime/perl/xs/XSBind.c | 1 -
runtime/python/cfext/CFBind.c | 3 +-
runtime/ruby/ext/Bind.c | 1 -
runtime/ruby/ext/Clownfish.c | 1 -
runtime/test/Clownfish/Test/TestCharBuf.c | 3 +-
runtime/test/Clownfish/Test/TestString.c | 294 ++++++++++++++++++-
.../test/Clownfish/Test/Util/TestStringHelper.c | 294 +------------------
13 files changed, 522 insertions(+), 503 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/core/Clownfish/CharBuf.c
----------------------------------------------------------------------
diff --git a/runtime/core/Clownfish/CharBuf.c b/runtime/core/Clownfish/CharBuf.c
index 2dbae91..30f54dd 100644
--- a/runtime/core/Clownfish/CharBuf.c
+++ b/runtime/core/Clownfish/CharBuf.c
@@ -30,7 +30,6 @@
#include "Clownfish/Err.h"
#include "Clownfish/String.h"
#include "Clownfish/Util/Memory.h"
-#include "Clownfish/Util/StringHelper.h"
#include "Clownfish/Class.h"
// Append trusted UTF-8 to the CharBuf.
@@ -290,7 +289,7 @@ CB_Cat_Char_IMP(CharBuf *self, int32_t code_point) {
size_t old_size = self->size;
SI_add_grow_and_oversize(self, old_size, MAX_UTF8_BYTES);
char *end = self->ptr + old_size;
- size_t count = StrHelp_encode_utf8_char(code_point, (uint8_t*)end);
+ size_t count = Str_encode_utf8_char(code_point, (uint8_t*)end);
self->size += count;
}
http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/core/Clownfish/String.c
----------------------------------------------------------------------
diff --git a/runtime/core/Clownfish/String.c b/runtime/core/Clownfish/String.c
index 0353ffd..0de7f28 100644
--- a/runtime/core/Clownfish/String.c
+++ b/runtime/core/Clownfish/String.c
@@ -19,6 +19,7 @@
#define CFISH_USE_SHORT_NAMES
#include <string.h>
+#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
@@ -29,7 +30,6 @@
#include "Clownfish/CharBuf.h"
#include "Clownfish/Err.h"
#include "Clownfish/Util/Memory.h"
-#include "Clownfish/Util/StringHelper.h"
#define STACK_ITER(string, byte_offset) \
S_new_stack_iter(alloca(sizeof(StringIterator)), string, byte_offset)
@@ -40,6 +40,178 @@ S_memmem(String *self, const char *substring, size_t size);
static StringIterator*
S_new_stack_iter(void *allocation, String *string, size_t byte_offset);
+// Return a pointer to the first invalid UTF-8 sequence, or NULL if
+// the UTF-8 is valid.
+//
+// Scans `size` bytes starting at `string`. On failure the returned pointer
+// addresses the header byte of the offending sequence. Stray continuation
+// bytes, non-shortest-form encodings, UTF-16 surrogates, code points above
+// 0x10FFFF and sequences truncated by the end of the input are all
+// reported as invalid.
+static const uint8_t*
+S_find_invalid_utf8(const uint8_t *string, size_t size) {
+ const uint8_t *const end = string + size;
+ while (string < end) {
+ // Remember the header position so it can be returned on failure.
+ const uint8_t *start = string;
+ const uint8_t header_byte = *string++;
+
+ if (header_byte < 0x80) {
+ // ASCII
+ ;
+ }
+ else if (header_byte < 0xE0) {
+ // Disallow non-shortest-form ASCII and continuation bytes.
+ if (header_byte < 0xC2) { return start; }
+ // Two-byte sequence.
+ if (string == end) { return start; }
+ if ((*string++ & 0xC0) != 0x80) { return start; }
+ }
+ else if (header_byte < 0xF0) {
+ // Three-byte sequence.
+ if (end - string < 2) { return start; }
+ if (header_byte == 0xED) {
+ // Disallow UTF-16 surrogates.
+ if (*string < 0x80 || *string > 0x9F) {
+ return start;
+ }
+ }
+ else if (!(header_byte & 0x0F)) {
+ // Disallow non-shortest-form.
+ // Header is 0xE0 here: the second byte needs bit 0x20 set,
+ // otherwise the value would have fit in two bytes.
+ if (!(*string & 0x20)) {
+ return start;
+ }
+ }
+ if ((*string++ & 0xC0) != 0x80) { return start; }
+ if ((*string++ & 0xC0) != 0x80) { return start; }
+ }
+ else {
+ if (header_byte > 0xF4) { return start; }
+ // Four-byte sequence.
+ if (end - string < 3) { return start; }
+ if (!(header_byte & 0x07)) {
+ // Disallow non-shortest-form.
+ // Header is 0xF0 here: the second byte needs one of the
+ // 0x30 bits set, otherwise three bytes would have sufficed.
+ if (!(*string & 0x30)) {
+ return start;
+ }
+ }
+ else if (header_byte == 0xF4) {
+ // Code point larger than 0x10FFFF.
+ if (*string >= 0x90) {
+ return start;
+ }
+ }
+ if ((*string++ & 0xC0) != 0x80) { return start; }
+ if ((*string++ & 0xC0) != 0x80) { return start; }
+ if ((*string++ & 0xC0) != 0x80) { return start; }
+ }
+ }
+
+ return NULL;
+}
+
+// Return true if the `size` bytes at `ptr` are valid UTF-8, i.e. if
+// S_find_invalid_utf8() finds no invalid sequence in them.
+bool
+Str_utf8_valid(const char *ptr, size_t size) {
+ return S_find_invalid_utf8((const uint8_t*)ptr, size) == NULL;
+}
+
+// Throw an error if the `size` bytes at `ptr` are not valid UTF-8; return
+// normally otherwise.
+//
+// The error message has the form
+//   Invalid UTF-8[ after '<prefix>']: XX XX ...
+// where <prefix> is up to 20 code points of text preceding the invalid
+// sequence and the hex dump shows up to 5 bytes starting at the invalid
+// sequence. `file`, `line` and `func` are added to the error as a frame.
+void
+Str_validate_utf8(const char *ptr, size_t size, const char *file, int line,
+ const char *func) {
+ const uint8_t *string = (const uint8_t*)ptr;
+ const uint8_t *invalid = S_find_invalid_utf8(string, size);
+ if (invalid == NULL) { return; }
+
+ CharBuf *buf = CB_new(0);
+ CB_Cat_Trusted_Utf8(buf, "Invalid UTF-8", 13);
+
+ // Quote some context, but only if anything precedes the bad sequence.
+ if (invalid > string) {
+ const uint8_t *prefix = invalid;
+ size_t num_code_points = 0;
+
+ // Skip up to 20 code points backwards.
+ while (prefix > string) {
+ prefix -= 1;
+
+ // Count only bytes that are not continuation bytes (10xxxxxx).
+ if ((*prefix & 0xC0) != 0x80) {
+ num_code_points += 1;
+ if (num_code_points >= 20) { break; }
+ }
+ }
+
+ CB_Cat_Trusted_Utf8(buf, " after '", 8);
+ CB_Cat_Trusted_Utf8(buf, (const char*)prefix, invalid - prefix);
+ CB_Cat_Trusted_Utf8(buf, "'", 1);
+ }
+
+ CB_Cat_Trusted_Utf8(buf, ":", 1);
+
+ // Append offending bytes as hex.
+ const uint8_t *end = string + size;
+ const uint8_t *max = invalid + 5;
+ for (const uint8_t *byte = invalid; byte < end && byte < max; byte++) {
+ // " XX" is three characters plus the terminating NUL.
+ char hex[4];
+ sprintf(hex, " %02X", *byte);
+ CB_Cat_Trusted_Utf8(buf, hex, 3);
+ }
+
+ String *mess = CB_Yield_String(buf);
+ DECREF(buf);
+
+ Err *err = Err_new(mess);
+ Err_Add_Frame(err, file, line, func);
+ Err_do_throw(err);
+}
+
+// Return true if `code_point` is one of the Unicode whitespace code points
+// recognized by this library, false for every other value.
+bool
+Str_is_whitespace(int32_t code_point) {
+    // <control-0009>..<control-000D>
+    if (code_point >= 0x0009 && code_point <= 0x000D) { return true; }
+
+    // EN QUAD..HAIR SPACE
+    if (code_point >= 0x2000 && code_point <= 0x200A) { return true; }
+
+    // The remaining whitespace code points are isolated values.
+    return code_point == 0x0020     // SPACE
+           || code_point == 0x0085  // <control-0085>
+           || code_point == 0x00A0  // NO-BREAK SPACE
+           || code_point == 0x1680  // OGHAM SPACE MARK
+           || code_point == 0x2028  // LINE SEPARATOR
+           || code_point == 0x2029  // PARAGRAPH SEPARATOR
+           || code_point == 0x202F  // NARROW NO-BREAK SPACE
+           || code_point == 0x205F  // MEDIUM MATHEMATICAL SPACE
+           || code_point == 0x3000; // IDEOGRAPHIC SPACE
+}
+
+// Encode `code_point` as a UTF-8 sequence written to `buffer`, which must
+// have room for at least 4 bytes. No terminating NUL is written.
+//
+// Returns the number of bytes written (1 to 4). Throws an error if
+// `code_point` is negative or greater than 0x10FFFF. Code points in the
+// surrogate range 0xD800..0xDFFF are not rejected; they are encoded as
+// three-byte sequences like any other value in the three-byte range.
+uint32_t
+Str_encode_utf8_char(int32_t code_point, void *buffer) {
+    uint8_t *buf = (uint8_t*)buffer;
+
+    // Reject out-of-range values first. A negative code point would
+    // otherwise satisfy the `<= 0x7F` test below and be emitted as a single
+    // truncated byte.
+    if (code_point < 0 || code_point > 0x10FFFF) {
+        THROW(ERR, "Illegal Unicode code point: %i32", code_point);
+        UNREACHABLE_RETURN(uint32_t);
+    }
+
+    if (code_point <= 0x7F) { // ASCII
+        buf[0] = (uint8_t)code_point;
+        return 1;
+    }
+    else if (code_point <= 0x07FF) { // 2 byte range
+        buf[0] = (uint8_t)(0xC0 | (code_point >> 6));
+        buf[1] = (uint8_t)(0x80 | (code_point & 0x3f));
+        return 2;
+    }
+    else if (code_point <= 0xFFFF) { // 3 byte range
+        buf[0] = (uint8_t)(0xE0 | (code_point >> 12));
+        buf[1] = (uint8_t)(0x80 | ((code_point >> 6) & 0x3F));
+        buf[2] = (uint8_t)(0x80 | (code_point & 0x3f));
+        return 3;
+    }
+    else { // 4 byte range, up to 0x10FFFF
+        buf[0] = (uint8_t)(0xF0 | (code_point >> 18));
+        buf[1] = (uint8_t)(0x80 | ((code_point >> 12) & 0x3F));
+        buf[2] = (uint8_t)(0x80 | ((code_point >> 6) & 0x3F));
+        buf[3] = (uint8_t)(0x80 | (code_point & 0x3f));
+        return 4;
+    }
+}
+
String*
Str_new_from_utf8(const char *utf8, size_t size) {
VALIDATE_UTF8(utf8, size);
@@ -122,7 +294,7 @@ String*
Str_new_from_char(int32_t code_point) {
const size_t MAX_UTF8_BYTES = 4;
char *ptr = (char*)MALLOCATE(MAX_UTF8_BYTES + 1);
- size_t size = StrHelp_encode_utf8_char(code_point, (uint8_t*)ptr);
+ size_t size = Str_encode_utf8_char(code_point, (uint8_t*)ptr);
ptr[size] = '\0';
String *self = (String*)Class_Make_Obj(STRING);
@@ -740,7 +912,7 @@ StrIter_Skip_Whitespace_IMP(StringIterator *self) {
int32_t code_point;
while (STR_OOB != (code_point = StrIter_Next(self))) {
- if (!StrHelp_is_whitespace(code_point)) { break; }
+ if (!Str_is_whitespace(code_point)) { break; }
byte_offset = self->byte_offset;
++num_skipped;
}
@@ -756,7 +928,7 @@ StrIter_Skip_Whitespace_Back_IMP(StringIterator *self) {
int32_t code_point;
while (STR_OOB != (code_point = StrIter_Prev(self))) {
- if (!StrHelp_is_whitespace(code_point)) { break; }
+ if (!Str_is_whitespace(code_point)) { break; }
byte_offset = self->byte_offset;
++num_skipped;
}
http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/core/Clownfish/String.cfh
----------------------------------------------------------------------
diff --git a/runtime/core/Clownfish/String.cfh b/runtime/core/Clownfish/String.cfh
index 72f60a1..6b3323e 100644
--- a/runtime/core/Clownfish/String.cfh
+++ b/runtime/core/Clownfish/String.cfh
@@ -24,6 +24,9 @@ __C__
// For CFISH_ALLOCA_OBJ.
#include "Clownfish/Class.h"
+// For CFISH_ERR_FUNC_MACRO.
+#include "Clownfish/Err.h"
+
__END_C__
/**
@@ -37,6 +40,31 @@ public final class Clownfish::String nickname Str
size_t size;
String *origin;
+ /** Return true if the string is valid UTF-8, false otherwise.
+ *
+ * @param ptr Pointer to the bytes to check.
+ * @param len Number of bytes to check.
+ */
+ public inert bool
+ utf8_valid(const char *ptr, size_t len);
+
+ /** Throws an error if the string isn't valid UTF-8.
+ *
+ * @param text Pointer to the bytes to check.
+ * @param size Number of bytes to check.
+ * @param file File name recorded in the error's frame.
+ * @param line Line number recorded in the error's frame.
+ * @param func Function name recorded in the error's frame.
+ */
+ public inert void
+ validate_utf8(const char *text, size_t size, const char *file, int line,
+ const char *func);
+
+ /** Returns true if the code point qualifies as Unicode whitespace.
+ *
+ * @param code_point The code point to classify.
+ */
+ public inert bool
+ is_whitespace(int32_t code_point);
+
+ /** Encode a Unicode code point to a UTF-8 sequence.
+ *
+ * @param code_point A legal unicode code point.
+ * @param buffer Write buffer which must hold at least 4 bytes (the
+ * maximum legal length for a UTF-8 char).
+ * @return the number of bytes written to `buffer` (1 to 4).
+ */
+ inert uint32_t
+ encode_utf8_char(int32_t code_point, void *buffer);
+
/** Return a String which holds a copy of the supplied UTF-8 character
* data after checking for validity.
*
@@ -506,6 +534,10 @@ public final class Clownfish::StringIterator nickname StrIter
__C__
+#define CFISH_VALIDATE_UTF8(text, size) \
+ cfish_Str_validate_utf8(text, size, \
+ __FILE__, __LINE__, CFISH_ERR_FUNC_MACRO)
+
#define CFISH_SSTR_BLANK() \
cfish_Str_init_stack_string(CFISH_ALLOCA_OBJ(CFISH_STRING), "", 0)
@@ -519,6 +551,7 @@ __C__
#define CFISH_STR_OOB -1
#ifdef CFISH_USE_SHORT_NAMES
+ #define VALIDATE_UTF8 CFISH_VALIDATE_UTF8
#define SSTR_BLANK CFISH_SSTR_BLANK
#define SSTR_WRAP_C CFISH_SSTR_WRAP_C
#define SSTR_WRAP_UTF8 CFISH_SSTR_WRAP_UTF8
http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/core/Clownfish/Util/StringHelper.c
----------------------------------------------------------------------
diff --git a/runtime/core/Clownfish/Util/StringHelper.c b/runtime/core/Clownfish/Util/StringHelper.c
index 256c9e0..7b8e9d8 100644
--- a/runtime/core/Clownfish/Util/StringHelper.c
+++ b/runtime/core/Clownfish/Util/StringHelper.c
@@ -15,10 +15,6 @@
*/
#define C_CFISH_STRINGHELPER
-#include <string.h>
-#include <stddef.h>
-#include <stdio.h>
-
#define CFISH_USE_SHORT_NAMES
#include "Clownfish/Util/StringHelper.h"
@@ -79,178 +75,6 @@ StrHelp_to_base36(uint64_t num, void *buffer) {
return size;
}
-// Return a pointer to the first invalid UTF-8 sequence, or NULL if
-// the UTF-8 is valid.
-static const uint8_t*
-S_find_invalid_utf8(const uint8_t *string, size_t size) {
- const uint8_t *const end = string + size;
- while (string < end) {
- const uint8_t *start = string;
- const uint8_t header_byte = *string++;
-
- if (header_byte < 0x80) {
- // ASCII
- ;
- }
- else if (header_byte < 0xE0) {
- // Disallow non-shortest-form ASCII and continuation bytes.
- if (header_byte < 0xC2) { return start; }
- // Two-byte sequence.
- if (string == end) { return start; }
- if ((*string++ & 0xC0) != 0x80) { return start; }
- }
- else if (header_byte < 0xF0) {
- // Three-byte sequence.
- if (end - string < 2) { return start; }
- if (header_byte == 0xED) {
- // Disallow UTF-16 surrogates.
- if (*string < 0x80 || *string > 0x9F) {
- return start;
- }
- }
- else if (!(header_byte & 0x0F)) {
- // Disallow non-shortest-form.
- if (!(*string & 0x20)) {
- return start;
- }
- }
- if ((*string++ & 0xC0) != 0x80) { return start; }
- if ((*string++ & 0xC0) != 0x80) { return start; }
- }
- else {
- if (header_byte > 0xF4) { return start; }
- // Four-byte sequence.
- if (end - string < 3) { return start; }
- if (!(header_byte & 0x07)) {
- // Disallow non-shortest-form.
- if (!(*string & 0x30)) {
- return start;
- }
- }
- else if (header_byte == 0xF4) {
- // Code point larger than 0x10FFFF.
- if (*string >= 0x90) {
- return start;
- }
- }
- if ((*string++ & 0xC0) != 0x80) { return start; }
- if ((*string++ & 0xC0) != 0x80) { return start; }
- if ((*string++ & 0xC0) != 0x80) { return start; }
- }
- }
-
- return NULL;
-}
-
-bool
-StrHelp_utf8_valid(const char *ptr, size_t size) {
- return S_find_invalid_utf8((const uint8_t*)ptr, size) == NULL;
-}
-
-void
-StrHelp_validate_utf8(const char *ptr, size_t size, const char *file,
- int line, const char *func) {
- const uint8_t *string = (const uint8_t*)ptr;
- const uint8_t *invalid = S_find_invalid_utf8(string, size);
- if (invalid == NULL) { return; }
-
- CharBuf *buf = CB_new(0);
- CB_Cat_Trusted_Utf8(buf, "Invalid UTF-8", 13);
-
- if (invalid > string) {
- const uint8_t *prefix = invalid;
- size_t num_code_points = 0;
-
- // Skip up to 20 code points backwards.
- while (prefix > string) {
- prefix -= 1;
-
- if ((*prefix & 0xC0) != 0x80) {
- num_code_points += 1;
- if (num_code_points >= 20) { break; }
- }
- }
-
- CB_Cat_Trusted_Utf8(buf, " after '", 8);
- CB_Cat_Trusted_Utf8(buf, (const char*)prefix, invalid - prefix);
- CB_Cat_Trusted_Utf8(buf, "'", 1);
- }
-
- CB_Cat_Trusted_Utf8(buf, ":", 1);
-
- // Append offending bytes as hex.
- const uint8_t *end = string + size;
- const uint8_t *max = invalid + 5;
- for (const uint8_t *byte = invalid; byte < end && byte < max; byte++) {
- char hex[4];
- sprintf(hex, " %02X", *byte);
- CB_Cat_Trusted_Utf8(buf, hex, 3);
- }
-
- String *mess = CB_Yield_String(buf);
- DECREF(buf);
-
- Err *err = Err_new(mess);
- Err_Add_Frame(err, file, line, func);
- Err_do_throw(err);
-}
-
-bool
-StrHelp_is_whitespace(int32_t code_point) {
- switch (code_point) {
- // <control-0009>..<control-000D>
- case 0x0009: case 0x000A: case 0x000B: case 0x000C: case 0x000D:
- case 0x0020: // SPACE
- case 0x0085: // <control-0085>
- case 0x00A0: // NO-BREAK SPACE
- case 0x1680: // OGHAM SPACE MARK
- // EN QUAD..HAIR SPACE
- case 0x2000: case 0x2001: case 0x2002: case 0x2003: case 0x2004:
- case 0x2005: case 0x2006: case 0x2007: case 0x2008: case 0x2009:
- case 0x200A:
- case 0x2028: // LINE SEPARATOR
- case 0x2029: // PARAGRAPH SEPARATOR
- case 0x202F: // NARROW NO-BREAK SPACE
- case 0x205F: // MEDIUM MATHEMATICAL SPACE
- case 0x3000: // IDEOGRAPHIC SPACE
- return true;
-
- default:
- return false;
- }
-}
-
-uint32_t
-StrHelp_encode_utf8_char(int32_t code_point, void *buffer) {
- uint8_t *buf = (uint8_t*)buffer;
- if (code_point <= 0x7F) { // ASCII
- buf[0] = (uint8_t)code_point;
- return 1;
- }
- else if (code_point <= 0x07FF) { // 2 byte range
- buf[0] = (uint8_t)(0xC0 | (code_point >> 6));
- buf[1] = (uint8_t)(0x80 | (code_point & 0x3f));
- return 2;
- }
- else if (code_point <= 0xFFFF) { // 3 byte range
- buf[0] = (uint8_t)(0xE0 | (code_point >> 12));
- buf[1] = (uint8_t)(0x80 | ((code_point >> 6) & 0x3F));
- buf[2] = (uint8_t)(0x80 | (code_point & 0x3f));
- return 3;
- }
- else if (code_point <= 0x10FFFF) { // 4 byte range
- buf[0] = (uint8_t)(0xF0 | (code_point >> 18));
- buf[1] = (uint8_t)(0x80 | ((code_point >> 12) & 0x3F));
- buf[2] = (uint8_t)(0x80 | ((code_point >> 6) & 0x3F));
- buf[3] = (uint8_t)(0x80 | (code_point & 0x3f));
- return 4;
- }
- else {
- THROW(ERR, "Illegal Unicode code point: %u32", code_point);
- UNREACHABLE_RETURN(uint32_t);
- }
-}
-
const char*
StrHelp_back_utf8_char(const char *ptr, const char *start) {
while (--ptr >= start) {
http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/core/Clownfish/Util/StringHelper.cfh
----------------------------------------------------------------------
diff --git a/runtime/core/Clownfish/Util/StringHelper.cfh b/runtime/core/Clownfish/Util/StringHelper.cfh
index 1264bea..1e915e6 100644
--- a/runtime/core/Clownfish/Util/StringHelper.cfh
+++ b/runtime/core/Clownfish/Util/StringHelper.cfh
@@ -16,10 +16,6 @@
parcel Clownfish;
-__C__
-#include "Clownfish/Err.h"
-__END_C__
-
inert class Clownfish::Util::StringHelper nickname StrHelp {
/* A table where the values indicate the number of bytes in a UTF-8
@@ -43,31 +39,6 @@ inert class Clownfish::Util::StringHelper nickname StrHelp {
inert size_t
to_base36(uint64_t value, void *buffer);
- /** Return true if the string is valid UTF-8, false otherwise.
- */
- inert bool
- utf8_valid(const char *ptr, size_t len);
-
- /** Throws an error if the string isn't valid UTF-8.
- */
- inert void
- validate_utf8(const char *text, size_t size, const char *file, int line,
- const char *func);
-
- /** Returns true if the code point qualifies as Unicode whitespace.
- */
- inert bool
- is_whitespace(int32_t code_point);
-
- /** Encode a Unicode code point to a UTF-8 sequence.
- *
- * @param code_point A legal unicode code point.
- * @param buffer Write buffer which must hold at least 4 bytes (the
- * maximum legal length for a UTF-8 char).
- */
- inert uint32_t
- encode_utf8_char(int32_t code_point, void *buffer);
-
/** Return the first non-continuation byte before the supplied pointer.
* If backtracking progresses beyond the supplied start, return NULL.
*/
@@ -76,17 +47,12 @@ inert class Clownfish::Util::StringHelper nickname StrHelp {
}
__C__
-#define CFISH_VALIDATE_UTF8(text, size) \
- cfish_StrHelp_validate_utf8(text, size, \
- __FILE__, __LINE__, CFISH_ERR_FUNC_MACRO)
-
/** The maximum number of bytes encoded by to_base36(), including the
* terminating NULL.
*/
#define cfish_StrHelp_MAX_BASE36_BYTES 14
#ifdef CFISH_USE_SHORT_NAMES
#define StrHelp_MAX_BASE36_BYTES cfish_StrHelp_MAX_BASE36_BYTES
- #define VALIDATE_UTF8 CFISH_VALIDATE_UTF8
#endif
__END_C__
http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/perl/buildlib/Clownfish/Build/Binding.pm
----------------------------------------------------------------------
diff --git a/runtime/perl/buildlib/Clownfish/Build/Binding.pm b/runtime/perl/buildlib/Clownfish/Build/Binding.pm
index 71b0ff8..ecc83b5 100644
--- a/runtime/perl/buildlib/Clownfish/Build/Binding.pm
+++ b/runtime/perl/buildlib/Clownfish/Build/Binding.pm
@@ -1010,7 +1010,7 @@ CODE:
{
STRLEN len;
char *ptr = SvPV(sv, len);
- RETVAL = cfish_StrHelp_utf8_valid(ptr, len);
+ RETVAL = cfish_Str_utf8_valid(ptr, len);
}
OUTPUT: RETVAL
http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/perl/xs/XSBind.c
----------------------------------------------------------------------
diff --git a/runtime/perl/xs/XSBind.c b/runtime/perl/xs/XSBind.c
index ab9ee82..b566f8d 100644
--- a/runtime/perl/xs/XSBind.c
+++ b/runtime/perl/xs/XSBind.c
@@ -33,7 +33,6 @@
#include "Clownfish/PtrHash.h"
#include "Clownfish/TestHarness/TestUtils.h"
#include "Clownfish/Util/Atomic.h"
-#include "Clownfish/Util/StringHelper.h"
#include "Clownfish/Util/Memory.h"
#define XSBIND_REFCOUNT_FLAG 1
http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/python/cfext/CFBind.c
----------------------------------------------------------------------
diff --git a/runtime/python/cfext/CFBind.c b/runtime/python/cfext/CFBind.c
index 0703880..536cb1d 100644
--- a/runtime/python/cfext/CFBind.c
+++ b/runtime/python/cfext/CFBind.c
@@ -39,7 +39,6 @@
#include "Clownfish/TestHarness/TestUtils.h"
#include "Clownfish/Util/Atomic.h"
#include "Clownfish/Util/Memory.h"
-#include "Clownfish/Util/StringHelper.h"
#include "Clownfish/Vector.h"
static bool Err_initialized;
@@ -195,7 +194,7 @@ S_maybe_py_to_cfish(PyObject *py_obj, cfish_Class *klass, bool increment,
Py_ssize_t size;
char *ptr = PyUnicode_AsUTF8AndSize(py_obj, &size);
// TODO: Can we guarantee that Python will always supply valid UTF-8?
- if (!ptr || !cfish_StrHelp_utf8_valid(ptr, size)) {
+ if (!ptr || !cfish_Str_utf8_valid(ptr, size)) {
return false;
}
*obj_ptr = (cfish_Obj*)cfish_Str_new_from_trusted_utf8(ptr, size);
http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/ruby/ext/Bind.c
----------------------------------------------------------------------
diff --git a/runtime/ruby/ext/Bind.c b/runtime/ruby/ext/Bind.c
index a12b1e1..70c0a9e 100644
--- a/runtime/ruby/ext/Bind.c
+++ b/runtime/ruby/ext/Bind.c
@@ -16,7 +16,6 @@
#include "ruby.h"
#include "Bind.h"
-#include "Clownfish/Util/StringHelper.h"
VALUE
Bind_cfish_to_ruby(cfish_Obj *obj) {
http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/ruby/ext/Clownfish.c
----------------------------------------------------------------------
diff --git a/runtime/ruby/ext/Clownfish.c b/runtime/ruby/ext/Clownfish.c
index 972d2db..8993ad6 100644
--- a/runtime/ruby/ext/Clownfish.c
+++ b/runtime/ruby/ext/Clownfish.c
@@ -17,7 +17,6 @@
#include "ruby.h"
#include "Clownfish/Util/Memory.h"
-#include "Clownfish/Util/StringHelper.h"
#include "Clownfish/String.h"
#include "Clownfish/Test/TestCharBuf.h"
#include "Clownfish/Test.h"
http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/test/Clownfish/Test/TestCharBuf.c
----------------------------------------------------------------------
diff --git a/runtime/test/Clownfish/Test/TestCharBuf.c b/runtime/test/Clownfish/Test/TestCharBuf.c
index 0782ce2..9cf5bbf 100644
--- a/runtime/test/Clownfish/Test/TestCharBuf.c
+++ b/runtime/test/Clownfish/Test/TestCharBuf.c
@@ -32,7 +32,6 @@
#include "Clownfish/Test.h"
#include "Clownfish/TestHarness/TestBatchRunner.h"
#include "Clownfish/TestHarness/TestUtils.h"
-#include "Clownfish/Util/StringHelper.h"
#include "Clownfish/Class.h"
static char smiley[] = { (char)0xE2, (char)0x98, (char)0xBA, 0 };
@@ -116,7 +115,7 @@ test_roundtrip(TestBatchRunner *runner) {
size_t size = Str_Get_Size(str);
// Verify that utf8_valid agrees.
- if (!StrHelp_utf8_valid(start, size)) {
+ if (!Str_utf8_valid(start, size)) {
break;
}
http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/test/Clownfish/Test/TestString.c
----------------------------------------------------------------------
diff --git a/runtime/test/Clownfish/Test/TestString.c b/runtime/test/Clownfish/Test/TestString.c
index d89b5fe..d557546 100644
--- a/runtime/test/Clownfish/Test/TestString.c
+++ b/runtime/test/Clownfish/Test/TestString.c
@@ -38,6 +38,25 @@ static char smiley[] = { (char)0xE2, (char)0x98, (char)0xBA, 0 };
static uint32_t smiley_len = 3;
static int32_t smiley_cp = 0x263A;
+// Length in bytes of a UTF-8 sequence, indexed by its first byte.
+// 0x00..0x7F map to 1, 0xC0..0xDF to 2, 0xE0..0xEF to 3 and 0xF0..0xF4
+// to 4. Bytes that cannot start a sequence (continuation bytes 0x80..0xBF
+// and 0xF5..0xFF) map to 0.
+static const uint8_t UTF8_COUNT[256] = {
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
TestString*
TestStr_new() {
return (TestString*)Class_Make_Obj(TESTSTRING);
@@ -71,6 +90,274 @@ S_smiley_with_whitespace(size_t *num_spaces_ptr) {
return retval;
}
+/* This alternative implementation of utf8_valid() is (presumably) slower, but
+ * it implements the standard in a more linear, easy-to-grok way.
+ *
+ * Returns true if the `size` bytes at `maybe_utf8` are well-formed UTF-8.
+ * The length of each sequence is looked up in UTF8_COUNT from its header
+ * byte. A sequence that would extend past the end of the input is rejected
+ * before any of its trailing bytes are read, so the function never reads
+ * beyond `maybe_utf8 + size`.
+ */
+#define TRAIL_OK(n) ((n) >= 0x80 && (n) <= 0xBF)
+static bool
+S_utf8_valid_alt(const char *maybe_utf8, size_t size) {
+    const uint8_t *string = (const uint8_t*)maybe_utf8;
+    const uint8_t *const end = string + size;
+    while (string < end) {
+        int count = UTF8_COUNT[*string];
+        bool valid = false;
+
+        // A count of 0 marks a byte that cannot start a sequence. A count
+        // larger than the remaining input means the sequence is truncated.
+        if (count == 0 || end - string < count) {
+            return false;
+        }
+
+        if (count == 1) {
+            if (string[0] <= 0x7F) {
+                valid = true;
+            }
+        }
+        else if (count == 2) {
+            if (string[0] >= 0xC2 && string[0] <= 0xDF) {
+                if (TRAIL_OK(string[1])) {
+                    valid = true;
+                }
+            }
+        }
+        else if (count == 3) {
+            if (string[0] == 0xE0) {
+                if (string[1] >= 0xA0 && string[1] <= 0xBF
+                    && TRAIL_OK(string[2])
+                   ) {
+                    valid = true;
+                }
+            }
+            else if (string[0] >= 0xE1 && string[0] <= 0xEC) {
+                if (TRAIL_OK(string[1])
+                    && TRAIL_OK(string[2])
+                   ) {
+                    valid = true;
+                }
+            }
+            else if (string[0] == 0xED) {
+                if (string[1] >= 0x80 && string[1] <= 0x9F
+                    && TRAIL_OK(string[2])
+                   ) {
+                    valid = true;
+                }
+            }
+            else if (string[0] >= 0xEE && string[0] <= 0xEF) {
+                if (TRAIL_OK(string[1])
+                    && TRAIL_OK(string[2])
+                   ) {
+                    valid = true;
+                }
+            }
+        }
+        else if (count == 4) {
+            if (string[0] == 0xF0) {
+                if (string[1] >= 0x90 && string[1] <= 0xBF
+                    && TRAIL_OK(string[2])
+                    && TRAIL_OK(string[3])
+                   ) {
+                    valid = true;
+                }
+            }
+            else if (string[0] >= 0xF1 && string[0] <= 0xF3) {
+                if (TRAIL_OK(string[1])
+                    && TRAIL_OK(string[2])
+                    && TRAIL_OK(string[3])
+                   ) {
+                    valid = true;
+                }
+            }
+            else if (string[0] == 0xF4) {
+                if (string[1] >= 0x80 && string[1] <= 0x8F
+                    && TRAIL_OK(string[2])
+                    && TRAIL_OK(string[3])
+                   ) {
+                    valid = true;
+                }
+            }
+        }
+
+        if (!valid) {
+            return false;
+        }
+        string += count;
+    }
+
+    // The bounds check above guarantees the loop ends exactly at `end`.
+    return true;
+}
+
+// Encode every code point from 0 through 0x10FFFF with
+// Str_encode_utf8_char() and check two things for each: the returned length
+// matches UTF8_COUNT for the first encoded byte, and Str_utf8_valid() agrees
+// with S_utf8_valid_alt() on the encoded bytes. Only agreement between the
+// two validators is checked, not that the result is "valid". Reports a
+// single PASS or FAIL for the whole sweep.
+static void
+test_all_code_points(TestBatchRunner *runner) {
+ int32_t code_point;
+ for (code_point = 0; code_point <= 0x10FFFF; code_point++) {
+ char buffer[4];
+ uint32_t size = Str_encode_utf8_char(code_point, buffer);
+ char *start = buffer;
+
+ // Verify length returned by encode_utf8_char().
+ if (size != UTF8_COUNT[(unsigned char)buffer[0]]) {
+ break;
+ }
+ // Verify that utf8_valid() agrees with alternate implementation.
+ if (!!Str_utf8_valid(start, size)
+ != !!S_utf8_valid_alt(start, size)
+ ) {
+ break;
+ }
+ }
+ // The loop only reaches 0x110000 if no iteration hit a `break`.
+ if (code_point == 0x110000) {
+ PASS(runner, "Successfully round tripped 0 - 0x10FFFF");
+ }
+ else {
+ FAIL(runner, "Failed round trip at 0x%.1X", (unsigned)code_point);
+ }
+}
+
+// Run both Str_utf8_valid() and S_utf8_valid_alt() on the first `size`
+// bytes of `content`. Records a failure if the two implementations
+// disagree; otherwise records a test that passes when their common result
+// equals `expected`. Either way exactly one test result is emitted, labelled
+// with `description`.
+static void
+S_test_validity(TestBatchRunner *runner, const char *content, size_t size,
+ bool expected, const char *description) {
+ bool sane = Str_utf8_valid(content, size);
+ bool double_check = S_utf8_valid_alt(content, size);
+ if (sane != double_check) {
+ FAIL(runner, "Disagreement: %s", description);
+ }
+ else {
+ TEST_TRUE(runner, sane == expected, "%s", description);
+ }
+}
+
+// Check UTF-8 validation against hand-picked valid and invalid byte
+// sequences: surrogates, non-shortest forms, out-of-range code points,
+// truncated sequences and bad continuation bytes. Each case goes through
+// S_test_validity() and emits one test result (30 in total). The size
+// passed with each literal equals the number of bytes in that literal.
+static void
+test_utf8_valid(TestBatchRunner *runner) {
+    // Musical symbol G clef:
+    // Code point: U+1D11E
+    // UTF-16:     0xD834 0xDD1E
+    // UTF-8       0xF0 0x9D 0x84 0x9E
+    S_test_validity(runner, "\xF0\x9D\x84\x9E", 4, true,
+                    "Musical symbol G clef");
+    S_test_validity(runner, "\xED\xA0\xB4\xED\xB4\x9E", 6, false,
+                    "G clef as UTF-8 encoded UTF-16 surrogates");
+    S_test_validity(runner, ".\xED\xA0\xB4.", 5, false,
+                    "Isolated high surrogate");
+    S_test_validity(runner, ".\xED\xB4\x9E.", 5, false,
+                    "Isolated low surrogate");
+
+    // Shortest form.
+    S_test_validity(runner, ".\xC1\x9C.", 4, false,
+                    "Non-shortest form ASCII backslash");
+    S_test_validity(runner, ".\xC0\xAF.", 4, false,
+                    "Non-shortest form ASCII slash");
+    S_test_validity(runner, ".\xC0\x80.", 4, false,
+                    "Non-shortest form ASCII NUL character");
+    S_test_validity(runner, ".\xE0\x9F\xBF.", 5, false,
+                    "Non-shortest form three byte sequence");
+    S_test_validity(runner, ".\xF0\x8F\xBF\xBF.", 6, false,
+                    "Non-shortest form four byte sequence");
+
+    // Range.
+    S_test_validity(runner, "\xF8\x88\x80\x80\x80", 5, false, "5-byte UTF-8");
+    S_test_validity(runner, "\xF4\x8F\xBF\xBF", 4, true,
+                    "Code point 0x10FFFF");
+    S_test_validity(runner, "\xF4\x90\x80\x80", 4, false,
+                    "Code point 0x110000 too large");
+    S_test_validity(runner, "\xF5\x80\x80\x80", 4, false,
+                    "Sequence starting with 0xF5");
+
+    // Truncated sequences.
+    S_test_validity(runner, "\xC2", 1, false,
+                    "Truncated two byte sequence");
+    S_test_validity(runner, "\xE2\x98", 2, false,
+                    "Truncated three byte sequence");
+    S_test_validity(runner, "\xF0\x9D\x84", 3, false,
+                    "Truncated four byte sequence");
+
+    // Bad continuations.
+    S_test_validity(runner, "\xE2\x98\xBA\xE2\x98\xBA", 6, true,
+                    "SmileySmiley");
+    S_test_validity(runner, "\xE2\xBA\xE2\x98\xBA", 5, false,
+                    "missing first continuation byte");
+    S_test_validity(runner, "\xE2\x98\xE2\x98\xBA", 5, false,
+                    "missing second continuation byte");
+    S_test_validity(runner, "\xE2\xE2\x98\xBA", 4, false,
+                    "missing both continuation bytes");
+    // A valid smiley followed by a header and only its last continuation
+    // byte, so the defect sits at the end of the input.
+    S_test_validity(runner, "\xE2\x98\xBA\xE2\xBA", 5, false,
+                    "missing first continuation byte (end)");
+    S_test_validity(runner, "\xE2\x98\xBA\xE2\x98", 5, false,
+                    "missing second continuation byte (end)");
+    S_test_validity(runner, "\xE2\x98\xBA\xE2", 4, false,
+                    "missing both continuation bytes (end)");
+    S_test_validity(runner, "\xBA\xE2\x98\xBA", 4, false,
+                    "isolated continuation byte 0xBA");
+    S_test_validity(runner, "\x98\xE2\x98\xBA", 4, false,
+                    "isolated continuation byte 0x98");
+    S_test_validity(runner, "\xE2\x98\xBA\xBA", 4, false,
+                    "isolated continuation byte 0xBA (end)");
+    S_test_validity(runner, "\xE2\x98\xBA\x98", 4, false,
+                    "isolated continuation byte 0x98 (end)");
+    S_test_validity(runner, "\xF0xxxx", 5, false,
+                    "missing continuation byte 2/4");
+    S_test_validity(runner, "\xF0\x9Dxxx", 5, false,
+                    "missing continuation byte 3/4");
+    S_test_validity(runner, "\xF0\x9D\x84xx", 5, false,
+                    "missing continuation byte 4/4");
+}
+
+// Callback for Err_trap(): `context` is a NUL-terminated C string which is
+// passed to Str_validate_utf8() with a fixed fake source location
+// ("src.c", line 17, function "fn").
+static void
+S_validate_utf8(void *context) {
+ const char *text = (const char*)context;
+ Str_validate_utf8(text, strlen(text), "src.c", 17, "fn");
+}
+
+// Verify that Str_validate_utf8() throws on invalid input and that the
+// error message quotes the preceding text (limited to the last 20 code
+// points) followed by a hex dump of the offending bytes. Emits three test
+// results.
+static void
+test_validate_utf8(TestBatchRunner *runner) {
+    {
+        Err *error = Err_trap(S_validate_utf8, "Sigma\xC1\x9C.");
+        TEST_TRUE(runner, error != NULL, "validate_utf8 throws");
+        const char *expected = "Invalid UTF-8 after 'Sigma': C1 9C 2E\n";
+        // Only inspect the message if an error was trapped, so that a
+        // missing error is reported as a test failure instead of crashing
+        // on a NULL object.
+        bool ok = error != NULL
+                  && Str_Starts_With_Utf8(Err_Get_Mess(error), expected,
+                                          strlen(expected));
+        TEST_TRUE(runner, ok, "validate_utf8 throws correct error message");
+        DECREF(error);
+    }
+
+    {
+        // 3 + 9 + 1 + 10 = 23 code points precede the invalid bytes, so the
+        // leading "xxx" must be dropped from the quoted prefix.
+        Err *error = Err_trap(S_validate_utf8,
+                              "xxx123456789\xE2\x93\xAA"
+                              "1234567890\xC1\x9C.");
+        const char *expected =
+            "Invalid UTF-8 after '123456789\xE2\x93\xAA"
+            "1234567890': C1 9C 2E\n";
+        bool ok = error != NULL
+                  && Str_Starts_With_Utf8(Err_Get_Mess(error), expected,
+                                          strlen(expected));
+        TEST_TRUE(runner, ok, "validate_utf8 truncates long prefix");
+        DECREF(error);
+    }
+}
+
+// Spot-check Str_is_whitespace() with four ASCII whitespace characters and
+// three non-whitespace code points. Emits seven test results.
+static void
+test_is_whitespace(TestBatchRunner *runner) {
+ TEST_TRUE(runner, Str_is_whitespace(' '), "space is whitespace");
+ TEST_TRUE(runner, Str_is_whitespace('\n'), "newline is whitespace");
+ TEST_TRUE(runner, Str_is_whitespace('\t'), "tab is whitespace");
+ TEST_TRUE(runner, Str_is_whitespace('\v'),
+ "vertical tab is whitespace");
+ TEST_FALSE(runner, Str_is_whitespace('a'), "'a' isn't whitespace");
+ TEST_FALSE(runner, Str_is_whitespace(0), "NULL isn't whitespace");
+ TEST_FALSE(runner, Str_is_whitespace(0x263A),
+ "Smiley isn't whitespace");
+}
+
+// Callback for Err_trap(): `context` points to an int32_t code point, which
+// is encoded into a scratch buffer. The encoded bytes are discarded; the
+// callback exists only so the caller can observe whether encoding throws.
+static void
+S_encode_utf8_char(void *context) {
+ int32_t *code_point_ptr = (int32_t*)context;
+ char buffer[4];
+ Str_encode_utf8_char(*code_point_ptr, buffer);
+}
+
+// Verify that encoding 0x110000, the first value past the largest legal
+// code point 0x10FFFF, throws an error. Emits one test result.
+static void
+test_encode_utf8_char(TestBatchRunner *runner) {
+ int32_t code_point = 0x110000;
+ Err *error = Err_trap(S_encode_utf8_char, &code_point);
+ TEST_TRUE(runner, error != NULL, "Encode code point 0x110000 throws");
+ DECREF(error);
+}
+
static void
test_new(TestBatchRunner *runner) {
static char chars[] = "A string " SMILEY " with a smile.";
@@ -813,7 +1100,12 @@ test_iterator_substring(TestBatchRunner *runner) {
void
TestStr_Run_IMP(TestString *self, TestBatchRunner *runner) {
- TestBatchRunner_Plan(runner, (TestBatch*)self, 158);
+ TestBatchRunner_Plan(runner, (TestBatch*)self, 200);
+ test_all_code_points(runner);
+ test_utf8_valid(runner);
+ test_validate_utf8(runner);
+ test_is_whitespace(runner);
+ test_encode_utf8_char(runner);
test_new(runner);
test_Cat(runner);
test_Clone(runner);
http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/test/Clownfish/Test/Util/TestStringHelper.c
----------------------------------------------------------------------
diff --git a/runtime/test/Clownfish/Test/Util/TestStringHelper.c b/runtime/test/Clownfish/Test/Util/TestStringHelper.c
index d009a58..2caee84 100644
--- a/runtime/test/Clownfish/Test/Util/TestStringHelper.c
+++ b/runtime/test/Clownfish/Test/Util/TestStringHelper.c
@@ -28,104 +28,11 @@
#include "Clownfish/Util/StringHelper.h"
#include "Clownfish/Class.h"
-/* This alternative implementation of utf8_valid() is (presumably) slower, but
- * it implements the standard in a more linear, easy-to-grok way.
- */
-#define TRAIL_OK(n) (n >= 0x80 && n <= 0xBF)
TestStringHelper*
TestStrHelp_new() {
return (TestStringHelper*)Class_Make_Obj(TESTSTRINGHELPER);
}
-static bool
-S_utf8_valid_alt(const char *maybe_utf8, size_t size) {
- const uint8_t *string = (const uint8_t*)maybe_utf8;
- const uint8_t *const end = string + size;
- while (string < end) {
- int count = StrHelp_UTF8_COUNT[*string];
- bool valid = false;
- if (count == 1) {
- if (string[0] <= 0x7F) {
- valid = true;
- }
- }
- else if (count == 2) {
- if (string[0] >= 0xC2 && string[0] <= 0xDF) {
- if (TRAIL_OK(string[1])) {
- valid = true;
- }
- }
- }
- else if (count == 3) {
- if (string[0] == 0xE0) {
- if (string[1] >= 0xA0 && string[1] <= 0xBF
- && TRAIL_OK(string[2])
- ) {
- valid = true;
- }
- }
- else if (string[0] >= 0xE1 && string[0] <= 0xEC) {
- if (TRAIL_OK(string[1])
- && TRAIL_OK(string[2])
- ) {
- valid = true;
- }
- }
- else if (string[0] == 0xED) {
- if (string[1] >= 0x80 && string[1] <= 0x9F
- && TRAIL_OK(string[2])
- ) {
- valid = true;
- }
- }
- else if (string[0] >= 0xEE && string[0] <= 0xEF) {
- if (TRAIL_OK(string[1])
- && TRAIL_OK(string[2])
- ) {
- valid = true;
- }
- }
- }
- else if (count == 4) {
- if (string[0] == 0xF0) {
- if (string[1] >= 0x90 && string[1] <= 0xBF
- && TRAIL_OK(string[2])
- && TRAIL_OK(string[3])
- ) {
- valid = true;
- }
- }
- else if (string[0] >= 0xF1 && string[0] <= 0xF3) {
- if (TRAIL_OK(string[1])
- && TRAIL_OK(string[2])
- && TRAIL_OK(string[3])
- ) {
- valid = true;
- }
- }
- else if (string[0] == 0xF4) {
- if (string[1] >= 0x80 && string[1] <= 0x8F
- && TRAIL_OK(string[2])
- && TRAIL_OK(string[3])
- ) {
- valid = true;
- }
- }
- }
-
- if (!valid) {
- return false;
- }
- string += count;
- }
-
- if (string != end) {
- return false;
- }
-
- return true;
-}
-
static void
test_overlap(TestBatchRunner *runner) {
size_t result;
@@ -157,210 +64,41 @@ test_to_base36(TestBatchRunner *runner) {
}
static void
-test_utf8_round_trip(TestBatchRunner *runner) {
+test_back_utf8_char(TestBatchRunner *runner) {
+ char buffer[4];
+ char *buf = buffer + 1;
+ uint32_t len = Str_encode_utf8_char(0x263A, buffer);
+ char *end = buffer + len;
+ TEST_TRUE(runner, StrHelp_back_utf8_char(end, buffer) == buffer,
+ "back_utf8_char");
+ TEST_TRUE(runner, StrHelp_back_utf8_char(end, buf) == NULL,
+ "back_utf8_char returns NULL rather than back up beyond start");
+ TEST_TRUE(runner, StrHelp_back_utf8_char(buffer, buffer) == NULL,
+ "back_utf8_char returns NULL when end == start");
+
int32_t code_point;
for (code_point = 0; code_point <= 0x10FFFF; code_point++) {
- char buffer[4];
- uint32_t size = StrHelp_encode_utf8_char(code_point, buffer);
+ uint32_t size = Str_encode_utf8_char(code_point, buffer);
char *start = buffer;
char *end = start + size;
- // Verify length returned by encode_utf8_char().
- if (size != StrHelp_UTF8_COUNT[(unsigned char)buffer[0]]) {
- break;
- }
- // Verify that utf8_valid() agrees with alternate implementation.
- if (!!StrHelp_utf8_valid(start, size)
- != !!S_utf8_valid_alt(start, size)
- ) {
- break;
- }
-
- // Verify back_utf8_char().
if (StrHelp_back_utf8_char(end, start) != start) {
break;
}
}
if (code_point == 0x110000) {
- PASS(runner, "Successfully round tripped 0 - 0x10FFFF");
+ PASS(runner, "back_utf8_char works for code points 0 - 0x10FFFF");
}
else {
- FAIL(runner, "Failed round trip at 0x%.1X", (unsigned)code_point);
- }
-}
-
-static void
-S_test_validity(TestBatchRunner *runner, const char *content, size_t size,
- bool expected, const char *description) {
- bool sane = StrHelp_utf8_valid(content, size);
- bool double_check = S_utf8_valid_alt(content, size);
- if (sane != double_check) {
- FAIL(runner, "Disagreement: %s", description);
- }
- else {
- TEST_TRUE(runner, sane == expected, "%s", description);
- }
-}
-
-static void
-test_utf8_valid(TestBatchRunner *runner) {
- // Musical symbol G clef:
- // Code point: U+1D11E
- // UTF-16: 0xD834 0xDD1E
- // UTF-8 0xF0 0x9D 0x84 0x9E
- S_test_validity(runner, "\xF0\x9D\x84\x9E", 4, true,
- "Musical symbol G clef");
- S_test_validity(runner, "\xED\xA0\xB4\xED\xB4\x9E", 6, false,
- "G clef as UTF-8 encoded UTF-16 surrogates");
- S_test_validity(runner, ".\xED\xA0\xB4.", 5, false,
- "Isolated high surrogate");
- S_test_validity(runner, ".\xED\xB4\x9E.", 5, false,
- "Isolated low surrogate");
-
- // Shortest form.
- S_test_validity(runner, ".\xC1\x9C.", 4, false,
- "Non-shortest form ASCII backslash");
- S_test_validity(runner, ".\xC0\xAF.", 4, false,
- "Non-shortest form ASCII slash");
- S_test_validity(runner, ".\xC0\x80.", 4, false,
- "Non-shortest form ASCII NUL character");
- S_test_validity(runner, ".\xE0\x9F\xBF.", 5, false,
- "Non-shortest form three byte sequence");
- S_test_validity(runner, ".\xF0\x8F\xBF\xBF.", 6, false,
- "Non-shortest form four byte sequence");
-
- // Range.
- S_test_validity(runner, "\xF8\x88\x80\x80\x80", 5, false, "5-byte UTF-8");
- S_test_validity(runner, "\xF4\x8F\xBF\xBF", 4, true,
- "Code point 0x10FFFF");
- S_test_validity(runner, "\xF4\x90\x80\x80", 4, false,
- "Code point 0x110000 too large");
- S_test_validity(runner, "\xF5\x80\x80\x80", 4, false,
- "Sequence starting with 0xF5");
-
- // Truncated sequences.
- S_test_validity(runner, "\xC2", 1, false,
- "Truncated two byte sequence");
- S_test_validity(runner, "\xE2\x98", 2, false,
- "Truncated three byte sequence");
- S_test_validity(runner, "\xF0\x9D\x84", 3, false,
- "Truncated four byte sequence");
-
- // Bad continuations.
- S_test_validity(runner, "\xE2\x98\xBA\xE2\x98\xBA", 6, true,
- "SmileySmiley");
- S_test_validity(runner, "\xE2\xBA\xE2\x98\xBA", 5, false,
- "missing first continuation byte");
- S_test_validity(runner, "\xE2\x98\xE2\x98\xBA", 5, false,
- "missing second continuation byte");
- S_test_validity(runner, "\xE2\xE2\x98\xBA", 4, false,
- "missing both continuation bytes");
- S_test_validity(runner, "\xBA\xE2\x98\xBA\xE2\xBA", 5, false,
- "missing first continuation byte (end)");
- S_test_validity(runner, "\xE2\x98\xBA\xE2\x98", 5, false,
- "missing second continuation byte (end)");
- S_test_validity(runner, "\xE2\x98\xBA\xE2", 4, false,
- "missing both continuation bytes (end)");
- S_test_validity(runner, "\xBA\xE2\x98\xBA", 4, false,
- "isolated continuation byte 0xBA");
- S_test_validity(runner, "\x98\xE2\x98\xBA", 4, false,
- "isolated continuation byte 0x98");
- S_test_validity(runner, "\xE2\x98\xBA\xBA", 4, false,
- "isolated continuation byte 0xBA (end)");
- S_test_validity(runner, "\xE2\x98\xBA\x98", 4, false,
- "isolated continuation byte 0x98 (end)");
- S_test_validity(runner, "\xF0xxxx", 5, false,
- "missing continuation byte 2/4");
- S_test_validity(runner, "\xF0\x9Dxxxx", 5, false,
- "missing continuation byte 3/4");
- S_test_validity(runner, "\xF0\x9D\x84xx", 5, false,
- "missing continuation byte 4/4");
-}
-
-static void
-S_validate_utf8(void *context) {
- const char *text = (const char*)context;
- StrHelp_validate_utf8(text, strlen(text), "src.c", 17, "fn");
-}
-
-static void
-test_validate_utf8(TestBatchRunner *runner) {
- {
- Err *error = Err_trap(S_validate_utf8, "Sigma\xC1\x9C.");
- TEST_TRUE(runner, error != NULL, "validate_utf8 throws");
- String *mess = Err_Get_Mess(error);
- const char *expected = "Invalid UTF-8 after 'Sigma': C1 9C 2E\n";
- bool ok = Str_Starts_With_Utf8(mess, expected, strlen(expected));
- TEST_TRUE(runner, ok, "validate_utf8 throws correct error message");
- DECREF(error);
+ FAIL(runner, "Failed back_utf8_char at 0x%.1X", (unsigned)code_point);
}
-
- {
- Err *error = Err_trap(S_validate_utf8,
- "xxx123456789\xE2\x93\xAA"
- "1234567890\xC1\x9C.");
- String *mess = Err_Get_Mess(error);
- const char *expected =
- "Invalid UTF-8 after '123456789\xE2\x93\xAA"
- "1234567890': C1 9C 2E\n";
- bool ok = Str_Starts_With_Utf8(mess, expected, strlen(expected));
- TEST_TRUE(runner, ok, "validate_utf8 truncates long prefix");
- DECREF(error);
- }
-}
-
-static void
-test_is_whitespace(TestBatchRunner *runner) {
- TEST_TRUE(runner, StrHelp_is_whitespace(' '), "space is whitespace");
- TEST_TRUE(runner, StrHelp_is_whitespace('\n'), "newline is whitespace");
- TEST_TRUE(runner, StrHelp_is_whitespace('\t'), "tab is whitespace");
- TEST_TRUE(runner, StrHelp_is_whitespace('\v'),
- "vertical tab is whitespace");
- TEST_FALSE(runner, StrHelp_is_whitespace('a'), "'a' isn't whitespace");
- TEST_FALSE(runner, StrHelp_is_whitespace(0), "NULL isn't whitespace");
- TEST_FALSE(runner, StrHelp_is_whitespace(0x263A),
- "Smiley isn't whitespace");
-}
-
-static void
-S_encode_utf8_char(void *context) {
- int32_t *code_point_ptr = (int32_t*)context;
- char buffer[4];
- StrHelp_encode_utf8_char(*code_point_ptr, buffer);
-}
-
-static void
-test_encode_utf8_char(TestBatchRunner *runner) {
- int32_t code_point = 0x110000;
- Err *error = Err_trap(S_encode_utf8_char, &code_point);
- TEST_TRUE(runner, error != NULL, "Encode code point 0x110000 throws");
- DECREF(error);
-}
-
-static void
-test_back_utf8_char(TestBatchRunner *runner) {
- char buffer[4];
- char *buf = buffer + 1;
- uint32_t len = StrHelp_encode_utf8_char(0x263A, buffer);
- char *end = buffer + len;
- TEST_TRUE(runner, StrHelp_back_utf8_char(end, buffer) == buffer,
- "back_utf8_char");
- TEST_TRUE(runner, StrHelp_back_utf8_char(end, buf) == NULL,
- "back_utf8_char returns NULL rather than back up beyond start");
- TEST_TRUE(runner, StrHelp_back_utf8_char(buffer, buffer) == NULL,
- "back_utf8_char returns NULL when end == start");
}
void
TestStrHelp_Run_IMP(TestStringHelper *self, TestBatchRunner *runner) {
- TestBatchRunner_Plan(runner, (TestBatch*)self, 55);
+ TestBatchRunner_Plan(runner, (TestBatch*)self, 14);
test_overlap(runner);
test_to_base36(runner);
- test_utf8_round_trip(runner);
- test_utf8_valid(runner);
- test_validate_utf8(runner);
- test_is_whitespace(runner);
- test_encode_utf8_char(runner);
test_back_utf8_char(runner);
}