You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucy.apache.org by ma...@apache.org on 2011/09/07 00:21:08 UTC
[lucy-commits] svn commit: r1165909 - in /incubator/lucy/trunk:
core/Lucy/Test/Util/TestStringHelper.c core/Lucy/Util/StringHelper.c
example-lang/src/Lucy/Util/StringHelper.c perl/MANIFEST
perl/xs/Lucy/Util/StringHelper.c
Author: marvin
Date: Tue Sep 6 22:21:08 2011
New Revision: 1165909
URL: http://svn.apache.org/viewvc?rev=1165909&view=rev
Log:
LUCY-179 - Tighten UTF-8 validity checks.
Provide a core implementation of StrHelp_utf8_valid() rather than relying on
the Perl C API function is_utf8_string(). This implementation tightens the
requirements and passes only truly valid UTF-8, forbidding UTF-8-encoded
UTF-16 surrogates and code points above 0x10FFFF.
Modified:
incubator/lucy/trunk/core/Lucy/Test/Util/TestStringHelper.c
incubator/lucy/trunk/core/Lucy/Util/StringHelper.c
incubator/lucy/trunk/example-lang/src/Lucy/Util/StringHelper.c
incubator/lucy/trunk/perl/MANIFEST
incubator/lucy/trunk/perl/xs/Lucy/Util/StringHelper.c
Modified: incubator/lucy/trunk/core/Lucy/Test/Util/TestStringHelper.c
URL: http://svn.apache.org/viewvc/incubator/lucy/trunk/core/Lucy/Test/Util/TestStringHelper.c?rev=1165909&r1=1165908&r2=1165909&view=diff
==============================================================================
--- incubator/lucy/trunk/core/Lucy/Test/Util/TestStringHelper.c (original)
+++ incubator/lucy/trunk/core/Lucy/Test/Util/TestStringHelper.c Tue Sep 6 22:21:08 2011
@@ -20,6 +20,100 @@
#include "Lucy/Test/Util/TestStringHelper.h"
#include "Lucy/Util/StringHelper.h"
+
+/* Reference implementation of utf8_valid(), used to cross-check
+ * StrHelp_utf8_valid().  It is (presumably) slower, but it spells out the
+ * permitted byte ranges one lead byte at a time in a linear, easy-to-grok
+ * way.
+ */
+#define TRAIL_OK(n) ((n) >= 0x80 && (n) <= 0xBF)
+
+/* Return true if the `size` bytes starting at `maybe_utf8` form well-formed
+ * UTF-8, false otherwise.
+ *
+ * Rejected: non-shortest forms (lead bytes 0xC0/0xC1, 0xE0 followed by
+ * 0x80-0x9F, 0xF0 followed by 0x80-0x8F), UTF-8 encoded UTF-16 surrogates
+ * (0xED followed by 0xA0-0xBF), anything above U+10FFFF (0xF4 followed by
+ * 0x90-0xBF, or a higher lead byte), bad continuation bytes, and sequences
+ * that are cut off by the end of the buffer.
+ */
+static bool_t
+S_utf8_valid_alt(const char *maybe_utf8, size_t size) {
+    const uint8_t *string    = (const uint8_t*)maybe_utf8;
+    const uint8_t *const end = string + size;
+    while (string < end) {
+        int count = StrHelp_UTF8_COUNT[*string];
+        bool_t valid = false;
+
+        // Never look at bytes beyond `end`: a sequence whose declared
+        // length runs off the buffer is truncated and therefore invalid.
+        if (count < 1 || (size_t)(end - string) < (size_t)count) {
+            return false;
+        }
+
+        if (count == 1) {
+            if (string[0] <= 0x7F) {
+                valid = true;
+            }
+        }
+        else if (count == 2) {
+            // 0xC0 and 0xC1 could only encode non-shortest-form ASCII.
+            if (string[0] >= 0xC2 && string[0] <= 0xDF) {
+                if (TRAIL_OK(string[1])) {
+                    valid = true;
+                }
+            }
+        }
+        else if (count == 3) {
+            if (string[0] == 0xE0) {
+                // Second byte below 0xA0 would be a non-shortest form.
+                if (string[1] >= 0xA0 && string[1] <= 0xBF
+                    && TRAIL_OK(string[2])
+                   ) {
+                    valid = true;
+                }
+            }
+            else if (string[0] >= 0xE1 && string[0] <= 0xEC) {
+                if (TRAIL_OK(string[1])
+                    && TRAIL_OK(string[2])
+                   ) {
+                    valid = true;
+                }
+            }
+            else if (string[0] == 0xED) {
+                // Second byte above 0x9F would encode a UTF-16 surrogate.
+                if (string[1] >= 0x80 && string[1] <= 0x9F
+                    && TRAIL_OK(string[2])
+                   ) {
+                    valid = true;
+                }
+            }
+            else if (string[0] >= 0xEE && string[0] <= 0xEF) {
+                if (TRAIL_OK(string[1])
+                    && TRAIL_OK(string[2])
+                   ) {
+                    valid = true;
+                }
+            }
+        }
+        else if (count == 4) {
+            if (string[0] == 0xF0) {
+                // Second byte below 0x90 would be a non-shortest form.
+                if (string[1] >= 0x90 && string[1] <= 0xBF
+                    && TRAIL_OK(string[2])
+                    && TRAIL_OK(string[3])
+                   ) {
+                    valid = true;
+                }
+            }
+            else if (string[0] >= 0xF1 && string[0] <= 0xF3) {
+                if (TRAIL_OK(string[1])
+                    && TRAIL_OK(string[2])
+                    && TRAIL_OK(string[3])
+                   ) {
+                    valid = true;
+                }
+            }
+            else if (string[0] == 0xF4) {
+                // Second byte above 0x8F would exceed U+10FFFF.
+                if (string[1] >= 0x80 && string[1] <= 0x8F
+                    && TRAIL_OK(string[2])
+                    && TRAIL_OK(string[3])
+                   ) {
+                    valid = true;
+                }
+            }
+        }
+
+        if (!valid) {
+            return false;
+        }
+        string += count;
+    }
+
+    // The bounds check above guarantees the loop stops exactly at `end`.
+    return true;
+}
+
+
static void
test_overlap(TestBatch *batch) {
int32_t result;
@@ -49,36 +143,106 @@ test_to_base36(TestBatch *batch) {
}
static void
-S_round_trip_utf8_code_point(TestBatch *batch, uint32_t code_point) {
- char buffer[4];
- uint32_t len = StrHelp_encode_utf8_char(code_point, buffer);
- char *start = buffer;
- char *end = start + len;
- TEST_TRUE(batch, StrHelp_utf8_valid(buffer, len), "Valid UTF-8 for %lu",
- (unsigned long)code_point);
- TEST_INT_EQ(batch, len, StrHelp_UTF8_COUNT[(unsigned char)buffer[0]],
- "length returned for %lu", (unsigned long)code_point);
- TEST_TRUE(batch, StrHelp_back_utf8_char(end, start) == start,
- "back_utf8_char for %lu", (unsigned long)code_point);
- TEST_INT_EQ(batch, StrHelp_decode_utf8_char(buffer), code_point,
- "round trip encode and decode for %lu", (unsigned long)code_point);
+test_utf8_round_trip(TestBatch *batch) {
+    /* Exhaustively walk every value from 0 through 0x10FFFF, encode it
+     * with StrHelp_encode_utf8_char(), and cross-check the helpers against
+     * one another.  The loop stops at the first mismatch, so a single
+     * PASS or FAIL is reported for the whole range.
+     */
+    uint32_t code_point;
+    for (code_point = 0; code_point <= 0x10FFFF; code_point++) {
+        char buffer[4];
+        uint32_t size = StrHelp_encode_utf8_char(code_point, buffer);
+        char *start = buffer;
+        char *end   = start + size;
+
+        // Verify length returned by encode_utf8_char().
+        if (size != StrHelp_UTF8_COUNT[(unsigned char)buffer[0]]) {
+            break;
+        }
+
+        // Verify that utf8_valid() agrees with alternate implementation.
+        // Both results are normalized with `!!` before comparing.
+        if (!!StrHelp_utf8_valid(start, size)
+            != !!S_utf8_valid_alt(start, size)
+           ) {
+            break;
+        }
+
+        // Verify back_utf8_char().
+        if (StrHelp_back_utf8_char(end, start) != start) {
+            break;
+        }
+
+        // Verify round trip of encode/decode.
+        if (StrHelp_decode_utf8_char(buffer) != code_point) {
+            break;
+        }
+    }
+
+    // Reaching 0x110000 means no iteration bailed out early.
+    if (code_point == 0x110000) {
+        PASS(batch, "Successfully round tripped 0 - 0x10FFFF");
+    }
+    else {
+        FAIL(batch, "Failed round trip at 0x%.1X", (unsigned)code_point);
+    }
}
static void
-test_utf8_round_trip(TestBatch *batch) {
- uint32_t code_points[] = {
- 0,
- 0xA, // newline
- 'a',
- 128, // two-byte
- 0x263A, // smiley (three-byte)
- 0x10FFFF, // Max legal code point (four-byte).
- };
- uint32_t num_code_points = sizeof(code_points) / sizeof(uint32_t);
- uint32_t i;
- for (i = 0; i < num_code_points; i++) {
- S_round_trip_utf8_code_point(batch, code_points[i]);
+S_test_validity(TestBatch *batch, const char *content, size_t size,
+                bool_t expected, const char *description) {
+    /* Feed the same `size` bytes of `content` to the production validator
+     * and to the reference validator.  If the two disagree the test fails
+     * outright; otherwise their shared verdict is compared with `expected`.
+     * `description` is used as the test label in both cases.
+     */
+    const bool_t primary   = StrHelp_utf8_valid(content, size);
+    const bool_t reference = S_utf8_valid_alt(content, size);
+
+    if (primary == reference) {
+        TEST_TRUE(batch, primary == expected, "%s", description);
+    }
+    else {
+        FAIL(batch, "Disagreement: %s", description);
+    }
+}
+
+/* Exercise UTF-8 validation with hand-picked byte sequences: a valid
+ * supplementary-plane character, encoded surrogates, non-shortest forms,
+ * an over-long (5-byte) sequence, and missing or stray continuation bytes
+ * in the middle and at the end of the input.  Each case goes through
+ * S_test_validity().
+ */
+static void
+test_utf8_valid(TestBatch *batch) {
+    // Musical symbol G clef:
+    // Code point: U+1D11E
+    // UTF-16:     0xD834 0xDD1E
+    // UTF-8       0xF0 0x9D 0x84 0x9E
+    S_test_validity(batch, "\xF0\x9D\x84\x9E", 4, true,
+                    "Musical symbol G clef");
+    S_test_validity(batch, "\xED\xA0\xB4\xED\xB4\x9E", 6, false,
+                    "G clef as UTF-8 encoded UTF-16 surrogates");
+    S_test_validity(batch, ".\xED\xA0\xB4.", 5, false,
+                    "Isolated high surrogate");
+    S_test_validity(batch, ".\xED\xB4\x9E.", 5, false,
+                    "Isolated low surrogate");
+
+    // Shortest form.
+    S_test_validity(batch, ".\xC1\x9C.", 4, false,
+                    "Non-shortest form ASCII backslash");
+    S_test_validity(batch, ".\xC0\xAF.", 4, false,
+                    "Non-shortest form ASCII slash");
+    S_test_validity(batch, ".\xC0\x80.", 4, false,
+                    "Non-shortest form ASCII NUL character");
+
+    // Range.
+    S_test_validity(batch, "\xF8\x88\x80\x80\x80", 5, false, "5-byte UTF-8");
+
+    // Bad continuations.
+    S_test_validity(batch, "\xE2\x98\xBA\xE2\x98\xBA", 6, true,
+                    "SmileySmiley");
+    S_test_validity(batch, "\xE2\xBA\xE2\x98\xBA", 5, false,
+                    "missing first continuation byte");
+    S_test_validity(batch, "\xE2\x98\xE2\x98\xBA", 5, false,
+                    "missing second continuation byte");
+    S_test_validity(batch, "\xE2\xE2\x98\xBA", 4, false,
+                    "missing both continuation bytes");
+    // A complete smiley followed by a 3-byte lead with only one
+    // continuation byte left before the end of the input.
+    S_test_validity(batch, "\xE2\x98\xBA\xE2\xBA", 5, false,
+                    "missing first continuation byte (end)");
+    S_test_validity(batch, "\xE2\x98\xBA\xE2\x98", 5, false,
+                    "missing second continuation byte (end)");
+    S_test_validity(batch, "\xE2\x98\xBA\xE2", 4, false,
+                    "missing both continuation bytes (end)");
+    S_test_validity(batch, "\xBA\xE2\x98\xBA", 4, false,
+                    "isolated continuation byte 0xBA");
+    S_test_validity(batch, "\x98\xE2\x98\xBA", 4, false,
+                    "isolated continuation byte 0x98");
+    S_test_validity(batch, "\xE2\x98\xBA\xBA", 4, false,
+                    "isolated continuation byte 0xBA (end)");
+    S_test_validity(batch, "\xE2\x98\xBA\x98", 4, false,
+                    "isolated continuation byte 0x98 (end)");
}
static void
@@ -112,13 +276,14 @@ test_back_utf8_char(TestBatch *batch) {
void
TestStrHelp_run_tests() {
- TestBatch *batch = TestBatch_new(44);
+ TestBatch *batch = TestBatch_new(40);
TestBatch_Plan(batch);
test_overlap(batch);
test_to_base36(batch);
test_utf8_round_trip(batch);
+ test_utf8_valid(batch);
test_is_whitespace(batch);
test_back_utf8_char(batch);
Modified: incubator/lucy/trunk/core/Lucy/Util/StringHelper.c
URL: http://svn.apache.org/viewvc/incubator/lucy/trunk/core/Lucy/Util/StringHelper.c?rev=1165909&r1=1165908&r2=1165909&view=diff
==============================================================================
--- incubator/lucy/trunk/core/Lucy/Util/StringHelper.c (original)
+++ incubator/lucy/trunk/core/Lucy/Util/StringHelper.c Tue Sep 6 22:21:08 2011
@@ -78,6 +78,57 @@ StrHelp_to_base36(uint64_t num, void *bu
}
}
+/* Return true if the `size` bytes starting at `ptr` are well-formed UTF-8,
+ * false otherwise.
+ *
+ * The expected sequence length is looked up from the lead byte in
+ * StrHelp_UTF8_COUNT.  Beyond checking that every trailing byte has the
+ * form 10xxxxxx, the function rejects:
+ *   - sequences cut off by the end of the buffer,
+ *   - non-shortest forms (0xC0/0xC1 leads, 0xE0 + 0x80-0x9F,
+ *     0xF0 + 0x80-0x8F),
+ *   - UTF-8 encoded UTF-16 surrogates (0xED + 0xA0-0xBF),
+ *   - code points above U+10FFFF (0xF4 + 0x90-0xBF, or any higher lead).
+ */
+bool_t
+StrHelp_utf8_valid(const char *ptr, size_t size) {
+    const uint8_t *string    = (const uint8_t*)ptr;
+    const uint8_t *const end = string + size;
+    while (string < end) {
+        const uint8_t header_byte = *string++;
+        const int count = StrHelp_UTF8_COUNT[header_byte] & 0x7;
+        switch (count) {
+            case 1:
+                // ASCII
+                break;
+            case 2:
+                if (string == end)              { return false; }
+                // Disallow non-shortest-form ASCII (lead 0xC0 or 0xC1).
+                if (!(header_byte & 0x1E))      { return false; }
+                if ((*string++ & 0xC0) != 0x80) { return false; }
+                break;
+            case 3:
+                if (end - string < 2)           { return false; }
+                if (header_byte == 0xED) {
+                    // 0xED 0xA0..0xBF would encode a UTF-16 surrogate.
+                    if (*string < 0x80 || *string > 0x9F) {
+                        return false;
+                    }
+                }
+                else if (!(header_byte & 0x0F)) {
+                    // Lead 0xE0: the second byte must have bit 0x20 set
+                    // (0xA0..0xBF), otherwise it is a non-shortest form.
+                    if (!(*string & 0x20)) {
+                        return false;
+                    }
+                }
+                if ((*string++ & 0xC0) != 0x80) { return false; }
+                if ((*string++ & 0xC0) != 0x80) { return false; }
+                break;
+            case 4:
+                if (end - string < 3)           { return false; }
+                // Leads above 0xF4 can only encode values past U+10FFFF.
+                if (header_byte > 0xF4)         { return false; }
+                if (header_byte == 0xF4) {
+                    // 0xF4 0x90..0xBF would exceed U+10FFFF.
+                    if (*string > 0x8F) {
+                        return false;
+                    }
+                }
+                else if (!(header_byte & 0x07)) {
+                    // Lead 0xF0: the second byte must be 0x90 or above,
+                    // otherwise it is a non-shortest form.
+                    if (!(*string & 0x30)) {
+                        return false;
+                    }
+                }
+                if ((*string++ & 0xC0) != 0x80) { return false; }
+                if ((*string++ & 0xC0) != 0x80) { return false; }
+                if ((*string++ & 0xC0) != 0x80) { return false; }
+                break;
+            default:
+                // Any other table value is not a legal lead byte.
+                return false;
+        }
+    }
+
+    return true;
+}
+
+
uint32_t
StrHelp_encode_utf8_char(uint32_t code_point, void *buffer) {
uint8_t *buf = (uint8_t*)buffer;
Modified: incubator/lucy/trunk/perl/MANIFEST
URL: http://svn.apache.org/viewvc/incubator/lucy/trunk/perl/MANIFEST?rev=1165909&r1=1165908&r2=1165909&view=diff
==============================================================================
--- incubator/lucy/trunk/perl/MANIFEST (original)
+++ incubator/lucy/trunk/perl/MANIFEST Tue Sep 6 22:21:08 2011
@@ -404,6 +404,5 @@ xs/Lucy/Object/LockFreeRegistry.c
xs/Lucy/Object/Obj.c
xs/Lucy/Object/VTable.c
xs/Lucy/Store/FSFolder.c
-xs/Lucy/Util/StringHelper.c
xs/XSBind.c
xs/XSBind.h