You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucy.apache.org by ma...@apache.org on 2011/08/12 05:31:04 UTC
[lucy-commits] svn commit: r1156951 - in /incubator/lucy/trunk: ./ clownfish/src/ core/Lucy/Test/Util/ core/Lucy/Util/ core/Lucy/Util/Json/ perl/ perl/buildlib/Lucy/ perl/lib/ perl/xs/Lucy/Util/

Author: marvin
Date: Fri Aug 12 03:31:03 2011
New Revision: 1156951

URL: http://svn.apache.org/viewvc?rev=1156951&view=rev
Log:
LUCY-133 Replace JSON::XS with Lemon-powered parser.

Eliminate Lucy's runtime dependency on the CPAN module JSON::XS with a new
parser powered by the Lemon parser generator which operates directly on
Clownfish data structures.

Added:
    incubator/lucy/trunk/core/Lucy/Util/Json/
    incubator/lucy/trunk/core/Lucy/Util/Json.c
      - copied, changed from r1154651, incubator/lucy/trunk/perl/xs/Lucy/Util/Json.c
    incubator/lucy/trunk/core/Lucy/Util/Json/JsonParser.y
Removed:
    incubator/lucy/trunk/perl/xs/Lucy/Util/Json.c
Modified:
    incubator/lucy/trunk/STATUS
    incubator/lucy/trunk/clownfish/src/CFCBindAliases.c
    incubator/lucy/trunk/core/Lucy/Test/Util/TestJson.c
    incubator/lucy/trunk/core/Lucy/Util/Json.cfh
    incubator/lucy/trunk/perl/Build.PL
    incubator/lucy/trunk/perl/MANIFEST
    incubator/lucy/trunk/perl/buildlib/Lucy/Build.pm
    incubator/lucy/trunk/perl/lib/Lucy.pm

Modified: incubator/lucy/trunk/STATUS
URL: http://svn.apache.org/viewvc/incubator/lucy/trunk/STATUS?rev=1156951&r1=1156950&r2=1156951&view=diff
==============================================================================
--- incubator/lucy/trunk/STATUS (original)
+++ incubator/lucy/trunk/STATUS Fri Aug 12 03:31:03 2011
@@ -29,9 +29,6 @@ TODO LIST:
     <https://issues.apache.org/jira/browse/LUCY-143>
     <https://issues.apache.org/jira/browse/LUCY-134>
 
-  * Replace dependency on JSON::XS with ???
-    <https://issues.apache.org/jira/browse/LUCY-133>
-
   * Refactor away C89 idioms, since we have chosen the intersection of C99 
     and C++ as our C dialect.
     <https://issues.apache.org/jira/browse/LUCY-144>

Modified: incubator/lucy/trunk/clownfish/src/CFCBindAliases.c
URL: http://svn.apache.org/viewvc/incubator/lucy/trunk/clownfish/src/CFCBindAliases.c?rev=1156951&r1=1156950&r2=1156951&view=diff
==============================================================================
--- incubator/lucy/trunk/clownfish/src/CFCBindAliases.c (original)
+++ incubator/lucy/trunk/clownfish/src/CFCBindAliases.c Fri Aug 12 03:31:03 2011
@@ -91,6 +91,7 @@ struct alias aliases[] = {
     {"Cfish_VA_Get_Size", "Lucy_VA_Get_Size"},
     {"Cfish_VA_Resize", "Lucy_VA_Resize"},
     {"Cfish_VA_Store", "Lucy_VA_Store"},
+    {"Cfish_VA_Push", "Lucy_VA_Push"},
 
     {"cfish_VTable", "lucy_VTable"},
     {"CFISH_VTABLE", "LUCY_VTABLE"},

Modified: incubator/lucy/trunk/core/Lucy/Test/Util/TestJson.c
URL: http://svn.apache.org/viewvc/incubator/lucy/trunk/core/Lucy/Test/Util/TestJson.c?rev=1156951&r1=1156950&r2=1156951&view=diff
==============================================================================
--- incubator/lucy/trunk/core/Lucy/Test/Util/TestJson.c (original)
+++ incubator/lucy/trunk/core/Lucy/Test/Util/TestJson.c Fri Aug 12 03:31:03 2011
@@ -269,9 +269,85 @@ test_syntax_errors(TestBatch *batch) {
     S_verify_bad_syntax(batch, "\"\\uAAAZ\"", "invalid \\u escape");
 }
 
+static void
+S_round_trip_integer(TestBatch *batch, int64_t value) {
+    Integer64 *num = Int64_new(value);
+    VArray *array = VA_new(1);
+    VA_Store(array, 0, (Obj*)num);
+    CharBuf *json = Json_to_json((Obj*)array);
+    Obj *dump = Json_from_json(json);
+    TEST_TRUE(batch, VA_Equals(array, dump), "Round trip integer %ld",
+              (long)value);
+    DECREF(dump);
+    DECREF(json);
+    DECREF(array);
+}
+
+static void
+test_integers(TestBatch *batch) {
+    S_round_trip_integer(batch, 0);
+    S_round_trip_integer(batch, -1);
+    S_round_trip_integer(batch, -1000000);
+    S_round_trip_integer(batch, 1000000);
+}
+
+static void
+S_round_trip_float(TestBatch *batch, double value, double max_diff) {
+    Float64 *num = Float64_new(value);
+    VArray *array = VA_new(1);
+    VA_Store(array, 0, (Obj*)num);
+    CharBuf *json = Json_to_json((Obj*)array);
+    Obj *dump = CERTIFY(Json_from_json(json), VARRAY);
+    Float64 *got = (Float64*)CERTIFY(VA_Fetch((VArray*)dump, 0), FLOAT64);
+    double diff = Float64_Get_Value(num) - Float64_Get_Value(got);
+    if (diff < 0) { diff = 0 - diff; }
+    TEST_TRUE(batch, diff <= max_diff, "Round trip float %f", value);
+    DECREF(dump);
+    DECREF(json);
+    DECREF(array);
+}
+
+static void
+test_floats(TestBatch *batch) {
+    S_round_trip_float(batch, 0.0, 0.0);
+    S_round_trip_float(batch, 0.1, 0.00001);
+    S_round_trip_float(batch, -0.1, 0.00001);
+    S_round_trip_float(batch, 1000000.5, 1.0);
+    S_round_trip_float(batch, -1000000.5, 1.0);
+}
+
+static void
+test_max_depth(TestBatch *batch) {
+    Hash *circular = Hash_new(0);
+    Hash_Store_Str(circular, "circular", 8, INCREF(circular));
+    Err_set_error(NULL);
+    CharBuf *not_json = Json_to_json((Obj*)circular);
+    TEST_TRUE(batch, not_json == NULL,
+              "to_json returns NULL when fed recursing data");
+    TEST_TRUE(batch, Err_get_error() != NULL,
+              "to_json sets Err_error when fed recursing data");
+    DECREF(Hash_Delete_Str(circular, "circular", 8));
+    DECREF(circular);
+}
+
+static void
+test_illegal_keys(TestBatch *batch) {
+    Hash *hash = Hash_new(0);
+    Float64 *key = Float64_new(1.1);
+    Hash_Store(hash, (Obj*)key, (Obj*)CB_newf("blah"));
+    Err_set_error(NULL);
+    CharBuf *not_json = Json_to_json((Obj*)hash);
+    TEST_TRUE(batch, not_json == NULL,
+              "to_json returns NULL when fed an illegal key");
+    TEST_TRUE(batch, Err_get_error() != NULL,
+              "to_json sets Err_error when fed an illegal key");
+    DECREF(key);
+    DECREF(hash);
+}
+
 void
 TestJson_run_tests() {
-    int num_tests = 94;
+    int num_tests = 107;
 #ifndef LUCY_VALGRIND
     num_tests += 28; // FIXME: syntax errors leak memory.
 #endif
@@ -286,6 +362,10 @@ TestJson_run_tests() {
     test_escapes(batch);
     test_numbers(batch);
     test_spew_and_slurp(batch);
+    test_integers(batch);
+    test_floats(batch);
+    test_max_depth(batch);
+    test_illegal_keys(batch);
 
 #ifndef LUCY_VALGRIND
     test_syntax_errors(batch);

Copied: incubator/lucy/trunk/core/Lucy/Util/Json.c (from r1154651, incubator/lucy/trunk/perl/xs/Lucy/Util/Json.c)
URL: http://svn.apache.org/viewvc/incubator/lucy/trunk/core/Lucy/Util/Json.c?p2=incubator/lucy/trunk/core/Lucy/Util/Json.c&p1=incubator/lucy/trunk/perl/xs/Lucy/Util/Json.c&r1=1154651&r2=1156951&rev=1156951&view=diff
==============================================================================
--- incubator/lucy/trunk/perl/xs/Lucy/Util/Json.c (original)
+++ incubator/lucy/trunk/core/Lucy/Util/Json.c Fri Aug 12 03:31:03 2011
@@ -14,46 +14,677 @@
  * limitations under the License.
  */
 
+#include <ctype.h>
+#include <stdio.h>
+
 #include "Lucy/Util/ToolSet.h"
 
 #include "Lucy/Util/Json.h"
 #include "Lucy/Object/Host.h"
 #include "Lucy/Store/Folder.h"
+#include "Lucy/Store/InStream.h"
+#include "Lucy/Store/OutStream.h"
+#include "Lucy/Util/Memory.h"
+#include "Lucy/Util/Json/JsonParser.h"
 
-bool_t
-Json_spew_json(Obj *dump, Folder *folder, const CharBuf *path) {
-    bool_t result = (bool_t)Host_callback_i64(JSON, "spew_json", 3,
-                                              ARG_OBJ("dump", dump),
-                                              ARG_OBJ("folder", folder),
-                                              ARG_STR("path", path));
-    if (!result) { ERR_ADD_FRAME(Err_get_error()); }
-    return result;
+/* Routines generated by Lemon. */
+void*
+LucyParseJsonAlloc(void * (*allocate)(size_t));
+void
+LucyParseJson(void *json_parser, int token_type, lucy_Obj *value,
+              lucy_JsonParserState *state);
+void
+LucyParseJsonFree(void *json_parser, void(*freemem)(void*));
+void
+LucyParseJsonTrace(FILE *trace, char *line_prefix);
+
+// Encode JSON for supplied "dump".  On failure, sets Err_error and returns
+// false.
+static bool_t
+S_to_json(Obj *dump, CharBuf *json, int32_t depth);
+
+// Parse JSON from raw UTF-8 in memory.
+static Obj*
+S_parse_json(char *text, size_t size);
+static Obj*
+S_do_parse_json(void *json_parser, char *json, size_t len);
+
+// Parse a JSON number.  Advance the text buffer just past the number.
+static Float64*
+S_parse_number(char **json_ptr, char *const limit);
+
+// Parse a JSON string.  Advance the text buffer from pointing at the opening
+// double quote to pointing just after the closing double quote.
+static CharBuf*
+S_parse_string(char **json_ptr, char *const limit);
+
+// Unescape JSON string text.  Expects pointers bookending the text data (i.e.
+// pointing just after the opening double quote and directly at the closing
+// double quote), and assumes that escapes have already been sanity checked
+// for length.
+static CharBuf*
+S_unescape_text(char *const top, char *const end);
+
+// Check that the supplied text begins with the specified keyword, which must
+// then end on a word boundary (i.e. match "null" but not the first four
+// letters of "nullify").
+static INLINE bool_t
+SI_check_keyword(char *json, char* end, const char *keyword, size_t len);
+
+// Make it possible to loosen constraints during testing.
+static bool_t tolerant = false;
+
+// Indentation: two spaces per level.
+static const char indentation[]     = "  ";
+static const size_t INDENTATION_LEN = sizeof(indentation) - 1;
+
+// Append indentation spaces x depth.
+static void
+S_cat_whitespace(CharBuf *json, int32_t depth);
+
+// Set Err_error, appending escaped JSON in the vicinity of the error.
+static void
+S_set_error(CharBuf *mess, char *json, char *limit, int line,
+            const char *func);
+#define SET_ERROR(_mess, _json, _end) \
+    S_set_error(_mess, _json, _end, __LINE__, CFISH_ERR_FUNC_MACRO)
+
+Obj*
+Json_from_json(CharBuf *json) {
+    Obj *dump = S_parse_json((char*)CB_Get_Ptr8(json), CB_Get_Size(json));
+    if (!dump) {
+        ERR_ADD_FRAME(Err_get_error());
+    }
+    return dump;
 }
 
 Obj*
 Json_slurp_json(Folder *folder, const CharBuf *path) {
-    Obj *dump = Host_callback_obj(JSON, "slurp_json", 2,
-                                  ARG_OBJ("folder", folder),
-                                  ARG_STR("path", path));
-    if (!dump) { ERR_ADD_FRAME(Err_get_error()); }
+    InStream *instream = Folder_Open_In(folder, path);
+    if (!instream) {
+        ERR_ADD_FRAME(Err_get_error());
+        return NULL;
+    }
+    size_t len = (size_t)InStream_Length(instream);
+    char *buf = InStream_Buf(instream, len);
+    Obj *dump = S_parse_json(buf, len);
+    InStream_Close(instream);
+    DECREF(instream);
+    if (!dump) {
+        ERR_ADD_FRAME(Err_get_error());
+    }
     return dump;
 }
 
+bool_t
+Json_spew_json(Obj *dump, Folder *folder, const CharBuf *path) {
+    CharBuf *json = Json_to_json(dump);
+    if (!json) {
+        ERR_ADD_FRAME(Err_get_error());
+        return false;
+    }
+    OutStream *outstream = Folder_Open_Out(folder, path);
+    if (!outstream) {
+        ERR_ADD_FRAME(Err_get_error());
+        DECREF(json);
+        return false;
+    }
+    size_t size = CB_Get_Size(json);
+    OutStream_Write_Bytes(outstream, CB_Get_Ptr8(json), size);
+    OutStream_Close(outstream);
+    DECREF(outstream);
+    DECREF(json);
+    return true;
+}
+
 CharBuf*
 Json_to_json(Obj *dump) {
-    return Host_callback_str(JSON, "to_json", 1,
-                             ARG_OBJ("dump", dump));
-}
+    // Validate object type, only allowing hashes and arrays per JSON spec.
+    if (!dump || !(Obj_Is_A(dump, HASH) || Obj_Is_A(dump, VARRAY))) {
+        if (!tolerant) {
+            CharBuf *class_name = dump ? Obj_Get_Class_Name(dump) : NULL;
+            CharBuf *mess = MAKE_MESS("Illegal top-level object type: %o",
+                                      class_name);
+            Err_set_error(Err_new(mess));
+            return NULL;
+        }
+    }
 
-Obj*
-Json_from_json(CharBuf *json) {
-    return Host_callback_obj(JSON, "from_json", 1,
-                             ARG_STR("json", json));
+    // Encode.
+    CharBuf *json = CB_new(31);
+    if (!S_to_json(dump, json, 0)) {
+        DECREF(json);
+        ERR_ADD_FRAME(Err_get_error());
+        json = NULL;
+    }
+    else {
+        // Append newline.
+        CB_Cat_Trusted_Str(json, "\n", 1);
+    }
+
+    return json;
 }
 
 void
-Json_set_tolerant(bool_t tolerant) {
-    Host_callback(JSON, "set_tolerant", 1,
-                  ARG_I32("tolerant", tolerant));
+Json_set_tolerant(bool_t tolerance) {
+    tolerant = tolerance;
+}
+
+static const int32_t MAX_DEPTH = 200;
+
+static void
+S_append_json_string(Obj *dump, CharBuf *json) {
+    // Append opening quote.
+    CB_Cat_Trusted_Str(json, "\"", 1);
+
+    // Process string data.
+    ZombieCharBuf *iterator = ZCB_WRAP((CharBuf*)dump);
+    while (ZCB_Get_Size(iterator)) {
+        uint32_t code_point = ZCB_Nip_One(iterator);
+        if (code_point > 127) {
+            // There is no need to escape any high characters, including those
+            // above the BMP, as we assume that the destination channel can
+            // handle arbitrary UTF-8 data.
+            CB_Cat_Char(json, code_point);
+        }
+        else {
+            char buffer[7];
+            size_t len;
+            switch (code_point & 127) {
+                    // Perform all mandatory escapes enumerated in the JSON spec.
+                    // Note that the spec makes escaping forward slash optional;
+                    // we choose not to.
+                case 0x00: case 0x01: case 0x02: case 0x03:
+                case 0x04: case 0x05: case 0x06: case 0x07:
+                case 0x0b: case 0x0e: case 0x0f:
+                case 0x10: case 0x11: case 0x12: case 0x13:
+                case 0x14: case 0x15: case 0x16: case 0x17:
+                case 0x18: case 0x19: case 0x1a: case 0x1b:
+                case 0x1c: case 0x1d: case 0x1e: case 0x1f: {
+                        sprintf(buffer, "\\u%04x", (unsigned)code_point);
+                        len = 6;
+                        break;
+                    }
+                case '\b':
+                    memcpy(buffer, "\\b", 2);
+                    len = 2;
+                    break;
+                case '\t':
+                    memcpy(buffer, "\\t", 2);
+                    len = 2;
+                    break;
+                case '\n':
+                    memcpy(buffer, "\\n", 2);
+                    len = 2;
+                    break;
+                case '\f':
+                    memcpy(buffer, "\\f", 2);
+                    len = 2;
+                    break;
+                case '\r':
+                    memcpy(buffer, "\\r", 2);
+                    len = 2;
+                    break;
+                case '\\':
+                    memcpy(buffer, "\\\\", 2);
+                    len = 2;
+                    break;
+                case '\"':
+                    memcpy(buffer, "\\\"", 2);
+                    len = 2;
+                    break;
+
+                    // Ordinary printable ASCII.
+                default:
+                    buffer[0] = (char)code_point;
+                    len = 1;
+            }
+            CB_Cat_Trusted_Str(json, buffer, len);
+        }
+    }
+
+    // Append closing quote.
+    CB_Cat_Trusted_Str(json, "\"", 1);
+}
+
+static void
+S_cat_whitespace(CharBuf *json, int32_t depth) {
+    while (depth--) {
+        CB_Cat_Trusted_Str(json, indentation, INDENTATION_LEN);
+    }
+}
+
+static bool_t
+S_to_json(Obj *dump, CharBuf *json, int32_t depth) {
+    // Guard against infinite recursion in self-referencing data structures.
+    if (depth > MAX_DEPTH) {
+        CharBuf *mess = MAKE_MESS("Exceeded max depth of %i32", MAX_DEPTH);
+        Err_set_error(Err_new(mess));
+        return false;
+    }
+
+    if (!dump) {
+        CB_Cat_Trusted_Str(json, "null", 4);
+    }
+    else if (dump == (Obj*)CFISH_TRUE) {
+        CB_Cat_Trusted_Str(json, "true", 4);
+    }
+    else if (dump == (Obj*)CFISH_FALSE) {
+        CB_Cat_Trusted_Str(json, "false", 5);
+    }
+    else if (Obj_Is_A(dump, CHARBUF)) {
+        S_append_json_string(dump, json);
+    }
+    else if (Obj_Is_A(dump, INTNUM)) {
+        CB_catf(json, "%i64", Obj_To_I64(dump));
+    }
+    else if (Obj_Is_A(dump, FLOATNUM)) {
+        CB_catf(json, "%f64", Obj_To_F64(dump));
+    }
+    else if (Obj_Is_A(dump, VARRAY)) {
+        VArray *array = (VArray*)dump;
+        size_t size = VA_Get_Size(array);
+        if (size == 0) {
+            // Put empty array on single line.
+            CB_Cat_Trusted_Str(json, "[]", 2);
+            return true;
+        }
+        else if (size == 1) {
+            Obj *elem = VA_Fetch(array, 0);
+            if (!(Obj_Is_A(elem, HASH) || Obj_Is_A(elem, VARRAY))) {
+                // Put array containing single scalar element on one line.
+                CB_Cat_Trusted_Str(json, "[", 1);
+                if (!S_to_json(elem, json, depth + 1)) {
+                    return false;
+                }
+                CB_Cat_Trusted_Str(json, "]", 1);
+                return true;
+            }
+        }
+        // Fall back to spreading elements across multiple lines.
+        CB_Cat_Trusted_Str(json, "[", 1);
+        for (size_t i = 0; i < size; i++) {
+            CB_Cat_Trusted_Str(json, "\n", 1);
+            S_cat_whitespace(json, depth + 1);
+            if (!S_to_json(VA_Fetch(array, i), json, depth + 1)) {
+                return false;
+            }
+            if (i + 1 < size) {
+                CB_Cat_Trusted_Str(json, ",", 1);
+            }
+        }
+        CB_Cat_Trusted_Str(json, "\n", 1);
+        S_cat_whitespace(json, depth);
+        CB_Cat_Trusted_Str(json, "]", 1);
+    }
+    else if (Obj_Is_A(dump, HASH)) {
+        Hash *hash = (Hash*)dump;
+        size_t size = Hash_Get_Size(hash);
+
+        // Put empty hash on single line.
+        if (size == 0) {
+            CB_Cat_Trusted_Str(json, "{}", 2);
+            return true;
+        }
+
+        // Validate that all keys are strings, then sort.
+        VArray *keys = Hash_Keys(hash);
+        for (size_t i = 0; i < size; i++) {
+            Obj *key = VA_Fetch(keys, i);
+            if (!key || !Obj_Is_A(key, CHARBUF)) {
+                DECREF(keys);
+                CharBuf *key_class = key ? Obj_Get_Class_Name(key) : NULL;
+                CharBuf *mess = MAKE_MESS("Illegal key type: %o", key_class);
+                Err_set_error(Err_new(mess));
+                return false;
+            }
+        }
+        VA_Sort(keys, NULL, NULL);
+
+        // Spread pairs across multiple lines.
+        CB_Cat_Trusted_Str(json, "{", 1);
+        for (size_t i = 0; i < size; i++) {
+            Obj *key = VA_Fetch(keys, i);
+            CB_Cat_Trusted_Str(json, "\n", 1);
+            S_cat_whitespace(json, depth + 1);
+            S_append_json_string(key, json);
+            CB_Cat_Trusted_Str(json, ": ", 2);
+            if (!S_to_json(Hash_Fetch(hash, key), json, depth + 1)) {
+                DECREF(keys);
+                return false;
+            }
+            if (i + 1 < size) {
+                CB_Cat_Trusted_Str(json, ",", 1);
+            }
+        }
+        CB_Cat_Trusted_Str(json, "\n", 1);
+        S_cat_whitespace(json, depth);
+        CB_Cat_Trusted_Str(json, "}", 1);
+
+        DECREF(keys);
+    }
+
+    return true;
+}
+
+static Obj*
+S_parse_json(char *text, size_t size) {
+    void *json_parser = LucyParseJsonAlloc(lucy_Memory_wrapped_malloc);
+    if (json_parser == NULL) {
+        CharBuf *mess = MAKE_MESS("Failed to allocate JSON parser");
+        Err_set_error(Err_new(mess));
+        return NULL;
+    }
+    Obj *dump = S_do_parse_json(json_parser, text, size);
+    LucyParseJsonFree(json_parser, lucy_Memory_wrapped_free);
+    return dump;
+}
+
+static Obj*
+S_do_parse_json(void *json_parser, char *json, size_t len) {
+    lucy_JsonParserState state;
+    state.result = NULL;
+    state.errors = false;
+
+    char *text = json;
+    char *const end = text + len;
+    while (text < end) {
+        int  token_type = -1;
+        Obj *value      = NULL;
+        char *const save = text;
+        switch (*text) {
+            case ' ': case '\n': case '\r': case '\t':
+                // Skip insignificant whitespace, which the JSON RFC defines
+                // as only four ASCII characters.
+                text++;
+                continue;
+            case '[':
+                token_type = LUCY_JSON_TOKENTYPE_LEFT_SQUARE_BRACKET;
+                text++;
+                break;
+            case ']':
+                token_type = LUCY_JSON_TOKENTYPE_RIGHT_SQUARE_BRACKET;
+                text++;
+                break;
+            case '{':
+                token_type = LUCY_JSON_TOKENTYPE_LEFT_CURLY_BRACKET;
+                text++;
+                break;
+            case '}':
+                token_type = LUCY_JSON_TOKENTYPE_RIGHT_CURLY_BRACKET;
+                text++;
+                break;
+            case ':':
+                token_type = LUCY_JSON_TOKENTYPE_COLON;
+                text++;
+                break;
+            case ',':
+                token_type = LUCY_JSON_TOKENTYPE_COMMA;
+                text++;
+                break;
+            case '"':
+                value = (Obj*)S_parse_string(&text, end);
+                if (value) {
+                    token_type = LUCY_JSON_TOKENTYPE_STRING;
+                }
+                else {
+                    // Clear out parser and return.
+                    LucyParseJson(json_parser, 0, NULL, &state);
+                    ERR_ADD_FRAME(Err_get_error());
+                    return NULL;
+                }
+                break;
+            case 'n':
+                if (SI_check_keyword(text, end, "null", 4)) {
+                    token_type = LUCY_JSON_TOKENTYPE_NULL;
+                    text += 4;
+                }
+                break;
+            case 't':
+                if (SI_check_keyword(text, end, "true", 4)) {
+                    token_type = LUCY_JSON_TOKENTYPE_TRUE;
+                    value = (Obj*)CFISH_TRUE;
+                    text += 4;
+                }
+                break;
+            case 'f':
+                if (SI_check_keyword(text, end, "false", 5)) {
+                    token_type = LUCY_JSON_TOKENTYPE_FALSE;
+                    value = (Obj*)CFISH_FALSE;
+                    text += 5;
+                }
+                break;
+            case '0': case '1': case '2': case '3': case '4':
+            case '5': case '6': case '7': case '8': case '9':
+            case '-': { // Note no '+', as JSON spec doesn't allow it.
+                    value = (Obj*)S_parse_number(&text, end);
+                    if (value) {
+                        token_type = LUCY_JSON_TOKENTYPE_NUMBER;
+                    }
+                    else {
+                        // Clear out parser and return.
+                        LucyParseJson(json_parser, 0, NULL, &state);
+                        ERR_ADD_FRAME(Err_get_error());
+                        return NULL;
+                    }
+                }
+                break;
+        }
+        LucyParseJson(json_parser, token_type, value, &state);
+        if (state.errors) {
+            SET_ERROR(CB_newf("JSON syntax error"), save, end);
+            return NULL;
+        }
+    }
+
+    // Finish up.
+    LucyParseJson(json_parser, 0, NULL, &state);
+    if (state.errors) {
+        SET_ERROR(CB_newf("JSON syntax error"), json, end);
+        return NULL;
+    }
+    return state.result;
+}
+
+static Float64*
+S_parse_number(char **json_ptr, char *const limit) {
+    char *top = *json_ptr;
+    char *end = top;
+    bool_t terminated = false;
+
+    // The JSON text is not NUL-terminated, so confirm that a delimiter lies
+    // before `limit` -- strtod() will then stop there instead of overrunning.
+    for (; end < limit && !terminated; end++) {
+        switch (*end) {
+                // Only these characters may legally follow a number in JSON.
+                // Stop scanning at the first one; if none occurs before the
+                // end of the text, it's a parse error.
+            case ' ': case '\n': case '\r': case '\t':
+            case ']':
+            case '}':
+            case ':':
+            case ',':
+                terminated = true;
+                break;
+        }
+    }
+
+    Float64 *result = NULL;
+    if (terminated) {
+        char *terminus;
+        double number = strtod(top, &terminus);
+        if (terminus != top) {
+            *json_ptr = terminus;
+            result = Float64_new(number);
+        }
+    }
+    if (!result) {
+        SET_ERROR(CB_newf("JSON syntax error"), top, limit);
+    }
+    return result;
+}
+
+static CharBuf*
+S_parse_string(char **json_ptr, char *const limit) {
+    // Find terminating double quote, determine whether there are any escapes.
+    char *top = *json_ptr + 1;
+    char *end = NULL;
+    bool_t saw_backslash = false;
+    for (char *text = top; text < limit; text++) {
+        if (*text == '"') {
+            end = text;
+            break;
+        }
+        else if (*text == '\\') {
+            saw_backslash = true;
+            if (text + 1 < limit && text[1] == 'u') {
+                text += 5;
+            }
+            else {
+                text += 1;
+            }
+        }
+    }
+    if (!end) {
+        SET_ERROR(CB_newf("Unterminated string"), *json_ptr, limit);
+        return NULL;
+    }
+
+    // Advance the text buffer to just beyond the closing quote.
+    *json_ptr = end + 1;
+
+    if (saw_backslash) {
+        return S_unescape_text(top, end);
+    }
+    else {
+        // Optimize common case where there are no escapes.
+        size_t len = end - top;
+        if (!StrHelp_utf8_valid(top, len)) {
+            CharBuf *mess = MAKE_MESS("Bad UTF-8 in JSON");
+            Err_set_error(Err_new(mess));
+            return NULL;
+        }
+        return CB_new_from_trusted_utf8(top, len);
+    }
+}
+
+static CharBuf*
+S_unescape_text(char *const top, char *const end) {
+    // The unescaped string will never be longer than the escaped string
+    // because only a \u escape can theoretically be too long and
+    // StrHelp_encode_utf8_char guards against sequences over 4 bytes.
+    // Therefore we can allocate once and not worry about reallocating.
+    size_t cap = end - top + 1;
+    char *target_buf = (char*)MALLOCATE(cap);
+    size_t target_size = 0;
+    for (char *text = top; text < end; text++) {
+        if (*text != '\\') {
+            target_buf[target_size++] = *text;
+        }
+        else {
+            // Process escape.
+            text++;
+            switch (*text) {
+                case '"':
+                    target_buf[target_size++] = '"';
+                    break;
+                case '\\':
+                    target_buf[target_size++] = '\\';
+                    break;
+                case '/':
+                    target_buf[target_size++] = '/';
+                    break;
+                case 'b':
+                    target_buf[target_size++] = '\b';
+                    break;
+                case 'f':
+                    target_buf[target_size++] = '\f';
+                    break;
+                case 'n':
+                    target_buf[target_size++] = '\n';
+                    break;
+                case 'r':
+                    target_buf[target_size++] = '\r';
+                    break;
+                case 't':
+                    target_buf[target_size++] = '\t';
+                    break;
+                case 'u': {
+                        // Copy into a temp buffer because strtol will overrun
+                        // into adjacent text data for e.g. "\uAAAA1".
+                        char temp[5] = { 0, 0, 0, 0, 0 };
+                        memcpy(temp, text + 1, 4);
+                        text += 4;
+                        // Demand exactly 4 hex digits: strtol alone would also
+                        // accept a sign, whitespace, or "0x" (e.g. "0x1F").
+                        long code_point = strtol(temp, NULL, 16);
+                        if (strspn(temp, "0123456789ABCDEFabcdef") != 4) {
+                            FREEMEM(target_buf);
+                            SET_ERROR(CB_newf("Invalid \\u escape"), text - 5, end);
+                            return NULL;
+                        }
+                        if (code_point >= 0xD800 && code_point <= 0xDFFF) {
+                            FREEMEM(target_buf);
+                            SET_ERROR(CB_newf("Surrogate pairs not supported"),
+                                      text - 5, end);
+                            return NULL;
+                        }
+                        target_size += StrHelp_encode_utf8_char((uint32_t)code_point,
+                                                                target_buf + target_size);
+                    }
+                    break;
+                default:
+                    FREEMEM(target_buf);
+                    SET_ERROR(CB_newf("Illegal escape"), text - 1, end);
+                    return NULL;
+            }
+        }
+    }
+
+    // NUL-terminate, sanity check, then return the unescaped string.
+    target_buf[target_size] = '\0';
+    if (!StrHelp_utf8_valid(target_buf, target_size)) {
+        FREEMEM(target_buf);
+        CharBuf *mess = MAKE_MESS("Bad UTF-8 in JSON");
+        Err_set_error(Err_new(mess));
+        return NULL;
+    }
+    return CB_new_steal_from_trusted_str(target_buf, target_size, cap);
+}
+
+// Cast to unsigned char: passing a negative char to isalnum() is undefined.
+static INLINE bool_t
+SI_check_keyword(char *json, char* end, const char *keyword, size_t len) {
+    if ((size_t)(end - json) > len
+        && strncmp(json, keyword, len) == 0
+        && json[len] != '_'
+        && !isalnum((unsigned char)json[len])) {
+        return true;
+    }
+    return false;
+}
+
+static void
+S_set_error(CharBuf *mess, char *json, char *limit, int line,
+            const char *func) {
+    if (func) {
+        CB_catf(mess, " at %s %s line %i32 near ", func, __FILE__,
+                (int32_t)line);
+    }
+    else {
+        CB_catf(mess, " at %s line %i32 near ", __FILE__, (int32_t)line);
+    }
+
+    // Append escaped text.
+    int64_t len = limit - json;
+    if (len > 32) {
+        const char *end = StrHelp_back_utf8_char(json + 32, json);
+        len = end - json;
+    }
+    ZombieCharBuf *snippet = ZCB_WRAP_STR(json, len);
+    S_append_json_string((Obj*)snippet, mess);
+
+    // Set Err_error.
+    Err_set_error(Err_new(mess));
 }
 

Modified: incubator/lucy/trunk/core/Lucy/Util/Json.cfh
URL: http://svn.apache.org/viewvc/incubator/lucy/trunk/core/Lucy/Util/Json.cfh?rev=1156951&r1=1156950&r2=1156951&view=diff
==============================================================================
--- incubator/lucy/trunk/core/Lucy/Util/Json.cfh (original)
+++ incubator/lucy/trunk/core/Lucy/Util/Json.cfh Fri Aug 12 03:31:03 2011
@@ -58,3 +58,14 @@ class Lucy::Util::Json inherits Lucy::Ob
 }
 
 
+__C__
+
+struct lucy_JsonParserState 
+{
+    lucy_Obj   *result;
+    chy_bool_t  errors;
+};
+typedef struct lucy_JsonParserState lucy_JsonParserState;
+
+__END_C__
+

Added: incubator/lucy/trunk/core/Lucy/Util/Json/JsonParser.y
URL: http://svn.apache.org/viewvc/incubator/lucy/trunk/core/Lucy/Util/Json/JsonParser.y?rev=1156951&view=auto
==============================================================================
--- incubator/lucy/trunk/core/Lucy/Util/Json/JsonParser.y (added)
+++ incubator/lucy/trunk/core/Lucy/Util/Json/JsonParser.y Fri Aug 12 03:31:03 2011
@@ -0,0 +1,162 @@
+%name LucyParseJson
+
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+%token_type { cfish_Obj* }               /* C type of every token value. */
+%token_destructor { CFISH_DECREF($$); }  /* Releases a destroyed token. */
+%token_prefix LUCY_JSON_TOKENTYPE_       /* Prefix for token constants. */
+
+%include {
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include "Lucy/Object/Hash.h"
+#include "Lucy/Object/VArray.h"
+#include "Lucy/Object/CharBuf.h"
+#include "Lucy/Object/Err.h"
+#include "Lucy/Util/Json.h"
+}
+
+%extra_argument { lucy_JsonParserState *state }
+
+%syntax_error {
+    state->errors = true;  /* Only flags the failure; builds no message. */
+}
+
+result ::= top_level_value(A).
+{
+     state->result = A;  /* Publish the value through the extra argument. */
+}
+
+/* Allow any "value" as a top-level construct.  This "loose", tolerant grammar
+ * makes testing somewhat easier.  A strict JSON parser would only allow JSON
+ * Objects and Arrays at the top level.
+ */
+top_level_value(A) ::= value(B).  { A = B; }
+
+/* Structural characters. */
+begin_array     ::= LEFT_SQUARE_BRACKET.
+end_array       ::= RIGHT_SQUARE_BRACKET.
+begin_object    ::= LEFT_CURLY_BRACKET.
+end_object      ::= RIGHT_CURLY_BRACKET.
+name_separator  ::= COLON.
+value_separator ::= COMMA.
+
+/* Values.  Destructors free values still on the stack when parsing aborts. */
+%type STRING { cfish_CharBuf* }
+%destructor value           { CFISH_DECREF($$); }
+%destructor top_level_value { CFISH_DECREF($$); }
+value(A) ::= FALSE(B).   { A = B; }
+value(A) ::= NULL(B).    { A = B; }
+value(A) ::= TRUE(B).    { A = B; }
+value(A) ::= object(B).  { A = (cfish_Obj*)B; }
+value(A) ::= array(B).   { A = (cfish_Obj*)B; }
+value(A) ::= NUMBER(B).  { A = (cfish_Obj*)B; }
+value(A) ::= STRING(B).  { A = B; }
+/* Javascript Objects, implemented as Clownfish Hashes. */
+%type object                    { cfish_Hash* }
+%type empty_object              { cfish_Hash* }
+%type single_pair_object        { cfish_Hash* }
+%type multi_pair_object         { cfish_Hash* }
+%type key_value_pair_list       { cfish_Hash* }
+%destructor object              { CFISH_DECREF($$); }
+%destructor empty_object        { CFISH_DECREF($$); }
+%destructor single_pair_object  { CFISH_DECREF($$); }
+%destructor multi_pair_object   { CFISH_DECREF($$); }
+%destructor key_value_pair_list { CFISH_DECREF($$); }
+
+object(A) ::= empty_object(B).         { A = B; }
+object(A) ::= single_pair_object(B).   { A = B; }
+object(A) ::= multi_pair_object(B).    { A = B; }
+
+empty_object(A) ::= begin_object end_object.  /* Object with no pairs. */
+{ 
+    A = cfish_Hash_new(0);
+}
+
+single_pair_object(A) ::= begin_object STRING(B) name_separator value(C) end_object.
+{
+    A = cfish_Hash_new(1);  /* Object with exactly one pair. */
+    Cfish_Hash_Store(A, (cfish_Obj*)B, C);
+    CFISH_DECREF(B);  /* Drop the key token; value C is not released here. */
+}
+
+multi_pair_object(A) ::= begin_object key_value_pair_list(B) STRING(C) name_separator value(D) end_object.
+{
+    A = B;  /* Reuse the Hash built by key_value_pair_list. */
+    Cfish_Hash_Store(A, (cfish_Obj*)C, D);  /* Last pair: no separator. */
+    CFISH_DECREF(C);
+}
+
+key_value_pair_list(A) ::= key_value_pair_list(B) STRING(C) name_separator value(D) value_separator.
+{ 
+    A = B; /* Keep adding pairs to the same Hash. */
+    Cfish_Hash_Store(A, (cfish_Obj*)C, D);
+    CFISH_DECREF(C);
+}
+
+key_value_pair_list(A) ::= STRING(B) name_separator value(C) value_separator.
+{
+    A = cfish_Hash_new(0);  /* First separator-terminated pair: new Hash. */
+    Cfish_Hash_Store(A, (cfish_Obj*)B, C);
+    CFISH_DECREF(B);
+}
+
+/* Arrays (Clownfish VArrays); each needs a destructor for error unwinding. */
+%type array                     { cfish_VArray* }
+%type empty_array               { cfish_VArray* }
+%type single_elem_array         { cfish_VArray* }
+%type multi_elem_array          { cfish_VArray* }
+%type array_elem_list           { cfish_VArray* }
+%destructor array               { CFISH_DECREF($$); }
+%destructor empty_array         { CFISH_DECREF($$); }
+%destructor single_elem_array   { CFISH_DECREF($$); }
+%destructor multi_elem_array    { CFISH_DECREF($$); }
+%destructor array_elem_list     { CFISH_DECREF($$); }
+array(A) ::= empty_array(B).       { A = B; }
+array(A) ::= single_elem_array(B). { A = B; }
+array(A) ::= multi_elem_array(B).  { A = B; }
+
+empty_array(A) ::= begin_array end_array.  /* Array with no elements. */
+{
+    A = cfish_VA_new(0);
+}
+
+single_elem_array(A) ::= begin_array value(B) end_array.
+{
+    A = cfish_VA_new(1);  /* Array with exactly one element. */
+    Cfish_VA_Push(A, B);
+}
+
+multi_elem_array(A) ::= begin_array array_elem_list(B) value(C) end_array.
+{
+    A = B;  /* Reuse the VArray built by array_elem_list. */
+    Cfish_VA_Push(A, C);  /* Last element: no trailing separator. */
+}
+
+array_elem_list(A) ::= array_elem_list(B) value(C) value_separator. 
+{ 
+    A = B; /* Keep appending to the same VArray. */
+    Cfish_VA_Push(A, C);
+}
+
+array_elem_list(A) ::= value(B) value_separator.
+{
+    A = cfish_VA_new(1);  /* First separator-terminated element: new VArray. */
+    Cfish_VA_Push(A, B);
+}
+

Modified: incubator/lucy/trunk/perl/Build.PL
URL: http://svn.apache.org/viewvc/incubator/lucy/trunk/perl/Build.PL?rev=1156951&r1=1156950&r2=1156951&view=diff
==============================================================================
--- incubator/lucy/trunk/perl/Build.PL (original)
+++ incubator/lucy/trunk/perl/Build.PL Fri Aug 12 03:31:03 2011
@@ -24,11 +24,8 @@ my $builder = Lucy::Build->new(
     license     => 'apache',
     dist_author =>
         'The Apache Lucy Project <lucy-dev at incubator dot apache dot org>',
-    dist_version => '0.2.0',
-    requires     => {
-        'JSON::XS' => 1.53,
-        'perl'     => '5.8.3',
-    },
+    dist_version   => '0.2.0',
+    requires       => { 'perl' => '5.8.3', },
     build_requires => {
         'Parse::RecDescent'  => 1.94,
         'Module::Build'      => 0.280801,

Modified: incubator/lucy/trunk/perl/MANIFEST
URL: http://svn.apache.org/viewvc/incubator/lucy/trunk/perl/MANIFEST?rev=1156951&r1=1156950&r2=1156951&view=diff
==============================================================================
--- incubator/lucy/trunk/perl/MANIFEST (original)
+++ incubator/lucy/trunk/perl/MANIFEST Fri Aug 12 03:31:03 2011
@@ -404,7 +404,6 @@ xs/Lucy/Object/LockFreeRegistry.c
 xs/Lucy/Object/Obj.c
 xs/Lucy/Object/VTable.c
 xs/Lucy/Store/FSFolder.c
-xs/Lucy/Util/Json.c
 xs/Lucy/Util/StringHelper.c
 xs/XSBind.c
 xs/XSBind.h

Modified: incubator/lucy/trunk/perl/buildlib/Lucy/Build.pm
URL: http://svn.apache.org/viewvc/incubator/lucy/trunk/perl/buildlib/Lucy/Build.pm?rev=1156951&r1=1156950&r2=1156951&view=diff
==============================================================================
--- incubator/lucy/trunk/perl/buildlib/Lucy/Build.pm (original)
+++ incubator/lucy/trunk/perl/buildlib/Lucy/Build.pm Fri Aug 12 03:31:03 2011
@@ -82,7 +82,7 @@ sub extra_ccflags {
     my $self      = shift;
     my $gcc_flags = '-std=gnu99 -D_GNU_SOURCE ';
     if ( defined $ENV{LUCY_VALGRIND} ) {
-        return "$gcc_flags -fno-inline-functions ";
+        return "$gcc_flags -DLUCY_VALGRIND -fno-inline-functions ";
     }
     elsif ( defined $ENV{LUCY_DEBUG} ) {
         return "$gcc_flags -DLUCY_DEBUG -pedantic -Wall -Wextra "
@@ -476,10 +476,27 @@ sub ACTION_test_valgrind {
     }
 }
 
+# Run all .y files through lemon.  Outputs are registered for cleanup on
+# every pass so that `Build clean` still removes parsers that were up to date.
+sub ACTION_parsers {
+    my $self = shift;
+    $self->dispatch('lemon');
+    my $y_files = $self->rscan_dir( $CORE_SOURCE_DIR, qr/\.y$/ );
+    for my $y_file (@$y_files) {
+        ( my $c_file = $y_file ) =~ s/\.y$/.c/ or die "no match";
+        ( my $h_file = $y_file ) =~ s/\.y$/.h/ or die "no match";
+        # Register before the freshness check, which may skip this file.
+        $self->add_to_cleanup( $c_file, $h_file );
+        next if $self->up_to_date( $y_file, [ $c_file, $h_file ] );
+        system( $LEMON_EXE_PATH, '-q', $y_file ) and die "lemon failed";
+    }
+}
+
 sub ACTION_compile_custom_xs {
     my $self = shift;
 
     $self->dispatch('ppport');
+    $self->dispatch('parsers');
 
     require ExtUtils::ParseXS;
 

Modified: incubator/lucy/trunk/perl/lib/Lucy.pm
URL: http://svn.apache.org/viewvc/incubator/lucy/trunk/perl/lib/Lucy.pm?rev=1156951&r1=1156950&r2=1156951&view=diff
==============================================================================
--- incubator/lucy/trunk/perl/lib/Lucy.pm (original)
+++ incubator/lucy/trunk/perl/lib/Lucy.pm Fri Aug 12 03:31:03 2011
@@ -517,89 +517,6 @@ sub error {$Lucy::Object::Err::error}
 }
 
 {
-    package Lucy::Util::Json;
-    use Scalar::Util qw( blessed );
-    use Lucy qw( to_clownfish );
-    use Lucy::Util::StringHelper qw( utf8_valid utf8_flag_on );
-    use JSON::XS qw();
-
-    my $json_encoder = JSON::XS->new->pretty(1)->canonical(1);
-
-    sub slurp_json {
-        my ( undef, %args ) = @_;
-        my $result;
-        my $instream = $args{folder}->open_in( $args{path} )
-            or return;
-        my $len = $instream->length;
-        my $json;
-        $instream->read( $json, $len );
-        if ( utf8_valid($json) ) {
-            utf8_flag_on($json);
-            $result = eval { to_clownfish( $json_encoder->decode($json) ) };
-        }
-        else {
-            $@ = "Invalid UTF-8";
-        }
-        if ( $@ or !$result ) {
-            Lucy::Object::Err->set_error(
-                Lucy::Object::Err->new( $@ || "Failed to decode JSON" ) );
-            return;
-        }
-        return $result;
-    }
-
-    sub spew_json {
-        my ( undef, %args ) = @_;
-        my $json = eval { $json_encoder->encode( $args{'dump'} ) };
-        if ( !defined $json ) {
-            Lucy::Object::Err->set_error( Lucy::Object::Err->new($@) );
-            return 0;
-        }
-        my $outstream = $args{folder}->open_out( $args{path} );
-        return 0 unless $outstream;
-        eval {
-            $outstream->print($json);
-            $outstream->close;
-        };
-        if ($@) {
-            my $error;
-            if ( blessed($@) && $@->isa("Lucy::Object::Err") ) {
-                $error = $@;
-            }
-            else {
-                $error = Lucy::Object::Err->new($@);
-            }
-            Lucy::Object::Err->set_error($error);
-            return 0;
-        }
-        return 1;
-    }
-
-    sub to_json {
-        my ( undef, $dump ) = @_;
-        my $json = eval { $json_encoder->encode($dump) };
-        if ($@) {
-            my $error = Lucy::Object::Err->new($@);
-            Lucy::Object::Err->set_error($error);
-            return;
-        }
-        return $json;
-    }
-
-    sub from_json {
-        my $dump = eval { to_clownfish( $json_encoder->decode( $_[1] ) ) };
-        if ($@) {
-            my $error = Lucy::Object::Err->new($@);
-            Lucy::Object::Err->set_error($error);
-            return;
-        }
-        return $dump;
-    }
-
-    sub set_tolerant { $json_encoder->allow_nonref( $_[1] ) }
-}
-
-{
     package Lucy::Object::Host;
     BEGIN {
         if ( !__PACKAGE__->isa('Lucy::Object::Obj') ) {