You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by da...@apache.org on 2014/01/17 23:11:51 UTC
[20/50] [abbrv] git commit: Add an option to ignore UTF-8 encoding
errors
Add an option to ignore UTF-8 encoding errors
By default Jiffy is quite strict in what it encodes: it will not allow
invalid UTF-8 to be produced. This can cause issues when attempting to
encode JSON that was decoded by other libraries, as UTF-8 semantics are
not uniformly enforced.
This patch adds an option 'force_utf8' to the encoder. If encoding hits
an error for an invalid string it will forcefully mutate the object to
contain only valid UTF-8 and return the resulting encoded JSON.
For the most part this means it will strip any garbage data from
binaries, replacing it with the replacement code point U+FFFD. It will
also try to detect the common error of encoding surrogate pairs as
three-byte sequences and re-encode them into UTF-8 properly.
Project: http://git-wip-us.apache.org/repos/asf/couchdb-jiffy/repo
Commit: http://git-wip-us.apache.org/repos/asf/couchdb-jiffy/commit/414827d6
Tree: http://git-wip-us.apache.org/repos/asf/couchdb-jiffy/tree/414827d6
Diff: http://git-wip-us.apache.org/repos/asf/couchdb-jiffy/diff/414827d6
Branch: refs/heads/import
Commit: 414827d604b0f28974bd666f7da1068bb36b44ae
Parents: 6f589d4
Author: Paul J. Davis <pa...@gmail.com>
Authored: Fri Jun 1 09:53:41 2012 -0500
Committer: Paul J. Davis <pa...@gmail.com>
Committed: Fri Jun 1 10:35:02 2012 -0500
----------------------------------------------------------------------
c_src/encoder.c | 2 +
c_src/jiffy.c | 1 +
c_src/jiffy.h | 1 +
src/jiffy.erl | 4 ++
src/jiffy_utf8.erl | 104 ++++++++++++++++++++++++++++++++++++++++++++++++
test/004-strings.t | 67 ++++++++++++++++---------------
6 files changed, 148 insertions(+), 31 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/couchdb-jiffy/blob/414827d6/c_src/encoder.c
----------------------------------------------------------------------
diff --git a/c_src/encoder.c b/c_src/encoder.c
index f063f77..1b4baaf 100644
--- a/c_src/encoder.c
+++ b/c_src/encoder.c
@@ -81,6 +81,8 @@ enc_init(Encoder* e, ErlNifEnv* env, ERL_NIF_TERM opts, ErlNifBinary* bin)
e->uescape = 1;
} else if(enif_compare(val, e->atoms->atom_pretty) == 0) {
e->pretty = 1;
+ } else if(enif_compare(val, e->atoms->atom_force_utf8) == 0) {
+ // Ignore, handled in Erlang
} else {
return 0;
}
http://git-wip-us.apache.org/repos/asf/couchdb-jiffy/blob/414827d6/c_src/jiffy.c
----------------------------------------------------------------------
diff --git a/c_src/jiffy.c b/c_src/jiffy.c
index 8fdde2f..3f64fe5 100644
--- a/c_src/jiffy.c
+++ b/c_src/jiffy.c
@@ -22,6 +22,7 @@ load(ErlNifEnv* env, void** priv, ERL_NIF_TERM info)
st->atom_partial = make_atom(env, "partial");
st->atom_uescape = make_atom(env, "uescape");
st->atom_pretty = make_atom(env, "pretty");
+ st->atom_force_utf8 = make_atom(env, "force_utf8");
// Markers used in encoding
st->ref_object = make_atom(env, "$object_ref$");
http://git-wip-us.apache.org/repos/asf/couchdb-jiffy/blob/414827d6/c_src/jiffy.h
----------------------------------------------------------------------
diff --git a/c_src/jiffy.h b/c_src/jiffy.h
index 327657a..c477a43 100644
--- a/c_src/jiffy.h
+++ b/c_src/jiffy.h
@@ -18,6 +18,7 @@ typedef struct {
ERL_NIF_TERM atom_partial;
ERL_NIF_TERM atom_uescape;
ERL_NIF_TERM atom_pretty;
+ ERL_NIF_TERM atom_force_utf8;
ERL_NIF_TERM ref_object;
ERL_NIF_TERM ref_array;
http://git-wip-us.apache.org/repos/asf/couchdb-jiffy/blob/414827d6/src/jiffy.erl
----------------------------------------------------------------------
diff --git a/src/jiffy.erl b/src/jiffy.erl
index b31a526..c4b3d69 100644
--- a/src/jiffy.erl
+++ b/src/jiffy.erl
@@ -25,7 +25,11 @@ encode(Data) ->
encode(Data, Options) ->
+ ForceUTF8 = lists:member(force_utf8, Options),
case nif_encode(Data, Options) of
+ {error, invalid_string} when ForceUTF8 == true ->
+ FixedData = jiffy_utf8:fix(Data),
+ encode(FixedData, Options -- [force_utf8]);
{error, _} = Error ->
throw(Error);
{partial, IOData} ->
http://git-wip-us.apache.org/repos/asf/couchdb-jiffy/blob/414827d6/src/jiffy_utf8.erl
----------------------------------------------------------------------
diff --git a/src/jiffy_utf8.erl b/src/jiffy_utf8.erl
new file mode 100644
index 0000000..ee937fe
--- /dev/null
+++ b/src/jiffy_utf8.erl
@@ -0,0 +1,104 @@
+% This file is part of Jiffy released under the MIT license.
+% See the LICENSE file for more information.
+
+-module(jiffy_utf8).
+-export([fix/1]).
+
+
+% Walk a term and repair every binary found inside it.
+%
+% Binaries are passed through fix_bin/1, lists are fixed element by
+% element, and a 1-tuple wrapping a list of {Key, Value} pairs has
+% both its keys and values fixed. Any other term is returned as is.
+fix(Bin) when is_binary(Bin) ->
+    fix_bin(Bin);
+fix(Items) when is_list(Items) ->
+    fix_array(Items, []);
+fix({Members}) ->
+    fix_props(Members, []);
+fix(Other) ->
+    Other.
+
+
+% Fix the key and the value of every {Key, Value} pair in the list.
+% Acc collects the fixed pairs in reverse; once the input is
+% exhausted they are put back in their original order and wrapped in
+% a 1-tuple.
+fix_props([{Key, Val} | Tail], Acc) ->
+    fix_props(Tail, [{fix(Key), fix(Val)} | Acc]);
+fix_props([], Acc) ->
+    {lists:reverse(Acc)}.
+
+
+% Apply fix/1 to every element of a list, preserving order. Acc holds
+% the elements fixed so far, most recent first.
+fix_array([Head | Tail], Acc) ->
+    fix_array(Tail, [fix(Head) | Acc]);
+fix_array([], Acc) ->
+    lists:reverse(Acc).
+
+
+% Rebuild a binary as UTF-8 in four steps: decode it leniently into a
+% list of code points, merge adjacent surrogates where possible,
+% swap anything that is still not a Unicode code point for U+FFFD,
+% and encode the result as a UTF-8 binary.
+fix_bin(Bin) ->
+    CodePoints = loose_decode(Bin, 0, []),
+    Combined = try_combining(CodePoints, []),
+    Clean = replace_garbage(Combined, []),
+    Utf8Bytes = xmerl_ucs:to_utf8(Clean),
+    list_to_binary(Utf8Bytes).
+
+
+% Leniently decode Bin, starting at byte offset O, into a list of
+% integers. Byte sequences shaped like one- to four-byte UTF-8 yield
+% the value of their payload bits; no range checks are made here, so
+% the result may contain values that are not Unicode code points.
+% Bytes that fit none of those shapes yield U+FFFD. Acc holds the
+% values decoded so far, most recent first.
+loose_decode(Bin, O, Acc) ->
+    case Bin of
+        <<_:O/binary>> ->
+            % The offset has reached the end of the input.
+            lists:reverse(Acc);
+        <<_:O/binary, 0:1, Cp:7, _/binary>> ->
+            % 0xxxxxxx
+            loose_decode(Bin, O + 1, [Cp | Acc]);
+        <<_:O/binary, 2#110:3, A:5,
+                      2#10:2, B:6, _/binary>> ->
+            % 110xxxxx 10xxxxxx
+            Cp = (A bsl 6) bor B,
+            loose_decode(Bin, O + 2, [Cp | Acc]);
+        <<_:O/binary, 2#1110:4, A:4,
+                      2#10:2, B:6,
+                      2#10:2, C:6, _/binary>> ->
+            % 1110xxxx 10xxxxxx 10xxxxxx
+            Cp = (A bsl 12) bor (B bsl 6) bor C,
+            loose_decode(Bin, O + 3, [Cp | Acc]);
+        <<_:O/binary, 2#11110:5, A:3,
+                      2#10:2, B:6,
+                      2#10:2, C:6,
+                      2#10:2, D:6, _/binary>> ->
+            % 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+            Cp = (A bsl 18) bor (B bsl 12) bor (C bsl 6) bor D,
+            loose_decode(Bin, O + 4, [Cp | Acc]);
+        <<_:O/binary, _:8, Tail/binary>> ->
+            % Broken lead or continuation byte. Skip this byte and
+            % every continuation byte directly after it, and emit a
+            % single replacement code point for the whole run.
+            Skip = 1 + count_continuation_bytes(Tail, 0),
+            loose_decode(Bin, O + Skip, [16#FFFD | Acc])
+    end.
+
+
+% Count the continuation bytes (top two bits 10) found in Bytes from
+% offset Count onwards, stopping at the first byte that is not one
+% or at the end of the input. Called with an offset of 0 this
+% returns the length of the leading run of continuation bytes.
+count_continuation_bytes(Bytes, Count) ->
+    case Bytes of
+        <<_:Count/binary, 2#10:2, _/bitstring>> ->
+            % Only the two tag bits matter; the remaining six bits of
+            % this byte and whatever follows are ignored.
+            count_continuation_bytes(Bytes, Count + 1);
+        _ ->
+            Count
+    end.
+
+
+% Merge UTF-16 surrogate pairs that were decoded as two separate
+% values into the single code point they represent.
+%
+% A pair is merged only when a high surrogate (16#D800..16#DBFF) is
+% directly followed by a low surrogate (16#DC00..16#DFFF). Every
+% other value, including an unpaired surrogate, is copied through
+% unchanged, one element at a time, so a valid pair that starts right
+% after a stray surrogate is still found. Acc holds the output so
+% far, most recent first.
+try_combining([], Acc) ->
+    lists:reverse(Acc);
+try_combining([H, L | Rest], Acc) when H >= 16#D800, H =< 16#DBFF,
+                                        L >= 16#DC00, L =< 16#DFFF ->
+    % Standard UTF-16 decoding: ten payload bits from each half,
+    % offset by 16#10000.
+    C = 16#10000 + ((H - 16#D800) bsl 10) + (L - 16#DC00),
+    try_combining(Rest, [C | Acc]);
+try_combining([C | Rest], Acc) ->
+    try_combining(Rest, [C | Acc]).
+
+
+% Replace every value that xmerl_ucs:is_unicode/1 rejects with the
+% replacement code point U+FFFD, keeping all other values and the
+% original order. Acc holds the output so far, most recent first.
+replace_garbage([Cp | Tail], Acc) ->
+    Kept = case xmerl_ucs:is_unicode(Cp) of
+        true -> Cp;
+        false -> 16#FFFD
+    end,
+    replace_garbage(Tail, [Kept | Acc]);
+replace_garbage([], Acc) ->
+    lists:reverse(Acc).
http://git-wip-us.apache.org/repos/asf/couchdb-jiffy/blob/414827d6/test/004-strings.t
----------------------------------------------------------------------
diff --git a/test/004-strings.t b/test/004-strings.t
index 99852a3..17fb2b5 100755
--- a/test/004-strings.t
+++ b/test/004-strings.t
@@ -6,7 +6,7 @@ main([]) ->
code:add_pathz("ebin"),
code:add_pathz("test"),
- etap:plan(87),
+ etap:plan(116),
util:test_good(good()),
util:test_good(uescaped(), [uescape]),
util:test_errors(errors()),
@@ -61,12 +61,17 @@ errors() ->
test_utf8([]) ->
ok;
-test_utf8([Case | Rest]) ->
+test_utf8([{Case, Fixed} | Rest]) ->
etap:fun_is(
fun({error, invalid_string}) -> true; (Else) -> Else end,
(catch jiffy:encode(Case)),
lists:flatten(io_lib:format("Invalid utf-8: ~p", [Case]))
),
+ etap:fun_is(
+ fun(Fixed) -> true; (Else) -> Else end,
+ jiffy:encode(Case, [force_utf8]),
+ lists:flatten(io_lib:format("Fixed correctly: ~p", [Fixed]))
+ ),
Case2 = <<34, Case/binary, 34>>,
etap:fun_is(
fun({error, {_, invalid_string}}) -> true; (Else) -> Else end,
@@ -78,47 +83,47 @@ test_utf8([Case | Rest]) ->
utf8_cases() ->
[
% Stray continuation byte
- <<16#C2, 16#81, 16#80>>,
- <<"foo", 16#80, "bar">>,
+ {<<16#C2, 16#81, 16#80>>, <<16#C2, 16#81, 16#EF, 16#BF, 16#BD>>},
+ {<<"foo", 16#80, "bar">>, <<"foo", 16#EF, 16#BF, 16#BD, "bar">>},
% Invalid Unicode code points
- <<239, 191, 190>>,
- <<237, 160, 129>>,
+ {<<239, 191, 190>>, <<16#EF, 16#BF, 16#BD>>},
+ {<<237, 160, 129>>, <<16#EF, 16#BF, 16#BD>>},
% Not enough extension bytes
- <<16#C0>>,
+ {<<16#C0>>, <<16#EF, 16#BF, 16#BD>>},
- <<16#E0>>,
- <<16#E0, 16#80>>,
+ {<<16#E0>>, <<16#EF, 16#BF, 16#BD>>},
+ {<<16#E0, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
- <<16#F0>>,
- <<16#F0, 16#80>>,
- <<16#F0, 16#80, 16#80>>,
+ {<<16#F0>>, <<16#EF, 16#BF, 16#BD>>},
+ {<<16#F0, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
+ {<<16#F0, 16#80, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
- <<16#F8>>,
- <<16#F8, 16#80>>,
- <<16#F8, 16#80, 16#80>>,
- <<16#F8, 16#80, 16#80, 16#80>>,
+ {<<16#F8>>, <<16#EF, 16#BF, 16#BD>>},
+ {<<16#F8, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
+ {<<16#F8, 16#80, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
+ {<<16#F8, 16#80, 16#80, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
- <<16#FC>>,
- <<16#FC, 16#80>>,
- <<16#FC, 16#80, 16#80>>,
- <<16#FC, 16#80, 16#80, 16#80>>,
- <<16#FC, 16#80, 16#80, 16#80, 16#80>>,
+ {<<16#FC>>, <<16#EF, 16#BF, 16#BD>>},
+ {<<16#FC, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
+ {<<16#FC, 16#80, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
+ {<<16#FC, 16#80, 16#80, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
+ {<<16#FC, 16#80, 16#80, 16#80, 16#80>>, <<16#EF, 16#BF, 16#BD>>},
% No data in high bits.
- <<16#C0, 16#80>>,
- <<16#C1, 16#80>>,
+ {<<16#C0, 16#80>>, <<"\"\\u0000\"">>},
+ {<<16#C1, 16#80>>, <<"\"\\u0000\"">>},
- <<16#E0, 16#80, 16#80>>,
- <<16#E0, 16#90, 16#80>>,
+ {<<16#E0, 16#80, 16#80>>, <<"\"\\u0000\"">>},
+ {<<16#E0, 16#90, 16#80>>, <<"\"\\u0000\"">>},
- <<16#F0, 16#80, 16#80, 16#80>>,
- <<16#F0, 16#88, 16#80, 16#80>>,
+ {<<16#F0, 16#80, 16#80, 16#80>>, <<"\"\\u0000\"">>},
+ {<<16#F0, 16#88, 16#80, 16#80>>, <<"\"\\u0000\"">>},
- <<16#F8, 16#80, 16#80, 16#80, 16#80>>,
- <<16#F8, 16#84, 16#80, 16#80, 16#80>>,
+ {<<16#F8, 16#80, 16#80, 16#80, 16#80>>, <<"\"\\u0000\"">>},
+ {<<16#F8, 16#84, 16#80, 16#80, 16#80>>, <<"\"\\u0000\"">>},
- <<16#FC, 16#80, 16#80, 16#80, 16#80, 16#80>>,
- <<16#FC, 16#82, 16#80, 16#80, 16#80, 16#80>>
+ {<<16#FC, 16#80, 16#80, 16#80, 16#80, 16#80>>, <<"\"\\u0000\"">>},
+ {<<16#FC, 16#82, 16#80, 16#80, 16#80, 16#80>>, <<"\"\\u0000\"">>}
].