You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by da...@apache.org on 2014/02/12 07:21:46 UTC

[14/50] [abbrv] mochiweb commit: updated refs/heads/import-master to 3a54dbf

accept UTF-16 surrogate pairs.  Fixes COUCHDB-327, COUCHDB-333

git-svn-id: https://svn.apache.org/repos/asf/couchdb/trunk@782643 13f79535-47bb-0310-9956-ffa450edef68


Project: http://git-wip-us.apache.org/repos/asf/couchdb-mochiweb/repo
Commit: http://git-wip-us.apache.org/repos/asf/couchdb-mochiweb/commit/b207636c
Tree: http://git-wip-us.apache.org/repos/asf/couchdb-mochiweb/tree/b207636c
Diff: http://git-wip-us.apache.org/repos/asf/couchdb-mochiweb/diff/b207636c

Branch: refs/heads/import-master
Commit: b207636cc28672c56b7b7646e8f935855af1884f
Parents: bb0ad1a
Author: Adam Kocoloski <ko...@apache.org>
Authored: Mon Jun 8 14:24:54 2009 +0000
Committer: Adam Kocoloski <ko...@apache.org>
Committed: Mon Jun 8 14:24:54 2009 +0000

----------------------------------------------------------------------
 mochijson2.erl | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/couchdb-mochiweb/blob/b207636c/mochijson2.erl
----------------------------------------------------------------------
diff --git a/mochijson2.erl b/mochijson2.erl
index 8bfd23c..7d7a8af 100644
--- a/mochijson2.erl
+++ b/mochijson2.erl
@@ -371,11 +371,20 @@ tokenize_string(B, S=#decoder{offset=O}, Acc) ->
             tokenize_string(B, ?ADV_COL(S, 2), [$\r | Acc]);
         <<_:O/binary, "\\t", _/binary>> ->
             tokenize_string(B, ?ADV_COL(S, 2), [$\t | Acc]);
-        <<_:O/binary, "\\u", C3, C2, C1, C0, _/binary>> ->
-            %% coalesce UTF-16 surrogate pair?
+        <<_:O/binary, "\\u", C3, C2, C1, C0, Rest/binary>> ->
             C = erlang:list_to_integer([C3, C2, C1, C0], 16),
-            Acc1 = lists:reverse(xmerl_ucs:to_utf8(C), Acc),
-            tokenize_string(B, ?ADV_COL(S, 6), Acc1);
+            if C > 16#D7FF, C < 16#DC00 ->
+                %% coalesce UTF-16 surrogate pair
+                <<"\\u", D3, D2, D1, D0, _/binary>> = Rest,
+                D = erlang:list_to_integer([D3,D2,D1,D0], 16),
+                [CodePoint] = xmerl_ucs:from_utf16be(<<C:16/big-unsigned-integer,
+                    D:16/big-unsigned-integer>>),
+                Acc1 = lists:reverse(xmerl_ucs:to_utf8(CodePoint), Acc),
+                tokenize_string(B, ?ADV_COL(S, 12), Acc1);
+            true ->
+                Acc1 = lists:reverse(xmerl_ucs:to_utf8(C), Acc),
+                tokenize_string(B, ?ADV_COL(S, 6), Acc1)
+            end;
         <<_:O/binary, C, _/binary>> ->
             tokenize_string(B, ?INC_CHAR(S, C), [C | Acc])
     end.
@@ -541,6 +550,7 @@ equiv_list([V1 | L1], [V2 | L2]) ->
 
 test_all() ->
     [1199344435545.0, 1] = decode(<<"[1199344435545.0,1]">>),
+    <<16#F0,16#9D,16#9C,16#95>> = decode([34,"\\ud835","\\udf15",34]),
     test_one(e2j_test_vec(utf8), 1).
 
 test_one([], _N) ->