You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@thrift.apache.org by ns...@apache.org on 2015/11/08 18:45:08 UTC

thrift git commit: THRIFT-2413: UTF-8 sent by PHP as JSON is not understood by TJsonProtocol (Client: Python; Patch: Phongphan Phuttha)

Repository: thrift
Updated Branches:
  refs/heads/master f26488490 -> 7f01e2a8f


THRIFT-2413: UTF-8 sent by PHP as JSON is not understood by TJsonProtocol
Client: Python
Patch: Phongphan Phuttha

This patch allows readJSONString to decode escaped Unicode strings, including encoded surrogate pairs.

This closes #673


Project: http://git-wip-us.apache.org/repos/asf/thrift/repo
Commit: http://git-wip-us.apache.org/repos/asf/thrift/commit/7f01e2a8
Tree: http://git-wip-us.apache.org/repos/asf/thrift/tree/7f01e2a8
Diff: http://git-wip-us.apache.org/repos/asf/thrift/diff/7f01e2a8

Branch: refs/heads/master
Commit: 7f01e2a8f869d8622bc56e7584cce98865fa8b0f
Parents: f264884
Author: Phongphan Phuttha <ph...@gmail.com>
Authored: Fri Nov 6 15:46:50 2015 +0700
Committer: Nobuaki Sukegawa <ns...@apache.org>
Committed: Mon Nov 9 02:34:28 2015 +0900

----------------------------------------------------------------------
 lib/py/src/protocol/TJSONProtocol.py | 32 ++++++++++++++++++++++++++++++-
 lib/py/test/thrift_json.py           | 31 ++++++++++++++++++++++++++++++
 2 files changed, 62 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/thrift/blob/7f01e2a8/lib/py/src/protocol/TJSONProtocol.py
----------------------------------------------------------------------
diff --git a/lib/py/src/protocol/TJSONProtocol.py b/lib/py/src/protocol/TJSONProtocol.py
index 3ed8bcb..e98f4cf 100644
--- a/lib/py/src/protocol/TJSONProtocol.py
+++ b/lib/py/src/protocol/TJSONProtocol.py
@@ -243,7 +243,14 @@ class TJSONProtocolBase(TProtocolBase):
       raise TProtocolException(TProtocolException.INVALID_DATA,
                                "Unexpected character: %s" % current)
 
+  def _isHighSurrogate(self, codeunit):
+    return codeunit >= 0xd800 and codeunit <= 0xdbff
+
+  def _isLowSurrogate(self, codeunit):
+    return codeunit >= 0xdc00 and codeunit <= 0xdfff
+
   def readJSONString(self, skipContext):
+    highSurrogate = None
     string = []
     if skipContext is False:
       self.context.read()
@@ -255,7 +262,26 @@ class TJSONProtocolBase(TProtocolBase):
       if ord(character) == ESCSEQ0:
         character = self.reader.read()
         if ord(character) == ESCSEQ1:
-          character = chr(int(self.trans.read(4)))
+          if sys.version_info[0] == 2:
+            import json
+            character = self.trans.read(4)
+            codeunit = int(character, 16)
+            if self._isHighSurrogate(codeunit):
+              if highSurrogate:
+                raise TProtocolException(TProtocolException.INVALID_DATA,
+                                         "Expected low surrogate char")
+              highSurrogate = character
+              continue
+            elif self._isLowSurrogate(codeunit):
+              if not highSurrogate:
+                raise TProtocolException(TProtocolException.INVALID_DATA,
+                                         "Expected high surrogate char")
+              character = json.JSONDecoder().decode('"\\u%s\\u%s"' % (highSurrogate, character)).encode('utf-8')
+              highSurrogate = None
+            else:
+              character = json.JSONDecoder().decode('"\\u%s"' % character).encode('utf-8')
+          else:
+              character = chr(int(self.trans.read(4)))
         else:
           if character not in ESCAPE_CHARS:
             raise TProtocolException(TProtocolException.INVALID_DATA,
@@ -270,6 +296,10 @@ class TJSONProtocolBase(TProtocolBase):
           utf8_bytes.append(ord(self.reader.read()))
         character = utf8_bytes.decode('utf8')
       string.append(character)
+
+      if highSurrogate:
+        raise TProtocolException(TProtocolException.INVALID_DATA,
+                                 "Expected low surrogate char")
     return ''.join(string)
 
   def isJSONNumeric(self, character):

http://git-wip-us.apache.org/repos/asf/thrift/blob/7f01e2a8/lib/py/test/thrift_json.py
----------------------------------------------------------------------
diff --git a/lib/py/test/thrift_json.py b/lib/py/test/thrift_json.py
new file mode 100644
index 0000000..cef8870
--- /dev/null
+++ b/lib/py/test/thrift_json.py
@@ -0,0 +1,31 @@
+from thrift import Thrift
+from thrift.protocol.TJSONProtocol import TJSONProtocol
+from thrift.transport import TTransport
+
+import sys
+import unittest
+
+#
+# To run the test under Windows, we need to create a symbolic link
+# named 'thrift' pointing to the '../src' folder by using:
+#
+# mklink /D thrift ..\src
+#
+
+class TestJSONString(unittest.TestCase):
+
+  def test_escaped_unicode_string(self):
+    unicode_json = '"hello \\u0e01\\u0e02\\u0e03\\ud835\\udcab unicode"'
+    unicode_text = u'hello \u0e01\u0e02\u0e03\U0001D4AB unicode'
+
+    buf = TTransport.TMemoryBuffer(unicode_json)
+    transport = TTransport.TBufferedTransportFactory().getTransport(buf)
+    protocol = TJSONProtocol(transport)
+
+    if sys.version_info[0] == 2:
+      unicode_text = unicode_text.encode('utf8')
+    self.assertEqual(protocol.readString(), unicode_text)
+
+if __name__ == '__main__':
+  unittest.main()
+