You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tajo.apache.org by jh...@apache.org on 2015/08/18 09:54:13 UTC

tajo git commit: TAJO-1777: JsonLineDeserializer returns invalid Unicode text if the input contains a control character

Repository: tajo
Updated Branches:
  refs/heads/master ca7e3fde0 -> 57be230e9


TAJO-1777: JsonLineDeserializer returns invalid Unicode text if the input contains a control character

Closes #696


Project: http://git-wip-us.apache.org/repos/asf/tajo/repo
Commit: http://git-wip-us.apache.org/repos/asf/tajo/commit/57be230e
Tree: http://git-wip-us.apache.org/repos/asf/tajo/tree/57be230e
Diff: http://git-wip-us.apache.org/repos/asf/tajo/diff/57be230e

Branch: refs/heads/master
Commit: 57be230e921ca93029181a9983684d2fa80f8f06
Parents: ca7e3fd
Author: Jinho Kim <jh...@apache.org>
Authored: Tue Aug 18 16:53:10 2015 +0900
Committer: Jinho Kim <jh...@apache.org>
Committed: Tue Aug 18 16:53:10 2015 +0900

----------------------------------------------------------------------
 CHANGES                                         |  3 ++
 .../tajo/storage/json/JsonLineDeserializer.java | 15 +++++----
 .../apache/tajo/storage/json/TestJsonSerDe.java | 32 ++++++++++++++++++++
 .../testUnicodeWithControlChar.json             |  1 +
 4 files changed, 43 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tajo/blob/57be230e/CHANGES
----------------------------------------------------------------------
diff --git a/CHANGES b/CHANGES
index f6a489c..f2d4f64 100644
--- a/CHANGES
+++ b/CHANGES
@@ -223,6 +223,9 @@ Release 0.11.0 - unreleased
 
   BUG FIXES
 
+    TAJO-1777: JsonLineDeserializer returns invalid unicode text, 
+    if contains control character. (jinho)
+
     TAJO-1779: Remove "DFSInputStream has been closed already" messages 
     in DelimitedLineReader. (jinho)
 

http://git-wip-us.apache.org/repos/asf/tajo/blob/57be230e/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineDeserializer.java
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineDeserializer.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineDeserializer.java
index c720118..9216025 100644
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineDeserializer.java
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineDeserializer.java
@@ -21,23 +21,22 @@ package org.apache.tajo.storage.json;
 
 import com.facebook.presto.hive.shaded.com.google.common.collect.Lists;
 import io.netty.buffer.ByteBuf;
+import io.netty.util.CharsetUtil;
 import net.minidev.json.JSONObject;
 import net.minidev.json.parser.JSONParser;
 import net.minidev.json.parser.ParseException;
-import org.apache.tajo.catalog.*;
 import org.apache.commons.net.util.Base64;
-import org.apache.tajo.catalog.Schema;
-import org.apache.tajo.catalog.TableMeta;
+import org.apache.tajo.catalog.*;
 import org.apache.tajo.common.TajoDataTypes.Type;
 import org.apache.tajo.datum.DatumFactory;
 import org.apache.tajo.datum.NullDatum;
-import org.apache.tajo.datum.TextDatum;
 import org.apache.tajo.exception.NotImplementedException;
 import org.apache.tajo.storage.Tuple;
 import org.apache.tajo.storage.text.TextLineDeserializer;
 import org.apache.tajo.storage.text.TextLineParsingError;
 
 import java.io.IOException;
+import java.nio.charset.CharsetDecoder;
 import java.util.Map;
 
 public class JsonLineDeserializer extends TextLineDeserializer {
@@ -46,6 +45,7 @@ public class JsonLineDeserializer extends TextLineDeserializer {
   // Full Path -> Type
   private final Map<String, Type> types;
   private final String [] projectedPaths;
+  private final CharsetDecoder decoder = CharsetUtil.getDecoder(CharsetUtil.UTF_8);
 
   public JsonLineDeserializer(Schema schema, TableMeta meta, Column [] projected) {
     super(schema, meta);
@@ -214,17 +214,16 @@ public class JsonLineDeserializer extends TextLineDeserializer {
 
   @Override
   public void deserialize(ByteBuf buf, Tuple output) throws IOException, TextLineParsingError {
-    byte[] line = new byte[buf.readableBytes()];
-    buf.readBytes(line);
+    String line = decoder.decode(buf.nioBuffer(buf.readerIndex(), buf.readableBytes())).toString();
 
     JSONObject object;
     try {
       object = (JSONObject) parser.parse(line);
     } catch (ParseException pe) {
-      throw new TextLineParsingError(new String(line, TextDatum.DEFAULT_CHARSET), pe);
+      throw new TextLineParsingError(line, pe);
     } catch (ArrayIndexOutOfBoundsException ae) {
       // truncated value
-      throw new TextLineParsingError(new String(line, TextDatum.DEFAULT_CHARSET), ae);
+      throw new TextLineParsingError(line, ae);
     }
 
     for (int i = 0; i < projectedPaths.length; i++) {

http://git-wip-us.apache.org/repos/asf/tajo/blob/57be230e/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/json/TestJsonSerDe.java
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/json/TestJsonSerDe.java b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/json/TestJsonSerDe.java
index 8095081..88d7536 100644
--- a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/json/TestJsonSerDe.java
+++ b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/json/TestJsonSerDe.java
@@ -93,4 +93,36 @@ public class TestJsonSerDe {
 
     assertEquals(baseTuple, tuple);
   }
+
+  @Test
+  public void testUnicodeWithControlChar() throws IOException {
+    TajoConf conf = new TajoConf();
+
+    TableMeta meta = CatalogUtil.newTableMeta("JSON");
+    Path tablePath = new Path(getResourcePath("dataset", "TestJsonSerDe"), "testUnicodeWithControlChar.json");
+    FileSystem fs = FileSystem.getLocal(conf);
+    FileStatus status = fs.getFileStatus(tablePath);
+    FileFragment fragment = new FileFragment("table", tablePath, 0, status.getLen());
+
+    Schema  schema = new Schema();
+    schema.addColumn("col1", TajoDataTypes.Type.TEXT);
+    schema.addColumn("col2", TajoDataTypes.Type.TEXT);
+    schema.addColumn("col3", TajoDataTypes.Type.TEXT);
+    Scanner scanner =  TablespaceManager.getLocalFs().getScanner(meta, schema, fragment);
+    scanner.init();
+
+    Tuple tuple = scanner.next();
+    assertNotNull(tuple);
+    assertNull(scanner.next());
+    scanner.close();
+
+
+    Tuple baseTuple = new VTuple(new Datum[] {
+        DatumFactory.createText("tajo"),
+        DatumFactory.createText("타조"),
+        DatumFactory.createText("타\n조")
+    });
+
+    assertEquals(baseTuple, tuple);
+  }
 }

http://git-wip-us.apache.org/repos/asf/tajo/blob/57be230e/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/TestJsonSerDe/testUnicodeWithControlChar.json
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/TestJsonSerDe/testUnicodeWithControlChar.json b/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/TestJsonSerDe/testUnicodeWithControlChar.json
new file mode 100644
index 0000000..5446469
--- /dev/null
+++ b/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/TestJsonSerDe/testUnicodeWithControlChar.json
@@ -0,0 +1 @@
+{"col1": "tajo", "col2":"타조", "col3":"타\n조"}