You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tajo.apache.org by jh...@apache.org on 2015/08/18 09:54:13 UTC
tajo git commit: TAJO-1777: JsonLineDeserializer returns invalid
Unicode text if the input contains a control character
Repository: tajo
Updated Branches:
refs/heads/master ca7e3fde0 -> 57be230e9
TAJO-1777: JsonLineDeserializer returns invalid Unicode text if the input contains a control character
Closes #696
Project: http://git-wip-us.apache.org/repos/asf/tajo/repo
Commit: http://git-wip-us.apache.org/repos/asf/tajo/commit/57be230e
Tree: http://git-wip-us.apache.org/repos/asf/tajo/tree/57be230e
Diff: http://git-wip-us.apache.org/repos/asf/tajo/diff/57be230e
Branch: refs/heads/master
Commit: 57be230e921ca93029181a9983684d2fa80f8f06
Parents: ca7e3fd
Author: Jinho Kim <jh...@apache.org>
Authored: Tue Aug 18 16:53:10 2015 +0900
Committer: Jinho Kim <jh...@apache.org>
Committed: Tue Aug 18 16:53:10 2015 +0900
----------------------------------------------------------------------
CHANGES | 3 ++
.../tajo/storage/json/JsonLineDeserializer.java | 15 +++++----
.../apache/tajo/storage/json/TestJsonSerDe.java | 32 ++++++++++++++++++++
.../testUnicodeWithControlChar.json | 1 +
4 files changed, 43 insertions(+), 8 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tajo/blob/57be230e/CHANGES
----------------------------------------------------------------------
diff --git a/CHANGES b/CHANGES
index f6a489c..f2d4f64 100644
--- a/CHANGES
+++ b/CHANGES
@@ -223,6 +223,9 @@ Release 0.11.0 - unreleased
BUG FIXES
+ TAJO-1777: JsonLineDeserializer returns invalid unicode text,
+ if contains control character. (jinho)
+
TAJO-1779: Remove "DFSInputStream has been closed already" messages
in DelimitedLineReader. (jinho)
http://git-wip-us.apache.org/repos/asf/tajo/blob/57be230e/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineDeserializer.java
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineDeserializer.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineDeserializer.java
index c720118..9216025 100644
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineDeserializer.java
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineDeserializer.java
@@ -21,23 +21,22 @@ package org.apache.tajo.storage.json;
import com.facebook.presto.hive.shaded.com.google.common.collect.Lists;
import io.netty.buffer.ByteBuf;
+import io.netty.util.CharsetUtil;
import net.minidev.json.JSONObject;
import net.minidev.json.parser.JSONParser;
import net.minidev.json.parser.ParseException;
-import org.apache.tajo.catalog.*;
import org.apache.commons.net.util.Base64;
-import org.apache.tajo.catalog.Schema;
-import org.apache.tajo.catalog.TableMeta;
+import org.apache.tajo.catalog.*;
import org.apache.tajo.common.TajoDataTypes.Type;
import org.apache.tajo.datum.DatumFactory;
import org.apache.tajo.datum.NullDatum;
-import org.apache.tajo.datum.TextDatum;
import org.apache.tajo.exception.NotImplementedException;
import org.apache.tajo.storage.Tuple;
import org.apache.tajo.storage.text.TextLineDeserializer;
import org.apache.tajo.storage.text.TextLineParsingError;
import java.io.IOException;
+import java.nio.charset.CharsetDecoder;
import java.util.Map;
public class JsonLineDeserializer extends TextLineDeserializer {
@@ -46,6 +45,7 @@ public class JsonLineDeserializer extends TextLineDeserializer {
// Full Path -> Type
private final Map<String, Type> types;
private final String [] projectedPaths;
+ private final CharsetDecoder decoder = CharsetUtil.getDecoder(CharsetUtil.UTF_8);
public JsonLineDeserializer(Schema schema, TableMeta meta, Column [] projected) {
super(schema, meta);
@@ -214,17 +214,16 @@ public class JsonLineDeserializer extends TextLineDeserializer {
@Override
public void deserialize(ByteBuf buf, Tuple output) throws IOException, TextLineParsingError {
- byte[] line = new byte[buf.readableBytes()];
- buf.readBytes(line);
+ String line = decoder.decode(buf.nioBuffer(buf.readerIndex(), buf.readableBytes())).toString();
JSONObject object;
try {
object = (JSONObject) parser.parse(line);
} catch (ParseException pe) {
- throw new TextLineParsingError(new String(line, TextDatum.DEFAULT_CHARSET), pe);
+ throw new TextLineParsingError(line, pe);
} catch (ArrayIndexOutOfBoundsException ae) {
// truncated value
- throw new TextLineParsingError(new String(line, TextDatum.DEFAULT_CHARSET), ae);
+ throw new TextLineParsingError(line, ae);
}
for (int i = 0; i < projectedPaths.length; i++) {
http://git-wip-us.apache.org/repos/asf/tajo/blob/57be230e/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/json/TestJsonSerDe.java
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/json/TestJsonSerDe.java b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/json/TestJsonSerDe.java
index 8095081..88d7536 100644
--- a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/json/TestJsonSerDe.java
+++ b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/json/TestJsonSerDe.java
@@ -93,4 +93,36 @@ public class TestJsonSerDe {
assertEquals(baseTuple, tuple);
}
+
+ @Test
+ public void testUnicodeWithControlChar() throws IOException {
+ TajoConf conf = new TajoConf();
+
+ TableMeta meta = CatalogUtil.newTableMeta("JSON");
+ Path tablePath = new Path(getResourcePath("dataset", "TestJsonSerDe"), "testUnicodeWithControlChar.json");
+ FileSystem fs = FileSystem.getLocal(conf);
+ FileStatus status = fs.getFileStatus(tablePath);
+ FileFragment fragment = new FileFragment("table", tablePath, 0, status.getLen());
+
+ Schema schema = new Schema();
+ schema.addColumn("col1", TajoDataTypes.Type.TEXT);
+ schema.addColumn("col2", TajoDataTypes.Type.TEXT);
+ schema.addColumn("col3", TajoDataTypes.Type.TEXT);
+ Scanner scanner = TablespaceManager.getLocalFs().getScanner(meta, schema, fragment);
+ scanner.init();
+
+ Tuple tuple = scanner.next();
+ assertNotNull(tuple);
+ assertNull(scanner.next());
+ scanner.close();
+
+
+ Tuple baseTuple = new VTuple(new Datum[] {
+ DatumFactory.createText("tajo"),
+ DatumFactory.createText("타조"),
+ DatumFactory.createText("타\n조")
+ });
+
+ assertEquals(baseTuple, tuple);
+ }
}
http://git-wip-us.apache.org/repos/asf/tajo/blob/57be230e/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/TestJsonSerDe/testUnicodeWithControlChar.json
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/TestJsonSerDe/testUnicodeWithControlChar.json b/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/TestJsonSerDe/testUnicodeWithControlChar.json
new file mode 100644
index 0000000..5446469
--- /dev/null
+++ b/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/TestJsonSerDe/testUnicodeWithControlChar.json
@@ -0,0 +1 @@
+{"col1": "tajo", "col2":"타조", "col3":"타\n조"}