You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tajo.apache.org by jh...@apache.org on 2014/08/13 08:05:14 UTC

git commit: TAJO-1000: TextDatum.asChars() is incorrect if the client charset is different. (jinho)

Repository: tajo
Updated Branches:
  refs/heads/master 7e31a3201 -> fb4135a3b


TAJO-1000: TextDatum.asChars() is incorrect if the client charset is different. (jinho)


Project: http://git-wip-us.apache.org/repos/asf/tajo/repo
Commit: http://git-wip-us.apache.org/repos/asf/tajo/commit/fb4135a3
Tree: http://git-wip-us.apache.org/repos/asf/tajo/tree/fb4135a3
Diff: http://git-wip-us.apache.org/repos/asf/tajo/diff/fb4135a3

Branch: refs/heads/master
Commit: fb4135a3bf16e61345ca2a5b6a6fea5b516e7a3e
Parents: 7e31a32
Author: jhkim <jh...@apache.org>
Authored: Wed Aug 13 15:04:14 2014 +0900
Committer: jhkim <jh...@apache.org>
Committed: Wed Aug 13 15:04:14 2014 +0900

----------------------------------------------------------------------
 CHANGES                                         |  3 ++
 tajo-common/pom.xml                             | 15 ++++++++
 .../java/org/apache/tajo/datum/TextDatum.java   |  8 +++--
 .../org/apache/tajo/datum/TestTextDatum.java    | 36 ++++++++++++++++----
 tajo-project/pom.xml                            | 12 +++++++
 5 files changed, 65 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tajo/blob/fb4135a3/CHANGES
----------------------------------------------------------------------
diff --git a/CHANGES b/CHANGES
index 5d500a8..956db89 100644
--- a/CHANGES
+++ b/CHANGES
@@ -112,6 +112,9 @@ Release 0.9.0 - unreleased
 
   BUG FIXES
 
+    TAJO-1000: TextDatum.asChars() is incorrect if the client charset is different.
+    (jinho)
+
     TAJO-995: HiveMetaStoreClient wrapper should retry the connection. (jinho)
 
     TAJO-947: ColPartitionStoreExec can cause URISyntaxException due 

http://git-wip-us.apache.org/repos/asf/tajo/blob/fb4135a3/tajo-common/pom.xml
----------------------------------------------------------------------
diff --git a/tajo-common/pom.xml b/tajo-common/pom.xml
index c0f3402..da2a7d0 100644
--- a/tajo-common/pom.xml
+++ b/tajo-common/pom.xml
@@ -209,6 +209,21 @@ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xs
       <artifactId>junit</artifactId>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>org.mockito</groupId>
+      <artifactId>mockito-core</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.powermock</groupId>
+      <artifactId>powermock-module-junit4</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.powermock</groupId>
+      <artifactId>powermock-api-mockito</artifactId>
+      <scope>test</scope>
+    </dependency>
   </dependencies>
 
   <profiles>

http://git-wip-us.apache.org/repos/asf/tajo/blob/fb4135a3/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java
----------------------------------------------------------------------
diff --git a/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java b/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java
index e8424b3..b642168 100644
--- a/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java
+++ b/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java
@@ -25,10 +25,14 @@ import org.apache.tajo.exception.InvalidCastException;
 import org.apache.tajo.exception.InvalidOperationException;
 import org.apache.tajo.util.MurmurHash;
 
+import java.nio.charset.Charset;
 import java.util.Comparator;
 
 public class TextDatum extends Datum {
+  static Charset defaultCharset = Charset.forName("UTF-8");
+
   @Expose private final int size;
+  /* encoded in UTF-8 */
   @Expose private final byte[] bytes;
 
   public static final TextDatum EMPTY_TEXT = new TextDatum("");
@@ -41,7 +45,7 @@ public class TextDatum extends Datum {
   }
 
   public TextDatum(String string) {
-    this(string.getBytes());
+    this(string.getBytes(defaultCharset));
   }
 
   @Override
@@ -85,7 +89,7 @@ public class TextDatum extends Datum {
   }
 
   public String asChars() {
-    return new String(this.bytes);
+    return new String(this.bytes, defaultCharset);
   }
 
   @Override

http://git-wip-us.apache.org/repos/asf/tajo/blob/fb4135a3/tajo-common/src/test/java/org/apache/tajo/datum/TestTextDatum.java
----------------------------------------------------------------------
diff --git a/tajo-common/src/test/java/org/apache/tajo/datum/TestTextDatum.java b/tajo-common/src/test/java/org/apache/tajo/datum/TestTextDatum.java
index 7feab46..bf48f78 100644
--- a/tajo-common/src/test/java/org/apache/tajo/datum/TestTextDatum.java
+++ b/tajo-common/src/test/java/org/apache/tajo/datum/TestTextDatum.java
@@ -18,21 +18,22 @@
 
 package org.apache.tajo.datum;
 
-import org.junit.Test;
 import org.apache.tajo.common.TajoDataTypes.Type;
+import org.junit.Test;
+import org.powermock.reflect.Whitebox;
 
-import static org.junit.Assert.assertArrayEquals;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
+import java.nio.charset.Charset;
+
+import static org.junit.Assert.*;
 
 public class TestTextDatum {
-	
+
 	@Test
 	public final void testType() {
 		Datum d = DatumFactory.createText("12345");
 		assertEquals(d.type(), Type.TEXT);
 	}
-	
+
 	@Test
 	public final void testAsInt4() {
 		Datum d = DatumFactory.createText("12345");
@@ -62,7 +63,7 @@ public class TestTextDatum {
 		Datum d = DatumFactory.createText("12345");
 		assertEquals("12345", d.asChars());
 	}
-	
+
 	@Test
   public final void testSize() {
 	  Datum d = DatumFactory.createText("12345");
@@ -74,4 +75,25 @@ public class TestTextDatum {
     Datum d = DatumFactory.createText("12345");
     assertArrayEquals(d.asByteArray(), d.asTextBytes());
   }
+
+  @Test
+  public final void testTextEncoding() {
+    String text = "나랏말싸미 듕귁에 달아 문자와로 서르 사맛디 아니할쎄";
+    TextDatum test = new TextDatum(text);
+
+    TextDatum fromUTF8 = new TextDatum(text.getBytes(Charset.forName("UTF-8")));
+    assertEquals(test, fromUTF8);
+
+    Charset systemCharSet = Charset.defaultCharset();
+    //hack for testing
+    Whitebox.setInternalState(Charset.class, "defaultCharset", Charset.forName("EUC-KR"));
+    assertEquals(Charset.forName("EUC-KR"), Charset.defaultCharset());
+
+    assertEquals(text, test.asChars());
+    assertNotEquals(new String(test.asByteArray()), test.asChars());
+
+    //restore
+    Whitebox.setInternalState(Charset.class, "defaultCharset", systemCharSet);
+    assertEquals(systemCharSet, Charset.defaultCharset());
+  }
 }

http://git-wip-us.apache.org/repos/asf/tajo/blob/fb4135a3/tajo-project/pom.xml
----------------------------------------------------------------------
diff --git a/tajo-project/pom.xml b/tajo-project/pom.xml
index 7c0da53..b3c1fe2 100644
--- a/tajo-project/pom.xml
+++ b/tajo-project/pom.xml
@@ -951,6 +951,18 @@
         <scope>test</scope>
       </dependency>
       <dependency>
+        <groupId>org.powermock</groupId>
+        <artifactId>powermock-module-junit4</artifactId>
+        <version>1.5.5</version>
+        <scope>test</scope>
+      </dependency>
+      <dependency>
+        <groupId>org.powermock</groupId>
+        <artifactId>powermock-api-mockito</artifactId>
+        <version>1.5.5</version>
+        <scope>test</scope>
+      </dependency>
+      <dependency>
         <groupId>io.netty</groupId>
         <artifactId>netty</artifactId>
         <version>3.6.6.Final</version>