You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tajo.apache.org by ji...@apache.org on 2015/03/18 18:25:41 UTC
[02/13] tajo git commit: TAJO-1374: Support multi-bytes delimiter for
CSV file
TAJO-1374: Support multi-bytes delimiter for CSV file
closes #400
Signed-off-by: Jinho Kim <jh...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/tajo/repo
Commit: http://git-wip-us.apache.org/repos/asf/tajo/commit/7f056955
Tree: http://git-wip-us.apache.org/repos/asf/tajo/tree/7f056955
Diff: http://git-wip-us.apache.org/repos/asf/tajo/diff/7f056955
Branch: refs/heads/index_support
Commit: 7f0569555332665141e71141d7a736371d038509
Parents: d8db8b7
Author: navis.ryu <na...@apache.org>
Authored: Fri Mar 6 14:19:15 2015 +0900
Committer: Jinho Kim <jh...@apache.org>
Committed: Fri Mar 13 15:33:08 2015 +0900
----------------------------------------------------------------------
CHANGES | 3 +
.../java/org/apache/tajo/util/BytesUtils.java | 159 +++++++++++--------
.../java/org/apache/tajo/util/StringUtils.java | 6 +-
.../org/apache/tajo/util/TestStringUtil.java | 4 +-
.../apache/tajo/engine/eval/ExprTestBase.java | 5 +-
.../tajo/engine/query/TestSelectQuery.java | 24 +++
.../multibytes_delimiter1/table1.tbl | 5 +
.../multibytes_delimiter2/table2.tbl | 5 +
.../multibytes_delimiter_table1_ddl.sql | 3 +
.../multibytes_delimiter_table2_ddl.sql | 3 +
.../testMultiBytesDelimiter1.sql | 1 +
.../testMultiBytesDelimiter2.sql | 1 +
.../testMultiBytesDelimiter1.result | 7 +
.../testMultiBytesDelimiter2.result | 7 +
.../org/apache/tajo/storage/TestLazyTuple.java | 4 +-
.../tajo/storage/hbase/ColumnMapping.java | 6 +-
.../apache/tajo/storage/hbase/HBaseScanner.java | 3 +-
.../tajo/storage/hbase/HBaseStorageManager.java | 3 +-
.../java/org/apache/tajo/storage/CSVFile.java | 14 +-
.../sequencefile/SequenceFileScanner.java | 3 +-
20 files changed, 180 insertions(+), 86 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tajo/blob/7f056955/CHANGES
----------------------------------------------------------------------
diff --git a/CHANGES b/CHANGES
index 4eee74f..e6f7917 100644
--- a/CHANGES
+++ b/CHANGES
@@ -9,6 +9,9 @@ Release 0.11.0 - unreleased
IMPROVEMENT
+ TAJO-1374: Support multi-bytes delimiter for CSV file.
+ (Contributed by navis, Committed by jinho)
+
TAJO-1395: Remove deprecated sql files for Oracle and PostgreSQL. (jihun)
TAJO-1394: Support reconnect on tsql.
http://git-wip-us.apache.org/repos/asf/tajo/blob/7f056955/tajo-common/src/main/java/org/apache/tajo/util/BytesUtils.java
----------------------------------------------------------------------
diff --git a/tajo-common/src/main/java/org/apache/tajo/util/BytesUtils.java b/tajo-common/src/main/java/org/apache/tajo/util/BytesUtils.java
index 91165ac..725301c 100644
--- a/tajo-common/src/main/java/org/apache/tajo/util/BytesUtils.java
+++ b/tajo-common/src/main/java/org/apache/tajo/util/BytesUtils.java
@@ -22,6 +22,7 @@ import org.apache.hadoop.io.WritableUtils;
import java.io.ByteArrayOutputStream;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.List;
/**
@@ -86,22 +87,23 @@ public class BytesUtils {
return buffer;
}
- public static byte[][] splitPreserveAllTokens(byte[] str, char separatorChar, int[] target) {
- return splitWorker(str, 0, -1, separatorChar, true, target);
+ public static byte[][] splitPreserveAllTokens(byte[] str, char separatorChar, int[] target, int numColumns) {
+ return splitWorker(str, 0, -1, separatorChar, target, numColumns);
}
- public static byte[][] splitPreserveAllTokens(byte[] str, int offset, int length, char separatorChar, int[] target) {
- return splitWorker(str, offset, length, separatorChar, true, target);
+ public static byte[][] splitPreserveAllTokens(byte[] str, int offset, int length, byte[] separator, int[] target, int numColumns) {
+ return splitWorker(str, offset, length, separator, target, numColumns);
}
- public static byte[][] splitPreserveAllTokens(byte[] str, char separatorChar) {
- return splitWorker(str, 0, -1, separatorChar, true, null);
+ public static byte[][] splitPreserveAllTokens(byte[] str, char separatorChar, int numColumns) {
+ return splitWorker(str, 0, -1, separatorChar, null, numColumns);
}
- public static byte[][] splitPreserveAllTokens(byte[] str, int length, char separatorChar) {
- return splitWorker(str, 0, length, separatorChar, true, null);
+ private static byte[][] splitWorker(byte[] str, int offset, int length, char separatorChar,
+ int[] target, int numColumns) {
+ return splitWorker(str, offset, length, new byte[] {(byte)separatorChar}, target, numColumns);
}
-
+
/**
* Performs the logic for the <code>split</code> and
* <code>splitPreserveAllTokens</code> methods that do not return a
@@ -109,75 +111,96 @@ public class BytesUtils {
*
* @param str the String to parse, may be <code>null</code>
* @param length amount of bytes to str
- * @param separatorChar the ascii separate character
- * @param preserveAllTokens if <code>true</code>, adjacent separators are
- * treated as empty token separators; if <code>false</code>, adjacent
- * separators are treated as one separator.
+ * @param separator the separator byte sequence; may be longer than one byte
* @param target the projection target
+ * @param numColumns number of columns to be retrieved
* @return an array of parsed Strings, <code>null</code> if null String input
*/
- private static byte[][] splitWorker(byte[] str, int offset, int length, char separatorChar,
- boolean preserveAllTokens, int[] target) {
- // Performance tuned for 2.0 (JDK1.4)
-
+ private static byte[][] splitWorker(byte[] str, int offset, int length, byte[] separator, int[] target, int numColumns) {
if (str == null) {
return null;
}
- int len = length;
- if (len == 0) {
- return new byte[1][0];
- }else if(len < 0){
- len = str.length - offset;
- }
-
- List list = new ArrayList();
- int i = 0, start = 0;
- boolean match = false;
- boolean lastMatch = false;
- int currentTarget = 0;
- int currentIndex = 0;
- while (i < len) {
- if (str[i + offset] == separatorChar) {
- if (match || preserveAllTokens) {
- if (target == null) {
- byte[] bytes = new byte[i - start];
- System.arraycopy(str, start + offset, bytes, 0, bytes.length);
- list.add(bytes);
- } else if (target.length > currentTarget && currentIndex == target[currentTarget]) {
- byte[] bytes = new byte[i - start];
- System.arraycopy(str, start + offset, bytes, 0, bytes.length);
- list.add(bytes);
- currentTarget++;
- } else {
- list.add(null);
- }
- currentIndex++;
- match = false;
- lastMatch = true;
- }
- start = ++i;
- continue;
+ if (length == 0) {
+ return new byte[numColumns][0];
+ }
+ if (length < 0) {
+ length = str.length - offset;
+ }
+ int indexMax = 0;
+ if (target != null) {
+ for (int index : target) {
+ indexMax = Math.max(indexMax, index + 1);
}
- lastMatch = false;
- match = true;
- i++;
- }
- if (match || (preserveAllTokens && lastMatch)) {
- if (target == null) {
- byte[] bytes = new byte[i - start];
- System.arraycopy(str, start + offset, bytes, 0, bytes.length);
- list.add(bytes);
- } else if (target.length > currentTarget && currentIndex == target[currentTarget]) {
- byte[] bytes = new byte[i - start];
- System.arraycopy(str, start + offset, bytes, 0, bytes.length);
- list.add(bytes); //str.substring(start, i));
- currentTarget++;
+ } else {
+ indexMax = numColumns;
+ }
+
+ int[][] indices = split(str, offset, length, separator, new int[indexMax][]);
+ byte[][] result = new byte[numColumns][];
+
+ // columns not listed in target stay null; targeted columns missing from the input become byte[0]
+ if (target != null) {
+ for (int i : target) {
+ int[] index = indices[i];
+ result[i] = index == null ? new byte[0] : Arrays.copyOfRange(str, index[0], index[1]);
+ }
+ } else {
+ for (int i = 0; i < result.length; i++) {
+ int[] index = indices[i];
+ result[i] = index == null ? new byte[0] : Arrays.copyOfRange(str, index[0], index[1]);
+ }
+ }
+ return result;
+ }
+
+ public static int[][] split(byte[] str, int offset, int length, byte[] separator, int[][] indices) { // Records the {start, end} byte range of each separator-delimited field of str[offset, offset + length) into indices and returns that same array.
+ if (indices.length == 0) {
+ return indices; // no slots to fill, so nothing is scanned
+ }
+ final int limit = offset + length; // exclusive end of the region to scan
+
+ int start = offset; // start of the field currently being scanned
+ int colIndex = 0; // next slot of indices to fill
+ for (int index = offset; index < limit;) {
+ if (onDelimiter(str, index, limit, separator)) {
+ indices[colIndex++] = new int[] {start, index}; // end is exclusive and points at the first separator byte
+ if (colIndex >= indices.length) {
+ return indices; // every slot is filled; the remainder of the input is not scanned
+ }
+ index += separator.length; // skip the whole separator, not just one byte
+ start = index;
+ } else {
+ index++;
+ }
+ }
+ if (colIndex < indices.length) {
+ indices[colIndex] = new int[]{start, limit}; // trailing field runs to the end of the region; later slots keep whatever the caller passed in
+ }
+ return indices;
+ }
+
+ private static boolean onDelimiter(byte[] input, int offset, int limit, byte[] delimiter) { // Returns true when all bytes of delimiter occur in input starting at offset and ending before limit.
+ for (int i = 0; i < delimiter.length; i++) {
+ if (offset + i >= limit || input[offset + i] != delimiter[i]) { // ran past limit, or found a mismatching byte
+ return false;
+ }
+ }
+ return true; // every delimiter byte matched (also reached when delimiter is empty)
+ }
+
+ public static byte[][] splitTrivial(byte[] value, byte delimiter) { // Splits value on every occurrence of the single-byte delimiter, keeping empty tokens.
+ List<byte[]> split = new ArrayList<byte[]>();
+ int prev = 0; // start of the token currently being collected
+ for (int i = 0; i < value.length; i++) {
+ if (value[i] == delimiter) {
+ split.add(Arrays.copyOfRange(value, prev, i)); // token before this delimiter; empty when delimiters are adjacent
+ prev = i + 1;
+ }
+ }
+ if (prev <= value.length) { // always holds, since prev never exceeds value.length; the last token is therefore always added
+ split.add(Arrays.copyOfRange(value, prev, value.length)); // empty when value is empty or ends with the delimiter
+ }
+ return split.toArray(new byte[split.size()][]);
+ }
/**
http://git-wip-us.apache.org/repos/asf/tajo/blob/7f056955/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java
----------------------------------------------------------------------
diff --git a/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java b/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java
index 38c0fd8..d035e4a 100644
--- a/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java
+++ b/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java
@@ -186,7 +186,11 @@ public class StringUtils {
public static String unicodeEscapedDelimiter(String value) {
try {
String delimiter = StringEscapeUtils.unescapeJava(value);
- return unicodeEscapedDelimiter(delimiter.charAt(0));
+ StringBuilder builder = new StringBuilder();
+ for (char achar : delimiter.toCharArray()) {
+ builder.append(unicodeEscapedDelimiter(achar));
+ }
+ return builder.toString();
} catch (Throwable e) {
}
return value;
http://git-wip-us.apache.org/repos/asf/tajo/blob/7f056955/tajo-common/src/test/java/org/apache/tajo/util/TestStringUtil.java
----------------------------------------------------------------------
diff --git a/tajo-common/src/test/java/org/apache/tajo/util/TestStringUtil.java b/tajo-common/src/test/java/org/apache/tajo/util/TestStringUtil.java
index 5272586..c4329a1 100644
--- a/tajo-common/src/test/java/org/apache/tajo/util/TestStringUtil.java
+++ b/tajo-common/src/test/java/org/apache/tajo/util/TestStringUtil.java
@@ -103,7 +103,7 @@ public class TestStringUtil {
char separatorChar = '|';
String[] textArray = org.apache.commons.lang.StringUtils.splitPreserveAllTokens(text, separatorChar);
- byte[][] bytesArray = BytesUtils.splitPreserveAllTokens(text.getBytes(), separatorChar);
+ byte[][] bytesArray = BytesUtils.splitPreserveAllTokens(text.getBytes(), separatorChar, 3);
assertEquals(textArray.length, bytesArray.length);
for (int i = 0; i < textArray.length; i++){
@@ -118,7 +118,7 @@ public class TestStringUtil {
char separatorChar = '|';
String[] textArray = org.apache.commons.lang.StringUtils.splitPreserveAllTokens(text, separatorChar);
- byte[][] bytesArray = BytesUtils.splitPreserveAllTokens(text.getBytes(), separatorChar, target);
+ byte[][] bytesArray = BytesUtils.splitPreserveAllTokens(text.getBytes(), separatorChar, target, 3);
assertEquals(textArray.length, bytesArray.length);
http://git-wip-us.apache.org/repos/asf/tajo/blob/7f056955/tajo-core/src/test/java/org/apache/tajo/engine/eval/ExprTestBase.java
----------------------------------------------------------------------
diff --git a/tajo-core/src/test/java/org/apache/tajo/engine/eval/ExprTestBase.java b/tajo-core/src/test/java/org/apache/tajo/engine/eval/ExprTestBase.java
index 4e4b710..876e3e4 100644
--- a/tajo-core/src/test/java/org/apache/tajo/engine/eval/ExprTestBase.java
+++ b/tajo-core/src/test/java/org/apache/tajo/engine/eval/ExprTestBase.java
@@ -238,8 +238,9 @@ public class ExprTestBase {
targetIdx[i] = i;
}
- lazyTuple =
- new LazyTuple(inputSchema, BytesUtils.splitPreserveAllTokens(csvTuple.getBytes(), delimiter, targetIdx),0);
+ byte[][] tokens = BytesUtils.splitPreserveAllTokens(
+ csvTuple.getBytes(), delimiter, targetIdx, inputSchema.size());
+ lazyTuple = new LazyTuple(inputSchema, tokens,0);
vtuple = new VTuple(inputSchema.size());
for (int i = 0; i < inputSchema.size(); i++) {
http://git-wip-us.apache.org/repos/asf/tajo/blob/7f056955/tajo-core/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java
----------------------------------------------------------------------
diff --git a/tajo-core/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java b/tajo-core/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java
index 9ba8a56..dd93dd1 100644
--- a/tajo-core/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java
+++ b/tajo-core/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java
@@ -635,4 +635,28 @@ public class TestSelectQuery extends QueryTestCaseBase {
testingCluster.getConfiguration().setSystemTimezone(TimeZone.getTimeZone("GMT"));
}
}
+
+ @Test
+ public void testMultiBytesDelimiter1() throws Exception { // Covers a CSV table whose delimiter is the two-character string '||'.
+ executeDDL("multibytes_delimiter_table1_ddl.sql", "multibytes_delimiter1"); // presumably creates table1 from the DDL script over the multibytes_delimiter1 dataset - confirm against QueryTestCaseBase
+ try {
+ ResultSet res = executeQuery(); // NOTE(review): no query text is passed; presumably resolved from the test method name - confirm
+ assertResultSet(res);
+ cleanupQuery(res);
+ } finally {
+ executeString("DROP TABLE table1"); // issued even when the query or the assertion throws
+ }
+ }
+
+ @Test
+ public void testMultiBytesDelimiter2() throws Exception { // Covers a CSV table whose delimiter is a single non-ASCII character.
+ executeDDL("multibytes_delimiter_table2_ddl.sql", "multibytes_delimiter2"); // presumably creates table2 from the DDL script over the multibytes_delimiter2 dataset - confirm against QueryTestCaseBase
+ try {
+ ResultSet res = executeQuery(); // NOTE(review): no query text is passed; presumably resolved from the test method name - confirm
+ assertResultSet(res);
+ cleanupQuery(res);
+ } finally {
+ executeString("DROP TABLE table2"); // issued even when the query or the assertion throws
+ }
+ }
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tajo/blob/7f056955/tajo-core/src/test/resources/dataset/TestSelectQuery/multibytes_delimiter1/table1.tbl
----------------------------------------------------------------------
diff --git a/tajo-core/src/test/resources/dataset/TestSelectQuery/multibytes_delimiter1/table1.tbl b/tajo-core/src/test/resources/dataset/TestSelectQuery/multibytes_delimiter1/table1.tbl
new file mode 100644
index 0000000..5acccf6
--- /dev/null
+++ b/tajo-core/src/test/resources/dataset/TestSelectQuery/multibytes_delimiter1/table1.tbl
@@ -0,0 +1,5 @@
+1||ooo||1.1||a
+2||ppp||2.3||
+3||qqq||||
+4||||4.5||
+||xxx||5.6||e
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tajo/blob/7f056955/tajo-core/src/test/resources/dataset/TestSelectQuery/multibytes_delimiter2/table2.tbl
----------------------------------------------------------------------
diff --git a/tajo-core/src/test/resources/dataset/TestSelectQuery/multibytes_delimiter2/table2.tbl b/tajo-core/src/test/resources/dataset/TestSelectQuery/multibytes_delimiter2/table2.tbl
new file mode 100644
index 0000000..b26cdfd
--- /dev/null
+++ b/tajo-core/src/test/resources/dataset/TestSelectQuery/multibytes_delimiter2/table2.tbl
@@ -0,0 +1,5 @@
+1ㅎoooㅎ1.1ㅎa
+2ㅎpppㅎ2.3ㅎ
+3ㅎqqqㅎㅎ
+4ㅎㅎ4.5ㅎ
+ㅎxxxㅎ5.6ㅎe
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tajo/blob/7f056955/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table1_ddl.sql
----------------------------------------------------------------------
diff --git a/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table1_ddl.sql b/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table1_ddl.sql
new file mode 100644
index 0000000..2b4a2ce
--- /dev/null
+++ b/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table1_ddl.sql
@@ -0,0 +1,3 @@
+create external table table1 (id int, name text, score float, type text) using csv
+with ('csvfile.delimiter'='||', 'csvfile.null'='NULL') location ${table.path};
+
http://git-wip-us.apache.org/repos/asf/tajo/blob/7f056955/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table2_ddl.sql
----------------------------------------------------------------------
diff --git a/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table2_ddl.sql b/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table2_ddl.sql
new file mode 100644
index 0000000..d918ac6
--- /dev/null
+++ b/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table2_ddl.sql
@@ -0,0 +1,3 @@
+create external table table2 (id int, name text, score float, type text) using csv
+with ('csvfile.delimiter'='ㅎ', 'csvfile.null'='NULL') location ${table.path};
+
http://git-wip-us.apache.org/repos/asf/tajo/blob/7f056955/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter1.sql
----------------------------------------------------------------------
diff --git a/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter1.sql b/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter1.sql
new file mode 100644
index 0000000..bd6b02d
--- /dev/null
+++ b/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter1.sql
@@ -0,0 +1 @@
+select * from table1;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tajo/blob/7f056955/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter2.sql
----------------------------------------------------------------------
diff --git a/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter2.sql b/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter2.sql
new file mode 100644
index 0000000..66a69ec
--- /dev/null
+++ b/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter2.sql
@@ -0,0 +1 @@
+select * from table2;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tajo/blob/7f056955/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter1.result
----------------------------------------------------------------------
diff --git a/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter1.result b/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter1.result
new file mode 100644
index 0000000..d8d43b1
--- /dev/null
+++ b/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter1.result
@@ -0,0 +1,7 @@
+id,name,score,type
+-------------------------------
+1,ooo,1.1,a
+2,ppp,2.3,
+3,qqq,null,
+4,,4.5,
+null,xxx,5.6,e
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tajo/blob/7f056955/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter2.result
----------------------------------------------------------------------
diff --git a/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter2.result b/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter2.result
new file mode 100644
index 0000000..d8d43b1
--- /dev/null
+++ b/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter2.result
@@ -0,0 +1,7 @@
+id,name,score,type
+-------------------------------
+1,ooo,1.1,a
+2,ppp,2.3,
+3,qqq,null,
+4,,4.5,
+null,xxx,5.6,e
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tajo/blob/7f056955/tajo-storage/tajo-storage-common/src/test/java/org/apache/tajo/storage/TestLazyTuple.java
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-common/src/test/java/org/apache/tajo/storage/TestLazyTuple.java b/tajo-storage/tajo-storage-common/src/test/java/org/apache/tajo/storage/TestLazyTuple.java
index c6149f7..fccaf2a 100644
--- a/tajo-storage/tajo-storage-common/src/test/java/org/apache/tajo/storage/TestLazyTuple.java
+++ b/tajo-storage/tajo-storage-common/src/test/java/org/apache/tajo/storage/TestLazyTuple.java
@@ -69,7 +69,7 @@ public class TestLazyTuple {
sb.append(DatumFactory.createInet4("192.168.0.1")).append('|');
sb.append(new String(nullbytes)).append('|');
sb.append(NullDatum.get());
- textRow = BytesUtils.splitPreserveAllTokens(sb.toString().getBytes(), '|');
+ textRow = BytesUtils.splitPreserveAllTokens(sb.toString().getBytes(), '|', 13);
serde = new TextSerializerDeserializer();
}
@@ -220,7 +220,7 @@ public class TestLazyTuple {
@Test
public void testInvalidNumber() {
- byte[][] bytes = BytesUtils.splitPreserveAllTokens(" 1| |2 ||".getBytes(), '|');
+ byte[][] bytes = BytesUtils.splitPreserveAllTokens(" 1| |2 ||".getBytes(), '|', 5);
Schema schema = new Schema();
schema.addColumn("col1", TajoDataTypes.Type.INT2);
schema.addColumn("col2", TajoDataTypes.Type.INT4);
http://git-wip-us.apache.org/repos/asf/tajo/blob/7f056955/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/ColumnMapping.java
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/ColumnMapping.java b/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/ColumnMapping.java
index 7ddf09a..c3094fd 100644
--- a/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/ColumnMapping.java
+++ b/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/ColumnMapping.java
@@ -85,7 +85,7 @@ public class ColumnMapping {
for (String eachToken: columnMappingTokens) {
mappingColumns[index] = new byte[2][];
- byte[][] mappingTokens = BytesUtils.splitPreserveAllTokens(eachToken.trim().getBytes(), ':');
+ byte[][] mappingTokens = BytesUtils.splitTrivial(eachToken.trim().getBytes(), (byte)':');
if (mappingTokens.length == 3) {
if (mappingTokens[0].length == 0) {
@@ -230,6 +230,10 @@ public class ColumnMapping {
return numRowKeys;
}
+ public int getNumColumns() {
+ return schema.size();
+ }
+
public boolean[] getIsColumnValues() {
return isColumnValues;
}
http://git-wip-us.apache.org/repos/asf/tajo/blob/7f056955/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseScanner.java
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseScanner.java b/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseScanner.java
index 5cae077..ab56252 100644
--- a/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseScanner.java
+++ b/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseScanner.java
@@ -218,7 +218,8 @@ public class HBaseScanner implements Scanner {
if (!isBinaryColumns[fieldId] && rowKeyFieldIndexes[fieldId] >= 0) {
int rowKeyFieldIndex = rowKeyFieldIndexes[fieldId];
- byte[][] rowKeyFields = BytesUtils.splitPreserveAllTokens(value, rowKeyDelimiter);
+ byte[][] rowKeyFields = BytesUtils.splitPreserveAllTokens(
+ value, rowKeyDelimiter, columnMapping.getNumColumns());
if (rowKeyFields.length < rowKeyFieldIndex) {
return NullDatum.get();
http://git-wip-us.apache.org/repos/asf/tajo/blob/7f056955/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseStorageManager.java
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseStorageManager.java b/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseStorageManager.java
index 2a635d8..a9e5bde 100644
--- a/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseStorageManager.java
+++ b/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseStorageManager.java
@@ -1015,7 +1015,8 @@ public class HBaseStorageManager extends StorageManager {
Tuple endTuple = new VTuple(sortSpecs.length);
byte[][] rowKeyFields;
if (sortSpecs.length > 1) {
- byte[][] splitValues = BytesUtils.splitPreserveAllTokens(eachEndKey, columnMapping.getRowKeyDelimiter());
+ byte[][] splitValues = BytesUtils.splitPreserveAllTokens(
+ eachEndKey, columnMapping.getRowKeyDelimiter(), columnMapping.getNumColumns());
if (splitValues.length == sortSpecs.length) {
rowKeyFields = splitValues;
} else {
http://git-wip-us.apache.org/repos/asf/tajo/blob/7f056955/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/CSVFile.java
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/CSVFile.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/CSVFile.java
index dd5366c..bb628b1 100644
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/CSVFile.java
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/CSVFile.java
@@ -61,7 +61,7 @@ public class CSVFile {
private FSDataOutputStream fos;
private DataOutputStream outputStream;
private CompressionOutputStream deflateFilter;
- private char delimiter;
+ private byte[] delimiter;
private TableStatistics stats = null;
private Compressor compressor;
private CompressionCodecFactory codecFactory;
@@ -83,7 +83,7 @@ public class CSVFile {
this.meta = meta;
this.schema = schema;
this.delimiter = StringEscapeUtils.unescapeJava(
- this.meta.getOption(StorageConstants.TEXT_DELIMITER, StorageConstants.DEFAULT_FIELD_DELIMITER)).charAt(0);
+ this.meta.getOption(StorageConstants.TEXT_DELIMITER, StorageConstants.DEFAULT_FIELD_DELIMITER)).getBytes();
this.columnNum = schema.size();
@@ -169,8 +169,8 @@ public class CSVFile {
rowBytes += serde.serialize(schema.getColumn(i), datum, os, nullChars);
if(columnNum - 1 > i){
- os.write((byte) delimiter);
- rowBytes += 1;
+ os.write(delimiter);
+ rowBytes += delimiter.length;
}
if (isShuffle) {
// it is to calculate min/max values, and it is only used for the intermediate file.
@@ -265,7 +265,7 @@ public class CSVFile {
//Delimiter
this.delimiter = StringEscapeUtils.unescapeJava(
meta.getOption(StorageConstants.TEXT_DELIMITER,
- meta.getOption(StorageConstants.CSVFILE_DELIMITER, StorageConstants.DEFAULT_FIELD_DELIMITER))).charAt(0);
+ meta.getOption(StorageConstants.CSVFILE_DELIMITER, StorageConstants.DEFAULT_FIELD_DELIMITER))).getBytes();
String nullCharacters = StringEscapeUtils.unescapeJava(
meta.getOption(StorageConstants.TEXT_NULL,
@@ -279,7 +279,7 @@ public class CSVFile {
}
private final static int DEFAULT_PAGE_SIZE = 256 * 1024;
- private char delimiter;
+ private byte[] delimiter;
private FileSystem fs;
private FSDataInputStream fis;
private InputStream is; //decompressd stream
@@ -476,7 +476,7 @@ public class CSVFile {
}
byte[][] cells = BytesUtils.splitPreserveAllTokens(buffer.getData(), startOffsets.get(currentIdx),
- rowLengthList.get(currentIdx), delimiter, targetColumnIndexes);
+ rowLengthList.get(currentIdx), delimiter, targetColumnIndexes, schema.size());
currentIdx++;
return new LazyTuple(schema, cells, offset, nullChars, serde);
} catch (Throwable t) {
http://git-wip-us.apache.org/repos/asf/tajo/blob/7f056955/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/sequencefile/SequenceFileScanner.java
----------------------------------------------------------------------
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/sequencefile/SequenceFileScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/sequencefile/SequenceFileScanner.java
index 74563ff..92a041c 100644
--- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/sequencefile/SequenceFileScanner.java
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/sequencefile/SequenceFileScanner.java
@@ -171,7 +171,8 @@ public class SequenceFileScanner extends FileScanner {
} else {
Text text = new Text();
reader.getCurrentValue(text);
- cells = BytesUtils.splitPreserveAllTokens(text.getBytes(), delimiter, projectionMap);
+ cells = BytesUtils.splitPreserveAllTokens(text.getBytes(),
+ delimiter, projectionMap, schema.getColumns().size());
totalBytes += (long)text.getBytes().length;
tuple = new LazyTuple(schema, cells, 0, nullChars, serde);
}