Posted to commits@hive.apache.org by sa...@apache.org on 2019/10/24 04:29:32 UTC

[hive] branch master updated: HIVE-22360: MultiDelimitSerDe returns wrong results in last column when the loaded file has more columns than those in table schema (Shubham Chaurasia, reviewed by Sankar Hariappan)

This is an automated email from the ASF dual-hosted git repository.

sankarh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new 843cfd9  HIVE-22360: MultiDelimitSerDe returns wrong results in last column when the loaded file has more columns than those in table schema (Shubham Chaurasia, reviewed by Sankar Hariappan)
843cfd9 is described below

commit 843cfd9f53d21bb5545bccb4bb5bb9a627ee1777
Author: Shubham Chaurasia <sc...@cloudera.com>
AuthorDate: Thu Oct 24 09:59:06 2019 +0530

    HIVE-22360: MultiDelimitSerDe returns wrong results in last column when the loaded file has more columns than those in table schema (Shubham Chaurasia, reviewed by Sankar Hariappan)
    
    Signed-off-by: Sankar Hariappan <sa...@apache.org>
---
 data/files/t11_csv_serde.csv                       |  10 +
 data/files/t1_multi_delimit.csv                    |  10 +
 data/files/t2_multi_delimit.csv                    |   4 +
 data/files/t3_multi_delimit.csv                    |  10 +
 .../queries/clientpositive/serde_multi_delimit.q   |  65 ++++++
 .../clientpositive/serde_multi_delimit.q.out       | 232 +++++++++++++++++++++
 .../hadoop/hive/serde2/MultiDelimitSerDe.java      |  11 +-
 .../apache/hadoop/hive/serde2/lazy/LazyStruct.java |  56 ++---
 8 files changed, 361 insertions(+), 37 deletions(-)
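
For context on the fix: the old LazyStruct#parseMultiDelimit always pinned startPosition[fields.length] to length + 1, so when an input row carried more delimited fields than the table has columns, the byte range of the last column ran to the end of the row and picked up the extra fields. The patched code takes every boundary, including the last one, from the next match of the multi-char delimiter in the original row and maps it back into the "\1"-replaced byte array. Below is a standalone sketch of that arithmetic (not part of the commit; the class and variable names are illustrative) for one row of t2_multi_delimit.csv:

    import java.util.Arrays;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class MultiDelimOffsetsSketch {
      public static void main(String[] args) {
        String rawRow = "1^,1^,^,0^,0^,0";   // 6 fields, but the schema has only 5 columns
        String fieldDelim = "^,";            // multi-char field delimiter
        String replacementDelim = "\1";      // what the serde substitutes before lazy parsing
        int numColumns = 5;

        // The serde hands LazyStruct the row with "^," already replaced by "\1",
        // so every offset must be expressed in that shorter representation.
        String replaced = rawRow.replaceAll(Pattern.quote(fieldDelim), replacementDelim);

        Pattern delimPattern = Pattern.compile(fieldDelim, Pattern.LITERAL);
        int extraBytesInDelim = fieldDelim.length() - replacementDelim.length();

        int[] startPosition = new int[numColumns + 1];   // startPosition[0] stays 0
        Matcher matcher = delimPattern.matcher(rawRow);
        for (int i = 1; i <= numColumns; i++) {
          startPosition[i] = matcher.find()
              ? matcher.start() + fieldDelim.length() - i * extraBytesInDelim  // map into replaced offsets
              : replaced.length() + 1;                                         // field missing from the row
        }

        // Prints [0, 2, 4, 5, 7, 9]: the last boundary stops at the fifth delimiter
        // instead of the end of the row, so colE no longer absorbs the extra sixth field.
        System.out.println(Arrays.toString(startPosition));
      }
    }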

diff --git a/data/files/t11_csv_serde.csv b/data/files/t11_csv_serde.csv
new file mode 100644
index 0000000..6e70609
--- /dev/null
+++ b/data/files/t11_csv_serde.csv
@@ -0,0 +1,10 @@
+1,1,,0,0
+2,1,,0,1
+3,1,,0,0
+4,1,,0,1
+5,5
+
+7777
+8,8,,8,8,8
+9,9,,9,9,9,9,,9,9,9
+10101010
\ No newline at end of file
diff --git a/data/files/t1_multi_delimit.csv b/data/files/t1_multi_delimit.csv
new file mode 100644
index 0000000..6c4e729
--- /dev/null
+++ b/data/files/t1_multi_delimit.csv
@@ -0,0 +1,10 @@
+1^,1^,^,0^,0
+2^,1^,^,0^,1
+3^,1^,^,0^,0
+4^,1^,^,0^,1
+5^,5
+
+7777
+8^,8^,^,8^,8^,8
+9^,9^,^,9^,9^,9^,9^,^,9^,9^,9
+10101010
\ No newline at end of file
diff --git a/data/files/t2_multi_delimit.csv b/data/files/t2_multi_delimit.csv
new file mode 100644
index 0000000..0dd42e1
--- /dev/null
+++ b/data/files/t2_multi_delimit.csv
@@ -0,0 +1,4 @@
+1^,1^,^,0^,0^,0
+2^,1^,^,0^,1^,0
+3^,1^,^,0^,0^,0
+4^,1^,^,0^,1^,0
diff --git a/data/files/t3_multi_delimit.csv b/data/files/t3_multi_delimit.csv
new file mode 100644
index 0000000..8c49f6f
--- /dev/null
+++ b/data/files/t3_multi_delimit.csv
@@ -0,0 +1,10 @@
+1^^^^^1^^^^^^^^^^0^^^^^0
+2^^^^^1^^^^^^^^^^0^^^^^1
+3^^^^^1^^^^^^^^^^0^^^^^0
+4^^^^^1^^^^^^^^^^0^^^^^1
+5^^^^^5
+
+7777
+8^^^^^8^^^^^^^^^^8^^^^^8^^^^^8
+9^^^^^9^^^^^^^^^^9^^^^^9^^^^^9
+10101010
\ No newline at end of file
diff --git a/ql/src/test/queries/clientpositive/serde_multi_delimit.q b/ql/src/test/queries/clientpositive/serde_multi_delimit.q
new file mode 100644
index 0000000..0d85175
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/serde_multi_delimit.q
@@ -0,0 +1,65 @@
+-- in this table, rows of different lengths (different numbers of columns) are loaded
+CREATE TABLE t1_multi_delimit(colA int,
+  colB tinyint,
+  colC timestamp,
+  colD smallint,
+  colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^,")STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH "../../data/files/t1_multi_delimit.csv" INTO TABLE t1_multi_delimit;
+
+SELECT * FROM t1_multi_delimit;
+
+-- in this table, rows of different lengths (different numbers of columns) are loaded, and it uses the CSV serde
+CREATE TABLE t11_csv_serde(colA int,
+  colB tinyint,
+  colC timestamp,
+  colD smallint,
+  colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+WITH SERDEPROPERTIES ("separatorChar" = ",")STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH "../../data/files/t11_csv_serde.csv" INTO TABLE t11_csv_serde;
+
+SELECT * FROM t11_csv_serde;
+
+-- there should not be any difference between MultiDelimitSerDe table and OpenCSVSerde table results
+
+SELECT EXISTS (
+SELECT colA, colB, colC, colD, colE FROM t1_multi_delimit
+MINUS
+SELECT cast(colA as int), cast(colB as tinyint), cast(colC as timestamp), cast(colD as smallint), cast(colE as smallint) FROM t11_csv_serde
+);
+
+-- in this table, a file having an extra column is loaded
+CREATE TABLE t2_multi_delimit(colA int,
+  colB tinyint,
+  colC timestamp,
+  colD smallint,
+  colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^,")STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH "../../data/files/t2_multi_delimit.csv" INTO TABLE t2_multi_delimit;
+
+SELECT * FROM t2_multi_delimit;
+
+-- in this table, a delimiter of 5 characters is used
+CREATE TABLE t3_multi_delimit(colA int,
+  colB tinyint,
+  colC timestamp,
+  colD smallint,
+  colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^^^^^")STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH "../../data/files/t3_multi_delimit.csv" INTO TABLE t3_multi_delimit;
+
+SELECT * FROM t3_multi_delimit;
+
+
+DROP TABLE t1_multi_delimit;
+DROP TABLE t11_csv_serde;
+DROP TABLE t2_multi_delimit;
+DROP TABLE t3_multi_delimit;
\ No newline at end of file
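
The q-file above drives the scenario through HiveQL; the same case can be exercised against the SerDe API directly. A rough unit-test-style sketch (not part of this commit; the property values simply mirror t2_multi_delimit and error handling is omitted):

    import java.util.Properties;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hive.serde.serdeConstants;
    import org.apache.hadoop.hive.serde2.MultiDelimitSerDe;
    import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
    import org.apache.hadoop.io.Text;

    public class MultiDelimitSerDeSketch {
      public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        props.setProperty(serdeConstants.LIST_COLUMNS, "cola,colb,colc,cold,cole");
        props.setProperty(serdeConstants.LIST_COLUMN_TYPES, "int:tinyint:timestamp:smallint:smallint");
        props.setProperty(serdeConstants.FIELD_DELIM, "^,");

        MultiDelimitSerDe serde = new MultiDelimitSerDe();
        serde.initialize(new Configuration(), props);

        // One line of t2_multi_delimit.csv: six "^,"-delimited fields, one more than the schema.
        Object row = serde.deserialize(new Text("1^,1^,^,0^,0^,0"));
        StructObjectInspector soi = (StructObjectInspector) serde.getObjectInspector();

        // With the fix the last column (cole) comes out as 0; previously its byte range
        // extended to the end of the row and included the extra trailing field.
        System.out.println(soi.getStructFieldsDataAsList(row));
      }
    }
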
diff --git a/ql/src/test/results/clientpositive/serde_multi_delimit.q.out b/ql/src/test/results/clientpositive/serde_multi_delimit.q.out
new file mode 100644
index 0000000..f13aa59
--- /dev/null
+++ b/ql/src/test/results/clientpositive/serde_multi_delimit.q.out
@@ -0,0 +1,232 @@
+PREHOOK: query: CREATE TABLE t1_multi_delimit(colA int,
+  colB tinyint,
+  colC timestamp,
+  colD smallint,
+  colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^,")STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t1_multi_delimit
+POSTHOOK: query: CREATE TABLE t1_multi_delimit(colA int,
+  colB tinyint,
+  colC timestamp,
+  colD smallint,
+  colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^,")STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t1_multi_delimit
+PREHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t1_multi_delimit.csv" INTO TABLE t1_multi_delimit
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@t1_multi_delimit
+POSTHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t1_multi_delimit.csv" INTO TABLE t1_multi_delimit
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@t1_multi_delimit
+PREHOOK: query: SELECT * FROM t1_multi_delimit
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1_multi_delimit
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM t1_multi_delimit
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1_multi_delimit
+#### A masked pattern was here ####
+1	1	NULL	0	0
+2	1	NULL	0	1
+3	1	NULL	0	0
+4	1	NULL	0	1
+5	5	NULL	NULL	NULL
+NULL	NULL	NULL	NULL	NULL
+7777	NULL	NULL	NULL	NULL
+8	8	NULL	8	8
+9	9	NULL	9	9
+10101010	NULL	NULL	NULL	NULL
+PREHOOK: query: CREATE TABLE t11_csv_serde(colA int,
+  colB tinyint,
+  colC timestamp,
+  colD smallint,
+  colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+WITH SERDEPROPERTIES ("separatorChar" = ",")STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t11_csv_serde
+POSTHOOK: query: CREATE TABLE t11_csv_serde(colA int,
+  colB tinyint,
+  colC timestamp,
+  colD smallint,
+  colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+WITH SERDEPROPERTIES ("separatorChar" = ",")STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t11_csv_serde
+PREHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t11_csv_serde.csv" INTO TABLE t11_csv_serde
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@t11_csv_serde
+POSTHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t11_csv_serde.csv" INTO TABLE t11_csv_serde
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@t11_csv_serde
+PREHOOK: query: SELECT * FROM t11_csv_serde
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t11_csv_serde
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM t11_csv_serde
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t11_csv_serde
+#### A masked pattern was here ####
+1	1		0	0
+2	1		0	1
+3	1		0	0
+4	1		0	1
+5	5	NULL	NULL	NULL
+NULL	NULL	NULL	NULL	NULL
+7777	NULL	NULL	NULL	NULL
+8	8		8	8
+9	9		9	9
+10101010	NULL	NULL	NULL	NULL
+Warning: Shuffle Join JOIN[30][tables = [$hdt$_0, $hdt$_1]] in Stage 'Stage-1:MAPRED' is a cross product
+PREHOOK: query: SELECT EXISTS (
+SELECT colA, colB, colC, colD, colE FROM t1_multi_delimit
+MINUS
+SELECT cast(colA as int), cast(colB as tinyint), cast(colC as timestamp), cast(colD as smallint), cast(colE as smallint) FROM t11_csv_serde
+)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Input: default@t11_csv_serde
+PREHOOK: Input: default@t1_multi_delimit
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT EXISTS (
+SELECT colA, colB, colC, colD, colE FROM t1_multi_delimit
+MINUS
+SELECT cast(colA as int), cast(colB as tinyint), cast(colC as timestamp), cast(colD as smallint), cast(colE as smallint) FROM t11_csv_serde
+)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Input: default@t11_csv_serde
+POSTHOOK: Input: default@t1_multi_delimit
+#### A masked pattern was here ####
+false
+PREHOOK: query: CREATE TABLE t2_multi_delimit(colA int,
+  colB tinyint,
+  colC timestamp,
+  colD smallint,
+  colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^,")STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t2_multi_delimit
+POSTHOOK: query: CREATE TABLE t2_multi_delimit(colA int,
+  colB tinyint,
+  colC timestamp,
+  colD smallint,
+  colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^,")STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t2_multi_delimit
+PREHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t2_multi_delimit.csv" INTO TABLE t2_multi_delimit
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@t2_multi_delimit
+POSTHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t2_multi_delimit.csv" INTO TABLE t2_multi_delimit
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@t2_multi_delimit
+PREHOOK: query: SELECT * FROM t2_multi_delimit
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t2_multi_delimit
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM t2_multi_delimit
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t2_multi_delimit
+#### A masked pattern was here ####
+1	1	NULL	0	0
+2	1	NULL	0	1
+3	1	NULL	0	0
+4	1	NULL	0	1
+PREHOOK: query: CREATE TABLE t3_multi_delimit(colA int,
+  colB tinyint,
+  colC timestamp,
+  colD smallint,
+  colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^^^^^")STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t3_multi_delimit
+POSTHOOK: query: CREATE TABLE t3_multi_delimit(colA int,
+  colB tinyint,
+  colC timestamp,
+  colD smallint,
+  colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^^^^^")STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t3_multi_delimit
+PREHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t3_multi_delimit.csv" INTO TABLE t3_multi_delimit
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@t3_multi_delimit
+POSTHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t3_multi_delimit.csv" INTO TABLE t3_multi_delimit
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@t3_multi_delimit
+PREHOOK: query: SELECT * FROM t3_multi_delimit
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t3_multi_delimit
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM t3_multi_delimit
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t3_multi_delimit
+#### A masked pattern was here ####
+1	1	NULL	0	0
+2	1	NULL	0	1
+3	1	NULL	0	0
+4	1	NULL	0	1
+5	5	NULL	NULL	NULL
+NULL	NULL	NULL	NULL	NULL
+7777	NULL	NULL	NULL	NULL
+8	8	NULL	8	8
+9	9	NULL	9	9
+10101010	NULL	NULL	NULL	NULL
+PREHOOK: query: DROP TABLE t1_multi_delimit
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@t1_multi_delimit
+PREHOOK: Output: default@t1_multi_delimit
+POSTHOOK: query: DROP TABLE t1_multi_delimit
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@t1_multi_delimit
+POSTHOOK: Output: default@t1_multi_delimit
+PREHOOK: query: DROP TABLE t11_csv_serde
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@t11_csv_serde
+PREHOOK: Output: default@t11_csv_serde
+POSTHOOK: query: DROP TABLE t11_csv_serde
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@t11_csv_serde
+POSTHOOK: Output: default@t11_csv_serde
+PREHOOK: query: DROP TABLE t2_multi_delimit
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@t2_multi_delimit
+PREHOOK: Output: default@t2_multi_delimit
+POSTHOOK: query: DROP TABLE t2_multi_delimit
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@t2_multi_delimit
+POSTHOOK: Output: default@t2_multi_delimit
+PREHOOK: query: DROP TABLE t3_multi_delimit
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@t3_multi_delimit
+PREHOOK: Output: default@t3_multi_delimit
+POSTHOOK: query: DROP TABLE t3_multi_delimit
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@t3_multi_delimit
+POSTHOOK: Output: default@t3_multi_delimit
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/MultiDelimitSerDe.java b/serde/src/java/org/apache/hadoop/hive/serde2/MultiDelimitSerDe.java
index d7d0d87..efe6597 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/MultiDelimitSerDe.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/MultiDelimitSerDe.java
@@ -69,6 +69,9 @@ public class MultiDelimitSerDe extends AbstractEncodingAwareSerDe {
   // Due to HIVE-6404, define our own constant
   private static final String COLLECTION_DELIM = "collection.delim";
 
+  // actual delimiter(fieldDelimited) is replaced by REPLACEMENT_DELIM in row.
+  private static final String REPLACEMENT_DELIM = "\1";
+
   private int numColumns;
   private String fieldDelimited;
   // we don't support using multiple chars as delimiters within complex types
@@ -90,6 +93,8 @@ public class MultiDelimitSerDe extends AbstractEncodingAwareSerDe {
   private final ByteStream.Output serializeStream = new ByteStream.Output();
   // The Writable to return in serialize
   private final Text serializeCache = new Text();
+  // pattern for delimiter
+  private Pattern delimiterPattern;
 
   @Override
   public void initialize(Configuration conf, Properties tbl) throws SerDeException {
@@ -101,7 +106,7 @@ public class MultiDelimitSerDe extends AbstractEncodingAwareSerDe {
     if (fieldDelimited == null || fieldDelimited.isEmpty()) {
       throw new SerDeException("This table does not have serde property \"field.delim\"!");
     }
-
+    delimiterPattern = Pattern.compile(fieldDelimited, Pattern.LITERAL);
     // get the collection separator and map key separator
     // TODO: use serdeConstants.COLLECTION_DELIM when the typo is fixed
     collSep = LazyUtils.getByte(tbl.getProperty(COLLECTION_DELIM),
@@ -154,10 +159,10 @@ public class MultiDelimitSerDe extends AbstractEncodingAwareSerDe {
     } else {
       throw new SerDeException(getClass() + ": expects either BytesWritable or Text object!");
     }
-    byteArrayRef.setData(rowStr.replaceAll(Pattern.quote(fieldDelimited), "\1").getBytes());
+    byteArrayRef.setData(rowStr.replaceAll(Pattern.quote(fieldDelimited), REPLACEMENT_DELIM).getBytes());
     cachedLazyStruct.init(byteArrayRef, 0, byteArrayRef.getData().length);
     // use the multi-char delimiter to parse the lazy struct
-    cachedLazyStruct.parseMultiDelimit(rowStr.getBytes(), fieldDelimited.getBytes());
+    cachedLazyStruct.parseMultiDelimit(rowStr, delimiterPattern, REPLACEMENT_DELIM);
     return cachedLazyStruct;
   }
 
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java
index f066aaa..9163824 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java
@@ -20,8 +20,9 @@ package org.apache.hadoop.hive.serde2.lazy;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
-import com.google.common.primitives.Bytes;
 import org.apache.hadoop.hive.serde2.SerDeException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -278,8 +279,14 @@ public class LazyStruct extends LazyNonPrimitive<LazySimpleStructObjectInspector
     return serializedSize;
   }
 
-  // parse the struct using multi-char delimiter
-  public void parseMultiDelimit(byte[] rawRow, byte[] fieldDelimit) {
+  /**
+   *  Parses rawRow using multi-char delimiter.
+   *
+   * @param rawRow row to be parsed, delimited by fieldDelimit
+   * @param fieldDelimit pattern of multi-char delimiter
+   * @param replacementDelim delimiter with which fieldDelimit has been replaced in rawRow
+   */
+  public void parseMultiDelimit(final String rawRow, final Pattern fieldDelimit, final String replacementDelim) {
     if (rawRow == null || fieldDelimit == null) {
       return;
     }
@@ -292,47 +299,28 @@ public class LazyStruct extends LazyNonPrimitive<LazySimpleStructObjectInspector
       fieldInited = new boolean[fields.length];
       startPosition = new int[fields.length + 1];
     }
-    // the indexes of the delimiters
-    int[] delimitIndexes = findIndexes(rawRow, fieldDelimit);
-    int diff = fieldDelimit.length - 1;
+    final int delimiterLength = fieldDelimit.toString().length();
+    final int extraBytesInDelim = delimiterLength - replacementDelim.length();
+
     // first field always starts from 0, even when missing
     startPosition[0] = 0;
-    for (int i = 1; i < fields.length; i++) {
-      if (delimitIndexes[i - 1] != -1) {
-        int start = delimitIndexes[i - 1] + fieldDelimit.length;
-        startPosition[i] = start - i * diff;
+    Matcher delimiterMatcher = fieldDelimit.matcher(rawRow);
+    for (int i = 1; i <= fields.length; i++) {
+      if (delimiterMatcher.find()) {
+        // MultiDelimitSerDe replaces actual multi-char delimiter by replacementDelim("\1") which reduces the length
+        // however here we are getting rawRow with original multi-char delimiter
+        // due to this we have to subtract those extra chars to match length of LazyNonPrimitive#bytes which are used
+        // while reading data, see uncheckedGetField()
+        startPosition[i] = delimiterMatcher.start() + delimiterLength - i * extraBytesInDelim;
       } else {
         startPosition[i] = length + 1;
       }
     }
-    startPosition[fields.length] = length + 1;
+
     Arrays.fill(fieldInited, false);
     parsed = true;
   }
 
-  // find all the indexes of the sub byte[]
-  private int[] findIndexes(byte[] array, byte[] target) {
-    if (fields.length <= 1) {
-      return new int[0];
-    }
-    int[] indexes = new int[fields.length - 1];
-    Arrays.fill(indexes, -1);
-    indexes[0] = Bytes.indexOf(array, target);
-    if (indexes[0] == -1) {
-      return indexes;
-    }
-    int indexInNewArray = indexes[0];
-    for (int i = 1; i < indexes.length; i++) {
-      array = Arrays.copyOfRange(array, indexInNewArray + target.length, array.length);
-      indexInNewArray = Bytes.indexOf(array, target);
-      if (indexInNewArray == -1) {
-        break;
-      }
-      indexes[i] = indexInNewArray + indexes[i - 1] + target.length;
-    }
-    return indexes;
-  }
-
   /**
    * Return the data in bytes corresponding to this given struct. This is useful specifically in
    * cases where the data is stored in serialized formats like protobufs or thrift and would need
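
To make the before/after behaviour of the last column concrete: the old code always pinned the final boundary, startPosition[fields.length], to length + 1, while the new loop takes it from the next delimiter match. A small follow-on sketch (not part of the patch; offsets are the ones computed for the t2_multi_delimit row used in the earlier sketch) slicing roughly the way uncheckedGetField() reads a field:

    public class LastColumnSketch {
      public static void main(String[] args) {
        // The row "1^,1^,^,0^,0^,0" after the serde has replaced "^," with "\1".
        String replaced = "1\u00011\u0001\u00010\u00010\u00010";
        int lastStart = 7;                    // startPosition[4], start of the fifth column
        int oldEnd = replaced.length() + 1;   // old code: boundary pinned to length + 1
        int newEnd = 9;                       // new code: boundary from the fifth delimiter match

        // uncheckedGetField() reads roughly startPosition[i+1] - startPosition[i] - 1 bytes from startPosition[i].
        System.out.println(replaced.substring(lastStart, oldEnd - 1));  // "0\u00010" -> wrong last column
        System.out.println(replaced.substring(lastStart, newEnd - 1));  // "0"        -> correct
      }
    }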