You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@gobblin.apache.org by ab...@apache.org on 2018/06/14 23:06:16 UTC

incubator-gobblin git commit: [GOBBLIN-98] Orc records get dropped and duplicated

Repository: incubator-gobblin
Updated Branches:
  refs/heads/master a02073e9d -> ab7dfe622


[GOBBLIN-98] Orc records get dropped and duplicated

Closes #2283 from PraTrick/GOBBLIN-98


Project: http://git-wip-us.apache.org/repos/asf/incubator-gobblin/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-gobblin/commit/ab7dfe62
Tree: http://git-wip-us.apache.org/repos/asf/incubator-gobblin/tree/ab7dfe62
Diff: http://git-wip-us.apache.org/repos/asf/incubator-gobblin/diff/ab7dfe62

Branch: refs/heads/master
Commit: ab7dfe62268c8fbe63a6959fb71cb205f6641f50
Parents: a02073e
Author: Prateek Gupta <pr...@myntra.com>
Authored: Thu Jun 14 16:06:09 2018 -0700
Committer: Abhishek Tiwari <ab...@gmail.com>
Committed: Thu Jun 14 16:06:09 2018 -0700

----------------------------------------------------------------------
 .../converter/serde/OrcSerDeWrapper.java        | 48 ++++++++++++++++++++
 .../src/test/resources/serde/serde.properties   |  4 +-
 2 files changed, 51 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-gobblin/blob/ab7dfe62/gobblin-core/src/main/java/org/apache/gobblin/converter/serde/OrcSerDeWrapper.java
----------------------------------------------------------------------
diff --git a/gobblin-core/src/main/java/org/apache/gobblin/converter/serde/OrcSerDeWrapper.java b/gobblin-core/src/main/java/org/apache/gobblin/converter/serde/OrcSerDeWrapper.java
new file mode 100644
index 0000000..bd8a735
--- /dev/null
+++ b/gobblin-core/src/main/java/org/apache/gobblin/converter/serde/OrcSerDeWrapper.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package gobblin.converter.serde;
+
+import org.apache.hadoop.hive.ql.io.orc.OrcSerde;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
+import org.apache.hadoop.io.Writable;
+
+import java.util.ArrayList;
+
+/**
+ * Hive's {@link OrcSerde} caches converted records: the {@link OrcSerde} holds a single
+ * {@link org.apache.hadoop.hive.ql.io.orc.OrcSerde.OrcSerdeRow}, and every time the
+ * {@link org.apache.hadoop.hive.serde2.Serializer#serialize(Object, ObjectInspector)} method is called, that object is
+ * re-used.
+ *
+ * The problem is that {@link org.apache.hadoop.hive.ql.io.orc.OrcSerde.OrcSerdeRow} is package-private and has no
+ * public constructor, so no copy can be made. This would be fine if {@link org.apache.hadoop.hive.ql.io.orc.OrcSerde.OrcSerdeRow}
+ * were immediately written out, but all Gobblin jobs have a buffer that the writer reads from. This buffering can cause
+ * race conditions where records get dropped and duplicated.
+ *
+ * @author Prateek Gupta
+ */
+
+public class OrcSerDeWrapper extends OrcSerde {
+  // Copies the row to a standard object, then delegates to OrcSerde (presumably so a reused input is not aliased; confirm).
+  @Override
+  public Writable serialize(Object realRow, ObjectInspector inspector) {
+      Object realRowClone = ObjectInspectorUtils.copyToStandardObject(realRow, inspector); // copy made via the given inspector
+      return super.serialize(realRowClone, inspector);
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-gobblin/blob/ab7dfe62/gobblin-core/src/test/resources/serde/serde.properties
----------------------------------------------------------------------
diff --git a/gobblin-core/src/test/resources/serde/serde.properties b/gobblin-core/src/test/resources/serde/serde.properties
index 496ddc6..df42d53 100644
--- a/gobblin-core/src/test/resources/serde/serde.properties
+++ b/gobblin-core/src/test/resources/serde/serde.properties
@@ -17,7 +17,9 @@
 
 avro.schema.url=gobblin-core/src/test/resources/serde/serde.avsc
 source.hadoop.file.input.paths=gobblin-core/src/test/resources/serde/serde.avro
-serde.serializer.type=ORC
+serde.serializer.type=gobblin.converter.serde.OrcSerDeWrapper
+serde.serializer.input.format.type=org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+serde.serializer.output.format.type=org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
 serde.deserializer.type=AVRO
 writer.staging.dir=gobblin-core/src/test/resources/serde/output-staging
 writer.output.dir=gobblin-core/src/test/resources/serde/output