You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@gobblin.apache.org by ab...@apache.org on 2018/06/14 23:06:16 UTC
incubator-gobblin git commit: [GOBBLIN-98] Orc records get dropped
and duplicated
Repository: incubator-gobblin
Updated Branches:
refs/heads/master a02073e9d -> ab7dfe622
[GOBBLIN-98] Orc records get dropped and duplicated
Closes #2283 from PraTrick/GOBBLIN-98
Project: http://git-wip-us.apache.org/repos/asf/incubator-gobblin/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-gobblin/commit/ab7dfe62
Tree: http://git-wip-us.apache.org/repos/asf/incubator-gobblin/tree/ab7dfe62
Diff: http://git-wip-us.apache.org/repos/asf/incubator-gobblin/diff/ab7dfe62
Branch: refs/heads/master
Commit: ab7dfe62268c8fbe63a6959fb71cb205f6641f50
Parents: a02073e
Author: Prateek Gupta <pr...@myntra.com>
Authored: Thu Jun 14 16:06:09 2018 -0700
Committer: Abhishek Tiwari <ab...@gmail.com>
Committed: Thu Jun 14 16:06:09 2018 -0700
----------------------------------------------------------------------
.../converter/serde/OrcSerDeWrapper.java | 48 ++++++++++++++++++++
.../src/test/resources/serde/serde.properties | 4 +-
2 files changed, 51 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-gobblin/blob/ab7dfe62/gobblin-core/src/main/java/org/apache/gobblin/converter/serde/OrcSerDeWrapper.java
----------------------------------------------------------------------
diff --git a/gobblin-core/src/main/java/org/apache/gobblin/converter/serde/OrcSerDeWrapper.java b/gobblin-core/src/main/java/org/apache/gobblin/converter/serde/OrcSerDeWrapper.java
new file mode 100644
index 0000000..bd8a735
--- /dev/null
+++ b/gobblin-core/src/main/java/org/apache/gobblin/converter/serde/OrcSerDeWrapper.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package gobblin.converter.serde;
+
+import org.apache.hadoop.hive.ql.io.orc.OrcSerde;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
+import org.apache.hadoop.io.Writable;
+
+import java.util.ArrayList;
+
+/**
+ * Hive's {@link OrcSerde} caches converted records: the {@link OrcSerde} holds a single
+ * {@link org.apache.hadoop.hive.ql.io.orc.OrcSerde.OrcSerdeRow}, and every time the
+ * {@link org.apache.hadoop.hive.serde2.Serializer#serialize(Object, ObjectInspector)} method is called, that same
+ * object is re-used.
+ *
+ * The problem is that {@link org.apache.hadoop.hive.ql.io.orc.OrcSerde.OrcSerdeRow} is package-private and has no
+ * public constructor, so no copy can be made. This would be fine if the {@link org.apache.hadoop.hive.ql.io.orc.OrcSerde.OrcSerdeRow}
+ * were immediately written out, but all Gobblin jobs have a buffer that the writer reads from. This buffering can cause
+ * race conditions where records get dropped and duplicated.
+ *
+ * @author Prateek Gupta
+ */
+
+public class OrcSerDeWrapper extends OrcSerde {
+
+  // Most recently seen source inspector and the standard inspector derived from it.
+  // Remembered (by identity) so the standard inspector is not re-derived for every record.
+  private ObjectInspector lastInspector;
+  private ObjectInspector lastStandardInspector;
+
+  /**
+   * Serializes a copy of {@code realRow} instead of {@code realRow} itself, so the returned
+   * {@link Writable} does not keep a reference to a row object that the caller may re-use.
+   *
+   * <p>The copy is produced by {@link ObjectInspectorUtils#copyToStandardObject(Object, ObjectInspector)}
+   * and is therefore handed to {@link OrcSerde#serialize(Object, ObjectInspector)} together with the
+   * matching inspector from {@link ObjectInspectorUtils#getStandardObjectInspector(ObjectInspector)},
+   * not with the inspector of the original row.
+   *
+   * @param realRow the row to serialize
+   * @param inspector the inspector describing {@code realRow}
+   * @return the result of {@link OrcSerde#serialize(Object, ObjectInspector)} applied to the copied row
+   */
+  @Override
+  public Writable serialize(Object realRow, ObjectInspector inspector) {
+    Object realRowClone = ObjectInspectorUtils.copyToStandardObject(realRow, inspector);
+    // The clone has the "standard" object layout, so it must be paired with the standard
+    // inspector; the source inspector only knows how to read the original representation.
+    if (inspector != this.lastInspector) {
+      this.lastStandardInspector = ObjectInspectorUtils.getStandardObjectInspector(inspector);
+      this.lastInspector = inspector;
+    }
+    return super.serialize(realRowClone, this.lastStandardInspector);
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-gobblin/blob/ab7dfe62/gobblin-core/src/test/resources/serde/serde.properties
----------------------------------------------------------------------
diff --git a/gobblin-core/src/test/resources/serde/serde.properties b/gobblin-core/src/test/resources/serde/serde.properties
index 496ddc6..df42d53 100644
--- a/gobblin-core/src/test/resources/serde/serde.properties
+++ b/gobblin-core/src/test/resources/serde/serde.properties
@@ -17,7 +17,9 @@
avro.schema.url=gobblin-core/src/test/resources/serde/serde.avsc
source.hadoop.file.input.paths=gobblin-core/src/test/resources/serde/serde.avro
-serde.serializer.type=ORC
+serde.serializer.type=gobblin.converter.serde.OrcSerDeWrapper
+serde.serializer.input.format.type=org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+serde.serializer.output.format.type=org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
serde.deserializer.type=AVRO
writer.staging.dir=gobblin-core/src/test/resources/serde/output-staging
writer.output.dir=gobblin-core/src/test/resources/serde/output