You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@griffin.apache.org by gu...@apache.org on 2019/09/13 14:17:26 UTC

[griffin] branch master updated: [GRIFFIN-288] optimize hdfs sink

This is an automated email from the ASF dual-hosted git repository.

guoyp pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/griffin.git


The following commit(s) were added to refs/heads/master by this push:
     new 805b096  [GRIFFIN-288] optimize hdfs sink
805b096 is described below

commit 805b09605f83d00639e8f87edde6b80e04ca5529
Author: wankunde <wa...@163.com>
AuthorDate: Fri Sep 13 22:17:03 2019 +0800

    [GRIFFIN-288] optimize hdfs sink
    
    When we sink records to HDFS, the job may run out of memory (OOM) if the result is huge.
    
    ```
    19/09/06 18:52:39 INFO LineBufferedStream: 19/09/06 18:52:39 ERROR sink.HdfsSink: Java heap space
    19/09/06 18:52:39 INFO LineBufferedStream: java.lang.OutOfMemoryError: Java heap space
    19/09/06 18:52:39 INFO LineBufferedStream:      at java.util.Arrays.copyOf(Arrays.java:3332)
    19/09/06 18:52:39 INFO LineBufferedStream:      at java.lang.AbstractStringBuilder.ensureCapacityInternal(AbstractStringBuilder.java:124)
    19/09/06 18:52:39 INFO LineBufferedStream:      at java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:448)
    19/09/06 18:52:39 INFO LineBufferedStream:      at java.lang.StringBuilder.append(StringBuilder.java:136)
    19/09/06 18:52:39 INFO LineBufferedStream:      at scala.collection.mutable.StringBuilder.append(StringBuilder.scala:200)
    19/09/06 18:52:39 INFO LineBufferedStream:      at scala.collection.TraversableOnce$$anonfun$addString$1.apply(TraversableOnce.scala:364)
    19/09/06 18:52:39 INFO LineBufferedStream:      at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
    19/09/06 18:52:39 INFO LineBufferedStream:      at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:35)
    19/09/06 18:52:39 INFO LineBufferedStream:      at scala.collection.TraversableOnce$class.addString(TraversableOnce.scala:357)
    19/09/06 18:52:39 INFO LineBufferedStream:      at scala.collection.AbstractTraversable.addString(Traversable.scala:104)
    19/09/06 18:52:39 INFO LineBufferedStream:      at scala.collection.TraversableOnce$class.mkString(TraversableOnce.scala:323)
    19/09/06 18:52:39 INFO LineBufferedStream:      at scala.collection.AbstractTraversable.mkString(Traversable.scala:104)
    19/09/06 18:52:39 INFO LineBufferedStream:      at scala.collection.TraversableOnce$class.mkString(TraversableOnce.scala:325)
    19/09/06 18:52:39 INFO LineBufferedStream:      at scala.collection.AbstractTraversable.mkString(Traversable.scala:104)
    19/09/06 18:52:39 INFO LineBufferedStream:      at org.apache.griffin.measure.sink.HdfsSink.org$apache$griffin$measure$sink$HdfsSink$$sinkRecords2Hdfs(HdfsSink.scala:191)
    19/09/06 18:52:39 INFO LineBufferedStream:      at org.apache.griffin.measure.sink.HdfsSink.sinkRecords(HdfsSink.scala:133)
    19/09/06 18:52:39 INFO LineBufferedStream:      at org.apache.griffin.measure.sink.MultiSinks$$anonfun$sinkRecords$1.apply(MultiSinks.scala:63)
    19/09/06 18:52:39 INFO LineBufferedStream:      at org.apache.griffin.measure.sink.MultiSinks$$anonfun$sinkRecords$1.apply(MultiSinks.scala:61)
    19/09/06 18:52:39 INFO LineBufferedStream:      at scala.collection.immutable.List.foreach(List.scala:392)
    19/09/06 18:52:39 INFO LineBufferedStream:      at org.apache.griffin.measure.sink.MultiSinks.sinkRecords(MultiSinks.scala:61)
    19/09/06 18:52:39 INFO LineBufferedStream:      at org.apache.griffin.measure.step.write.RecordWriteStep.execute(RecordWriteStep.scala:49)
    19/09/06 18:52:39 INFO LineBufferedStream:      at org.apache.griffin.measure.step.transform.SparkSqlTransformStep.doExecute(SparkSqlTransformStep.scala:40)
    19/09/06 18:52:39 INFO LineBufferedStream:      at org.apache.griffin.measure.step.transform.TransformStep$class.execute(TransformStep.scala:72)
    19/09/06 18:52:39 INFO LineBufferedStream:      at org.apache.griffin.measure.step.transform.SparkSqlTransformStep.execute(SparkSqlTransformStep.scala:27)
    19/09/06 18:52:39 INFO LineBufferedStream:      at org.apache.griffin.measure.step.transform.TransformStep$$anonfun$2$$anonfun$apply$1.apply$mcV$sp(TransformStep.scala:51)
    19/09/06 18:52:39 INFO LineBufferedStream:      at org.apache.griffin.measure.step.transform.TransformStep$$anonfun$2$$anonfun$apply$1.apply(TransformStep.scala:50)
    19/09/06 18:52:39 INFO LineBufferedStream:      at org.apache.griffin.measure.step.transform.TransformStep$$anonfun$2$$anonfun$apply$1.apply(TransformStep.scala:50)
    19/09/06 18:52:39 INFO LineBufferedStream:      at scala.concurrent.impl.Future$PromiseCompletingRunnable.liftedTree1$1(Future.scala:24)
    19/09/06 18:52:39 INFO LineBufferedStream:      at scala.concurrent.impl.Future$PromiseCompletingRunnable.run(Future.scala:24)
    19/09/06 18:52:39 INFO LineBufferedStream:      at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    19/09/06 18:52:39 INFO LineBufferedStream:      at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    19/09/06 18:52:39 INFO LineBufferedStream:      at java.lang.Thread.run(Thread.java:748)
    ```
    
    Author: wankunde <wa...@163.com>
    
    Closes #533 from wankunde/hdfssink.
---
 .../scala/org/apache/griffin/measure/sink/HdfsSink.scala  | 14 ++++++++++----
 .../scala/org/apache/griffin/measure/utils/HdfsUtil.scala | 15 +++++++++++----
 2 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/measure/src/main/scala/org/apache/griffin/measure/sink/HdfsSink.scala b/measure/src/main/scala/org/apache/griffin/measure/sink/HdfsSink.scala
index 23fb48e..c9be1b0 100644
--- a/measure/src/main/scala/org/apache/griffin/measure/sink/HdfsSink.scala
+++ b/measure/src/main/scala/org/apache/griffin/measure/sink/HdfsSink.scala
@@ -22,7 +22,8 @@ import java.util.Date
 
 import org.apache.spark.rdd.RDD
 
-import org.apache.griffin.measure.utils.{HdfsUtil, JsonUtil}
+import org.apache.griffin.measure.utils.HdfsUtil
+import org.apache.griffin.measure.utils.JsonUtil
 import org.apache.griffin.measure.utils.ParamUtil._
 
 /**
@@ -99,7 +100,9 @@ case class HdfsSink(
   def log(rt: Long, msg: String): Unit = {
     try {
       val logStr = logWrap(rt, msg)
-      HdfsUtil.appendContent(LogFile, logStr)
+      HdfsUtil.withHdfsFile(LogFile) { out =>
+        out.write(logStr.getBytes("utf-8"))
+      }
     } catch {
       case e: Throwable => error(e.getMessage, e)
     }
@@ -188,8 +191,11 @@ case class HdfsSink(
 
   private def sinkRecords2Hdfs(hdfsPath: String, records: Iterable[String]): Unit = {
     try {
-      val recStr = records.mkString("\n")
-      HdfsUtil.writeContent(hdfsPath, recStr)
+      HdfsUtil.withHdfsFile(hdfsPath, false) { out =>
+        records.map { record =>
+          out.write((record + "\n").getBytes("utf-8"))
+        }
+      }
     } catch {
       case e: Throwable => error(e.getMessage, e)
     }
diff --git a/measure/src/main/scala/org/apache/griffin/measure/utils/HdfsUtil.scala b/measure/src/main/scala/org/apache/griffin/measure/utils/HdfsUtil.scala
index ffb7e47..004054f 100644
--- a/measure/src/main/scala/org/apache/griffin/measure/utils/HdfsUtil.scala
+++ b/measure/src/main/scala/org/apache/griffin/measure/utils/HdfsUtil.scala
@@ -66,10 +66,17 @@ object HdfsUtil extends Loggable {
     out.close
   }
 
-  def appendContent(filePath: String, message: String): Unit = {
-    val out = appendOrCreateFile(filePath)
-    out.write(message.getBytes("utf-8"))
-    out.close
+  def withHdfsFile(filePath: String, appendIfExists: Boolean = true)
+                  (f: FSDataOutputStream => Unit): Unit = {
+    val out =
+      if (appendIfExists) {
+        appendOrCreateFile(filePath)
+      } else {
+        createFile(filePath)
+      }
+
+    f(out)
+    out.close()
   }
 
   def createEmptyFile(filePath: String): Unit = {