You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by ni...@apache.org on 2017/10/25 22:43:17 UTC
systemml git commit: [SYSTEMML-540] Include the memory requirement of each layer in the summary table of Caffe2DML

Repository: systemml
Updated Branches:
  refs/heads/master 8f4ecdce2 -> 881caa9ba


[SYSTEMML-540] Include the memory requirement of each layer in the summary table of Caffe2DML

- This helps the user to estimate the batch size she should set for
  optimal performance.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/881caa9b
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/881caa9b
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/881caa9b

Branch: refs/heads/master
Commit: 881caa9ba508b029f72f27d468bb33805704c7cb
Parents: 8f4ecdc
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Wed Oct 25 15:40:21 2017 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Wed Oct 25 15:42:04 2017 -0700

----------------------------------------------------------------------
 docs/beginners-guide-caffe2dml.md               | 37 +++++++++-------
 .../org/apache/sysml/api/dl/Caffe2DML.scala     | 46 ++++++++++++++++++--
 2 files changed, 63 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/881caa9b/docs/beginners-guide-caffe2dml.md
----------------------------------------------------------------------
diff --git a/docs/beginners-guide-caffe2dml.md b/docs/beginners-guide-caffe2dml.md
index 4d6b7fd..8814283 100644
--- a/docs/beginners-guide-caffe2dml.md
+++ b/docs/beginners-guide-caffe2dml.md
@@ -64,22 +64,27 @@ lenet.summary()
 Output:
 
 ```
-+-----+---------------+--------------+------------+---------+-----------+---------+
-| Name|           Type|        Output|      Weight|     Bias|        Top|   Bottom|
-+-----+---------------+--------------+------------+---------+-----------+---------+
-|mnist|           Data| (, 1, 28, 28)|            |         |mnist,mnist|         |
-|conv1|    Convolution|(, 32, 28, 28)|   [32 X 25]| [32 X 1]|      conv1|    mnist|
-|relu1|           ReLU|(, 32, 28, 28)|            |         |      relu1|    conv1|
-|pool1|        Pooling|(, 32, 14, 14)|            |         |      pool1|    relu1|
-|conv2|    Convolution|(, 64, 14, 14)|  [64 X 800]| [64 X 1]|      conv2|    pool1|
-|relu2|           ReLU|(, 64, 14, 14)|            |         |      relu2|    conv2|
-|pool2|        Pooling|  (, 64, 7, 7)|            |         |      pool2|    relu2|
-|  ip1|   InnerProduct| (, 512, 1, 1)|[3136 X 512]|[1 X 512]|        ip1|    pool2|
-|relu3|           ReLU| (, 512, 1, 1)|            |         |      relu3|      ip1|
-|drop1|        Dropout| (, 512, 1, 1)|            |         |      drop1|    relu3|
-|  ip2|   InnerProduct|  (, 10, 1, 1)|  [512 X 10]| [1 X 10]|        ip2|    drop1|
-| loss|SoftmaxWithLoss|  (, 10, 1, 1)|            |         |       loss|ip2,mnist|
-+-----+---------------+--------------+------------+---------+-----------+---------+
++-----+---------------+--------------+------------+---------+-----------+---------+--------------------+
+| Name|           Type|        Output|      Weight|     Bias|        Top|   Bottom|Memory* (train/test)|
++-----+---------------+--------------+------------+---------+-----------+---------+--------------------+
+|mnist|           Data| (, 1, 28, 28)|            |         |mnist,mnist|         |                 1/0|
+|conv1|    Convolution|(, 32, 28, 28)|   [32 X 25]| [32 X 1]|      conv1|    mnist|               25/12|
+|relu1|           ReLU|(, 32, 28, 28)|            |         |      relu1|    conv1|               25/12|
+|pool1|        Pooling|(, 32, 14, 14)|            |         |      pool1|    relu1|                 6/3|
+|conv2|    Convolution|(, 64, 14, 14)|  [64 X 800]| [64 X 1]|      conv2|    pool1|                38/7|
+|relu2|           ReLU|(, 64, 14, 14)|            |         |      relu2|    conv2|                12/6|
+|pool2|        Pooling|  (, 64, 7, 7)|            |         |      pool2|    relu2|                 3/2|
+|  ip1|   InnerProduct| (, 512, 1, 1)|[3136 X 512]|[1 X 512]|        ip1|    pool2|              797/13|
+|relu3|           ReLU| (, 512, 1, 1)|            |         |      relu3|      ip1|                 1/0|
+|drop1|        Dropout| (, 512, 1, 1)|            |         |      drop1|    relu3|                 1/0|
+|  ip2|   InnerProduct|  (, 10, 1, 1)|  [512 X 10]| [1 X 10]|        ip2|    drop1|                 3/0|
+| loss|SoftmaxWithLoss|  (, 10, 1, 1)|            |         |       loss|ip2,mnist|                 0/0|
++-----+---------------+--------------+------------+---------+-----------+---------+--------------------+
+
+Total number of layer outputs/errors/weights/bias/gradients: 5568768/5568768/1662752/618/106455680
+Total memory requirements for parameters* for train/test: 910/55
+[Advanced] Key network statistics to compute intermediate CP overhead batchSize/maxThreads/1-thread im2col*(sum, max)/1-thread reshape_col*(sum, max): 64/48/(1, 1)/(0, 0).
+* => memory in megabytes assuming the parameters are in double precision and in dense format.
 ``` 
 
 To train the above lenet model, we use the MNIST dataset. 

http://git-wip-us.apache.org/repos/asf/systemml/blob/881caa9b/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala
----------------------------------------------------------------------
diff --git a/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala b/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala
index 03b9a3b..56be5d6 100644
--- a/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala
+++ b/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala
@@ -50,6 +50,8 @@ import java.util.Random
 import org.apache.commons.logging.Log
 import org.apache.commons.logging.LogFactory
 import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer
+import org.apache.sysml.hops.OptimizerUtils
+import java.lang.Double
 
 /***************************************************************************************
 DESIGN OF CAFFE2DML:
@@ -306,10 +308,21 @@ class Caffe2DML(val sc: SparkContext,
   def getTrainAlgo(): String = if (inputs.containsKey("$train_algo")) inputs.get("$train_algo") else "minibatch"
   def getTestAlgo(): String  = if (inputs.containsKey("$test_algo")) inputs.get("$test_algo") else "minibatch"
 
+  private def getMemInBytes(l:CaffeLayer, batchSize:Int, isTraining:Boolean):Long = {
+    val numLayerOutput = l.outputShape._1.toLong * l.outputShape._2.toLong * l.outputShape._3.toLong  * batchSize
+    val numLayerError = numLayerOutput
+    val numLayerWeights = if(l.weightShape != null) l.weightShape()(0).toLong * l.weightShape()(1).toLong else 0
+    val numLayerBias = if(l.biasShape != null)l.biasShape()(0).toLong * l.biasShape()(1).toLong else 0
+    val numLayerGradients = (numLayerWeights + numLayerBias) * batchSize
+    if(isTraining) (numLayerOutput + numLayerError + numLayerWeights + numLayerBias + numLayerGradients)*Double.BYTES
+    else (numLayerOutput + numLayerWeights + numLayerBias)*Double.BYTES
+  }
   def summary(sparkSession: org.apache.spark.sql.SparkSession): Unit = {
-    val header = Seq("Name", "Type", "Output", "Weight", "Bias", "Top", "Bottom")
-    val entries = net.getLayers
-      .map(l => (l, net.getCaffeLayer(l)))
+    val layers = net.getLayers .map(l => (l, net.getCaffeLayer(l)))
+    val numDataLayers = layers.filter(l => l._2.isInstanceOf[Data]).length
+    val batchSize = if(numDataLayers == 1) layers.filter(l => l._2.isInstanceOf[Data]).map(l => l._2.param.getDataParam.getBatchSize).get(0) else -1 
+    val header = Seq("Name", "Type", "Output", "Weight", "Bias", "Top", "Bottom", "Memory* (train/test)")
+    val entries = layers
       .map(l => {
         val layer = l._2
         (l._1,
@@ -318,10 +331,35 @@ class Caffe2DML(val sc: SparkContext,
          if (layer.weightShape != null) "[" + layer.weightShape()(0) + " X " + layer.weightShape()(1) + "]" else "",
          if (layer.biasShape != null) "[" + layer.biasShape()(0) + " X " + layer.biasShape()(1) + "]" else "",
          layer.param.getTopList.mkString(","),
-         layer.param.getBottomList.mkString(","))
+         layer.param.getBottomList.mkString(","), 
+         OptimizerUtils.toMB(getMemInBytes(l._2, batchSize, true)) + "/" + OptimizerUtils.toMB(getMemInBytes(l._2, batchSize, false))
+        )
       })
     import sparkSession.implicits._
     sc.parallelize(entries).toDF(header: _*).show(net.getLayers.size)
+    
+    val numLayerOutput = layers.map(l => l._2.outputShape._1.toLong * l._2.outputShape._2.toLong * l._2.outputShape._3.toLong).sum * batchSize
+    val numLayerError = numLayerOutput
+    val numLayerWeights = layers.map(l => if(l._2.weightShape != null) l._2.weightShape()(0).toLong * l._2.weightShape()(1).toLong else 0).sum
+    val numLayerBias = layers.map(l => if(l._2.biasShape != null) l._2.biasShape()(0).toLong * l._2.biasShape()(1).toLong else 0).sum
+    val numLayerGradients = (numLayerWeights + numLayerBias) * batchSize
+    val convLayers = layers.filter(l => l._2.isInstanceOf[Convolution]).map(l => l._2.asInstanceOf[Convolution])
+    val crspq = convLayers.map(l => l.numChannels.toLong*l.kernel_h.toLong*l.kernel_w.toLong*l.outputShape._2.toLong*l.outputShape._3.toLong) 
+    val kpq = convLayers.map(l => l.outputShape._1.toLong*l.outputShape._2.toLong*l.outputShape._3.toLong)
+    
+    if(getTrainAlgo().equals("minibatch") && getTestAlgo().equals("minibatch")) {
+      System.out.println("Total number of layer outputs/errors/weights/bias/gradients: " + numLayerOutput + "/" + numLayerError +
+        "/" + numLayerWeights + "/" + numLayerBias + "/" + numLayerGradients)
+      System.out.println("Total memory requirements for parameters* for train/test: " +
+        OptimizerUtils.toMB(layers.map(l => getMemInBytes(l._2, batchSize, true)).sum) + "/" + 
+        OptimizerUtils.toMB(layers.map(l => getMemInBytes(l._2, batchSize, false)).sum))
+      System.out.println("[Advanced] Key network statistics to compute intermediate CP overhead " + 
+        "batchSize/maxThreads/1-thread im2col*(sum, max)/1-thread reshape_col*(sum, max): " + 
+        batchSize + "/" + OptimizerUtils.getConstrainedNumThreads(-1) + "/(" +
+        OptimizerUtils.toMB(crspq.sum*Double.BYTES) + ", " + OptimizerUtils.toMB(crspq.max*Double.BYTES) + ")/(" + 
+        OptimizerUtils.toMB(kpq.sum*Double.BYTES) + ", " + OptimizerUtils.toMB(kpq.max*Double.BYTES) + ").")
+    }
+    System.out.println("* => memory in megabytes assuming the parameters are in double precision and in dense format.")
   }
 
   // ================================================================================================