You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2017/09/02 23:30:24 UTC

[2/2] systemml git commit: [SYSTEMML-1881] Tuning parfor degree of parallelism (over-provisioning)

[SYSTEMML-1881] Tuning parfor degree of parallelism (over-provisioning)

This patch addresses under-utilized CPU resources in parfor contexts.
For example, on Kmeans or MSVM where the number of runs or classes is
small and does not evenly divide the number of hardware threads, we
assigned the remaining parallelism too conservatively. Consider Kmeans
with 10 runs and 16 hardware threads - in this case, we previously
assigned k=10 to parfor and k=1 to the operations in the parfor body,
using only 10 of the 16 hardware threads. This patch fine-tunes this
assignment by slightly over-provisioning CPU resources, which is usually
a good idea due to barriers between operations. We now assign the
remaining operation parallelism with k=round(maxK/parforK), i.e.,
k=round(16/10)=2 in the example above.

On the perftest Kmeans 100K x 1K scenario with 50 classes, 10 runs, and
16 hardware threads, this patch improved performance from 196s to 168s.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/ba73291c
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/ba73291c
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/ba73291c

Branch: refs/heads/master
Commit: ba73291c985d876eaeeb5719623461131bcc7f66
Parents: 2c57cf7
Author: Matthias Boehm <mb...@gmail.com>
Authored: Sat Sep 2 14:30:11 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sat Sep 2 14:30:11 2017 -0700

----------------------------------------------------------------------
 .../parfor/opt/OptimizerRuleBased.java          | 34 +++++++++++++-------
 1 file changed, 22 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/ba73291c/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java
index b8da25a..9dada01 100644
--- a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java
+++ b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java
@@ -1247,12 +1247,12 @@ public class OptimizerRuleBased extends Optimizer
 			
 			//constrain max parfor parallelism by problem size
 			int parforK = (int)((_N<kMax)? _N : kMax);
-
-
+			
 			// if gpu mode is enabled, the amount of parallelism is set to
 			// the smaller of the number of iterations and the number of GPUs
 			// otherwise it default to the number of CPU cores and the
 			// operations are run in CP mode
+			//FIXME rework for nested parfor parallelism and body w/o gpu ops
 			if (DMLScript.USE_ACCELERATOR) {
 				long perGPUBudget = GPUContextPool.initialGPUMemBudget();
 				double maxMemUsage = getMaxCPOnlyBudget(n);
@@ -1264,15 +1264,14 @@ public class OptimizerRuleBased extends Optimizer
 							parforK + "]");
 				}
 			}
-
 			
 			//set parfor degree of parallelism
 			pfpb.setDegreeOfParallelism(parforK);
-			n.setK(parforK);	
+			n.setK(parforK);
 			
 			//distribute remaining parallelism 
-			int remainParforK = (int)Math.ceil(((double)(kMax-parforK+1))/parforK);
-			int remainOpsK = Math.max(_lkmaxCP / parforK, 1);
+			int remainParforK = getRemainingParallelismParFor(kMax, parforK);
+			int remainOpsK = getRemainingParallelismOps(_lkmaxCP, parforK);
 			rAssignRemainingParallelism( n, remainParforK, remainOpsK ); 
 		}
 		else // ExecType.MR/ExecType.SPARK
@@ -1334,13 +1333,13 @@ public class OptimizerRuleBased extends Optimizer
 					//set parfor degree of parallelism
 					long id = c.getID();
 					c.setK(tmpK);
-					ParForProgramBlock pfpb = (ParForProgramBlock) OptTreeConverter
-                                                  .getAbstractPlanMapping().getMappedProg(id)[1];
+					ParForProgramBlock pfpb = (ParForProgramBlock) 
+						OptTreeConverter.getAbstractPlanMapping().getMappedProg(id)[1];
 					pfpb.setDegreeOfParallelism(tmpK);
 					
-					//distribute remaining parallelism 
-					int remainParforK = (int)Math.ceil(((double)(parforK-tmpK+1))/tmpK);
-					int remainOpsK = Math.max(opsK / tmpK, 1);
+					//distribute remaining parallelism
+					int remainParforK = getRemainingParallelismParFor(parforK, tmpK);
+					int remainOpsK = getRemainingParallelismOps(opsK, tmpK);
 					rAssignRemainingParallelism(c, remainParforK, remainOpsK);
 				}
 				else if( c.getNodeType() == NodeType.HOP )
@@ -1387,7 +1386,18 @@ public class OptimizerRuleBased extends Optimizer
 			}
 		}
 	}
-
+	
+	private static int getRemainingParallelismParFor(int parforK, int tmpK) {
+		//compute max remaining parfor parallelism k such that k * tmpK <= parforK
+		return (int)Math.ceil((double)(parforK-tmpK+1) / tmpK);
+	}
+	
+	private static int getRemainingParallelismOps(int opsK, int tmpK) {
+		//compute max remaining operations parallelism k with slight over-provisioning 
+		//such that k * tmpK <= 1.5 * opsK; note that if parfor already exploits the
+		//maximum parallelism, this will not introduce any over-provisioning.
+		return (int)Math.max(Math.round((double)opsK / tmpK), 1);
+	}
 	
 	///////
 	//REWRITE set task partitioner