You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2018/05/19 04:31:22 UTC

systemml git commit: [SYSTEMML-2332] Simplification and performance ALS-predict script

Repository: systemml
Updated Branches:
  refs/heads/master 5069f9781 -> 1b1c3fea3


[SYSTEMML-2332] Simplification and performance ALS-predict script

This patch simplifies the ALS-predict script by removing unnecessary
and even counter-productive script-level "optimizations" that stem from
a time when we did not have sparsity-exploiting fused operators. In a
scenario with 100Kx100 and 20Kx100 factors and 10K queries, this
modification improved the end-to-end runtime (incl. matrix read and
write) from 20.7s to 2.6s.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/1b1c3fea
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/1b1c3fea
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/1b1c3fea

Branch: refs/heads/master
Commit: 1b1c3fea355fc39d4db8a8229fbd2d36c11e4258
Parents: 5069f97
Author: Matthias Boehm <mb...@gmail.com>
Authored: Fri May 18 21:32:55 2018 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Fri May 18 21:32:55 2018 -0700

----------------------------------------------------------------------
 scripts/algorithms/ALS_predict.dml | 49 +++++++++++----------------------
 1 file changed, 16 insertions(+), 33 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/1b1c3fea/scripts/algorithms/ALS_predict.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/ALS_predict.dml b/scripts/algorithms/ALS_predict.dml
index 0af8301..a4e6bd4 100644
--- a/scripts/algorithms/ALS_predict.dml
+++ b/scripts/algorithms/ALS_predict.dml
@@ -19,7 +19,6 @@
 #
 #-------------------------------------------------------------
 
-#  
 # THIS SCRIPT COMPUTES THE RATING/SCORE FOR A GIVEN LIST OF PAIRS: (USER-ID, ITEM-ID) USING 2 FACTOR MATRICES L AND R
 # WE ASSUME THAT ALL USERS HAVE RATED AT LEAST ONCE AND ALL ITEMS HAVE BEEN RATED AT LEAST ONCE.
 # INPUT   PARAMETERS:
@@ -27,7 +26,7 @@
 # NAME    TYPE     DEFAULT  MEANING
 # ---------------------------------------------------------------------------------------------
 # X       String   ---      The input user-id/item-id list
-# Y	  	  String   ---	    The output user-id/item-id/score
+# Y       String   ---      The output user-id/item-id/score
 # L       String   ---      Location of the factor matrix L: user-id x feature-id 
 # R       String   ---      Location to the factor matrix R: feature-id x item-id
 # Vrows   Integer  ---      The number of rows in the original matrix
@@ -37,16 +36,16 @@
 # OUTPUT: Matrix Y containing the predicted ratings for users and items specified in input matrix X
 #
 # HOW TO INVOKE THIS SCRIPT - EXAMPLE:
-# hadoop jar SystemML.jar -f ALS-predict.dml -nvargs Vrows=100000 Vcols=10000 X=INPUT_DIR/X L=OUTPUT_DIR/L R=OUTPUT_DIR/R 
-#											  Y=OUTPUT_DIR/Y fmt=csv
+# hadoop jar SystemML.jar -f ALS-predict.dml -nvargs Vrows=100000 Vcols=10000 \
+#   X=INPUT_DIR/X L=OUTPUT_DIR/L R=OUTPUT_DIR/R Y=OUTPUT_DIR/Y fmt=csv
 
-fileX      = $X;
-fileY 	   = $Y;
-fileL	   = $L;
-fileR      = $R;
-Vrows	   = $Vrows;
-Vcols	   = $Vcols;	
-fmtO       = ifdef ($fmt, "text");    # $fmt="text";
+fileX = $X;
+fileY = $Y;
+fileL = $L;
+fileR = $R;
+Vrows = $Vrows;
+Vcols = $Vcols;
+fmtO  = ifdef ($fmt, "text");
 
 X = read (fileX);
 L = read (fileL);
@@ -56,8 +55,8 @@ R = read (fileR);
 n = nrow (X);
 m = ncol (X);
 
-if (m != 2){
-	stop("The input matrix must have 2 columns: user-id and item-id");
+if (m != 2) {
+  stop("The input matrix must have 2 columns: user-id and item-id");
 }
 
 Lrows = nrow (L);
@@ -66,33 +65,17 @@ Rcols = ncol (R);
 X_user_max = max (X[,1]);
 X_item_max = max (X[,2]);
 
-# initializing Y matrix
-Y = matrix(0, rows = n, cols = 3);
-
 if (X_user_max > Vrows | X_item_max >  Vcols ) {
-	stop ("Predictions cannot be provided. Maximum user-id (item-id) exceed the number of rows (columns) of V.");
+  stop ("Predictions cannot be provided. Maximum user-id (item-id) exceed the number of rows (columns) of V.");
 }
 if (Lrows != Vrows | Rcols !=  Vcols) {
-	stop ("Predictions cannot be provided. Number of rows of L (columns of R) does not match the number of rows (column) of V.");
+  stop ("Predictions cannot be provided. Number of rows of L (columns of R) does not match the number of rows (column) of V.");
 }
 
-
 # user2item table
-ones = matrix (1, rows = n, cols = 1);
-UI = table (X[,1], X[,2], ones, Vrows, Vcols);
-
-# summing up over all items for all users
-U = rowSums (UI)
-
-# replacing all rows > 1 with 1
-U = U >= 1;
-
-# selecting users from factor L
-U_prime = L * U;
-
-V_prime = (U_prime %*% R);
+UI = table (X[,1], X[,2], Vrows, Vcols);
 
 # Applying items filter
-V_prime = UI * V_prime;
+V_prime = UI * (L %*% R);
 
 write(V_prime, fileY, format = fmtO);