You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2017/07/20 08:25:33 UTC

[2/3] systemml git commit: [SYSTEMML-1791] Performance frame block indexing and transformapply

[SYSTEMML-1791] Performance frame block indexing and transformapply

This patch makes the following performance improvements to various frame
operations in order to remove unnecessary overheads:

(1) Shallow column copy on full column indexing.

(2) Bidirectional reuse of recode maps across original meta data frame
blocks and shallow column copies (e.g., after column indexing).

(3) Avoid unnecessary long-string-double conversions on transformapply
(the recently removed file-based transform required string lookups; we
now avoid this long-to-string conversion, which is unnecessary for the
related frame operations).

Furthermore, this patch also cleans up a couple of methods which
became obsolete after the removal of the old file-based transform.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/4a24b9a7
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/4a24b9a7
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/4a24b9a7

Branch: refs/heads/master
Commit: 4a24b9a78424dc85fe774e6d2dd5689fea9cd5b1
Parents: 7cd978d
Author: Matthias Boehm <mb...@gmail.com>
Authored: Wed Jul 19 23:16:44 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Thu Jul 20 01:24:24 2017 -0700

----------------------------------------------------------------------
 .../sysml/runtime/matrix/data/FrameBlock.java   | 45 ++++++++++----------
 .../sysml/runtime/transform/encode/Encoder.java | 10 -----
 .../runtime/transform/encode/EncoderBin.java    | 31 --------------
 .../transform/encode/EncoderComposite.java      |  7 ---
 .../transform/encode/EncoderDummycode.java      | 38 -----------------
 .../transform/encode/EncoderMVImpute.java       | 30 -------------
 .../runtime/transform/encode/EncoderOmit.java   |  5 ---
 .../transform/encode/EncoderPassThrough.java    |  5 ---
 .../runtime/transform/encode/EncoderRecode.java | 45 +++-----------------
 9 files changed, 29 insertions(+), 187 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/4a24b9a7/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java
index bfe236e..5e6404b 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java
@@ -67,13 +67,8 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable
 	/** The data frame data as an ordered list of columns */
 	private Array[] _coldata = null;
 	
-	/** Cache for recode maps from frame meta data, indexed by column 0-based */
-	private Map<Integer, SoftReference<HashMap<String,Long>>> _rcdMapCache = null;
-	
 	public FrameBlock() {
 		_numRows = 0;
-		if( REUSE_RECODE_MAPS )
-			_rcdMapCache = new HashMap<Integer, SoftReference<HashMap<String,Long>>>();
 	}
 	
 	/**
@@ -120,8 +115,6 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable
 			_colmeta[j] = new ColumnMetadata(0);
 		for( int i=0; i<data.length; i++ )
 			appendRow(data[i]);
-		if( REUSE_RECODE_MAPS )
-			_rcdMapCache = new HashMap<Integer, SoftReference<HashMap<String,Long>>>();
 	}
 	
 	/**
@@ -872,16 +865,25 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable
 				ret._colnames[j-cl] = getColumnName(j);
 		}	
 		ret._numRows = ru-rl+1;
-
-		//copy output data
-		if(ret._coldata == null ) { 
+		if(ret._coldata == null )
 			ret._coldata = new Array[numCols];
+		
+		//fast-path: shallow copy column indexing 
+		if( ret._numRows == _numRows ) {
+			//this shallow copy does not only avoid an array copy, but
+			//also allows for bi-directional reuses of recodemaps 
 			for( int j=cl; j<=cu; j++ )
-				ret._coldata[j-cl] = _coldata[j].slice(rl,ru);
+				ret._coldata[j-cl] = _coldata[j];
+		}
+		//copy output data
+		else {
+			for( int j=cl; j<=cu; j++ ) {
+				if( ret._coldata[j-cl] == null )
+					ret._coldata[j-cl] = _coldata[j].slice(rl,ru);
+				else
+					ret._coldata[j-cl].set(0, ru-rl, _coldata[j], rl);
+			}
 		}
-		else
-			for( int j=cl; j<=cu; j++ )
-				ret._coldata[j-cl].set(0, ru-rl, _coldata[j], rl);	
 		
 		return ret;
 	}
@@ -1023,7 +1025,7 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable
 	public HashMap<String,Long> getRecodeMap(int col) {
 		//probe cache for existing map
 		if( REUSE_RECODE_MAPS ) {
-			SoftReference<HashMap<String,Long>> tmp = _rcdMapCache.get(col);
+			SoftReference<HashMap<String,Long>> tmp = _coldata[col]._rcdMapCache;
 			HashMap<String,Long> map = (tmp!=null) ? tmp.get() : null;
 			if( map != null ) return map;
 		}
@@ -1034,10 +1036,8 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable
 		for( int i=0; i<getNumRows(); i++ ) {
 			Object val = ldata.get(i);
 			if( val != null ) {
-//				String[] tmp = IOUtilFunctions.splitCSV(
-//						val.toString(), Lop.DATATYPE_PREFIX);
-
-				// Instead of using splitCSV which is forcing string with RFC-4180 format, using Lop.DATATYPE_PREFIX separator to split token and code 
+				// Instead of using splitCSV which is forcing string with RFC-4180 format, 
+				// using Lop.DATATYPE_PREFIX separator to split token and code 
 				String[] tmp = 	new String[2];
 				int pos = val.toString().lastIndexOf(Lop.DATATYPE_PREFIX);
 				tmp[0] = val.toString().substring(0, pos);
@@ -1047,9 +1047,8 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable
 		}
 		
 		//put created map into cache
-		if( REUSE_RECODE_MAPS ) {
-			_rcdMapCache.put(col, new SoftReference<HashMap<String,Long>>(map));
-		}
+		if( REUSE_RECODE_MAPS )
+			_coldata[col]._rcdMapCache = new SoftReference<>(map);
 		
 		return map;
 	}
@@ -1245,6 +1244,8 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable
 	 * in order to avoid unnecessary dependencies.
 	 */
 	private abstract static class Array<T> implements Writable {
+		protected SoftReference<HashMap<String,Long>> _rcdMapCache = null;
+		
 		protected int _size = 0;
 		protected int newSize() {
 			return (int) Math.max(_size*2, 4); 

http://git-wip-us.apache.org/repos/asf/systemml/blob/4a24b9a7/src/main/java/org/apache/sysml/runtime/transform/encode/Encoder.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/Encoder.java b/src/main/java/org/apache/sysml/runtime/transform/encode/Encoder.java
index 304dcdb..e4af8a6 100644
--- a/src/main/java/org/apache/sysml/runtime/transform/encode/Encoder.java
+++ b/src/main/java/org/apache/sysml/runtime/transform/encode/Encoder.java
@@ -117,16 +117,6 @@ public abstract class Encoder implements Serializable
 	 * @return output matrix block
 	 */
 	public abstract MatrixBlock apply(FrameBlock in, MatrixBlock out);
-	
-	/**
-	 * Encode input data according to existing transform meta
-	 * data (transform apply).
-	 * TODO remove once file-based transform removed
-	 * 
-	 * @param in input data as string array
-	 * @return encoded data as string array
-	 */
-	public abstract String[] apply(String[] in);
 
 	/**
 	 * Construct a frame block out of the transform meta data.

http://git-wip-us.apache.org/repos/asf/systemml/blob/4a24b9a7/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderBin.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderBin.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderBin.java
index fbe6994..e70a392 100644
--- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderBin.java
+++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderBin.java
@@ -44,7 +44,6 @@ public class EncoderBin extends Encoder
 
 	private int[] _numBins = null;
 	private double[] _min=null, _max=null;	// min and max among non-missing values
-	private double[] _binWidths = null;		// width of a bin for each attribute
 	
 	//frame transform-apply attributes
 	private double[][] _binMins = null;
@@ -83,8 +82,6 @@ public class EncoderBin extends Encoder
 			Arrays.fill(_min, Double.MAX_VALUE);
 			_max = new double[_colList.length];
 			Arrays.fill(_max, -Double.MAX_VALUE);
-			
-			_binWidths = new double[_colList.length];
 		}
 	}
 
@@ -121,34 +118,6 @@ public class EncoderBin extends Encoder
 		// nothing to do
 	}
 	
-	/**
-	 * Method to apply transformations.
-	 */
-	@Override
-	public String[] apply(String[] words) {
-		if( !isApplicable() )
-			return words;
-	
-		for(int i=0; i < _colList.length; i++) {
-			int colID = _colList[i];
-			try {
-				double val = UtilFunctions.parseToDouble(words[colID-1]);
-				int binid = 1;
-				double tmp = _min[i] + _binWidths[i];
-				while(val > tmp && binid < _numBins[i]) {
-					tmp += _binWidths[i];
-					binid++;
-				}
-				words[colID-1] = Integer.toString(binid);
-			} 
-			catch(NumberFormatException e) {
-				throw new RuntimeException("Encountered \"" + words[colID-1] + "\" in column ID \"" + colID + "\", when expecting a numeric value. Consider adding \"" + words[colID-1] + "\" to na.strings, along with an appropriate imputation method.");
-			}
-		}
-		
-		return words;
-	}
-
 	@Override
 	public MatrixBlock apply(FrameBlock in, MatrixBlock out) {
 		for(int j=0; j<_colList.length; j++) {

http://git-wip-us.apache.org/repos/asf/systemml/blob/4a24b9a7/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderComposite.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderComposite.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderComposite.java
index deff887..ffff1df 100644
--- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderComposite.java
+++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderComposite.java
@@ -80,13 +80,6 @@ public class EncoderComposite extends Encoder
 		for( Encoder encoder : _encoders )
 			encoder.build(in);
 	}
-
-	@Override
-	public String[] apply(String[] in) {
-		for( Encoder encoder : _encoders )
-			encoder.apply(in);
-		return in;
-	}
 	
 	@Override 
 	public MatrixBlock apply(FrameBlock in, MatrixBlock out) {

http://git-wip-us.apache.org/repos/asf/systemml/blob/4a24b9a7/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderDummycode.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderDummycode.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderDummycode.java
index 743381a..9a2f059 100644
--- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderDummycode.java
+++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderDummycode.java
@@ -58,44 +58,6 @@ public class EncoderDummycode extends Encoder
 		//do nothing
 	}
 	
-	/**
-	 * Method to apply transformations.
-	 * 
-	 * @param words array of strings
-	 * @return array of transformed strings
-	 */
-	@Override
-	public String[] apply(String[] words) 
-	{
-		if( !isApplicable() )
-			return words;
-		
-		String[] nwords = new String[(int)_dummycodedLength];
-		int rcdVal = 0;
-		
-		for(int colID=1, idx=0, ncolID=1; colID <= words.length; colID++) {
-			if(idx < _colList.length && colID==_colList[idx]) {
-				// dummycoded columns
-				try {
-					rcdVal = UtilFunctions.parseToInt(UtilFunctions.unquote(words[colID-1]));
-					nwords[ ncolID-1+rcdVal-1 ] = "1";
-					ncolID += _domainSizes[idx];
-					idx++;
-				} 
-				catch (Exception e) {
-					throw new RuntimeException("Error in dummycoding: colID="+colID + ", rcdVal=" + rcdVal+", word="+words[colID-1] 
-							+ ", domainSize=" + _domainSizes[idx] + ", dummyCodedLength=" + _dummycodedLength);
-				}
-			}
-			else {
-				nwords[ncolID-1] = words[colID-1];
-				ncolID++;
-			}
-		}
-		
-		return nwords;
-	}
-	
 	@Override
 	public MatrixBlock apply(FrameBlock in, MatrixBlock out) 
 	{

http://git-wip-us.apache.org/repos/asf/systemml/blob/4a24b9a7/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderMVImpute.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderMVImpute.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderMVImpute.java
index 55a0bde..ae9b809 100644
--- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderMVImpute.java
+++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderMVImpute.java
@@ -336,36 +336,6 @@ public class EncoderMVImpute extends Encoder
 			throw new RuntimeException(ex);
 		}
 	}
-
-	@Override
-	public String[] apply(String[] words) 
-	{	
-		if( isApplicable() )
-			for(int i=0; i < _colList.length; i++) {
-				int colID = _colList[i];
-				String w = UtilFunctions.unquote(words[colID-1]);
-				if(TfUtils.isNA(_NAstrings, w))
-					w = words[colID-1] = _replacementList[i];
-				
-				if ( _isMVScaled.get(i) )
-					if ( _mvscMethodList[i] == MVMethod.GLOBAL_MEAN )
-						words[colID-1] = Double.toString( UtilFunctions.parseToDouble(w) - _meanList[i]._sum );
-					else
-						words[colID-1] = Double.toString( (UtilFunctions.parseToDouble(w) - _meanList[i]._sum) / _varList[i].mean._sum );
-			}
-		
-		if(_scnomvList != null)
-		for(int i=0; i < _scnomvList.length; i++)
-		{
-			int colID = _scnomvList[i];
-			if ( _scnomvMethodList[i] == MVMethod.GLOBAL_MEAN )
-				words[colID-1] = Double.toString( UtilFunctions.parseToDouble(words[colID-1]) - _scnomvMeanList[i]._sum );
-			else
-				words[colID-1] = Double.toString( (UtilFunctions.parseToDouble(words[colID-1]) - _scnomvMeanList[i]._sum) / _scnomvVarList[i].mean._sum );
-		}
-			
-		return words;
-	}
 	
 	@Override
 	public MatrixBlock apply(FrameBlock in, MatrixBlock out) {

http://git-wip-us.apache.org/repos/asf/systemml/blob/4a24b9a7/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderOmit.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderOmit.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderOmit.java
index af09cee..0f74590 100644
--- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderOmit.java
+++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderOmit.java
@@ -71,11 +71,6 @@ public class EncoderOmit extends Encoder
 	}
 	
 	@Override
-	public String[] apply(String[] words) {
-		return null;
-	}
-	
-	@Override
 	public MatrixBlock apply(FrameBlock in, MatrixBlock out) 
 	{
 		//determine output size

http://git-wip-us.apache.org/repos/asf/systemml/blob/4a24b9a7/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderPassThrough.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderPassThrough.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderPassThrough.java
index d84ea0d..ee22ac1 100644
--- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderPassThrough.java
+++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderPassThrough.java
@@ -48,11 +48,6 @@ public class EncoderPassThrough extends Encoder
 	public void build(FrameBlock in) {
 		//do nothing
 	}
-
-	@Override
-	public String[] apply(String[] in) {
-		return in;
-	}
 	
 	@Override 
 	public MatrixBlock apply(FrameBlock in, MatrixBlock out) {

http://git-wip-us.apache.org/repos/asf/systemml/blob/4a24b9a7/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java
index 855d565..526d31e 100644
--- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java
+++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java
@@ -29,7 +29,6 @@ import org.apache.sysml.runtime.matrix.data.FrameBlock;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.transform.TfUtils;
 import org.apache.sysml.runtime.transform.meta.TfMetaUtils;
-import org.apache.sysml.runtime.util.UtilFunctions;
 import org.apache.wink.json4j.JSONException;
 import org.apache.wink.json4j.JSONObject;
 
@@ -39,7 +38,6 @@ public class EncoderRecode extends Encoder
 
 	//recode maps and custom map for partial recode maps 
 	private HashMap<Integer, HashMap<String, Long>> _rcdMaps  = new HashMap<Integer, HashMap<String, Long>>();
-	private HashMap<Integer, HashMap<String,String>> _finalMaps = null;
 	private HashMap<Integer, HashSet<Object>> _rcdMapsPart = null;
 	
 	public EncoderRecode(JSONObject parsedSpec, String[] colnames, int clen)
@@ -60,17 +58,9 @@ public class EncoderRecode extends Encoder
 		return _rcdMapsPart; 
 	}
 	
-	public HashMap<Integer, HashMap<String,String>> getRecodeMaps() {
-		return _finalMaps;
-	}
-	
-	private String lookupRCDMap(int colID, String key) {
-		if( _finalMaps!=null )
-			return _finalMaps.get(colID).get(key);
-		else { //used for cp
-			Long tmp = _rcdMaps.get(colID).get(key);
-			return (tmp!=null) ? Long.toString(tmp) : null;
-		}
+	private long lookupRCDMap(int colID, String key) {
+		Long tmp = _rcdMaps.get(colID).get(key);
+		return (tmp!=null) ? tmp : -1;
 	}
 	
 	@Override
@@ -132,28 +122,6 @@ public class EncoderRecode extends Encoder
 		}
 	}
 	
-	/**
-	 * Method to apply transformations.
-	 */
-	@Override
-	public String[] apply(String[] words) 
-	{
-		if( !isApplicable() )
-			return words;
-		
-		//apply recode maps on relevant columns of given row
-		for(int i=0; i < _colList.length; i++) {
-			//prepare input and get code
-			int colID = _colList[i];
-			String key = UtilFunctions.unquote(words[colID-1].trim());
-			String val = lookupRCDMap(colID, key);			
-			// replace unseen keys with NaN 
-			words[colID-1] = (val!=null) ? val : "NaN";
-		}
-			
-		return words;
-	}
-	
 	@Override
 	public MatrixBlock apply(FrameBlock in, MatrixBlock out) {
 		//apply recode maps column wise
@@ -162,9 +130,9 @@ public class EncoderRecode extends Encoder
 			for( int i=0; i<in.getNumRows(); i++ ) {
 				Object okey = in.get(i, colID-1);
 				String key = (okey!=null) ? okey.toString() : null;
-				String val = lookupRCDMap(colID, key);			
-				out.quickSetValue(i, colID-1, (val!=null) ? 
-						Double.parseDouble(val) : Double.NaN);
+				long code = lookupRCDMap(colID, key);			
+				out.quickSetValue(i, colID-1,
+					(code >= 0) ? code : Double.NaN);
 			}
 		}
 		
@@ -228,4 +196,3 @@ public class EncoderRecode extends Encoder
 		return token + Lop.DATATYPE_PREFIX + code.toString();
 	}
 }
- 
\ No newline at end of file