You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2017/07/09 05:33:18 UTC

[2/4] systemml git commit: [MINOR] Performance frame transformencode (selective row iterators)

[MINOR] Performance frame transformencode (selective row iterators)

This patch adds selective row iterators to frame blocks. These allow
the transform recode encoder to iterate over only the selected columns
of each row, avoiding unnecessary string conversions for unused columns.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/4a6165b7
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/4a6165b7
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/4a6165b7

Branch: refs/heads/master
Commit: 4a6165b796590a6388a9c182612761219731d77f
Parents: f485ab2
Author: Matthias Boehm <mb...@gmail.com>
Authored: Fri Jul 7 17:14:39 2017 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Sat Jul 8 22:32:03 2017 -0700

----------------------------------------------------------------------
 .../sysml/runtime/matrix/data/FrameBlock.java   | 82 +++++++++++++++++---
 .../transform/decode/DecoderFactory.java        |  2 +-
 .../transform/encode/EncoderFactory.java        |  2 +-
 .../runtime/transform/encode/EncoderRecode.java |  4 +-
 .../sysml/runtime/util/UtilFunctions.java       | 46 +++++++----
 5 files changed, 107 insertions(+), 29 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/4a6165b7/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java
index 99a6f3f..512b85c 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java
@@ -505,6 +505,17 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable
 	}
 	
 	/**
+	 * Get a row iterator over the frame where all selected fields are 
+	 * encoded as strings independent of their value types.  
+	 * 
+	 * @param cols column selection, 1-based
+	 * @return string array iterator
+	 */
+	public Iterator<String[]> getStringRowIterator(int[] cols) {
+		return new StringRowIterator(0, _numRows, cols);
+	}
+	
+	/**
 	 * Get a row iterator over the frame where all fields are encoded
 	 * as strings independent of their value types.  
 	 * 
@@ -517,6 +528,19 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable
 	}
 	
 	/**
+	 * Get a row iterator over the frame where all selected fields are 
+	 * encoded as strings independent of their value types.  
+	 * 
+	 * @param rl lower row index
+	 * @param ru upper row index
+	 * @param cols column selection, 1-based
+	 * @return string array iterator
+	 */
+	public Iterator<String[]> getStringRowIterator(int rl, int ru, int[] cols) {
+		return new StringRowIterator(rl, ru, cols);
+	}
+	
+	/**
 	 * Get a row iterator over the frame where all fields are encoded
 	 * as boxed objects according to their value types.  
 	 * 
@@ -527,6 +551,17 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable
 	}
 	
 	/**
+	 * Get a row iterator over the frame where all selected fields are 
+	 * encoded as boxed objects according to their value types.  
+	 * 
+	 * @param cols column selection, 1-based
+	 * @return object array iterator
+	 */
+	public Iterator<Object[]> getObjectRowIterator(int[] cols) {
+		return new ObjectRowIterator(0, _numRows, cols);
+	}
+	
+	/**
 	 * Get a row iterator over the frame where all fields are encoded
 	 * as boxed objects according to their value types.  
 	 * 
@@ -537,6 +572,19 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable
 	public Iterator<Object[]> getObjectRowIterator(int rl, int ru) {
 		return new ObjectRowIterator(rl, ru);
 	}
+	
+	/**
+	 * Get a row iterator over the frame where all selected fields are 
+	 * encoded as boxed objects according to their value types.  
+	 * 
+	 * @param rl lower row index
+	 * @param ru upper row index
+	 * @param cols column selection, 1-based
+	 * @return object array iterator
+	 */
+	public Iterator<Object[]> getObjectRowIterator(int rl, int ru, int[] cols) {
+		return new ObjectRowIterator(rl, ru, cols);
+	}
 
 	///////
 	// serialization / deserialization (implementation of writable and externalizable)
@@ -1111,14 +1159,20 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable
 	// row iterators (over strings and boxed objects)
 
 	private abstract class RowIterator<T> implements Iterator<T[]> {
-		protected T[] _curRow = null;
+		protected final int[] _cols;
+		protected final T[] _curRow;
+		protected final int _maxPos;
 		protected int _curPos = -1;
-		protected int _maxPos = -1;
 		
 		protected RowIterator(int rl, int ru) {
-			_curPos = rl;
+			this(rl, ru, UtilFunctions.getSeqArray(1, getNumColumns(), 1));
+		}
+		
+		protected RowIterator(int rl, int ru, int[] cols) {
+			_curRow = createRow(cols.length);
+			_cols = cols;
 			_maxPos = ru;
-			_curRow = createRow(getNumColumns());
+			_curPos = rl;
 		}
 		
 		@Override
@@ -1139,6 +1193,10 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable
 			super(rl, ru);
 		}
 		
+		public StringRowIterator(int rl, int ru, int[] cols) {
+			super(rl, ru, cols);
+		}
+		
 		@Override
 		protected String[] createRow(int size) {
 			return new String[size];
@@ -1146,11 +1204,11 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable
 		
 		@Override
 		public String[] next( ) {
-			for( int j=0; j<getNumColumns(); j++ ) {
-				Object tmp = get(_curPos, j);
+			for( int j=0; j<_cols.length; j++ ) {
+				Object tmp = get(_curPos, _cols[j]-1);
 				_curRow[j] = (tmp!=null) ? tmp.toString() : null;
 			}
-			_curPos++;			
+			_curPos++;
 			return _curRow;
 		}
 	}
@@ -1160,6 +1218,10 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable
 			super(rl, ru);
 		}
 		
+		public ObjectRowIterator(int rl, int ru, int[] cols) {
+			super(rl, ru, cols);
+		}
+		
 		@Override
 		protected Object[] createRow(int size) {
 			return new Object[size];
@@ -1167,9 +1229,9 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable
 		
 		@Override
 		public Object[] next( ) {
-			for( int j=0; j<getNumColumns(); j++ )
-				_curRow[j] = get(_curPos, j);
-			_curPos++;			
+			for( int j=0; j<_cols.length; j++ )
+				_curRow[j] = get(_curPos, _cols[j]-1);
+			_curPos++;
 			return _curRow;
 		}
 	}

http://git-wip-us.apache.org/repos/asf/systemml/blob/4a6165b7/src/main/java/org/apache/sysml/runtime/transform/decode/DecoderFactory.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/transform/decode/DecoderFactory.java b/src/main/java/org/apache/sysml/runtime/transform/decode/DecoderFactory.java
index 425466a..c02609a 100644
--- a/src/main/java/org/apache/sysml/runtime/transform/decode/DecoderFactory.java
+++ b/src/main/java/org/apache/sysml/runtime/transform/decode/DecoderFactory.java
@@ -56,7 +56,7 @@ public class DecoderFactory
 					TfMetaUtils.parseJsonIDList(jSpec, colnames, TfUtils.TXMETHOD_DUMMYCODE))); 
 			rcIDs = new ArrayList<Integer>(CollectionUtils.union(rcIDs, dcIDs));
 			List<Integer> ptIDs = new ArrayList<Integer>(CollectionUtils
-					.subtract(UtilFunctions.getSequenceList(1, meta.getNumColumns(), 1), rcIDs)); 
+					.subtract(UtilFunctions.getSeqList(1, meta.getNumColumns(), 1), rcIDs)); 
 
 			//create default schema if unspecified (with double columns for pass-through)
 			if( schema == null ) {

http://git-wip-us.apache.org/repos/asf/systemml/blob/4a6165b7/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java
index 13b2810..5e0a178 100644
--- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java
+++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java
@@ -65,7 +65,7 @@ public class EncoderFactory
 			rcIDs = new ArrayList<Integer>(CollectionUtils.union(rcIDs, dcIDs));
 			List<Integer> binIDs = TfMetaUtils.parseBinningColIDs(jSpec, colnames); 
 			List<Integer> ptIDs = new ArrayList<Integer>(CollectionUtils.subtract(
-					CollectionUtils.subtract(UtilFunctions.getSequenceList(1, clen, 1), rcIDs), binIDs)); 
+					CollectionUtils.subtract(UtilFunctions.getSeqList(1, clen, 1), rcIDs), binIDs)); 
 			List<Integer> oIDs = Arrays.asList(ArrayUtils.toObject(
 					TfMetaUtils.parseJsonIDList(jSpec, colnames, TfUtils.TXMETHOD_OMIT))); 
 			List<Integer> mvIDs = Arrays.asList(ArrayUtils.toObject(

http://git-wip-us.apache.org/repos/asf/systemml/blob/4a6165b7/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java
index bb8592c..dc75a74 100644
--- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java
+++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java
@@ -112,7 +112,7 @@ public class EncoderRecode extends Encoder
 		if( !isApplicable() )
 			return;		
 
-		Iterator<String[]> iter = in.getStringRowIterator();
+		Iterator<String[]> iter = in.getStringRowIterator(_colList);
 		while( iter.hasNext() ) {
 			String[] row = iter.next(); 
 			for( int j=0; j<_colList.length; j++ ) {
@@ -122,7 +122,7 @@ public class EncoderRecode extends Encoder
 					_rcdMaps.put(colID, new HashMap<String,Long>());
 				//probe and build column map
 				HashMap<String,Long> map = _rcdMaps.get(colID);
-				String key = row[colID-1];
+				String key = row[j];
 				if( key!=null && !key.isEmpty() && !map.containsKey(key) )
 					map.put(key, Long.valueOf(map.size()+1));
 			}

http://git-wip-us.apache.org/repos/asf/systemml/blob/4a6165b7/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java b/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java
index f76d37b..8c4cacd 100644
--- a/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java
+++ b/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java
@@ -314,6 +314,37 @@ public class UtilFunctions
 		return 1L + (long) Math.floor(to/incr - from/incr);
 	}
 	
+	/**
+	 * Obtain sequence list
+	 * 
+	 * @param low   lower bound (inclusive)
+	 * @param up    upper bound (inclusive)
+	 * @param incr  increment 
+	 * @return list of integers
+	 */
+	public static List<Integer> getSeqList(int low, int up, int incr) {
+		ArrayList<Integer> ret = new ArrayList<Integer>();
+		for( int i=low; i<=up; i+=incr )
+			ret.add(i);
+		return ret;
+	}
+	
+	/**
+	 * Obtain sequence array
+	 * 
+	 * @param low   lower bound (inclusive)
+	 * @param up    upper bound (inclusive)
+	 * @param incr  increment 
+	 * @return array of integers
+	 */
+	public static int[] getSeqArray(int low, int up, int incr) {
+		int len = (int) getSeqLength(low, up, incr);
+		int[] ret = new int[len];
+		for( int i=0, val=low; i<len; i++, val+=incr )
+			ret[i] = val;
+		return ret;
+	}
+	
  	public static int roundToNext(int val, int factor) {
 		//round up to next non-zero multiple of factor
 		int pval = Math.max(val, factor);
@@ -506,21 +537,6 @@ public class UtilFunctions
 		else
 			return String.format("%d", arg);
 	}
-	
-	/**
-	 * Obtain sequence list
-	 * 
-	 * @param low   lower bound (inclusive)
-	 * @param up    upper bound (inclusive)
-	 * @param incr  increment 
-	 * @return list of integers
-	 */
-	public static List<Integer> getSequenceList(int low, int up, int incr) {
-		ArrayList<Integer> ret = new ArrayList<Integer>();
-		for( int i=low; i<=up; i+=incr )
-			ret.add(i);
-		return ret;
-	}
 
 	public static double getDouble(Object obj) {
 		return (obj instanceof Double) ? (Double)obj :