You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by ba...@apache.org on 2021/09/15 07:55:21 UTC
[systemds] 04/05: [SYSTEMDS-3132] Update FrameBlock to include double and boolean
This is an automated email from the ASF dual-hosted git repository.
baunsgaard pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
commit 79addc3c552b3f05d1cfbdfd89446bc3246855e3
Author: baunsgaard <ba...@tugraz.at>
AuthorDate: Tue Sep 14 21:46:23 2021 +0200
[SYSTEMDS-3132] Update FrameBlock to include double and boolean
The previous rewrite did not include the boolean and double types.
With this commit it does.
---
.../sysds/runtime/matrix/data/FrameBlock.java | 31 ++++-
.../python/systemds/examples/tutorials/adult.py | 22 ++-
src/main/python/systemds/operator/nodes/frame.py | 28 ++--
src/main/python/systemds/operator/nodes/list.py | 5 +-
.../python/systemds/operator/nodes/list_access.py | 9 +-
src/main/python/systemds/operator/nodes/matrix.py | 21 +--
.../python/systemds/operator/nodes/multi_return.py | 11 +-
src/main/python/systemds/operator/nodes/scalar.py | 14 +-
src/main/python/systemds/operator/nodes/source.py | 10 +-
.../python/systemds/operator/operation_node.py | 2 +-
src/main/python/systemds/utils/converters.py | 20 ++-
src/main/python/systemds/utils/helpers.py | 13 +-
.../python/tests/examples/tutorials/test_adult.py | 149 +++++++++++++--------
13 files changed, 213 insertions(+), 122 deletions(-)
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/FrameBlock.java b/src/main/java/org/apache/sysds/runtime/matrix/data/FrameBlock.java
index 9ff106c..2a290f3 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/FrameBlock.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/FrameBlock.java
@@ -622,21 +622,44 @@ public class FrameBlock implements CacheBlock, Externalizable {
}
public byte[] getColumnAsBytes(int c){
+ final int nRow = getNumRows();
switch(_schema[c]){
case INT64:
long[] colLong = ((LongArray)_coldata[c])._data;
- ByteBuffer longBuffer = ByteBuffer.allocate(8 * getNumRows());
+ ByteBuffer longBuffer = ByteBuffer.allocate(8 * nRow);
longBuffer.order(ByteOrder.LITTLE_ENDIAN);
- for(int i = 0; i < getNumRows(); i++)
+ for(int i = 0; i < nRow; i++)
longBuffer.putLong(colLong[i]);
return longBuffer.array();
case INT32:
int[] colInt = ((IntegerArray)_coldata[c])._data;
- ByteBuffer intBuffer = ByteBuffer.allocate(4 * getNumRows());
+ ByteBuffer intBuffer = ByteBuffer.allocate(4 * nRow);
intBuffer.order(ByteOrder.LITTLE_ENDIAN);
- for(int i = 0; i < getNumRows(); i++)
+ for(int i = 0; i < nRow; i++)
intBuffer.putInt(colInt[i]);
return intBuffer.array();
+ case FP64:
+ double[] colDouble = ((DoubleArray)_coldata[c])._data;
+ ByteBuffer doubleBuffer = ByteBuffer.allocate(8 * nRow);
+ doubleBuffer.order(ByteOrder.nativeOrder());
+ for(int i = 0; i < nRow; i++)
+ doubleBuffer.putDouble(colDouble[i]);
+ return doubleBuffer.array();
+ case FP32:
+ float[] colFloat = ((FloatArray)_coldata[c])._data;
+ ByteBuffer floatBuffer = ByteBuffer.allocate(8 * nRow);
+ floatBuffer.order(ByteOrder.nativeOrder());
+ for(int i = 0; i < nRow; i++)
+ floatBuffer.putDouble(colFloat[i]);
+ return floatBuffer.array();
+ case BOOLEAN:
+ boolean[] colBool = ((BooleanArray)_coldata[c])._data;
+ // over allocating here.. we could maybe bit pack?
+ ByteBuffer booleanBuffer = ByteBuffer.allocate(nRow);
+ booleanBuffer.order(ByteOrder.nativeOrder());
+ for(int i = 0; i < nRow; i++)
+ booleanBuffer.put((byte)(colBool[i]? 1:0));
+ return booleanBuffer.array();
default:
throw new NotImplementedException();
}
diff --git a/src/main/python/systemds/examples/tutorials/adult.py b/src/main/python/systemds/examples/tutorials/adult.py
index c28ed0a..5bd3cc2 100644
--- a/src/main/python/systemds/examples/tutorials/adult.py
+++ b/src/main/python/systemds/examples/tutorials/adult.py
@@ -59,25 +59,41 @@ class DataManager:
def get_train_data(self, sds: SystemDSContext) -> 'Frame':
self._get_data(self._train_data_loc)
- return sds.read(self._train_data_loc)
+ return sds.read(self._train_data_loc)[:,0:14]
def get_train_labels_pandas(self) -> pd.DataFrame:
self._get_data(self._train_data_loc)
return self._parse_data(self._train_data_loc)["income"]
+ def get_train_labels(self, sds: SystemDSContext) -> 'Frame':
+ self._get_data(self._train_data_loc)
+ return sds.read(self._train_data_loc)[:,14]
+
def get_test_data_pandas(self) -> pd.DataFrame:
self._get_data(self._test_data_loc)
return self._parse_data(self._test_data_loc)\
- .drop(labels=["income"], axis=1).to_numpy()
+ .drop(labels=["income"], axis=1)
+
+ def get_test_data(self, sds: SystemDSContext) -> 'Frame':
+ self._get_data(self._test_data_loc)
+ return sds.read(self._test_data_loc)[:,0:14]
def get_test_labels_pandas(self) -> pd.DataFrame:
self._get_data(self._test_data_loc)
return self._parse_data(self._test_data_loc)["income"]
- def get_jspec(self) -> str:
+ def get_test_labels(self, sds: SystemDSContext) -> 'Frame':
+ self._get_data(self._test_data_loc)
+ return sds.read(self._test_data_loc)[:,14]
+
+ def get_jspec_string(self) -> str:
self._get_data(self._jspec_loc)
with open(self._jspec_loc, "r") as f:
return f.read()
+
+ def get_jspec(self, sds: SystemDSContext) -> 'Scalar':
+ self._get_data(self._jspec_loc)
+ return sds.read(self._jspec_loc, data_type="scalar", value_type="string")
def _parse_data(self, loc) -> pd.DataFrame:
return pd.read_csv(loc)
diff --git a/src/main/python/systemds/operator/nodes/frame.py b/src/main/python/systemds/operator/nodes/frame.py
index 5fe02cc..efa75b0 100644
--- a/src/main/python/systemds/operator/nodes/frame.py
+++ b/src/main/python/systemds/operator/nodes/frame.py
@@ -22,16 +22,18 @@
__all__ = ["Frame"]
import os
-from typing import Dict, Optional, Sequence, Tuple, Union, TYPE_CHECKING, Iterable
+from typing import (TYPE_CHECKING, Dict, Iterable, Optional, Sequence, Tuple,
+ Union)
import numpy as np
import pandas as pd
from py4j.java_gateway import JavaObject, JVMView
-from systemds.operator import OperationNode, Matrix, MultiReturn
+from systemds.operator import Matrix, MultiReturn, OperationNode
+from systemds.script_building.dag import DAGNode, OutputType
from systemds.utils.consts import VALID_INPUT_TYPES
+from systemds.utils.converters import (frame_block_to_pandas,
+ pandas_to_frame_block)
from systemds.utils.helpers import get_slice_string
-from systemds.utils.converters import pandas_to_frame_block, frame_block_to_pandas
-from systemds.script_building.dag import OutputType, DAGNode
if TYPE_CHECKING:
# to avoid cyclic dependencies during runtime
@@ -46,7 +48,7 @@ class Frame(OperationNode):
unnamed_input_nodes: Union[str,
Iterable[VALID_INPUT_TYPES]] = None,
named_input_nodes: Dict[str, VALID_INPUT_TYPES] = None,
- local_data: pd.DataFrame = None, brackets:bool = False) -> "Frame":
+ local_data: pd.DataFrame = None, brackets: bool = False) -> "Frame":
is_python_local_data = False
if local_data is not None:
self._pd_dataframe = local_data
@@ -87,11 +89,11 @@ class Frame(OperationNode):
def transform_encode(self, spec: "Scalar"):
params_dict = {"target": self, "spec": spec}
-
- frame = Frame(self.sds_context,"")
- matrix = Matrix(self.sds_context,"")
- output_nodes = [matrix,frame]
-
+
+ frame = Frame(self.sds_context, "")
+ matrix = Matrix(self.sds_context, "")
+ output_nodes = [matrix, frame]
+
op = MultiReturn(
self.sds_context,
"transformencode",
@@ -125,18 +127,18 @@ class Frame(OperationNode):
"""
return Frame(self.sds_context, "cbind", [self, other])
- def replace(self, pattern:str, replacement:str) -> 'Frame':
+ def replace(self, pattern: str, replacement: str) -> 'Frame':
"""
Replace all instances of string with replacement string
:param: pattern the string to replace
:param: replacement the string to replace with
:return: The Frame containing the replaced values
"""
- return Frame(self.sds_context, "replace", named_input_nodes={"target": self, "pattern": f"'{pattern}'", "replacement":f"'{replacement}'"})
+ return Frame(self.sds_context, "replace", named_input_nodes={"target": self, "pattern": f"'{pattern}'", "replacement": f"'{replacement}'"})
def __str__(self):
return "FrameNode"
def __getitem__(self, i) -> 'Frame':
sliceIns = get_slice_string(i)
- return Frame(self.sds_context, '', [self, sliceIns], brackets=True)
\ No newline at end of file
+ return Frame(self.sds_context, '', [self, sliceIns], brackets=True)
diff --git a/src/main/python/systemds/operator/nodes/list.py b/src/main/python/systemds/operator/nodes/list.py
index 09455a3..6f5bfb1 100644
--- a/src/main/python/systemds/operator/nodes/list.py
+++ b/src/main/python/systemds/operator/nodes/list.py
@@ -21,12 +21,11 @@
__all__ = ["List"]
-from typing import Dict, Sequence, Tuple, Union, Iterable, List
+from typing import Dict, Iterable, List, Sequence, Tuple, Union
import numpy as np
from py4j.java_gateway import JavaObject
-
-from systemds.operator import OperationNode, ListAccess
+from systemds.operator import ListAccess, OperationNode
from systemds.script_building.dag import OutputType
from systemds.utils.consts import VALID_INPUT_TYPES
from systemds.utils.converters import numpy_to_matrix_block
diff --git a/src/main/python/systemds/operator/nodes/list_access.py b/src/main/python/systemds/operator/nodes/list_access.py
index 10d51f5..a954f9c 100644
--- a/src/main/python/systemds/operator/nodes/list_access.py
+++ b/src/main/python/systemds/operator/nodes/list_access.py
@@ -21,12 +21,11 @@
__all__ = ["ListAccess"]
-from typing import Dict, Sequence, Tuple, Union, Iterable
+from typing import Dict, Iterable, Sequence, Tuple, Union
import numpy as np
from py4j.java_gateway import JavaObject
-
-from systemds.operator import OperationNode, Matrix, Frame, Scalar
+from systemds.operator import Frame, Matrix, OperationNode, Scalar
from systemds.script_building.dag import OutputType
@@ -49,7 +48,7 @@ class ListAccess(OperationNode):
res = Matrix(self.sds_context, "as.matrix", [ent])
self._list_source._outputs[self._key] = res
return res
-
+
def as_frame(self) -> Frame:
ent = self._list_source[self._key]
res = Frame(self.sds_context, "as.frame", [ent])
@@ -63,4 +62,4 @@ class ListAccess(OperationNode):
return res
def __str__(self):
- return "ListAccessNode"
\ No newline at end of file
+ return "ListAccessNode"
diff --git a/src/main/python/systemds/operator/nodes/matrix.py b/src/main/python/systemds/operator/nodes/matrix.py
index 2603dd5..4f08735 100644
--- a/src/main/python/systemds/operator/nodes/matrix.py
+++ b/src/main/python/systemds/operator/nodes/matrix.py
@@ -22,17 +22,18 @@
__all__ = ["Matrix"]
import os
-from typing import Dict, Optional, Sequence, Tuple, Union, TYPE_CHECKING, Iterable
+from typing import (TYPE_CHECKING, Dict, Iterable, Optional, Sequence, Tuple,
+ Union)
import numpy as np
from py4j.java_gateway import JavaObject, JVMView
from systemds.operator import OperationNode, Scalar
-from systemds.utils.consts import VALID_INPUT_TYPES
-from systemds.utils.converters import numpy_to_matrix_block, matrix_block_to_numpy
-from systemds.utils.helpers import get_slice_string
from systemds.script_building.dag import OutputType
-
-from systemds.utils.consts import VALID_INPUT_TYPES, BINARY_OPERATIONS, VALID_ARITHMETIC_TYPES
+from systemds.utils.consts import (BINARY_OPERATIONS, VALID_ARITHMETIC_TYPES,
+ VALID_INPUT_TYPES)
+from systemds.utils.converters import (matrix_block_to_numpy,
+ numpy_to_matrix_block)
+from systemds.utils.helpers import get_slice_string
class Matrix(OperationNode):
@@ -42,7 +43,7 @@ class Matrix(OperationNode):
unnamed_input_nodes: Union[str,
Iterable[VALID_INPUT_TYPES]] = None,
named_input_nodes: Dict[str, VALID_INPUT_TYPES] = None,
- local_data: np.array = None, brackets:bool = False ) -> 'Matrix':
+ local_data: np.array = None, brackets: bool = False) -> 'Matrix':
is_python_local_data = False
if local_data is not None:
@@ -359,12 +360,12 @@ class Matrix(OperationNode):
:return: The Matrix representing the result of this operation
"""
return Matrix(self.sds_context, "round", [self])
-
- def replace(self, pattern:VALID_INPUT_TYPES, replacement:VALID_INPUT_TYPES) -> 'Matrix':
+
+ def replace(self, pattern: VALID_INPUT_TYPES, replacement: VALID_INPUT_TYPES) -> 'Matrix':
"""
Replace all values with replacement value
"""
- return Matrix(self.sds_context, "replace", named_input_nodes={"target": self, "pattern": pattern, "replacement":replacement})
+ return Matrix(self.sds_context, "replace", named_input_nodes={"target": self, "pattern": pattern, "replacement": replacement})
def __str__(self):
return "MatrixNode"
diff --git a/src/main/python/systemds/operator/nodes/multi_return.py b/src/main/python/systemds/operator/nodes/multi_return.py
index 90b2de5..b766157 100644
--- a/src/main/python/systemds/operator/nodes/multi_return.py
+++ b/src/main/python/systemds/operator/nodes/multi_return.py
@@ -21,15 +21,15 @@
__all__ = ["MultiReturn"]
-from typing import Dict, Sequence, Tuple, Union, Iterable, List
+from typing import Dict, Iterable, List, Sequence, Tuple, Union
import numpy as np
from py4j.java_gateway import JavaObject
-
from systemds.operator import OperationNode
from systemds.script_building.dag import OutputType
from systemds.utils.consts import VALID_INPUT_TYPES
-from systemds.utils.converters import matrix_block_to_numpy,frame_block_to_pandas
+from systemds.utils.converters import (frame_block_to_pandas,
+ matrix_block_to_numpy)
from systemds.utils.helpers import create_params_string
@@ -68,7 +68,7 @@ class MultiReturn(OperationNode):
result_var = []
jvmV = self.sds_context.java_gateway.jvm
for idx, v in enumerate(self._script.out_var_name):
- out_type =self._outputs[idx].output_type
+ out_type = self._outputs[idx].output_type
if out_type == OutputType.MATRIX:
result_var.append(
matrix_block_to_numpy(jvmV, result_variables.getMatrixBlock(v)))
@@ -78,7 +78,8 @@ class MultiReturn(OperationNode):
elif out_type == OutputType.DOUBLE:
result_var.append(result_variables.getDouble(v))
else:
- raise NotImplementedError("Not Implemented Support of type" + out_type)
+ raise NotImplementedError(
+ "Not Implemented Support of type" + out_type)
return result_var
def __iter__(self):
diff --git a/src/main/python/systemds/operator/nodes/scalar.py b/src/main/python/systemds/operator/nodes/scalar.py
index 5078ac0..511ad34 100644
--- a/src/main/python/systemds/operator/nodes/scalar.py
+++ b/src/main/python/systemds/operator/nodes/scalar.py
@@ -22,16 +22,16 @@
__all__ = ["Scalar"]
import os
-from typing import Dict, Optional, Sequence, Tuple, Union, TYPE_CHECKING, Iterable
+from typing import (TYPE_CHECKING, Dict, Iterable, Optional, Sequence, Tuple,
+ Union)
import numpy as np
from py4j.java_gateway import JavaObject, JVMView
from systemds.operator import OperationNode
-from systemds.utils.consts import VALID_INPUT_TYPES
-from systemds.utils.converters import numpy_to_matrix_block
from systemds.script_building.dag import OutputType
-
-from systemds.utils.consts import VALID_INPUT_TYPES, BINARY_OPERATIONS, VALID_ARITHMETIC_TYPES
+from systemds.utils.consts import (BINARY_OPERATIONS, VALID_ARITHMETIC_TYPES,
+ VALID_INPUT_TYPES)
+from systemds.utils.converters import numpy_to_matrix_block
class Scalar(OperationNode):
@@ -66,7 +66,8 @@ class Scalar(OperationNode):
elif self.output_type == OutputType.STRING:
return result_variables.getString(self._script.out_var_name[0])
else:
- raise NotImplemented("Not currently support scalar type: " + self.output_type)
+ raise NotImplemented(
+ "Not currently support scalar type: " + self.output_type)
def __add__(self, other: VALID_ARITHMETIC_TYPES) -> 'Scalar':
return Scalar(self.sds_context, '+', [self, other])
@@ -226,4 +227,3 @@ class Scalar(OperationNode):
def __str__(self):
return "ScalarNode"
-
diff --git a/src/main/python/systemds/operator/nodes/source.py b/src/main/python/systemds/operator/nodes/source.py
index d027209..3a61f11 100644
--- a/src/main/python/systemds/operator/nodes/source.py
+++ b/src/main/python/systemds/operator/nodes/source.py
@@ -26,7 +26,7 @@ from typing import (TYPE_CHECKING, Dict, Iterable, Optional, Sequence, Tuple,
Union)
import numpy as np
-from systemds.operator import Matrix, OperationNode, Scalar, List
+from systemds.operator import List, Matrix, OperationNode, Scalar
from systemds.script_building.dag import OutputType
@@ -48,7 +48,7 @@ class Func(object):
argument_string, named_arguments = self.parse_inputs()
named_intput_nodes = f'named_arguments = {{{named_arguments}}}'
output_object = self.parse_outputs()
-
+
definition = f'def {self._name}(self{argument_string}):'
if self._outputs is None:
output = f'out = {output_object}(self.sds_context, {operation}, named_input_nodes=named_arguments, output_type=OutputType.NONE)'
@@ -101,13 +101,13 @@ class Func(object):
elif var_l[0] == 'i': # integer
if "integer" in var_l:
return (self.split_to_value_and_def(var[7:]), 'Scalar')
- else: # int
+ else: # int
return (self.split_to_value_and_def(var[3:]), 'Scalar')
elif var_l[0] == 'b': # boolean
return (self.split_to_value_and_def(var[7:], True), 'Scalar')
- elif var_l[0] == 'l': # list[unknown]
+ elif var_l[0] == 'l': # list[unknown]
return (self.split_to_value_and_def(var[13:]), 'List')
- elif var_l[0] == 's': # string
+ elif var_l[0] == 's': # string
return (self.split_to_value_and_def(var[6:]), 'Scalar')
else:
raise NotImplementedError(
diff --git a/src/main/python/systemds/operator/operation_node.py b/src/main/python/systemds/operator/operation_node.py
index acc3988..99d823b 100644
--- a/src/main/python/systemds/operator/operation_node.py
+++ b/src/main/python/systemds/operator/operation_node.py
@@ -147,7 +147,7 @@ class OperationNode(DAGNode):
assert len(
unnamed_input_vars) == 2, 'Binary Operations need exactly two input variables'
return f'{var_name}={unnamed_input_vars[0]}{self.operation}{unnamed_input_vars[1]}'
-
+
inputs_comma_sep = create_params_string(
unnamed_input_vars, named_input_vars)
diff --git a/src/main/python/systemds/utils/converters.py b/src/main/python/systemds/utils/converters.py
index 342c7a3..9f88271 100644
--- a/src/main/python/systemds/utils/converters.py
+++ b/src/main/python/systemds/utils/converters.py
@@ -19,10 +19,11 @@
#
# -------------------------------------------------------------
+
import numpy as np
import pandas as pd
-import time
-from py4j.java_gateway import JavaClass, JavaObject, JVMView, JavaGateway
+from py4j.java_gateway import JavaClass, JavaGateway, JavaObject, JVMView
+
def numpy_to_matrix_block(sds: 'SystemDSContext', np_arr: np.array):
"""Converts a given numpy array, to internal matrix block representation.
@@ -33,6 +34,7 @@ def numpy_to_matrix_block(sds: 'SystemDSContext', np_arr: np.array):
assert (np_arr.ndim <= 2), "np_arr invalid, because it has more than 2 dimensions"
rows = np_arr.shape[0]
cols = np_arr.shape[1] if np_arr.ndim == 2 else 1
+
# If not numpy array then convert to numpy array
if not isinstance(np_arr, np.ndarray):
np_arr = np.asarray(np_arr, dtype=np.float64)
@@ -133,7 +135,7 @@ def pandas_to_frame_block(sds: "SystemDSContext", pd_df: pd.DataFrame):
def frame_block_to_pandas(sds: "SystemDSContext", fb: JavaObject):
- start = time.time()
+
num_rows = fb.getNumRows()
num_cols = fb.getNumColumns()
data = []
@@ -156,9 +158,17 @@ def frame_block_to_pandas(sds: "SystemDSContext", fb: JavaObject):
elif d_type == "Long":
byteArray = fb.getColumnAsBytes(c_index)
ret = np.frombuffer(byteArray, dtype=np.int64)
+ elif d_type == "Double":
+ byteArray = fb.getColumnAsBytes(c_index)
+ ret = np.frombuffer(byteArray, dtype=np.float64)
+ elif d_type == "Boolean":
+ # TODO maybe it is more efficient to bit pack the booleans.
+ # https://stackoverflow.com/questions/5602155/numpy-boolean-array-with-1-bit-entries
+ byteArray = fb.getColumnAsBytes(c_index)
+ ret = np.frombuffer(byteArray, dtype=np.dtype("?"))
else:
- raise NotImplementedError(f'Not Implemented {d_type} for systemds to pandas parsing')
+ raise NotImplementedError(
+ f'Not Implemented {d_type} for systemds to pandas parsing')
df[fb.getColumnName(c_index)] = ret
-
return df
diff --git a/src/main/python/systemds/utils/helpers.py b/src/main/python/systemds/utils/helpers.py
index 67b4f8d..83ca596 100644
--- a/src/main/python/systemds/utils/helpers.py
+++ b/src/main/python/systemds/utils/helpers.py
@@ -1,4 +1,4 @@
-#-------------------------------------------------------------
+# -------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
@@ -17,12 +17,12 @@
# specific language governing permissions and limitations
# under the License.
#
-#-------------------------------------------------------------
+# -------------------------------------------------------------
import os
-from itertools import chain
-from typing import Iterable, Dict
from importlib.util import find_spec
+from itertools import chain
+from typing import Dict, Iterable
from systemds.utils.consts import MODULE_NAME
@@ -53,7 +53,8 @@ def get_module_dir() -> os.PathLike:
def get_slice_string(i):
if isinstance(i, tuple):
if len(i) > 2:
- raise ValueError(f'Invalid number of dimensions to slice {len(i)}, Only 2 dimensions allowed')
+ raise ValueError(
+ f'Invalid number of dimensions to slice {len(i)}, Only 2 dimensions allowed')
else:
return f'{get_slice_string(i[0])},{get_slice_string(i[1])}'
elif isinstance(i, slice):
@@ -69,4 +70,4 @@ def get_slice_string(i):
else:
# + 1 since R and systemDS is 1 indexed.
sliceIns = i+1
- return sliceIns
\ No newline at end of file
+ return sliceIns
diff --git a/src/main/python/tests/examples/tutorials/test_adult.py b/src/main/python/tests/examples/tutorials/test_adult.py
index b2e4f84..ddafc96 100644
--- a/src/main/python/tests/examples/tutorials/test_adult.py
+++ b/src/main/python/tests/examples/tutorials/test_adult.py
@@ -24,8 +24,10 @@ import unittest
import numpy as np
from systemds.context import SystemDSContext
from systemds.examples.tutorials.adult import DataManager
-from systemds.operator import OperationNode, Matrix, Frame
-from systemds.operator.algorithm import kmeans, multiLogReg, multiLogRegPredict, l2svm, confusionMatrix, scale, scaleApply, split, winsorize
+from systemds.operator import Frame, Matrix, OperationNode
+from systemds.operator.algorithm import (confusionMatrix, kmeans, l2svm,
+ multiLogReg, multiLogRegPredict,
+ scale, scaleApply, split, winsorize)
from systemds.script_building import DMLScript
@@ -53,62 +55,102 @@ class Test_DMLScript(unittest.TestCase):
def tearDownClass(cls):
cls.sds.close()
- # def test_train_data(self):
- # x = self.d.get_train_data_pandas()
- # self.assertEqual((32561, 14), x.shape)
+ def test_train_data(self):
+ x = self.d.get_train_data_pandas()
+ self.assertEqual((32561, 14), x.shape)
- # def test_train_labels(self):
- # y = self.d.get_train_labels_pandas()
- # self.assertEqual((32561,), y.shape)
+ def test_train_labels(self):
+ y = self.d.get_train_labels_pandas()
+ self.assertEqual((32561,), y.shape)
- # def test_test_data(self):
- # x_l = self.d.get_test_data_pandas()
- # self.assertEqual((16281, 14), x_l.shape)
+ def test_test_data(self):
+ x_l = self.d.get_test_data_pandas()
+ self.assertEqual((16281, 14), x_l.shape)
- # def test_test_labels(self):
- # y_l = self.d.get_test_labels_pandas()
- # self.assertEqual((16281,), y_l.shape)
+ def test_test_labels(self):
+ y_l = self.d.get_test_labels_pandas()
+ self.assertEqual((16281,), y_l.shape)
def test_train_data_pandas_vs_systemds(self):
pandas = self.d.get_train_data_pandas()
- systemds = self.d.get_train_data(self.sds).compute(verbose=True)
- print(pandas)
- print(systemds)
- # self.assertEqual(pandas, systemds)
-
-
- # def test_multi_log_reg(self):
- # # Reduced because we want the tests to finish a bit faster.
- # train_count = 15000
- # test_count = 5000
-
- # train_data, train_labels, test_data, test_labels = self.d.get_preprocessed_dataset()
-
- # # Train data
- # X = self.sds.from_numpy( train_data[:train_count])
- # Y = self.sds.from_numpy( train_labels[:train_count])
- # Y = Y + 1.0
-
- # # Test data
- # Xt = self.sds.from_numpy(test_data[:test_count])
- # Yt = self.sds.from_numpy(test_labels[:test_count])
- # Yt = Yt + 1.0
-
- # betas = multiLogReg(X, Y)
-
- # [_, y_pred, acc] = multiLogRegPredict(Xt, betas, Yt).compute()
-
- # self.assertGreater(acc, 80)
-
- # confusion_matrix_abs, _ = confusionMatrix(self.sds.from_numpy(y_pred), Yt).compute()
-
- # self.assertTrue(
- # np.allclose(
- # confusion_matrix_abs,
- # np.array([[3503, 503],
- # [268, 726]])
- # )
- # )
+ systemds = self.d.get_train_data(self.sds).compute()
+ self.assertTrue(len(pandas.columns.difference(systemds.columns)) == 0)
+ self.assertEqual(pandas.shape, systemds.shape)
+
+ def test_train_labels_pandas_vs_systemds(self):
+ # Pandas does not strip the parsed values.. so i have to do it here.
+ pandas = np.array(
+ [x.strip() for x in self.d.get_train_labels_pandas().to_numpy().flatten()])
+ systemds = self.d.get_train_labels(
+ self.sds).compute().to_numpy().flatten()
+ comp = pandas == systemds
+ self.assertTrue(comp.all())
+
+ def test_test_labels_pandas_vs_systemds(self):
+ # Pandas does not strip the parsed values.. so i have to do it here.
+ pandas = np.array(
+ [x.strip() for x in self.d.get_test_labels_pandas().to_numpy().flatten()])
+ systemds = self.d.get_test_labels(
+ self.sds).compute().to_numpy().flatten()
+ comp = pandas == systemds
+ self.assertTrue(comp.all())
+
+ def test_transform_encode_train_data(self):
+ jspec = self.d.get_jspec(self.sds)
+ train_x, M1 = self.d.get_train_data(self.sds).transform_encode(spec=jspec)
+ train_x_numpy = train_x.compute()
+ self.assertEqual((32561, 107), train_x_numpy.shape)
+
+ def test_transform_encode_apply_test_data(self):
+ jspec = self.d.get_jspec(self.sds)
+ train_x, M1 = self.d.get_train_data(self.sds).transform_encode(spec=jspec)
+ test_x = self.d.get_test_data(self.sds).transform_apply(spec=jspec, meta=M1)
+ test_x_numpy = test_x.compute()
+ self.assertEqual((16281, 107), test_x_numpy.shape)
+
+ def test_transform_encode_train_labels(self):
+ jspec_dict = {"recode":["income"]}
+ jspec = self.sds.scalar(f'"{jspec_dict}"')
+ train_y, M1 = self.d.get_train_labels(self.sds).transform_encode(spec=jspec)
+ train_y_numpy = train_y.compute()
+ self.assertEqual((32561, 1), train_y_numpy.shape)
+
+ def test_transform_encode_test_labels(self):
+ jspec_dict = {"recode":["income"]}
+ jspec = self.sds.scalar(f'"{jspec_dict}"')
+ train_y, M1 = self.d.get_train_labels(self.sds).transform_encode(spec=jspec)
+ test_y = self.d.get_test_labels(self.sds).transform_apply(spec=jspec, meta=M1)
+ test_y_numpy = test_y.compute()
+ self.assertEqual((16281, 1), test_y_numpy.shape)
+
+ def test_multi_log_reg(self):
+ # Reduced because we want the tests to finish a bit faster.
+ train_count = 10000
+ test_count = 500
+
+ jspec_data = self.d.get_jspec(self.sds)
+ train_x_frame = self.d.get_train_data(self.sds)[0:train_count]
+ train_x, M1 = train_x_frame.transform_encode(spec=jspec_data)
+ test_x_frame = self.d.get_test_data(self.sds)[0:test_count]
+ test_x = test_x_frame.transform_apply(spec=jspec_data, meta=M1)
+
+ jspec_dict = {"recode": ["income"]}
+ jspec_labels = self.sds.scalar(f'"{jspec_dict}"')
+ train_y_frame = self.d.get_train_labels(self.sds)[0:train_count]
+ train_y, M2 = train_y_frame.transform_encode(spec=jspec_labels)
+ test_y_frame = self.d.get_test_labels(self.sds)[0:test_count]
+ test_y = test_y_frame.transform_apply(spec=jspec_labels, meta=M2)
+
+ betas = multiLogReg(train_x, train_y)
+ [_, y_pred, acc] = multiLogRegPredict(test_x, betas, test_y)
+
+ [_, conf_avg] = confusionMatrix(y_pred, test_y)
+ confusion_numpy = conf_avg.compute()
+
+ self.assertTrue(confusion_numpy[0][0] > 0.8)
+ self.assertTrue(confusion_numpy[0][1] < 0.5)
+ self.assertTrue(confusion_numpy[1][1] > 0.5)
+ self.assertTrue(confusion_numpy[1][0] < 0.2)
# def test_neural_net(self):
# # Reduced because we want the tests to finish a bit faster.
@@ -137,8 +179,6 @@ class Test_DMLScript(unittest.TestCase):
# #probs = FFN_package.predict(Xt, network).compute(True)
# # FFN_package.eval(Yt, Yt).compute()
-
-
# def test_level1(self):
# # Reduced because we want the tests to finish a bit faster.
# train_count = 15000
@@ -319,6 +359,5 @@ class Test_DMLScript(unittest.TestCase):
# ################################################################################################################
-
if __name__ == "__main__":
unittest.main(exit=False)