You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by du...@apache.org on 2016/09/02 21:06:49 UTC
incubator-systemml git commit: [SYSTEMML-895] Cleanup Python Package
Repository: incubator-systemml
Updated Branches:
refs/heads/master 1f6d55e9e -> 701b9e319
[SYSTEMML-895] Cleanup Python Package
Various cleanups of the Python code to be more Pythonic, adhering to PEP 8.
Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/701b9e31
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/701b9e31
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/701b9e31
Branch: refs/heads/master
Commit: 701b9e319daa8140faff5193e2f4d0401f55db0d
Parents: 1f6d55e
Author: Mike Dusenberry <mw...@us.ibm.com>
Authored: Fri Sep 2 13:59:00 2016 -0700
Committer: Mike Dusenberry <mw...@us.ibm.com>
Committed: Fri Sep 2 13:59:00 2016 -0700
----------------------------------------------------------------------
src/main/python/setup.py | 23 ++++---
src/main/python/systemml/__init__.py | 7 +--
src/main/python/systemml/converters.py | 25 +++-----
src/main/python/systemml/defmatrix.py | 19 +++---
src/main/python/systemml/mlcontext.py | 4 +-
src/main/python/systemml/mllearn/__init__.py | 7 +--
src/main/python/systemml/mllearn/estimators.py | 69 ++++++++++-----------
7 files changed, 75 insertions(+), 79 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/701b9e31/src/main/python/setup.py
----------------------------------------------------------------------
diff --git a/src/main/python/setup.py b/src/main/python/setup.py
index cc8f373..5cb498f 100644
--- a/src/main/python/setup.py
+++ b/src/main/python/setup.py
@@ -1,4 +1,3 @@
-#!/usr/bin/python
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
@@ -8,9 +7,9 @@
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
-#
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -20,8 +19,8 @@
#
#-------------------------------------------------------------
-from setuptools import setup, find_packages
import os
+from setuptools import find_packages, setup
import time
VERSION = '0.11.0.dev1'
@@ -37,22 +36,22 @@ PACKAGE_DATA = []
for path, subdirs, files in os.walk('systemml/systemml-java'):
for name in files:
PACKAGE_DATA = PACKAGE_DATA + [ os.path.join(path, name).replace('./', '') ]
-
+
setup(
- name='SystemML',
+ name='systemml',
version=VERSION,
description='Apache SystemML is a distributed and declarative machine learning platform.',
long_description='''
-
+
Apache SystemML is an effort undergoing incubation at the Apache Software Foundation (ASF), sponsored by the Apache Incubator PMC.
While incubation status is not necessarily a reflection of the completeness
or stability of the code, it does indicate that the project has yet to be
fully endorsed by the ASF.
-
- Apache SystemML provides declarative large-scale machine learning (ML) that aims at
- flexible specification of ML algorithms and automatic generation of hybrid runtime
+
+ Apache SystemML provides declarative large-scale machine learning (ML) that aims at
+ flexible specification of ML algorithms and automatic generation of hybrid runtime
plans ranging from single-node, in-memory computations, to distributed computations on Apache Hadoop and Apache Spark.
-
+
Note: This is not a released version and was built with SNAPSHOT available on the date''' + RELEASED_DATE,
url='http://systemml.apache.org/',
author='Apache SystemML',
@@ -74,4 +73,4 @@ setup(
'Topic :: Software Development :: Libraries',
],
license='Apache 2.0',
- )
\ No newline at end of file
+ )
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/701b9e31/src/main/python/systemml/__init__.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/__init__.py b/src/main/python/systemml/__init__.py
index 02a940b..04d521b 100644
--- a/src/main/python/systemml/__init__.py
+++ b/src/main/python/systemml/__init__.py
@@ -1,4 +1,3 @@
-#!/usr/bin/python
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
@@ -8,9 +7,9 @@
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
-#
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -26,4 +25,4 @@ from .converters import *
__all__ = mlcontext.__all__
__all__ += defmatrix.__all__
-__all__ += converters.__all__
\ No newline at end of file
+__all__ += converters.__all__
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/701b9e31/src/main/python/systemml/converters.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/converters.py b/src/main/python/systemml/converters.py
index 9588bec..243a507 100644
--- a/src/main/python/systemml/converters.py
+++ b/src/main/python/systemml/converters.py
@@ -1,4 +1,3 @@
-#!/usr/bin/python
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
@@ -8,9 +7,9 @@
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
-#
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -20,15 +19,12 @@
#
#-------------------------------------------------------------
-from pyspark.context import SparkContext
-from pyspark.sql import DataFrame, SQLContext
-from pyspark.rdd import RDD
+__all__ = [ 'getNumCols', 'convertToMatrixBlock', 'convertToNumpyArr', 'convertToPandasDF', 'SUPPORTED_TYPES' , 'convertToLabeledDF']
+
import numpy as np
import pandas as pd
-import sklearn as sk
-
-from scipy.sparse import spmatrix
-from scipy.sparse import coo_matrix
+from pyspark.context import SparkContext
+from scipy.sparse import coo_matrix, spmatrix
SUPPORTED_TYPES = (np.ndarray, pd.DataFrame, spmatrix)
@@ -37,7 +33,8 @@ def getNumCols(numPyArr):
return 1
else:
return numPyArr.shape[1]
-
+
+
def convertToLabeledDF(sqlCtx, X, y=None):
from pyspark.ml.feature import VectorAssembler
if y is not None:
@@ -56,7 +53,7 @@ def convertToLabeledDF(sqlCtx, X, y=None):
return out.select('features', 'label')
else:
return out.select('features')
-
+
def convertToMatrixBlock(sc, src):
if isinstance(src, spmatrix):
@@ -80,7 +77,7 @@ def convertToMatrixBlock(sc, src):
return sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertPy4JArrayToMB(buf, numRows, numCols)
else:
raise TypeError('sc needs to be of type SparkContext') # TODO: We can generalize this by creating py4j gateway ourselves
-
+
def convertToNumpyArr(sc, mb):
if isinstance(sc, SparkContext):
@@ -96,5 +93,3 @@ def convertToPandasDF(X):
if not isinstance(X, pd.DataFrame):
return pd.DataFrame(X, columns=['C' + str(i) for i in range(getNumCols(X))])
return X
-
-__all__ = [ 'getNumCols', 'convertToMatrixBlock', 'convertToNumpyArr', 'convertToPandasDF', 'SUPPORTED_TYPES' , 'convertToLabeledDF']
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/701b9e31/src/main/python/systemml/defmatrix.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/defmatrix.py b/src/main/python/systemml/defmatrix.py
index 18f6314..2994092 100644
--- a/src/main/python/systemml/defmatrix.py
+++ b/src/main/python/systemml/defmatrix.py
@@ -1,4 +1,3 @@
-#!/usr/bin/python
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
@@ -20,13 +19,14 @@
#
#-------------------------------------------------------------
-import numpy as np
+__all__ = [ 'setSparkContext', 'matrix', 'eval', 'solve']
-from . import pydml, MLContext
-from .converters import *
-from pyspark import SparkContext, RDD
+from pyspark import SparkContext
from pyspark.sql import DataFrame, SQLContext
+from . import MLContext, pydml
+from .converters import *
+
def setSparkContext(sc):
"""
Before using the matrix, the user needs to invoke this function.
@@ -39,10 +39,12 @@ def setSparkContext(sc):
matrix.ml = MLContext(sc)
matrix.sc = sc
+
def checkIfMLContextIsSet():
if matrix.ml is None:
raise Exception('Expected setSparkContext(sc) to be called.')
+
class DMLOp(object):
"""
Represents an intermediate node of Abstract syntax tree created to generate the PyDML script
@@ -63,6 +65,7 @@ def reset():
m.visited = False
matrix.visited = []
+
def binaryOp(lhs, rhs, opStr):
"""
Common function called by all the binary operators in matrix class
@@ -87,6 +90,7 @@ def binaryOp(lhs, rhs, opStr):
dmlOp.dml = [out.ID, ' = ', lhsStr, opStr, rhsStr, '\n']
return out
+
def binaryMatrixFunction(X, Y, fnName):
"""
Common function called by supported PyDML built-in function that has two arguments both of which are matrices.
@@ -100,6 +104,7 @@ def binaryMatrixFunction(X, Y, fnName):
dmlOp.dml = [out.ID, ' = ', fnName,'(', X.ID, ', ', Y.ID, ')\n']
return out
+
def solve(A, b):
"""
Computes the least squares solution for system of linear equations A %*% x = b
@@ -128,6 +133,7 @@ def solve(A, b):
"""
return binaryMatrixFunction(A, b, 'solve')
+
def eval(outputs, outputDF=False, execute=True):
"""
Executes the unevaluated DML script and computes the matrices specified by outputs.
@@ -159,6 +165,7 @@ def eval(outputs, outputDF=False, execute=True):
else:
m.data = results.getNumPyArray(m.ID)
+
class matrix(object):
"""
matrix class is a python wrapper that implements basic matrix operator.
@@ -406,5 +413,3 @@ class matrix(object):
def dot(self, other):
return binaryMatrixFunction(self, other, 'dot')
-
-__all__ = [ 'setSparkContext', 'matrix', 'eval', 'solve']
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/701b9e31/src/main/python/systemml/mlcontext.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/mlcontext.py b/src/main/python/systemml/mlcontext.py
index 1b90e70..c578a8e 100644
--- a/src/main/python/systemml/mlcontext.py
+++ b/src/main/python/systemml/mlcontext.py
@@ -1,4 +1,3 @@
-#!/usr/bin/python
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
@@ -19,6 +18,7 @@
# under the License.
#
#-------------------------------------------------------------
+
import os
try:
@@ -26,10 +26,10 @@ try:
from py4j.java_gateway import JavaObject
except ImportError:
raise ImportError('Unable to import JavaObject from py4j.java_gateway. Hint: Make sure you are running with pyspark')
-
from pyspark import SparkContext
import pyspark.mllib.common
from pyspark.sql import DataFrame, SQLContext
+
from .converters import *
def dml(scriptString):
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/701b9e31/src/main/python/systemml/mllearn/__init__.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/mllearn/__init__.py b/src/main/python/systemml/mllearn/__init__.py
index 69cab58..8132405 100644
--- a/src/main/python/systemml/mllearn/__init__.py
+++ b/src/main/python/systemml/mllearn/__init__.py
@@ -1,4 +1,3 @@
-#!/usr/bin/python
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
@@ -8,9 +7,9 @@
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
-#
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -22,4 +21,4 @@
from .estimators import *
-__all__ = estimators.__all__
\ No newline at end of file
+__all__ = estimators.__all__
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/701b9e31/src/main/python/systemml/mllearn/estimators.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/mllearn/estimators.py b/src/main/python/systemml/mllearn/estimators.py
index 5d33d64..97ab6bb 100644
--- a/src/main/python/systemml/mllearn/estimators.py
+++ b/src/main/python/systemml/mllearn/estimators.py
@@ -1,4 +1,3 @@
-#!/usr/bin/python
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
@@ -8,9 +7,9 @@
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
-#
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -20,15 +19,13 @@
#
#-------------------------------------------------------------
-from pyspark.context import SparkContext
-from pyspark.sql import DataFrame, SQLContext
-from pyspark.rdd import RDD
+__all__ = ['LinearRegression', 'LogisticRegression', 'SVM', 'NaiveBayes']
+
import numpy as np
-import pandas as pd
-import sklearn as sk
+from pyspark.ml import Estimator
from pyspark.ml.feature import VectorAssembler
-from pyspark.mllib.linalg import Vectors
-from pyspark.ml import Estimator, Model
+from pyspark.sql import DataFrame
+import sklearn as sk
from ..converters import *
@@ -40,32 +37,32 @@ def assemble(sqlCtx, pdf, inputCols, outputCol):
class BaseSystemMLEstimator(Estimator):
featuresCol = 'features'
labelCol = 'label'
-
+
def setFeaturesCol(self, colName):
"""
Sets the default column name for features of PySpark DataFrame.
-
+
Parameters
----------
colName: column name for features (default: 'features')
"""
self.featuresCol = colName
-
+
def setLabelCol(self, colName):
"""
Sets the default column name for labels of PySpark DataFrame.
-
+
Parameters
----------
colName: column name for labels (default: 'label')
"""
self.labelCol = colName
-
- # Returns a model after calling fit(df) on Estimator object on JVM
+
+ # Returns a model after calling fit(df) on Estimator object on JVM
def _fit(self, X):
"""
Invokes the fit method on Estimator object on JVM if X is PySpark DataFrame
-
+
Parameters
----------
X: PySpark DataFrame that contain the columns featuresCol (default: 'features') and labelCol (default: 'label')
@@ -75,11 +72,11 @@ class BaseSystemMLEstimator(Estimator):
return self
else:
raise Exception('Incorrect usage: Expected dataframe as input with features/label as columns')
-
+
def fit(self, X, y=None, params=None):
"""
Invokes the fit method on Estimator object on JVM if X and y are one of the supported data types
-
+
Parameters
----------
X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix
@@ -109,15 +106,15 @@ class BaseSystemMLEstimator(Estimator):
return self
else:
raise Exception('Unsupported input type')
-
+
def transform(self, X):
return self.predict(X)
-
- # Returns either a DataFrame or MatrixBlock after calling transform(X:MatrixBlock, y:MatrixBlock) on Model object on JVM
+
+ # Returns either a DataFrame or MatrixBlock after calling transform(X:MatrixBlock, y:MatrixBlock) on Model object on JVM
def predict(self, X):
"""
Invokes the transform method on Estimator object on JVM if X is one of the supported data types
-
+
Parameters
----------
X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix or PySpark DataFrame
@@ -152,26 +149,28 @@ class BaseSystemMLEstimator(Estimator):
return retDF.sort('ID')
else:
raise Exception('Unsupported input type')
-
+
+
class BaseSystemMLClassifier(BaseSystemMLEstimator):
def score(self, X, y):
"""
Scores the predicted value with ground truth 'y'
-
+
Parameters
----------
X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix
y: NumPy ndarray, Pandas DataFrame, scipy sparse matrix
"""
- return sk.metrics.accuracy_score(y, self.predict(X))
+ return sk.metrics.accuracy_score(y, self.predict(X))
+
class BaseSystemMLRegressor(BaseSystemMLEstimator):
def score(self, X, y):
"""
Scores the predicted value with ground truth 'y'
-
+
Parameters
----------
X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix
@@ -184,7 +183,7 @@ class LogisticRegression(BaseSystemMLClassifier):
def __init__(self, sqlCtx, penalty='l2', fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False):
"""
Performs both binomial and multinomial logistic regression.
-
+
Parameters
----------
sqlCtx: PySpark SQLContext
@@ -215,12 +214,13 @@ class LogisticRegression(BaseSystemMLClassifier):
if solver != 'newton-cg':
raise Exception('Only newton-cg solver supported')
+
class LinearRegression(BaseSystemMLRegressor):
def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False):
"""
Performs linear regression to model the relationship between one numerical response variable and one or more explanatory (feature) variables.
-
+
Parameters
----------
sqlCtx: PySpark SQLContext
@@ -228,7 +228,7 @@ class LinearRegression(BaseSystemMLRegressor):
max_iter: Maximum number of conjugate gradient iterations, or 0 if no maximum limit provided (default: 100)
tol: Tolerance used in the convergence criterion (default: 0.000001)
C: 1/regularization parameter (default: 1.0)
- solver: Supports either 'newton-cg' or 'direct-solve' (default: 'newton-cg').
+ solver: Supports either 'newton-cg' or 'direct-solve' (default: 'newton-cg').
Depending on the size and the sparsity of the feature matrix, one or the other solver may be more efficient.
'direct-solve' solver is more efficient when the number of features is relatively small (m < 1000) and
input matrix X is either tall or fairly dense; otherwise 'newton-cg' solver is more efficient.
@@ -256,7 +256,7 @@ class SVM(BaseSystemMLClassifier):
def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=False, transferUsingDF=False):
"""
Performs both binary-class and multiclass SVM (Support Vector Machines).
-
+
Parameters
----------
sqlCtx: PySpark SQLContext
@@ -278,14 +278,15 @@ class SVM(BaseSystemMLClassifier):
self.estimator.setTol(tol)
self.estimator.setIcpt(int(fit_intercept))
self.transferUsingDF = transferUsingDF
- self.setOutputRawPredictionsToFalse = False
+ self.setOutputRawPredictionsToFalse = False
+
class NaiveBayes(BaseSystemMLClassifier):
def __init__(self, sqlCtx, laplace=1.0, transferUsingDF=False):
"""
Performs Naive Bayes classification.
-
+
Parameters
----------
sqlCtx: PySpark SQLContext
@@ -298,5 +299,3 @@ class NaiveBayes(BaseSystemMLClassifier):
self.estimator.setLaplace(laplace)
self.transferUsingDF = transferUsingDF
self.setOutputRawPredictionsToFalse = False
-
-__all__ = ['LogisticRegression', 'LinearRegression', 'SVM', 'NaiveBayes']