You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by du...@apache.org on 2016/09/02 21:06:49 UTC
incubator-systemml git commit: [SYSTEMML-895] Cleanup Python Package

Repository: incubator-systemml
Updated Branches:
  refs/heads/master 1f6d55e9e -> 701b9e319


[SYSTEMML-895] Cleanup Python Package

Various cleanups of the Python code to be more Pythonic, adhering to PEP 8.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/701b9e31
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/701b9e31
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/701b9e31

Branch: refs/heads/master
Commit: 701b9e319daa8140faff5193e2f4d0401f55db0d
Parents: 1f6d55e
Author: Mike Dusenberry <mw...@us.ibm.com>
Authored: Fri Sep 2 13:59:00 2016 -0700
Committer: Mike Dusenberry <mw...@us.ibm.com>
Committed: Fri Sep 2 13:59:00 2016 -0700

----------------------------------------------------------------------
 src/main/python/setup.py                       | 23 ++++---
 src/main/python/systemml/__init__.py           |  7 +--
 src/main/python/systemml/converters.py         | 25 +++-----
 src/main/python/systemml/defmatrix.py          | 19 +++---
 src/main/python/systemml/mlcontext.py          |  4 +-
 src/main/python/systemml/mllearn/__init__.py   |  7 +--
 src/main/python/systemml/mllearn/estimators.py | 69 ++++++++++-----------
 7 files changed, 75 insertions(+), 79 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/701b9e31/src/main/python/setup.py
----------------------------------------------------------------------
diff --git a/src/main/python/setup.py b/src/main/python/setup.py
index cc8f373..5cb498f 100644
--- a/src/main/python/setup.py
+++ b/src/main/python/setup.py
@@ -1,4 +1,3 @@
-#!/usr/bin/python
 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
@@ -8,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -20,8 +19,8 @@
 #
 #-------------------------------------------------------------
 
-from setuptools import setup, find_packages
 import os
+from setuptools import find_packages, setup
 import time
 
 VERSION = '0.11.0.dev1'
@@ -37,22 +36,22 @@ PACKAGE_DATA = []
 for path, subdirs, files in os.walk('systemml/systemml-java'):
     for name in files:
         PACKAGE_DATA = PACKAGE_DATA + [ os.path.join(path, name).replace('./', '') ]
-        
+
 setup(
-    name='SystemML',
+    name='systemml',
     version=VERSION,
     description='Apache SystemML is a distributed and declarative machine learning platform.',
     long_description='''
-    
+
     Apache SystemML is an effort undergoing incubation at the Apache Software Foundation (ASF), sponsored by the Apache Incubator PMC.
     While incubation status is not necessarily a reflection of the completeness
     or stability of the code, it does indicate that the project has yet to be
     fully endorsed by the ASF.
-    
-    Apache SystemML provides declarative large-scale machine learning (ML) that aims at 
-    flexible specification of ML algorithms and automatic generation of hybrid runtime 
+
+    Apache SystemML provides declarative large-scale machine learning (ML) that aims at
+    flexible specification of ML algorithms and automatic generation of hybrid runtime
     plans ranging from single-node, in-memory computations, to distributed computations on Apache Hadoop and Apache Spark.
-    
+
     Note: This is not a released version and was built with SNAPSHOT available on the date''' + RELEASED_DATE,
     url='http://systemml.apache.org/',
     author='Apache SystemML',
@@ -74,4 +73,4 @@ setup(
         'Topic :: Software Development :: Libraries',
         ],
     license='Apache 2.0',
-    )
\ No newline at end of file
+    )

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/701b9e31/src/main/python/systemml/__init__.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/__init__.py b/src/main/python/systemml/__init__.py
index 02a940b..04d521b 100644
--- a/src/main/python/systemml/__init__.py
+++ b/src/main/python/systemml/__init__.py
@@ -1,4 +1,3 @@
-#!/usr/bin/python
 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
@@ -8,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -26,4 +25,4 @@ from .converters import *
 
 __all__ = mlcontext.__all__
 __all__ += defmatrix.__all__
-__all__ += converters.__all__
\ No newline at end of file
+__all__ += converters.__all__

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/701b9e31/src/main/python/systemml/converters.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/converters.py b/src/main/python/systemml/converters.py
index 9588bec..243a507 100644
--- a/src/main/python/systemml/converters.py
+++ b/src/main/python/systemml/converters.py
@@ -1,4 +1,3 @@
-#!/usr/bin/python
 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
@@ -8,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -20,15 +19,12 @@
 #
 #-------------------------------------------------------------
 
-from pyspark.context import SparkContext 
-from pyspark.sql import DataFrame, SQLContext
-from pyspark.rdd import RDD
+__all__ = [ 'getNumCols', 'convertToMatrixBlock', 'convertToNumpyArr', 'convertToPandasDF', 'SUPPORTED_TYPES' , 'convertToLabeledDF']
+
 import numpy as np
 import pandas as pd
-import sklearn as sk
-
-from scipy.sparse import spmatrix
-from scipy.sparse import coo_matrix
+from pyspark.context import SparkContext
+from scipy.sparse import coo_matrix, spmatrix
 
 SUPPORTED_TYPES = (np.ndarray, pd.DataFrame, spmatrix)
 
@@ -37,7 +33,8 @@ def getNumCols(numPyArr):
         return 1
     else:
         return numPyArr.shape[1]
-       
+
+
 def convertToLabeledDF(sqlCtx, X, y=None):
     from pyspark.ml.feature import VectorAssembler
     if y is not None:
@@ -56,7 +53,7 @@ def convertToLabeledDF(sqlCtx, X, y=None):
         return out.select('features', 'label')
     else:
         return out.select('features')
-    
+
 
 def convertToMatrixBlock(sc, src):
     if isinstance(src, spmatrix):
@@ -80,7 +77,7 @@ def convertToMatrixBlock(sc, src):
         return sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertPy4JArrayToMB(buf, numRows, numCols)
     else:
         raise TypeError('sc needs to be of type SparkContext') # TODO: We can generalize this by creating py4j gateway ourselves
-    
+
 
 def convertToNumpyArr(sc, mb):
     if isinstance(sc, SparkContext):
@@ -96,5 +93,3 @@ def convertToPandasDF(X):
     if not isinstance(X, pd.DataFrame):
         return pd.DataFrame(X, columns=['C' + str(i) for i in range(getNumCols(X))])
     return X
-    
-__all__ = [ 'getNumCols', 'convertToMatrixBlock', 'convertToNumpyArr', 'convertToPandasDF', 'SUPPORTED_TYPES' , 'convertToLabeledDF']

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/701b9e31/src/main/python/systemml/defmatrix.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/defmatrix.py b/src/main/python/systemml/defmatrix.py
index 18f6314..2994092 100644
--- a/src/main/python/systemml/defmatrix.py
+++ b/src/main/python/systemml/defmatrix.py
@@ -1,4 +1,3 @@
-#!/usr/bin/python
 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
@@ -20,13 +19,14 @@
 #
 #-------------------------------------------------------------
 
-import numpy as np
+__all__ = [ 'setSparkContext', 'matrix', 'eval', 'solve']
 
-from . import pydml, MLContext
-from .converters import *
-from pyspark import SparkContext, RDD
+from pyspark import SparkContext
 from pyspark.sql import DataFrame, SQLContext
 
+from . import MLContext, pydml
+from .converters import *
+
 def setSparkContext(sc):
     """
     Before using the matrix, the user needs to invoke this function.
@@ -39,10 +39,12 @@ def setSparkContext(sc):
     matrix.ml = MLContext(sc)
     matrix.sc = sc
 
+
 def checkIfMLContextIsSet():
     if matrix.ml is None:
         raise Exception('Expected setSparkContext(sc) to be called.')
 
+
 class DMLOp(object):
     """
     Represents an intermediate node of Abstract syntax tree created to generate the PyDML script
@@ -63,6 +65,7 @@ def reset():
         m.visited = False
     matrix.visited = []
 
+
 def binaryOp(lhs, rhs, opStr):
     """
     Common function called by all the binary operators in matrix class
@@ -87,6 +90,7 @@ def binaryOp(lhs, rhs, opStr):
     dmlOp.dml = [out.ID, ' = ', lhsStr, opStr, rhsStr, '\n']
     return out
 
+
 def binaryMatrixFunction(X, Y, fnName):
     """
     Common function called by supported PyDML built-in function that has two arguments both of which are matrices.
@@ -100,6 +104,7 @@ def binaryMatrixFunction(X, Y, fnName):
     dmlOp.dml = [out.ID, ' = ', fnName,'(', X.ID, ', ', Y.ID, ')\n']
     return out
 
+
 def solve(A, b):
     """
     Computes the least squares solution for system of linear equations A %*% x = b
@@ -128,6 +133,7 @@ def solve(A, b):
     """
     return binaryMatrixFunction(A, b, 'solve')
 
+
 def eval(outputs, outputDF=False, execute=True):
     """
     Executes the unevaluated DML script and computes the matrices specified by outputs.
@@ -159,6 +165,7 @@ def eval(outputs, outputDF=False, execute=True):
         else:
             m.data = results.getNumPyArray(m.ID)
 
+
 class matrix(object):
     """
     matrix class is a python wrapper that implements basic matrix operator.
@@ -406,5 +413,3 @@ class matrix(object):
 
     def dot(self, other):
         return binaryMatrixFunction(self, other, 'dot')
-
-__all__ = [ 'setSparkContext', 'matrix', 'eval', 'solve']

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/701b9e31/src/main/python/systemml/mlcontext.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/mlcontext.py b/src/main/python/systemml/mlcontext.py
index 1b90e70..c578a8e 100644
--- a/src/main/python/systemml/mlcontext.py
+++ b/src/main/python/systemml/mlcontext.py
@@ -1,4 +1,3 @@
-#!/usr/bin/python
 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
@@ -19,6 +18,7 @@
 # under the License.
 #
 #-------------------------------------------------------------
+
 import os
 
 try:
@@ -26,10 +26,10 @@ try:
     from py4j.java_gateway import JavaObject
 except ImportError:
     raise ImportError('Unable to import JavaObject from py4j.java_gateway. Hint: Make sure you are running with pyspark')
-
 from pyspark import SparkContext
 import pyspark.mllib.common
 from pyspark.sql import DataFrame, SQLContext
+
 from .converters import *
 
 def dml(scriptString):

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/701b9e31/src/main/python/systemml/mllearn/__init__.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/mllearn/__init__.py b/src/main/python/systemml/mllearn/__init__.py
index 69cab58..8132405 100644
--- a/src/main/python/systemml/mllearn/__init__.py
+++ b/src/main/python/systemml/mllearn/__init__.py
@@ -1,4 +1,3 @@
-#!/usr/bin/python
 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
@@ -8,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -22,4 +21,4 @@
 
 from .estimators import *
 
-__all__ = estimators.__all__
\ No newline at end of file
+__all__ = estimators.__all__

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/701b9e31/src/main/python/systemml/mllearn/estimators.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/mllearn/estimators.py b/src/main/python/systemml/mllearn/estimators.py
index 5d33d64..97ab6bb 100644
--- a/src/main/python/systemml/mllearn/estimators.py
+++ b/src/main/python/systemml/mllearn/estimators.py
@@ -1,4 +1,3 @@
-#!/usr/bin/python
 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
@@ -8,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -20,15 +19,13 @@
 #
 #-------------------------------------------------------------
 
-from pyspark.context import SparkContext 
-from pyspark.sql import DataFrame, SQLContext
-from pyspark.rdd import RDD
+__all__ = ['LinearRegression', 'LogisticRegression', 'SVM', 'NaiveBayes']
+
 import numpy as np
-import pandas as pd
-import sklearn as sk
+from pyspark.ml import Estimator
 from pyspark.ml.feature import VectorAssembler
-from pyspark.mllib.linalg import Vectors
-from pyspark.ml import Estimator, Model
+from pyspark.sql import DataFrame
+import sklearn as sk
 
 from ..converters import *
 
@@ -40,32 +37,32 @@ def assemble(sqlCtx, pdf, inputCols, outputCol):
 class BaseSystemMLEstimator(Estimator):
     featuresCol = 'features'
     labelCol = 'label'
-    
+
     def setFeaturesCol(self, colName):
         """
         Sets the default column name for features of PySpark DataFrame.
-        
+
         Parameters
         ----------
         colName: column name for features (default: 'features')
         """
         self.featuresCol = colName
-        
+
     def setLabelCol(self, colName):
         """
         Sets the default column name for features of PySpark DataFrame.
-        
+
         Parameters
         ----------
         colName: column name for features (default: 'label')
         """
         self.labelCol = colName
-        
-    # Returns a model after calling fit(df) on Estimator object on JVM    
+
+    # Returns a model after calling fit(df) on Estimator object on JVM
     def _fit(self, X):
         """
         Invokes the fit method on Estimator object on JVM if X is PySpark DataFrame
-        
+
         Parameters
         ----------
         X: PySpark DataFrame that contain the columns featuresCol (default: 'features') and labelCol (default: 'label')
@@ -75,11 +72,11 @@ class BaseSystemMLEstimator(Estimator):
             return self
         else:
             raise Exception('Incorrect usage: Expected dataframe as input with features/label as columns')
-    
+
     def fit(self, X, y=None, params=None):
         """
         Invokes the fit method on Estimator object on JVM if X and y are on of the supported data types
-        
+
         Parameters
         ----------
         X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix
@@ -109,15 +106,15 @@ class BaseSystemMLEstimator(Estimator):
             return self
         else:
             raise Exception('Unsupported input type')
-    
+
     def transform(self, X):
         return self.predict(X)
-    
-    # Returns either a DataFrame or MatrixBlock after calling transform(X:MatrixBlock, y:MatrixBlock) on Model object on JVM    
+
+    # Returns either a DataFrame or MatrixBlock after calling transform(X:MatrixBlock, y:MatrixBlock) on Model object on JVM
     def predict(self, X):
         """
         Invokes the transform method on Estimator object on JVM if X and y are on of the supported data types
-        
+
         Parameters
         ----------
         X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix or PySpark DataFrame
@@ -152,26 +149,28 @@ class BaseSystemMLEstimator(Estimator):
             return retDF.sort('ID')
         else:
             raise Exception('Unsupported input type')
-            
+
+
 class BaseSystemMLClassifier(BaseSystemMLEstimator):
 
     def score(self, X, y):
         """
         Scores the predicted value with ground truth 'y'
-        
+
         Parameters
         ----------
         X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix
         y: NumPy ndarray, Pandas DataFrame, scipy sparse matrix
         """
-        return sk.metrics.accuracy_score(y, self.predict(X))    
+        return sk.metrics.accuracy_score(y, self.predict(X))
+
 
 class BaseSystemMLRegressor(BaseSystemMLEstimator):
 
     def score(self, X, y):
         """
         Scores the predicted value with ground truth 'y'
-        
+
         Parameters
         ----------
         X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix
@@ -184,7 +183,7 @@ class LogisticRegression(BaseSystemMLClassifier):
     def __init__(self, sqlCtx, penalty='l2', fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False):
         """
         Performs both binomial and multinomial logistic regression.
-        
+
         Parameters
         ----------
         sqlCtx: PySpark SQLContext
@@ -215,12 +214,13 @@ class LogisticRegression(BaseSystemMLClassifier):
         if solver != 'newton-cg':
             raise Exception('Only newton-cg solver supported')
 
+
 class LinearRegression(BaseSystemMLRegressor):
 
     def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False):
         """
         Performs linear regression to model the relationship between one numerical response variable and one or more explanatory (feature) variables..
-        
+
         Parameters
         ----------
         sqlCtx: PySpark SQLContext
@@ -228,7 +228,7 @@ class LinearRegression(BaseSystemMLRegressor):
         max_iter: Maximum number of conjugate gradient iterations, or 0 if no maximum limit provided (default: 100)
         tol: Tolerance used in the convergence criterion (default: 0.000001)
         C: 1/regularization parameter (default: 1.0)
-        solver: Supports either 'newton-cg' or 'direct-solve' (default: 'newton-cg').  
+        solver: Supports either 'newton-cg' or 'direct-solve' (default: 'newton-cg').
         Depending on the size and the sparsity of the feature matrix, one or the other solver may be more efficient.
         'direct-solve' solver is more efficient when the number of features is relatively small (m < 1000) and
         input matrix X is either tall or fairly dense; otherwise 'newton-cg' solver is more efficient.
@@ -256,7 +256,7 @@ class SVM(BaseSystemMLClassifier):
     def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=False, transferUsingDF=False):
         """
         Performs both binary-class and multiclass SVM (Support Vector Machines).
-        
+
         Parameters
         ----------
         sqlCtx: PySpark SQLContext
@@ -278,14 +278,15 @@ class SVM(BaseSystemMLClassifier):
         self.estimator.setTol(tol)
         self.estimator.setIcpt(int(fit_intercept))
         self.transferUsingDF = transferUsingDF
-        self.setOutputRawPredictionsToFalse = False    
+        self.setOutputRawPredictionsToFalse = False
+
 
 class NaiveBayes(BaseSystemMLClassifier):
 
     def __init__(self, sqlCtx, laplace=1.0, transferUsingDF=False):
         """
         Performs both binary-class and multiclass SVM (Support Vector Machines).
-        
+
         Parameters
         ----------
         sqlCtx: PySpark SQLContext
@@ -298,5 +299,3 @@ class NaiveBayes(BaseSystemMLClassifier):
         self.estimator.setLaplace(laplace)
         self.transferUsingDF = transferUsingDF
         self.setOutputRawPredictionsToFalse = False
-
-__all__ = ['LogisticRegression', 'LinearRegression', 'SVM', 'NaiveBayes']