You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@liminal.apache.org by li...@apache.org on 2021/08/17 12:19:23 UTC
[incubator-liminal] branch master updated: Spark example improvements (#67)
This is an automated email from the ASF dual-hosted git repository.
lior pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-liminal.git
The following commit(s) were added to refs/heads/master by this push:
new cb005b5 Spark example improvements (#67)
cb005b5 is described below
commit cb005b5f7a63b8561911a01e27cd85e73196d237
Author: assapin <47...@users.noreply.github.com>
AuthorDate: Tue Aug 17 15:19:20 2021 +0300
Spark example improvements (#67)
* add more ml-related flow to spark example
* exclude disclaimer file from license tests
---
examples/spark-app-demo/k8s/data/iris.csv | 305 ++++++++++++++-------------
examples/spark-app-demo/k8s/data_cleanup.py | 57 +++--
examples/spark-app-demo/k8s/liminal.yml | 10 +-
examples/spark-app-demo/k8s/requirements.txt | 4 +-
examples/spark-app-demo/k8s/training.py | 68 +++---
tests/test_licenses.py | 4 +-
6 files changed, 245 insertions(+), 203 deletions(-)
diff --git a/examples/spark-app-demo/k8s/data/iris.csv b/examples/spark-app-demo/k8s/data/iris.csv
index 339109c..c413d1e 100644
--- a/examples/spark-app-demo/k8s/data/iris.csv
+++ b/examples/spark-app-demo/k8s/data/iris.csv
@@ -15,154 +15,157 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
-ignore: 150,4,setosa,versicolor,virginica,garbage
-5.1,3.5,1.4,0.2,0,21231
-4.9,3.0,1.4,0.2,0,sda
-4.7,3.2,1.3,0.2,0,2321
-4.6,3.1,1.5,0.2,0,
-5.0,3.6,1.4,0.2,0,
-5.4,3.9,1.7,0.4,0,
-4.6,3.4,1.4,0.3,0,
-5.0,3.4,1.5,0.2,0,
-4.4,2.9,1.4,0.2,0,
-4.9,3.1,1.5,0.1,0,
-5.4,3.7,1.5,0.2,0,
-4.8,3.4,1.6,0.2,0,
-4.8,3.0,1.4,0.1,0,
-4.3,3.0,1.1,0.1,0,
-5.8,4.0,1.2,0.2,0,
-5.7,4.4,1.5,0.4,0,
-5.4,3.9,1.3,0.4,0,
-5.1,3.5,1.4,0.3,0,
-5.7,3.8,1.7,0.3,0,
-5.1,3.8,1.5,0.3,0,
-5.4,3.4,1.7,0.2,0,
-5.1,3.7,1.5,0.4,0,
-4.6,3.6,1.0,0.2,0,
-5.1,3.3,1.7,0.5,0,
-4.8,3.4,1.9,0.2,0,
-5.0,3.0,1.6,0.2,0,
-5.0,3.4,1.6,0.4,0,
-5.2,3.5,1.5,0.2,0,
-5.2,3.4,1.4,0.2,0,
-4.7,3.2,1.6,0.2,0,
-4.8,3.1,1.6,0.2,0,
-5.4,3.4,1.5,0.4,0,
-5.2,4.1,1.5,0.1,0,
-5.5,4.2,1.4,0.2,0,
-4.9,3.1,1.5,0.2,0,
-5.0,3.2,1.2,0.2,0,
-5.5,3.5,1.3,0.2,0,
-4.9,3.6,1.4,0.1,0,
-4.4,3.0,1.3,0.2,0,
-5.1,3.4,1.5,0.2,0,
-5.0,3.5,1.3,0.3,0,
-4.5,2.3,1.3,0.3,0,
-4.4,3.2,1.3,0.2,0,
-5.0,3.5,1.6,0.6,0,
-5.1,3.8,1.9,0.4,0,
-4.8,3.0,1.4,0.3,0,
-5.1,3.8,1.6,0.2,0,
-4.6,3.2,1.4,0.2,0,
-5.3,3.7,1.5,0.2,0,
-5.0,3.3,1.4,0.2,0,
-7.0,3.2,4.7,1.4,1,
-6.4,3.2,4.5,1.5,1,
-6.9,3.1,4.9,1.5,1,
-5.5,2.3,4.0,1.3,1,
-6.5,2.8,4.6,1.5,1,
-5.7,2.8,4.5,1.3,1,
-6.3,3.3,4.7,1.6,1,
-4.9,2.4,3.3,1.0,1,
-6.6,2.9,4.6,1.3,1,
-5.2,2.7,3.9,1.4,1,
-5.0,2.0,3.5,1.0,1,
-5.9,3.0,4.2,1.5,1,
-6.0,2.2,4.0,1.0,1,
-6.1,2.9,4.7,1.4,1,
-5.6,2.9,3.6,1.3,1,
-6.7,3.1,4.4,1.4,1,
-5.6,3.0,4.5,1.5,1,
-5.8,2.7,4.1,1.0,1,
-6.2,2.2,4.5,1.5,1,
-5.6,2.5,3.9,1.1,1,
-5.9,3.2,4.8,1.8,1,
-6.1,2.8,4.0,1.3,1,
-6.3,2.5,4.9,1.5,1,
-6.1,2.8,4.7,1.2,1,
-6.4,2.9,4.3,1.3,1,
-6.6,3.0,4.4,1.4,1,
-6.8,2.8,4.8,1.4,1,
-6.7,3.0,5.0,1.7,1,
-6.0,2.9,4.5,1.5,1,
-5.7,2.6,3.5,1.0,1,
-5.5,2.4,3.8,1.1,1,
-5.5,2.4,3.7,1.0,1,
-5.8,2.7,3.9,1.2,1,
-6.0,2.7,5.1,1.6,1,
-5.4,3.0,4.5,1.5,1,
-6.0,3.4,4.5,1.6,1,
-6.7,3.1,4.7,1.5,1,
-6.3,2.3,4.4,1.3,1,
-5.6,3.0,4.1,1.3,1,
-5.5,2.5,4.0,1.3,1,
-5.5,2.6,4.4,1.2,1,
-6.1,3.0,4.6,1.4,1,
-5.8,2.6,4.0,1.2,1,
-5.0,2.3,3.3,1.0,1,
-5.6,2.7,4.2,1.3,1,
-5.7,3.0,4.2,1.2,1,
-5.7,2.9,4.2,1.3,1,
-6.2,2.9,4.3,1.3,1,
-5.1,2.5,3.0,1.1,1,
-5.7,2.8,4.1,1.3,1,
-6.3,3.3,6.0,2.5,2,
-5.8,2.7,5.1,1.9,2,
-7.1,3.0,5.9,2.1,2,
-6.3,2.9,5.6,1.8,2,
-6.5,3.0,5.8,2.2,2,
-7.6,3.0,6.6,2.1,2,
-4.9,2.5,4.5,1.7,2,
-7.3,2.9,6.3,1.8,2,
-6.7,2.5,5.8,1.8,2,
-7.2,3.6,6.1,2.5,2,
-6.5,3.2,5.1,2.0,2,
-6.4,2.7,5.3,1.9,2,
-6.8,3.0,5.5,2.1,2,
-5.7,2.5,5.0,2.0,2,
-5.8,2.8,5.1,2.4,2,
-6.4,3.2,5.3,2.3,2,
-6.5,3.0,5.5,1.8,2,
-7.7,3.8,6.7,2.2,2,
-7.7,2.6,6.9,2.3,2,
-6.0,2.2,5.0,1.5,2,
-6.9,3.2,5.7,2.3,2,
-5.6,2.8,4.9,2.0,2,
-7.7,2.8,6.7,2.0,2,
-6.3,2.7,4.9,1.8,2,
-6.7,3.3,5.7,2.1,2,
-7.2,3.2,6.0,1.8,2,
-6.2,2.8,4.8,1.8,2,
-6.1,3.0,4.9,1.8,2,
-6.4,2.8,5.6,2.1,2,
-7.2,3.0,5.8,1.6,2,
-7.4,2.8,6.1,1.9,2,
-7.9,3.8,6.4,2.0,2,
-6.4,2.8,5.6,2.2,2,
-6.3,2.8,5.1,1.5,2,
-6.1,2.6,5.6,1.4,2,
-7.7,3.0,6.1,2.3,2,
-6.3,3.4,5.6,2.4,2,
-6.4,3.1,5.5,1.8,2,
-6.0,3.0,4.8,1.8,2,
-6.9,3.1,5.4,2.1,2,
-6.7,3.1,5.6,2.4,2,
-6.9,3.1,5.1,2.3,2,
-5.8,2.7,5.1,1.9,2,
-6.8,3.2,5.9,2.3,2,
-6.7,3.3,5.7,2.5,2,
-6.7,3.0,5.2,2.3,2,
-6.3,2.5,5.0,1.9,2,
-6.5,3.0,5.2,2.0,2,
-6.2,3.4,5.4,2.3,2,
-5.9,3.0,5.1,1.8,2,
\ No newline at end of file
+#
+# source: https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
+sepallength,sepalwidth,petallength,petalwidth,class
+5.1,3.5,1.4,0.2,Iris-setosa
+4.9,3.0,1.4,0.2,Iris-setosa
+4.7,3.2,1.3,0.2,Iris-setosa
+4.6,3.1,1.5,0.2,Iris-setosa
+5.0,3.6,1.4,0.2,Iris-setosa
+5.4,3.9,1.7,0.4,Iris-setosa
+4.6,3.4,1.4,0.3,Iris-setosa
+5.0,3.4,1.5,0.2,Iris-setosa
+4.4,2.9,1.4,0.2,Iris-setosa
+4.9,3.1,1.5,0.1,Iris-setosa
+5.4,3.7,1.5,0.2,Iris-setosa
+4.8,3.4,1.6,0.2,Iris-setosa
+4.8,3.0,1.4,0.1,Iris-setosa
+4.3,3.0,1.1,0.1,Iris-setosa
+5.8,4.0,1.2,0.2,Iris-setosa
+5.7,4.4,1.5,0.4,Iris-setosa
+5.4,3.9,1.3,0.4,Iris-setosa
+5.1,3.5,1.4,0.3,Iris-setosa
+5.7,3.8,1.7,0.3,Iris-setosa
+5.1,3.8,1.5,0.3,Iris-setosa
+5.4,3.4,1.7,0.2,Iris-setosa
+5.1,3.7,1.5,0.4,Iris-setosa
+4.6,3.6,1.0,0.2,Iris-setosa
+5.1,3.3,1.7,0.5,Iris-setosa
+4.8,3.4,1.9,0.2,Iris-setosa
+5.0,3.0,1.6,0.2,Iris-setosa
+5.0,3.4,1.6,0.4,Iris-setosa
+5.2,3.5,1.5,0.2,Iris-setosa
+5.2,3.4,1.4,0.2,Iris-setosa
+4.7,3.2,1.6,0.2,Iris-setosa
+4.8,3.1,1.6,0.2,Iris-setosa
+5.4,3.4,1.5,0.4,Iris-setosa
+5.2,4.1,1.5,0.1,Iris-setosa
+5.5,4.2,1.4,0.2,Iris-setosa
+4.9,3.1,1.5,0.1,Iris-setosa
+5.0,3.2,1.2,0.2,Iris-setosa
+5.5,3.5,1.3,0.2,Iris-setosa
+4.9,3.1,1.5,0.1,Iris-setosa
+4.4,3.0,1.3,0.2,Iris-setosa
+5.1,3.4,1.5,0.2,Iris-setosa
+5.0,3.5,1.3,0.3,Iris-setosa
+4.5,2.3,1.3,0.3,Iris-setosa
+4.4,3.2,1.3,0.2,Iris-setosa
+5.0,3.5,1.6,0.6,Iris-setosa
+5.1,3.8,1.9,0.4,Iris-setosa
+4.8,3.0,1.4,0.3,Iris-setosa
+5.1,3.8,1.6,0.2,Iris-setosa
+4.6,3.2,1.4,0.2,Iris-setosa
+5.3,3.7,1.5,0.2,Iris-setosa
+5.0,3.3,1.4,0.2,Iris-setosa
+7.0,3.2,4.7,1.4,Iris-versicolor
+6.4,3.2,4.5,1.5,Iris-versicolor
+6.9,3.1,4.9,1.5,Iris-versicolor
+5.5,2.3,4.0,1.3,Iris-versicolor
+6.5,2.8,4.6,1.5,Iris-versicolor
+5.7,2.8,4.5,1.3,Iris-versicolor
+6.3,3.3,4.7,1.6,Iris-versicolor
+4.9,2.4,3.3,1.0,Iris-versicolor
+6.6,2.9,4.6,1.3,Iris-versicolor
+5.2,2.7,3.9,1.4,Iris-versicolor
+5.0,2.0,3.5,1.0,Iris-versicolor
+5.9,3.0,4.2,1.5,Iris-versicolor
+6.0,2.2,4.0,1.0,Iris-versicolor
+6.1,2.9,4.7,1.4,Iris-versicolor
+5.6,2.9,3.6,1.3,Iris-versicolor
+6.7,3.1,4.4,1.4,Iris-versicolor
+5.6,3.0,4.5,1.5,Iris-versicolor
+5.8,2.7,4.1,1.0,Iris-versicolor
+6.2,2.2,4.5,1.5,Iris-versicolor
+5.6,2.5,3.9,1.1,Iris-versicolor
+5.9,3.2,4.8,1.8,Iris-versicolor
+6.1,2.8,4.0,1.3,Iris-versicolor
+6.3,2.5,4.9,1.5,Iris-versicolor
+6.1,2.8,4.7,1.2,Iris-versicolor
+6.4,2.9,4.3,1.3,Iris-versicolor
+6.6,3.0,4.4,1.4,Iris-versicolor
+6.8,2.8,4.8,1.4,Iris-versicolor
+6.7,3.0,5.0,1.7,Iris-versicolor
+6.0,2.9,4.5,1.5,Iris-versicolor
+5.7,2.6,3.5,1.0,Iris-versicolor
+5.5,2.4,3.8,1.1,Iris-versicolor
+5.5,2.4,3.7,1.0,Iris-versicolor
+5.8,2.7,3.9,1.2,Iris-versicolor
+6.0,2.7,5.1,1.6,Iris-versicolor
+5.4,3.0,4.5,1.5,Iris-versicolor
+6.0,3.4,4.5,1.6,Iris-versicolor
+6.7,3.1,4.7,1.5,Iris-versicolor
+6.3,2.3,4.4,1.3,Iris-versicolor
+5.6,3.0,4.1,1.3,Iris-versicolor
+5.5,2.5,4.0,1.3,Iris-versicolor
+5.5,2.6,4.4,1.2,Iris-versicolor
+6.1,3.0,4.6,1.4,Iris-versicolor
+5.8,2.6,4.0,1.2,Iris-versicolor
+5.0,2.3,3.3,1.0,Iris-versicolor
+5.6,2.7,4.2,1.3,Iris-versicolor
+5.7,3.0,4.2,1.2,Iris-versicolor
+5.7,2.9,4.2,1.3,Iris-versicolor
+6.2,2.9,4.3,1.3,Iris-versicolor
+5.1,2.5,3.0,1.1,Iris-versicolor
+5.7,2.8,4.1,1.3,Iris-versicolor
+6.3,3.3,6.0,2.5,Iris-virginica
+5.8,2.7,5.1,1.9,Iris-virginica
+7.1,3.0,5.9,2.1,Iris-virginica
+6.3,2.9,5.6,1.8,Iris-virginica
+6.5,3.0,5.8,2.2,Iris-virginica
+7.6,3.0,6.6,2.1,Iris-virginica
+4.9,2.5,4.5,1.7,Iris-virginica
+7.3,2.9,6.3,1.8,Iris-virginica
+6.7,2.5,5.8,1.8,Iris-virginica
+7.2,3.6,6.1,2.5,Iris-virginica
+6.5,3.2,5.1,2.0,Iris-virginica
+6.4,2.7,5.3,1.9,Iris-virginica
+6.8,3.0,5.5,2.1,Iris-virginica
+5.7,2.5,5.0,2.0,Iris-virginica
+5.8,2.8,5.1,2.4,Iris-virginica
+6.4,3.2,5.3,2.3,Iris-virginica
+6.5,3.0,5.5,1.8,Iris-virginica
+7.7,3.8,6.7,2.2,Iris-virginica
+7.7,2.6,6.9,2.3,Iris-virginica
+6.0,2.2,5.0,1.5,Iris-virginica
+6.9,3.2,5.7,2.3,Iris-virginica
+5.6,2.8,4.9,2.0,Iris-virginica
+7.7,2.8,6.7,2.0,Iris-virginica
+6.3,2.7,4.9,1.8,Iris-virginica
+6.7,3.3,5.7,2.1,Iris-virginica
+7.2,3.2,6.0,1.8,Iris-virginica
+6.2,2.8,4.8,1.8,Iris-virginica
+6.1,3.0,4.9,1.8,Iris-virginica
+6.4,2.8,5.6,2.1,Iris-virginica
+7.2,3.0,5.8,1.6,Iris-virginica
+7.4,2.8,6.1,1.9,Iris-virginica
+7.9,3.8,6.4,2.0,Iris-virginica
+6.4,2.8,5.6,2.2,Iris-virginica
+6.3,2.8,5.1,1.5,Iris-virginica
+6.1,2.6,5.6,1.4,Iris-virginica
+7.7,3.0,6.1,2.3,Iris-virginica
+6.3,3.4,5.6,2.4,Iris-virginica
+6.4,3.1,5.5,1.8,Iris-virginica
+6.0,3.0,4.8,1.8,Iris-virginica
+6.9,3.1,5.4,2.1,Iris-virginica
+6.7,3.1,5.6,2.4,Iris-virginica
+6.9,3.1,5.1,2.3,Iris-virginica
+5.8,2.7,5.1,1.9,Iris-virginica
+6.8,3.2,5.9,2.3,Iris-virginica
+6.7,3.3,5.7,2.5,Iris-virginica
+6.7,3.0,5.2,2.3,Iris-virginica
+6.3,2.5,5.0,1.9,Iris-virginica
+6.5,3.0,5.2,2.0,Iris-virginica
+6.2,3.4,5.4,2.3,Iris-virginica
+5.9,3.0,5.1,1.8,Iris-virginica
+
diff --git a/examples/spark-app-demo/k8s/data_cleanup.py b/examples/spark-app-demo/k8s/data_cleanup.py
index a9446ae..0fd5617 100644
--- a/examples/spark-app-demo/k8s/data_cleanup.py
+++ b/examples/spark-app-demo/k8s/data_cleanup.py
@@ -16,25 +16,56 @@
# specific language governing permissions and limitations
# under the License.
-import sys
+import argparse
+import pyspark.sql.functions as F
+from pyspark.ml import Pipeline
+from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
+from pyspark.ml.functions import vector_to_array
from pyspark.sql import SparkSession
-if __name__ == "__main__":
- if len(sys.argv) != 3:
- print("Usage: source <file> destination <dest>", file=sys.stderr)
- sys.exit(-1)
+def transform(data):
+ columns_to_scale = data.columns[:-1]
+ vectorizer = VectorAssembler(inputCols=columns_to_scale, outputCol="features")
+ scaler = StandardScaler(inputCol="features", outputCol="scaled_features",
+ withStd=True, withMean=True)
+ labeler = StringIndexer(inputCol=data.columns[-1], outputCol='label')
+ pipeline = Pipeline(stages=[vectorizer, scaler, labeler])
+ fitted = pipeline.fit(data)
+ transformed = fitted.transform(data)
+
+ result = (
+ transformed.withColumn("feature_arr", vector_to_array("scaled_features"))
+ .select([F.col("feature_arr")[i].alias(columns_to_scale[i]) for i in range(len(columns_to_scale))] + ['label'])
+ )
+
+ return result
+
+
+def extract(spark, input_uri):
+ return spark.read.csv(input_uri, header=True, inferSchema=True, comment="#")
+
+
+def load(data, output_uri):
+ data.coalesce(1).write.mode("overwrite").csv(output_uri, header=True)
+
+
+def data_pipeline(input_uri, output_uri):
spark = SparkSession \
.builder \
- .appName("CleanData") \
+ .appName("Prepare Iris Data") \
.getOrCreate()
- spark.read.text(sys.argv[1]).rdd.filter(lambda x: not x[0].startswith('#')) \
- .filter(lambda r: not r[0].startswith('ignore')) \
- .map(lambda r: r[0]).map(
- lambda r: (
- r.split(',')[0], r.split(',')[1], r.split(',')[2], r.split(',')[3], r.split(',')[4])) \
- .toDF().coalesce(1).write.mode("overwrite").option("header", "false").csv(sys.argv[2])
-
+ input = extract(spark, input_uri)
+ data = transform(input)
+ load(data, output_uri)
spark.stop()
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--input_uri")
+ parser.add_argument("--output_uri")
+ args = parser.parse_args()
+ data_pipeline(args.input_uri, args.output_uri)
diff --git a/examples/spark-app-demo/k8s/liminal.yml b/examples/spark-app-demo/k8s/liminal.yml
index 93e8679..301e321 100644
--- a/examples/spark-app-demo/k8s/liminal.yml
+++ b/examples/spark-app-demo/k8s/liminal.yml
@@ -20,7 +20,7 @@ name: MyFirstLiminalSparkApp
super: InfraSpark
owner: Bosco Albert Baracus
variables:
- output_path: '{{output_root_dir}}/my_first_liminal_spark_app_outputs/'
+ training_data_path: '{{output_root_dir}}/iris/'
application: data_cleanup.py
images:
- image: myorg/mydatascienceapp
@@ -52,20 +52,22 @@ pipelines:
type: spark
description: prepare the data for training
application_arguments:
+ - '--input_uri'
- '{{input_root_dir}}data/iris.csv'
- - '{{output_path}}'
+ - '--output_uri'
+ - '{{training_data_path}}'
- task: train
type: python
description: train model
image: myorg/mydatascienceapp
- cmd: python -u training.py train '{{output_path}}'
+ cmd: python -u training.py --action train --input_uri '{{training_data_path}}'
env:
MOUNT_PATH: /mnt/gettingstartedvol
- task: validate
type: python
description: validate model and deploy
image: myorg/mydatascienceapp
- cmd: python -u training.py validate
+ cmd: python -u training.py --action validate --input_uri '{{training_data_path}}'
env:
MOUNT_PATH: /mnt/gettingstartedvol
volumes:
diff --git a/examples/spark-app-demo/k8s/requirements.txt b/examples/spark-app-demo/k8s/requirements.txt
index b6b3b87..db52f8d 100644
--- a/examples/spark-app-demo/k8s/requirements.txt
+++ b/examples/spark-app-demo/k8s/requirements.txt
@@ -16,6 +16,6 @@
# specific language governing permissions and limitations
# under the License.
-scikit-learn==0.23.2
+scikit-learn==0.24.1
apache-liminal==0.0.2
-pyspark==3.0.0
+pyspark==3.1.2
diff --git a/examples/spark-app-demo/k8s/training.py b/examples/spark-app-demo/k8s/training.py
index caf05bf..57c4e6c 100644
--- a/examples/spark-app-demo/k8s/training.py
+++ b/examples/spark-app-demo/k8s/training.py
@@ -19,10 +19,13 @@ import sys
import time
import model_store
+import pandas as pd
import numpy as np
from model_store import ModelStore
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+from sklearn import model_selection
import os
_CANDIDATE_MODEL_STORE = ModelStore(model_store.CANDIDATE)
@@ -30,26 +33,14 @@ _PRODUCTION_MODEL_STORE = ModelStore(model_store.PRODUCTION)
import numpy as np
import csv
-from sklearn.datasets.base import Bunch
-
+import argparse
def load_iris_from_csv_file(f):
- with open(f) as csv_file:
- data_file = csv.reader(csv_file)
- temp = next(data_file)
- n_samples = 150 # number of data rows, don't count header
- n_features = 4 # number of columns for features, don't count target column
- feature_names = ['setosa', 'versicolor', 'virginica']
- target_names = ['f4'] # adjust accordingly
- data = np.empty((n_samples, n_features))
- target = np.empty((n_samples,), dtype=np.int)
-
- for i, sample in enumerate(data_file):
- data[i] = np.asarray(sample[:-1], dtype=np.float64)
- target[i] = np.asarray(sample[-1], dtype=np.int)
-
- return Bunch(data=data, target=target, feature_names=feature_names, target_names=target_names)
-
+ df = pd.read_csv(f, header=0).reset_index(drop=True)
+ types = {col:np.float64 for col in df.columns[:-1]}
+ types['label'] = np.int32
+ df = df.astype(types)
+ return df
def get_dataset(d):
print("searching for csv files in {}".format(d))
@@ -60,40 +51,53 @@ def get_dataset(d):
return os.path.join(d, file)
return None
-
-def train_model(f):
- csv_file = get_dataset(f)
+def load_and_split(input_uri):
+ csv_file = get_dataset(input_uri)
if csv_file:
print("found {} dataset".format(csv_file))
iris = load_iris_from_csv_file(csv_file)
+ return train_test_split(iris, test_size=0.2, random_state=8)
- X = iris["data"][:, 3:] # petal width
- y = (iris["target"] == 2).astype(np.int)
+def train_model(input_uri):
+ train, test = load_and_split(input_uri)
+ y = train.pop("label")
+ X = train.loc[:, train.columns]
- model = LogisticRegression()
+ model = LogisticRegression(max_iter=500)
model.fit(X, y)
+ scoring = 'accuracy'
+ results = model_selection.cross_val_score(model, X, y, cv=5, scoring=scoring)
+ print(f'Accuracy in cross validation: {results.mean()*100} % ({results.std()} std)')
version = round(time.time())
print(f'Saving model with version {version} to candidate model store.')
_CANDIDATE_MODEL_STORE.save_model(model, version)
-def validate_model():
+def validate_model(input_uri):
model, version = _CANDIDATE_MODEL_STORE.load_latest_model()
print(f'Validating model with version {version} to candidate model store.')
- if not isinstance(model.predict([[1]]), np.ndarray):
+ if not isinstance(model.predict([[1,1,1,1]]), np.ndarray):
raise ValueError('Invalid model')
+ train, test = load_and_split(input_uri)
+ y = test.pop("label")
+ X = test.loc[:, test.columns]
+ result = model.score(X, y)
+ print(f'model accuracy {result*100}')
+ if result < 0.85:
+ raise ValueError('model accuracy under threshold (0.85 ). Model is not promoted to production')
print(f'Deploying model with version {version} to production model store.')
_PRODUCTION_MODEL_STORE.save_model(model, version)
if __name__ == '__main__':
- cmd = sys.argv[1]
- if cmd == 'train':
- train_model(sys.argv[2])
- elif cmd == 'validate':
- validate_model()
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--action", choices=['train', 'validate'])
+ parser.add_argument("--input_uri")
+ args = parser.parse_args()
+ if args.action == 'train':
+ train_model(args.input_uri)
else:
- raise ValueError(f"Unknown command {cmd}")
+ validate_model(args.input_uri)
\ No newline at end of file
diff --git a/tests/test_licenses.py b/tests/test_licenses.py
index b48d731..e105b3c 100644
--- a/tests/test_licenses.py
+++ b/tests/test_licenses.py
@@ -24,6 +24,7 @@ from termcolor import colored
EXCLUDED_EXTENSIONS = ['.gif', '.png', '.pyc', 'LICENSE', 'DISCLAIMER', 'NOTICE', '.whl']
EXCLUDED_DIRS = ['docs/build', '.git', '.idea', 'venv', 'apache_liminal.egg-info']
+EXCLUDED_FILES = ['DISCLAIMER-WIP']
PYTHON_LICENSE_HEADER = """
#
@@ -141,7 +142,8 @@ class TestLicenses(TestCase):
if not any(os.path.relpath(r, base_dir).startswith(excluded) for excluded in
EXCLUDED_DIRS):
for file in f:
- if not any(os.path.basename(file).endswith(ext) for ext in EXCLUDED_EXTENSIONS):
+ if not any(os.path.basename(file).endswith(ext) for ext in EXCLUDED_EXTENSIONS) \
+ and not os.path.basename(file) in EXCLUDED_FILES:
files.append(os.path.join(r, file))
output = ''