You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2016/02/04 19:32:20 UTC

incubator-madlib git commit: Kmeans: Skip NULL feature values

Repository: incubator-madlib
Updated Branches:
  refs/heads/master 0a48f3ad8 -> 79c50f896


Kmeans: Skip NULL feature values

JIRA: MADLIB-946

Closest column used to throw an exception if the matrix or the vector argument
had any null values. Changed to return Null() in this case. Inserted an
additional check for null values in the compute_kmeans function to accommodate
the change.


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/79c50f89
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/79c50f89
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/79c50f89

Branch: refs/heads/master
Commit: 79c50f896161a2a043be056f8e99822828776977
Parents: 0a48f3a
Author: Orhan Kislal <ok...@pivotal.io>
Authored: Fri Jan 22 15:28:45 2016 -0800
Committer: Orhan Kislal <ok...@pivotal.io>
Committed: Fri Jan 22 15:28:45 2016 -0800

----------------------------------------------------------------------
 src/modules/linalg/metric.cpp                   | 33 +++++++++++---------
 src/ports/postgres/modules/kmeans/kmeans.py_in  |  1 +
 .../postgres/modules/kmeans/test/kmeans.sql_in  | 23 ++++++++++++++
 3 files changed, 42 insertions(+), 15 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/79c50f89/src/modules/linalg/metric.cpp
----------------------------------------------------------------------
diff --git a/src/modules/linalg/metric.cpp b/src/modules/linalg/metric.cpp
index 8addbc0..bfac611 100644
--- a/src/modules/linalg/metric.cpp
+++ b/src/modules/linalg/metric.cpp
@@ -338,21 +338,24 @@ closestColumnsAndDistancesShortcut(
  */
 AnyType
 closest_column::run(AnyType& args) {
-    MappedMatrix M = args[0].getAs<MappedMatrix>();
-    MappedColumnVector x = args[1].getAs<MappedColumnVector>();
-    FunctionHandle dist = args[2].getAs<FunctionHandle>()
-        .unsetFunctionCallOptions(FunctionHandle::GarbageCollectionAfterCall);
-    string dist_fname = args[3].getAs<char *>();
-
-    std::string fname = dist_fn_name(dist_fname);
-
-    std::tuple<Index, double> result;
-    closestColumnsAndDistancesShortcut(M, x, dist, fname, &result, &result + 1);
-
-    AnyType tuple;
-    return tuple
-        << static_cast<int32_t>(std::get<0>(result))
-        << std::get<1>(result);
+    //if (true) throw std::runtime_error("Begin cc run\n");
+    try{
+        MappedMatrix M = args[0].getAs<MappedMatrix>();
+        MappedColumnVector x = args[1].getAs<MappedColumnVector>();
+        FunctionHandle dist = args[2].getAs<FunctionHandle>()
+            .unsetFunctionCallOptions(FunctionHandle::GarbageCollectionAfterCall);
+        string dist_fname = args[3].getAs<char *>();
+        std::string fname = dist_fn_name(dist_fname);
+        std::tuple<Index, double> result;
+        closestColumnsAndDistancesShortcut(M, x, dist, fname, &result, &result + 1);
+
+        AnyType tuple;
+        return tuple
+            << static_cast<int32_t>(std::get<0>(result))
+            << std::get<1>(result);
+    }catch (const ArrayWithNullException &e) {
+        return Null();
+    }
 }
 
 AnyType

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/79c50f89/src/ports/postgres/modules/kmeans/kmeans.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/kmeans/kmeans.py_in b/src/ports/postgres/modules/kmeans/kmeans.py_in
index de318dc..3e272a8 100644
--- a/src/ports/postgres/modules/kmeans/kmeans.py_in
+++ b/src/ports/postgres/modules/kmeans/kmeans.py_in
@@ -319,6 +319,7 @@ def compute_kmeans(schema_madlib, rel_args, rel_state, rel_source,
                             %(prev_id)s AS _old_centroid_id
                         FROM {rel_source} AS _src
                         WHERE abs(coalesce({schema_madlib}.svec_elsum({expr_point}), 'Infinity'::FLOAT8)) < 'Infinity'::FLOAT8
+                        AND NOT {schema_madlib}.array_contains_null(_src.{expr_point}::FLOAT8[])
                     ) AS _points_with_assignments
                     GROUP BY (_new_centroid).column_id
                 ) AS _new_centroids

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/79c50f89/src/ports/postgres/modules/kmeans/test/kmeans.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/kmeans/test/kmeans.sql_in b/src/ports/postgres/modules/kmeans/test/kmeans.sql_in
index 92699d8..b95693b 100644
--- a/src/ports/postgres/modules/kmeans/test/kmeans.sql_in
+++ b/src/ports/postgres/modules/kmeans/test/kmeans.sql_in
@@ -64,3 +64,26 @@ SELECT * FROM kmeans('kmeans_2d', 'position', ARRAY[
 
 SELECT * FROM kmeans('kmeans_2d', 'position', 'centroids', 'position', 'MADLIB_SCHEMA.dist_norm1');
 SELECT * FROM kmeans('kmeans_2d', 'position', 'centroids', 'position', 'MADLIB_SCHEMA.dist_norm2');
+
+DROP TABLE IF EXISTS km_sample;
+
+CREATE TABLE km_sample(pid int, points double precision[]);
+
+COPY km_sample (pid, points) FROM stdin DELIMITER '|';
+1 | {14.23, 1.71, 2.43, 15.6, 127, 2.8, 3.0600, 0.2800, 2.29, 5.64, 1.04, 3.92, 1065}
+2 | {13.2, 1.78, 2.14, 11.2, 1, 2.65, 2.76, 0.26, 1.28, 4.38, 1.05, 3.49, 1050}
+3 | {13.16, 2.36,  2.67, 18.6, 101, 2.8,  3.24, 0.3, 2.81, 5.6799, 1.03, 3.17, 1185}
+4 | {14.37, 1.95, 2.5, 16.8, 113, 3.85, 3.49, 0.24, 2.18, 7.8, 0.86, 3.45, 1480}
+5 | {13.24, 2.59, 2.87, 21, 118, 2.8, 2.69, 0.39, 1.82, 4.32, 1.04, 2.93, 735}
+6 | {14.2, 1.76, 2.45, 15.2, 112, 3.27, 3.39, 0.34, 1.97, 6.75, 1.05, 2.85, 1450}
+7 | {14.39, 1.87, 2.45, 14.6, 96, 2.5, 2.52, 0.3, 1.98, 5.25, 1.02, 3.58, 1290}
+8 | {14.06, 2.15, 2.61, 17.6, 121, 2.6, 2.51, 0.31, 1.25, 5.05, 1.06, 3.58, 1295}
+9 | {14.83, 1.64, 2.17, 14, 97, 2.8, 2.98, 0.29, 1.98, 5.2, 1.08, 2.85, 1045}
+10 | {13.86, 1.35, 2.27, 16, 98, 2.98, 3.15, 0.22, 1.8500, 7.2199, 1.01, NULL, 1045}
+\.
+
+DROP TABLE IF EXISTS centroids;
+
+SELECT * FROM kmeanspp('km_sample', 'points', 2,
+                       'MADLIB_SCHEMA.squared_dist_norm2',
+                       'MADLIB_SCHEMA.avg', 20, 0.001);