You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2016/03/26 00:49:02 UTC

[3/3] incubator-madlib git commit: SVM: Better NULL handling + use temp for random matrices

SVM: Better NULL handling + use temp for random matrices

Closes #28, closes #30


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/62a99ce6
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/62a99ce6
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/62a99ce6

Branch: refs/heads/master
Commit: 62a99ce618280c13c0415dcacc72ad48205e1107
Parents: 360f134
Author: Xiaocheng Tang <xi...@gmail.com>
Authored: Fri Mar 11 09:49:42 2016 -0800
Committer: Rahul Iyer <ri...@pivotal.io>
Committed: Fri Mar 25 16:48:32 2016 -0700

----------------------------------------------------------------------
 .../postgres/modules/linalg/matrix_ops.py_in     |  9 +++++----
 .../modules/svm/kernel_approximation.py_in       | 19 ++++++++++++-------
 2 files changed, 17 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/62a99ce6/src/ports/postgres/modules/linalg/matrix_ops.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/linalg/matrix_ops.py_in b/src/ports/postgres/modules/linalg/matrix_ops.py_in
index d11dcdb..458a763 100644
--- a/src/ports/postgres/modules/linalg/matrix_ops.py_in
+++ b/src/ports/postgres/modules/linalg/matrix_ops.py_in
@@ -3125,8 +3125,8 @@ def matrix_random(schema_madlib, distribution, row_dim, col_dim,
     else:
         distribution = 'uniform'
 
-    in_args_default = {'seed': randint(0, 1000), 'table_type': ''}
-    in_args_types = {'seed': int, 'table_type': str}
+    in_args_default = {'seed': randint(0, 1000), 'temp_out': False}
+    in_args_types = {'seed': int, 'temp_out': bool}
     if distribution == 'normal':
         in_args_default.update({'mu': 0, 'sigma': 1})
         in_args_types.update({'mu': float, 'sigma': float})
@@ -3164,7 +3164,7 @@ def matrix_random(schema_madlib, distribution, row_dim, col_dim,
                    .format(distribution, ', '.join(sorted(supported_dist))))
 
     plpy.execute("""
-        CREATE {in_args_vals[table_type]} TABLE {matrix_out}
+        CREATE {is_temp} TABLE {matrix_out}
         m4_ifdef(`__POSTGRESQL__', `',
            `WITH (APPENDONLY=TRUE,COMPRESSTYPE=QUICKLZ)') AS
             SELECT
@@ -3174,4 +3174,5 @@ def matrix_random(schema_madlib, distribution, row_dim, col_dim,
                 generate_series(1, {row_dim}) as row
             m4_ifdef(`__POSTGRESQL__', `',
                `DISTRIBUTED BY ({out_args[row]})')
-        """.format(**locals()))
+        """.format(is_temp=True if in_args_vals['temp_out'] else False,
+                   **locals()))

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/62a99ce6/src/ports/postgres/modules/svm/kernel_approximation.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/svm/kernel_approximation.py_in b/src/ports/postgres/modules/svm/kernel_approximation.py_in
index 190692e..0a09fbf 100644
--- a/src/ports/postgres/modules/svm/kernel_approximation.py_in
+++ b/src/ports/postgres/modules/svm/kernel_approximation.py_in
@@ -194,7 +194,7 @@ class PolyKernel(object):
             drop table if exists {rd_weights};
             select {schema_madlib}.matrix_random(
                         1, {dim},
-                        'upper=1, lower=-1, seed={seed}, table_type=temp',
+                        'upper=1, lower=-1, seed={seed}, temp_out=true',
                         'bernoulli', '{rd_weights}',
                         'row={id}, val={val}')
         """.format(rd_weights=rd_weights_,
@@ -303,6 +303,7 @@ class PolyKernel(object):
                     {id_col},
                     {grouping_col}
                 from {source_table}
+                WHERE not {schema_madlib}.array_contains_null({independent_varname})
             ) q cross join (select {pro.rd_val} from {pro.weights}) as weights
                 cross join (select {pro.rd_val} from {pro.coefs}) as coefs
                 cross join (select {pro.rd_val} from {pro.reps}) as reps
@@ -348,26 +349,28 @@ class GaussianKernelBase(object):
     def _random_weights(self, row_dim, col_dim, rd_id, rd_val):
         rd_weights = unique_string(desp='random_weights')
         sigma = sqrt(2 * self.gamma)
+        seed = self.random_state
         plpy.execute("""
             drop table if exists {rd_weights};
             select {self.schema_madlib}.matrix_random(
                     {row_dim}, {col_dim},
-                    'mu=0, sigma={sigma}, seed={self.random_state}',
-                    'normal',
-                    '{rd_weights}','row={rd_id}, val={rd_val}');
+                    'mu=0, sigma={sigma}, seed={seed}, temp_out=true',
+                    'normal', '{rd_weights}',
+                    'row={rd_id}, val={rd_val}');
         """.format(**locals()))
         return rd_weights
 
     def _random_offsets(self, row_dim, col_dim, rd_id, rd_val):
         rd_offset = unique_string(desp='random_offsets')
         max_ = 2 * pi
+        seed = self.random_state
         plpy.execute("""
             drop table if exists {rd_offset};
             select {self.schema_madlib}.matrix_random(
                     {row_dim}, {col_dim},
-                    'min=0, max={max_}, seed={self.random_state}',
-                    'uniform',
-                    '{rd_offset}','row={rd_id}, val={rd_val}');
+                    'min=0, max={max_}, seed={seed}, temp_out=true',
+                    'uniform', '{rd_offset}',
+                    'row={rd_id}, val={rd_val}');
         """.format(**locals()))
         return rd_offset
 
@@ -538,6 +541,7 @@ class GaussianKernel(GaussianKernelBase):
                     {id_col},
                     {grouping_col}
                 from {source_table}
+                WHERE not {schema_madlib}.array_contains_null({independent_varname})
         """.format(**locals())
         plpy.execute(run_sql)
         source_table = source_with_id
@@ -688,6 +692,7 @@ class GaussianKernelInMemory(GaussianKernelBase):
                         {id_col},
                         {grouping_col}
                     from {source_table}
+                    WHERE not {schema_madlib}.array_contains_null({independent_varname})
                 ) q
                 cross join (select {self.rd_val} from {self.rd_weights}) as rw
                 cross join (select {self.rd_val} from {self.rd_offset}) as ro