You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2016/01/20 22:23:42 UTC

[1/2] incubator-madlib git commit: Minor: Update return value in correlation + matrix

Repository: incubator-madlib
Updated Branches:
  refs/heads/master eec20a4bc -> d282e0f36


Minor: Update return value in correlation + matrix

- In correlation, a special UDT was returned containing run time stats.
  This has been replaced by a TEXT that contains required information.
- In matrix, the decomposition operations did not provide enough
  information about how to use suffixes to query the result tables. This
  has now been added.


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/83391ace
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/83391ace
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/83391ace

Branch: refs/heads/master
Commit: 83391ace1d8d8524b774a76a1328ce4967127031
Parents: eec20a4
Author: Rahul Iyer <ri...@pivotal.io>
Authored: Mon Jan 18 11:39:20 2016 -0800
Committer: Rahul Iyer <ri...@pivotal.io>
Committed: Wed Jan 20 13:08:02 2016 -0800

----------------------------------------------------------------------
 .../postgres/modules/linalg/matrix_ops.sql_in   | 33 ++++++++++++++++----
 .../postgres/modules/stats/correlation.py_in    | 31 +++++++++---------
 .../postgres/modules/stats/correlation.sql_in   | 26 +++++----------
 3 files changed, 51 insertions(+), 39 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/83391ace/src/ports/postgres/modules/linalg/matrix_ops.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/linalg/matrix_ops.sql_in b/src/ports/postgres/modules/linalg/matrix_ops.sql_in
index 46a728d..bddecc1 100644
--- a/src/ports/postgres/modules/linalg/matrix_ops.sql_in
+++ b/src/ports/postgres/modules/linalg/matrix_ops.sql_in
@@ -2599,8 +2599,16 @@ RETURNS MADLIB_SCHEMA.matrix_result AS $$
     PythonFunctionBodyOnly(`linalg', `matrix_ops')
     matrix_ops.matrix_lu(schema_madlib, matrix_in, in_args,
                          matrix_out_prefix, out_args)
-    return ["Prefix = {0}, add _p, _l, _u, _q at the end to "
-            "get respective matrices".format(matrix_out_prefix)]
+    result = """
+    For LU decomposition with full pivoting (PAQ = LU),
+    the corresponding matrices can be obtained by appending following suffixes
+    to {0}:
+          _p for row permutation matrix P
+          _q for column permutation matrix Q
+          _l for lower triangular factor L
+          _u for upper triangular factor U
+    """.format(matrix_out_prefix)
+    return [result]
 $$ LANGUAGE plpythonu
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
@@ -2710,8 +2718,15 @@ RETURNS MADLIB_SCHEMA.matrix_result AS $$
     PythonFunctionBodyOnly(`linalg', `matrix_ops')
     matrix_ops.matrix_cholesky(schema_madlib, matrix_in, in_args,
                                matrix_out_prefix, out_args)
-    return ["Prefix = {0}, add _p, _l, _d at the end of prefix "
-            "get the P, L and D matrices respectively".format(matrix_out_prefix)]
+    result = """
+    For Cholesky decomposition with left pivoting (PA = LDL*),
+    the corresponding matrices can be obtained by appending following suffixes
+    to {0}:
+          _p for row permutation matrix P
+          _l for lower triangular factor L
+          _d for diagonal matrix D
+    """.format(matrix_out_prefix)
+    return [result]
 $$ LANGUAGE plpythonu
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
@@ -2766,8 +2781,14 @@ RETURNS MADLIB_SCHEMA.matrix_result AS $$
     PythonFunctionBodyOnly(`linalg', `matrix_ops')
     matrix_ops.matrix_qr(schema_madlib, matrix_in, in_args,
                          matrix_out_prefix, out_args)
-    return ["Prefix = {0}, add _q, _r at the end of prefix "
-            "get the Q and R matrices respectively".format(matrix_out_prefix)]
+    result = """
+    For QR decomposition (A = QR),
+    the corresponding matrices can be obtained by appending following suffixes
+    to {0}:
+          _q for orthogonal matrix Q
+          _r for upper triangular factor R
+    """.format(matrix_out_prefix)
+    return [result]
 $$ LANGUAGE plpythonu
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/83391ace/src/ports/postgres/modules/stats/correlation.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/stats/correlation.py_in b/src/ports/postgres/modules/stats/correlation.py_in
index 83604f4..fc9959b 100644
--- a/src/ports/postgres/modules/stats/correlation.py_in
+++ b/src/ports/postgres/modules/stats/correlation.py_in
@@ -67,22 +67,23 @@ def correlation(schema_madlib, source_table, output_table,
         if len(_existing_target_cols) == 1:
             plpy.error("Correlation error: Only one numeric column found in the target list.")
 
+    run_time = _populate_output_table(schema_madlib, source_table, output_table,
+                                      _existing_target_cols, get_cov, verbose)
     # ---- Output message ----
-    output_text_mesasge = "Summary for 'correlation' function"
+    output_text_list = ["Summary for 'correlation' function"]
+    output_text_list.append("Output table = " + str(output_table))
     if _nonnumeric_target_cols:
-        output_text_mesasge += ("\n Non-numeric columns ignored: {0}".
-                                format(str(_nonnumeric_target_cols)))
+        output_text_list.append(("Non-numeric columns ignored: {0}".
+                                 format(str(_nonnumeric_target_cols))))
     if _nonexisting_target_cols:
-        output_text_mesasge += ("\n Columns that don't exist in '{0}' ignored: {1}".
-                                format(source_table, str(_nonexisting_target_cols)))
+        output_text_list.append(("Columns that don't exist in '{0}' ignored: {1}".
+                                format(source_table, str(_nonexisting_target_cols))))
 
-    output_text_mesasge += ("\n Producing correlation for columns: {0}".
-                            format(str(_existing_target_cols)))
-    plpy.notice(output_text_mesasge)
+    output_text_list.append(("Producing correlation for columns: {0}".
+                            format(str(_existing_target_cols))))
+    output_text_list.append("Total run time = " + str(run_time))
     # ---- Output message ----
-
-    return _populate_output_table(schema_madlib, source_table, output_table,
-                                  _existing_target_cols, get_cov, verbose)
+    return '\n'.join(output_text_list)
 # ------------------------------------------------------------------------------
 
 
@@ -255,7 +256,7 @@ def _populate_output_table(schema_madlib, source_table, output_table,
         # clean up and return
         plpy.execute("DROP TABLE {temp_table}".format(**locals()))
         end = time()
-        return (output_table, len(col_names), end - start)
+        return end - start
 # ------------------------------------------------------------------------------
 
 
@@ -289,9 +290,9 @@ The columns of the table are described as follows:
     in 'source_table'.
 
 The output table is arranged as a lower-traingular matrix with the upper
-triangle set to NULL and the diagonal elements set to 1.0. To obtain the
-result from the output_table in this matrix format ensure to order the
-elements using the 'column_position' column.
+triangle set to NULL. To obtain the result from the output_table in this matrix
+format ensure to order the elements using the 'column_position' column.
+
         """.format(schema_madlib=schema_madlib, func=func)
     elif message is not None and message.lower() in ('example', 'examples'):
         return """

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/83391ace/src/ports/postgres/modules/stats/correlation.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/stats/correlation.sql_in b/src/ports/postgres/modules/stats/correlation.sql_in
index 36cb133..3e4e9bd 100644
--- a/src/ports/postgres/modules/stats/correlation.sql_in
+++ b/src/ports/postgres/modules/stats/correlation.sql_in
@@ -276,16 +276,6 @@ CREATE AGGREGATE MADLIB_SCHEMA.covariance_agg(
 );
 
 -----------------------------------------------------------------------
-
-DROP TYPE IF EXISTS MADLIB_SCHEMA.correlation_result CASCADE;
-CREATE TYPE MADLIB_SCHEMA.correlation_result AS
-(
-    output_table text,
-    row_count    integer,
-    duration     double precision
-);
-
------------------------------------------------------------------------
 -- Main function for correlation
 -----------------------------------------------------------------------
 /* @brief Compute a correlation matrix for a table with optional target columns specified
@@ -308,7 +298,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation(
     output_table varchar, -- output table name
     target_cols  varchar, -- comma separated list of output cols (default = '*')
     verbose      boolean  -- flag to determine verbosity
-) RETURNS MADLIB_SCHEMA.correlation_result AS $$
+) RETURNS TEXT AS $$
     PythonFunction(stats, correlation, correlation)
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
@@ -321,7 +311,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation(
     output_table varchar, -- output table name
     target_cols  varchar  -- comma separated list of output cols (default = '*')
 )
-RETURNS MADLIB_SCHEMA.correlation_result AS $$
+RETURNS TEXT AS $$
     select MADLIB_SCHEMA.correlation($1, $2, $3, FALSE)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
@@ -330,7 +320,7 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation(
     source_table varchar, --  input table name
     output_table varchar  -- output table name
-) RETURNS MADLIB_SCHEMA.correlation_result AS $$
+) RETURNS TEXT AS $$
     select MADLIB_SCHEMA.correlation($1, $2, NULL)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
@@ -348,7 +338,7 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
 
 -----------------------------------------------------------------------
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation()
-RETURNS text AS $$
+RETURNS TEXT AS $$
     PythonFunctionBodyOnly(`stats', `correlation')
     return correlation.correlation_help_message(schema_madlib, None)
 $$ LANGUAGE plpythonu IMMUTABLE
@@ -378,7 +368,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.covariance(
     output_table varchar, -- output table name
     target_cols  varchar, -- comma separated list of output cols (default = '*')
     verbose      boolean  -- flag to determine verbosity
-) RETURNS MADLIB_SCHEMA.correlation_result AS $$
+) RETURNS TEXT AS $$
     PythonFunctionBodyOnly(`stats', `correlation')
     return correlation.correlation(schema_madlib, source_table, output_table,
                                    target_cols, True, verbose)
@@ -393,7 +383,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.covariance(
     output_table varchar, -- output table name
     target_cols  varchar  -- comma separated list of output cols (default = '*')
 )
-RETURNS MADLIB_SCHEMA.correlation_result AS $$
+RETURNS TEXT AS $$
     select MADLIB_SCHEMA.covariance($1, $2, $3, FALSE)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
@@ -402,7 +392,7 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.covariance(
     source_table varchar, --  input table name
     output_table varchar  -- output table name
-) RETURNS MADLIB_SCHEMA.correlation_result AS $$
+) RETURNS TEXT AS $$
     select MADLIB_SCHEMA.covariance($1, $2, NULL)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
@@ -420,7 +410,7 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
 
 -----------------------------------------------------------------------
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.covariance()
-RETURNS text AS $$
+RETURNS TEXT AS $$
     PythonFunctionBodyOnly(`stats', `correlation')
     return correlation.correlation_help_message(schema_madlib, None, cov=True)
 $$ LANGUAGE plpythonu IMMUTABLE


[2/2] incubator-madlib git commit: Elastic Net: Check only if features are numeric

Posted by ri...@apache.org.
Elastic Net: Check only if features are numeric

JIRA: MADLIB-952

Columns were being checked to ensure every column is of the same numeric
type. While giving an error for non-numeric types is correct, there is
no need to enforce them to be of same numeric type, as the columns are
cast to float8[] inside the function. The input analyzer has been
changed to relax this condition.

This closes #12.


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/d282e0f3
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/d282e0f3
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/d282e0f3

Branch: refs/heads/master
Commit: d282e0f362c522e74f310b0148d392db6d63ac7c
Parents: 83391ac
Author: Orhan Kislal <ok...@pivotal.io>
Authored: Wed Jan 20 13:13:08 2016 -0800
Committer: Rahul Iyer <ri...@pivotal.io>
Committed: Wed Jan 20 13:22:54 2016 -0800

----------------------------------------------------------------------
 .../modules/elastic_net/elastic_net.py_in       |  6 ++---
 .../elastic_net_optimizer_fista.py_in           |  2 +-
 .../test/elastic_net_install_check.sql_in       | 27 ++++++++++++++++++++
 3 files changed, 30 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/d282e0f3/src/ports/postgres/modules/elastic_net/elastic_net.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net.py_in b/src/ports/postgres/modules/elastic_net/elastic_net.py_in
index 21c581a..22266f9 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net.py_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net.py_in
@@ -302,7 +302,6 @@ def elastic_net_train(schema_madlib, tbl_source, tbl_result, col_dep_var,
                    SELECT {schema_madlib}.elastic_net_train('gaussian');
                    for supported optimizers.
                    """.format(schema_madlib=schema_madlib))
-
     # handle all special cases of col_ind_var
     col_ind_var, outstr_array = analyze_input_str(schema_madlib, tbl_source,
                                                   col_ind_var, col_dep_var,
@@ -420,13 +419,12 @@ def analyze_input_str(schema_madlib, tbl_source,
                                             col_ind_var)
         else:
             included_col_types = [col_types_dict[i] for i in outstr_array]
-            if not all(i == included_col_types[0] and is_psql_numeric_type(i)
+            if not all(is_psql_numeric_type(i)
                        for i in included_col_types):
                 plpy.error("""
                            Elastic Net error: All columns to be included in the
-                           independent variables should be of the same numeric type.
+                           independent variables should be of the numeric type.
                            """)
-
         col_ind_var_new = "ARRAY[" + ','.join(outstr_array) + "]"
         return (col_ind_var_new, outstr_array)
 

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/d282e0f3/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_fista.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_fista.py_in b/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_fista.py_in
index a546de4..e3e5977 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_fista.py_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_fista.py_in
@@ -422,7 +422,7 @@ def __compute_fista(schema_madlib, func_step_aggregate, func_state_diff,
         arguments
     @param tbl_state Name of the (temporary) table containing the inter-iteration
         states
-    @param rel_source Name of the relation containing input points
+    @param tbl_source Name of the relation containing input points
     @param col_ind_var Name of the independent variables column
     @param col_dep_var Name of the dependent variable column
     @param drop_table Boolean, whether to use IterationController (True) or

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/d282e0f3/src/ports/postgres/modules/elastic_net/test/elastic_net_install_check.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/elastic_net/test/elastic_net_install_check.sql_in b/src/ports/postgres/modules/elastic_net/test/elastic_net_install_check.sql_in
index c45ccd2..643750a 100644
--- a/src/ports/postgres/modules/elastic_net/test/elastic_net_install_check.sql_in
+++ b/src/ports/postgres/modules/elastic_net/test/elastic_net_install_check.sql_in
@@ -514,6 +514,16 @@ COPY lin_housing_wi (x, y) FROM STDIN NULL '?';
 {1,0.04741,0.00,11.930,0,0.5730,6.0300,80.80,2.5050,1,273.0,21.00,396.90,7.88}	11.90
 \.
 
+DROP TABLE IF EXISTS elastic_type_src;
+
+CREATE TABLE elastic_type_src ( var_int int, var_float8 float8, var_sint smallint);
+COPY elastic_type_src (var_int, var_float8, var_sint) FROM stdin DELIMITER ',' NULL '?' ;
+1, 1.1, 1
+2, 2.2, 2
+3, 3.3, 3
+4, 4.4, 4
+\.
+
 create function check_elastic_net ()
 returns void as $$
 begin
@@ -614,6 +624,23 @@ begin
         'Elastic Net: log-likelihood mismatch (use_active_set = t)!'
     ) from house_en;
 
+    execute 'DROP TABLE IF EXISTS elastic_type_res';
+    perform elastic_net_train('elastic_type_src',
+		'elastic_type_res',
+		'var_int < 0',
+		'*',
+		'binomial',
+		0.6,
+		0.02,
+		TRUE,
+		NULL,
+		'fista',
+		'',
+		'',
+		10000,
+		1e-6
+	);
+
 end;
 $$ language plpgsql volatile;