You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@madlib.apache.org by ri...@apache.org on 2017/03/13 20:57:23 UTC

[01/50] [abbrv] incubator-madlib git commit: Build: Correct madlib version in gppkg spec file

Repository: incubator-madlib
Updated Branches:
  refs/heads/latest_release e1c99c153 -> a3863b6c2


Build: Correct madlib version in gppkg spec file


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/c56b2091
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/c56b2091
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/c56b2091

Branch: refs/heads/latest_release
Commit: c56b20910040c9fb2deabfb54ddc25abe8609942
Parents: e43b449
Author: Rahul Iyer <ri...@apache.org>
Authored: Mon Jan 9 15:44:36 2017 -0800
Committer: Rahul Iyer <ri...@apache.org>
Committed: Mon Jan 9 15:44:36 2017 -0800

----------------------------------------------------------------------
 deploy/gppkg/madlib.spec.in | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c56b2091/deploy/gppkg/madlib.spec.in
----------------------------------------------------------------------
diff --git a/deploy/gppkg/madlib.spec.in b/deploy/gppkg/madlib.spec.in
index 34f4b89..123eb1c 100644
--- a/deploy/gppkg/madlib.spec.in
+++ b/deploy/gppkg/madlib.spec.in
@@ -2,7 +2,7 @@
 %define __os_install_post %{nil}
 %define _rpmfilename      @MADLIB_GPPKG_RPM_FILE_NAME@
 %define _unpackaged_files_terminate_build 0
-%define _madlib_version  @MADLIB_VERSION_STRING_NO_HYPHEN@
+%define _madlib_version  @MADLIB_VERSION_STRING@
 
 BuildRoot:      @MADLIB_GPPKG_RPM_SOURCE_DIR@
 Summary:        MADlib for @GPDB_VARIANT@ Database
@@ -49,8 +49,7 @@ fi
 # GPPKG version is setup with underscore replacing a hyphen but
 # the actual directories created on disk use the hyphenated version.
 # Hence we replace the underscore to link to the right location
-MADLIB_VERSION_NO_HYPHEN=%{_madlib_version}
-MADLIB_VERSION="${MADLIB_VERSION_NO_HYPHEN/_/-}"
+MADLIB_VERSION=%{_madlib_version}
 ln -nsf $RPM_INSTALL_PREFIX/madlib/Versions/$MADLIB_VERSION $RPM_INSTALL_PREFIX/madlib/Current
 ln -nsf $RPM_INSTALL_PREFIX/madlib/Current/bin $RPM_INSTALL_PREFIX/madlib/bin
 ln -nsf $RPM_INSTALL_PREFIX/madlib/Current/doc $RPM_INSTALL_PREFIX/madlib/doc

[50/50] [abbrv] incubator-madlib git commit: Multiple: Update license headers per Apache guidance

Posted by ri...@apache.org.

Multiple: Update license headers per Apache guidance


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/a3863b6c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/a3863b6c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/a3863b6c

Branch: refs/heads/latest_release
Commit: a3863b6c2407eb28ba007f6288d167bf88674e6d
Parents: fa80240
Author: Rahul Iyer <ri...@apache.org>
Authored: Tue Feb 28 10:31:59 2017 -0800
Committer: Rahul Iyer <ri...@apache.org>
Committed: Tue Feb 28 10:31:59 2017 -0800

----------------------------------------------------------------------
 doc/design/figures/class_diagram.mp | 17 +++++++++++++++++
 pom.xml                             |  1 -
 src/madpack/sort-module.py          | 17 -----------------
 3 files changed, 17 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/a3863b6c/doc/design/figures/class_diagram.mp
----------------------------------------------------------------------
diff --git a/doc/design/figures/class_diagram.mp b/doc/design/figures/class_diagram.mp
index d7edbf8..10940f9 100644
--- a/doc/design/figures/class_diagram.mp
+++ b/doc/design/figures/class_diagram.mp
@@ -1,3 +1,20 @@
+% Licensed to the Apache Software Foundation (ASF) under one
+% or more contributor license agreements.  See the NOTICE file
+% distributed with this work for additional information
+% regarding copyright ownership.  The ASF licenses this file
+% to you under the Apache License, Version 2.0 (the
+% "License"); you may not use this file except in compliance
+% with the License.  You may obtain a copy of the License at
+%
+%   http://www.apache.org/licenses/LICENSE-2.0
+%
+% Unless required by applicable law or agreed to in writing,
+% software distributed under the License is distributed on an
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+% KIND, either express or implied.  See the License for the
+% specific language governing permissions and limitations
+% under the License.
+
 % Metapost figure compiled offline to create the class diagram for modular fold/reduce
 % need MetaUML (https://github.com/ogheorghies/MetaUML) to compile this file
 % to create pdf figure: mptopdf class_diagram.mp

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/a3863b6c/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index e4eefc4..5defa28 100644
--- a/pom.xml
+++ b/pom.xml
@@ -78,7 +78,6 @@
               <exclude>doc/design/design.tex</exclude>
               <exclude>doc/design/figures/basics_decision_tree.pdf</exclude>
               <exclude>doc/design/figures/class_diagram-1.pdf</exclude>
-              <exclude>doc/design/figures/class_diagram.mp</exclude>
               <exclude>doc/design/figures/decision_tree_linear_model.pdf</exclude>
               <exclude>doc/design/figures/impurity_measures.pdf</exclude>
               <exclude>doc/design/figures/lanczos_bidiag_segment.pdf</exclude>

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/a3863b6c/src/madpack/sort-module.py
----------------------------------------------------------------------
diff --git a/src/madpack/sort-module.py b/src/madpack/sort-module.py
index 3c4691d..0d8d478 100644
--- a/src/madpack/sort-module.py
+++ b/src/madpack/sort-module.py
@@ -1,21 +1,4 @@
 #!/usr/bin/env python
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
 
 """
 @file sort-module.py

[24/50] [abbrv] incubator-madlib git commit: Encode categorical: Update doc and code comments

Posted by ri...@apache.org.

Encode categorical: Update doc and code comments


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/91622717
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/91622717
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/91622717

Branch: refs/heads/latest_release
Commit: 916227178c60279861dfce2412df57e37a06651a
Parents: 59a09ee
Author: Rahul Iyer <ri...@apache.org>
Authored: Wed Feb 1 16:12:46 2017 -0800
Committer: Rahul Iyer <ri...@apache.org>
Committed: Wed Feb 1 16:18:35 2017 -0800

----------------------------------------------------------------------
 .../modules/utilities/encode_categorical.py_in  | 15 ++++++++----
 .../modules/utilities/encode_categorical.sql_in | 24 +++++++++++++-------
 2 files changed, 26 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/91622717/src/ports/postgres/modules/utilities/encode_categorical.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/encode_categorical.py_in b/src/ports/postgres/modules/utilities/encode_categorical.py_in
index 26361a6..a92c559 100644
--- a/src/ports/postgres/modules/utilities/encode_categorical.py_in
+++ b/src/ports/postgres/modules/utilities/encode_categorical.py_in
@@ -159,7 +159,7 @@ class CategoricalEncoder(object):
         self._row_id_cols = split_quoted_delimited_str(self.row_id)
 
         # flag to build a dictionary table
-        self._output_dictionary = True if self.output_type in ['array', 'svec'] else self.output_dictionary
+        self._output_dictionary = True if self.output_type in ('array', 'svec') else self.output_dictionary
 
         # how to distribute the output table (for distributed platforms)
         if not is_platform_pg():
@@ -228,6 +228,9 @@ class CategoricalEncoder(object):
             _assert(is_var_valid(self.source_table, ','.join(self._row_id_cols)),
                     "Encoding categorical: Not all columns from ({0}) present in source table ({1})"
                     .format(self._row_id_cols, self.source_table))
+        _assert(self.output_type in ('column', 'array', 'svec'),
+                "Encoding categorical: Output type should be one of {0}".
+                format(('column', 'array', 'svec')))
     # ------------------------------------------------------------------------------
 
     def _is_col_name_long(self, col_to_values):
@@ -301,8 +304,6 @@ class CategoricalEncoder(object):
                 v_type = str
 
             if self.output_type not in ('array', 'svec'):
-                # array_output = True implies all the case outputs will be wrapped
-                # as an array, hence not requiring an alias for each case
                 if not self._output_dictionary:
                     value_names = {None: 'NULL',
                                    list: self._name_others_col,
@@ -311,6 +312,8 @@ class CategoricalEncoder(object):
                 else:
                     alias = 'AS "{0}_{1}"'.format(col_no_quotes, seq)
             else:
+                # if output_type is array-like then each case does not
+                # require an alias
                 alias = ""
             return ("(CASE WHEN ({col} {value_str}) "
                     "THEN 1 ELSE 0 END)::INTEGER {alias}".
@@ -569,7 +572,8 @@ def encode_categorical_variables(
         @param top: str, Parameter to include only top values of a categorical variable
         @param value_to_drop: str, Parameter to set reference column in dummy coding
         @param encode_null: bool, If True, NULL is treated as a categorical value
-        @param array_output: bool, Parameter to determine if output should be in an array or columns
+        @param output_type: str, Parameter to determine if output should be an array, svec or individual columns
+                                 Can take values ('column', 'array', 'svec')
         @param output_dictionary: bool, If True columns names are simplified and
                     a separate mapping table is created to understand the names
         @param distributed_by: str, Comma-separated list of column names to use for distribution of output
@@ -644,7 +648,8 @@ SELECT {madlib}.encode_categorical_variables (
         value_to_drop,                  -- (Optional) Reference value to drop for each column
         encode_null,                    -- (Optional) Whether NULL should be treated as one of the
                                         --  values of the categorical variable.
-        output_type,                    -- (Optional) Get all encoded variables in an array
+        output_type,                    -- (Optional) Get encoded variables in individual columns
+                                        --    or as an array (Can be 'column', 'array', or 'svec')
         output_dictionary,              -- (Optional) Simplify output column naming and provide
                                         --  a mapping between simple names and meaning
         distributed_by                  -- (Optional) Columns to use for the distribution policy of

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/91622717/src/ports/postgres/modules/utilities/encode_categorical.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/encode_categorical.sql_in b/src/ports/postgres/modules/utilities/encode_categorical.sql_in
index c4151b2..a14337c 100644
--- a/src/ports/postgres/modules/utilities/encode_categorical.sql_in
+++ b/src/ports/postgres/modules/utilities/encode_categorical.sql_in
@@ -159,16 +159,24 @@ encode_categorical_variables (
     all encoded values for that variable will be set to 0.
     </dd>
 
-    <dt>array_output (optional)</dt>
-    <dd>BOOLEAN. default: FALSE.  This parameter controls the output format
-    of the indicator variables. If FALSE, a column is created for each indicator
+    <dt>output_type (optional)</dt>
+    <dd>VARCHAR. default: 'column'.  This parameter controls the output format
+    of the indicator variables. If 'column', a column is created for each indicator
     variable. PostgreSQL limits the number of columns in a table.
     If the total number of indicator columns exceeds the limit, then make this
-    parameter TRUE to combine the indicator columns
-    into an array. The order of the array is the same as specified in 'categorical_cols'.
-    A dictionary will be created when 'array_output' is TRUE to define an index into
-    the array.  The dictionary table will be given the name of the 'output_table'
-    appended by '_dictionary'.
+    parameter either 'array' to combine the indicator columns into an array or
+    'svec' to cast the array output to <em>'madlib.svec'</em> type.
+
+    Since the array output for any single tuple would be sparse
+    (only one non-zero entry for each categorical column), the 'svec' output would
+    be most efficient for storage. The 'array' output is useful if the array is
+    used for post-processing, including concatenating with other non-categorical
+    features.
+
+    The order of the array is the same as specified in 'categorical_cols'.
+    A dictionary will be created when 'output_type' is 'array' or 'svec' to
+    define an index into the array.  The dictionary table will be given the name
+    of the 'output_table' appended by '_dictionary'.
     </dd>
 
     <dt>output_dictionary (optional)</dt>

[40/50] [abbrv] incubator-madlib git commit: Release v1.10:

Posted by ri...@apache.org.

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/90f4dc15/src/madpack/changelist_1.3_1.9.1.yaml
----------------------------------------------------------------------
diff --git a/src/madpack/changelist_1.3_1.9.1.yaml b/src/madpack/changelist_1.3_1.9.1.yaml
deleted file mode 100644
index bb7c195..0000000
--- a/src/madpack/changelist_1.3_1.9.1.yaml
+++ /dev/null
@@ -1,1283 +0,0 @@
-# Changelist for MADlib version 1.3 to 1.7
-
-# This file contains all changes that were introduced in a new version of
-# MADlib. This changelist is used by the upgrade script to detect what objects
-# should be upgraded (while retaining all other objects from the previous version)
-
-# New modules (actually .sql_in files) added in upgrade version
-# For these files the sql_in code is retained as is with the functions in the
-# file installed on the upgrade version. All other files (that don't have
-# updates), are cleaned up to remove object replacements
-new module:
-    # ----------------- Changes from 1.3 to 1.4 -----------------
-    robust_variance_coxph:
-    clustered_variance_coxph:
-    # ----------------- Changes from 1.4 to 1.4.1 ---------------
-    # ----------------- Changes from 1.4.1 to 1.5 ---------------
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    table_to_pmml:
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    glm:
-    multiresponseglm:
-    ordinal:
-    decision_tree:
-    random_forest:
-    distribution:
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    text_utilities:
-
-# Changes in the types (UDT) including removal and modification
-udt:
-
-    # ----------------- Changes from 1.3 to 1.4 -----------------
-    coxph_result:
-    mlogregr_result:
-    marginal_logregr_result:
-    marginal_mlogregr_result:
-    # ----------------- Changes from 1.4 to 1.4.1 ---------------
-    # ----------------- Changes from 1.4.1 to 1.5 ---------------
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    __logregr_result:
-    # coxph_result: # duplicate
-    linregr_result:
-    # mlogregr_result: # duplicate
-    # some types missed before upgrade to v1.6
-    __utils_scaled_data:
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    bytea8:
-    # ----------------- Changes from 1.8 to 1.9 ----------
-    __enc_tbl_result:
-    __gen_acc_time:
-    __rep_type:
-    __train_result:
-    c45_classify_result:
-    c45_train_result:
-    correlation_result:
-    lsvm_sgd_model_rec:
-    lsvm_sgd_result:
-    rf_classify_result:
-    rf_train_result:
-    svm_cls_result:
-    svm_model_pr:
-    svm_model_rec:
-    svm_nd_result:
-    svm_reg_result:
-    svm_support_vector:
-    _prune_result_type:
-    _tree_result_type:
-    linear_svm_result:
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    profile_result:
-
-# List of the UDF changes that affect the user externally.  This includes change
-# in function name, change in argument order or argument types, and removal of
-# the function. In each case, the original function is as good as removed and a
-# new function is created. In such cases, we should abort the upgrade if there
-# are user views dependent on this function, since the original function will
-# not be present in the upgraded version.
-udf:
-    # ----------------- Changes from 1.3 to 1.4 -----------------
-    - __internal_get_robust_linregr_insert_string:
-        rettype: character varying
-        argument: schema_madlib.robust_linregr_result, double precision[], text
-    - __internal_get_robust_linregr_result:
-        rettype: schema_madlib.robust_linregr_result
-        argument: character varying, character varying, character varying, double precision[]
-    - __internal_get_robust_logregr_insert_string:
-        rettype: character varying
-        argument: schema_madlib.robust_logregr_result, text
-    - __internal_get_robust_logregr_result:
-        rettype: schema_madlib.robust_logregr_result
-        argument: character varying, character varying, character varying, double precision[]
-    - __internal_get_robust_mlogregr_insert_string:
-        rettype: character varying
-        argument: schema_madlib.robust_mlogregr_result, text
-    - __lda_count_topic_prefunc:
-        rettype: integer[]
-        argument: integer[], integer[]
-    - __lda_count_topic_sfunc:
-        rettype: integer[]
-        argument: integer[], integer[], integer[], integer[], integer, integer
-    - __lda_gibbs_sample:
-        rettype: integer[]
-        argument: integer[], integer[], integer[], integer[], double precision, double precision, integer, integer, integer
-    - __lda_perplexity_ffunc:
-        rettype: double precision
-        argument: integer[]
-    - __lda_perplexity_prefunc:
-        rettype: integer[]
-        argument: integer[], integer[]
-    - __lda_perplexity_sfunc:
-        rettype: integer[]
-        argument: integer[], integer[], integer[], integer[], integer[], double precision, double precision, integer, integer
-    - __lda_util_transpose:
-        rettype: integer[]
-        argument: integer[]
-    - __lda_util_unnest:
-        rettype: SETOF integer[]
-        argument: integer[]
-    - clustered_variance_mlogregr:
-        rettype: void
-        argument: text, text, text, text, text, integer, text, integer, text, double precision
-    - clustered_variance_mlogregr:
-        rettype: void
-        argument: text, text, text, text, text, integer, text, integer, text
-    - clustered_variance_mlogregr:
-        rettype: void
-        argument: text, text, text, text, text, integer, text, integer, text, double precision, boolean
-    - robust_input_checking:
-        rettype: void
-        argument: character varying, character varying, character varying, character varying
-
-    # ----------------- Changes from 1.4 to 1.4.1 ---------------
-    # ----------------- Changes from 1.4.1 to 1.5 ---------------
-    - __cmsketch_final:
-        rettype: bytea
-        argument: bytea
-    - __delete_traininginfo:
-        rettype: void
-        argument: text
-    - __get_encode_table_name:
-        rettype: text
-        argument: text
-    - __get_metatable_name:
-        rettype: text
-        argument: text
-    - __get_routine_id:
-        rettype: integer
-        argument: text
-    - __get_routine_name:
-        rettype: text
-        argument: text
-    - __get_tree_table_name:
-        rettype: text
-        argument: text
-    - __insert_into_traininginfo:
-        rettype: void
-        argument: text, text, text, text, text, text, text, text, double precision, integer, integer
-    - __treemodel_clean:
-        rettype: boolean
-        argument: text
-    - compute_lincrf:
-        rettype: integer
-        argument: character varying, character varying, character varying, character varying, character varying, integer, integer
-    - create_nb_classify_fn:
-        rettype: void
-        argument: character varying, character varying, integer, character varying
-    - create_nb_classify_fn:
-        rettype: void
-        argument: character varying, character varying, character varying, integer, character varying
-    - create_table_as:
-        rettype: void
-        argument: character varying, character varying
-    - create_table_as:
-        rettype: void
-        argument: character varying, character varying, character varying
-    - create_temp_table_as:
-        rettype: void
-        argument: character varying, character varying, character varying
-    - create_temp_table_as:
-        rettype: void
-        argument: character varying, character varying
-    - create_temporary_table_as:
-        rettype: void
-        argument: character varying, character varying, character varying
-    - create_temporary_table_as:
-        rettype: void
-        argument: character varying, character varying
-    - crf_train_fgen:
-        rettype: void
-        argument: text, text, text, text, text
-    - insert_into:
-        rettype: void
-        argument: character varying, character varying
-    - internal_create_table_as:
-        rettype: void
-        argument: boolean, character varying, character varying, character varying
-    - internal_execute_using_kmeans_args:
-        rettype: void
-        argument: character varying, double precision[], regproc, integer, double precision
-    - internal_execute_using_kmeanspp_seeding_args:
-        rettype: void
-        argument: character varying, integer, regproc, double precision[]
-    - internal_execute_using_silhouette_args:
-        rettype: double precision
-        argument: character varying, double precision[], regproc
-    - lincrf:
-        rettype: integer
-        argument: character varying, character varying, character varying, character varying, character varying, integer, character varying, character varying, integer
-    - lincrf:
-        rettype: integer
-        argument: character varying, character varying, character varying, character varying, character varying, integer, character varying, character varying
-    - lsvm_predict:
-        rettype: double precision
-        argument: text, double precision[]
-    - lsvm_predict_combo:
-        rettype: SETOF schema_madlib.svm_model_pr
-        argument: text, double precision[]
-    - lsvm_sgd_update:
-        rettype: schema_madlib.lsvm_sgd_model_rec
-        argument: schema_madlib.lsvm_sgd_model_rec, double precision[], double precision, double precision, double precision
-    - svm_cls_update:
-        rettype: schema_madlib.svm_model_rec
-        argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision
-    - svm_nd_update:
-        rettype: schema_madlib.svm_model_rec
-        argument: schema_madlib.svm_model_rec, double precision[], text, double precision, double precision
-    - svm_predict:
-        rettype: double precision
-        argument: schema_madlib.svm_model_rec, double precision[], text
-    - svm_predict:
-        rettype: double precision
-        argument: text, double precision[]
-    - svm_predict_combo:
-        rettype: SETOF schema_madlib.svm_model_pr
-        argument: text, double precision[]
-    - svm_predict_sub:
-        rettype: double precision
-        argument: integer, integer, double precision[], double precision[], double precision[], text
-    - svm_reg_update:
-        rettype: schema_madlib.svm_model_rec
-        argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision, double precision
-    - utils_normalize_data:
-        rettype: schema_madlib.__utils_scaled_data
-        argument: double precision[], double precision[], double precision[]
-    - vcrf_top1_label:
-        rettype: integer[]
-        argument: integer[], integer[], integer
-    - vcrf_top1_view:
-        rettype: text
-        argument: text, text, text, text
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    # Removed functions
-    - array_contains_null:
-        rettype: boolean
-        argument: double precision[]
-    - array_sqrt:
-        rettype: anyarray
-        argument: anyarray
-    - coxph_step_final:
-        rettype: double precision[]
-        argument: double precision[]
-    - coxph_step_strata_final:
-        rettype: double precision[]
-        argument: double precision[]
-    - coxph_step_transition:
-        rettype: double precision[]
-        argument: double precision[], double precision[], double precision, boolean, double precision[]
-    - internal_coxph_result:
-        rettype: schema_madlib.coxph_result
-        argument: double precision[]
-    - internal_coxph_step_distance:
-        rettype: double precision
-        argument: double precision[], double precision[]
-    - normalize:
-        rettype: double precision[]
-        argument: double precision[]
-    # Changed functions (return type)
-    # These functions can be recreated correctly even if we don't add them here.
-    # But the view dependency checker needs the information.
-    - __internal_mlogregr_irls_result:
-        rettype: schema_madlib.mlogregr_result
-        argument: double precision[]
-    - __logregr_cg_result:
-        rettype: schema_madlib.__logregr_result
-        argument: double precision[]
-    - __logregr_igd_result:
-        rettype: schema_madlib.__logregr_result
-        argument: double precision[]
-    - __logregr_irls_result:
-        rettype: schema_madlib.__logregr_result
-        argument: double precision[]
-    - linregr_final:
-        rettype: schema_madlib.linregr_result
-        argument: schema_madlib.bytea8
-    - mlogregr:
-        rettype: schema_madlib.mlogregr_result
-        argument: character varying, character varying, character varying, integer, character varying, double precision, integer
-    - mlogregr:
-        rettype: schema_madlib.mlogregr_result
-        argument: character varying, character varying, character varying, integer, character varying
-    - mlogregr:
-        rettype: schema_madlib.mlogregr_result
-        argument: character varying, character varying, character varying, integer
-    - mlogregr:
-        rettype: schema_madlib.mlogregr_result
-        argument: character varying, character varying, character varying
-    # make-ups from upgrade to v1.6
-    - marginal_logregr_step_final:
-        rettype: schema_madlib.marginal_logregr_result
-        argument: double precision[]
-    - mlogregr_marginal_step_final:
-        rettype: schema_madlib.marginal_mlogregr_result
-        argument: double precision[]
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    - cross_validation_general:   # change in name of argument "fold_num"
-        rettype: void
-        argument: character varying, character varying[], character varying[], character varying, character varying[], character varying, character varying[], character varying[], character varying, character varying[], character varying[], character varying, character varying, boolean, character varying, character varying[], integer
-    - lmf_igd_transition:
-        rettype: double precision[]
-        argument: double precision[], smallint, smallint, double precision, double precision[], smallint, smallint, smallint, double precision, double precision
-    # depending on bytea8
-    # return type is bytea8
-    - __clustered_err_lin_merge:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - __clustered_err_lin_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[], double precision[]
-    - __clustered_err_log_merge:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - __clustered_err_log_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, boolean, double precision[], double precision[]
-    - __clustered_err_mlog_merge:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - __clustered_err_mlog_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, integer, double precision[], double precision[], integer, integer
-    - dense_residual_norm_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - dense_residual_norm_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision[], double precision, double precision[]
-    - hetero_linregr_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - hetero_linregr_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[], double precision[]
-    - linregr_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - linregr_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[]
-    - robust_linregr_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - robust_linregr_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[], double precision[]
-    - weighted_sample_merge_int64:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - weighted_sample_merge_vector:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - weighted_sample_transition_int64:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, bigint, double precision
-    - weighted_sample_transition_vector:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision[], double precision
-    # argument type bytea8
-    - __clustered_err_lin_final:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: schema_madlib.bytea8
-    - __clustered_err_log_final:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: schema_madlib.bytea8
-    - __clustered_err_mlog_final:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: schema_madlib.bytea8
-    - dense_residual_norm_final:
-        rettype: schema_madlib.residual_norm_result
-        argument: schema_madlib.bytea8
-    - hetero_linregr_final:
-        rettype: schema_madlib.heteroskedasticity_test_result
-        argument: schema_madlib.bytea8
-    - linregr_final:
-        rettype: schema_madlib.linregr_result
-        argument: schema_madlib.bytea8
-    - robust_linregr_final:
-        rettype: schema_madlib.robust_linregr_result
-        argument: schema_madlib.bytea8
-    - weighted_sample_final_int64:
-        rettype: bigint
-        argument: schema_madlib.bytea8
-    - weighted_sample_final_vector:
-        rettype: double precision[]
-        argument: schema_madlib.bytea8
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    - __filter_input_relation:
-        rettype: character varying
-        argument: character varying, character varying
-    - __lda_util_unnest:
-        rettype: SETOF bigint[]
-        argument: bigint[]
-    - matrix_block_mult:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text
-    - matrix_block_square:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - matrix_block_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - matrix_block_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, boolean
-    - matrix_blockize:
-        rettype: schema_madlib.matrix_result
-        argument: text, integer, integer, text
-    - matrix_densify:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text, text, text, boolean
-    - matrix_mult:
-        rettype: schema_madlib.matrix_result
-        argument: text, boolean, text, boolean, text
-    - matrix_mult:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text, text, boolean, text, text, text, text, boolean, text
-    - matrix_norm:
-        rettype: double precision
-        argument: text
-    - matrix_scale_and_add:
-        rettype: void
-        argument: text, text, double precision, text
-    - matrix_sparsify:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, boolean
-    - matrix_square:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - matrix_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, boolean
-    - matrix_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text, text, text, boolean
-    - matrix_unblockize:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    # ----------------- Changes from 1.8 to 1.9 ----------
-
-    - _dt_apply:
-       rettype: schema_madlib._tree_result_type
-       argument: schema_madlib.bytea8,schema_madlib.bytea8,schema_madlib.bytea8,smallint,smallint,smallint,boolean,integer
-
-    - internal_linear_svm_igd_result:
-       rettype: schema_madlib.linear_svm_result
-       argument: double precision[]
-
-    - _prune_and_cplist:
-       rettype: schema_madlib._prune_result_type
-       argument: schema_madlib.bytea8,double precision,boolean
-
-    - __array_elem_in:
-       rettype: boolean[]
-       argument: anyarray, anyarray
-
-    - __array_indexed_agg_ffunc:
-       rettype: double precision[]
-       argument: double precision[]
-
-    - __array_indexed_agg_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __array_indexed_agg_sfunc:
-       rettype: double precision[]
-       argument: double precision[], double precision, bigint, bigint
-
-    - __array_search:
-       rettype: boolean
-       argument: anyelement, anyarray
-
-    - __array_sort:
-       rettype: anyarray
-       argument: anyarray
-
-    - __assert:
-       rettype: void
-       argument: boolean, text
-
-    - __assert_table:
-       rettype: void
-       argument: text, boolean
-
-    - __best_scv_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __best_scv_sfunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[], integer, double precision
-
-    - __bigint_array_add:
-       rettype: bigint[]
-       argument: bigint[], bigint[]
-
-    - __breakup_table:
-       rettype: void
-       argument: text, text, text, text, text, text[], boolean[], integer, integer
-
-    - __check_dt_common_params:
-       rettype: void
-       argument: text, text, text, text, text, text, text, text, integer, double precision, double precision, integer, text
-
-    - __check_training_table:
-       rettype: void
-       argument: text, text[], text[], text, text, integer
-
-    - __column_exists:
-       rettype: boolean
-       argument: text, text
-
-    - __columns_in_table:
-       rettype: boolean
-       argument: text[], text
-
-    - __create_metatable:
-       rettype: void
-       argument: text
-
-    - __create_tree_tables:
-       rettype: void
-       argument: text
-
-    - __csvstr_to_array:
-       rettype: text[]
-       argument: text
-
-    - __display_node_sfunc:
-       rettype: text
-       argument: text, integer, boolean, text, text, double precision, double precision, text, integer
-
-    - __display_tree_no_ordered_aggr:
-       rettype: text
-       argument: text, integer, integer, integer, boolean, double precision, text, integer, integer
-
-    - __distinct_feature_value:
-       rettype: integer
-       argument: text, integer
-
-    - __drop_metatable:
-       rettype: void
-       argument: text
-
-    - __dt_acc_count_sfunc:
-       rettype: bigint[]
-       argument: bigint[], integer, bigint, integer
-
-    - __dt_get_node_split_fids:
-       rettype: integer[]
-       argument: integer, integer, integer, integer[]
-
-    - __ebp_calc_errors:
-       rettype: double precision
-       argument: double precision, double precision, double precision
-
-    - __ebp_prune_tree:
-       rettype: void
-       argument: text
-
-    - __encode_and_train:
-       rettype: record
-       argument: text, text, integer, integer, text, text, text, text, text, text, text, double precision, text, integer, double precision, boolean, double precision, double precision, text, integer
-
-    - __encode_columns:
-       rettype: void
-       argument: text, text, integer, integer
-
-    - __encode_table:
-       rettype: void
-       argument: text, text, text, integer, integer
-
-    - __encode_table:
-       rettype: void
-       argument: text, text, text[], text, text[], text, text, integer, integer
-
-    - __find_best_split:
-       rettype: void
-       argument: text, double precision, text, integer, integer, text, integer, integer
-
-    - __format:
-       rettype: text
-       argument: text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text[]
-
-    - __gen_acc:
-       rettype: __gen_acc_time
-       argument: text, text, text, text, text, integer, integer, boolean, integer
-
-    - __gen_enc_meta_names:
-       rettype: text[]
-       argument: text, text
-
-    - __gen_horizontal_encoded_table:
-       rettype: void
-       argument: text, text, integer, integer
-
-    - __gen_vertical_encoded_table:
-       rettype: void
-       argument: text, text, text, boolean, integer
-
-    - __generate_final_tree:
-       rettype: void
-       argument: text
-
-    - __get_class_column_name:
-       rettype: text
-       argument: text
-
-    - __get_class_value:
-       rettype: text
-       argument: integer, text
-
-    - __get_classtable_name:
-       rettype: text
-       argument: text
-
-    - __get_column_value:
-       rettype: text
-       argument: integer, integer, character, text
-
-    - __get_feature_name:
-       rettype: text
-       argument: integer, text
-
-    - __get_feature_value:
-       rettype: text
-       argument: integer, integer, text
-
-    - __get_features_of_nodes:
-       rettype: text
-       argument: text, text, integer, integer, integer
-
-    - __get_id_column_name:
-       rettype: text
-       argument: text
-
-    - __get_schema_name:
-       rettype: text
-       argument: text
-
-    - __get_table_name:
-       rettype: text
-       argument: text
-
-    - __insert_into_metatable:
-       rettype: void
-       argument: text, integer, text, character, boolean, text, integer
-
-    - __is_valid_enc_table:
-       rettype: boolean
-       argument: text
-
-    - __num_of_class:
-       rettype: integer
-       argument: text
-
-    - __num_of_columns:
-       rettype: integer
-       argument: text
-
-    - __num_of_feature:
-       rettype: integer
-       argument: text
-
-    - __regclass_to_text:
-       rettype: text
-       argument: regclass
-
-    - __rename_table:
-       rettype: void
-       argument: text, text
-
-    - __rep_aggr_class_count_ffunc:
-       rettype: bigint[]
-       argument: bigint[]
-
-    - __rep_aggr_class_count_sfunc:
-       rettype: bigint[]
-       argument: bigint[], integer, integer, integer
-
-    - __rep_prune_tree:
-       rettype: void
-       argument: text, text, integer
-
-    - __sample_with_replacement:
-       rettype: void
-       argument: integer, bigint, text, text
-
-    - __sample_within_range:
-       rettype: SETOF bigint
-       argument: bigint, bigint, bigint
-
-    - __scv_aggr_ffunc:
-       rettype: double precision[]
-       argument: double precision[]
-
-    - __scv_aggr_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __scv_aggr_sfunc:
-       rettype: double precision[]
-       argument: double precision[], integer, boolean, integer, double precision[], double precision[], bigint
-
-    - __strip_schema_name:
-       rettype: text
-       argument: text
-
-    - __svm_random_ind2:
-       rettype: double precision[]
-       argument: integer
-
-    - __svm_random_ind:
-       rettype: double precision[]
-       argument: integer
-
-    - __svm_target_cl_func:
-       rettype: double precision
-       argument: double precision[]
-
-    - __svm_target_reg_func:
-       rettype: double precision
-       argument: double precision[]
-
-    - __table_exists:
-       rettype: boolean
-       argument: text
-
-    - __train_tree:
-       rettype: __train_result
-       argument: text, integer, integer, text, text, text, text, text, text, double precision, integer, double precision, double precision, double precision, boolean, integer, integer
-
-    - __treemodel_classify_internal:
-       rettype: text[]
-       argument: text, text, integer
-
-    - __treemodel_classify_internal_serial:
-       rettype: text[]
-       argument: text, text, integer
-
-    - __treemodel_display_no_ordered_aggr:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - __treemodel_display_with_ordered_aggr:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - __treemodel_get_vote_result:
-       rettype: void
-       argument: text, text
-
-    - __treemodel_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - __validate_input_table:
-       rettype: void
-       argument: text, text[], text, text
-
-    - __validate_metatable:
-       rettype: void
-       argument: text
-
-    - c45_classify:
-       rettype: c45_classify_result
-       argument: text, text, text
-
-    - c45_classify:
-       rettype: c45_classify_result
-       argument: text, text, text, integer
-
-    - c45_clean:
-       rettype: boolean
-       argument: text
-
-    - c45_display:
-       rettype: SETOF text
-       argument: text
-
-    - c45_display:
-       rettype: SETOF text
-       argument: text, integer
-
-    - c45_genrule:
-       rettype: SETOF text
-       argument: text
-
-    - c45_genrule:
-       rettype: SETOF text
-       argument: text, integer
-
-    - c45_score:
-       rettype: double precision
-       argument: text, text
-
-    - c45_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text, text, text, text, text, text, double precision, text
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text, text, text, text, text, text, double precision, text, integer, double precision, double precision, integer
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying, character varying
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying, character varying, boolean
-
-    - linear_svm_igd_transition:
-       rettype: double precision[]
-       argument: double precision[], double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean, boolean, double precision, double precision
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean, boolean, double precision, double precision, integer
-
-    - lsvm_predict:
-       rettype: double precision
-       argument: double precision[], double precision[]
-
-    - lsvm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text
-
-    - lsvm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text, boolean
-
-    - matrix_block_trans:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_densify:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_sparsify:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_trans:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text, boolean, integer
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text, integer
-
-    - rf_clean:
-       rettype: boolean
-       argument: text
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text, integer[]
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - rf_score:
-       rettype: double precision
-       argument: text, text
-
-    - rf_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - rf_train:
-       rettype: rf_train_result
-       argument: text, text, text
-
-    - rf_train:
-       rettype: rf_train_result
-       argument: text, text, text, integer, integer, double precision, text, text, text, text, text, integer, double precision, double precision, integer
-
-    - svdmf_run:
-       rettype: text
-       argument: text, text, text, text, integer
-
-    - svdmf_run:
-       rettype: text
-       argument: text, text, text, text, integer, integer, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, boolean, double precision, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, double precision
-
-    - svm_cls_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision, double precision
-
-    - svm_data_normalization:
-       rettype: void
-       argument: text
-
-    - svm_dot:
-       rettype: double precision
-       argument: double precision[], double precision[]
-
-    - svm_dot:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_drop_model:
-       rettype: void
-       argument: text
-
-    - svm_gaussian:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_generate_cls_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_generate_nd_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_generate_reg_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_nd_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], text, double precision, double precision, double precision
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text, boolean, double precision, double precision
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_polynomial:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_predict:
-       rettype: double precision
-       argument: schema_madlib.svm_model_rec, double precision[], text, double precision
-
-    - svm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text, boolean
-
-    - svm_predict_sub:
-       rettype: double precision
-       argument: integer, integer, double precision[], double precision[], double precision[], text, double precision
-
-    - svm_reg_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision, double precision, double precision
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision, double precision
-
-    - svm_store_model:
-       rettype: void
-       argument: text, text, text
-
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    - array_collapse:
-        rettype: anyarray
-        argument: anyarray
-    - linear_svm_igd_transition:
-        rettype: double precision[]
-        argument: double precision[], double precision[], double precision, double precision[], integer, double precision, double precision, boolean, integer, double precision, boolean
-    - profile:
-        rettype: SETOF schema_madlib.profile_result
-        argument: text
-    - profile_full:
-        rettype: SETOF schema_madlib.profile_result
-        argument: text, integer
-    - profile:
-        rettype: schema_madlib.profile_result
-        argument: text
-    - profile_full:
-        rettype: schema_madlib.profile_result
-        argument: text, integer
-    - quantile:
-        rettype: double precision
-        argument: text, text, double precision
-    - quantile_big:
-        rettype: double precision
-        argument: text, text, double precision
-
-# Changes to aggregates (UDA) including removal and modification
-# Overloaded functions should be mentioned separately
-uda:
-    # ----------------- Changes from 1.3 to 1.4 -----------------
-    - __lda_count_topic_agg:
-        rettype: integer[]
-        argument: integer[], integer[], integer[], integer, integer
-    - __lda_perplexity_agg:
-        rettype: double precision
-        argument: integer[], integer[], integer[], integer[], double precision, double precision, integer, integer
-    - marginal_logregr:
-        rettype: schema_madlib.marginal_logregr_result
-        argument: boolean, double precision[], double precision[]
-    - marginal_mlogregr:
-        rettype: schema_madlib.marginal_mlogregr_result
-        argument: integer, integer, integer, double precision[], double precision[]
-    # ----------------- Changes from 1.4 to 1.4.1 ---------------
-    # ----------------- Changes from 1.4.1 to 1.5 ---------------
-    - lsvm_sgd_agg:
-        rettype: schema_madlib.lsvm_sgd_model_rec
-        argument: double precision[], double precision, double precision, double precision
-    - svm_cls_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision
-    - svm_nd_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], text, double precision, double precision
-    - svm_reg_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision, double precision
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    - coxph_step:
-        rettype: double precision[]
-        argument: double precision[], double precision, boolean, double precision[]
-    - coxph_strata_step_inner:
-        rettype: double precision[]
-        argument: double precision[], double precision, boolean, double precision[]
-    - coxph_strata_step_outer:
-        rettype: double precision[]
-        argument: double precision[]
-    # return type change
-    - linregr:
-        rettype: schema_madlib.linregr_result
-        argument: double precision, double precision[]
-    # initcond change
-    - __mlogregr_irls_step:
-        rettype: double precision[]
-        argument: integer, integer, integer, double precision[], double precision[]
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    - lmf_igd_step:
-        rettype: double precision[]
-        argument: smallint, smallint, double precision, double precision[], smallint, smallint, smallint, double precision, double precision
-    - __clustered_err_lin_step:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: double precision, double precision[], double precision[]
-    - __clustered_err_log_step:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: boolean, double precision[], double precision[]
-    - __clustered_err_mlog_step:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: integer, double precision[], double precision[], integer, integer
-    - dense_residual_norm:
-        rettype: schema_madlib.residual_norm_result
-        argument: double precision[], double precision, double precision[]
-    - heteroskedasticity_test_linregr:
-        rettype: schema_madlib.heteroskedasticity_test_result
-        argument: double precision, double precision[], double precision[]
-    - linregr:
-        rettype: schema_madlib.linregr_result
-        argument: double precision, double precision[]
-    - robust_linregr:
-        rettype: schema_madlib.robust_linregr_result
-        argument: double precision, double precision[], double precision[]
-    - weighted_sample:
-        rettype: double precision[]
-        argument: double precision[], double precision
-    - weighted_sample:
-        rettype: bigint
-        argument: bigint, double precision
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    - discrete_distribution_agg:
-        rettype: double precision[]
-        argument: integer, double precision, integer
-    - vectorized_distribution_agg:
-        rettype: double precision[]
-        argument: integer[], integer[]
-    # ----------------- Changes from 1.8 to 1.9 ----------
-
-    - __array_indexed_agg:
-        rettype: double precision[]
-        argument: double precision, bigint, bigint
-
-    - __best_scv_aggr:
-        rettype: double precision[]
-        argument: double precision[], integer, double precision
-
-    - __bigint_array_sum:
-        rettype: bigint[]
-        argument: bigint[]
-
-    - __display_tree_aggr:
-        rettype: text
-        argument: integer, boolean, text, text, double precision, double precision, text, integer
-
-    - __dt_acc_count_aggr:
-        rettype: bigint[]
-        argument: integer, bigint, integer
-
-    - __rep_aggr_class_count:
-        rettype: bigint[]
-        argument: integer, integer, integer
-
-    - __scv_aggr:
-        rettype: double precision[]
-        argument: integer, boolean, integer, double precision[], double precision[], bigint
-
-    - linear_svm_igd_step:
-        rettype: double precision[]
-        argument: double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - linear_svm_igd_step_serial:
-        rettype: double precision[]
-        argument: double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - svm_cls_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision, double precision
-
-    - svm_nd_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], text, double precision, double precision, double precision
-
-    - svm_reg_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision, double precision, double precision
-
-    - __svm_random_ind2:
-        rettype: double precision[]
-        argument: integer
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    - array_agg:
-        rettype: anyarray
-        argument: anyelement
-    - linear_svm_igd_step:
-       rettype: double precision[]
-       argument: double precision[], double precision, double precision[], integer, double precision, double precision, boolean, integer, double precision, boolean
-
-# Casts (UDC) updated/removed
-udc:
-    # ----------------- Changes from 1.3 to 1.4 -----------------
-    # ----------------- Changes from 1.4 to 1.4.1 ---------------
-    # ----------------- Changes from 1.4.1 to 1.5 ---------------
-    bool2text:
-        sourcetype: boolean
-        targettype: text
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-
-# Operators (UDO) removed/updated
-udo:
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    - '<':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '<=':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '<>':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '==':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '>=':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '>':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-
-# Operator Classes (UDOC) removed/updated
-udoc:
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    # removed
-    - svec_l2_ops:
-        index: btree
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/90f4dc15/src/madpack/changelist_1.4.1_1.9.1.yaml
----------------------------------------------------------------------
diff --git a/src/madpack/changelist_1.4.1_1.9.1.yaml b/src/madpack/changelist_1.4.1_1.9.1.yaml
deleted file mode 100644
index d892edc..0000000
--- a/src/madpack/changelist_1.4.1_1.9.1.yaml
+++ /dev/null
@@ -1,1216 +0,0 @@
-# Changelist for MADlib version 1.4.1 to 1.7
-
-# This file contains all changes that were introduced in a new version of
-# MADlib. This changelist is used by the upgrade script to detect what objects
-# should be upgraded (while retaining all other objects from the previous version)
-
-# New modules (actually .sql_in files) added in upgrade version
-# For these files the sql_in code is retained as is with the functions in the
-# file installed on the upgrade version. All other files (that don't have
-# updates), are cleaned up to remove object replacements
-new module:
-    # ----------------- Changes from 1.4.1 to 1.5 ---------------
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    table_to_pmml:
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    glm:
-    multiresponseglm:
-    ordinal:
-    decision_tree:
-    random_forest:
-    distribution:
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    text_utilities:
-
-# Changes in the types (UDT) including removal and modification
-udt:
-
-    # ----------------- Changes from 1.4.1 to 1.5 ---------------
-    __utils_scaled_data:
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    __logregr_result:
-    coxph_result:
-    linregr_result:
-    mlogregr_result:
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    bytea8:
-    # ----------------- Changes from 1.8 to 1.9 ----------
-    __enc_tbl_result:
-    __gen_acc_time:
-    __rep_type:
-    __train_result:
-    c45_classify_result:
-    c45_train_result:
-    correlation_result:
-    lsvm_sgd_model_rec:
-    lsvm_sgd_result:
-    rf_classify_result:
-    rf_train_result:
-    svm_cls_result:
-    svm_model_pr:
-    svm_model_rec:
-    svm_nd_result:
-    svm_reg_result:
-    svm_support_vector:
-    _prune_result_type:
-    _tree_result_type:
-    linear_svm_result:
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    profile_result:
-
-# List of the UDF changes that affect the user externally.  This includes change
-# in function name, change in argument order or argument types, and removal of
-# the function. In each case, the original function is as good as removed and a
-# new function is created. In such cases, we should abort the upgrade if there
-# are user views dependent on this function, since the original function will
-# not be present in the upgraded version.
-udf:
-    # ----------------- Changes from 1.4.1 to 1.5 ---------------
-    - __cmsketch_final:
-        rettype: bytea
-        argument: bytea
-    - __delete_traininginfo:
-        rettype: void
-        argument: text
-    - __get_encode_table_name:
-        rettype: text
-        argument: text
-    - __get_metatable_name:
-        rettype: text
-        argument: text
-    - __get_routine_id:
-        rettype: integer
-        argument: text
-    - __get_routine_name:
-        rettype: text
-        argument: text
-    - __get_tree_table_name:
-        rettype: text
-        argument: text
-    - __insert_into_traininginfo:
-        rettype: void
-        argument: text, text, text, text, text, text, text, text, double precision, integer, integer
-    - __treemodel_clean:
-        rettype: boolean
-        argument: text
-    - compute_lincrf:
-        rettype: integer
-        argument: character varying, character varying, character varying, character varying, character varying, integer, integer
-    - create_nb_classify_fn:
-        rettype: void
-        argument: character varying, character varying, integer, character varying
-    - create_nb_classify_fn:
-        rettype: void
-        argument: character varying, character varying, character varying, integer, character varying
-    - create_table_as:
-        rettype: void
-        argument: character varying, character varying
-    - create_table_as:
-        rettype: void
-        argument: character varying, character varying, character varying
-    - create_temp_table_as:
-        rettype: void
-        argument: character varying, character varying, character varying
-    - create_temp_table_as:
-        rettype: void
-        argument: character varying, character varying
-    - create_temporary_table_as:
-        rettype: void
-        argument: character varying, character varying, character varying
-    - create_temporary_table_as:
-        rettype: void
-        argument: character varying, character varying
-    - crf_train_fgen:
-        rettype: void
-        argument: text, text, text, text, text
-    - insert_into:
-        rettype: void
-        argument: character varying, character varying
-    - internal_create_table_as:
-        rettype: void
-        argument: boolean, character varying, character varying, character varying
-    - internal_execute_using_kmeans_args:
-        rettype: void
-        argument: character varying, double precision[], regproc, integer, double precision
-    - internal_execute_using_kmeanspp_seeding_args:
-        rettype: void
-        argument: character varying, integer, regproc, double precision[]
-    - internal_execute_using_silhouette_args:
-        rettype: double precision
-        argument: character varying, double precision[], regproc
-    - lincrf:
-        rettype: integer
-        argument: character varying, character varying, character varying, character varying, character varying, integer, character varying, character varying, integer
-    - lincrf:
-        rettype: integer
-        argument: character varying, character varying, character varying, character varying, character varying, integer, character varying, character varying
-    - lsvm_predict:
-        rettype: double precision
-        argument: text, double precision[]
-    - lsvm_predict_combo:
-        rettype: SETOF schema_madlib.svm_model_pr
-        argument: text, double precision[]
-    - lsvm_sgd_update:
-        rettype: schema_madlib.lsvm_sgd_model_rec
-        argument: schema_madlib.lsvm_sgd_model_rec, double precision[], double precision, double precision, double precision
-    - svm_cls_update:
-        rettype: schema_madlib.svm_model_rec
-        argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision
-    - svm_nd_update:
-        rettype: schema_madlib.svm_model_rec
-        argument: schema_madlib.svm_model_rec, double precision[], text, double precision, double precision
-    - svm_predict:
-        rettype: double precision
-        argument: schema_madlib.svm_model_rec, double precision[], text
-    - svm_predict:
-        rettype: double precision
-        argument: text, double precision[]
-    - svm_predict_combo:
-        rettype: SETOF schema_madlib.svm_model_pr
-        argument: text, double precision[]
-    - svm_predict_sub:
-        rettype: double precision
-        argument: integer, integer, double precision[], double precision[], double precision[], text
-    - svm_reg_update:
-        rettype: schema_madlib.svm_model_rec
-        argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision, double precision
-    - utils_normalize_data:
-        rettype: schema_madlib.__utils_scaled_data
-        argument: double precision[], double precision[], double precision[]
-    - vcrf_top1_label:
-        rettype: integer[]
-        argument: integer[], integer[], integer
-    - vcrf_top1_view:
-        rettype: text
-        argument: text, text, text, text
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    # Removed functions
-    - array_contains_null:
-        rettype: boolean
-        argument: double precision[]
-    - array_sqrt:
-        rettype: anyarray
-        argument: anyarray
-    - coxph_step_final:
-        rettype: double precision[]
-        argument: double precision[]
-    - coxph_step_strata_final:
-        rettype: double precision[]
-        argument: double precision[]
-    - coxph_step_transition:
-        rettype: double precision[]
-        argument: double precision[], double precision[], double precision, boolean, double precision[]
-    - internal_coxph_result:
-        rettype: schema_madlib.coxph_result
-        argument: double precision[]
-    - internal_coxph_step_distance:
-        rettype: double precision
-        argument: double precision[], double precision[]
-    - normalize:
-        rettype: double precision[]
-        argument: double precision[]
-    # Changed functions (return type)
-    # These functions can be recreated correctly even if we don't add them here.
-    # But the view dependency checker needs the information.
-    - __internal_mlogregr_irls_result:
-        rettype: schema_madlib.mlogregr_result
-        argument: double precision[]
-    - __logregr_cg_result:
-        rettype: schema_madlib.__logregr_result
-        argument: double precision[]
-    - __logregr_igd_result:
-        rettype: schema_madlib.__logregr_result
-        argument: double precision[]
-    - __logregr_irls_result:
-        rettype: schema_madlib.__logregr_result
-        argument: double precision[]
-    - linregr_final:
-        rettype: schema_madlib.linregr_result
-        argument: schema_madlib.bytea8
-    - mlogregr:
-        rettype: schema_madlib.mlogregr_result
-        argument: character varying, character varying, character varying, integer, character varying, double precision, integer
-    - mlogregr:
-        rettype: schema_madlib.mlogregr_result
-        argument: character varying, character varying, character varying, integer, character varying
-    - mlogregr:
-        rettype: schema_madlib.mlogregr_result
-        argument: character varying, character varying, character varying, integer
-    - mlogregr:
-        rettype: schema_madlib.mlogregr_result
-        argument: character varying, character varying, character varying
-    # name of rettype changed
-    - compute_coxph_result:
-        rettype: schema_madlib.coxph_new_result
-        argument: double precision[], double precision, double precision[], integer, double precision[]
-    # argument changed
-    - coxph_train:
-        rettype: void
-        argument: text
-    - coxph_train:
-        rettype: void
-        argument: text, text, text, text
-    - coxph_train:
-        rettype: void
-        argument: text, text, text, text, text
-    - coxph_train:
-        rettype: void
-        argument: text, text, text, text, text, text
-    - coxph_train:
-        rettype: void
-        argument: text, text, text, text, text, text, text
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    - cross_validation_general:   # change in name of argument "fold_num"
-        rettype: void
-        argument: character varying, character varying[], character varying[], character varying, character varying[], character varying, character varying[], character varying[], character varying, character varying[], character varying[], character varying, character varying, boolean, character varying, character varying[], integer
-    - lmf_igd_transition:
-        rettype: double precision[]
-        argument: double precision[], smallint, smallint, double precision, double precision[], smallint, smallint, smallint, double precision, double precision
-    # depending on bytea8
-    # return type is bytea8
-    - __clustered_err_lin_merge:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - __clustered_err_lin_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[], double precision[]
-    - __clustered_err_log_merge:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - __clustered_err_log_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, boolean, double precision[], double precision[]
-    - __clustered_err_mlog_merge:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - __clustered_err_mlog_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, integer, double precision[], double precision[], integer, integer
-    - dense_residual_norm_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - dense_residual_norm_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision[], double precision, double precision[]
-    - hetero_linregr_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - hetero_linregr_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[], double precision[]
-    - linregr_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - linregr_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[]
-    - robust_linregr_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - robust_linregr_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[], double precision[]
-    - weighted_sample_merge_int64:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - weighted_sample_merge_vector:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - weighted_sample_transition_int64:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, bigint, double precision
-    - weighted_sample_transition_vector:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision[], double precision
-    # argument type bytea8
-    - __clustered_err_lin_final:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: schema_madlib.bytea8
-    - __clustered_err_log_final:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: schema_madlib.bytea8
-    - __clustered_err_mlog_final:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: schema_madlib.bytea8
-    - dense_residual_norm_final:
-        rettype: schema_madlib.residual_norm_result
-        argument: schema_madlib.bytea8
-    - hetero_linregr_final:
-        rettype: schema_madlib.heteroskedasticity_test_result
-        argument: schema_madlib.bytea8
-    - linregr_final:
-        rettype: schema_madlib.linregr_result
-        argument: schema_madlib.bytea8
-    - robust_linregr_final:
-        rettype: schema_madlib.robust_linregr_result
-        argument: schema_madlib.bytea8
-    - weighted_sample_final_int64:
-        rettype: bigint
-        argument: schema_madlib.bytea8
-    - weighted_sample_final_vector:
-        rettype: double precision[]
-        argument: schema_madlib.bytea8
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    - __filter_input_relation:
-        rettype: character varying
-        argument: character varying, character varying
-    - __lda_util_unnest:
-        rettype: SETOF bigint[]
-        argument: bigint[]
-    - matrix_block_mult:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text
-    - matrix_block_square:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - matrix_block_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - matrix_block_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, boolean
-    - matrix_blockize:
-        rettype: schema_madlib.matrix_result
-        argument: text, integer, integer, text
-    - matrix_densify:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text, text, text, boolean
-    - matrix_mult:
-        rettype: schema_madlib.matrix_result
-        argument: text, boolean, text, boolean, text
-    - matrix_mult:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text, text, boolean, text, text, text, text, boolean, text
-    - matrix_norm:
-        rettype: double precision
-        argument: text
-    - matrix_scale_and_add:
-        rettype: void
-        argument: text, text, double precision, text
-    - matrix_sparsify:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, boolean
-    - matrix_square:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - matrix_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, boolean
-    - matrix_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text, text, text, boolean
-    - matrix_unblockize:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    # ----------------- Changes from 1.8 to 1.9 ----------
-
-    - _dt_apply:
-       rettype: schema_madlib._tree_result_type
-       argument: schema_madlib.bytea8,schema_madlib.bytea8,schema_madlib.bytea8,smallint,smallint,smallint,boolean,integer
-
-    - internal_linear_svm_igd_result:
-       rettype: schema_madlib.linear_svm_result
-       argument: double precision[]
-
-    - _prune_and_cplist:
-       rettype: schema_madlib._prune_result_type
-       argument: schema_madlib.bytea8,double precision,boolean
-
-    - __array_elem_in:
-       rettype: boolean[]
-       argument: anyarray, anyarray
-
-    - __array_indexed_agg_ffunc:
-       rettype: double precision[]
-       argument: double precision[]
-
-    - __array_indexed_agg_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __array_indexed_agg_sfunc:
-       rettype: double precision[]
-       argument: double precision[], double precision, bigint, bigint
-
-    - __array_search:
-       rettype: boolean
-       argument: anyelement, anyarray
-
-    - __array_sort:
-       rettype: anyarray
-       argument: anyarray
-
-    - __assert:
-       rettype: void
-       argument: boolean, text
-
-    - __assert_table:
-       rettype: void
-       argument: text, boolean
-
-    - __best_scv_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __best_scv_sfunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[], integer, double precision
-
-    - __bigint_array_add:
-       rettype: bigint[]
-       argument: bigint[], bigint[]
-
-    - __breakup_table:
-       rettype: void
-       argument: text, text, text, text, text, text[], boolean[], integer, integer
-
-    - __check_dt_common_params:
-       rettype: void
-       argument: text, text, text, text, text, text, text, text, integer, double precision, double precision, integer, text
-
-    - __check_training_table:
-       rettype: void
-       argument: text, text[], text[], text, text, integer
-
-    - __column_exists:
-       rettype: boolean
-       argument: text, text
-
-    - __columns_in_table:
-       rettype: boolean
-       argument: text[], text
-
-    - __create_metatable:
-       rettype: void
-       argument: text
-
-    - __create_tree_tables:
-       rettype: void
-       argument: text
-
-    - __csvstr_to_array:
-       rettype: text[]
-       argument: text
-
-    - __display_node_sfunc:
-       rettype: text
-       argument: text, integer, boolean, text, text, double precision, double precision, text, integer
-
-    - __display_tree_no_ordered_aggr:
-       rettype: text
-       argument: text, integer, integer, integer, boolean, double precision, text, integer, integer
-
-    - __distinct_feature_value:
-       rettype: integer
-       argument: text, integer
-
-    - __drop_metatable:
-       rettype: void
-       argument: text
-
-    - __dt_acc_count_sfunc:
-       rettype: bigint[]
-       argument: bigint[], integer, bigint, integer
-
-    - __dt_get_node_split_fids:
-       rettype: integer[]
-       argument: integer, integer, integer, integer[]
-
-    - __ebp_calc_errors:
-       rettype: double precision
-       argument: double precision, double precision, double precision
-
-    - __ebp_prune_tree:
-       rettype: void
-       argument: text
-
-    - __encode_and_train:
-       rettype: record
-       argument: text, text, integer, integer, text, text, text, text, text, text, text, double precision, text, integer, double precision, boolean, double precision, double precision, text, integer
-
-    - __encode_columns:
-       rettype: void
-       argument: text, text, integer, integer
-
-    - __encode_table:
-       rettype: void
-       argument: text, text, text, integer, integer
-
-    - __encode_table:
-       rettype: void
-       argument: text, text, text[], text, text[], text, text, integer, integer
-
-    - __find_best_split:
-       rettype: void
-       argument: text, double precision, text, integer, integer, text, integer, integer
-
-    - __format:
-       rettype: text
-       argument: text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text[]
-
-    - __gen_acc:
-       rettype: __gen_acc_time
-       argument: text, text, text, text, text, integer, integer, boolean, integer
-
-    - __gen_enc_meta_names:
-       rettype: text[]
-       argument: text, text
-
-    - __gen_horizontal_encoded_table:
-       rettype: void
-       argument: text, text, integer, integer
-
-    - __gen_vertical_encoded_table:
-       rettype: void
-       argument: text, text, text, boolean, integer
-
-    - __generate_final_tree:
-       rettype: void
-       argument: text
-
-    - __get_class_column_name:
-       rettype: text
-       argument: text
-
-    - __get_class_value:
-       rettype: text
-       argument: integer, text
-
-    - __get_classtable_name:
-       rettype: text
-       argument: text
-
-    - __get_column_value:
-       rettype: text
-       argument: integer, integer, character, text
-
-    - __get_feature_name:
-       rettype: text
-       argument: integer, text
-
-    - __get_feature_value:
-       rettype: text
-       argument: integer, integer, text
-
-    - __get_features_of_nodes:
-       rettype: text
-       argument: text, text, integer, integer, integer
-
-    - __get_id_column_name:
-       rettype: text
-       argument: text
-
-    - __get_schema_name:
-       rettype: text
-       argument: text
-
-    - __get_table_name:
-       rettype: text
-       argument: text
-
-    - __insert_into_metatable:
-       rettype: void
-       argument: text, integer, text, character, boolean, text, integer
-
-    - __is_valid_enc_table:
-       rettype: boolean
-       argument: text
-
-    - __num_of_class:
-       rettype: integer
-       argument: text
-
-    - __num_of_columns:
-       rettype: integer
-       argument: text
-
-    - __num_of_feature:
-       rettype: integer
-       argument: text
-
-    - __regclass_to_text:
-       rettype: text
-       argument: regclass
-
-    - __rename_table:
-       rettype: void
-       argument: text, text
-
-    - __rep_aggr_class_count_ffunc:
-       rettype: bigint[]
-       argument: bigint[]
-
-    - __rep_aggr_class_count_sfunc:
-       rettype: bigint[]
-       argument: bigint[], integer, integer, integer
-
-    - __rep_prune_tree:
-       rettype: void
-       argument: text, text, integer
-
-    - __sample_with_replacement:
-       rettype: void
-       argument: integer, bigint, text, text
-
-    - __sample_within_range:
-       rettype: SETOF bigint
-       argument: bigint, bigint, bigint
-
-    - __scv_aggr_ffunc:
-       rettype: double precision[]
-       argument: double precision[]
-
-    - __scv_aggr_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __scv_aggr_sfunc:
-       rettype: double precision[]
-       argument: double precision[], integer, boolean, integer, double precision[], double precision[], bigint
-
-    - __strip_schema_name:
-       rettype: text
-       argument: text
-
-    - __svm_random_ind2:
-       rettype: double precision[]
-       argument: integer
-
-    - __svm_random_ind:
-       rettype: double precision[]
-       argument: integer
-
-    - __svm_target_cl_func:
-       rettype: double precision
-       argument: double precision[]
-
-    - __svm_target_reg_func:
-       rettype: double precision
-       argument: double precision[]
-
-    - __table_exists:
-       rettype: boolean
-       argument: text
-
-    - __train_tree:
-       rettype: __train_result
-       argument: text, integer, integer, text, text, text, text, text, text, double precision, integer, double precision, double precision, double precision, boolean, integer, integer
-
-    - __treemodel_classify_internal:
-       rettype: text[]
-       argument: text, text, integer
-
-    - __treemodel_classify_internal_serial:
-       rettype: text[]
-       argument: text, text, integer
-
-    - __treemodel_display_no_ordered_aggr:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - __treemodel_display_with_ordered_aggr:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - __treemodel_get_vote_result:
-       rettype: void
-       argument: text, text
-
-    - __treemodel_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - __validate_input_table:
-       rettype: void
-       argument: text, text[], text, text
-
-    - __validate_metatable:
-       rettype: void
-       argument: text
-
-    - c45_classify:
-       rettype: c45_classify_result
-       argument: text, text, text
-
-    - c45_classify:
-       rettype: c45_classify_result
-       argument: text, text, text, integer
-
-    - c45_clean:
-       rettype: boolean
-       argument: text
-
-    - c45_display:
-       rettype: SETOF text
-       argument: text
-
-    - c45_display:
-       rettype: SETOF text
-       argument: text, integer
-
-    - c45_genrule:
-       rettype: SETOF text
-       argument: text
-
-    - c45_genrule:
-       rettype: SETOF text
-       argument: text, integer
-
-    - c45_score:
-       rettype: double precision
-       argument: text, text
-
-    - c45_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text, text, text, text, text, text, double precision, text
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text, text, text, text, text, text, double precision, text, integer, double precision, double precision, integer
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying, character varying
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying, character varying, boolean
-
-    - linear_svm_igd_transition:
-       rettype: double precision[]
-       argument: double precision[], double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean, boolean, double precision, double precision
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean, boolean, double precision, double precision, integer
-
-    - lsvm_predict:
-       rettype: double precision
-       argument: double precision[], double precision[]
-
-    - lsvm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text
-
-    - lsvm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text, boolean
-
-    - matrix_block_trans:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_densify:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_sparsify:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_trans:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text, boolean, integer
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text, integer
-
-    - rf_clean:
-       rettype: boolean
-       argument: text
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text, integer[]
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - rf_score:
-       rettype: double precision
-       argument: text, text
-
-    - rf_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - rf_train:
-       rettype: rf_train_result
-       argument: text, text, text
-
-    - rf_train:
-       rettype: rf_train_result
-       argument: text, text, text, integer, integer, double precision, text, text, text, text, text, integer, double precision, double precision, integer
-
-    - svdmf_run:
-       rettype: text
-       argument: text, text, text, text, integer
-
-    - svdmf_run:
-       rettype: text
-       argument: text, text, text, text, integer, integer, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, boolean, double precision, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, double precision
-
-    - svm_cls_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision, double precision
-
-    - svm_data_normalization:
-       rettype: void
-       argument: text
-
-    - svm_dot:
-       rettype: double precision
-       argument: double precision[], double precision[]
-
-    - svm_dot:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_drop_model:
-       rettype: void
-       argument: text
-
-    - svm_gaussian:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_generate_cls_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_generate_nd_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_generate_reg_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_nd_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], text, double precision, double precision, double precision
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text, boolean, double precision, double precision
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_polynomial:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_predict:
-       rettype: double precision
-       argument: schema_madlib.svm_model_rec, double precision[], text, double precision
-
-    - svm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text, boolean
-
-    - svm_predict_sub:
-       rettype: double precision
-       argument: integer, integer, double precision[], double precision[], double precision[], text, double precision
-
-    - svm_reg_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision, double precision, double precision
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision, double precision
-
-    - svm_store_model:
-       rettype: void
-       argument: text, text, text
-
-
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    - array_collapse:
-        rettype: anyarray
-        argument: anyarray
-    - linear_svm_igd_transition:
-        rettype: double precision[]
-        argument: double precision[], double precision[], double precision, double precision[], integer, double precision, double precision, boolean, integer, double precision, boolean
-    - profile:
-        rettype: SETOF schema_madlib.profile_result
-        argument: text
-    - profile_full:
-        rettype: SETOF schema_madlib.profile_result
-        argument: text, integer
-    - profile:
-        rettype: schema_madlib.profile_result
-        argument: text
-    - profile_full:
-        rettype: schema_madlib.profile_result
-        argument: text, integer
-    - quantile:
-        rettype: double precision
-        argument: text, text, double precision
-    - quantile_big:
-        rettype: double precision
-        argument: text, text, double precision
-
-# Changes to aggregates (UDA) including removal and modification
-# Overloaded functions should be mentioned separately
-uda:
-    # ----------------- Changes from 1.4.1 to 1.5 ---------------
-    - lsvm_sgd_agg:
-        rettype: schema_madlib.lsvm_sgd_model_rec
-        argument: double precision[], double precision, double precision, double precision
-    - svm_cls_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision
-    - svm_nd_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], text, double precision, double precision
-    - svm_reg_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision, double precision
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    - coxph_step:
-        rettype: double precision[]
-        argument: double precision[], double precision, boolean, double precision[]
-    - coxph_strata_step_inner:
-        rettype: double precision[]
-        argument: double precision[], double precision, boolean, double precision[]
-    - coxph_strata_step_outer:
-        rettype: double precision[]
-        argument: double precision[]
-    # return type change
-    - linregr:
-        rettype: schema_madlib.linregr_result
-        argument: double precision, double precision[]
-    # initcond change
-    - __mlogregr_irls_step:
-        rettype: double precision[]
-        argument: integer, integer, integer, double precision[], double precision[]
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    - lmf_igd_step:
-        rettype: double precision[]
-        argument: smallint, smallint, double precision, double precision[], smallint, smallint, smallint, double precision, double precision
-    - __clustered_err_lin_step:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: double precision, double precision[], double precision[]
-    - __clustered_err_log_step:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: boolean, double precision[], double precision[]
-    - __clustered_err_mlog_step:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: integer, double precision[], double precision[], integer, integer
-    - dense_residual_norm:
-        rettype: schema_madlib.residual_norm_result
-        argument: double precision[], double precision, double precision[]
-    - heteroskedasticity_test_linregr:
-        rettype: schema_madlib.heteroskedasticity_test_result
-        argument: double precision, double precision[], double precision[]
-    - linregr:
-        rettype: schema_madlib.linregr_result
-        argument: double precision, double precision[]
-    - robust_linregr:
-        rettype: schema_madlib.robust_linregr_result
-        argument: double precision, double precision[], double precision[]
-    - weighted_sample:
-        rettype: double precision[]
-        argument: double precision[], double precision
-    - weighted_sample:
-        rettype: bigint
-        argument: bigint, double precision
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    - discrete_distribution_agg:
-        rettype: double precision[]
-        argument: integer, double precision, integer
-    - vectorized_distribution_agg:
-        rettype: double precision[]
-        argument: integer[], integer[]
-    # ----------------- Changes from 1.8 to 1.9 ----------
-
-    - __array_indexed_agg:
-        rettype: double precision[]
-        argument: double precision, bigint, bigint
-
-    - __best_scv_aggr:
-        rettype: double precision[]
-        argument: double precision[], integer, double precision
-
-    - __bigint_array_sum:
-        rettype: bigint[]
-        argument: bigint[]
-
-    - __display_tree_aggr:
-        rettype: text
-        argument: integer, boolean, text, text, double precision, double precision, text, integer
-
-    - __dt_acc_count_aggr:
-        rettype: bigint[]
-        argument: integer, bigint, integer
-
-    - __rep_aggr_class_count:
-        rettype: bigint[]
-        argument: integer, integer, integer
-
-    - __scv_aggr:
-        rettype: double precision[]
-        argument: integer, boolean, integer, double precision[], double precision[], bigint
-
-    - linear_svm_igd_step:
-        rettype: double precision[]
-        argument: double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - linear_svm_igd_step_serial:
-        rettype: double precision[]
-        argument: double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - svm_cls_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision, double precision
-
-    - svm_nd_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], text, double precision, double precision, double precision
-
-    - svm_reg_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision, double precision, double precision
-
-    - __svm_random_ind2:
-        rettype: double precision[]
-        argument: integer
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    - array_agg:
-        rettype: anyarray
-        argument: anyelement
-    - linear_svm_igd_step:
-       rettype: double precision[]
-       argument: double precision[], double precision, double precision[], integer, double precision, double precision, boolean, integer, double precision, boolean
-
-# Casts (UDC) updated/removed
-udc:
-    # ----------------- Changes from 1.4.1 to 1.5 ---------------
-    bool2text:
-        sourcetype: boolean
-        targettype: text
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-
-# Operators (UDO) removed/updated
-udo:
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    - '<':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '<=':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '<>':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '==':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '>=':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '>':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-
-# Operator Classes (UDOC) removed/updated
-udoc:
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    # removed
-    - svec_l2_ops:
-        index: btree
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------

[48/50] [abbrv] incubator-madlib git commit: Update the copyright year. Add the license headers for create indicators files.

Posted by ri...@apache.org.

Update the copyright year.
Add the license headers for create indicators files.

Closes #104


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/0b8415e7
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/0b8415e7
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/0b8415e7

Branch: refs/heads/latest_release
Commit: 0b8415e7eec5c9ebb83fbf22923c69a99b0056ef
Parents: b3495c5
Author: Satoshi Nagayasu <sn...@uptime.jp>
Authored: Fri Feb 17 16:06:37 2017 +0900
Committer: Orhan Kislal <ok...@pivotal.io>
Committed: Tue Feb 21 15:31:31 2017 -0800

----------------------------------------------------------------------
 NOTICE                                          |  2 +-
 pom.xml                                         |  2 --
 .../modules/utilities/create_indicators.py_in   | 19 +++++++++++++++++++
 .../modules/utilities/create_indicators.sql_in  | 20 +++++++++++++++++++-
 4 files changed, 39 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/0b8415e7/NOTICE
----------------------------------------------------------------------
diff --git a/NOTICE b/NOTICE
index f366252..27d5610 100644
--- a/NOTICE
+++ b/NOTICE
@@ -1,5 +1,5 @@
 Apache MADlib (incubating)
-Copyright 2016 The Apache Software Foundation.
+Copyright 2016-2017 The Apache Software Foundation.
 
 This product includes software developed at
 The Apache Software Foundation (http://www.apache.org/).

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/0b8415e7/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 3971820..e4eefc4 100644
--- a/pom.xml
+++ b/pom.xml
@@ -643,8 +643,6 @@
               <exclude>src/ports/postgres/modules/utilities/admin.py_in</exclude>
               <exclude>src/ports/postgres/modules/utilities/control.py_in</exclude>
               <exclude>src/ports/postgres/modules/utilities/control_composite.py_in</exclude>
-              <exclude>src/ports/postgres/modules/utilities/create_indicators.py_in</exclude>
-              <exclude>src/ports/postgres/modules/utilities/create_indicators.sql_in</exclude>
               <exclude>src/ports/postgres/modules/utilities/group_control.py_in</exclude>
               <exclude>src/ports/postgres/modules/utilities/in_mem_group_control.py_in</exclude>
               <exclude>src/ports/postgres/modules/utilities/math_utils.py_in</exclude>

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/0b8415e7/src/ports/postgres/modules/utilities/create_indicators.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/create_indicators.py_in b/src/ports/postgres/modules/utilities/create_indicators.py_in
index dbbc923..fae4fe1 100644
--- a/src/ports/postgres/modules/utilities/create_indicators.py_in
+++ b/src/ports/postgres/modules/utilities/create_indicators.py_in
@@ -1,3 +1,22 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 """
 @file create_indicators.py_in
 

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/0b8415e7/src/ports/postgres/modules/utilities/create_indicators.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/create_indicators.sql_in b/src/ports/postgres/modules/utilities/create_indicators.sql_in
index 249aa17..0054e0a 100644
--- a/src/ports/postgres/modules/utilities/create_indicators.sql_in
+++ b/src/ports/postgres/modules/utilities/create_indicators.sql_in
@@ -1,4 +1,22 @@
- /* ----------------------------------------------------------------------- *//**
+/* ----------------------------------------------------------------------- *//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
  *
  * @file create_indicators.sql_in
  *

[27/50] [abbrv] incubator-madlib git commit: Kmeans: Adds svec as an accepted column type for expr_point

Posted by ri...@apache.org.

Kmeans: Adds svec as an accepted column type for expr_point


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/153037a2
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/153037a2
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/153037a2

Branch: refs/heads/latest_release
Commit: 153037a2f6a9b2a174dc15fb16fc5da220ae481d
Parents: afb0e23
Author: Orhan Kislal <ok...@pivotal.io>
Authored: Thu Feb 2 10:09:40 2017 -0800
Committer: Orhan Kislal <ok...@pivotal.io>
Committed: Thu Feb 2 10:09:40 2017 -0800

----------------------------------------------------------------------
 src/ports/postgres/modules/kmeans/kmeans.py_in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/153037a2/src/ports/postgres/modules/kmeans/kmeans.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/kmeans/kmeans.py_in b/src/ports/postgres/modules/kmeans/kmeans.py_in
index da75d78..640ff16 100644
--- a/src/ports/postgres/modules/kmeans/kmeans.py_in
+++ b/src/ports/postgres/modules/kmeans/kmeans.py_in
@@ -51,7 +51,7 @@ def kmeans_validate_expr(schema_madlib, rel_source, expr_point, **kwargs):
     # Both formats should return a numeric array type
     if expr_type in ['smallint[]', 'integer[]', 'bigint[]', 'decimal[]',
                         'numeric[]', 'real[]', 'double precision[]',
-                        'serial[]', 'bigserial[]', 'float8[]']:
+                        'serial[]', 'bigserial[]', 'float8[]', 'svec']:
 
         # An array expression should fail this check
         if columns_exist_in_table(rel_source, [expr_point]):

[35/50] [abbrv] incubator-madlib git commit: Adds license header to the graph.tex file.

Posted by ri...@apache.org.

Adds license header to the graph.tex file.


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/b779fdf9
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/b779fdf9
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/b779fdf9

Branch: refs/heads/latest_release
Commit: b779fdf9390e0986f818576a3bc4f7e3927387e5
Parents: fcf21a3
Author: Orhan Kislal <ok...@pivotal.io>
Authored: Thu Feb 9 12:05:46 2017 -0800
Committer: Orhan Kislal <ok...@pivotal.io>
Committed: Thu Feb 9 12:05:46 2017 -0800

----------------------------------------------------------------------
 doc/design/modules/graph.tex | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/b779fdf9/doc/design/modules/graph.tex
----------------------------------------------------------------------
diff --git a/doc/design/modules/graph.tex b/doc/design/modules/graph.tex
index 223d8b5..758f407 100644
--- a/doc/design/modules/graph.tex
+++ b/doc/design/modules/graph.tex
@@ -1,6 +1,23 @@
 % When using TeXShop on the Mac, let it know the root document. The following must be one of the first 20 lines.
 % !TEX root = ../design.tex
 
+% Licensed to the Apache Software Foundation (ASF) under one
+% or more contributor license agreements.  See the NOTICE file
+% distributed with this work for additional information
+% regarding copyright ownership.  The ASF licenses this file
+% to you under the Apache License, Version 2.0 (the
+% "License"); you may not use this file except in compliance
+% with the License.  You may obtain a copy of the License at
+
+%   http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+% software distributed under the License is distributed on an
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+% KIND, either express or implied.  See the License for the
+% specific language governing permissions and limitations
+% under the License.
+
 \chapter[Graph]{Graph}
 
 \begin{moduleinfo}

[43/50] [abbrv] incubator-madlib git commit: Release v1.10:

Posted by ri...@apache.org.

Release v1.10:

- Updates the changelists and other related files for upgrade.

- Removes old changelists since upgrade is not supported from
versions prior to 1.8.

- Removes hardcoded schema name (madlib) from some of the tests.

Closes #95


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/90f4dc15
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/90f4dc15
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/90f4dc15

Branch: refs/heads/latest_release
Commit: 90f4dc1577695c6f8658009005907a49848995c7
Parents: b779fdf
Author: Orhan Kislal <ok...@pivotal.io>
Authored: Thu Feb 9 16:16:35 2017 -0800
Committer: Orhan Kislal <ok...@pivotal.io>
Committed: Thu Feb 9 16:20:21 2017 -0800

----------------------------------------------------------------------
 deploy/gppkg/CMakeLists.txt                     |    2 +-
 doc/mainpage.dox.in                             |    1 +
 src/config/Version.yml                          |    2 +-
 src/madpack/changelist.yaml                     |  120 +-
 src/madpack/changelist_1.0_1.9.1.yaml           | 1390 ------------------
 src/madpack/changelist_1.1_1.9.1.yaml           | 1385 -----------------
 src/madpack/changelist_1.2_1.9.1.yaml           | 1352 -----------------
 src/madpack/changelist_1.3_1.9.1.yaml           | 1283 ----------------
 src/madpack/changelist_1.4.1_1.9.1.yaml         | 1216 ---------------
 src/madpack/changelist_1.4_1.9.1.yaml           | 1220 ---------------
 src/madpack/changelist_1.5_1.9.1.yaml           | 1077 --------------
 src/madpack/changelist_1.6.0S_1.9.1.yaml        |  945 ------------
 src/madpack/changelist_1.6_1.9.1.yaml           |  951 ------------
 src/madpack/changelist_1.7.1_1.9.1.yaml         |  815 ----------
 src/madpack/changelist_1.7_1.9.1.yaml           |  836 -----------
 src/madpack/changelist_1.8_1.10.yaml            |  857 +++++++++++
 src/madpack/changelist_1.8_1.9.1.yaml           |  772 ----------
 src/madpack/changelist_1.9_1.10.yaml            |  156 ++
 src/madpack/diff_udf.sql                        |    2 +-
 src/madpack/diff_udt.sql                        |   28 +-
 src/madpack/madpack.py                          |    7 +
 src/madpack/upgrade_util.py                     |   53 +-
 .../test/random_forest.sql_in                   |    2 +-
 src/ports/postgres/modules/svm/test/svm.sql_in  |    4 +-
 .../utilities/test/encode_categorical.sql_in    |   12 +-
 25 files changed, 1137 insertions(+), 13351 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/90f4dc15/deploy/gppkg/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/deploy/gppkg/CMakeLists.txt b/deploy/gppkg/CMakeLists.txt
index 6b66000..268d926 100644
--- a/deploy/gppkg/CMakeLists.txt
+++ b/deploy/gppkg/CMakeLists.txt
@@ -2,7 +2,7 @@
 # Packaging for Greenplum's gppkg
 # ------------------------------------------------------------------------------
 
-set(MADLIB_GPPKG_VERSION "1.9.6")
+set(MADLIB_GPPKG_VERSION "1.9.7")
 set(MADLIB_GPPKG_RELEASE_NUMBER 1)
 set(MADLIB_GPPKG_RPM_SOURCE_DIR
     "${CMAKE_BINARY_DIR}/_CPack_Packages/Linux/RPM/${CPACK_PACKAGE_FILE_NAME}"

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/90f4dc15/doc/mainpage.dox.in
----------------------------------------------------------------------
diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in
index 3b9c472..9131c10 100644
--- a/doc/mainpage.dox.in
+++ b/doc/mainpage.dox.in
@@ -17,6 +17,7 @@ Useful links:
 <li><a href="https://mail-archives.apache.org/mod_mbox/incubator-madlib-user/">User mailing list</a></li>
 <li><a href="https://mail-archives.apache.org/mod_mbox/incubator-madlib-dev/">Dev mailing list</a></li>
 <li>User documentation for earlier releases:
+    <a href="../v1.9.1/index.html">v1.9.1</a>,
     <a href="../v1.9/index.html">v1.9</a>,
     <a href="../v1.8/index.html">v1.8</a>,
     <a href="../v1.7.1/index.html">v1.7.1</a>,

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/90f4dc15/src/config/Version.yml
----------------------------------------------------------------------
diff --git a/src/config/Version.yml b/src/config/Version.yml
index fe6b919..6176098 100644
--- a/src/config/Version.yml
+++ b/src/config/Version.yml
@@ -1 +1 @@
-version: 1.10.0-dev
+version: 1.10.0

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/90f4dc15/src/madpack/changelist.yaml
----------------------------------------------------------------------
diff --git a/src/madpack/changelist.yaml b/src/madpack/changelist.yaml
index 12cbb5b..16e4144 100644
--- a/src/madpack/changelist.yaml
+++ b/src/madpack/changelist.yaml
@@ -9,10 +9,17 @@
 # file installed on the upgrade version. All other files (that don't have
 # updates), are cleaned up to remove object replacements
 new module:
+    # ----------------- Changes from 1.9.1 to 1.10 ----------
+    sssp:
+    encode_categorical:
+    knn:
 # Changes in the types (UDT) including removal and modification
 udt:
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    profile_result:
+    # ----------------- Changes from 1.9.1 to 1.10 ----------
+    _tree_result_type:
+    _prune_result_type:
+    kmeans_result:
+    kmeans_state:
 
 # List of the UDF changes that affect the user externally. This includes change
 # in function name, return type, argument order or types, or removal of
@@ -21,51 +28,90 @@ udt:
 # are user views dependent on this function, since the original function will
 # not be present in the upgraded version.
 udf:
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    - array_collapse:
-        rettype: anyarray
-        argument: anyarray
-    - linear_svm_igd_transition:
-        rettype: double precision[]
-        argument: double precision[], double precision[], double precision, double precision[], integer, double precision, double precision, boolean, integer, double precision, boolean
-    - profile:
-        rettype: SETOF schema_madlib.profile_result
-        argument: text
-    - profile_full:
-        rettype: SETOF schema_madlib.profile_result
-        argument: text, integer
-    - profile:
-        rettype: schema_madlib.profile_result
-        argument: text
-    - profile_full:
-        rettype: schema_madlib.profile_result
-        argument: text, integer
-    - quantile:
-        rettype: double precision
-        argument: text, text, double precision
-    - quantile_big:
-        rettype: double precision
-        argument: text, text, double precision
+    # ----------------- Changes from 1.9.1 to 1.10 ----------
+    - _dt_apply:
+        rettype: schema_madlib._tree_result_type
+        argument: schema_madlib.bytea8, schema_madlib.bytea8, schema_madlib.bytea8, smallint, smallint, smallint, boolean, integer
+    - _prune_and_cplist:
+        rettype: schema_madlib._prune_result_type
+        argument: schema_madlib.bytea8, double precision, boolean
+    - kmeans:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, double precision[], character varying, character varying, integer, double precision
+    - kmeans:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, double precision[], character varying, character varying, integer
+    - kmeans:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, double precision[], character varying, character varying
+    - kmeans:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, double precision[], character varying
+    - kmeans:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, double precision[]
+    - kmeans:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, character varying, character varying, character varying, character varying, integer, double precision
+    - kmeans:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, character varying, character varying, character varying, character varying, integer
+    - kmeans:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, character varying, character varying, character varying, character varying
+    - kmeans:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, character varying, character varying, character varying
+    - kmeans:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, character varying, character varying
+    - kmeanspp:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, integer, character varying, character varying, integer, double precision, double precision
+    - kmeanspp:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, integer, character varying, character varying, integer, double precision
+    - kmeanspp:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, integer, character varying, character varying, integer
+    - kmeanspp:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, integer, character varying, character varying
+    - kmeanspp:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, integer, character varying
+    - kmeanspp:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, integer
+    - kmeans_random:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, integer, character varying, character varying, integer, double precision
+    - kmeans_random:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, integer, character varying, character varying, integer
+    - kmeans_random:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, integer, character varying, character varying
+    - kmeans_random:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, integer, character varying
+    - kmeans_random:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, integer
+    - internal_execute_using_kmeans_args:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, character varying, character varying, character varying, integer, double precision
+
 
 # Changes to aggregates (UDA) including removal and modification
 # Overloaded functions should be mentioned separately
 uda:
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    - array_agg:
-        rettype: anyarray
-        argument: anyelement
-    - linear_svm_igd_step:
-       rettype: double precision[]
-       argument: double precision[], double precision, double precision[], integer, double precision, double precision, boolean, integer, double precision, boolean
 
 # Casts (UDC) updated/removed
 udc:
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
 
 # Operators (UDO) removed/updated
 udo:
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
 
 # Operator Classes (UDOC) removed/updated
 udoc:
-    # ----------------- Changes from 1.9 to 1.9.1 ----------

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/90f4dc15/src/madpack/changelist_1.0_1.9.1.yaml
----------------------------------------------------------------------
diff --git a/src/madpack/changelist_1.0_1.9.1.yaml b/src/madpack/changelist_1.0_1.9.1.yaml
deleted file mode 100644
index d3a6483..0000000
--- a/src/madpack/changelist_1.0_1.9.1.yaml
+++ /dev/null
@@ -1,1390 +0,0 @@
-# Changelist for MADlib version 1.0 to 1.7
-
-# This file contains all changes that were introduced in a new version of
-# MADlib. This changelist is used by the upgrade script to detect what objects
-# should be upgraded (while retaining all other objects from the previous version)
-
-# New modules (actually .sql_in files) added in upgrade version
-# For these files the sql_in code is retained as is with the functions in the
-# file installed on the upgrade version. All other files (that don't have
-# updates), are cleaned up to remove object replacements
-new module:
-    # ----------------- Changes from 1.0 to 1.1 -----------------
-    svd:
-    pca:
-    pca_project:
-    sparse_linear_systems:
-    dense_linear_systems:
-    # ----------------- Changes from 1.1 to 1.2 -----------------
-    arima:
-    arima_forecast:
-    # ----------------- Changes from 1.3 to 1.4 -----------------
-    robust_variance_coxph:
-    clustered_variance_coxph:
-    # ----------------- Changes from 1.4 to 1.4.1 ---------------
-    # ----------------- Changes from 1.4.1 to 1.5 ---------------
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    table_to_pmml:
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    glm:
-    multiresponseglm:
-    ordinal:
-    decision_tree:
-    random_forest:
-    distribution:
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    text_utilities:
-
-# Changes in the types (UDT) including removal and modification
-udt:
-
-    # ----------------- Changes from 1.0 to 1.1 -----------------
-    # ----------------- Changes from 1.1 to 1.2 -----------------
-    summary_result:
-    # ----------------- Changes from 1.2 to 1.3 -----------------
-    __logregr_result:
-    linregr_result:
-    # ----------------- Changes from 1.3 to 1.4 -----------------
-    # coxph_result: not exists in 1.0
-    mlogregr_result:
-    marginal_logregr_result:
-    marginal_mlogregr_result:
-    # ----------------- Changes from 1.4 to 1.4.1 ---------------
-    # ----------------- Changes from 1.4.1 to 1.5 ---------------
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    # __logregr_result: appeared before
-    # coxph_result: not exists in 1.0
-    # linregr_result: appeared before
-    # mlogregr_result: appeared before
-    # some types missed before upgrade to v1.6
-    intermediate_cox_prop_hazards_result:
-    __utils_scaled_data:
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    bytea8:
-    # ----------------- Changes from 1.8 to 1.9 ----------
-    __enc_tbl_result:
-    __gen_acc_time:
-    __rep_type:
-    __train_result:
-    c45_classify_result:
-    c45_train_result:
-    correlation_result:
-    lsvm_sgd_model_rec:
-    lsvm_sgd_result:
-    rf_classify_result:
-    rf_train_result:
-    svm_cls_result:
-    svm_model_pr:
-    svm_model_rec:
-    svm_nd_result:
-    svm_reg_result:
-    svm_support_vector:
-    _prune_result_type:
-    _tree_result_type:
-    linear_svm_result:
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    profile_result:
-
-# List of the UDF changes that affect the user externally.  This includes change
-# in function name, change in argument order or argument types, and removal of
-# the function. In each case, the original function is as good as removed and a
-# new function is created. In such cases, we should abort the upgrade if there
-# are user views dependent on this function, since the original function will
-# not be present in the upgraded version.
-udf:
-    # ----------------- Changes from 1.0 to 1.1 -----------------
-    # Matrix operations
-    - __matrix_column_to_array_format:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - __sample_with_replacement:
-        rettype: void
-        argument: integer, integer, text, text
-
-    # ----------------- Changes from 1.1 to 1.2 -----------------
-    - summary:
-        rettype: schema_madlib.summary_result
-        argument: text, text
-    - summary:
-        rettype: schema_madlib.summary_result
-        argument: text, text, text
-    - summary:
-        rettype: schema_madlib.summary_result
-        argument: text, text, text, text
-    - summary:
-        rettype: schema_madlib.summary_result
-        argument: text, text, text, text, boolean
-    - summary:
-        rettype: schema_madlib.summary_result
-        argument: text, text, text, text, boolean, boolean
-    - summary:
-        rettype: schema_madlib.summary_result
-        argument: text, text, text, text, boolean, boolean, double precision[]
-    - summary:
-        rettype: schema_madlib.summary_result
-        argument: text, text, text, text, boolean, boolean, double precision[], integer
-    - summary:
-        rettype: schema_madlib.summary_result
-        argument: text, text, text, text, boolean, boolean, double precision[], integer, boolean
-
-    # ----------------- Changes from 1.2 to 1.3 -----------------
-    # linear regression: 'num_processed' added in 'linregr_result'
-    - linregr_final:
-        rettype: schema_madlib.linregr_result
-        argument: schema_madlib.bytea8
-    - linregr_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - linregr_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[]
-
-    # logistic regression: 'num_processed' added in '__logregr_result'
-    - __logregr_cg_result:
-        rettype: schema_madlib.__logregr_result
-        argument: double precision[]
-
-    - __logregr_irls_result:
-        rettype: schema_madlib.__logregr_result
-        argument: double precision[]
-
-    - __logregr_igd_result:
-        rettype: schema_madlib.__logregr_result
-        argument: double precision[]
-
-    # ----------------- Changes from 1.3 to 1.4 -----------------
-    - __internal_get_robust_linregr_insert_string:
-        rettype: character varying
-        argument: schema_madlib.robust_linregr_result, double precision[], text
-    - __internal_get_robust_linregr_result:
-        rettype: schema_madlib.robust_linregr_result
-        argument: character varying, character varying, character varying, double precision[]
-    - __internal_get_robust_logregr_insert_string:
-        rettype: character varying
-        argument: schema_madlib.robust_logregr_result, text
-    - __internal_get_robust_logregr_result:
-        rettype: schema_madlib.robust_logregr_result
-        argument: character varying, character varying, character varying, double precision[]
-    - __internal_get_robust_mlogregr_insert_string:
-        rettype: character varying
-        argument: schema_madlib.robust_mlogregr_result, text
-    - __lda_count_topic_prefunc:
-        rettype: integer[]
-        argument: integer[], integer[]
-    - __lda_count_topic_sfunc:
-        rettype: integer[]
-        argument: integer[], integer[], integer[], integer[], integer, integer
-    - __lda_gibbs_sample:
-        rettype: integer[]
-        argument: integer[], integer[], integer[], integer[], double precision, double precision, integer, integer, integer
-    - __lda_perplexity_ffunc:
-        rettype: double precision
-        argument: integer[]
-    - __lda_perplexity_prefunc:
-        rettype: integer[]
-        argument: integer[], integer[]
-    - __lda_perplexity_sfunc:
-        rettype: integer[]
-        argument: integer[], integer[], integer[], integer[], integer[], double precision, double precision, integer, integer
-    - __lda_util_transpose:
-        rettype: integer[]
-        argument: integer[]
-    - __lda_util_unnest:
-        rettype: SETOF integer[]
-        argument: integer[]
-    - clustered_variance_mlogregr:
-        rettype: void
-        argument: text, text, text, text, text, integer, text, integer, text, double precision
-    - clustered_variance_mlogregr:
-        rettype: void
-        argument: text, text, text, text, text, integer, text, integer, text
-    - clustered_variance_mlogregr:
-        rettype: void
-        argument: text, text, text, text, text, integer, text, integer, text, double precision, boolean
-    - robust_input_checking:
-        rettype: void
-        argument: character varying, character varying, character varying, character varying
-
-    # ----------------- Changes from 1.4 to 1.4.1 ---------------
-    # ----------------- Changes from 1.4.1 to 1.5 ---------------
-    - __cmsketch_final:
-        rettype: bytea
-        argument: bytea
-    - __delete_traininginfo:
-        rettype: void
-        argument: text
-    - __get_encode_table_name:
-        rettype: text
-        argument: text
-    - __get_metatable_name:
-        rettype: text
-        argument: text
-    - __get_routine_id:
-        rettype: integer
-        argument: text
-    - __get_routine_name:
-        rettype: text
-        argument: text
-    - __get_tree_table_name:
-        rettype: text
-        argument: text
-    - __insert_into_traininginfo:
-        rettype: void
-        argument: text, text, text, text, text, text, text, text, double precision, integer, integer
-    - __treemodel_clean:
-        rettype: boolean
-        argument: text
-    - compute_lincrf:
-        rettype: integer
-        argument: character varying, character varying, character varying, character varying, character varying, integer, integer
-    - create_nb_classify_fn:
-        rettype: void
-        argument: character varying, character varying, integer, character varying
-    - create_nb_classify_fn:
-        rettype: void
-        argument: character varying, character varying, character varying, integer, character varying
-    - create_table_as:
-        rettype: void
-        argument: character varying, character varying
-    - create_table_as:
-        rettype: void
-        argument: character varying, character varying, character varying
-    - create_temp_table_as:
-        rettype: void
-        argument: character varying, character varying, character varying
-    - create_temp_table_as:
-        rettype: void
-        argument: character varying, character varying
-    - create_temporary_table_as:
-        rettype: void
-        argument: character varying, character varying, character varying
-    - create_temporary_table_as:
-        rettype: void
-        argument: character varying, character varying
-    - crf_train_fgen:
-        rettype: void
-        argument: text, text, text, text, text
-    - insert_into:
-        rettype: void
-        argument: character varying, character varying
-    - internal_create_table_as:
-        rettype: void
-        argument: boolean, character varying, character varying, character varying
-    - internal_execute_using_kmeans_args:
-        rettype: void
-        argument: character varying, double precision[], regproc, integer, double precision
-    - internal_execute_using_kmeanspp_seeding_args:
-        rettype: void
-        argument: character varying, integer, regproc, double precision[]
-    - internal_execute_using_silhouette_args:
-        rettype: double precision
-        argument: character varying, double precision[], regproc
-    - lincrf:
-        rettype: integer
-        argument: character varying, character varying, character varying, character varying, character varying, integer, character varying, character varying, integer
-    - lincrf:
-        rettype: integer
-        argument: character varying, character varying, character varying, character varying, character varying, integer, character varying, character varying
-    - lsvm_predict:
-        rettype: double precision
-        argument: text, double precision[]
-    - lsvm_predict_combo:
-        rettype: SETOF schema_madlib.svm_model_pr
-        argument: text, double precision[]
-    - lsvm_sgd_update:
-        rettype: schema_madlib.lsvm_sgd_model_rec
-        argument: schema_madlib.lsvm_sgd_model_rec, double precision[], double precision, double precision, double precision
-    - svm_cls_update:
-        rettype: schema_madlib.svm_model_rec
-        argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision
-    - svm_nd_update:
-        rettype: schema_madlib.svm_model_rec
-        argument: schema_madlib.svm_model_rec, double precision[], text, double precision, double precision
-    - svm_predict:
-        rettype: double precision
-        argument: schema_madlib.svm_model_rec, double precision[], text
-    - svm_predict:
-        rettype: double precision
-        argument: text, double precision[]
-    - svm_predict_combo:
-        rettype: SETOF schema_madlib.svm_model_pr
-        argument: text, double precision[]
-    - svm_predict_sub:
-        rettype: double precision
-        argument: integer, integer, double precision[], double precision[], double precision[], text
-    - svm_reg_update:
-        rettype: schema_madlib.svm_model_rec
-        argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision, double precision
-    - utils_normalize_data:
-        rettype: schema_madlib.__utils_scaled_data
-        argument: double precision[], double precision[], double precision[]
-    - vcrf_top1_label:
-        rettype: integer[]
-        argument: integer[], integer[], integer
-    - vcrf_top1_view:
-        rettype: text
-        argument: text, text, text, text
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    # Removed functions
-    - array_contains_null:
-        rettype: boolean
-        argument: double precision[]
-    - array_sqrt:
-        rettype: anyarray
-        argument: anyarray
-    - coxph_step_final:
-        rettype: double precision[]
-        argument: double precision[]
-    - coxph_step_strata_final:
-        rettype: double precision[]
-        argument: double precision[]
-    - coxph_step_transition:
-        rettype: double precision[]
-        argument: double precision[], double precision[], double precision, boolean, double precision[]
-    - internal_coxph_result:
-        rettype: schema_madlib.coxph_result
-        argument: double precision[]
-    - internal_coxph_step_distance:
-        rettype: double precision
-        argument: double precision[], double precision[]
-    - normalize:
-        rettype: double precision[]
-        argument: double precision[]
-    # Changed functions (return type)
-    # These functions can be recreated correctly even if we don't add them here.
-    # But the view dependency checker needs the information.
-    - __internal_mlogregr_irls_result:
-        rettype: schema_madlib.mlogregr_result
-        argument: double precision[]
-    - __logregr_cg_result:
-        rettype: schema_madlib.__logregr_result
-        argument: double precision[]
-    - __logregr_igd_result:
-        rettype: schema_madlib.__logregr_result
-        argument: double precision[]
-    - __logregr_irls_result:
-        rettype: schema_madlib.__logregr_result
-        argument: double precision[]
-    - linregr_final:
-        rettype: schema_madlib.linregr_result
-        argument: schema_madlib.bytea8
-    - mlogregr:
-        rettype: schema_madlib.mlogregr_result
-        argument: character varying, character varying, character varying, integer, character varying, double precision, integer
-    - mlogregr:
-        rettype: schema_madlib.mlogregr_result
-        argument: character varying, character varying, character varying, integer, character varying
-    - mlogregr:
-        rettype: schema_madlib.mlogregr_result
-        argument: character varying, character varying, character varying, integer
-    - mlogregr:
-        rettype: schema_madlib.mlogregr_result
-        argument: character varying, character varying, character varying
-    # make-ups from upgrade to v1.6
-    - __internal_get_cox_prop_hazards_insert_string:
-        rettype: character varying
-        argument: schema_madlib.cox_prop_hazards_result, text
-    - __internal_get_cox_prop_hazards_result:
-        rettype: schema_madlib.cox_prop_hazards_result
-        argument: character varying, character varying, character varying, character varying
-    - __internal_get_hsk_result:
-        rettype: schema_madlib.heteroskedasticity_test_result
-        argument: character varying, character varying, character varying, double precision[]
-    - __internal_get_linreg_result:
-        rettype: schema_madlib.linregr_result
-        argument: character varying, character varying, character varying
-    - __internal_get_linregr_insert_string:
-        rettype: character varying
-        argument: schema_madlib.linregr_result, text
-    - __internal_linregr_train_hetero:
-        rettype: void
-        argument: character varying, character varying, character varying, character varying, boolean
-    - compute_cox_prop_hazards_regr:
-        rettype: integer
-        argument: character varying, character varying, character varying, character varying, integer, character varying, double precision
-    - cox_prop_hazards_step_final:
-        rettype: double precision[]
-        argument: double precision[]
-    - cox_prop_hazards_step_transition:
-        rettype: double precision[]
-        argument: double precision[], double precision[], double precision, boolean, double precision, double precision[], double precision[], double precision[]
-    - intermediate_cox_prop_hazards:
-        rettype: schema_madlib.intermediate_cox_prop_hazards_result
-        argument: double precision[], boolean, double precision[]
-    - internal_cox_prop_hazards_result:
-        rettype: schema_madlib.cox_prop_hazards_result
-        argument: double precision[]
-    - internal_cox_prop_hazards_step_distance:
-        rettype: double precision
-        argument: double precision[], double precision[]
-    - marginal_logregr_step_final:
-        rettype: schema_madlib.marginal_logregr_result
-        argument: double precision[]
-    - mlogregr_marginal_step_final:
-        rettype: schema_madlib.marginal_mlogregr_result
-        argument: double precision[]
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    - cross_validation_general:   # change in name of argument "fold_num"
-        rettype: void
-        argument: character varying, character varying[], character varying[], character varying, character varying[], character varying, character varying[], character varying[], character varying, character varying[], character varying[], character varying, character varying, boolean, character varying, character varying[], integer
-    - lmf_igd_transition:
-        rettype: double precision[]
-        argument: double precision[], smallint, smallint, double precision, double precision[], smallint, smallint, smallint, double precision, double precision
-    # depending on bytea8
-    # return type is bytea8
-    - __clustered_err_lin_merge:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - __clustered_err_lin_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[], double precision[]
-    - __clustered_err_log_merge:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - __clustered_err_log_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, boolean, double precision[], double precision[]
-    - __clustered_err_mlog_merge:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - __clustered_err_mlog_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, integer, double precision[], double precision[], integer, integer
-    - hetero_linregr_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - hetero_linregr_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[], double precision[]
-    - linregr_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - linregr_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[]
-    - robust_linregr_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - robust_linregr_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[], double precision[]
-    - weighted_sample_merge_int64:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - weighted_sample_merge_vector:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - weighted_sample_transition_int64:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, bigint, double precision
-    - weighted_sample_transition_vector:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision[], double precision
-    # argument type bytea8
-    - __clustered_err_lin_final:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: schema_madlib.bytea8
-    - __clustered_err_log_final:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: schema_madlib.bytea8
-    - __clustered_err_mlog_final:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: schema_madlib.bytea8
-    - hetero_linregr_final:
-        rettype: schema_madlib.heteroskedasticity_test_result
-        argument: schema_madlib.bytea8
-    - linregr_final:
-        rettype: schema_madlib.linregr_result
-        argument: schema_madlib.bytea8
-    - robust_linregr_final:
-        rettype: schema_madlib.robust_linregr_result
-        argument: schema_madlib.bytea8
-    - weighted_sample_final_int64:
-        rettype: bigint
-        argument: schema_madlib.bytea8
-    - weighted_sample_final_vector:
-        rettype: double precision[]
-        argument: schema_madlib.bytea8
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    - __filter_input_relation:
-        rettype: character varying
-        argument: character varying, character varying
-    - __lda_util_unnest:
-        rettype: SETOF bigint[]
-        argument: bigint[]
-    - matrix_block_mult:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text
-    - matrix_block_square:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - matrix_block_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - matrix_block_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, boolean
-    - matrix_blockize:
-        rettype: schema_madlib.matrix_result
-        argument: text, integer, integer, text
-    - matrix_densify:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text, text, text, boolean
-    - matrix_mult:
-        rettype: schema_madlib.matrix_result
-        argument: text, boolean, text, boolean, text
-    - matrix_mult:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text, text, boolean, text, text, text, text, boolean, text
-    - matrix_norm:
-        rettype: double precision
-        argument: text
-    - matrix_scale_and_add:
-        rettype: void
-        argument: text, text, double precision, text
-    - matrix_sparsify:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, boolean
-    - matrix_square:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - matrix_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, boolean
-    - matrix_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text, text, text, boolean
-    - matrix_unblockize:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-
-    # ----------------- Changes from 1.8 to 1.9 ----------
-
-    - _dt_apply:
-       rettype: schema_madlib._tree_result_type
-       argument: schema_madlib.bytea8,schema_madlib.bytea8,schema_madlib.bytea8,smallint,smallint,smallint,boolean,integer
-
-    - internal_linear_svm_igd_result:
-       rettype: schema_madlib.linear_svm_result
-       argument: double precision[]
-
-    - _prune_and_cplist:
-       rettype: schema_madlib._prune_result_type
-       argument: schema_madlib.bytea8,double precision,boolean
-
-    - __array_elem_in:
-       rettype: boolean[]
-       argument: anyarray, anyarray
-
-    - __array_indexed_agg_ffunc:
-       rettype: double precision[]
-       argument: double precision[]
-
-    - __array_indexed_agg_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __array_indexed_agg_sfunc:
-       rettype: double precision[]
-       argument: double precision[], double precision, bigint, bigint
-
-    - __array_search:
-       rettype: boolean
-       argument: anyelement, anyarray
-
-    - __array_sort:
-       rettype: anyarray
-       argument: anyarray
-
-    - __assert:
-       rettype: void
-       argument: boolean, text
-
-    - __assert_table:
-       rettype: void
-       argument: text, boolean
-
-    - __best_scv_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __best_scv_sfunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[], integer, double precision
-
-    - __bigint_array_add:
-       rettype: bigint[]
-       argument: bigint[], bigint[]
-
-    - __breakup_table:
-       rettype: void
-       argument: text, text, text, text, text, text[], boolean[], integer, integer
-
-    - __check_dt_common_params:
-       rettype: void
-       argument: text, text, text, text, text, text, text, text, integer, double precision, double precision, integer, text
-
-    - __check_training_table:
-       rettype: void
-       argument: text, text[], text[], text, text, integer
-
-    - __column_exists:
-       rettype: boolean
-       argument: text, text
-
-    - __columns_in_table:
-       rettype: boolean
-       argument: text[], text
-
-    - __create_metatable:
-       rettype: void
-       argument: text
-
-    - __create_tree_tables:
-       rettype: void
-       argument: text
-
-    - __csvstr_to_array:
-       rettype: text[]
-       argument: text
-
-    - __display_node_sfunc:
-       rettype: text
-       argument: text, integer, boolean, text, text, double precision, double precision, text, integer
-
-    - __display_tree_no_ordered_aggr:
-       rettype: text
-       argument: text, integer, integer, integer, boolean, double precision, text, integer, integer
-
-    - __distinct_feature_value:
-       rettype: integer
-       argument: text, integer
-
-    - __drop_metatable:
-       rettype: void
-       argument: text
-
-    - __dt_acc_count_sfunc:
-       rettype: bigint[]
-       argument: bigint[], integer, bigint, integer
-
-    - __dt_get_node_split_fids:
-       rettype: integer[]
-       argument: integer, integer, integer, integer[]
-
-    - __ebp_calc_errors:
-       rettype: double precision
-       argument: double precision, double precision, double precision
-
-    - __ebp_prune_tree:
-       rettype: void
-       argument: text
-
-    - __encode_and_train:
-       rettype: record
-       argument: text, text, integer, integer, text, text, text, text, text, text, text, double precision, text, integer, double precision, boolean, double precision, double precision, text, integer
-
-    - __encode_columns:
-       rettype: void
-       argument: text, text, integer, integer
-
-    - __encode_table:
-       rettype: void
-       argument: text, text, text, integer, integer
-
-    - __encode_table:
-       rettype: void
-       argument: text, text, text[], text, text[], text, text, integer, integer
-
-    - __find_best_split:
-       rettype: void
-       argument: text, double precision, text, integer, integer, text, integer, integer
-
-    - __format:
-       rettype: text
-       argument: text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text[]
-
-    - __gen_acc:
-       rettype: __gen_acc_time
-       argument: text, text, text, text, text, integer, integer, boolean, integer
-
-    - __gen_enc_meta_names:
-       rettype: text[]
-       argument: text, text
-
-    - __gen_horizontal_encoded_table:
-       rettype: void
-       argument: text, text, integer, integer
-
-    - __gen_vertical_encoded_table:
-       rettype: void
-       argument: text, text, text, boolean, integer
-
-    - __generate_final_tree:
-       rettype: void
-       argument: text
-
-    - __get_class_column_name:
-       rettype: text
-       argument: text
-
-    - __get_class_value:
-       rettype: text
-       argument: integer, text
-
-    - __get_classtable_name:
-       rettype: text
-       argument: text
-
-    - __get_column_value:
-       rettype: text
-       argument: integer, integer, character, text
-
-    - __get_feature_name:
-       rettype: text
-       argument: integer, text
-
-    - __get_feature_value:
-       rettype: text
-       argument: integer, integer, text
-
-    - __get_features_of_nodes:
-       rettype: text
-       argument: text, text, integer, integer, integer
-
-    - __get_id_column_name:
-       rettype: text
-       argument: text
-
-    - __get_schema_name:
-       rettype: text
-       argument: text
-
-    - __get_table_name:
-       rettype: text
-       argument: text
-
-    - __insert_into_metatable:
-       rettype: void
-       argument: text, integer, text, character, boolean, text, integer
-
-    - __is_valid_enc_table:
-       rettype: boolean
-       argument: text
-
-    - __num_of_class:
-       rettype: integer
-       argument: text
-
-    - __num_of_columns:
-       rettype: integer
-       argument: text
-
-    - __num_of_feature:
-       rettype: integer
-       argument: text
-
-    - __regclass_to_text:
-       rettype: text
-       argument: regclass
-
-    - __rename_table:
-       rettype: void
-       argument: text, text
-
-    - __rep_aggr_class_count_ffunc:
-       rettype: bigint[]
-       argument: bigint[]
-
-    - __rep_aggr_class_count_sfunc:
-       rettype: bigint[]
-       argument: bigint[], integer, integer, integer
-
-    - __rep_prune_tree:
-       rettype: void
-       argument: text, text, integer
-
-    - __sample_with_replacement:
-       rettype: void
-       argument: integer, bigint, text, text
-
-    - __sample_within_range:
-       rettype: SETOF bigint
-       argument: bigint, bigint, bigint
-
-    - __scv_aggr_ffunc:
-       rettype: double precision[]
-       argument: double precision[]
-
-    - __scv_aggr_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __scv_aggr_sfunc:
-       rettype: double precision[]
-       argument: double precision[], integer, boolean, integer, double precision[], double precision[], bigint
-
-    - __strip_schema_name:
-       rettype: text
-       argument: text
-
-    - __svm_random_ind2:
-       rettype: double precision[]
-       argument: integer
-
-    - __svm_random_ind:
-       rettype: double precision[]
-       argument: integer
-
-    - __svm_target_cl_func:
-       rettype: double precision
-       argument: double precision[]
-
-    - __svm_target_reg_func:
-       rettype: double precision
-       argument: double precision[]
-
-    - __table_exists:
-       rettype: boolean
-       argument: text
-
-    - __train_tree:
-       rettype: __train_result
-       argument: text, integer, integer, text, text, text, text, text, text, double precision, integer, double precision, double precision, double precision, boolean, integer, integer
-
-    - __treemodel_classify_internal:
-       rettype: text[]
-       argument: text, text, integer
-
-    - __treemodel_classify_internal_serial:
-       rettype: text[]
-       argument: text, text, integer
-
-    - __treemodel_display_no_ordered_aggr:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - __treemodel_display_with_ordered_aggr:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - __treemodel_get_vote_result:
-       rettype: void
-       argument: text, text
-
-    - __treemodel_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - __validate_input_table:
-       rettype: void
-       argument: text, text[], text, text
-
-    - __validate_metatable:
-       rettype: void
-       argument: text
-
-    - c45_classify:
-       rettype: c45_classify_result
-       argument: text, text, text
-
-    - c45_classify:
-       rettype: c45_classify_result
-       argument: text, text, text, integer
-
-    - c45_clean:
-       rettype: boolean
-       argument: text
-
-    - c45_display:
-       rettype: SETOF text
-       argument: text
-
-    - c45_display:
-       rettype: SETOF text
-       argument: text, integer
-
-    - c45_genrule:
-       rettype: SETOF text
-       argument: text
-
-    - c45_genrule:
-       rettype: SETOF text
-       argument: text, integer
-
-    - c45_score:
-       rettype: double precision
-       argument: text, text
-
-    - c45_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text, text, text, text, text, text, double precision, text
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text, text, text, text, text, text, double precision, text, integer, double precision, double precision, integer
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying, character varying
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying, character varying, boolean
-
-    - linear_svm_igd_transition:
-       rettype: double precision[]
-       argument: double precision[], double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean, boolean, double precision, double precision
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean, boolean, double precision, double precision, integer
-
-    - lsvm_predict:
-       rettype: double precision
-       argument: double precision[], double precision[]
-
-    - lsvm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text
-
-    - lsvm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text, boolean
-
-    - matrix_block_trans:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_densify:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_sparsify:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_trans:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text, boolean, integer
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text, integer
-
-    - rf_clean:
-       rettype: boolean
-       argument: text
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text, integer[]
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - rf_score:
-       rettype: double precision
-       argument: text, text
-
-    - rf_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - rf_train:
-       rettype: rf_train_result
-       argument: text, text, text
-
-    - rf_train:
-       rettype: rf_train_result
-       argument: text, text, text, integer, integer, double precision, text, text, text, text, text, integer, double precision, double precision, integer
-
-    - svdmf_run:
-       rettype: text
-       argument: text, text, text, text, integer
-
-    - svdmf_run:
-       rettype: text
-       argument: text, text, text, text, integer, integer, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, boolean, double precision, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, double precision
-
-    - svm_cls_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision, double precision
-
-    - svm_data_normalization:
-       rettype: void
-       argument: text
-
-    - svm_dot:
-       rettype: double precision
-       argument: double precision[], double precision[]
-
-    - svm_dot:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_drop_model:
-       rettype: void
-       argument: text
-
-    - svm_gaussian:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_generate_cls_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_generate_nd_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_generate_reg_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_nd_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], text, double precision, double precision, double precision
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text, boolean, double precision, double precision
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_polynomial:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_predict:
-       rettype: double precision
-       argument: schema_madlib.svm_model_rec, double precision[], text, double precision
-
-    - svm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text, boolean
-
-    - svm_predict_sub:
-       rettype: double precision
-       argument: integer, integer, double precision[], double precision[], double precision[], text, double precision
-
-    - svm_reg_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision, double precision, double precision
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision, double precision
-
-    - svm_store_model:
-       rettype: void
-       argument: text, text, text
-
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    - array_collapse:
-        rettype: anyarray
-        argument: anyarray
-    - linear_svm_igd_transition:
-        rettype: double precision[]
-        argument: double precision[], double precision[], double precision, double precision[], integer, double precision, double precision, boolean, integer, double precision, boolean
-    - profile:
-        rettype: SETOF schema_madlib.profile_result
-        argument: text
-    - profile_full:
-        rettype: SETOF schema_madlib.profile_result
-        argument: text, integer
-    - profile:
-        rettype: schema_madlib.profile_result
-        argument: text
-    - profile_full:
-        rettype: schema_madlib.profile_result
-        argument: text, integer
-    - quantile:
-        rettype: double precision
-        argument: text, text, double precision
-    - quantile_big:
-        rettype: double precision
-        argument: text, text, double precision
-
-# Changes to aggregates (UDA) including removal and modification
-# Overloaded functions should be mentioned separately
-uda:
-    # ----------------- Changes from 1.0 to 1.1 -----------------
-    # ----------------- Changes from 1.1 to 1.2 -----------------
-    # ----------------- Changes from 1.2 to 1.3 -----------------
-    - linregr:
-        rettype: schema_madlib.linregr_result
-        argument: double precision, double precision[]
-    # ----------------- Changes from 1.3 to 1.4 -----------------
-    - cox_prop_hazards_step:
-         rettype: double precision[]
-         argument: double precision[], double precision, boolean, double precision, double precision[], double precision[], double precision[]
-    - __lda_count_topic_agg:
-        rettype: integer[]
-        argument: integer[], integer[], integer[], integer, integer
-    - __lda_perplexity_agg:
-        rettype: double precision
-        argument: integer[], integer[], integer[], integer[], double precision, double precision, integer, integer
-    # ----------------- Changes from 1.4 to 1.4.1 ---------------
-    # ----------------- Changes from 1.4.1 to 1.5 ---------------
-    - lsvm_sgd_agg:
-        rettype: schema_madlib.lsvm_sgd_model_rec
-        argument: double precision[], double precision, double precision, double precision
-    - svm_cls_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision
-    - svm_nd_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], text, double precision, double precision
-    - svm_reg_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision, double precision
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    # - coxph_step: not exists in v1.0
-    # - coxph_strata_step_inner: not exists in v1.0
-    # - coxph_strata_step_outer: not exists in v1.0
-    # return type change
-    # - linregr: appeared before
-    # initcond change
-    - __mlogregr_irls_step:
-        rettype: double precision[]
-        argument: integer, integer, integer, double precision[], double precision[]
-    # make-ups from upgrade to v1.6
-    - marginal_logregr:
-        rettype: schema_madlib.marginal_logregr_result
-        argument: boolean, double precision[], double precision[]
-    - marginal_mlogregr:
-        rettype: schema_madlib.marginal_mlogregr_result
-        argument: integer, integer, integer, double precision[], double precision[]
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    - lmf_igd_step:
-        rettype: double precision[]
-        argument: smallint, smallint, double precision, double precision[], smallint, smallint, smallint, double precision, double precision
-    - __clustered_err_lin_step:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: double precision, double precision[], double precision[]
-    - __clustered_err_log_step:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: boolean, double precision[], double precision[]
-    - __clustered_err_mlog_step:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: integer, double precision[], double precision[], integer, integer
-    - heteroskedasticity_test_linregr:
-        rettype: schema_madlib.heteroskedasticity_test_result
-        argument: double precision, double precision[], double precision[]
-    - linregr:
-        rettype: schema_madlib.linregr_result
-        argument: double precision, double precision[]
-    - robust_linregr:
-        rettype: schema_madlib.robust_linregr_result
-        argument: double precision, double precision[], double precision[]
-    - weighted_sample:
-        rettype: double precision[]
-        argument: double precision[], double precision
-    - weighted_sample:
-        rettype: bigint
-        argument: bigint, double precision
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    - discrete_distribution_agg:
-        rettype: double precision[]
-        argument: integer, double precision, integer
-    - vectorized_distribution_agg:
-        rettype: double precision[]
-        argument: integer[], integer[]
-    # ----------------- Changes from 1.8 to 1.9 ----------
-
-    - __array_indexed_agg:
-        rettype: double precision[]
-        argument: double precision, bigint, bigint
-
-    - __best_scv_aggr:
-        rettype: double precision[]
-        argument: double precision[], integer, double precision
-
-    - __bigint_array_sum:
-        rettype: bigint[]
-        argument: bigint[]
-
-    - __display_tree_aggr:
-        rettype: text
-        argument: integer, boolean, text, text, double precision, double precision, text, integer
-
-    - __dt_acc_count_aggr:
-        rettype: bigint[]
-        argument: integer, bigint, integer
-
-    - __rep_aggr_class_count:
-        rettype: bigint[]
-        argument: integer, integer, integer
-
-    - __scv_aggr:
-        rettype: double precision[]
-        argument: integer, boolean, integer, double precision[], double precision[], bigint
-
-    - linear_svm_igd_step:
-        rettype: double precision[]
-        argument: double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - linear_svm_igd_step_serial:
-        rettype: double precision[]
-        argument: double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - svm_cls_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision, double precision
-
-    - svm_nd_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], text, double precision, double precision, double precision
-
-    - svm_reg_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision, double precision, double precision
-
-    - __svm_random_ind2:
-        rettype: double precision[]
-        argument: integer
-
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    - array_agg:
-        rettype: anyarray
-        argument: anyelement
-    - linear_svm_igd_step:
-       rettype: double precision[]
-       argument: double precision[], double precision, double precision[], integer, double precision, double precision, boolean, integer, double precision, boolean
-
-# Casts (UDC) updated/removed
-udc:
-    # ----------------- Changes from 1.0 to 1.1 -----------------
-    # ----------------- Changes from 1.1 to 1.2 -----------------
-    # ----------------- Changes from 1.2 to 1.3 -----------------
-    # ----------------- Changes from 1.3 to 1.4 -----------------
-    # ----------------- Changes from 1.4 to 1.4.1 ---------------
-    # ----------------- Changes from 1.4.1 to 1.5 ---------------
-    bool2text:
-        sourcetype: boolean
-        targettype: text
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-
-# Operators (UDO) removed/updated
-udo:
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    - '<':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '<=':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '<>':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '==':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '>=':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '>':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-
-# Operator Classes (UDOC) removed/updated
-udoc:
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    # removed
-    - svec_l2_ops:
-        index: btree
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------

[46/50] [abbrv] incubator-madlib git commit: Encode categorical: Add distributed_by in Postgresql w/ no-op

Posted by ri...@apache.org.

Encode categorical: Add distributed_by in Postgresql w/ no-op


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/7055dceb
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/7055dceb
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/7055dceb

Branch: refs/heads/latest_release
Commit: 7055dceb3fbde35bae602ac80d4b70486f015748
Parents: ea17530
Author: Rahul Iyer <ri...@apache.org>
Authored: Tue Feb 14 14:36:42 2017 -0800
Committer: Rahul Iyer <ri...@apache.org>
Committed: Tue Feb 14 14:36:42 2017 -0800

----------------------------------------------------------------------
 .../modules/utilities/encode_categorical.py_in  |  2 +-
 .../modules/utilities/encode_categorical.sql_in | 23 ++++++++------------
 2 files changed, 10 insertions(+), 15 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/7055dceb/src/ports/postgres/modules/utilities/encode_categorical.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/encode_categorical.py_in b/src/ports/postgres/modules/utilities/encode_categorical.py_in
index 54b4add..81bc8ed 100644
--- a/src/ports/postgres/modules/utilities/encode_categorical.py_in
+++ b/src/ports/postgres/modules/utilities/encode_categorical.py_in
@@ -77,7 +77,7 @@ class CategoricalEncoder(object):
 
         self.output_type = 'column' if not output_type else output_type.lower()
         self.output_dictionary = output_dictionary
-        self.distributed_by = distributed_by
+        self.distributed_by = distributed_by if not is_platform_pg() else None
 
         self._name_others_col = "_MISC__"
         self._array_out_name = "__encoded_variables__"

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/7055dceb/src/ports/postgres/modules/utilities/encode_categorical.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/encode_categorical.sql_in b/src/ports/postgres/modules/utilities/encode_categorical.sql_in
index d36ef2b..2f22855 100644
--- a/src/ports/postgres/modules/utilities/encode_categorical.sql_in
+++ b/src/ports/postgres/modules/utilities/encode_categorical.sql_in
@@ -337,7 +337,7 @@ SELECT madlib.encode_categorical_variables (
 SELECT * FROM abalone_out ORDER BY id;
 </pre>
 <pre class="result">
- id | sex_F | sex_I | sex_M | rings_7 | rings_8 | rings_9 | rings_10 | rings_11 | rings_12 | rings_14 | rings_15 | rings_16 | rings_19 | rings_20 
+ id | sex_F | sex_I | sex_M | rings_7 | rings_8 | rings_9 | rings_10 | rings_11 | rings_12 | rings_14 | rings_15 | rings_16 | rings_19 | rings_20
 ----+-------+-------+-------+---------+---------+---------+----------+----------+----------+----------+----------+----------+----------+----------
   1 |     0 |     0 |     1 |       0 |       0 |       0 |        0 |        0 |        0 |        0 |        1 |        0 |        0 |        0
   2 |     0 |     0 |     1 |       1 |       0 |       0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
@@ -379,7 +379,7 @@ SELECT madlib.encode_categorical_variables (
 SELECT * FROM abalone_out ORDER BY id;
 </pre>
 <pre class="result">
- id | sex_M | sex_F | sex__MISC__ | rings_10 | rings_7 | rings_9 | rings__MISC__ 
+ id | sex_M | sex_F | sex__MISC__ | rings_10 | rings_7 | rings_9 | rings__MISC__
 ----+-------+-------+-------------+----------+---------+---------+---------------
   1 |     1 |     0 |           0 |        0 |       0 |       0 |             1
   2 |     1 |     0 |           0 |        0 |       1 |       0 |             0
@@ -421,7 +421,7 @@ SELECT madlib.encode_categorical_variables (
 SELECT * FROM abalone_out ORDER BY id;
 </pre>
 <pre class="result">
- id | sex | rings | sex_F | sex_I | sex_M | rings_7 | rings_8 | rings_9 | rings_10 | rings_11 | rings_12 | rings_14 | rings_15 | rings_16 | rings_19 | rings_20 
+ id | sex | rings | sex_F | sex_I | sex_M | rings_7 | rings_8 | rings_9 | rings_10 | rings_11 | rings_12 | rings_14 | rings_15 | rings_16 | rings_19 | rings_20
 ----+-----+-------+-------+-------+-------+---------+---------+---------+----------+----------+----------+----------+----------+----------+----------+----------
   1 | M   |    15 |     0 |     0 |     1 |       0 |       0 |       0 |        0 |        0 |        0 |        0 |        1 |        0 |        0 |        0
   2 | M   |     7 |     0 |     0 |     1 |       1 |       0 |       0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
@@ -504,7 +504,7 @@ SELECT madlib.encode_categorical_variables (
 SELECT * FROM abalone_out ORDER BY id;
 </pre>
 <pre class="result">
- id |     __encoded_variables__     
+ id |     __encoded_variables__
 ----+-------------------------------
   1 | {0,0,1,0,0,0,0,0,0,0,1,0,0,0}
   2 | {0,0,1,1,0,0,0,0,0,0,0,0,0,0}
@@ -533,7 +533,7 @@ View the dictionary table that gives the index into the array:
 SELECT * FROM abalone_out_dictionary;
 </pre>
 <pre class="result">
-  encoded_column_name  | index | variable | value 
+  encoded_column_name  | index | variable | value
 -----------------------+-------+----------+-------
  __encoded_variables__ |     1 | sex      | F
  __encoded_variables__ |     2 | sex      | I
@@ -569,7 +569,7 @@ SELECT madlib.encode_categorical_variables (
 SELECT * FROM abalone_out ORDER BY id;
 </pre>
 <pre class="result">
- id | sex_1 | sex_2 | sex_3 | rings_1 | rings_2 | rings_3 | rings_4 | rings_5 | rings_6 | rings_7 | rings_8 | rings_9 | rings_10 | rings_11 
+ id | sex_1 | sex_2 | sex_3 | rings_1 | rings_2 | rings_3 | rings_4 | rings_5 | rings_6 | rings_7 | rings_8 | rings_9 | rings_10 | rings_11
 ----+-------+-------+-------+---------+---------+---------+---------+---------+---------+---------+---------+---------+----------+----------
   1 |     0 |     0 |     1 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       1 |       0 |        0 |        0
   2 |     0 |     0 |     1 |       1 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |        0 |        0
@@ -598,7 +598,7 @@ View the dictionary table that defines the numerical columns in the output table
 SELECT * FROM abalone_out_dictionary ORDER BY encoded_column_name;
 </pre>
 <pre class="result">
- encoded_column_name | index | variable | value 
+ encoded_column_name | index | variable | value
 ---------------------+-------+----------+-------
  "rings_1"           |     1 | rings    | 7
  "rings_10"          |    10 | rings    | 19
@@ -677,10 +677,7 @@ SELECT madlib.encode_categorical_variables (
  *
  */
 
--- We don't create the below function for PostgreSQL since it does not contain a
--- distribution policy.
-m4_changequote(<!,!>)
-m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!
+-- Create the below function for PostgreSQL but ensure that distributed_by is a no-op
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.encode_categorical_variables(
     source_table                    VARCHAR,
     output_table                    VARCHAR,
@@ -696,9 +693,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.encode_categorical_variables(
 ) RETURNS VOID AS $$
     PythonFunction(utilities, encode_categorical, encode_categorical_variables)
 $$ LANGUAGE plpythonu
-m4_ifdef(<!__HAS_FUNCTION_PROPERTIES__!>, <!MODIFIES SQL DATA!>, <!!>);
-!>)
-m4_changequote(<!`!>, <!'!>)
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.encode_categorical_variables(
     source_table                    VARCHAR,

[25/50] [abbrv] incubator-madlib git commit: New Module: k-Nearest Neighbors (k-NN)

Posted by ri...@apache.org.

New Module: k-Nearest Neighbors (k-NN)

JIRA: MADLIB-927

- Add a new module for k-NN, as an early-stage development.

This version of k-NN considers only one distance metric, which is
MADlib's squared_dist_norm2. There are multiple JIRAs opened to
address and improve this and other such limitations (JIRA IDs:
MADLIB-1059, MADLIB-1060, MADLIB-1061).

Additional authors: Orhan Kislal, Nandish Jayaram

Closes #81


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/61f3c5f0
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/61f3c5f0
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/61f3c5f0

Branch: refs/heads/latest_release
Commit: 61f3c5f038d96cff851a72f4e97b926fc5a02726
Parents: 9162271
Author: Auon Haidar Kazmi <ka...@gmail.com>
Authored: Wed Feb 1 16:54:57 2017 -0800
Committer: Nandish Jayaram <nj...@apache.org>
Committed: Wed Feb 1 16:54:57 2017 -0800

----------------------------------------------------------------------
 doc/mainpage.dox.in                            |   7 +
 src/config/Modules.yml                         |   2 +
 src/ports/postgres/modules/knn/__init__.py_in  |   0
 src/ports/postgres/modules/knn/knn.py_in       | 129 ++++++
 src/ports/postgres/modules/knn/knn.sql_in      | 449 ++++++++++++++++++++
 src/ports/postgres/modules/knn/test/knn.sql_in |  70 +++
 6 files changed, 657 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/61f3c5f0/doc/mainpage.dox.in
----------------------------------------------------------------------
diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in
index 0e846e1..85a5d8d 100644
--- a/doc/mainpage.dox.in
+++ b/doc/mainpage.dox.in
@@ -253,6 +253,13 @@ Interface and implementation are subject to change.
     @defgroup grp_cg Conjugate Gradient
     @defgroup grp_bayes Naive Bayes Classification
     @defgroup grp_sample Random Sampling
+
+    @defgroup grp_nene Nearest Neighbors
+    @ingroup grp_super
+    @{A collection of methods to create nearest neighbor based models.@}
+
+        @defgroup grp_knn k-Nearest Neighbors
+        @ingroup grp_nene
 @}
 
 @defgroup grp_deprecated Deprecated Modules

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/61f3c5f0/src/config/Modules.yml
----------------------------------------------------------------------
diff --git a/src/config/Modules.yml b/src/config/Modules.yml
index c3315b6..d5c336e 100644
--- a/src/config/Modules.yml
+++ b/src/config/Modules.yml
@@ -17,6 +17,8 @@ modules:
     - name: graph
     - name: kmeans
       depends: ['array_ops', 'svec_util', 'sample']
+    - name: knn
+      depends: ['array_ops']
     - name: lda
       depends: ['array_ops']
     - name: linalg

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/61f3c5f0/src/ports/postgres/modules/knn/__init__.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/knn/__init__.py_in b/src/ports/postgres/modules/knn/__init__.py_in
new file mode 100644
index 0000000..e69de29

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/61f3c5f0/src/ports/postgres/modules/knn/knn.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/knn/knn.py_in b/src/ports/postgres/modules/knn/knn.py_in
new file mode 100644
index 0000000..da7f9d6
--- /dev/null
+++ b/src/ports/postgres/modules/knn/knn.py_in
@@ -0,0 +1,129 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+m4_changequote(`<!', `!>')
+
+"""
+@file knn.py_in
+
+@brief knn: Driver functions
+
+@namespace knn
+
+@brief knn: Driver functions
+"""
+
+import plpy
+from utilities.validate_args import table_exists
+from utilities.validate_args import table_is_empty
+from utilities.validate_args import columns_exist_in_table
+from utilities.validate_args import columns_exist_in_table
+from utilities.validate_args import is_col_array
+from utilities.validate_args import array_col_has_no_null
+from utilities.validate_args import get_cols_and_types
+
+STATE_IN_MEM = m4_ifdef(<!__HAWQ__!>, <!True!>, <!False!>)
+HAS_FUNCTION_PROPERTIES = m4_ifdef(<!__HAS_FUNCTION_PROPERTIES__!>, <!True!>, <!False!>)
+UDF_ON_SEGMENT_NOT_ALLOWED = m4_ifdef(<!__UDF_ON_SEGMENT_NOT_ALLOWED__!>, <!True!>, <!False!>)
+# ----------------------------------------------------------------------
+
+
+def knn_validate_src(schema_madlib, **kwargs):
+    """Validate the arguments passed to knn.
+
+    Every failed check is reported through plpy.error.
+
+    Args:
+        schema_madlib: Accepted for the calling convention; not used here.
+        **kwargs: Must provide trainingSource, trainingClassColumn,
+            trainingFeatureColumn, testSource, testingIdColumn,
+            testingFeatureColumn and K.
+    """
+    trainingSource = kwargs['trainingSource']
+    testSource = kwargs['testSource']
+    # Each table must be named, must exist and must hold at least one row.
+    for role, tbl in (("training", trainingSource), ("test", testSource)):
+        if not tbl:
+            plpy.error("knn error: Invalid {0} table name!".format(role))
+        if not table_exists(tbl):
+            plpy.error("knn error: {0} table {1} does not exist!".format(
+                role.capitalize(), tbl))
+        if table_is_empty(tbl):
+            plpy.error("knn error: {0} table {1} is empty!".format(
+                role.capitalize(), tbl))
+
+    trainingClassColumn = kwargs['trainingClassColumn']
+    trainingFeatureColumn = kwargs['trainingFeatureColumn']
+    testingFeatureColumn = kwargs['testingFeatureColumn']
+    testingIdColumn = kwargs['testingIdColumn']
+    # Every referenced column must be named and present in its table.
+    column_checks = (
+        ("training", trainingSource, (trainingClassColumn, trainingFeatureColumn)),
+        ("test", testSource, (testingFeatureColumn, testingIdColumn)))
+    for role, tbl, cols in column_checks:
+        for c in cols:
+            if not c:
+                plpy.error("knn error: Invalid column name in {0} table!".format(role))
+            if not columns_exist_in_table(tbl, [c]):
+                plpy.error("knn error: Column '{0}' does not exist in {1}!".format(c, tbl))
+
+    # Feature columns must be arrays without NULL values. Two passes keep the
+    # check order: both array checks first, then both NULL checks.
+    feature_checks = (("train", trainingSource, trainingFeatureColumn),
+                      ("test", testSource, testingFeatureColumn))
+    for role, tbl, col in feature_checks:
+        if not is_col_array(tbl, col):
+            plpy.error("knn error: Feature column {0} in {1} table "
+                       "is not an array!".format(col, role))
+    for role, tbl, col in feature_checks:
+        if not array_col_has_no_null(tbl, col):
+            plpy.error("knn error: Feature column {0} in {1} table "
+                       "has some NULL values!".format(col, role))
+
+    # k must be a positive integer no larger than the training row count.
+    if kwargs['K'] is None:
+        plpy.error("knn error: 'k' must not be NULL!")
+    k = int(kwargs['K'])
+    if k <= 0:
+        plpy.error("knn error: 'k' {0} is not valid for knn!".format(k))
+    bound = plpy.execute("SELECT {k} <= count(*) AS bound FROM {tbl}".format(
+        k=k, tbl=trainingSource))[0]['bound']
+    if not bound:
+        plpy.error("knn error: 'k' {0} is greater than number of rows "
+                   "in training table!".format(k))
+
+    def _col_type(tbl, col):
+        # Type of the first (name, type) entry matching col, '' if none does.
+        return next((ct[1] for ct in get_cols_and_types(tbl) if ct[0] == col), '')
+
+    # Type names are compared case-insensitively.
+    label_type = _col_type(trainingSource, trainingClassColumn)
+    if str(label_type).lower() not in (
+            'integer', 'double precision', 'float', 'boolean'):
+        plpy.error("knn error: Data type {0} is not valid as label "
+                   "for scope of knn!".format(label_type))
+
+    id_type = _col_type(testSource, testingIdColumn)
+    if str(id_type).lower() != 'integer':
+        plpy.error("knn error: Data type {0} is not valid as Id "
+                   "in test table!".format(id_type))
+
+
+# ----------------------------------------------------------------------
+m4_changequote(<!`!>, <!'!>)

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/61f3c5f0/src/ports/postgres/modules/knn/knn.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/knn/knn.sql_in b/src/ports/postgres/modules/knn/knn.sql_in
new file mode 100644
index 0000000..7ee736b
--- /dev/null
+++ b/src/ports/postgres/modules/knn/knn.sql_in
@@ -0,0 +1,449 @@
+/* ----------------------------------------------------------------------- *//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *//* ----------------------------------------------------------------------- */
+
+
+/* ----------------------------------------------------------------------- *//**
+ *
+ * @file knn.sql_in
+ * @brief Set of functions for k-nearest neighbors.
+ * @sa For a brief introduction to k-nearest neighbors algorithm for regression and classification,
+ * see the module description \ref grp_knn.
+ *
+ *
+ *//* ----------------------------------------------------------------------- */
+
+m4_include(`SQLCommon.m4')
+
+
+/**
+@addtogroup grp_knn
+
+<div class="toc"><b>Contents</b>
+<ul>
+<li class="level1"><a href="#knn">K-Nearest Neighbors</a></li>
+<li class="level1"><a href="#usage">Usage</a></li>
+<li class="level1"><a href="#output">Output Format</a></li>
+<li class="level1"><a href="#examples">Examples</a></li>
+<li class="level1"><a href="#background">Technical Background</a></li>
+<li class="level1"><a href="#literature">Literature</a></li>
+<li class="level1"><a href="#related">Related Topics</a></li>
+</ul>
+</div>
+
+@brief Finds k nearest data points to the given data point and outputs majority vote value of output classes in case of classification and average value of target values for regression task.
+
+\warning <em> This MADlib method is still in early stage development. There may be some
+issues that will be addressed in a future version. Interface and implementation
+are subject to change. </em>
+
+@anchor knn
+
+k-Nearest Neighbors is a method for finding k closest points to a
+given data point in terms of a given metric. Its input consists of
+data points as features from testing examples. For a given k, it
+looks for k closest points in training set for each of the data
+points in test set. Algorithm generates one output per testing example.
+The output of KNN depends on the type of task:
+For Classification, the output is majority vote of the classes of
+the k nearest data points. The testing example gets assigned the
+most popular class among nearest neighbors.
+For Regression, the output is average of the values of k nearest
+neighbors of the given testing example.
+
+@anchor usage
+@par Usage
+<pre class="syntax">
+knn( point_source,
+     point_column_name,
+     label_column_name,
+     test_source,
+     test_column_name,
+     id_column_name,
+     output_table,
+     operation,
+     k
+   )
+</pre>
+
+\b Arguments
+<dl class="arglist">
+<dt>point_source</dt>
+<dd>TEXT. The name of the table containing the training data points.
+
+Training data points are expected to be stored row-wise,
+in a column of type <tt>DOUBLE PRECISION[]</tt>.
+</dd>
+
+<dt>point_column_name</dt>
+<dd>TEXT. The name of the column with training data points.</dd>
+
+<dt>label_column_name</dt>
+<dd>TEXT. The name of the column with labels/values of training data points.</dd>
+
+<dt>test_source</dt>
+<dd>TEXT. The name of the table containing the test data points.
+
+Testing data points are expected to be stored row-wise,
+in a column of type <tt>DOUBLE PRECISION[]</tt>.
+</dd>
+
+<dt>test_column_name</dt>
+<dd>TEXT. The name of the column with testing data points.</dd>
+
+<dt>id_column_name</dt>
+<dd>TEXT. Name of the column having ids of data points in test data table.</dd>
+
+<dt>output_table</dt>
+<dd>TEXT. Name of the table to store final results.</dd>
+
+<dt>operation</dt>
+<dd>TEXT. The type of task: r for regression and c for classification.</dd>
+
+<dt>k (optional)</dt>
+<dd>INTEGER. default: 1. The number of nearest neighbors to consider.</dd>
+
+</dl>
+
+
+@anchor output
+@par Output Format
+
+The output of the KNN module is a table with the following columns:
+<table class="output">
+    <tr>
+        <th>id</th>
+        <td>INTEGER. The ids of test data points.</td>
+    </tr>
+    <tr>
+        <th>test_column_name</th>
+        <td>DOUBLE PRECISION[]. The test data points.</td>
+    </tr>
+    <tr>
+        <th>prediction</th>
+        <td>INTEGER. The output of KNN- label in case of classification, average value in case of regression.</td>
+    </tr>
+</table>
+
+
+@anchor examples
+@examp
+
+-#  Prepare some training data.
+<pre class="example">
+CREATE TABLE knn_train_data (id integer, data integer[], label float);
+COPY knn_train_data (id, data, label) from stdin delimiter '|';
+1|{1,1}|1.0
+2|{2,2}|1.0
+3|{3,3}|1.0
+4|{4,4}|1.0
+5|{4,5}|1.0
+6|{20,50}|0.0
+7|{10,31}|0.0
+8|{81,13}|0.0
+9|{1,111}|0.0
+\\.
+</pre>
+
+-#  Prepare some testing data.
+<pre class="example">
+CREATE TABLE knn_test_data (id integer, data integer[]);
+COPY knn_test_data (id, data) from stdin delimiter '|';
+1|{2,1}
+2|{2,6}
+3|{15,40}
+4|{12,1}
+5|{2,90}
+6|{50,45}
+\\.
+</pre>
+
+-#  Run KNN for classification:
+<pre class="example">
+SELECT * FROM madlib.knn( 'knn_train_data',
+                               'data',
+                               'label',
+                               'knn_test_data',
+                               'data',
+                               'id',
+                               'madlib_knn_result_classification',
+                               'c',
+                               3
+                             );
+SELECT * from madlib_knn_result_classification;
+</pre>
+Result:
+<pre class="result">
+  id |   data   | prediction
+-----+----------+-----------
+   1 | {2,1}    |       1
+   2 | {2,6}    |       1
+   3 | {15,40}  |       0
+   4 | {12,1}   |       1
+   5 | {2,90}   |       0
+   6 | {50,45}  |       0
+</pre>
+
+-#  Run KNN for regression:
+<pre class="example">
+SELECT * FROM madlib.knn( 'knn_train_data',
+                               'data',
+                               'label',
+                               'knn_test_data',
+                               'data',
+                               'id',
+                               'madlib_knn_result_regression',
+                               'r',
+                               4
+                             );
+SELECT * from madlib_knn_result_regression;
+</pre>
+Result:
+<pre class="result">
+  id |   data   | prediction
+-----+----------+-----------
+   1 | {2,1}    |      1
+   2 | {2,6}    |      1
+   3 | {15,40}  |      0.5
+   4 | {12,1}   |      1
+   5 | {2,90}   |      0.25
+   6 | {50,45}  |      0.25
+</pre>
+
+
+
+@anchor background
+@par Technical Background
+
+The training data points are vectors in a multidimensional feature space,
+each with a class label. The training phase of the algorithm consists
+only of storing the feature vectors and class labels of the training points.
+
+In the classification phase, k is a user-defined constant, and an unlabeled
+vector (a test point) is classified by assigning the label which is most
+frequent among the k training samples nearest to that test point.
+In case of regression, average of the values of these k training samples
+is assigned to the test point.
+The only distance metric supported in this version is MADlib's squared_dist_norm2.
+Other distance metrics will be added in a future release of this module.
+
+
+@anchor literature
+@literature
+
+@anchor knn-lit-1
+[1] Wikipedia, k-nearest neighbors algorithm,
+    https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm
+
+@anchor knn-lit-2
+[2] N. S. Altman: An Introduction to Kernel and Nearest-Neighbor Nonparametric Regression
+    http://www.stat.washington.edu/courses/stat527/s13/readings/Altman_AmStat_1992.pdf
+
+@anchor knn-lit-3
+[3] Gongde Guo, Hui Wang, David Bell, Yaxin Bi, Kieran Greer: KNN Model-Based Approach in Classification,
+    https://ai2-s2-pdfs.s3.amazonaws.com/a7e2/814ec5db800d2f8c4313fd436e9cf8273821.pdf
+
+
+@anchor related
+@par Related Topics
+
+File knn.sql_in documenting the knn SQL functions
+
+@internal
+@sa namespace knn (documenting the implementation in Python)
+@endinternal
+*/
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__knn_validate_src(  -- argument checks for knn; the body is knn_validate_src in module knn
+"trainingSource" VARCHAR,         -- training table name
+"trainingClassColumn" VARCHAR,    -- label column of the training table
+"trainingFeatureColumn" VARCHAR,  -- feature column of the training table
+"testSource" VARCHAR,             -- test table name
+"testingIdColumn" VARCHAR,        -- id column of the test table
+"testingFeatureColumn" VARCHAR,   -- feature column of the test table
+"K" INTEGER                       -- number of neighbors; quoted names presumably reach Python as keyword names with this exact case (TODO confirm)
+) RETURNS VOID AS $$
+    PythonFunction(knn, knn, knn_validate_src)
+$$ LANGUAGE plpythonu
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(  -- help overload: prints the usage text as a NOTICE
+    arg1 VARCHAR  -- help, usage or ? selects the usage text; any other value prints nothing
+) RETURNS VOID AS $$
+BEGIN
+    IF arg1 = 'help' OR arg1 = 'usage' OR arg1 = '?' THEN
+        -- The schema macro is spelled out below: RAISE NOTICE does no {name} substitution.
+        RAISE NOTICE '
+-----------------------------------------------------------------------
+                            USAGE
+-----------------------------------------------------------------------
+SELECT MADLIB_SCHEMA.knn(
+    point_source,       -- Training data table having training features as vector column and labels
+    point_column_name,  -- Name of column having feature vectors in training data table
+    label_column_name,  -- Name of column having actual label/value for corresponding feature vector in training data table
+    test_source,        -- Test data table having features as vector column. Id of features is mandatory
+    test_column_name,   -- Name of column having feature vectors in test data table
+    id_column_name,     -- Name of column having feature vector Ids in test data table
+    output_table,       -- Name of output table
+    operation,          -- c for classification task, r for regression task
+    k                   -- value of k. Default will go as 1
+    );
+
+-----------------------------------------------------------------------
+                            OUTPUT
+-----------------------------------------------------------------------
+The output of the KNN module is a table with the following columns:
+
+id                  The ids of test data points.
+test_column_name    The test data points.
+prediction          The output of KNN- label in case of classification, average value in case of regression.
+';
+    END IF;
+END;
+$$ LANGUAGE plpgsql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(  -- no-argument overload: prints a short description of the method
+) RETURNS VOID AS $$
+BEGIN  -- the body consists of a single RAISE NOTICE with a fixed text
+    RAISE NOTICE '
+k-Nearest Neighbors is a method for finding k closest points to a given data
+point in terms of a given metric. Its input consist of data points as features
+from testing examples. For a given k, it looks for k closest points in
+training set for each of the data points in test set. Algorithm generates one
+output per testing example. The output of KNN depends on the type of task:
+For Classification, the output is majority vote of the classes of the k
+nearest data points. The testing example gets assigned the most popular class
+among nearest neighbors. For Regression, the output is average of the values
+of k nearest neighbors of the given testing example.
+    ';
+END;
+$$ LANGUAGE plpgsql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(  -- main overload: validates inputs, ranks training rows per test row, writes output_table
+    point_source VARCHAR,       -- training table
+    point_column_name VARCHAR,  -- feature column of the training table
+    label_column_name VARCHAR,  -- label/value column of the training table
+    test_source VARCHAR,        -- test table
+    test_column_name VARCHAR,   -- feature column of the test table
+    id_column_name VARCHAR,     -- id column of the test table
+    output_table VARCHAR,       -- table created by this call; an existing table of that name is an error
+    operation VARCHAR,          -- c (classification) or r (regression)
+    k INTEGER                   -- number of nearest training rows kept per test row
+) RETURNS VARCHAR AS $$
+DECLARE
+    class_test_source REGCLASS;    -- assigned below, never read afterwards
+    class_point_source REGCLASS;   -- assigned below, never read afterwards
+    l FLOAT;                       -- not referenced in the body
+    outputTableFlag INTEGER;       -- number of existing tables named like output_table
+    id INTEGER;                    -- not referenced in the body
+    vector DOUBLE PRECISION[];     -- not referenced in the body
+    cur_pid integer;               -- not referenced in the body
+    oldClientMinMessages VARCHAR;  -- client_min_messages value saved on entry
+    returnstring VARCHAR;          -- status message returned to the caller
+    x_temp_table VARCHAR;          -- random alias for the inner distance subquery
+    y_temp_table VARCHAR;          -- random alias for the ranked subquery
+BEGIN  -- NOTE(review): table and column names are concatenated into the dynamic SQL below without quoting
+    oldClientMinMessages :=
+        (SELECT setting FROM pg_settings WHERE name = 'client_min_messages');
+    EXECUTE 'SET client_min_messages TO warning';  -- NOTE(review): the saved value is restored only on the success path further down
+    PERFORM MADLIB_SCHEMA.__knn_validate_src(point_source, label_column_name, point_column_name, test_source, id_column_name, test_column_name,k);
+    class_test_source := test_source;
+    class_point_source := point_source;
+    --checks
+    IF (k <= 0) THEN  -- NOTE(review): the validator called above also rejects k <= 0; confirm whether this branch is reachable
+        RAISE EXCEPTION 'KNN error: Number of neighbors k must be a positive integer.';
+    END IF;
+    IF (operation != 'c' AND operation != 'r') THEN
+        RAISE EXCEPTION 'KNN error: The operation has to be r for regression OR c for classification.';
+    END IF;
+    PERFORM MADLIB_SCHEMA.create_schema_pg_temp();
+    x_temp_table := 'knn_'||md5('knn_'||now()::text||random()::text)||'_temp';
+    y_temp_table := 'knn_'||md5('knn_'||now()::text||random()::text)||'_temp';
+
+    EXECUTE  -- counts tables whose name equals output_table; only table_name is compared, so any schema matches
+	$sql$
+	SELECT count(*) FROM information_schema.tables WHERE table_name = '$sql$ || output_table || $sql$'$sql$ into outputTableFlag;
+    IF (outputTableFlag != 0) THEN
+	RAISE Exception 'KNN error: Output table % already exists.', output_table;
+    END IF;
+
+    EXECUTE  -- builds pg_temp.madlib_knn_interm: per test id, the k training rows with the smallest squared_dist_norm2
+        $sql$
+	DROP TABLE IF EXISTS pg_temp.madlib_knn_interm;
+	CREATE TABLE pg_temp.madlib_knn_interm AS
+	SELECT *
+    FROM
+        (
+        SELECT row_number() over (partition by test_id order by dist) as r, $sql$ || x_temp_table || $sql$.*
+        FROM
+            (
+                SELECT test. $sql$ || id_column_name || $sql$ as test_id, MADLIB_SCHEMA.squared_dist_norm2(train.$sql$ || point_column_name || $sql$,test.$sql$ || test_column_name || $sql$) as dist, $sql$ || label_column_name || $sql$ from $sql$ || textin(regclassout(point_source)) || $sql$ AS train, $sql$ || textin(regclassout(test_source)) || $sql$ AS test
+            )$sql$ || x_temp_table || $sql$
+        )$sql$ || y_temp_table || $sql$
+    WHERE $sql$ || y_temp_table || $sql$.r <= $sql$ || k;
+	IF (operation = 'c') THEN  -- classification: prediction is the mode of the kept labels
+    	EXECUTE
+        $sql$
+	CREATE TABLE $sql$ || output_table || $sql$ AS
+    SELECT test_id as id, $sql$ || test_column_name || $sql$, MADLIB_SCHEMA.mode($sql$ || label_column_name || $sql$) as prediction from pg_temp.madlib_knn_interm join $sql$ || textin(regclassout(test_source)) || $sql$  on test_id=$sql$ || id_column_name || $sql$ group by test_id, $sql$ || test_column_name;
+        ELSE  -- regression: prediction is the avg of the kept labels
+        EXECUTE
+        $sql$
+	CREATE TABLE $sql$ || output_table || $sql$ AS
+        SELECT test_id as id, $sql$ || test_column_name || $sql$ ,avg($sql$ || label_column_name || $sql$) as prediction from pg_temp.madlib_knn_interm join $sql$ || textin(regclassout(test_source)) || $sql$  on test_id=$sql$ || id_column_name || $sql$ group by test_id, $sql$ || test_column_name || $sql$ order by test_id $sql$;
+        END IF;
+
+   EXECUTE 'SET client_min_messages TO ' || oldClientMinMessages;  -- put back the setting saved on entry
+   IF (operation = 'c') THEN
+   	returnstring := 'The classification results have been written to table';
+   ELSE
+        returnstring := 'The regression results have been written to table';
+   END IF;
+   DROP TABLE pg_temp.madlib_knn_interm;  -- the intermediate ranking table is not kept
+   RETURN returnstring;
+END;
+$$ LANGUAGE plpgsql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
+    point_source VARCHAR,
+    point_column_name VARCHAR,
+    label_column_name VARCHAR,
+    test_source VARCHAR,
+    test_column_name VARCHAR,
+    id_column_name VARCHAR,
+    output_table VARCHAR,
+    operation VARCHAR
+) RETURNS VARCHAR AS $$
+-- Convenience overload: forwards all eight arguments unchanged, in order,
+-- to the nine-argument knn with k fixed to 1 and returns its result as is.
+BEGIN
+    RETURN MADLIB_SCHEMA.knn(point_source, point_column_name, label_column_name, test_source,
+                             test_column_name, id_column_name, output_table, operation, 1);
+END;
+$$ LANGUAGE plpgsql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/61f3c5f0/src/ports/postgres/modules/knn/test/knn.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/knn/test/knn.sql_in b/src/ports/postgres/modules/knn/test/knn.sql_in
new file mode 100644
index 0000000..3c730ee
--- /dev/null
+++ b/src/ports/postgres/modules/knn/test/knn.sql_in
@@ -0,0 +1,70 @@
+/* ----------------------------------------------------------------------- *//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *//* ----------------------------------------------------------------------- */
+
+m4_include(`SQLCommon.m4')
+/* -----------------------------------------------------------------------------
+ * Test knn.
+ *
+ * FIXME: Verify results
+ * -------------------------------------------------------------------------- */
+
+drop table if exists knn_train_data;  -- training rows: ids 1-5 carry label 1.0, ids 6-9 carry label 0.0
+create table knn_train_data (
+id  integer,
+data    integer[],
+label   float);
+copy knn_train_data (id, data, label) from stdin delimiter '|';
+1|{1,1}|1.0
+2|{2,2}|1.0
+3|{3,3}|1.0
+4|{4,4}|1.0
+5|{4,5}|1.0
+6|{20,50}|0.0
+7|{10,31}|0.0
+8|{81,13}|0.0
+9|{1,111}|0.0
+\.
+drop table if exists knn_test_data;  -- six unlabeled test rows, identified by id
+create table knn_test_data (
+id  integer,
+data integer[]);
+copy knn_test_data (id, data) from stdin delimiter '|';
+1|{2,1}
+2|{2,6}
+3|{15,40}
+4|{12,1}
+5|{2,90}
+6|{50,45}
+\.
+drop table if exists madlib_knn_result_classification;  -- classification with k=3; the schema macro replaces the hard-coded schema name
+select MADLIB_SCHEMA.knn('knn_train_data','data','label','knn_test_data','data','id','madlib_knn_result_classification','c',3);
+select MADLIB_SCHEMA.assert(array_agg(prediction order by id)='{1,1,0,1,0,0}', 'Wrong output in classification with k=3') from madlib_knn_result_classification;
+-- Regression with k=4: each prediction is the average label of the four nearest training rows.
+drop table if exists madlib_knn_result_regression;
+select MADLIB_SCHEMA.knn('knn_train_data','data','label','knn_test_data','data','id','madlib_knn_result_regression','r',4);
+select MADLIB_SCHEMA.assert(array_agg(prediction order by id)='{1,1,0.5,1,0.25,0.25}', 'Wrong output in regression') from madlib_knn_result_regression;
+-- Classification through the eight-argument overload, which uses k=1.
+drop table if exists madlib_knn_result_classification;
+select MADLIB_SCHEMA.knn('knn_train_data','data','label','knn_test_data','data','id','madlib_knn_result_classification','c');
+select MADLIB_SCHEMA.assert(array_agg(prediction order by id)='{1,1,0,1,0,0}', 'Wrong output in classification with k=1') from madlib_knn_result_classification;
+-- The two help overloads only need to run without raising an error.
+select MADLIB_SCHEMA.knn();
+select MADLIB_SCHEMA.knn('help');

[05/50] [abbrv] incubator-madlib git commit: Elastic Net: Add CV examples, clean user docs

Posted by ri...@apache.org.

Elastic Net: Add CV examples, clean user docs

Closes #85


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/e75a944e
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/e75a944e
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/e75a944e

Branch: refs/heads/latest_release
Commit: e75a944e33ca7e11f736e8571ded78840b29f3c4
Parents: 6f12264
Author: Frank McQuillan <fm...@pivotal.io>
Authored: Thu Jan 5 12:14:55 2017 -0800
Committer: Rahul Iyer <ri...@apache.org>
Committed: Wed Jan 11 15:08:49 2017 -0800

----------------------------------------------------------------------
 .../modules/elastic_net/elastic_net.sql_in      | 482 ++++++++++++-------
 1 file changed, 297 insertions(+), 185 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/e75a944e/src/ports/postgres/modules/elastic_net/elastic_net.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net.sql_in b/src/ports/postgres/modules/elastic_net/elastic_net.sql_in
index 9bed5ac..2949fc5 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net.sql_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net.sql_in
@@ -29,7 +29,8 @@ m4_include(`SQLCommon.m4')
 and logistic regression problems, combining the L1 and L2 penalties of the
 lasso and ridge methods.
 
-This module implements elastic net regularization for linear and logistic regression problems.
+This module implements elastic net regularization [1] for linear and logistic regression.
+Regularization is a technique often used to prevent overfitting.
 
 @anchor train
 @par Training Function
@@ -58,7 +59,7 @@ elastic_net_train( tbl_source,
 <DD>TEXT. The name of the table containing the training data.</DD>
 
 <DT>tbl_result</DT>
-<DD>TEXT. Name of the generated table containing the output model.
+<DD>TEXT. Name of the output table containing output model.
 The output table produced by the elastic_net_train() function has the following columns:
 <table class="output">
   <tr><th>regress_family</th>
@@ -66,31 +67,31 @@ The output table produced by the elastic_net_train() function has the following
   </tr>
   <tr>
     <th>features</th>
-    <td>An array of the features (independent variables) passed into the analysis.</td>
+    <td>Array of features (independent variables) passed to the algorithm.</td>
   </tr>
   <tr>
     <th>features_selected</th>
-    <td>An array of the features selected by the analysis.</td>
+    <td>Array of features selected by the algorithm.</td>
   </tr>
   <tr>
     <th>coef_nonzero</th>
-    <td>Fitting coefficients for the selected features.</td>
+    <td>Coefficients of the selected features.</td>
   </tr>
   <tr>
     <th>coef_all</th>
-    <td>Coefficients for all selected and unselected features</td>
+    <td>Coefficients of all features, both selected and unselected.</td>
   </tr>
   <tr>
     <th>intercept</th>
-    <td>Fitting intercept for the model.</td>
+    <td>Intercept for the model.</td>
   </tr>
   <tr>
     <th>log_likelihood</th>
-    <td>The negative value of the first equation above (up to a constant depending on the data set).</td>
+    <td>Log of the likelihood value produced by the algorithm.</td>
   </tr>
   <tr>
     <th>standardize</th>
-    <td>BOOLEAN. Whether the data was normalized (\e standardize argument was TRUE).</td>
+    <td>BOOLEAN. If data has been normalized, will be set to TRUE.</td>
   </tr>
   <tr>
     <th>iteration_run</th>
@@ -102,48 +103,53 @@ The output table produced by the elastic_net_train() function has the following
 <DT>col_dep_var</DT>
 <DD>TEXT. An expression for the dependent variable.
 
-Both \e col_dep_var and \e col_ind_var can be valid Postgres
+@note  Both \e col_dep_var and \e col_ind_var can be valid PostgreSQL
 expressions. For example, <tt>col_dep_var = 'log(y+1)'</tt>, and <tt>col_ind_var
-= 'array[exp(x[1]), x[2], 1/(1+x[3])]'</tt>. In the binomial case, you can
+= 'array[exp(x[1]), x[2], 1/(1+x[3])]'</tt>.  In the binomial case, you can
 use a Boolean expression, for example, <tt>col_dep_var = 'y < 0'</tt>.</DD>
 
 <DT>col_ind_var</DT>
 <DD>TEXT. An expression for the independent variables. Use \c '*' to
 specify all columns of <em>tbl_source</em> except those listed in the
-<em>excluded</em> string. If \e col_dep_var is a column name, it is
+<em>excluded</em> string described below. If \e col_dep_var is a column name, it is
 automatically excluded from the independent variables. However, if
-\e col_dep_var is a valid Postgres expression, any column names used
-within the expression are only excluded if they are explicitly included in the
-\e excluded argument. It is a good idea to add all column names involved in
+\e col_dep_var is a valid PostgreSQL expression, any column names used
+within the expression are only excluded if they are explicitly listed in the
+\e excluded argument. Therefore, it is a good idea to add all column names involved in
 the dependent variable expression to the <em>excluded</em> string.</DD>
 
 <DT>regress_family</DT>
-<DD>TEXT. The regression type, either 'gaussian' ('linear') or 'binomial' ('logistic').</DD>
+<DD>TEXT. For regression type, specify either 'gaussian' ('linear') or 'binomial' ('logistic').</DD>
 
 <DT>alpha</DT>
-<DD>FLOAT8. Elastic net control parameter, value in [0, 1], 1 for L-1 regularization, 0 for L-2.</DD>
+<DD>FLOAT8. Elastic net control parameter with a value in the range [0, 1].
+A value of 1 means L1 regularization, and a value of 0 means L2 regularization.</DD>
 
 <DT>lambda_value</DT>
-<DD>FLOAT8. Regularization parameter, positive.</DD>
+<DD>FLOAT8. Regularization parameter (must be positive).</DD>
 
 <DT>standardize (optional)</DT>
-<DD>BOOLEAN, default: TRUE. Whether to normalize the data. Setting this to TRUE usually yields better results and faster convergence.</DD>
+<DD>BOOLEAN, default: TRUE. Whether to normalize the data or not. 
+Setting to TRUE usually yields better results and faster convergence.</DD>
 
 <DT>grouping_col (optional)</DT>
 <DD>TEXT, default: NULL. A single column or a list of comma-separated
-columns that divides the input data into discrete groups, running one
+columns that divides the input data into discrete groups, resulting in one
 regression per group. When this value is NULL, no grouping is used and
-a single result model is generated.
+a single model is generated for all data.
 
-@note <em>We currently do not support expressions for grouping_col. When
-implemented, grouping_col can also be an expression, similar to the SQL
-<tt>GROUP BY</tt> clause. </em></DD>
+@note Expressions are not currently supported for 'grouping_col'.
 
 <DT>optimizer (optional)</DT>
-<DD>TEXT, default: 'fista'. Name of optimizer, either 'fista' or 'igd'.</DD>
+<DD>TEXT, default: 'fista'. Name of optimizer, either 'fista' or 'igd'.  
+FISTA [2] is an algorithm with a fast global rate of convergence for 
+solving linear inverse problems. Incremental gradient descent (IGD)
+is a stochastic approach to minimizing an objective function [4].</DD>
 
 <DT>optimizer_params (optional)</DT>
-<DD>TEXT, default: NULL. Optimizer parameters, delimited with commas. The parameters differ depending on the value of \e optimizer. See the descriptions below for details.</DD>
+<DD>TEXT, default: NULL. Optimizer parameters, delimited with commas. 
+These parameters differ depending on the value of \e optimizer parameter. 
+See the descriptions below for details.</DD>
 
 <DT>excluded (optional)</DT>
 <DD>TEXT, default: NULL. If the \e col_ind_var input is '*' then \e excluded can
@@ -152,15 +158,15 @@ from the features.
 For example, <tt>'col1, col2'</tt>. If the \e col_ind_var is an array,
 \e excluded must be a list of the integer array positions to exclude,
 for example <tt>'1,2'</tt>. If this argument is NULL or an
-empty string <tt>''</tt>, no columns are excluded.</DD>
+empty string, no columns are excluded.</DD>
 
 <DT>max_iter (optional)</DT>
-<DD>INTEGER, default: 1000. The maximum number of iterations that are allowed.</DD>
+<DD>INTEGER, default: 1000. The maximum number of iterations allowed.</DD>
 
 <DT>tolerance</DT>
-<DD>FLOAT8, default: default is 1e-6. The criteria to end iterations. Both the
-'fista' and 'igd' optimizers compute the difference between  the
-loglikelihood of two consecutive iterations, and when the difference is smaller
+<DD>FLOAT8, default: 1e-6. This is the criterion to stop iterating. Both the
+'fista' and 'igd' optimizers compute the difference between the
+log likelihood of two consecutive iterations, and when the difference is smaller
 than \e tolerance or the iteration number is larger than \e max_iter, the
 computation stops.</DD>
 </DL>
@@ -168,11 +174,13 @@ computation stops.</DD>
 @anchor optimizer
 @par Other Parameters
 
-Multiple other (optional) parameters are supplied in a string containing a
-comma-delimited list of name-value pairs. All of these named parameters are
+For \e optimizer_params, there are several 
+parameters that can be supplied in a string containing a
+comma-delimited list of name-value pairs. All of these named parameters are
 optional and use the format "<param_name> = <value>".
 
-The parameters described below are organized by their functionality.
+The parameters described below are organized by category:  warmup, cross validation and 
+optimization.
 
 <em><b>Warmup parameters</b></em>
 <pre class="syntax">
@@ -186,27 +194,31 @@ The parameters described below are organized by their functionality.
 
 <DL class="arglist">
 <DT>warmup</DT>
-<DD>Default: FALSE. If \e warmup is TRUE, a series of lambda values, which is
-strictly descent and ends at the lambda value that the user wants to calculate,
-is used. The larger lambda gives very sparse solution, and the sparse
-solution again is used as the initial guess for the next lambda's solution,
-which speeds up the computation for the next lambda. For larger data sets,
-this can sometimes accelerate the whole computation and may be faster than
-computation on only one lambda value.</DD>
+<DD>Default: FALSE. If \e warmup is TRUE, a series of strictly descending lambda values
+is used, ending with the lambda value that the user wants to calculate.
+A larger lambda gives a sparser solution, and the sparse
+solution is then used as the initial guess for the next lambda's solution,
+which can speed up the computation for the next lambda. For larger data sets,
+this can sometimes accelerate the whole computation and may in fact be faster than
+computation with only a single lambda value.</DD>
 
 <DT>warmup_lambdas</DT>
-<DD>Default: NULL. The lambda value series to use when \e warmup is True. The default is NULL, which means that lambda values will be automatically generated.</DD>
+<DD>Default: NULL. Set of lambda values to use when \e warmup is TRUE. 
+The default is NULL, which means that lambda values will be automatically generated.</DD>
 
 <DT>warmup_lambda_no</DT>
-<DD>Default: 15. How many lambdas are used in warm-up. If \e warmup_lambdas is not NULL, this value is overridden by the number of provided lambda values.</DD>
+<DD>Default: 15. Number of lambda values used in \e warmup.
+If \e warmup_lambdas is not NULL, this value is overridden by the number of provided lambda values.</DD>
 
 <DT>warmup_tolerance</DT>
-<DD>The value of tolerance used during warmup. The default is the same as the
-\e tolerance argument.</DD>
+<DD>The value of tolerance used during warmup. The default value is the same as the
+\e tolerance argument described above.</DD>
 </DL>
 
 <em><b>Cross validation parameters</b></em>
-@note Cross validation is not supported if grouping is used.
+@note For performance reasons, warmup is disabled whenever
+cross validation is used.  Also, cross validation is not supported if grouping is used.
+
 <pre class="syntax">
   $$
     n_folds = &lt;value>,
@@ -219,15 +231,13 @@ computation on only one lambda value.</DD>
 
 Hyperparameter optimization can be carried out using the built-in cross
 validation mechanism, which is activated by assigning a value greater than 1 to
-the parameter \e n_folds in \e params.  Presently, misclassification error is used
+the parameter \e n_folds.  Misclassification error is used
 for classification and mean squared error is used for regression.
 
 The values of a parameter to cross validate should be provided in a list. For
 example, to regularize with the L1 norm and use a lambda value
-from the set {0.3, 0.4, 0.5}, include 'lambda_value={0.3, 0.4, 0.5}' in
-\e other_params. Note that the use of '{}' and '[]' are both valid
-here.
-
+from the set {0.3, 0.4, 0.5}, include 'lambda_value={0.3, 0.4, 0.5}'. 
+Note that both '{}' and '[]' are valid here.
 
 <DL class="arglist">
 
@@ -238,26 +248,33 @@ If a value of k > 2 is specified, each fold is then used as a validation set onc
 while the other k - 1 folds form the training set.
 </DD>
 
-
 <DT>validation_result</dt>
 <DD>Default: NULL.
-Name of the table to store the cross validation results including the values of
+Name of the table to store the cross validation results, including the values of
 parameters and their averaged error values. The table is only created if the name is not NULL.
 </DD>
 
 <DT>lambda_value</DT>
-<DD>Regularization value. If a list is provided for cross validation, then warmup is
-disabled on each lambda for performance reasons. </DD>
+<DD>Default: NULL. Set of regularization values to be used for cross validation.
+The default is NULL, which means that lambda values will be automatically generated.</DD>
 
 <DT>n_lambdas</DT>
-<DD>Number of lambdas to cross validate over. If a list of lambda values is not
-provided, this parameter can be used to autogenerate a list of lambdas (using the
-warmup procedure)
-disabled on each lambda for performance reasons. </DD>
+<DD>Default: 15. Number of lambdas to cross validate over. If a list of lambda values is not
+provided in the \e lambda_value set above, this parameter can be used to 
+autogenerate the set of lambdas.  If the \e lambda_value set is not NULL, this value
+is overridden by the number of provided lambda values. </DD>
+
+@note If you want to cross validate over alpha only and not lambda,
+then set \e lambda_value to NULL and \e n_lambdas to 0.  In this case, 
+cross validation will be done on the set of \e alpha values specified
+in the next parameter.  The lambda value used will be the one 
+specified in the main function call at the top of this page.
 
 <DT>alpha</DT>
-<DD>Elastic net control parameter. Needs to be a list of values to apply
-cross validation on it.
+<DD>Elastic net control parameter. This is a list of values to apply
+cross validation on.  (Note that alpha values are not autogenerated.)
+If not specified, the alpha value used will be the one 
+specified in the main function call at the top of this page.
 </DD>
 </DL>
 
@@ -282,18 +299,19 @@ smaller step size, <em>stepsize = stepsize/eta</em>, where \e eta must
 be larger than 1. At first glance, this seems to perform repeated iterations for even one step, but using a larger step size actually greatly increases the computation speed and minimizes the total number of iterations. A careful choice of \e max_stepsize can decrease the computation time by more than 10 times.</DD>
 
 <DT>eta</DT>
-<DD>Default: 2. If stepsize does not work \e stepsize / \e eta is tried. Must be greater than 1. </DD>
+<DD>Default: 2.0. If stepsize does not work, \e stepsize/\e eta is tried. Must be greater than 1. </DD>
 
 <DT>use_active_set</DT>
 <DD>Default: FALSE. If \e use_active_set is TRUE, an active-set method is used to
 speed up the computation. Considerable speedup is obtained by organizing the
 iterations around the active set of features&mdash;those with nonzero coefficients.
-After a complete cycle through all the variables, we iterate on only the active
+After a complete cycle through all the variables, we iterate only on the active
 set until convergence. If another complete cycle does not change the active set,
-we are done, otherwise the process is repeated.</DD>
+we are done.  Otherwise, the process is repeated.</DD>
 
 <DT>activeset_tolerance</DT>
-<DD>Default: the value of the tolerance argument. The value of tolerance used during active set calculation. </DD>
+<DD>The value of tolerance used during active set calculation. The default
+value is the same as the \e tolerance argument described above.  </DD>
 
 <DT>random_stepsize</DT>
 <DD>Default: FALSE. Whether to add some randomness to the step size. Sometimes, this can speed
@@ -330,13 +348,13 @@ with the average, and if the resulting absolute value is smaller than
 \e threshold, set the original coefficient to zero.</DD>
 
 <DT>parallel</DT>
-<DD>Whether to run the computation on multiple segments. The default is True.
+<DD>Whether to run the computation on multiple segments. The default is TRUE.
 
 SGD is a sequential algorithm in nature. When running in a distributed
 manner, each segment  of the data runs its own SGD model and then the models
 are averaged to get a model for each iteration. This averaging might slow
-down the convergence speed, although we also acquire the ability to process
-large datasets on multiple machines. This algorithm, therefore, provides the
+down the convergence speed, but it affords the ability to process
+large datasets on a cluster of machines. This algorithm, therefore, provides the
 \e parallel option to allow you to choose whether to do parallel computation.
 </DD>
 </DL>
@@ -346,7 +364,8 @@ large datasets on multiple machines. This algorithm, therefore, provides the
 @par Prediction Function
 
 <h4>Per-Tuple Prediction</h4>
-The prediction function returns a double value for Gaussian family and boolean value for Binomial family.
+The prediction function returns a double value for the Gaussian family 
+and a Boolean value for the Binomial family.
 
 The predict function has the following syntax (elastic_net_gaussian_predict() and elastic_net_binomial_predict()):
 <pre class="syntax">
@@ -360,14 +379,15 @@ elastic_net_<family>_predict(
 \b Arguments
 <DL class="arglist">
   <DT>coefficients</DT>
-  <DD>DOUBLE PRECISION[]. Fitting coefficients, usually coef_all or coef_nonzero.</DD>
+  <DD>DOUBLE PRECISION[]. Fitting coefficients, usually \e coef_all or \e coef_nonzero.</DD>
   <DT>intercept</DT>
-  <DD>DOUBLE PRECISION. The intercept for the model.</DD>
+  <DD>DOUBLE PRECISION. Intercept for the model.</DD>
   <DT>ind_var</DT>
-  <DD>DOUBLE PRECISION[]. Independent variables that correspond to coefficients, use <EM>features</EM> column in <EM>tbl_result</EM> for coef_all, and <EM>features_selected</EM> for coef_nonzero. See also <a href="#additional_example">examples</a>. Note that unexpected results or errors may be returned in the case that this argument is not given properly.</DD>
+  <DD>DOUBLE PRECISION[]. Independent variables that correspond to coefficients.  Use the <EM>features</EM> column in <EM>tbl_result</EM> for \e coef_all, and <EM>features_selected</EM> for \e coef_nonzero. See the <a href="#additional_example">examples for this case below</a>.
+  @note Unexpected results or errors may be returned in the case that this argument \e ind_var is not specified properly.</DD>
 </DL>
 
-For binomial family, there is a function (elastic_net_binomial_prob()) that outputs the probability of the instance being True:
+For the binomial family, there is a function (elastic_net_binomial_prob()) that outputs the probability of the instance being TRUE:
 <pre class="syntax">
 elastic_net_binomial_prob(
                      coefficients,
@@ -390,25 +410,25 @@ elastic_net_predict( tbl_model,
 \b Arguments
 <dl class="arglist">
 <dt>tbl_model</dt>
-<dd>TEXT. The name of the table containing the output from the training function.</dd>
+<dd>TEXT. Name of the table containing the output from the training function.</dd>
 <dt>tbl_new_sourcedata</dt>
-<dd>TEXT. The name of the table containing the new source data.</dd>
+<dd>TEXT. Name of the table containing the new source data.</dd>
 <dt>col_id</dt>
-<dd>TEXT. The unique ID associated with each row.</dd>
+<dd>TEXT. Unique ID associated with each row.</dd>
 <dt>tbl_predict</dt>
-<dd>TEXT. The name of table to store the prediction result. </dd>
+<dd>TEXT. Name of table to store the prediction result. </dd>
 </dl>
 You do not need to specify whether the model is "linear" or "logistic" because this information is already included in the \e tbl_model table.
 
 @anchor examples
 @examp
 
--# Display online help for the elastic_net_train() function.
+-# Display online help for the elastic_net_train() function:
 <pre class="example">
 SELECT madlib.elastic_net_train();
 </pre>
 
--# Create an input data set.
+-# Create an input data set of house prices and features:
 <pre class="example">
 DROP TABLE IF EXISTS houses;
 CREATE TABLE houses ( id INT,
@@ -419,56 +439,55 @@ CREATE TABLE houses ( id INT,
                       size INT,
                       lot INT,
                       zipcode INT);
-COPY houses FROM STDIN WITH DELIMITER '|';
-  1  |  590 |       2 |    1 |  50000 |  770 | 22100 | 94301
-  2  | 1050 |       3 |    2 |  85000 | 1410 | 12000  | 94301
-  3  |   20 |       3 |    1 |  22500 | 1060 |  3500 | 94301
-  4  |  870 |       2 |    2 |  90000 | 1300 | 17500  | 94301
-  5  | 1320 |       3 |    2 | 133000 | 1500 | 30000 | 94301
-  6  | 1350 |       2 |    1 |  90500 |  820 | 25700  | 94301
-  7  | 2790 |       3 |  2.5 | 260000 | 2130 | 25000 | 94301
-  8  |  680 |       2 |    1 | 142500 | 1170 | 22000  | 94301
-  9  | 1840 |       3 |    2 | 160000 | 1500 | 19000 | 94301
-  10 | 3680 |       4 |    2 | 240000 | 2790 | 20000  | 94301
-  11 | 1660 |       3 |    1 |  87000 | 1030 | 17500 | 94301
-  12 | 1620 |       3 |    2 | 118600 | 1250 | 20000  | 94301
-  13 | 3100 |       3 |    2 | 140000 | 1760 | 38000 | 94301
-  14 | 2070 |       2 |    3 | 148000 | 1550 | 14000  | 94301
-  15 |  650 |       3 |  1.5 |  65000 | 1450 | 12000 | 94301
-  16 |  770 |       2 |    2 |  91000 | 1300 | 17500 | 76010
-  17 | 1220 |       3 |    2 | 132300 | 1500 | 30000  | 76010
-  18 | 1150 |       2 |    1 |  91100 |  820 | 25700 | 76010
-  19 | 2690 |       3 |  2.5 | 260011 | 2130 | 25000  | 76010
-  20 |  780 |       2 |    1 | 141800 | 1170 | 22000 | 76010
-  21 | 1910 |       3 |    2 | 160900 | 1500 | 19000  | 76010
-  22 | 3600 |       4 |    2 | 239000 | 2790 | 20000 | 76010
-  23 | 1600 |       3 |    1 |  81010 | 1030 | 17500  | 76010
-  24 | 1590 |       3 |    2 | 117910 | 1250 | 20000 | 76010
-  25 | 3200 |       3 |    2 | 141100 | 1760 | 38000  | 76010
-  26 | 2270 |       2 |    3 | 148011 | 1550 | 14000 | 76010
-  27 |  750 |       3 |  1.5 |  66000 | 1450 | 12000  | 76010
-\\.
+INSERT INTO houses (id, tax, bedroom, bath, price, size, lot, zipcode) VALUES
+(1  ,  590 ,       2 ,    1 ,  50000 ,  770 , 22100  , 94301),
+(2  , 1050 ,       3 ,    2 ,  85000 , 1410 , 12000  , 94301),
+(3  ,   20 ,       3 ,    1 ,  22500 , 1060 ,  3500  , 94301),
+(4  ,  870 ,       2 ,    2 ,  90000 , 1300 , 17500  , 94301),
+(5  , 1320 ,       3 ,    2 , 133000 , 1500 , 30000  , 94301),
+(6  , 1350 ,       2 ,    1 ,  90500 ,  820 , 25700  , 94301),
+(7  , 2790 ,       3 ,  2.5 , 260000 , 2130 , 25000  , 94301),
+(8  ,  680 ,       2 ,    1 , 142500 , 1170 , 22000  , 94301),
+(9  , 1840 ,       3 ,    2 , 160000 , 1500 , 19000  , 94301),
+(10 , 3680 ,       4 ,    2 , 240000 , 2790 , 20000  , 94301),
+(11 , 1660 ,       3 ,    1 ,  87000 , 1030 , 17500  , 94301),
+(12 , 1620 ,       3 ,    2 , 118600 , 1250 , 20000  , 94301),
+(13 , 3100 ,       3 ,    2 , 140000 , 1760 , 38000  , 94301),
+(14 , 2070 ,       2 ,    3 , 148000 , 1550 , 14000  , 94301),
+(15 ,  650 ,       3 ,  1.5 ,  65000 , 1450 , 12000  , 94301),
+(16 ,  770 ,       2 ,    2 ,  91000 , 1300 , 17500  , 76010),
+(17 , 1220 ,       3 ,    2 , 132300 , 1500 , 30000  , 76010),
+(18 , 1150 ,       2 ,    1 ,  91100 ,  820 , 25700  , 76010),
+(19 , 2690 ,       3 ,  2.5 , 260011 , 2130 , 25000  , 76010),
+(20 ,  780 ,       2 ,    1 , 141800 , 1170 , 22000  , 76010),
+(21 , 1910 ,       3 ,    2 , 160900 , 1500 , 19000  , 76010),
+(22 , 3600 ,       4 ,    2 , 239000 , 2790 , 20000  , 76010),
+(23 , 1600 ,       3 ,    1 ,  81010 , 1030 , 17500  , 76010),
+(24 , 1590 ,       3 ,    2 , 117910 , 1250 , 20000  , 76010),
+(25 , 3200 ,       3 ,    2 , 141100 , 1760 , 38000  , 76010),
+(26 , 2270 ,       2 ,    3 , 148011 , 1550 , 14000  , 76010),
+(27 ,  750 ,       3 ,  1.5 ,  66000 , 1450 , 12000  , 76010);
 </pre>
--# Train the model.
+-# Train the model:
 <pre class="example">
 DROP TABLE IF EXISTS houses_en, houses_en_summary;
-SELECT madlib.elastic_net_train( 'houses',                  -- source table
-                                 'houses_en',               -- result table
-                                 'price',                   -- dependent variable
-                                 'array[tax, bath, size]',  -- independent variable
-                                 'gaussian',                -- regression family
-                                 0.5,                       -- alpha value
-                                 0.1,                       -- lambda value
-                                 TRUE,                      -- standardize
-                                 NULL,                      -- grouping column(s)
-                                 'fista',                   -- optimizer
-                                 '',                        -- optimizer parameters
-                                 NULL,                      -- excluded columns
-                                 10000,                     -- maximum iterations
-                                 1e-6                       -- tolerance value
+SELECT madlib.elastic_net_train( 'houses',                  -- Source table
+                                 'houses_en',               -- Result table
+                                 'price',                   -- Dependent variable
+                                 'array[tax, bath, size]',  -- Independent variable
+                                 'gaussian',                -- Regression family
+                                 0.5,                       -- Alpha value
+                                 0.1,                       -- Lambda value
+                                 TRUE,                      -- Standardize
+                                 NULL,                      -- Grouping column(s)
+                                 'fista',                   -- Optimizer
+                                 '',                        -- Optimizer parameters
+                                 NULL,                      -- Excluded columns
+                                 10000,                     -- Maximum iterations
+                                 1e-6                       -- Tolerance value
                                );
 </pre>
--# View the resulting model.
+-# View the resulting model:
 <pre class="example">
 -- Turn on expanded display to make it easier to read results.
 \\x on
@@ -487,7 +506,7 @@ log_likelihood    | -512248641.971
 standardize       | t
 iteration_run     | 10000
 </pre>
--# Use the prediction function to evaluate residuals.
+-# Use the prediction function to evaluate residuals:
 <pre class="example">
 \\x off
 SELECT id, price, predict, price - predict AS residual
@@ -495,35 +514,68 @@ FROM (
     SELECT
         houses.*,
         madlib.elastic_net_gaussian_predict(
-            m.coef_all,
-            m.intercept,
-            ARRAY[tax,bath,size]
+            m.coef_all,             -- Coefficients
+            m.intercept,            -- Intercept
+            ARRAY[tax,bath,size]    -- Features (corresponding to coefficients)
             ) AS predict
     FROM houses, houses_en m) s
 ORDER BY id;
 </pre>
+Result:
+<pre class="result">
+ id | price  |     predict      |     residual      
+----+--------+------------------+-------------------
+  1 |  50000 |  58545.391894031 |   -8545.391894031
+  2 |  85000 | 114804.077663003 |  -29804.077663003
+  3 |  22500 |  61448.835664388 |  -38948.835664388
+  4 |  90000 |  104675.17768007 |   -14675.17768007
+  5 | 133000 |  125887.70644358 |     7112.29355642
+  6 |  90500 |  78601.843595366 |   11898.156404634
+  7 | 260000 | 199257.358231079 |   60742.641768921
+  8 | 142500 |  82514.559377081 |   59985.440622919
+  9 | 160000 |  137735.93215082 |    22264.06784918
+ 10 | 240000 | 250347.627648647 |  -10347.627648647
+ 11 |  87000 |  97172.428263539 |  -10172.428263539
+ 12 | 118600 | 119024.150628605 | -424.150628604999
+ 13 | 140000 | 180692.127913358 |  -40692.127913358
+ 14 | 148000 | 156424.249824545 |   -8424.249824545
+ 15 |  65000 | 102527.938104575 |  -37527.938104575
+ 16 |  91000 |  102396.67273637 |   -11396.67273637
+ 17 | 132300 |  123609.20149988 |     8690.79850012
+ 18 |  91100 |  74044.833707966 |   17055.166292034
+ 19 | 260011 | 196978.853287379 |   63032.146712621
+ 20 | 141800 |  84793.064320781 |   57006.935679219
+ 21 | 160900 |  139330.88561141 |    21569.11438859
+ 22 | 239000 | 248524.823693687 | -9524.82369368701
+ 23 |  81010 |  95805.325297319 |  -14795.325297319
+ 24 | 117910 | 118340.599145495 | -430.599145494998
+ 25 | 141100 | 182970.632857058 |  -41870.632857058
+ 26 | 148011 | 160981.259711945 |  -12970.259711945
+ 27 |  66000 | 104806.443048275 |  -38806.443048275
+</pre>
 
-<h4>Additional Example (with grouping)</h4>
--# Reuse the <a href="#examples">houses</a> table above and train the model by grouping the data on zip code.
+<h4>Example with Grouping</h4>
+-# Reuse the houses table above and train the model by grouping
+on zip code:
 <pre class="example">
 DROP TABLE IF EXISTS houses_en1, houses_en1_summary;
-SELECT madlib.elastic_net_train( 'houses',
-                                 'houses_en1',
-                                 'price',
-                                 'array[tax, bath, size]',
-                                 'gaussian',
-                                 0.5,
-                                 0.1,
-                                 TRUE,
-                                 'zipcode',
-                                 'fista',
-                                 '',
-                                 NULL,
-                                 10000,
-                                 1e-6
+SELECT madlib.elastic_net_train( 'houses',                  -- Source table
+                                 'houses_en1',              -- Result table
+                                 'price',                   -- Dependent variable
+                                 'array[tax, bath, size]',  -- Independent variable
+                                 'gaussian',                -- Regression family
+                                 0.5,                       -- Alpha value
+                                 0.1,                       -- Lambda value
+                                 TRUE,                      -- Standardize
+                                 'zipcode',                 -- Grouping column(s)
+                                 'fista',                   -- Optimizer
+                                 '',                        -- Optimizer parameters
+                                 NULL,                      -- Excluded columns
+                                 10000,                     -- Maximum iterations
+                                 1e-6                       -- Tolerance value
                                );
 </pre>
--# View the resulting model and see a separate model for each group.
+-# View the resulting model with a separate model for each group:
 <pre class="example">
 -- Turn on expanded display to make it easier to read results.
 \\x on
@@ -554,48 +606,46 @@ log_likelihood    | -538806528.45
 standardize       | t
 iteration_run     | 10000
 </pre>
--# Use the prediction function to evaluate residuals.
+-# Use the prediction function to evaluate residuals:
 <pre class="example">
 \\x off
 SELECT madlib.elastic_net_predict(
-                'houses_en1',             -- model table
-                'houses',                 -- new source data table
-                'id',                     -- unique ID associated with each row
-                'houses_en1_prediction'   -- table to store prediction result
+                'houses_en1',             -- Model table
+                'houses',                 -- New source data table
+                'id',                     -- Unique ID associated with each row
+                'houses_en1_prediction'   -- Table to store prediction result
               );
-</pre>
--# View the results:
-<pre class="example">
 SELECT  houses.id,
         houses.price,
         houses_en1_prediction.prediction,
         houses.price - houses_en1_prediction.prediction AS residual
 FROM houses_en1_prediction, houses
-WHERE houses.id = houses_en1_prediction.id;
+WHERE houses.id = houses_en1_prediction.id ORDER BY id;
 </pre>
 
 @anchor additional_example
-<h4>Another Example (when coef_nonzero is different from coef_all)</h4>
--# Reuse the <a href="#examples">houses</a> table above and train the model with alpha=1 (L-1) and a large lambda (30000).
+<h4>Example where coef_nonzero is different from coef_all</h4>
+-# Reuse the <a href="#examples">houses</a> table above and train the model with alpha=1 (L1) 
+and a large lambda value (30000):
 <pre class="example">
 DROP TABLE IF EXISTS houses_en2, houses_en2_summary;
-SELECT madlib.elastic_net_train( 'houses',
-                                 'houses_en2',
-                                 'price',
-                                 'array[tax, bath, size]',
-                                 'gaussian',
-                                 1,
-                                 30000,
-                                 TRUE,
-                                 NULL,
-                                 'fista',
-                                 '',
-                                 NULL,
-                                 10000,
-                                 1e-6
+SELECT madlib.elastic_net_train( 'houses',                  -- Source table
+                                 'houses_en2',              -- Result table
+                                 'price',                   -- Dependent variable
+                                 'array[tax, bath, size]',  -- Independent variable
+                                 'gaussian',                -- Regression family
+                                 1,                         -- Alpha value
+                                 30000,                     -- Lambda value
+                                 TRUE,                      -- Standardize
+                                 NULL,                      -- Grouping column(s)
+                                 'fista',                   -- Optimizer
+                                 '',                        -- Optimizer parameters
+                                 NULL,                      -- Excluded columns
+                                 10000,                     -- Maximum iterations
+                                 1e-6                       -- Tolerance value
                                );
 </pre>
--# View the resulting model and see coef_nonzero is different from coef_all.
+-# View the resulting model and see coef_nonzero is different from coef_all:
 <pre class="example">
 -- Turn on expanded display to make it easier to read results.
 \\x on
@@ -614,7 +664,7 @@ log_likelihood    | -1635348585.07
 standardize       | t
 iteration_run     | 151
 </pre>
--# We can still use the prediction function with coef_all to evaluate residuals.
+-# We can still use the prediction function with \e coef_all to evaluate residuals:
 <pre class="example">
 \\x off
 SELECT id, price, predict, price - predict AS residual
@@ -622,14 +672,17 @@ FROM (
     SELECT
         houses.*,
         madlib.elastic_net_gaussian_predict(
-            m.coef_all,
-            m.intercept,
-            ARRAY[tax,bath,size]
+            m.coef_all,                   -- All coefficients
+            m.intercept,                  -- Intercept
+            ARRAY[tax,bath,size]          -- All features
             ) AS predict
     FROM houses, houses_en2 m) s
 ORDER BY id;
 </pre>
--# While we can also speed up the prediction function with coef_nonzero to evaluate residuals. This requires user to examine the feature_selected column in the result table to construct the correct independent variables.
+-# We can speed up the prediction function with \e coef_nonzero 
+to evaluate residuals. This requires the user to examine the 
+\e features_selected column in the result table to construct the correct
+set of independent variables to provide to the prediction function:
 <pre class="example">
 \\x off
 SELECT id, price, predict, price - predict AS residual
@@ -637,14 +690,14 @@ FROM (
     SELECT
         houses.*,
         madlib.elastic_net_gaussian_predict(
-            m.coef_nonzero,
-            m.intercept,
-            ARRAY[tax,size]
+            m.coef_nonzero,               -- Non-zero coefficients
+            m.intercept,                  -- Intercept
+            ARRAY[tax,size]               -- Features corresponding to non-zero coefficients
             ) AS predict
     FROM houses, houses_en2 m) s
 ORDER BY id;
 </pre>
-The two queries are expected to give same residuals:
+The two queries above will result in the same residuals:
 <pre class="result">
  id | price  |     predict      |     residual
 ----+--------+------------------+-------------------
@@ -678,6 +731,66 @@ The two queries are expected to give same residuals:
 (27 rows)
 </pre>
 
+<h4>Example with Cross Validation</h4>
+-# Reuse the houses table above.
+Here we use 3-fold cross validation with 3 automatically generated 
+lambda values and 3 specified alpha values. (This can take some time to 
+run since elastic net is effectively being called 27 times.)
+<pre class="example">
+DROP TABLE IF EXISTS houses_en3, houses_en3_summary, houses_en3_cv;
+SELECT madlib.elastic_net_train( 'houses',                  -- Source table
+                                 'houses_en3',               -- Result table
+                                 'price',                   -- Dependent variable
+                                 'array[tax, bath, size]',  -- Independent variable
+                                 'gaussian',                -- Regression family
+                                 0.5,                       -- Alpha value
+                                 0.1,                       -- Lambda value
+                                 TRUE,                      -- Standardize
+                                 NULL,                      -- Grouping column(s)
+                                 'fista',                   -- Optimizer
+                                 $$ n_folds = 3,            -- Cross validation parameters
+                                    validation_result=houses_en3_cv,
+                                    n_lambdas = 3, 
+                                    alpha = {0, 0.1, 1}
+                                 $$,                       
+                                 NULL,                      -- Excluded columns
+                                 10000,                     -- Maximum iterations
+                                 1e-6                       -- Tolerance value
+                               );
+SELECT * FROM houses_en3;
+</pre>
+<pre class="result">
+-[ RECORD 1 ]-----+--------------------------------------------
+family            | gaussian
+features          | {tax,bath,size}
+features_selected | {tax,bath,size}
+coef_nonzero      | {22.4584783679,11657.0825871,52.1622899664}
+coef_all          | {22.4584783679,11657.0825871,52.1622899664}
+intercept         | -5067.27288499
+log_likelihood    | -543193170.15
+standardize       | t
+iteration_run     | 392
+</pre>
+
+-# Details of the cross validation:
+<pre class="example">
+SELECT * FROM houses_en3_cv ORDER BY lambda_value DESC, alpha ASC;
+</pre>
+<pre class="result">
+alpha | lambda_value |        mean         |     std
+------+--------------+---------------------+--------------------
+    0 |       100000 | -1.41777698585e+110 | 1.80536123195e+110
+  0.1 |       100000 | -1.19953054719e+107 | 1.72846143163e+107
+    1 |       100000 |      -4175743937.91 |      2485189261.38
+    0 |          100 |      -4054694238.18 |      2424765457.66
+  0.1 |          100 |      -4041768667.28 |      2418294966.72 
+    1 |          100 |      -1458791218.11 |      483327430.802
+    0 |          0.1 |      -1442293698.38 |      426795110.876
+  0.1 |          0.1 |       -1442705511.6 |       429680202.16
+    1 |          0.1 |      -1459206061.39 |       485107796.02
+(9 rows)
+</pre>
+
 @anchor notes
 @par Note
 It is \b strongly \b recommended that you run
@@ -721,19 +834,18 @@ Note that fitting after scaling is not equivalent to directly fitting.
 @anchor literature
 @literature
 
-[1] Elastic net regularization. http://en.wikipedia.org/wiki/Elastic_net_regularization
+[1] Elastic net regularization, http://en.wikipedia.org/wiki/Elastic_net_regularization
 
 [2] Beck, A. and M. Teboulle (2009), A fast iterative shrinkage-thresholding algorithm for linear inverse problems. SIAM J. on Imaging Sciences 2(1), 183-202.
 
-[3] Shai Shalev-Shwartz and Ambuj Tewari, Stochastic Methods for l1 Regularized Loss Minimization. Proceedings of the 26th International Conference on Machine Learning, Montreal, Canada, 2009.
+[3] Shai Shalev-Shwartz and Ambuj Tewari, Stochastic Methods for L1 Regularized Loss Minimization. Proceedings of the 26th International Conference on Machine Learning, Montreal, Canada, 2009.
+
+[4] Stochastic gradient descent, https://en.wikipedia.org/wiki/Stochastic_gradient_descent
 
 @anchor related
 @par Related Topics
 
 File elastic_net.sql_in documenting the SQL functions.
-
-grp_validation
-
 */
 
 ------------------------------------------------------------------------

[17/50] [abbrv] incubator-madlib git commit: Documentation: Fix misc errors

Posted by ri...@apache.org.

Documentation: Fix misc errors

Closes #92


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/29acc538
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/29acc538
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/29acc538

Branch: refs/heads/latest_release
Commit: 29acc53868a2ea8d007b07bc3464d5481376cb65
Parents: faec6be
Author: Frank McQuillan <fm...@pivotal.io>
Authored: Tue Jan 24 16:04:22 2017 -0800
Committer: Rahul Iyer <ri...@apache.org>
Committed: Thu Jan 26 14:46:47 2017 -0800

----------------------------------------------------------------------
 src/ports/postgres/modules/lda/lda.sql_in       |  19 +-
 src/ports/postgres/modules/linalg/svd.sql_in    |   4 +-
 .../recursive_partitioning/decision_tree.sql_in | 328 ++++++++++---------
 .../recursive_partitioning/random_forest.sql_in | 210 +++++++-----
 .../postgres/modules/summary/summary.sql_in     |   2 +-
 .../postgres/modules/utilities/path.sql_in      |   3 +-
 .../postgres/modules/utilities/pivot.sql_in     |   3 +
 7 files changed, 325 insertions(+), 244 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/29acc538/src/ports/postgres/modules/lda/lda.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/lda/lda.sql_in b/src/ports/postgres/modules/lda/lda.sql_in
index c9c7323..a26deab 100644
--- a/src/ports/postgres/modules/lda/lda.sql_in
+++ b/src/ports/postgres/modules/lda/lda.sql_in
@@ -185,10 +185,14 @@ lda_train( data_table,
     <dd>TEXT. The name of the table storing the training dataset. Each row is
     in the form <tt>&lt;docid, wordid, count&gt;</tt> where \c docid, \c wordid, and \c count
     are non-negative integers.
-
+  
     The \c docid column refers to the document ID, the \c wordid column is the
     word ID (the index of a word in the vocabulary), and \c count is the
-    number of occurrences of the word in the document. </dd>
+    number of occurrences of the word in the document.
+  
+    Please note that column names for \c docid, \c wordid, and \c count are currently fixed, so you must use these
+    exact names in the data_table.</dd>
+
     <dt>model_table</dt>
     <dd>TEXT. The name of the table storing the learned models. This table has one row and the following columns.
         <table class="output">
@@ -312,12 +316,13 @@ lda_get_perplexity( model_table,
 
 -# Prepare a training dataset for LDA. The examples below are small strings extracted from various Wikipedia documents .
 <pre class="example">
+DROP TABLE IF EXISTS documents;
 CREATE TABLE documents(docid INT4, contents TEXT);
 INSERT INTO documents VALUES
 (0, 'Statistical topic models are a class of Bayesian latent variable models, originally developed for analyzing the semantic content of large document corpora.'),
 (1, 'By the late 1960s, the balance between pitching and hitting had swung in favor of the pitchers. In 1968 Carl Yastrzemski won the American League batting title with an average of just .301, the lowest in history.'),
 (2, 'Machine learning is closely related to and often overlaps with computational statistics; a discipline that also specializes in prediction-making. It has strong ties to mathematical optimization, which deliver methods, theory and application domains to the field.'),
-(3, 'California''s diverse geography ranges from the Sierra Nevada in the east to the Pacific Coast in the west, from the Redwood\u2013Douglas fir forests of the northwest, to the Mojave Desert areas in the southeast. The center of the state is dominated by the Central Valley, a major agricultural area. ')
+(3, 'California''s diverse geography ranges from the Sierra Nevada in the east to the Pacific Coast in the west, from the Redwood\u2013Douglas fir forests of the northwest, to the Mojave Desert areas in the southeast. The center of the state is dominated by the Central Valley, a major agricultural area. ');
 </pre>
 
 -# Build a word count table by extracting the words and building a histogram for
@@ -328,7 +333,7 @@ ALTER TABLE documents ADD COLUMN words TEXT[];
 UPDATE documents SET words = regexp_split_to_array(lower(contents), E'[\\\\s+\\\\.\\\\,]');
 \nbsp
 -- Create the term frequency table
-DROP TABLE IF EXISTS my_training;
+DROP TABLE IF EXISTS my_training, my_training_vocabulary;
 SELECT madlib.term_frequency('documents', 'docid', 'words', 'my_training', TRUE);
 SELECT * FROM my_training order by docid limit 20;
 </pre>
@@ -388,6 +393,7 @@ SELECT * FROM my_training_vocabulary order by wordid limit 20;
 
 -# Create an LDA model using the \c lda_train() function.
 <pre class="example">
+DROP TABLE IF EXISTS my_model, my_outdata;
 SELECT madlib.lda_train( 'my_training',
                          'my_model',
                          'my_outdata',
@@ -398,12 +404,15 @@ SELECT madlib.lda_train( 'my_training',
                          0.01
                        );
 </pre>
+Reminder that column names for \c docid, \c wordid, and \c count are currently fixed, 
+so you must use these exact names in the input table.
 After a successful run of the lda_train() function two tables are generated,
 one for storing the learned model and the other for storing the output data table.
 
 -# To get the detailed information about the learned model, run these commands:
 <pre class="example">
 -- The topic description by top-k words
+DROP TABLE IF EXISTS my_topic_desc;
 SELECT madlib.lda_get_topic_desc( 'my_model',
                                   'my_training_vocabulary',
                                   'my_topic_desc',
@@ -433,8 +442,10 @@ select * from my_topic_desc order by topicid, prob DESC;
 \nbsp
 <pre class="example">
 -- The per-word topic counts (sorted by topic id)
+DROP TABLE IF EXISTS my_word_topic_count;
 SELECT madlib.lda_get_word_topic_count( 'my_model',
                                         'my_word_topic_count');
+SELECT * FROM my_word_topic_count ORDER BY wordid;
 </pre>
 \nbsp
 <pre class="result">

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/29acc538/src/ports/postgres/modules/linalg/svd.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/linalg/svd.sql_in b/src/ports/postgres/modules/linalg/svd.sql_in
index 799e9cb..b6d763b 100644
--- a/src/ports/postgres/modules/linalg/svd.sql_in
+++ b/src/ports/postgres/modules/linalg/svd.sql_in
@@ -416,8 +416,8 @@ called the <em>singular values</em>.
 It is possible to formulate the problem of computing the singular triplets
 (\f$\sigma_i, u_i, v_i\f$) of \f$A\f$ as an eigenvalue problem involving a Hermitian
 matrix related to \f$A\f$. There are two possible ways of achieving this:
--# With the cross product matrix, \f$A^TA\f$ and \f$AA^T\f$
--# With the cyclic matrix
+- With the cross product matrix, \f$A^TA\f$ and \f$AA^T\f$
+- With the cyclic matrix
       \f[
           H(A) =
       \begin{bmatrix}

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/29acc538/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
index d6b7b5a..b5ed4a2 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
@@ -24,17 +24,15 @@ m4_include(`SQLCommon.m4')
 </ul></div>
 
 @brief Decision Trees.
-Decision trees use a tree-based predictive model to
-predict the value of a target variable based on several input variables.
-
-Decision trees are a supervised learning method that uses a predictive model to
-predict the value of a target variable, based on several input variables. They
-use a tree-based representation of the model such that, the interior nodes of
-the tree correspond to the input variables, the edges of the nodes correspond to
-values that the input variables can take, and leaf nodes represent values of the
-target variable, given the values of the input variables, represented by the
-path from the root to the leaf nodes.
+Decision trees use a tree-based model to predict the value of a
+target variable based on several input variables.
 
+A decision tree is a supervised learning method that can be used for
+classification and regression.  It consists of a structure in which
+internal nodes represent tests on attributes, and the branches from
+nodes represent the result of those tests.  Each leaf node is a class
+label and the paths from root to leaf nodes define the set of classification
+or regression rules.
 
 @anchor train
 @par Training Function
@@ -63,18 +61,18 @@ tree_train(
 \b Arguments
 <dl class="arglist">
   <dt>training_table_name</dt>
-  <dd>TEXT. The name of the table containing the training data</dd>
+  <dd>TEXT. The name of the table containing the training data.</dd>
 
   <dt>output_table_name</dt>
   <dd>TEXT. The name of the generated table containing the model. If a table
   with the same name already exists, then the function will return an error.</dd>
 
-    The model table produced by the train function contains the following columns:
+    The model table produced by the training function contains the following columns:
 
     <table class="output">
       <tr>
         <th>&lt;...&gt;</th>
-        <td>Grouping columns, if provided in input, same types as in the training table.
+        <td>Grouping columns, if provided as input, in the same types as the training table.
         This could be multiple columns depending on the \c grouping_cols input.</td>
       </tr>
       <tr>
@@ -83,22 +81,22 @@ tree_train(
       </tr>
       <tr>
         <th>cat_levels_in_text</th>
-        <td>TEXT[]. Ordered levels of categorical variables</td>
+        <td>TEXT[]. Ordered levels of categorical variables.</td>
       </tr>
       <tr>
         <th>cat_n_levels</th>
-        <td>INTEGER[]. Number of levels for each categorical variable</td>
+        <td>INTEGER[]. Number of levels for each categorical variable.</td>
       </tr>
 
       <tr>
       <th>tree_depth</th>
-      <td>INTEGER. The maximum depth the tree obtained after training (root has depth 0)</td>
+      <td>INTEGER. The maximum depth of the tree obtained after training (root has depth 0).</td>
       </tr>
 
       <tr>
       <th>pruning_cp</th>
       <td>DOUBLE PRECISION. The cost-complexity parameter used for pruning
-       the trained tree(s). This would be different from the input cp value if
+       the trained tree(s). This would be different from the 'input_cp' value if
        cross-validation is used.
        </td>
       </tr>
@@ -116,89 +114,90 @@ tree_train(
 
     <tr>
     <th>is_classification</th>
-    <td>BOOLEAN. TRUE if the decision trees are for classification, FALSE if regression</td>
+    <td>BOOLEAN. TRUE if the decision trees are for classification, FALSE if for regression.</td>
     </tr>
 
     <tr>
     <th>source_table</th>
-    <td>TEXT. The data source table name</td>
+    <td>TEXT. The data source table name.</td>
     </tr>
 
     <tr>
     <th>model_table</th>
-    <td>TEXT. The model table name</td>
+    <td>TEXT. The model table name.</td>
     </tr>
 
     <tr>
     <th>id_col_name</th>
-    <td>TEXT. The ID column name</td>
+    <td>TEXT. The ID column name.</td>
     </tr>
 
     <tr>
     <th>dependent_varname</th>
-    <td>TEXT. The dependent variable</td>
+    <td>TEXT. The dependent variable.</td>
     </tr>
 
     <tr>
     <th>independent_varname</th>
-    <td>TEXT. The independent variables</td>
+    <td>TEXT. The independent variables.</td>
     </tr>
 
     <tr>
       <th>cat_features</th>
-      <td>TEXT. The list of categorical feature names as a comma-separated string</td>
+      <td>TEXT. The list of categorical feature names as a comma-separated string.</td>
     </tr>
+
     <tr>
       <th>con_features</th>
-      <td>TEXT. The list of continuous feature names as a comma-separated string</td>
+      <td>TEXT. The list of continuous feature names as a comma-separated string.</td>
     </tr>
 
     <tr>
     <th>grouping_col</th>
-    <td>TEXT. Names of grouping columns</td>
+    <td>TEXT. Names of grouping columns.</td>
     </tr>
 
     <tr>
     <th>num_all_groups</th>
-    <td>INTEGER. Number of groups in decision tree training</td>
+    <td>INTEGER. Number of groups in decision tree training.</td>
     </tr>
 
     <tr>
     <th>num_failed_groups</th>
-    <td>INTEGER. Number of failed groups in decision tree training</td>
+    <td>INTEGER. Number of failed groups in decision tree training.</td>
     </tr>
 
     <tr>
       <th>total_rows_processed</th>
-      <td>BIGINT. Total numbers of rows processed in all groups</td>
+      <td>BIGINT. Total number of rows processed in all groups.</td>
     </tr>
 
     <tr>
       <th>total_rows_skipped</th>
       <td>BIGINT. Total numbers of rows skipped in all groups due to missing
-      values or failures</td>
+      values or failures.</td>
     </tr>
 
     <tr>
     <th>dependent_var_levels</th>
-    <td>TEXT. For classification, the distinct levels of the dependent variable</td>
+    <td>TEXT. For classification, the distinct levels of the dependent variable.</td>
     </tr>
 
     <tr>
     <th>dependent_var_type</th>
-    <td>TEXT. The type of dependent variable</td>
+    <td>TEXT. The type of dependent variable.</td>
     </tr>
 
    <tr>
     <th>input_cp</th>
     <td>DOUBLE PRECISION. The complexity parameter (cp) used for pruning the trained tree(s)
-    (before cross-validation is run). This is same as the cp value inputed
-    through the <em>pruning_params</em></td>
+    before cross-validation is run. This is the same as the cp value input
+    using the <em>pruning_params</em>.</td>
     </tr>
 
     <tr>
     <th>independent_var_types</th>
-    <td>TEXT. A comma separated string, the types of independent variables</td>
+    <td>TEXT. A comma separated string for the types of independent variables.</td>
     </tr>
 
    </table>
@@ -207,7 +206,7 @@ tree_train(
   <DT>id_col_name</DT>
   <DD>TEXT. Name of the column containing id information in the training data.
   This is a mandatory argument and is used for prediction and cross-validation.
-  The values are expected to be unique for each row
+  The values are expected to be unique for each row.
   </DD>
 
   <DT>dependent_variable</DT>
@@ -215,7 +214,7 @@ tree_train(
   training. Boolean, integer and text types are considered classification
   outputs, while double precision values are considered regression outputs.
   The response variable for a classification tree can be multinomial, but the
-  time and space complexity of train function increases linearly as the
+  time and space complexity of the training function increases linearly as the
   number of response classes increases.</DD>
 
   <DT>list_of_features</DT>
@@ -226,11 +225,11 @@ tree_train(
   double precision columns are considered continuous. The categorical variables
   are not encoded and used as is for the training.
 
-  There are no limitations to the number of levels in a categorical variable.
+  There are no limitations on the number of levels in a categorical variable.
   It is, however, important to note that we don't test for every combination of
-  levels of a categorical variable for evaluating a split. We order the levels
-  of the variable by the entropy of the varible in predicting the response. The
-  splits at each node is evaluated between these ordered levels
+  levels of a categorical variable when evaluating a split. We order the levels
+  of the variable by the entropy of the variable in predicting the response. The
+  split at each node is evaluated between these ordered levels.
   </DD>
 
   <DT>list_of_features_to_exclude</DT>
@@ -240,44 +239,44 @@ tree_train(
       <em>dependent_variable</em> expression,
       otherwise those columns will be included in the features.
       The names in this parameter should be identical to the names used in the table and
-      quoted appropriately</DD>
+      quoted appropriately.</DD>
 
   <DT>split_criterion</DT>
   <DD>TEXT, default = 'gini' for classification, 'mse' for regression.
   Impurity function to compute the feature to use for the split.
   Supported criteria are 'gini', 'entropy', 'misclassification' for
   classification trees. For regression trees, split_criterion of 'mse'
-  is always used (irrespective of the input for this argument)
+  is always used (irrespective of the input for this argument).
   </DD>
 
   <DT>grouping_cols (optional)</DT>
   <DD>TEXT, default: NULL. Comma-separated list of column names to group the
-      data by. This will lead to creating multiple decision trees, one for
-      each group</DD>
+      data by. This will result in multiple decision trees, one for
+      each group.</DD>
 
   <DT>weights (optional)</DT>
-  <DD>TEXT. Column name containing weights for each observation</DD>
+  <DD>TEXT. Column name containing weights for each observation.</DD>
 
   <DT>max_depth (optional)</DT>
   <DD>INTEGER, default: 10. Maximum depth of any node of the final tree,
-      with the root node counted as depth 0</DD>
+      with the root node counted as depth 0.</DD>
 
   <DT>min_split (optional)</DT>
   <DD>INTEGER, default: 20. Minimum number of observations that must exist
       in a node for a split to be attempted. The best value for this parameter
-      depends on the number of tuples in the dataset</DD>
+      depends on the number of tuples in the dataset.</DD>
 
   <DT>min_bucket (optional)</DT>
   <DD>INTEGER, default: min_split/3. Minimum number of observations in any terminal
       node. If only one of min_bucket or min_split is specified, min_split is
-      set to min_bucket*3 or min_bucket to min_split/3, as appropriate</DD>
+      set to min_bucket*3 or min_bucket to min_split/3, as appropriate.</DD>
 
   <DT>num_splits (optional)</DT>
   <DD>INTEGER, default: 100. Continuous-valued features are binned into
       discrete quantiles to compute split boundaries. This global parameter
       is used to compute the resolution of splits for continuous features.
       Higher number of bins will lead to better prediction,
-      but will also result in higher processing time</DD>
+      but will also result in longer processing.</DD>
 
   <DT>pruning_params (optional)</DT>
   <DD>TEXT. Comma-separated string of key-value pairs giving
@@ -296,7 +295,7 @@ tree_train(
       <tr>
       <th>n_folds</th>
       <td>
-        Default: 0 (i.e. No cross-validation).
+        Default: 0 (i.e. no cross-validation).
         Number of cross-validation folds to use to compute the best value of
         <em>cp</em>. To perform cross-validation, a positive value of
         <em>n_folds</em> (greater than 2) should be given. An additional output
@@ -306,9 +305,9 @@ tree_train(
         cross-validation error (we pick the maximum <em>cp</em> if multiple
         values have same error).
 
-        The list of <em>cp</em> values are automatically computed by parsing
+        The list of <em>cp</em> values is automatically computed by parsing
         through the tree initially trained on the complete dataset. The tree
-        outputted is a subset of this initial tree corresponding to the best
+        output is a subset of this initial tree corresponding to the best
         computed <em>cp</em>.
 
       </td>
@@ -321,33 +320,33 @@ tree_train(
   of surrogate splits for each node. A surrogate variable is another predictor
   variable that is associated (correlated) with the primary predictor variable
   for a split. The surrogate variable comes into use when the primary predictor
-  value is NULL. This parameter currently accepts the below argument:
+  value is NULL. This parameter currently accepts one argument:
     <table class='output'>
     <tr>
       <th>max_surrogates</th>
-      <td>Default: 0. Number of surrogates to store for each node</td>
+      <td>Default: 0. Number of surrogates to store for each node.</td>
     </tr>
     </table>
   </DD>
 
   <DT>verbosity (optional)</DT>
-  <DD>BOOLEAN, default: FALSE. Provides verbose output of the results of training</DD>
+  <DD>BOOLEAN, default: FALSE. Provides verbose output of the training result.</DD>
 </DL>
 
 @note
 - Many of the parameters are designed to be similar to the popular R package 'rpart'.
-An important distinction between rpart and the above MADlib function is that
+An important distinction between rpart and the MADlib function is that
 for both response and feature variables, MADlib considers integer values as
 categorical values, while rpart considers them as continuous.
-- When using no surrogates (<em>max_surrogates</em>=0), all rows containing NULL value
+- When using no surrogates (<em>max_surrogates</em>=0), all rows containing NULL values
 for any of the features used for training will be ignored from training and prediction.
-- When cross-validation is not used (<em>n_folds</em>=0), each tree outputed
-is pruned by inputed cost-complextity (<em>cp</em>). With cross-validation,
-inputed <em>cp</em> is the minimum value of all the explored values of 'cp'.
+- When cross-validation is not used (<em>n_folds</em>=0), each tree output
+is pruned by the input cost-complexity (<em>cp</em>). With cross-validation,
+the input <em>cp</em> is the minimum value of all the explored values of 'cp'.
 During cross-validation, we train an initial tree using the
-provided <em>cp</em> and explore all possible sub-trees (upto a single-node tree)
+provided <em>cp</em> and explore all possible sub-trees (up to a single-node tree)
 to compute the optimal sub-tree. The optimal sub-tree and the 'cp' corresponding
-to this optimal sub-tree is placed in the <em>output_table</em>, with their
+to this optimal sub-tree is placed in the <em>output_table</em>, with the
 columns named as <em>tree</em> and <em>pruning_cp</em> respectively.
 - The main parameters that affect memory usage are:  depth of tree, number
 of features, and number of values per feature.  If you are hitting VMEM limits,
@@ -355,7 +354,7 @@ consider reducing one or more of these parameters.
 
 @anchor predict
 @par Prediction Function
-The prediction function is provided to estimate the conditional mean given a new
+The prediction function estimates the conditional mean given a new
 predictor. It has the following syntax:
 <pre class="syntax">
 tree_predict(tree_model,
@@ -369,16 +368,16 @@ tree_predict(tree_model,
 <DL class="arglist">
   <DT>tree_model</DT>
   <DD>TEXT. Name of the table containing the decision tree model. This should
-  be the output table returned from <em>tree_train</em></DD>
+  be the output table returned from <em>tree_train</em>.</DD>
 
   <DT>new_data_table</DT>
   <DD>TEXT. Name of the table containing prediction data. This table is
   expected to contain the same features that were used during training. The table
-  should also contain <em>id_col_name</em> used for identifying each row</DD>
+  should also contain <em>id_col_name</em> used for identifying each row.</DD>
 
   <DT>output_table</DT>
-  <DD>TEXT. Name of the table to output prediction results to. If this table
-  already exists then an error is returned.
+  <DD>TEXT. Name of the table to output prediction results. If this table
+  already exists, an error is returned.
   The table contains the <em>id_col_name</em> column giving
   the 'id' for each prediction and the prediction columns for the dependent variable.
 
@@ -389,7 +388,7 @@ tree_predict(tree_model,
   If <em>type</em> = 'prob', then the table has multiple additional columns, one for each
   possible value of the response variable. The columns are labeled as
   'estimated_prob_<em>dep_value</em>', where <em>dep_value</em> represents each value
-  of the response</DD>
+  of the response variable.</DD>
 
   <DT>type</DT>
   <DD>TEXT, optional, default: 'response'. For regression trees, the output is
@@ -397,26 +396,26 @@ tree_predict(tree_model,
       trees, the <em>type</em> variable can be 'response', giving the
       classification prediction as output, or 'prob', giving the class
       probabilities as output. For each value of the dependent variable, a
-      column with the probabilities is added to the output table
+      column with the probabilities is added to the output table.
   </DD>
 </DL>
 
 @note If the <em>new_data_table</em> contains categories of categorical variables
-not seen in the training data then the prediction for that row will be NULL.
+not seen in the training data, the prediction for that row will be NULL.
 
 @anchor display
 @par Display Function
-The display function is provided to output a graph representation of the
+The display function outputs a graph representation of the
 decision tree. The output can either be in the popular 'dot' format that can
 be visualized using various programs including those in the GraphViz package, or
-in a simple text format. The details of the text format is outputted with the
+in a simple text format. The details of the text format are output with the
 tree.
 <pre class="syntax">
 tree_display(tree_model, dot_format, verbosity)
 </pre>
 
 An additional display function is provided to output the surrogate splits chosen
-for each internal node.
+for each internal node:
 <pre class="syntax">
 tree_surr_display(tree_model)
 </pre>
@@ -426,7 +425,7 @@ nodes are sorted in ascending order by id. This is equivalent to viewing the
 tree in a breadth-first manner. For each surrogate, we output the surrogate
 split (variable and threshold) and also give the number of rows that were common
 between the primary split and the surrogate split. Finally, the number of rows
-present in the majority branch of the primary split is also presented. Only
+present in the majority branch of the primary split is also shown. Only
 surrogates that perform better than this majority branch are included in the
 surrogate list. When the primary variable has a NULL value the surrogate variables
 are used in order to compute the split for that node. If all surrogate variables
@@ -434,23 +433,25 @@ are NULL, then the majority branch is used to compute the split for a tuple.
 
 \b Arguments
 <DL class="arglist">
-    <DT>tree_model_name</DT>
-    <DD>TEXT. Name of the table containing the decision tree model</DD>
+    <DT>tree_model</DT>
+    <DD>TEXT. Name of the table containing the decision tree model.</DD>
     <DT>dot_format</DT>
     <DD>BOOLEAN, default = TRUE. Output can either be in a dot format or a text
-    format. If TRUE, the result is in the dot format, else output is in text format</DD>
+    format. If TRUE, the result is in the dot format, else output is in text format.</DD>
     <DT>verbosity</DT>
-    <DD>BOOLEAN, default = FALSE. If true, the dot format output will contain
-    additional information (impurity, sample size, etc.)</DD>
+    <DD>BOOLEAN, default = FALSE. If set to TRUE, the dot format output will contain
+    additional information (impurity, sample size, number of weighted rows
+    for each response variable, classification or prediction if the tree
+    was pruned at this level).</DD>
 </DL>
 
 The output is always returned as a 'TEXT'. For the dot format, the output can be
 redirected to a file on the client side and then rendered using visualization
 programs.
 
-If the user wants to export the dot format result to an external file,
-he can use the following method (Note: the user needs to use unaligned
-table output mode for psql with '-A' flag. And inside psql client,
+To export the dot format result to an external file,
+use the method below. Use unaligned
+table output mode for psql with '-A' flag. And inside the psql client,
 both '\\t' and '\\o' should be used):
 
 <pre class="example">
@@ -464,19 +465,22 @@ both '\\t' and '\\o' should be used):
 \# \\t
 </pre>
 
-After the desired dot file has been generated, one can then use third-party
-plotting software to plot the trees in a nice figure:
+After the dot file has been generated, use third-party
+plotting software to plot the trees in a nice format:
 <pre class="example">
 \> \# under bash, convert the dot file into a PDF file
 \> dot -Tpdf test.dot \> test.pdf
 \> xpdf test.pdf\&
 </pre>
 
+Please see the examples below for more details on the contents
+of the tree output formats.
+
 @anchor examples
 @examp
-*Decision tree classification example*
+<h4>Decision Tree Classification Example</h4>
 
--# Prepare input data.
+-# Prepare input data:
 <pre class="example">
 DROP TABLE IF EXISTS dt_golf;
 CREATE TABLE dt_golf (
@@ -486,7 +490,7 @@ CREATE TABLE dt_golf (
     humidity double precision,
     windy text,
     class text
-) ;
+);
 </pre>
 <pre class="example">
 COPY dt_golf (id,"OUTLOOK",temperature,humidity,windy,class) FROM stdin WITH DELIMITER '|';
@@ -507,8 +511,9 @@ COPY dt_golf (id,"OUTLOOK",temperature,humidity,windy,class) FROM stdin WITH DEL
 \\.
 </pre>
 
--# Run Decision tree train function.
+-# Run the decision tree training function:
 <pre class="example">
+DROP TABLE IF EXISTS train_output, train_output_summary;
 SELECT madlib.tree_train('dt_golf',         -- source table
                          'train_output',    -- output model table
                          'id',              -- id column
@@ -525,17 +530,19 @@ SELECT madlib.tree_train('dt_golf',         -- source table
                          );
 </pre>
 
--# Predict output categories for the same data as was used for input.
+-# Predict output categories for the same data that was used for input:
 <pre class="example">
-SELECT madlib.tree_predict('train_output',
-                           'dt_golf',
-                           'prediction_results',
-                           'response');
-SELECT * FROM prediction_results;
+DROP TABLE IF EXISTS prediction_results;
+SELECT madlib.tree_predict('train_output',          -- tree model
+                           'dt_golf',               -- new data table
+                           'prediction_results',    -- output table
+                           'response');             -- show prediction
+SELECT * FROM prediction_results ORDER BY id;
 </pre>
 Result:
 <pre class="result">
-&nbsp;----+-----------------
+ id | estimated_class
+----+-----------------
   1 | 'Don't Play'
   2 | 'Don't Play'
   3 | 'Play'
@@ -553,7 +560,51 @@ Result:
 (14 rows)
 </pre>
 
--# Obtain a dot format display of the tree
+-# Create a text display of the tree:
+<pre class="example">
+SELECT madlib.tree_display('train_output', FALSE);
+</pre>
+Result:
+<pre class="result">
+&nbsp;-------------------------------------
+&nbsp;- Each node represented by 'id' inside ().
+&nbsp;- Leaf nodes have a * while internal nodes have the split condition at the end.
+&nbsp;- For each internal node (i), it's children will be at (2i+1) and (2i+2).
+&nbsp;- For each split the first indented child (2i+1) is the 'True' node and
+second indented child (2i+2) is the 'False' node.
+&nbsp;- Number of (weighted) rows for each response variable inside [].
+&nbsp;- Order of values = ['"Don\'t Play"', '"Play"']
+&nbsp;-------------------------------------
+(0)[ 5 9]  "OUTLOOK"<={overcast}
+  (1)[ 0 4]  *
+  (2)[ 5 5]  temperature<=75
+    (5)[ 3 5]  temperature<=65
+      (11)[ 1 0]  *
+      (12)[ 2 5]  temperature<=70
+        (25)[ 0 3]  *
+        (26)[ 2 2]  temperature<=72
+          (53)[ 2 0]  *
+          (54)[ 0 2]  *
+    (6)[ 2 0]  *
+&nbsp;-------------------------------------
+</pre>
+Here are some more details on how to interpret the tree display above...
+Node numbering starts at 0 for the root node and would be
+contiguous 1,2...n if the tree was completely full (no pruning).
+Since the tree has been pruned, the node numbering is not
+contiguous.
+The order of values [x y] indicates the number of weighted
+rows that correspond to ["Don't play" "Play"] <em>before</em> the node test.
+For example, at the root node 0, there are 5 rows that "Don't play"
+and 9 rows that "Play" in the raw data.
+If we apply the test
+of "OUTLOOK" being overcast, then the True result is
+leaf node 1 which is "Play".  There are 0 "Don't play" rows
+and 4 "Play" rows that correspond to this case (overcast).
+The remaining 5 "Don't play" rows and 5 "Play" rows are then
+tested at node 2 on temperature<=75.  And so on down the tree.
+
+-# Create a dot format display of the tree:
 <pre class="example">
 SELECT madlib.tree_display('train_output', TRUE);
 </pre>
@@ -587,7 +638,7 @@ digraph "Classification tree for dt_golf" {
 &nbsp;} //---end of digraph---------
 </pre>
 
--# Obtain a dot format display of the tree with additional info
+-# Now create a dot format display of the tree with additional information:
 <pre class="example">
 SELECT madlib.tree_display('train_output', TRUE, TRUE);
 </pre>
@@ -620,41 +671,13 @@ digraph "Classification tree for dt_golf" {
 &nbsp;&nbsp;&nbsp;} //--- end of subgraph------------
 &nbsp;} //---end of digraph---------
 </pre>
+The additional information in each node is: impurity, sample size, number of weighted rows for each response variable, and classification if the tree was pruned at this level.
 
--# Obtain a text display of the tree
-<pre class="example">
-SELECT madlib.tree_display('train_output', FALSE);
-</pre>
-Result:
-<pre class="result">
-&nbsp;-------------------------------------
-&nbsp;- Each node represented by 'id' inside ().
-&nbsp;- Leaf nodes have a * while internal nodes have the split condition at the end.
-&nbsp;- For each internal node (i), it's children will be at (2i+1) and (2i+2).
-&nbsp;- For each split the first indented child (2i+1) is the 'True' node and
-second indented child (2i+2) is the 'False' node.
-&nbsp;- Number of (weighted) rows for each response variable inside [].
-&nbsp;- Order of values = ['"Don\'t Play"', '"Play"']
-&nbsp;-------------------------------------
-(0)[ 5 9]  "OUTLOOK"<={overcast}
-  (1)[ 0 4]  *
-  (2)[ 5 5]  temperature<=75
-    (5)[ 3 5]  temperature<=65
-      (11)[ 1 0]  *
-      (12)[ 2 5]  temperature<=70
-        (25)[ 0 3]  *
-        (26)[ 2 2]  temperature<=72
-          (53)[ 2 0]  *
-          (54)[ 0 2]  *
-    (6)[ 2 0]  *
-&nbsp;-------------------------------------
-</pre>
-
-
-*Decision tree regression example*
+<h4>Decision Tree Regression Example</h4>
 
 -# Prepare input data.
 <pre class="example">
+DROP TABLE IF EXISTS mt_cars;
 CREATE TABLE mt_cars (
     id integer NOT NULL,
     mpg double precision,
@@ -668,10 +691,10 @@ CREATE TABLE mt_cars (
     am integer,
     gear integer,
     carb integer
-) ;
+);
 </pre>
 <pre class="example">
-COPY mt_cars (id,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb) FROM stdin WITH DELIMITER '|' NULL '\\null';
+COPY mt_cars (id,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb) FROM stdin WITH DELIMITER '|' NULL 'null';
 1|18.7|8|360|175|3.15|3.44|17.02|0|0|3|2
 2|21|6|160|110|3.9|2.62|16.46|0|1|4|4
 3|24.4|4|146.7|62|3.69|3.19|20|1|0|4|2
@@ -680,7 +703,7 @@ COPY mt_cars (id,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb) FROM stdin WITH D
 6|16.4|8|275.8|180|3.078|4.07|17.4|0|0|3|3
 7|22.8|4|108|93|3.85|2.32|18.61|1|1|4|1
 8|17.3|8|275.8|180|3.078|3.73|17.6|0|0|3|3
-9|21.4|\\null|258|110|3.08|3.215|19.44|1|0|3|1
+9|21.4|null|258|110|3.08|3.215|19.44|1|0|3|1
 10|15.2|8|275.8|180|3.078|3.78|18|0|0|3|3
 11|18.1|6|225|105|2.768|3.46|20.22|1|0|3|1
 12|32.4|4|78.7|66|4.08|2.20|19.47|1|1|4|1
@@ -689,7 +712,7 @@ COPY mt_cars (id,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb) FROM stdin WITH D
 15|30.4|4|75.7|52|4.93|1.615|18.52|1|1|4|2
 16|19.2|6|167.6|123|3.92|3.44|18.3|1|0|4|4
 17|33.9|4|71.14|65|4.22|1.835|19.9|1|1|4|1
-18|15.2|\\null|304|150|3.15|3.435|17.3|0|0|3|2
+18|15.2|null|304|150|3.15|3.435|17.3|0|0|3|2
 19|10.4|8|472|205|2.93|5.25|17.98|0|0|3|4
 20|27.3|4|79|66|4.08|1.935|18.9|1|1|4|1
 21|10.4|8|460|215|3|5.424|17.82|0|0|3|4
@@ -707,28 +730,28 @@ COPY mt_cars (id,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb) FROM stdin WITH D
 \\.
 </pre>
 
--# Run Decision Tree train function.
+-# Run the decision tree training function:
 <pre class="example">
 DROP TABLE IF EXISTS train_output, train_output_summary;
-SELECT madlib.tree_train('mt_cars',
-                         'train_output',
-                         'id',
-                         'mpg',
-                         '*',
+SELECT madlib.tree_train('mt_cars',         -- source table
+                         'train_output',    -- output model table
+                         'id',              -- id column
+                         'mpg',             -- dependent variable
+                         '*',               -- features
                          'id, hp, drat, am, gear, carb',  -- exclude columns
-                         'mse',
-                         NULL::text,
-                         NULL::text,
-                         10,
-                         8,
-                         3,
-                         10,
-                         NULL,
-                         'max_surrogates=2'
+                         'mse',             -- split criterion
+                         NULL::text,        -- no grouping
+                         NULL::text,        -- no weights
+                         10,                -- max depth
+                         8,                 -- min split
+                         3,                 -- number of bins per continuous variable
+                         10,                -- number of splits
+                         NULL,              -- pruning parameters
+                         'max_surrogates=2' -- number of surrogates
                          );
 </pre>
 
--# Display the decision tree in basic text format.
+-# Display the decision tree in basic text format:
 <pre class="example">
 SELECT madlib.tree_display('train_output', FALSE);
 </pre>
@@ -757,7 +780,7 @@ Result:
 (1 row)
 </pre>
 
--# Display the surrogates in the decision tree.
+-# Display the surrogates in the decision tree:
 <pre class="example">
 SELECT madlib.tree_surr_display('train_output');
 </pre>
@@ -789,7 +812,6 @@ Result:
 &nbsp;-------------------------------------
 (1 row)
 </pre>
-
 @note The 'cyl' parameter above has two tuples with null values.
 In the prediction example below, the surrogate splits for the
 <em>cyl in {8, 6}</em> split are used to predict those
@@ -799,14 +821,14 @@ the two tuples have non-NULL values for <em>disp</em>, hence the <em>disp > 146.
 split is used to make the prediction. If all the surrogate variables had been
 NULL then the majority branch would have been followed.
 
--# Predict regression output for the same data and compare with original.
+-# Predict regression output for the same data and compare with original:
 <pre class="example">
 DROP TABLE IF EXISTS prediction_results;
 SELECT madlib.tree_predict('train_output',
                            'mt_cars',
                            'prediction_results',
                            'response');
-SELECT s.id, mpg, estimated_mpg FROM prediction_results p, mt_cars s where s.id = p.id;
+SELECT s.id, mpg, estimated_mpg FROM prediction_results p, mt_cars s where s.id = p.id ORDER BY id;
 </pre>
 Result:
 <pre class="result">

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/29acc538/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in
index 0cc6534..3d4da87 100644
--- a/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in
@@ -24,17 +24,15 @@ m4_include(`SQLCommon.m4')
 </ul></div>
 
 @brief
-Random forests are an ensemble learning method for classification (and
-regression) that operate by constructing a multitude of decision trees at
-training time and outputting the class that is the mode of the classes output by
+Random forests are an ensemble learning method for classification and
+regression that construct a multitude of decision trees at
+training time, then produce the class that is the mode of the classes of the
 individual trees.
 
 Random forests build an ensemble of classifiers, each of which is a tree model
 constructed using bootstrapped samples from the input data. The results of these
-models are then combined to yield a single prediction, which, although at the
+models are then combined to yield a single prediction, which, at the
 expense of some loss in interpretation, has been found to be highly accurate.
-Such methods of using multiple Random Forests to make predictions are called
-random forest methods.
 
 @anchor train
 @par Training Function
@@ -64,12 +62,12 @@ forest_train(training_table_name,
 \b Arguments
 <dl class="arglist">
   <dt>training_table_name</dt>
-  <dd>text. the name of the table containing the training data.</dd>
+  <dd>text. Name of the table containing the training data.</dd>
 
   <dt>output_table_name</dt>
-  <dd>text. the name of the generated table containing the model.</dd>
+  <dd>text. Name of the generated table containing the model.</dd>
 
-    The model table produced by the train function contains the following columns:
+    The model table produced by the training function contains the following columns:
 
     <table class="output">
       <tr>
@@ -78,16 +76,16 @@ forest_train(training_table_name,
       </tr>
       <tr>
         <th>sample_id</th>
-        <td>integer. id of the bootstrap sample that this tree is a part of.</td>
+        <td>integer. The id of the bootstrap sample that this tree is a part of.</td>
       </tr>
       <tr>
         <th>tree</th>
-        <td>bytea8. trained tree model stored in binary format.</td>
+        <td>bytea8. Trained tree model stored in binary format.</td>
       </tr>
     </table>
 
     A summary table named <em>\<model_table\>_summary</em> is also created at
-    the same time, which has the following columns:
+    the same time, which contains the following columns:
     <table class="output">
     <tr>
       <th>method</th>
@@ -101,12 +99,12 @@ forest_train(training_table_name,
 
     <tr>
       <th>source_table</th>
-      <td>text. The data source table name.</td>
+      <td>text. Data source table name.</td>
     </tr>
 
     <tr>
       <th>model_table</th>
-      <td>text. The model table name.</td>
+      <td>text. Model table name.</td>
     </tr>
 
     <tr>
@@ -116,22 +114,22 @@ forest_train(training_table_name,
 
     <tr>
       <th>dependent_varname</th>
-      <td>text. The dependent variable.</td>
+      <td>text. Dependent variable.</td>
     </tr>
 
     <tr>
       <th>independent_varname</th>
-      <td>text. The independent variables</td>
+      <td>text. Independent variables.</td>
     </tr>
 
     <tr>
       <th>cat_features</th>
-      <td>text. categorical feature names.</td>
+      <td>text. Categorical feature names.</td>
     </tr>
 
     <tr>
       <th>con_features</th>
-      <td>text. continuous feature names.</td>
+      <td>text. Continuous feature names.</td>
     </tr>
 
     <tr>
@@ -161,27 +159,27 @@ forest_train(training_table_name,
 
     <tr>
       <th>min_bucket</th>
-      <td>int. minimum number of observations in any terminal node.</td>
+      <td>int. Minimum number of observations in any terminal node.</td>
     </tr>
 
     <tr>
       <th>num_splits</th>
-      <td>int. number of buckets for continuous variables.</td>
+      <td>int. Number of buckets for continuous variables.</td>
     </tr>
 
     <tr>
       <th>verbose</th>
-      <td>boolean. whether or not to display debug info.</td>
+      <td>boolean. Whether or not to display debug info.</td>
     </tr>
 
     <tr>
       <th>importance</th>
-      <td>boolean. whether or not to calculate variable importance.</td>
+      <td>boolean. Whether or not to calculate variable importance.</td>
     </tr>
 
     <tr>
       <th>num_permutations</th>
-      <td>int. number of times feature values are permuted while calculating
+      <td>int. Number of times feature values are permuted while calculating
       variable importance. The default value is 1.</td>
     </tr>
 
@@ -221,7 +219,7 @@ forest_train(training_table_name,
 
       <tr>
         <th>gid</th>
-        <td>integer. group id that uniquely identifies a set of grouping column values.</td>
+        <td>integer. Group id that uniquely identifies a set of grouping column values.</td>
       </tr>
 
       <tr>
@@ -303,13 +301,16 @@ forest_train(training_table_name,
       features to randomly select at each split.</DD>
 
   <DT>importance (optional)</DT>
-  <DD>boolean, default: true. Whether or not to calculate variable importance.</DD>
+  <DD>boolean, default: true. Whether or not to calculate variable importance.
+  If set to true, variable importance for categorical and continuous features
+  will be output in the group table <em>\<model_table\>_group</em> described 
+  above. Calculating variable importance increases run time.</DD>
 
   <DT>num_permutations (optional)</DT>
   <DD>integer, default: 1. Number of times to permute each feature value while
       calculating variable importance.
 
-      Variable importance for a feature is computed by permuting the variable with
+      @note Variable importance for a feature is computed by permuting the variable with
       random values and computing the drop in predictive accuracy (using OOB samples).
       Setting this greater than 1 performs an average over multiple importance
       calculation. This increases the total run time and in most cases
@@ -356,13 +357,12 @@ forest_train(training_table_name,
     table is expected to be used for training each tree in the forest. A ratio that
     is close to 0 may result in trees with only the root node.
     This allows users to experiment with the function in a speedy fashion.</DD>
-
+</DL>
     @note The main parameters that affect memory usage are:  depth of tree, number
-    of features, and number of values per feature.  If you are hitting VMEM limits,
+    of features, and number of values per feature (controlled by num_splits).  
+    If you are hitting VMEM limits,
     consider reducing one or more of these parameters.
 
-</DL>
-
 @anchor predict
 @par Prediction Function
 The prediction function is provided to estimate the conditional mean given a new
@@ -397,8 +397,8 @@ forest_predict(random_forest_model,
 
 @anchor get_tree
 @par Display Function
-The get_tree function is provided to output a graph representation of a
-single tree of the Random Forest. The output can either be in the popular
+The 'get_tree' function is provided to output a graph representation of a
+single tree of the random forest. The output can either be in the popular
 'dot' format that can be visualized using various programs including those
 in the GraphViz package, or in a simple text format. The details of the
 text format are output with the tree.
@@ -446,7 +446,9 @@ are NULL, then the majority branch is used to compute the split for a tuple.
 
     <DT>verbose (optional)</DT>
     <DD>boolean, default = FALSE. If true, the dot format output will contain
-    additional information (impurity, sample size, etc.)</DD>
+    additional information (impurity, sample size, number of weighted rows for 
+    each response variable, classification or prediction if the tree was 
+    pruned at this level)</DD>
 </DL>
 
 The output is always returned as a 'TEXT'. For the dot format, the output can be
@@ -459,7 +461,7 @@ programs.
 
 <b>Random Forest Classification Example</b>
 
--# Prepare input data.
+-# Prepare input data:
 <pre class="example">
 DROP TABLE IF EXISTS dt_golf;
 CREATE TABLE dt_golf (
@@ -469,7 +471,7 @@ CREATE TABLE dt_golf (
     humidity double precision,
     windy text,
     class text
-) ;
+);
 </pre>
 <pre class="example">
 INSERT INTO dt_golf (id,"OUTLOOK",temperature,humidity,windy,class) VALUES
@@ -489,7 +491,7 @@ INSERT INTO dt_golf (id,"OUTLOOK",temperature,humidity,windy,class) VALUES
 (14, 'rain', 71, 80, 'true', 'Don''t Play');
 </pre>
 
--# Run Random Forest train function.
+-# Run the random forest training function and view summary output:
 <pre class="example">
 DROP TABLE IF EXISTS train_output, train_output_group, train_output_summary;
 SELECT madlib.forest_train('dt_golf',         -- source table
@@ -510,35 +512,79 @@ SELECT madlib.forest_train('dt_golf',         -- source table
                            );
 \\x on
 SELECT * FROM train_output_summary;
+</pre>
+Result:
+<pre class="result">
+-[ RECORD 1 ]---------+-----------------------------------------------
+method                | forest_train
+is_classification     | t
+source_table          | dt_golf
+model_table           | train_output
+id_col_name           | id
+dependent_varname     | class
+independent_varnames  | "OUTLOOK",windy,temperature,humidity
+cat_features          | "OUTLOOK",windy
+con_features          | temperature,humidity
+grouping_cols         | 
+num_trees             | 20
+num_random_features   | 2
+max_tree_depth        | 8
+min_split             | 3
+min_bucket            | 1
+num_splits            | 10
+verbose               | f
+importance            | t
+num_permutations      | 1
+num_all_groups        | 1
+num_failed_groups     | 0
+total_rows_processed  | 14
+total_rows_skipped    | 0
+dependent_var_levels  | "Don't Play","Play"
+dependent_var_type    | text
+independent_var_types | text, text, double precision, double precision
+</pre>
+View the group table output:
+<pre class="example">
 SELECT * FROM train_output_group;
-\\x off
+</pre>
+Result:
+<pre class="result">
+-[ RECORD 1 ]------+----------------------------------------
+gid                | 1
+success            | t
+cat_n_levels       | {3,2}
+cat_levels_in_text | {overcast,rain,sunny,false,true}
+oob_error          | 0.50000000000000000000
+cat_var_importance | {-0.206309523809524,-0.234345238095238}
+con_var_importance | {-0.308690476190476,-0.272678571428571}
 </pre>
 
 -# Obtain a dot format display of a single tree
-within the forest.
+within the forest:
 <pre class="example">
+\\x off
 SELECT madlib.get_tree('train_output',1,2);
 </pre>
 Result:
 <pre class="result">
-digraph "Classification tree for dt_golf" {
-"0" [label="temperature<=70", shape=ellipse];
-"0" -> "1"[label="yes"];
-"1" [label="\"'Play'\"",shape=box];
-"0" -> "2"[label="no"];
-"2" [label="\"OUTLOOK\"<={overcast}", shape=ellipse];
-"2" -> "5"[label="yes"];
-"5" [label="\"'Play'\"",shape=box];
-"2" -> "6"[label="no"];
-"6" [label="humidity<=70", shape=ellipse];
-"6" -> "13"[label="yes"];
-"13" [label="\"'Play'\"",shape=box];
-"6" -> "14"[label="no"];
-"14" [label="\"'Don''t Play'\"",shape=box];
-} //---end of digraph---------
+ digraph "Classification tree for dt_golf" {                 
+ "0" [label="humidity <= 75", shape=ellipse];                
+ "0" -> "1"[label="yes"];                                    
+ "1" [label="\"Play\"",shape=box];                           
+ "0" -> "2"[label="no"];                                     
+ "2" [label="humidity <= 80", shape=ellipse];                
+ "2" -> "5"[label="yes"];                                    
+ "5" [label="\"Don't Play\"",shape=box];                     
+ "2" -> "6"[label="no"];                                     
+ "6" [label="\"OUTLOOK\" in {overcast,rain}", shape=ellipse];
+ "6" -> "13"[label="yes"];                                   
+ "13" [label="\"Play\"",shape=box];                          
+ "6" -> "14"[label="no"];                                    
+ "14" [label="\"Don't Play\"",shape=box];                                                                               
+ } //---end of digraph--------- 
 </pre>
 
--# Obtain a text display of the tree
+-# Obtain a text display of the tree:
 <pre class="example">
 SELECT madlib.get_tree('train_output',1,2,FALSE);
 </pre>
@@ -553,17 +599,17 @@ second indented child (2i+2) is the 'False' node.
 &nbsp;- Number of (weighted) rows for each response variable inside [].
 &nbsp;- Order of values = ['"Don\'t Play"', '"Play"']
 &nbsp;-------------------------------------
-(0)[  3 11]  temperature<=70
-    (1)[ 0 7]  * --> "'Play'"
-    (2)[ 3 4]  "OUTLOOK"<={overcast}
-       (5)[ 0 3]  * --> "'Play'"
-       (6)[ 3 1]  humidity<=70
-          (13)[ 0 1]  * --> "'Play'"
-          (14)[ 3 0]  * --> "'Don''t Play'"
+ (0)[ 4 10]  humidity <= 75                                               
+    (1)[0 7]  * --> "Play"                                                
+    (2)[4 3]  humidity <= 80                                              
+       (5)[3 1]  * --> "Don't Play"                                       
+       (6)[1 2]  "OUTLOOK" in {overcast,rain}                             
+          (13)[0 2]  * --> "Play"                                        
+          (14)[1 0]  * --> "Don't Play"                                   
 &nbsp;-------------------------------------
 </pre>
 
--# Predict output categories for the same data as was used for input.
+-# Predict output categories for the same data as was used for input:
 <pre class="example">
 DROP TABLE IF EXISTS prediction_results;
 SELECT madlib.forest_predict('train_output',
@@ -577,7 +623,7 @@ ORDER BY id;
 </pre>
 Result:
 <pre class="result">
- id | estimated_class |   class
+  id | estimated_class |   class    
 ----+-----------------+------------
   1 | Don't Play      | Don't Play
   2 | Don't Play      | Don't Play
@@ -596,7 +642,7 @@ Result:
 (14 rows)
 </pre>
 
--# Predict probablities of output categories for the same data.
+-# Predict probabilities of output categories for the same data:
 <pre class="example">
 DROP TABLE IF EXISTS prediction_prob;
 SELECT madlib.forest_predict('train_output',
@@ -610,29 +656,29 @@ ORDER BY id;
 </pre>
 Result:
 <pre class="result">
- id | estimated_prob_Play |   class
+ id | estimated_prob_Play |   class    
 ----+---------------------+------------
-  1 |                0.15 | Don't Play
-  2 |                 0.1 | Don't Play
+  1 |                0.05 | Don't Play
+  2 |                0.15 | Don't Play
   3 |                0.95 | Play
-  4 |                 0.7 | Play
-  5 |                0.85 | Play
-  6 |                0.25 | Don't Play
-  7 |                0.75 | Play
+  4 |                0.65 | Play
+  5 |                0.75 | Play
+  6 |                 0.4 | Don't Play
+  7 |                 0.7 | Play
   8 |                 0.1 | Don't Play
-  9 |                0.85 | Play
- 10 |                 0.7 | Play
- 11 |                0.35 | Play
- 12 |                0.75 | Play
- 13 |                0.95 | Play
- 14 |                0.15 | Don't Play
+  9 |                 0.9 | Play
+ 10 |                0.85 | Play
+ 11 |                 0.8 | Play
+ 12 |                 0.7 | Play
+ 13 |                   1 | Play
+ 14 |                 0.4 | Don't Play
 (14 rows)
 </pre>
 
 
 <b>Random Forest Regression Example</b>
 
--# Prepare input data.
+-# Prepare input data:
 <pre class="example">
 DROP TABLE IF EXISTS mt_cars;
 CREATE TABLE mt_cars (
@@ -648,7 +694,7 @@ CREATE TABLE mt_cars (
     am integer,
     gear integer,
     carb integer
-) ;
+);
 </pre>
 <pre class="example">
 INSERT INTO mt_cars (id,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb) VALUES
@@ -686,7 +732,7 @@ INSERT INTO mt_cars (id,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb) VALUES
 (32,21.4,4,121,109,4.11,2.78,18.6,1,1,4,2);
 </pre>
 
--# Run Random Forest train function.
+-# Run the random forest training function:
 <pre class="example">
 DROP TABLE IF EXISTS mt_cars_output, mt_cars_output_group, mt_cars_output_summary;
 SELECT madlib.forest_train('mt_cars',
@@ -711,7 +757,7 @@ SELECT * FROM mt_cars_output_group;
 \\x off
 </pre>
 
--# Display a single tree of the Random Forest in dot format.
+-# Display a single tree of the random forest in dot format:
 <pre class="example">
 SELECT madlib.get_tree('mt_cars_output',1,1);
 </pre>
@@ -722,7 +768,7 @@ digraph "Regression tree for mt_cars" {
 } //---end of digraph---------
 </pre>
 
--# Predict regression output for the same data and compare with original.
+-# Predict regression output for the same data and compare with original:
 <pre class="example">
 DROP TABLE IF EXISTS prediction_results;
 SELECT madlib.forest_predict('mt_cars_output',

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/29acc538/src/ports/postgres/modules/summary/summary.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/summary/summary.sql_in b/src/ports/postgres/modules/summary/summary.sql_in
index 5457463..9f56b4d 100644
--- a/src/ports/postgres/modules/summary/summary.sql_in
+++ b/src/ports/postgres/modules/summary/summary.sql_in
@@ -186,7 +186,7 @@ provide an array of quantile values for the parameter 'ntile_array'.
 <dt>how_many_mfv (optional)</dt>
 <dd>INTEGER, default: 10. The number of most-frequent-values to compute.</dd>
 <dt>get_estimates (optional)</dt>
-<dd>BOOLEAN, default TRUE. If TRUE, estimated values are produced. If FALSE, exact values are calculated.</dd>
+<dd>BOOLEAN, default TRUE. If TRUE, estimated values are produced for distinct values and most frequent values. If FALSE, exact values are calculated (may take longer to run depending on data size).</dd>
 </DL>
 
 

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/29acc538/src/ports/postgres/modules/utilities/path.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/path.sql_in b/src/ports/postgres/modules/utilities/path.sql_in
index 9f58536..f97c6a0 100644
--- a/src/ports/postgres/modules/utilities/path.sql_in
+++ b/src/ports/postgres/modules/utilities/path.sql_in
@@ -185,8 +185,7 @@ wine selection page, and checkout.  Other pages on the site like help
 pages show up in the logs as well. Let’s assume that the log has been
 sessionized.
 
-Create the date table:
-
+-# Create the data table:
 <pre class="example">
 DROP TABLE IF EXISTS eventlog;
 CREATE TABLE eventlog (event_timestamp TIMESTAMP,

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/29acc538/src/ports/postgres/modules/utilities/pivot.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/pivot.sql_in b/src/ports/postgres/modules/utilities/pivot.sql_in
index 1c36cef..cb2c223 100644
--- a/src/ports/postgres/modules/utilities/pivot.sql_in
+++ b/src/ports/postgres/modules/utilities/pivot.sql_in
@@ -138,6 +138,9 @@ allowed so NULLs are ignored.
 - It is not allowed to set the fill_value parameter without setting the
 aggregate_func parameter due to possible ambiguity. Set
 aggregate_func to NULL for the default behavior and use fill_value as desired.
+Please note that fill_value must be of the same type as the output of the
+aggregate_func (or capable of being cast to the same type by PostgreSQL), 
+or else an error will result.
 - It is not allowed to set the output_col_dictionary parameter without setting
 the keep_null parameter due to possible ambiguity. Set
 keep_null to NULL for the default behavior and use output_col_dictionary as

[23/50] [abbrv] incubator-madlib git commit: Encode categorical: Allow svec array output

Posted by ri...@apache.org.

Encode categorical: Allow svec array output

- Replace input parameter `array_output' with `output_type'
- Add cast to 'madlib.svec' if output_type = 'svec'

Closes #93


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/59a09eeb
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/59a09eeb
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/59a09eeb

Branch: refs/heads/latest_release
Commit: 59a09eeb4e0bbf4d63238a3e87cea90d020d24c2
Parents: 0829479
Author: Satoshi Nagayasu <sn...@uptime.jp>
Authored: Thu Jan 26 00:54:59 2017 +0900
Committer: Rahul Iyer <ri...@apache.org>
Committed: Wed Feb 1 16:16:39 2017 -0800

----------------------------------------------------------------------
 .../modules/utilities/encode_categorical.py_in  | 23 ++++++++++++--------
 .../modules/utilities/encode_categorical.sql_in | 10 ++++-----
 .../utilities/test/encode_categorical.sql_in    |  8 +++----
 3 files changed, 23 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/59a09eeb/src/ports/postgres/modules/utilities/encode_categorical.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/encode_categorical.py_in b/src/ports/postgres/modules/utilities/encode_categorical.py_in
index 5d7ead4..26361a6 100644
--- a/src/ports/postgres/modules/utilities/encode_categorical.py_in
+++ b/src/ports/postgres/modules/utilities/encode_categorical.py_in
@@ -60,7 +60,7 @@ class CategoricalEncoder(object):
                  top=None,
                  value_to_drop=None,
                  encode_null=False,
-                 array_output=False,
+                 output_type='column',
                  output_dictionary=False,
                  distributed_by=None,
                  **kwargs):
@@ -74,7 +74,7 @@ class CategoricalEncoder(object):
         self.top = top
         self.value_to_drop = value_to_drop
         self.encode_null = encode_null
-        self.array_output = array_output
+        self.output_type = output_type
         self.output_dictionary = output_dictionary
         self.distributed_by = distributed_by
 
@@ -121,10 +121,15 @@ class CategoricalEncoder(object):
                                     if (c not in self._output_cols and
                                         unquote_ident(c) not in self._output_cols)])
 
-        if self.array_output:
+        if self.output_type == 'array':
             categorical_col_str = ("ARRAY[{0}] AS {1}".
                                    format(categorical_col_str,
                                           self._array_out_name))
+        elif self.output_type == 'svec':
+            categorical_col_str = ("ARRAY[{0}]::float8[]::{1}.svec AS {2}".
+                                   format(categorical_col_str,
+                                          self.schema_madlib,
+                                          self._array_out_name))
         out_sql = """
             CREATE TABLE {out} AS (
                 SELECT
@@ -154,7 +159,7 @@ class CategoricalEncoder(object):
         self._row_id_cols = split_quoted_delimited_str(self.row_id)
 
         # flag to build a dictionary table
-        self._output_dictionary = True if self.array_output else self.output_dictionary
+        self._output_dictionary = True if self.output_type in ['array', 'svec'] else self.output_dictionary
 
         # how to distribute the output table (for distributed platforms)
         if not is_platform_pg():
@@ -295,7 +300,7 @@ class CategoricalEncoder(object):
                 value_str = "= '{v}'".format(v=str(v))
                 v_type = str
 
-            if not self.array_output:
+            if self.output_type not in ('array', 'svec'):
                 # array_output = True implies all the case outputs will be wrapped
                 # as an array, hence not requiring an alias for each case
                 if not self._output_dictionary:
@@ -350,7 +355,7 @@ class CategoricalEncoder(object):
             values = col_to_values[col]
             local_seq = count(1)
             col_no_quotes = strip_end_quotes(col)
-            if self.array_output:
+            if self.output_type != 'column':
                 encoded_col_name = "__encoded_variables__"
                 seq = global_seq
             else:
@@ -549,7 +554,7 @@ def encode_categorical_variables(
         top=None,
         value_to_drop=None,
         encode_null=False,
-        array_output=None,
+        output_type='column',
         output_dictionary=False,
         distributed_by=None,
         **kwargs):
@@ -573,7 +578,7 @@ def encode_categorical_variables(
         encoder = CategoricalEncoder(schema_madlib, source_table, output_table,
                                      categorical_cols, categorical_cols_to_exclude,
                                      row_id, top, value_to_drop, encode_null,
-                                     array_output, output_dictionary,
+                                     output_type, output_dictionary,
                                      distributed_by)
         encoder.build_output_table()
     return None
@@ -639,7 +644,7 @@ SELECT {madlib}.encode_categorical_variables (
         value_to_drop,                  -- (Optional) Reference value to drop for each column
         encode_null,                    -- (Optional) Whether NULL should be treated as one of the
                                         --  values of the categorical variable.
-        array_output,                   -- (Optional) Get all encoded variables in an array
+        output_type,                    -- (Optional) Get all encoded variables in an array
         output_dictionary,              -- (Optional) Simplify output column naming and provide
                                         --  a mapping between simple names and meaning
         distributed_by                  -- (Optional) Columns to use for the distribution policy of

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/59a09eeb/src/ports/postgres/modules/utilities/encode_categorical.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/encode_categorical.sql_in b/src/ports/postgres/modules/utilities/encode_categorical.sql_in
index f591590..c4151b2 100644
--- a/src/ports/postgres/modules/utilities/encode_categorical.sql_in
+++ b/src/ports/postgres/modules/utilities/encode_categorical.sql_in
@@ -79,7 +79,7 @@ encode_categorical_variables (
         top,                            -- Optional
         value_to_drop,                  -- Optional
         encode_null,                    -- Optional
-        array_output,                   -- Optional
+        output_type,                    -- Optional
         output_dictionary,              -- Optional
         distributed_by                  -- Optional
         )
@@ -660,7 +660,7 @@ SELECT madlib.encode_categorical_variables (
  * @param top Parameter to include only top values of a categorical variable
  * @param value_to_drop Parameter to set reference column in dummy coding
  * @param encode_null Boolean to determine the behavior for rows with NULL value
- * @param array_output Boolean to determine if output should be in an array or columns
+ * @param output_type Parameter to set output data type: 'column', 'array' or 'svec'
  * @param output_dictionary Boolean to simplify column naming and with a separate
  *                              mapping table to actual values
  * @param distributed_by Comma-separated list of column names to use for distribution of output
@@ -682,7 +682,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.encode_categorical_variables(
     top                             VARCHAR,
     value_to_drop                   VARCHAR,
     encode_null                     BOOLEAN,
-    array_output                    BOOLEAN,
+    output_type                     VARCHAR,
     output_dictionary               BOOLEAN,
     distributed_by                  VARCHAR
 ) RETURNS VOID AS $$
@@ -701,7 +701,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.encode_categorical_variables(
     top                             VARCHAR,
     value_to_drop                   VARCHAR,
     encode_null                     BOOLEAN,
-    array_output                    BOOLEAN,
+    output_type                     VARCHAR,
     output_dictionary               BOOLEAN
 ) RETURNS VOID AS $$
     PythonFunction(utilities, encode_categorical, encode_categorical_variables)
@@ -720,7 +720,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.encode_categorical_variables(
     top                             VARCHAR,
     value_to_drop                   VARCHAR,
     encode_null                     BOOLEAN,
-    array_output                    BOOLEAN
+    output_type                     VARCHAR
 ) RETURNS VOID AS $$
     PythonFunction(utilities, encode_categorical, encode_categorical_variables)
 $$ LANGUAGE plpythonu

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/59a09eeb/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in b/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in
index 0c451a4..79cfba9 100644
--- a/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in
+++ b/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in
@@ -70,7 +70,7 @@ select * from abalone_out2;
 select madlib.encode_categorical_variables('abalone', 'abalone_out3',
                                            'sex, "Class"', 'class',
                                            'id, sex, "Class"', '2', 'sex=M, Class=1',
-                                           true, false, false
+                                           true, 'column', false
                                            );
 select * from abalone_out3;
 
@@ -78,7 +78,7 @@ select * from abalone_out3;
 select madlib.encode_categorical_variables('abalone', 'abalone_out4',
                                            '*', '"Class"',
                                            'id', '2', 'M',
-                                           true, false, false
+                                           true, 'column', false
                                            );
 select * from abalone_out4;
 
@@ -87,7 +87,7 @@ select * from abalone_out4;
 select madlib.encode_categorical_variables('abalone', 'abalone_out5',
                                            'sex, "Class"', '',
                                            'id', '0.5', 'M',
-                                           true, true, false
+                                           true, 'array', false
                                            );
 select * from abalone_out5;
 select * from abalone_out5_dictionary order by index;
@@ -96,7 +96,7 @@ select * from abalone_out5_dictionary order by index;
 select madlib.encode_categorical_variables('abalone', 'abalone_out6',
                                            'sex, "Class"', '',
                                            'id', '3', 'class=1',
-                                           true, false, true
+                                           true, 'svec', true
                                            );
 select * from abalone_out6;
 select * from abalone_out6_dictionary order by variable, index;

[49/50] [abbrv] incubator-madlib git commit: Build: Fix module sort order for PGXN installation

Posted by ri...@apache.org.

Build: Fix module sort order for PGXN installation

JIRA: MADLIB-1024

PGXN installation involves creating a single extension sql file that
contains all the SQL commands run during MADlib deployment. The modules
added into this extension file are to be placed in the right order,
taking dependencies into account.

MADlib has a function that compares a given file path with the topologically
sorted modules to decide the order of concatenation into the extension file.
This comparison is faulty since the module name is searched for in the
whole path, leading to false positives for modules whose name contains
another module name as a substring.  The specific bug reported was 'svec_util'
being given the same sort order as 'svec'.

This commit fixes the issue by taking advantage of the file paths being
of the form '.../modules/<module_name>/...', extracting the module name
from the path and comparing it as a whole.

Closes #106


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/fa80240f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/fa80240f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/fa80240f

Branch: refs/heads/latest_release
Commit: fa80240f72a6551c2ee567d471afa499fd1d1efe
Parents: 0b8415e
Author: Rahul Iyer <ri...@apache.org>
Authored: Fri Feb 24 14:32:32 2017 -0800
Committer: Rahul Iyer <ri...@apache.org>
Committed: Mon Feb 27 13:13:18 2017 -0800

----------------------------------------------------------------------
 src/madpack/sort-module.py | 77 +++++++++++++++++++++++++++++++++++------
 1 file changed, 66 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/fa80240f/src/madpack/sort-module.py
----------------------------------------------------------------------
diff --git a/src/madpack/sort-module.py b/src/madpack/sort-module.py
index 7312e95..3c4691d 100644
--- a/src/madpack/sort-module.py
+++ b/src/madpack/sort-module.py
@@ -1,7 +1,24 @@
 #!/usr/bin/env python
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
 
 """
-sort-module.py
+@file sort-module.py
 
 Sort input strings based on the module name in them such that
 the module dependencies are resolved.  Example:
@@ -13,21 +30,59 @@ Note this script assumes to be run at the exact current directory,
 so you need change directory first to run it.
 """
 
+import re
+import sys
+
 import configyml
 
-portspecs = configyml.get_modules("../config")
 
-def find_order(path):
+def get_modules_in_order():
+    """ Return a mapping between module_name and the order of the module.
+
+    Returns:
+        Dict. The output is of the form:
+            {'module_a': 1, 'module_b': 2, ...}
+    """
+    portspecs = configyml.get_modules("../config")
+    # get_modules returns a dictionary in following form:
+    #   { 'modules': [{'name': 'array_ops'}, {'name': 'bayes'}, ...,
+    #                 {'depends': ['array_ops'], 'name': 'stats'}, ... ]
+    #                 }
+    # The list of modules is pre-sorted using topological sort
+    module_order = dict()
     for i, moduleinfo in enumerate(portspecs['modules']):
-        modname = moduleinfo['name']
-        if modname in path:
-            return i
-    # return as the last if not found.
-    return len(portspecs['modules'])
+        module_order[moduleinfo['name']] = i
+    return module_order
+# ----------------------------------------------------------------------
+
+
+module_order = get_modules_in_order()
+
+# Not all modules are captured in the config file. For a missing module, assume
+# the sort rank is maximum (i.e. missing modules are installed last).
+MAX_RANK = len(module_order) + 1
+
+
+def find_order(path):
+    """ Return the position of a given file within the module order
+
+    Args:
+        @param: path: str, The path for a single SQL file. This path is assumed
+                to have the form: '.../modules/<module_name>/.../<file_name>.sql'
+    """
+    mod_name = re.match(r'.+/modules/(.+)/.+', path).group(1)
+    return module_order.get(mod_name, MAX_RANK)
+
+
+def main(file_paths):
+    """
+    Args:
+        @param: file_paths: List of paths to SQL files, where each path
+                is of the form: '.../modules/<module_name>/...'.
+    """
+    file_order = sorted(file_paths, key=find_order)
+    print " ".join(file_order)
 
-def main(args):
-    print " ".join(sorted(args, key = find_order))
 
 if __name__ == '__main__':
-    import sys
     main(sys.argv[1:])

[02/50] [abbrv] incubator-madlib git commit: DT and RF: Adds verbose option for the dot output format.

Posted by ri...@apache.org.

DT and RF: Adds verbose option for the dot output format.

JIRA: MADLIB-1051

Closes #86


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/02f4602a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/02f4602a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/02f4602a

Branch: refs/heads/latest_release
Commit: 02f4602a5554491c1a6d96654d34da29e5275254
Parents: c56b209
Author: Orhan Kislal <ok...@pivotal.io>
Authored: Mon Jan 9 17:04:54 2017 -0800
Committer: Orhan Kislal <ok...@pivotal.io>
Committed: Mon Jan 9 17:04:54 2017 -0800

----------------------------------------------------------------------
 src/modules/recursive_partitioning/DT_impl.hpp  |  70 ++++++-----
 src/modules/recursive_partitioning/DT_proto.hpp |  10 +-
 .../recursive_partitioning/decision_tree.cpp    |   9 +-
 .../recursive_partitioning/decision_tree.py_in  |   6 +-
 .../recursive_partitioning/decision_tree.sql_in | 116 ++++++++++++++-----
 .../recursive_partitioning/random_forest.py_in  |   6 +-
 .../recursive_partitioning/random_forest.sql_in |  42 +++++--
 7 files changed, 179 insertions(+), 80 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/02f4602a/src/modules/recursive_partitioning/DT_impl.hpp
----------------------------------------------------------------------
diff --git a/src/modules/recursive_partitioning/DT_impl.hpp b/src/modules/recursive_partitioning/DT_impl.hpp
index 702ac2c..f622f94 100644
--- a/src/modules/recursive_partitioning/DT_impl.hpp
+++ b/src/modules/recursive_partitioning/DT_impl.hpp
@@ -1092,7 +1092,8 @@ string
 DecisionTree<Container>::displayLeafNode(
             Index id,
             ArrayHandle<text*> &dep_levels,
-            const std::string & id_prefix){
+            const std::string & id_prefix,
+            bool verbose){
     std::stringstream predict_str;
     if (static_cast<bool>(is_regression)){
         predict_str << predict_response(id);
@@ -1105,14 +1106,14 @@ DecisionTree<Container>::displayLeafNode(
     std::stringstream display_str;
     display_str << "\"" << id_prefix << id << "\" [label=\"" << predict_str.str();
 
-    // // uncomment below if distribution of rows is required in leaf node
-    // display_str << "\\n[";
-    // if (is_regression)
-    //      display_str << statCount(predictions.row(id)) << ", "
-    //                  << statPredict(predictions.row(id));
-    // else
-    //     display_str << predictions.row(id);
-    // display_str << "]";
+    if(verbose){
+        display_str << "\\n samples = " << statCount(predictions.row(id)) << "\\n value = ";
+        if (is_regression)
+            display_str << statPredict(predictions.row(id));
+        else{
+            display_str << "[" << predictions.row(id).head(n_y_labels)<< "]";
+        }
+    }
     display_str << "\",shape=box]" << ";";
     return display_str.str();
 }
@@ -1130,7 +1131,9 @@ DecisionTree<Container>::displayInternalNode(
             ArrayHandle<text*> &con_features_str,
             ArrayHandle<text*> &cat_levels_text,
             ArrayHandle<int> &cat_n_levels,
-            const std::string & id_prefix
+            ArrayHandle<text*> &dep_levels,
+            const std::string & id_prefix,
+            bool verbose
             ){
 
     string feature_name;
@@ -1149,16 +1152,26 @@ DecisionTree<Container>::displayInternalNode(
 
     std::stringstream display_str;
     display_str << "\"" << id_prefix << id << "\" [label=\"" << label_str.str();
-    // // uncomment below if distribution of rows is required in internal node
-    // display_str << "\\n[";
-    // if (is_regression)
-    //      display_str << statCount(predictions.row(id)) << ", "
-    //                  << statPredict(predictions.row(id));
-    // else
-    //     display_str << predictions.row(id);
-    // display_str << "]";
-    display_str <<"\", shape=ellipse]" << ";";
-   return display_str.str();
+    if(verbose){
+
+        display_str << "\\n impurity = "<< impurity(predictions.row(id)) << "\\n samples = " << statCount(predictions.row(id)) << "\\n value = ";
+        if (is_regression)
+            display_str << statPredict(predictions.row(id));
+        else{
+            display_str << "[" << predictions.row(id).head(n_y_labels)<< "]";
+        }
+        std::stringstream predict_str;
+        if (static_cast<bool>(is_regression)){
+            predict_str << predict_response(id);
+        }
+        else{
+            std::string dep_value = get_text(dep_levels, static_cast<int>(predict_response(id)));
+            predict_str << escape_quotes(dep_value);
+        }
+        display_str << "\\n class = " << predict_str.str();
+    }
+    display_str << "\", shape=ellipse]" << ";";
+    return display_str.str();
 }
 // -------------------------------------------------------------------------
 
@@ -1174,11 +1187,12 @@ DecisionTree<Container>::display(
         ArrayHandle<text*> &cat_levels_text,
         ArrayHandle<int> &cat_n_levels,
         ArrayHandle<text*> &dependent_levels,
-        const std::string &id_prefix) {
+        const std::string &id_prefix,
+        bool verbose) {
 
     std::stringstream display_string;
     if (feature_indices(0) == FINISHED_LEAF){
-        display_string << displayLeafNode(0, dependent_levels, id_prefix)
+        display_string << displayLeafNode(0, dependent_levels, id_prefix, verbose)
                        << std::endl;
     }
     else{
@@ -1189,7 +1203,7 @@ DecisionTree<Container>::display(
 
                 display_string << displayInternalNode(
                         index, cat_features_str, con_features_str,
-                        cat_levels_text, cat_n_levels, id_prefix) << std::endl;
+                        cat_levels_text, cat_n_levels, dependent_levels, id_prefix, verbose) << std::endl;
 
                 // Display the children
                 Index tc = trueChild(index);
@@ -1203,7 +1217,7 @@ DecisionTree<Container>::display(
                     if (feature_indices(tc) == IN_PROCESS_LEAF ||
                         feature_indices(tc) == FINISHED_LEAF)
                         display_string
-                            << displayLeafNode(tc, dependent_levels, id_prefix)
+                            << displayLeafNode(tc, dependent_levels, id_prefix, verbose)
                             << std::endl;
                 }
 
@@ -1218,7 +1232,7 @@ DecisionTree<Container>::display(
                     if (feature_indices(fc) == IN_PROCESS_LEAF ||
                         feature_indices(fc) == FINISHED_LEAF)
                         display_string
-                            << displayLeafNode(fc, dependent_levels, id_prefix)
+                            << displayLeafNode(fc, dependent_levels, id_prefix, verbose)
                             << std::endl;
                 }
             }
@@ -1664,7 +1678,7 @@ TreeAccumulator<Container, DTree>::operator<<(const surr_tuple_type& inTuple) {
                             updateSurrStats(true,
                                             is_primary_true == is_surrogate_true,
                                             row_index,
-                                            col_index, 
+                                            col_index,
                                             dup_count);
                         }
                     }
@@ -1731,7 +1745,7 @@ TreeAccumulator<Container, DTree>::updateNodeStats(bool is_regression,
                                                   const double weight) {
     ColumnVector stats(stats_per_split);
     stats.fill(0);
-    int n_rows = this->weights_as_rows ? static_cast<int>(weight) : 1; 
+    int n_rows = this->weights_as_rows ? static_cast<int>(weight) : 1;
     if (is_regression){
         double w_response = weight * response;
         stats << weight, w_response, w_response * response, n_rows;
@@ -1758,7 +1772,7 @@ TreeAccumulator<Container, DTree>::updateStats(bool is_regression,
                                                const double weight) {
     ColumnVector stats(stats_per_split);
     stats.fill(0);
-    int n_rows = this->weights_as_rows ? static_cast<int>(weight) : 1; 
+    int n_rows = this->weights_as_rows ? static_cast<int>(weight) : 1;
     if (is_regression){
         double w_response = weight * response;
         stats << weight, w_response, w_response * response, n_rows;

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/02f4602a/src/modules/recursive_partitioning/DT_proto.hpp
----------------------------------------------------------------------
diff --git a/src/modules/recursive_partitioning/DT_proto.hpp b/src/modules/recursive_partitioning/DT_proto.hpp
index 2f5e211..a2881a5 100644
--- a/src/modules/recursive_partitioning/DT_proto.hpp
+++ b/src/modules/recursive_partitioning/DT_proto.hpp
@@ -123,15 +123,17 @@ public:
     }
 
     uint16_t recomputeTreeDepth() const;
-    string displayLeafNode(Index id, ArrayHandle<text*> &dep_levels, const std::string & id_prefix);
+    string displayLeafNode(Index id, ArrayHandle<text*> &dep_levels, const std::string & id_prefix, bool verbose);
     string displayInternalNode(Index id,
                                ArrayHandle<text*> &cat_features_str,
                                ArrayHandle<text*> &con_features_str,
                                ArrayHandle<text*> &cat_levels_text,
                                ArrayHandle<int> &cat_n_levels,
-                               const std::string & id_prefix);
+                               ArrayHandle<text*> &dep_levels,
+                               const std::string & id_prefix,
+                               bool verbose);
     string display(ArrayHandle<text*>&, ArrayHandle<text*>&, ArrayHandle<text*>&,
-                   ArrayHandle<int>&, ArrayHandle<text*>&, const std::string&);
+                   ArrayHandle<int>&, ArrayHandle<text*>&, const std::string&, bool verbose);
     string getCatLabels(Index, Index, Index, ArrayHandle<text*> &,
                         ArrayHandle<int> &);
     string print_split(bool, bool, Index, double,
@@ -234,7 +236,7 @@ public:
                         MappedColumnVector,  // continuous feature values
                         MappedIntegerVector, // levels for each categorical feature
                         MappedMatrix,        // split values for each continuous feature
-                        int                  // duplicated count for each tuple 
+                        int                  // duplicated count for each tuple
                                              //   (used in random forest)
                        > surr_tuple_type;
 

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/02f4602a/src/modules/recursive_partitioning/decision_tree.cpp
----------------------------------------------------------------------
diff --git a/src/modules/recursive_partitioning/decision_tree.cpp b/src/modules/recursive_partitioning/decision_tree.cpp
index 8f74c07..7a8ec95 100644
--- a/src/modules/recursive_partitioning/decision_tree.cpp
+++ b/src/modules/recursive_partitioning/decision_tree.cpp
@@ -166,7 +166,7 @@ compute_leaf_stats_transition::run(AnyType & args){
                      static_cast<uint16_t>(con_features.size()),
                      static_cast<uint32_t>(cat_levels.sum()),
                      static_cast<uint16_t>(dt.tree_depth),
-                     stats_per_split, 
+                     stats_per_split,
                      weights_as_rows
                     );
         // compute cumulative sum of the levels of the categorical variables
@@ -408,10 +408,11 @@ display_decision_tree::run(AnyType &args) {
     ArrayHandle<int> cat_n_levels = args[4].getAs<ArrayHandle<int> >();
     ArrayHandle<text*> dependent_var_levels = args[5].getAs<ArrayHandle<text*> >();
     std::string id_prefix = args[6].getAs<std::string>();
+    bool verbose = args[7].getAs<bool>();
 
     string tree_str = dt.display(cat_feature_names, con_feature_names,
                                  cat_levels_text, cat_n_levels,
-                                 dependent_var_levels, id_prefix);
+                                 dependent_var_levels, id_prefix, verbose);
     return tree_str;
 }
 
@@ -893,9 +894,9 @@ void fill_one_row(MutableNativeMatrix &frame, Tree &dt, int me, int i,
 /*
  * PivotalR: randomForest
  * Convert to R's randomForest format for getTree(..) function
- * 
+ *
  */
-AnyType 
+AnyType
 convert_to_random_forest_format::run(AnyType &args) {
     Tree dt = args[0].getAs<ByteString>();
 

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/02f4602a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
index ccba636..40f4b7e 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
@@ -1791,7 +1791,7 @@ def _get_display_header(table_name, dep_levels, is_regression, dot_format=True):
 #------------------------------------------------------------------------------
 
 
-def tree_display(schema_madlib, model_table, dot_format=True,
+def tree_display(schema_madlib, model_table, dot_format=True, verbose=False,
                  disp_surr=False, **kwargs):
 
     if dot_format:
@@ -1862,9 +1862,9 @@ def tree_display(schema_madlib, model_table, dot_format=True,
                     return_str_list.append('\t label="{0}"'.format(group_name.replace('"', '\\"')))
                     sql = """
                             SELECT {0}._display_decision_tree(
-                                        $1, $2, $3, $4, $5, $6, '{1}'
+                                        $1, $2, $3, $4, $5, $6, '{1}', {2}
                                     ) as display_tree
-                          """.format(schema_madlib, "g" + str(index) + "_")
+                          """.format(schema_madlib, "g" + str(index) + "_", verbose)
                 else:
                     if group_name:
                         return_str_list.append("--- Tree for {0} ---".format(group_name))

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/02f4602a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
index a2538cf..d6b7b5a 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
@@ -412,7 +412,7 @@ be visualized using various programs including those in the GraphViz package, or
 in a simple text format. The details of the text format is outputted with the
 tree.
 <pre class="syntax">
-tree_display(tree_model, dot_format)
+tree_display(tree_model, dot_format, verbosity)
 </pre>
 
 An additional display function is provided to output the surrogate splits chosen
@@ -439,6 +439,9 @@ are NULL, then the majority branch is used to compute the split for a tuple.
     <DT>dot_format</DT>
     <DD>BOOLEAN, default = TRUE. Output can either be in a dot format or a text
     format. If TRUE, the result is in the dot format, else output is in text format</DD>
+    <DT>verbosity</DT>
+    <DD>BOOLEAN, default = FALSE. If true, the dot format output will contain
+    additional information (impurity, sample size, etc.)</DD>
 </DL>
 
 The output is always returned as a 'TEXT'. For the dot format, the output can be
@@ -487,20 +490,20 @@ CREATE TABLE dt_golf (
 </pre>
 <pre class="example">
 COPY dt_golf (id,"OUTLOOK",temperature,humidity,windy,class) FROM stdin WITH DELIMITER '|';
-1|sunny|85|85|'false'|'Don''t Play'
-2|sunny|80|90|'true'|'Don''t Play'
+1|sunny|85|85|'false'|'Don't Play'
+2|sunny|80|90|'true'|'Don't Play'
 3|overcast|83|78|'false'|'Play'
 4|rain|70|96|'false'|'Play'
 5|rain|68|80|'false'|'Play'
-6|rain|65|70|'true'|'Don''t Play'
+6|rain|65|70|'true'|'Don't Play'
 7|overcast|64|65|'true'|'Play'
-8|sunny|72|95|'false'|'Don''t Play'
+8|sunny|72|95|'false'|'Don't Play'
 9|sunny|69|70|'false'|'Play'
 10|rain|75|80|'false'|'Play'
 11|sunny|75|70|'true'|'Play'
 12|overcast|72|90|'true'|'Play'
 13|overcast|81|75|'false'|'Play'
-14|rain|71|80|'true'|'Don''t Play'
+14|rain|71|80|'true'|'Don't Play'
 \\.
 </pre>
 
@@ -532,28 +535,27 @@ SELECT * FROM prediction_results;
 </pre>
 Result:
 <pre class="result">
- id | estimated_class
 &nbsp;----+-----------------
-  1 | Don't Play
-  2 | Don't Play
-  3 | Play
-  4 | Play
-  5 | Play
-  6 | Don't Play
-  7 | Play
-  8 | Don't Play
-  9 | Play
- 10 | Play
- 11 | Play
- 12 | Play
- 13 | Play
- 14 | Don't Play
+  1 | 'Don't Play'
+  2 | 'Don't Play'
+  3 | 'Play'
+  4 | 'Play'
+  5 | 'Play'
+  6 | 'Don't Play'
+  7 | 'Play'
+  8 | 'Don't Play'
+  9 | 'Play'
+ 10 | 'Play'
+ 11 | 'Play'
+ 12 | 'Play'
+ 13 | 'Play'
+ 14 | 'Don't Play'
 (14 rows)
 </pre>
 
 -# Obtain a dot format display of the tree
 <pre class="example">
-SELECT madlib.tree_display('train_output');
+SELECT madlib.tree_display('train_output', TRUE);
 </pre>
 Result:
 <pre class="result">
@@ -585,6 +587,40 @@ digraph "Classification tree for dt_golf" {
 &nbsp;} //---end of digraph---------
 </pre>
 
+-# Obtain a dot format display of the tree with additional info
+<pre class="example">
+SELECT madlib.tree_display('train_output', TRUE, TRUE);
+</pre>
+Result:
+<pre class="result">
+digraph "Classification tree for dt_golf" {
+         subgraph "cluster0"{
+         label=""
+"g0_0" [label="\"OUTLOOK\" in {overcast}\\n impurity = 0.459184\\n samples = 14\\n value = [5 9]\\n class = \"'Play'\"", shape=ellipse];
+"g0_0" -> "g0_1"[label="yes"];
+"g0_1" [label="\"'Play'\"\\n samples = 4\\n value = [0 4]",shape=box];
+"g0_0" -> "g0_2"[label="no"];
+"g0_2" [label="temperature <= 75\\n impurity = 0.5\\n samples = 10\\n value = [5 5]\\n class = \"'Don't Play'\"", shape=ellipse];
+"g0_2" -> "g0_5"[label="yes"];
+"g0_2" -> "g0_6"[label="no"];
+"g0_6" [label="\"'Don't Play'\"\\n samples = 2\\n value = [2 0]",shape=box];
+"g0_5" [label="temperature <= 65\\n impurity = 0.46875\\n samples = 8\\n value = [3 5]\\n class = \"'Play'\"", shape=ellipse];
+"g0_5" -> "g0_11"[label="yes"];
+"g0_11" [label="\"'Don't Play'\"\\n samples = 1\\n value = [1 0]",shape=box];
+"g0_5" -> "g0_12"[label="no"];
+"g0_12" [label="temperature <= 70\\n impurity = 0.408163\\n samples = 7\\n value = [2 5]\\n class = \"'Play'\"", shape=ellipse];
+"g0_12" -> "g0_25"[label="yes"];
+"g0_25" [label="\"'Play'\"\\n samples = 3\\n value = [0 3]",shape=box];
+"g0_12" -> "g0_26"[label="no"];
+"g0_26" [label="temperature <= 72\\n impurity = 0.5\\n samples = 4\\n value = [2 2]\\n class = \"'Don't Play'\"", shape=ellipse];
+"g0_26" -> "g0_53"[label="yes"];
+"g0_53" [label="\"'Don't Play'\"\\n samples = 2\\n value = [2 0]",shape=box];
+"g0_26" -> "g0_54"[label="no"];
+"g0_54" [label="\"'Play'\"\\n samples = 2\\n value = [0 2]",shape=box];
+&nbsp;&nbsp;&nbsp;} //--- end of subgraph------------
+&nbsp;} //---end of digraph---------
+</pre>
+
 -# Obtain a text display of the tree
 <pre class="example">
 SELECT madlib.tree_display('train_output', FALSE);
@@ -1312,7 +1348,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_surr_display(
 ) RETURNS VARCHAR AS $$
 PythonFunctionBodyOnly(recursive_partitioning, decision_tree, tree_display)
     return decision_tree.tree_display(schema_madlib, model_table, dot_format=False,
-                                      disp_surr=True)
+                                      verbose=False, disp_surr=True)
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
 
@@ -1353,15 +1389,24 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
   */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_display(
     model_table    TEXT,
-    dot_format     BOOLEAN
+    dot_format     BOOLEAN,
+    verbose        BOOLEAN
 ) RETURNS VARCHAR AS $$
 PythonFunction(recursive_partitioning, decision_tree, tree_display)
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
 
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_display(
+    model_table    TEXT,
+    dot_format     BOOLEAN
+) RETURNS VARCHAR AS $$
+    SELECT MADLIB_SCHEMA.tree_display($1, $2, FALSE);
+$$ LANGUAGE SQL VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
+
 
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_display(
-    model_table             TEXT
+    model_table    TEXT
 ) RETURNS VARCHAR AS $$
     SELECT MADLIB_SCHEMA.tree_display($1, TRUE);
 $$ LANGUAGE SQL VOLATILE
@@ -1381,10 +1426,13 @@ tree.
 ------------------------------------------------------------
 SELECT MADLIB_SCHEMA.tree_display(
     tree_model,             -- TEXT. Name of the table containing the decision tree model
-    dot_format              -- BOOLEAN. (OPTIONAL, Default = TRUE)
+    dot_format,             -- BOOLEAN. (OPTIONAL, Default = TRUE)
                             -- Tree can be output either in a dot format or a text
                             --   format. If TRUE, the result is in the dot format,
                             --   else output is in text format
+    verbose                 -- BOOLEAN. (OPTIONAL, Default = FALSE)
+                            -- If TRUE, the dot format output will contain additional
+                            -- information
     )
 ------------------------------------------------------------
 The output is always returned as a 'TEXT'. For the dot format, the output can be
@@ -1403,12 +1451,26 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._display_decision_tree(
   cat_levels_in_text TEXT[],
   cat_n_levels       INTEGER[],
   dependent_levels   TEXT[],
-  id_prefix          TEXT
+  id_prefix          TEXT,
+  verbose            BOOLEAN
 )  RETURNS TEXT
 AS 'MODULE_PATHNAME', 'display_decision_tree'
 LANGUAGE C STRICT IMMUTABLE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
 
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._display_decision_tree(
+  tree               MADLIB_SCHEMA.bytea8,
+  cat_features       TEXT[],
+  con_features       TEXT[],
+  cat_levels_in_text TEXT[],
+  cat_n_levels       INTEGER[],
+  dependent_levels   TEXT[],
+  id_prefix          TEXT
+)  RETURNS TEXT AS $$
+    SELECT MADLIB_SCHEMA._display_decision_tree($1, $2, $3, $4, $5, $6, $7, FALSE);
+$$ LANGUAGE SQL VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
+
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._display_decision_tree_surrogate(
   tree               MADLIB_SCHEMA.bytea8,
   cat_features       TEXT[],

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/02f4602a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
index 43ad64e..e006a34 100644
--- a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
@@ -809,7 +809,7 @@ def get_tree_surr(schema_madlib, model_table, gid, sample_id, **kwargs):
 
 
 def get_tree(schema_madlib, model_table, gid, sample_id,
-             dot_format=True, disp_surr=False, **kwargs):
+             dot_format=True, verbose=False, disp_surr=False, **kwargs):
     """Random forest tree display function"""
 
     _validate_get_tree(model_table, gid, sample_id)
@@ -885,9 +885,9 @@ def get_tree(schema_madlib, model_table, gid, sample_id,
             if dot_format:
                 sql_display = """
                         SELECT {0}._display_decision_tree(
-                            $1, $2, $3, $4, $5, $6, '{1}'
+                            $1, $2, $3, $4, $5, $6, '{1}', {2}
                             ) as display_tree
-                        """.format(schema_madlib, "")
+                        """.format(schema_madlib, "", verbose)
             else:
                 sql_display = """
                         SELECT {0}._display_text_decision_tree(

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/02f4602a/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in
index e463225..0cc6534 100644
--- a/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in
@@ -406,7 +406,8 @@ text format is outputted with the tree.
 get_tree(forest_model_table,
          gid,
          sample_id,
-         dot_format)
+         dot_format,
+         verbose)
 </pre>
 
 An additional display function is provided to output the surrogate splits chosen
@@ -439,9 +440,13 @@ are NULL, then the majority branch is used to compute the split for a tuple.
     <DT>sample_id</DT>
     <DD>integer. Id of the bootstrap sample that this tree is a part of.</DD>
 
-    <DT>dot_format</DT>
+    <DT>dot_format (optional)</DT>
     <DD>boolean, default = TRUE. Output can either be in a dot format or a text
     format. If TRUE, the result is in the dot format, else output is in text format.</DD>
+
+    <DT>verbose (optional)</DT>
+    <DD>boolean, default = FALSE. If true, the dot format output will contain
+    additional information (impurity, sample size, etc.)</DD>
 </DL>
 
 The output is always returned as a 'TEXT'. For the dot format, the output can be
@@ -978,12 +983,14 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
   *@param gid Group id of the tree to display
   *@param sample_id Sample id of the tree to display
   *@dot_format TRUE if dot format, FALSE for text format
+  *@verbose TRUE if the dot format output will contain additional information
   */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_tree(
     "model_table"    TEXT,
     "gid"    INTEGER,
     "sample_id"    INTEGER,
-    "dot_format"  BOOLEAN
+    "dot_format"  BOOLEAN,
+    "verbose"        BOOLEAN
 ) RETURNS VARCHAR AS $$
 PythonFunction(recursive_partitioning, random_forest, get_tree)
 $$ LANGUAGE plpythonu VOLATILE
@@ -992,9 +999,19 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_tree(
     "model_table"     TEXT,
     "gid"  INTEGER,
+    "sample_id"   INTEGER,
+    "dot_format"  BOOLEAN
+) RETURNS VARCHAR AS $$
+    SELECT MADLIB_SCHEMA.get_tree($1, $2, $3, $4, FALSE::BOOLEAN);
+$$ LANGUAGE SQL VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.get_tree(
+    "model_table"     TEXT,
+    "gid"  INTEGER,
     "sample_id"   INTEGER
 ) RETURNS VARCHAR AS $$
-    SELECT MADLIB_SCHEMA.get_tree($1, $2, $3, TRUE::BOOLEAN);
+    SELECT MADLIB_SCHEMA.get_tree($1, $2, $3, TRUE::BOOLEAN, FALSE::BOOLEAN);
 $$ LANGUAGE SQL VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
 
@@ -1012,13 +1029,16 @@ tree.
                         USAGE
 ------------------------------------------------------------
 SELECT MADLIB_SCHEMA.get_tree(
-    forest_model,            -- TEXT. Name of the table containing the random forest model
-    gid,                -- INTEGER. Group id of the tree to be displayed
-    sample_id,               -- INTEGER. Sample of the tree to be displayed
-    dot_format        -- BOOLEAN. (OPTIONAL, Default = TRUE)
-                             -- Output can either be in a dot format or a text
-                             --   format. If TRUE, the result is in the dot format,
-                             --   else output is in text format
+    forest_model,           -- TEXT. Name of the table containing the random forest model
+    gid,                    -- INTEGER. Group id of the tree to be displayed
+    sample_id,              -- INTEGER. Sample of the tree to be displayed
+    dot_format,             -- BOOLEAN. (OPTIONAL, Default = TRUE)
+                            -- Output can either be in a dot format or a text
+                            --   format. If TRUE, the result is in the dot format,
+                            --   else output is in text format
+    verbose                 -- BOOLEAN. (OPTIONAL, Default = FALSE)
+                            -- If TRUE, the dot format output will contain additional
+                            -- information
     )
 ------------------------------------------------------------
 The output is always returned as a 'TEXT'. For the dot format, the output can be

[34/50] [abbrv] incubator-madlib git commit: Encode categorical: Allow NULL value for output_type

Posted by ri...@apache.org.

Encode categorical: Allow NULL value for output_type


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/fcf21a3b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/fcf21a3b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/fcf21a3b

Branch: refs/heads/latest_release
Commit: fcf21a3bd272ada8f3977fbf23a2df66c2fd6f57
Parents: 8e7c6eb
Author: Rahul Iyer <ri...@apache.org>
Authored: Wed Feb 8 17:45:47 2017 -0800
Committer: Rahul Iyer <ri...@apache.org>
Committed: Wed Feb 8 17:45:53 2017 -0800

----------------------------------------------------------------------
 .../modules/utilities/encode_categorical.py_in  | 24 +++++++++++++++-----
 1 file changed, 18 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/fcf21a3b/src/ports/postgres/modules/utilities/encode_categorical.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/encode_categorical.py_in b/src/ports/postgres/modules/utilities/encode_categorical.py_in
index a92c559..54b4add 100644
--- a/src/ports/postgres/modules/utilities/encode_categorical.py_in
+++ b/src/ports/postgres/modules/utilities/encode_categorical.py_in
@@ -60,7 +60,7 @@ class CategoricalEncoder(object):
                  top=None,
                  value_to_drop=None,
                  encode_null=False,
-                 output_type='column',
+                 output_type=None,
                  output_dictionary=False,
                  distributed_by=None,
                  **kwargs):
@@ -74,7 +74,8 @@ class CategoricalEncoder(object):
         self.top = top
         self.value_to_drop = value_to_drop
         self.encode_null = encode_null
-        self.output_type = output_type
+
+        self.output_type = 'column' if not output_type else output_type.lower()
         self.output_dictionary = output_dictionary
         self.distributed_by = distributed_by
 
@@ -158,8 +159,22 @@ class CategoricalEncoder(object):
         # columns that determine the index for output table
         self._row_id_cols = split_quoted_delimited_str(self.row_id)
 
+        # output type for specific supported types
+        all_output_types = sorted(['array', 'column', 'svec'])
+        try:
+            # allow user to specify a prefix substring of
+            # supported output types. This works because the supported
+            # output types have unique prefixes.
+            self.output_type = next(s for s in all_output_types
+                                    if s.startswith(self.output_type))
+        except StopIteration:
+            # next() raises StopIteration if no element is found
+            plpy.error("Encoding categorical: Output type should be one of {0}".
+                       format(','.join(all_output_types)))
+
         # flag to build a dictionary table
-        self._output_dictionary = True if self.output_type in ('array', 'svec') else self.output_dictionary
+        self._output_dictionary = (True if self.output_type in ('array', 'svec')
+                                   else self.output_dictionary)
 
         # how to distribute the output table (for distributed platforms)
         if not is_platform_pg():
@@ -228,9 +243,6 @@ class CategoricalEncoder(object):
             _assert(is_var_valid(self.source_table, ','.join(self._row_id_cols)),
                     "Encoding categorical: Not all columns from ({0}) present in source table ({1})"
                     .format(self._row_id_cols, self.source_table))
-        _assert(self.output_type in ('column', 'array', 'svec'),
-                "Encoding categorical: Output type should be one of {0}".
-                format(('column', 'array', 'svec')))
     # ------------------------------------------------------------------------------
 
     def _is_col_name_long(self, col_to_values):

[30/50] [abbrv] incubator-madlib git commit: Kmeans: Add MADlib schema name to svec type

Posted by ri...@apache.org.

Kmeans: Add MADlib schema name to svec type


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/8278da8f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/8278da8f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/8278da8f

Branch: refs/heads/latest_release
Commit: 8278da8fd88d23469d089bf566aed6941a59a3ad
Parents: d035faa
Author: Rahul Iyer <ri...@apache.org>
Authored: Mon Feb 6 13:20:31 2017 -0800
Committer: Rahul Iyer <ri...@apache.org>
Committed: Mon Feb 6 13:20:31 2017 -0800

----------------------------------------------------------------------
 src/ports/postgres/modules/kmeans/kmeans.py_in | 22 ++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8278da8f/src/ports/postgres/modules/kmeans/kmeans.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/kmeans/kmeans.py_in b/src/ports/postgres/modules/kmeans/kmeans.py_in
index 640ff16..e62aab5 100644
--- a/src/ports/postgres/modules/kmeans/kmeans.py_in
+++ b/src/ports/postgres/modules/kmeans/kmeans.py_in
@@ -35,9 +35,9 @@ def kmeans_validate_src(schema_madlib, rel_source, **kwargs):
         plpy.error("kmeans error: Data table does not exist!")
     if table_is_empty(rel_source):
         plpy.error("kmeans error: Data table is empty!")
-
 # ----------------------------------------------------------------------
 
+
 def kmeans_validate_expr(schema_madlib, rel_source, expr_point, **kwargs):
     """
     Validation function for the expr_point parameter
@@ -46,24 +46,24 @@ def kmeans_validate_expr(schema_madlib, rel_source, expr_point, **kwargs):
         - A numeric array expression
     """
 
-    expr_type = get_expr_type(expr_point,rel_source).lower()
+    expr_type = get_expr_type(expr_point, rel_source).lower()
 
     # Both formats should return a numeric array type
     if expr_type in ['smallint[]', 'integer[]', 'bigint[]', 'decimal[]',
-                        'numeric[]', 'real[]', 'double precision[]',
-                        'serial[]', 'bigserial[]', 'float8[]', 'svec']:
-
-        # An array expression should fail this check
+                     'numeric[]', 'real[]', 'double precision[]',
+                     'serial[]', 'bigserial[]', 'float8[]',
+                     schema_madlib + '.svec']:
         if columns_exist_in_table(rel_source, [expr_point]):
+            # An array expression would fail this check
             return False
-        return True
+        else:
+            return True
     else:
-        plpy.error(
-            """Kmeans error: {expr_point} is not a valid column or array!
-            """.format(**locals()))
-
+        plpy.error("Kmeans error: {0} is not a valid "
+                   "column or array".format(expr_point))
 # ----------------------------------------------------------------------
 
+
 def compute_kmeanspp_seeding(schema_madlib, rel_args, rel_state, rel_source,
                              expr_point, **kwargs):
     """

[18/50] [abbrv] incubator-madlib git commit: Madpack: Add password into connection args

Posted by ri...@apache.org.

Madpack: Add password into connection args

Closes #88


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/f7cb980f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/f7cb980f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/f7cb980f

Branch: refs/heads/latest_release
Commit: f7cb980f75bf4625839cc7c68f4e3c65830ac726
Parents: 29acc53
Author: Jim Klucar <ji...@immuta.com>
Authored: Mon Jan 23 10:22:12 2017 -0500
Committer: Rahul Iyer <ri...@apache.org>
Committed: Thu Jan 26 15:53:39 2017 -0800

----------------------------------------------------------------------
 src/madpack/madpack.py | 2 ++
 1 file changed, 2 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/f7cb980f/src/madpack/madpack.py
----------------------------------------------------------------------
diff --git a/src/madpack/madpack.py b/src/madpack/madpack.py
index 6348899..e15bb4a 100755
--- a/src/madpack/madpack.py
+++ b/src/madpack/madpack.py
@@ -1081,6 +1081,8 @@ def main(argv):
         con_args['host'] = c_host + ':' + c_port
         con_args['database'] = c_db
         con_args['user'] = c_user
+        if c_pass is not None:
+            con_args['password'] = c_pass
 
         # Try connecting to the database
         _info("Testing database connection...", verbose)

[14/50] [abbrv] incubator-madlib git commit: Minor fixes

Posted by ri...@apache.org.

Minor fixes

RF: Renames the madlib.mode function to avoid confusion with the
built-in postgres mode function.
Pivot: Adjusts the warning level.
Note that there are only 2 new lines but the diff seems large
because of the additional spaces.


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/9d04b7d0
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/9d04b7d0
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/9d04b7d0

Branch: refs/heads/latest_release
Commit: 9d04b7d0409adf8c4f5fc220d8222ca9bc7406f5
Parents: 8e5da2f
Author: Orhan Kislal <ok...@pivotal.io>
Authored: Wed Jan 25 17:20:18 2017 -0800
Committer: Orhan Kislal <ok...@pivotal.io>
Committed: Wed Jan 25 17:22:13 2017 -0800

----------------------------------------------------------------------
 .../recursive_partitioning/random_forest.py_in  |   4 +-
 .../postgres/modules/utilities/pivot.py_in      | 416 ++++++++++---------
 .../postgres/modules/utilities/utilities.sql_in |   4 +-
 3 files changed, 213 insertions(+), 211 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/9d04b7d0/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
index 0eb5985..affa9f9 100644
--- a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
@@ -715,7 +715,7 @@ def forest_predict(schema_madlib, model, source, output, pred_type='response',
         majority_pred_expression = "avg(aggregated_prediction)"
     else:
         majority_pred_expression = """($sql${{ {dep_levels} }}$sql$::varchar[])[
-                                    {schema_madlib}.mode(aggregated_prediction + 1)]::TEXT
+                                    {schema_madlib}.rf_mode(aggregated_prediction + 1)]::TEXT
                                     """.format(**locals())
 
     if dep_type.lower() == "boolean":
@@ -1164,7 +1164,7 @@ def _calculate_oob_error(schema_madlib, oob_prediction_table, oob_error_table,
                     THEN 0.
                     ELSE 1.
                 END""".format(**locals())
-        forest_prediction_agg = "{schema_madlib}.mode".format(**locals())
+        forest_prediction_agg = "{schema_madlib}.rf_mode".format(**locals())
 
     sql_compute_oob_error = """
             CREATE TABLE {oob_error_table} AS

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/9d04b7d0/src/ports/postgres/modules/utilities/pivot.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/pivot.py_in b/src/ports/postgres/modules/utilities/pivot.py_in
index 14098c2..7e342f1 100644
--- a/src/ports/postgres/modules/utilities/pivot.py_in
+++ b/src/ports/postgres/modules/utilities/pivot.py_in
@@ -80,215 +80,217 @@ def pivot(schema_madlib, source_table, out_table, index, pivot_cols,
         FROM pivset GROUP BY id ORDER BY id)
     """
 
-    # If there are more than 1000 columns for the output table, we give a
-    # warning as it might give an error.
-    MAX_OUTPUT_COLUMN_COUNT = 1000
-
-    # If a column name has more than 63 characters it gets trimmed automatically,
-    # which may cause an exception. Enable the output dictionary in this case.
-    MAX_COLUMN_LENGTH = 63
-
-    indices = split_quoted_delimited_str(index)
-    pcols = split_quoted_delimited_str(pivot_cols)
-    pvals = split_quoted_delimited_str(pivot_values)
-    validate_pivot_coding(source_table, out_table, indices, pcols, pvals)
-
-    # Strip the end quotes for building output columns (this can only be
-    # performed after the validation)
-    pcols = [strip_end_quotes(pcol.strip()) for pcol in pcols]
-    pvals = [strip_end_quotes(pval.strip()) for pval in pvals]
-
-    # Create a dictionary that assigns one or more aggregate functions for every
-    # value column.
-    agg_dict = parse_aggregates(pvals, aggregate_func)
-
-    # Find the distinct values of pivot_cols
-    array_agg_str = ', '.join("array_agg(DISTINCT {pcol}) AS {pcol}_values".
-                              format(pcol=pcol) for pcol in pcols)
-    if keep_null:
-        # Some platforms don't include NULL values as part of the array_agg(DISTINCT ...)
-        # Below clause checks explicitly for NULL values
-        null_str = ", " + ', '.join(
-            "bool_or(CASE WHEN {pcol} IS NULL THEN True ELSE False END)"
-            "AS {pcol}_isnull".format(pcol=pcol) for pcol in pcols)
-    else:
-        null_str = ""
-    distinct_values = plpy.execute("SELECT {0} {1} FROM {2}".
-                                   format(array_agg_str, null_str, source_table))[0]
-
-    # Collect the distinct values for every pivot column into a dictionary
-    pcol_distinct_values = {}
-    pcol_max_length = 0
-    for pcol in pcols:
-        pcol_tmp = set(item for item in distinct_values[pcol + "_values"])
-        if not keep_null:
-            pcol_tmp.discard(None)
-        elif distinct_values[pcol + "_isnull"]:
-            pcol_tmp.add(None)
-
-        pcol_distinct_values[pcol] = sorted(pcol_tmp)
-        # Max pcol length calculation: the name of column (pcol) +
-        #                              name of longest value in column (item) +
-        #                              underscore (1)
-        pcol_max_length += (len(pcol) +
-                            max([len(str(item)) for item in pcol_tmp]) +
-                            1)
-
-    # Create the combination of every possible pivot column
-    # Assume piv and piv2 are pivot columns. piv=(1,2) and piv2=(3,4,5)
-    # pivot_comb = ((1,3),(1,4),(1,5),(2,3),(2,4),(2,5))
-    pivot_comb = list(itertools.product(*([pcol_distinct_values[pcol]
-                                          for pcol in pcols])))
-
-    # Check the max possible length of a output column name
-    # If it is over 63 (psql upper limit) create table lookup
-    for pval in pvals:
-        agg_func = agg_dict[pval]
-        # Length calculation: value column length + aggregate length +
-        # 2 underscores + pivots and their values (pcol_max_length)
-        # Example: val _ sum _ piv1_10_piv2_100
-        col_name_len = (2 + len(pval) + pcol_max_length +
-                        max([len(item) for item in agg_func]))
-        if col_name_len > MAX_COLUMN_LENGTH:
-            with MinWarning("warning"):
-                plpy.warning("Pivot: Output columns are renamed to keep them "
-                             "under 63 characters. Please refer to "
-                             "{source_table}_dictionary for the original names.".
-                             format(**locals()))
-            output_col_dictionary = True
-
-    # Create the output dictionary if needed
-    if output_col_dictionary:
-        out_dict = out_table + "_dictionary"
-        _assert(not table_exists(out_dict),
-                "Pivot: Output dictionary table already exists!")
-
-        # Collect the types for pivot columns
-        types_str = ','.join("pg_typeof(\"{pcol}\") as {pcol}_type".
-                             format(pcol=pcol) for pcol in pcols)
-        pcol_types = plpy.execute("SELECT {0} FROM {1} LIMIT 1".
-                                  format(types_str, source_table))
-
-        # Create the empty dictionary table
-        dict_str = ', '.join(" {pcol} {pcol_type} ".
-                             format(pcol=pcol, pcol_type=pcol_types[0][pcol+"_type"])
-                             for pcol in pcols)
-        plpy.execute("""
-            CREATE TABLE {out_dict} (
-                __pivot_cid__ VARCHAR, pval VARCHAR,
-                agg VARCHAR, {dict_str}, col_name VARCHAR)
-            """.format(**locals()))
-
-        # The holder for rows to insert into output dictionary
-        insert_str = []
-        # Counter for the new output column names
-        dict_counter = 0
-
-    pivot_str_sel_list = []
-    pivot_str_from_list = []
-    # Prepare the wrapper for fill value
-    if fill_value is not None:
-        fill_str_begin = " COALESCE("
-        fill_str_end = ", " + fill_value + " ) "
-    else:
-        fill_str_begin, fill_str_end = "", ""
-
-    for pval in pvals:
-        agg_func = agg_dict[pval]
-        for agg in agg_func:
-            for comb in pivot_comb:
-                pivot_col_condition = []
-                pivot_col_name = ["\"{pval}_{agg}".format(pval=pval, agg=agg)]
-
-                if output_col_dictionary:
-                    # Prepare the entry for the dictionary
-                    insert_str.append("(\'__p_{dict_counter}__\', \'{pval}\', "
-                                      "\'{agg}\' ".format(dict_counter=dict_counter,
-                                                          pval=pval, agg=agg))
-
-                # For every pivot column in a given combination
-                for counter, pcol in enumerate(pcols):
-                    # If we encounter a NULL value that means it is not filtered
-                    # because of keep_null. Use "IS NULL" for comparison
-                    if comb[counter] is None:
-                        pivot_col_condition.append(" \"{0}\" IS NULL".format(pcol))
-                        pivot_col_name.append("_{0}_null".format(pcol))
-                    else:
-                        pivot_col_condition.append(" \"{0}\" = '{1}'".
-                                                   format(pcol, comb[counter]))
-                        pivot_col_name.append("_{0}_{1}".format(pcol, comb[counter]))
-
-                    # Collect pcol values for the dict
+    with MinWarning('warning'):
+
+        # If there are more than 1000 columns for the output table, we give a
+        # warning as it might give an error.
+        MAX_OUTPUT_COLUMN_COUNT = 1000
+
+        # If a column name has more than 63 characters it gets trimmed automatically,
+        # which may cause an exception. Enable the output dictionary in this case.
+        MAX_COLUMN_LENGTH = 63
+
+        indices = split_quoted_delimited_str(index)
+        pcols = split_quoted_delimited_str(pivot_cols)
+        pvals = split_quoted_delimited_str(pivot_values)
+        validate_pivot_coding(source_table, out_table, indices, pcols, pvals)
+
+        # Strip the end quotes for building output columns (this can only be
+        # performed after the validation)
+        pcols = [strip_end_quotes(pcol.strip()) for pcol in pcols]
+        pvals = [strip_end_quotes(pval.strip()) for pval in pvals]
+
+        # Create a dictionary that assigns one or more aggregate functions for every
+        # value column.
+        agg_dict = parse_aggregates(pvals, aggregate_func)
+
+        # Find the distinct values of pivot_cols
+        array_agg_str = ', '.join("array_agg(DISTINCT {pcol}) AS {pcol}_values".
+                                  format(pcol=pcol) for pcol in pcols)
+        if keep_null:
+            # Some platforms don't include NULL values as part of the array_agg(DISTINCT ...)
+            # Below clause checks explicitly for NULL values
+            null_str = ", " + ', '.join(
+                "bool_or(CASE WHEN {pcol} IS NULL THEN True ELSE False END)"
+                "AS {pcol}_isnull".format(pcol=pcol) for pcol in pcols)
+        else:
+            null_str = ""
+        distinct_values = plpy.execute("SELECT {0} {1} FROM {2}".
+                                       format(array_agg_str, null_str, source_table))[0]
+
+        # Collect the distinct values for every pivot column into a dictionary
+        pcol_distinct_values = {}
+        pcol_max_length = 0
+        for pcol in pcols:
+            pcol_tmp = set(item for item in distinct_values[pcol + "_values"])
+            if not keep_null:
+                pcol_tmp.discard(None)
+            elif distinct_values[pcol + "_isnull"]:
+                pcol_tmp.add(None)
+
+            pcol_distinct_values[pcol] = sorted(pcol_tmp)
+            # Max pcol length calculation: the name of column (pcol) +
+            #                              name of longest value in column (item) +
+            #                              underscore (1)
+            pcol_max_length += (len(pcol) +
+                                max([len(str(item)) for item in pcol_tmp]) +
+                                1)
+
+        # Create the combination of every possible pivot column
+        # Assume piv and piv2 are pivot columns. piv=(1,2) and piv2=(3,4,5)
+        # pivot_comb = ((1,3),(1,4),(1,5),(2,3),(2,4),(2,5))
+        pivot_comb = list(itertools.product(*([pcol_distinct_values[pcol]
+                                              for pcol in pcols])))
+
+        # Check the max possible length of an output column name
+        # If it is over 63 (PostgreSQL identifier limit), create a lookup table
+        for pval in pvals:
+            agg_func = agg_dict[pval]
+            # Length calculation: value column length + aggregate length +
+            # 2 underscores + pivots and their values (pcol_max_length)
+            # Example: val _ sum _ piv1_10_piv2_100
+            col_name_len = (2 + len(pval) + pcol_max_length +
+                            max([len(item) for item in agg_func]))
+            if col_name_len > MAX_COLUMN_LENGTH:
+                with MinWarning("warning"):
+                    plpy.warning("Pivot: Output columns are renamed to keep them "
+                                 "under 63 characters. Please refer to "
+                                 "{source_table}_dictionary for the original names.".
+                                 format(**locals()))
+                output_col_dictionary = True
+
+        # Create the output dictionary if needed
+        if output_col_dictionary:
+            out_dict = out_table + "_dictionary"
+            _assert(not table_exists(out_dict),
+                    "Pivot: Output dictionary table already exists!")
+
+            # Collect the types for pivot columns
+            types_str = ','.join("pg_typeof(\"{pcol}\") as {pcol}_type".
+                                 format(pcol=pcol) for pcol in pcols)
+            pcol_types = plpy.execute("SELECT {0} FROM {1} LIMIT 1".
+                                      format(types_str, source_table))
+
+            # Create the empty dictionary table
+            dict_str = ', '.join(" {pcol} {pcol_type} ".
+                                 format(pcol=pcol, pcol_type=pcol_types[0][pcol+"_type"])
+                                 for pcol in pcols)
+            plpy.execute("""
+                CREATE TABLE {out_dict} (
+                    __pivot_cid__ VARCHAR, pval VARCHAR,
+                    agg VARCHAR, {dict_str}, col_name VARCHAR)
+                """.format(**locals()))
+
+            # The holder for rows to insert into output dictionary
+            insert_str = []
+            # Counter for the new output column names
+            dict_counter = 0
+
+        pivot_str_sel_list = []
+        pivot_str_from_list = []
+        # Prepare the wrapper for fill value
+        if fill_value is not None:
+            fill_str_begin = " COALESCE("
+            fill_str_end = ", " + fill_value + " ) "
+        else:
+            fill_str_begin, fill_str_end = "", ""
+
+        for pval in pvals:
+            agg_func = agg_dict[pval]
+            for agg in agg_func:
+                for comb in pivot_comb:
+                    pivot_col_condition = []
+                    pivot_col_name = ["\"{pval}_{agg}".format(pval=pval, agg=agg)]
+
                     if output_col_dictionary:
-                        insert_str.append("{0}".format(
-                            comb[counter] if comb[counter] is not None else "NULL"))
-                pivot_col_name.append("\"")
-
-                if output_col_dictionary:
-                    # Store the whole string in case some user wants it
-                    insert_str.append("\'{column_name}\')".
-                                      format(column_name=''.join(pivot_col_name)))
-                    pivot_col_name = ["__p_"+str(dict_counter)+"__"]
-                    dict_counter += 1
-                # Collecting the whole sql query
-                # Please refer to the earlier comment for a sample output
-
-                # Build the pivot column with NULL values in tuples that don't
-                # satisfy that column's condition
-                pivot_str_from = ("(CASE WHEN {condition} THEN {pval} END) "
-                                  "AS {pivot_col_name}".
-                                  format(pval=pval,
-                                         condition=' AND '.join(pivot_col_condition),
-                                         pivot_col_name=''.join(pivot_col_name)))
-                pivot_str_from_list.append(pivot_str_from)
-                # Aggregate over each pivot column, while filtering all NULL values
-                # created by previous query.
-                pivot_str_sel = ("{fill_str_begin}"
-                                 "  {agg} ({pivot_col_name}) "
-                                 "    FILTER (WHERE {pivot_col_name} IS NOT NULL) "
-                                 "{fill_str_end} AS {pivot_col_name}".
-                                 format(agg=agg, fill_str_begin=fill_str_begin,
-                                        fill_str_end=fill_str_end,
-                                        pivot_col_name=''.join(pivot_col_name)))
-                pivot_str_sel_list.append(pivot_str_sel)
-
-    try:
-        plpy.execute("""
-            CREATE TABLE {out_table} AS
-                SELECT {index},
-                       {pivot_str_sel_list}
-                FROM (
-                        SELECT {index},
-                               {pivot_str_from_list}
-                        FROM {source_table}
-                    ) x
-                GROUP BY {index}
-            """.format(out_table=out_table,
-                       index=index,
-                       source_table=source_table,
-                       pivot_str_from_list=', '.join(pivot_str_from_list),
-                       pivot_str_sel_list=', '.join(pivot_str_sel_list)))
+                        # Prepare the entry for the dictionary
+                        insert_str.append("(\'__p_{dict_counter}__\', \'{pval}\', "
+                                          "\'{agg}\' ".format(dict_counter=dict_counter,
+                                                              pval=pval, agg=agg))
+
+                    # For every pivot column in a given combination
+                    for counter, pcol in enumerate(pcols):
+                        # If we encounter a NULL value that means it is not filtered
+                        # because of keep_null. Use "IS NULL" for comparison
+                        if comb[counter] is None:
+                            pivot_col_condition.append(" \"{0}\" IS NULL".format(pcol))
+                            pivot_col_name.append("_{0}_null".format(pcol))
+                        else:
+                            pivot_col_condition.append(" \"{0}\" = '{1}'".
+                                                       format(pcol, comb[counter]))
+                            pivot_col_name.append("_{0}_{1}".format(pcol, comb[counter]))
+
+                        # Collect pcol values for the dict
+                        if output_col_dictionary:
+                            insert_str.append("{0}".format(
+                                comb[counter] if comb[counter] is not None else "NULL"))
+                    pivot_col_name.append("\"")
 
-        if output_col_dictionary:
-            plpy.execute("INSERT INTO {out_dict} VALUES {insert_sql}".
-                         format(out_dict=out_dict,
-                                insert_sql=', '.join(insert_str)))
-    except plpy.SPIError:
-        # Warn user if the number of columns is over the limit
-        with MinWarning("warning"):
-            # The column options from value columns and aggregates
-            # times the number of pivot combinations
-            if ((sum([len(item) for item in agg_dict.values()])*
-                    len(pivot_comb)) > MAX_OUTPUT_COLUMN_COUNT):
-                plpy.warning(
-                    "Pivot: Too many distinct values for pivoting! "
-                    "The execution may fail due to too many columns in the "
-                    "output table.")
-            else:
-                plpy.warning(
-                    "Pivot: Pivoting is only supported over aggregates with "
-                    "transition functions defined as STRICT.")
-        raise
+                    if output_col_dictionary:
+                        # Store the whole string in case some user wants it
+                        insert_str.append("\'{column_name}\')".
+                                          format(column_name=''.join(pivot_col_name)))
+                        pivot_col_name = ["__p_"+str(dict_counter)+"__"]
+                        dict_counter += 1
+                    # Collecting the whole sql query
+                    # Please refer to the earlier comment for a sample output
+
+                    # Build the pivot column with NULL values in tuples that don't
+                    # satisfy that column's condition
+                    pivot_str_from = ("(CASE WHEN {condition} THEN {pval} END) "
+                                      "AS {pivot_col_name}".
+                                      format(pval=pval,
+                                             condition=' AND '.join(pivot_col_condition),
+                                             pivot_col_name=''.join(pivot_col_name)))
+                    pivot_str_from_list.append(pivot_str_from)
+                    # Aggregate over each pivot column, while filtering all NULL values
+                    # created by previous query.
+                    pivot_str_sel = ("{fill_str_begin}"
+                                     "  {agg} ({pivot_col_name}) "
+                                     "    FILTER (WHERE {pivot_col_name} IS NOT NULL) "
+                                     "{fill_str_end} AS {pivot_col_name}".
+                                     format(agg=agg, fill_str_begin=fill_str_begin,
+                                            fill_str_end=fill_str_end,
+                                            pivot_col_name=''.join(pivot_col_name)))
+                    pivot_str_sel_list.append(pivot_str_sel)
+
+        try:
+            plpy.execute("""
+                CREATE TABLE {out_table} AS
+                    SELECT {index},
+                           {pivot_str_sel_list}
+                    FROM (
+                            SELECT {index},
+                                   {pivot_str_from_list}
+                            FROM {source_table}
+                        ) x
+                    GROUP BY {index}
+                """.format(out_table=out_table,
+                           index=index,
+                           source_table=source_table,
+                           pivot_str_from_list=', '.join(pivot_str_from_list),
+                           pivot_str_sel_list=', '.join(pivot_str_sel_list)))
+
+            if output_col_dictionary:
+                plpy.execute("INSERT INTO {out_dict} VALUES {insert_sql}".
+                             format(out_dict=out_dict,
+                                    insert_sql=', '.join(insert_str)))
+        except plpy.SPIError:
+            # Warn user if the number of columns is over the limit
+            with MinWarning("warning"):
+                # The column options from value columns and aggregates
+                # times the number of pivot combinations
+                if ((sum([len(item) for item in agg_dict.values()])*
+                        len(pivot_comb)) > MAX_OUTPUT_COLUMN_COUNT):
+                    plpy.warning(
+                        "Pivot: Too many distinct values for pivoting! "
+                        "The execution may fail due to too many columns in the "
+                        "output table.")
+                else:
+                    plpy.warning(
+                        "Pivot: Pivoting is only supported over aggregates with "
+                        "transition functions defined as STRICT.")
+            raise
 
     return None
 # ------------------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/9d04b7d0/src/ports/postgres/modules/utilities/utilities.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/utilities.sql_in b/src/ports/postgres/modules/utilities/utilities.sql_in
index 0ec864d..b2415a7 100644
--- a/src/ports/postgres/modules/utilities/utilities.sql_in
+++ b/src/ports/postgres/modules/utilities/utilities.sql_in
@@ -466,8 +466,8 @@ LANGUAGE 'sql' IMMUTABLE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
 
 -- Tell Postgres how to use our aggregate
-DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.mode(double precision) CASCADE;
-CREATE AGGREGATE MADLIB_SCHEMA.mode(double precision) (
+DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.rf_mode(double precision) CASCADE;
+CREATE AGGREGATE MADLIB_SCHEMA.rf_mode(double precision) (
   SFUNC=array_append, --Function to call for each row. Just builds the array
   STYPE=double precision[],
   FINALFUNC=MADLIB_SCHEMA._final_mode, --Function to call after everything has been added to array

[41/50] [abbrv] incubator-madlib git commit: Release v1.10:

Posted by ri...@apache.org.

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/90f4dc15/src/madpack/changelist_1.2_1.9.1.yaml
----------------------------------------------------------------------
diff --git a/src/madpack/changelist_1.2_1.9.1.yaml b/src/madpack/changelist_1.2_1.9.1.yaml
deleted file mode 100644
index 1fb533e..0000000
--- a/src/madpack/changelist_1.2_1.9.1.yaml
+++ /dev/null
@@ -1,1352 +0,0 @@
-# Changelist for MADlib version 1.2 to 1.7
-
-# This file contains all changes that were introduced in a new version of
-# MADlib. This changelist is used by the upgrade script to detect what objects
-# should be upgraded (while retaining all other objects from the previous version)
-
-# New modules (actually .sql_in files) added in upgrade version
-# For these files the sql_in code is retained as is with the functions in the
-# file installed on the upgrade version. All other files (that don't have
-# updates), are cleaned up to remove object replacements
-new module:
-    # ----------------- Changes from 1.2 to 1.3 -----------------
-    # ----------------- Changes from 1.3 to 1.4 -----------------
-    robust_variance_coxph:
-    clustered_variance_coxph:
-    # ----------------- Changes from 1.4 to 1.4.1 ---------------
-    # ----------------- Changes from 1.4.1 to 1.5 ---------------
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    table_to_pmml:
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    glm:
-    multiresponseglm:
-    ordinal:
-    decision_tree:
-    random_forest:
-    distribution:
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    text_utilities:
-
-# Changes in the types (UDT) including removal and modification
-udt:
-
-    # ----------------- Changes from 1.2 to 1.3 -----------------
-    __logregr_result:
-    linregr_result:
-    intermediate_cox_prop_hazards_result:
-    # ----------------- Changes from 1.3 to 1.4 -----------------
-    # coxph_result: not exists in 1.2
-    mlogregr_result:
-    marginal_logregr_result:
-    marginal_mlogregr_result:
-    # ----------------- Changes from 1.4 to 1.4.1 ---------------
-    # ----------------- Changes from 1.4.1 to 1.5 ---------------
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    # __logregr_result: appeared before
-    # coxph_result: not exists in 1.2
-    # linregr_result: appeared before
-    # mlogregr_result: appeared before
-    # some types missed before upgrade to v1.6
-    __utils_scaled_data:
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    bytea8:
-    # ----------------- Changes from 1.8 to 1.9 ----------
-    __enc_tbl_result:
-    __gen_acc_time:
-    __rep_type:
-    __train_result:
-    c45_classify_result:
-    c45_train_result:
-    correlation_result:
-    lsvm_sgd_model_rec:
-    lsvm_sgd_result:
-    rf_classify_result:
-    rf_train_result:
-    svm_cls_result:
-    svm_model_pr:
-    svm_model_rec:
-    svm_nd_result:
-    svm_reg_result:
-    svm_support_vector:
-    _prune_result_type:
-    _tree_result_type:
-    linear_svm_result:
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    profile_result:
-
-# List of the UDF changes that affect the user externally.  This includes change
-# in function name, change in argument order or argument types, and removal of
-# the function. In each case, the original function is as good as removed and a
-# new function is created. In such cases, we should abort the upgrade if there
-# are user views dependent on this function, since the original function will
-# not be present in the upgraded version.
-udf:
-    # ----------------- Changes from 1.2 to 1.3 -----------------
-    # linear regression: 'num_processed' added in 'linregr_result'
-    - linregr_final:
-        rettype: schema_madlib.linregr_result
-        argument: schema_madlib.bytea8
-    - linregr_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - linregr_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[]
-
-    # logistic regression: 'num_processed' added in '__logregr_result'
-    - __logregr_cg_result:
-        rettype: schema_madlib.__logregr_result
-        argument: double precision[]
-
-    - __logregr_irls_result:
-        rettype: schema_madlib.__logregr_result
-        argument: double precision[]
-
-    - __logregr_igd_result:
-        rettype: schema_madlib.__logregr_result
-        argument: double precision[]
-
-    # there were once missing till upgrade to v1.6
-    - __internal_get_cox_prop_hazards_insert_string:
-        rettype: character varying
-        argument: schema_madlib.cox_prop_hazards_result, text
-    - __internal_get_cox_prop_hazards_result:
-        rettype: schema_madlib.cox_prop_hazards_result
-        argument: character varying, character varying, character varying, character varying
-    - __internal_get_hsk_result:
-        rettype: schema_madlib.heteroskedasticity_test_result
-        argument: character varying, character varying, character varying, double precision[]
-    - __internal_get_linreg_result:
-        rettype: schema_madlib.linregr_result
-        argument: character varying, character varying, character varying
-    - __internal_get_linregr_insert_string:
-        rettype: character varying
-        argument: schema_madlib.linregr_result, text
-    - __internal_linregr_train_hetero:
-        rettype: void
-        argument: character varying, character varying, character varying, character varying, boolean
-    - compute_cox_prop_hazards_regr:
-        rettype: integer
-        argument: character varying, character varying, character varying, character varying, integer, character varying, double precision
-    - cox_prop_hazards_step_final:
-        rettype: double precision[]
-        argument: double precision[]
-    - cox_prop_hazards_step_transition:
-        rettype: double precision[]
-        argument: double precision[], double precision[], double precision, boolean, double precision, double precision[], double precision[], double precision[]
-    - intermediate_cox_prop_hazards:
-        rettype: schema_madlib.intermediate_cox_prop_hazards_result
-        argument: double precision[], boolean, double precision[]
-    - internal_cox_prop_hazards_result:
-        rettype: schema_madlib.cox_prop_hazards_result
-        argument: double precision[]
-    - internal_cox_prop_hazards_step_distance:
-        rettype: double precision
-        argument: double precision[], double precision[]
-    # ----------------- Changes from 1.3 to 1.4 -----------------
-    - __internal_get_robust_linregr_insert_string:
-        rettype: character varying
-        argument: schema_madlib.robust_linregr_result, double precision[], text
-    - __internal_get_robust_linregr_result:
-        rettype: schema_madlib.robust_linregr_result
-        argument: character varying, character varying, character varying, double precision[]
-    - __internal_get_robust_logregr_insert_string:
-        rettype: character varying
-        argument: schema_madlib.robust_logregr_result, text
-    - __internal_get_robust_logregr_result:
-        rettype: schema_madlib.robust_logregr_result
-        argument: character varying, character varying, character varying, double precision[]
-    - __internal_get_robust_mlogregr_insert_string:
-        rettype: character varying
-        argument: schema_madlib.robust_mlogregr_result, text
-    - __lda_count_topic_prefunc:
-        rettype: integer[]
-        argument: integer[], integer[]
-    - __lda_count_topic_sfunc:
-        rettype: integer[]
-        argument: integer[], integer[], integer[], integer[], integer, integer
-    - __lda_gibbs_sample:
-        rettype: integer[]
-        argument: integer[], integer[], integer[], integer[], double precision, double precision, integer, integer, integer
-    - __lda_perplexity_ffunc:
-        rettype: double precision
-        argument: integer[]
-    - __lda_perplexity_prefunc:
-        rettype: integer[]
-        argument: integer[], integer[]
-    - __lda_perplexity_sfunc:
-        rettype: integer[]
-        argument: integer[], integer[], integer[], integer[], integer[], double precision, double precision, integer, integer
-    - __lda_util_transpose:
-        rettype: integer[]
-        argument: integer[]
-    - __lda_util_unnest:
-        rettype: SETOF integer[]
-        argument: integer[]
-    - clustered_variance_mlogregr:
-        rettype: void
-        argument: text, text, text, text, text, integer, text, integer, text, double precision
-    - clustered_variance_mlogregr:
-        rettype: void
-        argument: text, text, text, text, text, integer, text, integer, text
-    - clustered_variance_mlogregr:
-        rettype: void
-        argument: text, text, text, text, text, integer, text, integer, text, double precision, boolean
-    - robust_input_checking:
-        rettype: void
-        argument: character varying, character varying, character varying, character varying
-
-    # ----------------- Changes from 1.4 to 1.4.1 ---------------
-    # ----------------- Changes from 1.4.1 to 1.5 ---------------
-    - __cmsketch_final:
-        rettype: bytea
-        argument: bytea
-    - __delete_traininginfo:
-        rettype: void
-        argument: text
-    - __get_encode_table_name:
-        rettype: text
-        argument: text
-    - __get_metatable_name:
-        rettype: text
-        argument: text
-    - __get_routine_id:
-        rettype: integer
-        argument: text
-    - __get_routine_name:
-        rettype: text
-        argument: text
-    - __get_tree_table_name:
-        rettype: text
-        argument: text
-    - __insert_into_traininginfo:
-        rettype: void
-        argument: text, text, text, text, text, text, text, text, double precision, integer, integer
-    - __treemodel_clean:
-        rettype: boolean
-        argument: text
-    - compute_lincrf:
-        rettype: integer
-        argument: character varying, character varying, character varying, character varying, character varying, integer, integer
-    - create_nb_classify_fn:
-        rettype: void
-        argument: character varying, character varying, integer, character varying
-    - create_nb_classify_fn:
-        rettype: void
-        argument: character varying, character varying, character varying, integer, character varying
-    - create_table_as:
-        rettype: void
-        argument: character varying, character varying
-    - create_table_as:
-        rettype: void
-        argument: character varying, character varying, character varying
-    - create_temp_table_as:
-        rettype: void
-        argument: character varying, character varying, character varying
-    - create_temp_table_as:
-        rettype: void
-        argument: character varying, character varying
-    - create_temporary_table_as:
-        rettype: void
-        argument: character varying, character varying, character varying
-    - create_temporary_table_as:
-        rettype: void
-        argument: character varying, character varying
-    - crf_train_fgen:
-        rettype: void
-        argument: text, text, text, text, text
-    - insert_into:
-        rettype: void
-        argument: character varying, character varying
-    - internal_create_table_as:
-        rettype: void
-        argument: boolean, character varying, character varying, character varying
-    - internal_execute_using_kmeans_args:
-        rettype: void
-        argument: character varying, double precision[], regproc, integer, double precision
-    - internal_execute_using_kmeanspp_seeding_args:
-        rettype: void
-        argument: character varying, integer, regproc, double precision[]
-    - internal_execute_using_silhouette_args:
-        rettype: double precision
-        argument: character varying, double precision[], regproc
-    - lincrf:
-        rettype: integer
-        argument: character varying, character varying, character varying, character varying, character varying, integer, character varying, character varying, integer
-    - lincrf:
-        rettype: integer
-        argument: character varying, character varying, character varying, character varying, character varying, integer, character varying, character varying
-    - lsvm_predict:
-        rettype: double precision
-        argument: text, double precision[]
-    - lsvm_predict_combo:
-        rettype: SETOF schema_madlib.svm_model_pr
-        argument: text, double precision[]
-    - lsvm_sgd_update:
-        rettype: schema_madlib.lsvm_sgd_model_rec
-        argument: schema_madlib.lsvm_sgd_model_rec, double precision[], double precision, double precision, double precision
-    - svm_cls_update:
-        rettype: schema_madlib.svm_model_rec
-        argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision
-    - svm_nd_update:
-        rettype: schema_madlib.svm_model_rec
-        argument: schema_madlib.svm_model_rec, double precision[], text, double precision, double precision
-    - svm_predict:
-        rettype: double precision
-        argument: schema_madlib.svm_model_rec, double precision[], text
-    - svm_predict:
-        rettype: double precision
-        argument: text, double precision[]
-    - svm_predict_combo:
-        rettype: SETOF schema_madlib.svm_model_pr
-        argument: text, double precision[]
-    - svm_predict_sub:
-        rettype: double precision
-        argument: integer, integer, double precision[], double precision[], double precision[], text
-    - svm_reg_update:
-        rettype: schema_madlib.svm_model_rec
-        argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision, double precision
-    - utils_normalize_data:
-        rettype: schema_madlib.__utils_scaled_data
-        argument: double precision[], double precision[], double precision[]
-    - vcrf_top1_label:
-        rettype: integer[]
-        argument: integer[], integer[], integer
-    - vcrf_top1_view:
-        rettype: text
-        argument: text, text, text, text
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    # Removed functions
-    - array_contains_null:
-        rettype: boolean
-        argument: double precision[]
-    - array_sqrt:
-        rettype: anyarray
-        argument: anyarray
-    - coxph_step_final:
-        rettype: double precision[]
-        argument: double precision[]
-    - coxph_step_strata_final:
-        rettype: double precision[]
-        argument: double precision[]
-    - coxph_step_transition:
-        rettype: double precision[]
-        argument: double precision[], double precision[], double precision, boolean, double precision[]
-    - internal_coxph_result:
-        rettype: schema_madlib.coxph_result
-        argument: double precision[]
-    - internal_coxph_step_distance:
-        rettype: double precision
-        argument: double precision[], double precision[]
-    - normalize:
-        rettype: double precision[]
-        argument: double precision[]
-    # Changed functions (return type)
-    # These functions can be recreated correctly even if we don't add them here.
-    # But the view dependency checker needs the information.
-    - __internal_mlogregr_irls_result:
-        rettype: schema_madlib.mlogregr_result
-        argument: double precision[]
-    - __logregr_cg_result:
-        rettype: schema_madlib.__logregr_result
-        argument: double precision[]
-    - __logregr_igd_result:
-        rettype: schema_madlib.__logregr_result
-        argument: double precision[]
-    - __logregr_irls_result:
-        rettype: schema_madlib.__logregr_result
-        argument: double precision[]
-    - linregr_final:
-        rettype: schema_madlib.linregr_result
-        argument: schema_madlib.bytea8
-    - mlogregr:
-        rettype: schema_madlib.mlogregr_result
-        argument: character varying, character varying, character varying, integer, character varying, double precision, integer
-    - mlogregr:
-        rettype: schema_madlib.mlogregr_result
-        argument: character varying, character varying, character varying, integer, character varying
-    - mlogregr:
-        rettype: schema_madlib.mlogregr_result
-        argument: character varying, character varying, character varying, integer
-    - mlogregr:
-        rettype: schema_madlib.mlogregr_result
-        argument: character varying, character varying, character varying
-    # make-ups from upgrade to v1.6
-    - marginal_logregr_step_final:
-        rettype: schema_madlib.marginal_logregr_result
-        argument: double precision[]
-    - mlogregr_marginal_step_final:
-        rettype: schema_madlib.marginal_mlogregr_result
-        argument: double precision[]
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    - cross_validation_general:   # change in name of argument "fold_num"
-        rettype: void
-        argument: character varying, character varying[], character varying[], character varying, character varying[], character varying, character varying[], character varying[], character varying, character varying[], character varying[], character varying, character varying, boolean, character varying, character varying[], integer
-    - lmf_igd_transition:
-        rettype: double precision[]
-        argument: double precision[], smallint, smallint, double precision, double precision[], smallint, smallint, smallint, double precision, double precision
-    # depending on bytea8
-    # return type is bytea8
-    - __clustered_err_lin_merge:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - __clustered_err_lin_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[], double precision[]
-    - __clustered_err_log_merge:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - __clustered_err_log_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, boolean, double precision[], double precision[]
-    - __clustered_err_mlog_merge:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - __clustered_err_mlog_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, integer, double precision[], double precision[], integer, integer
-    - dense_residual_norm_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - dense_residual_norm_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision[], double precision, double precision[]
-    - hetero_linregr_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - hetero_linregr_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[], double precision[]
-    - linregr_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - linregr_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[]
-    - robust_linregr_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - robust_linregr_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[], double precision[]
-    - weighted_sample_merge_int64:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - weighted_sample_merge_vector:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - weighted_sample_transition_int64:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, bigint, double precision
-    - weighted_sample_transition_vector:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision[], double precision
-    # argument type bytea8
-    - __clustered_err_lin_final:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: schema_madlib.bytea8
-    - __clustered_err_log_final:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: schema_madlib.bytea8
-    - __clustered_err_mlog_final:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: schema_madlib.bytea8
-    - dense_residual_norm_final:
-        rettype: schema_madlib.residual_norm_result
-        argument: schema_madlib.bytea8
-    - hetero_linregr_final:
-        rettype: schema_madlib.heteroskedasticity_test_result
-        argument: schema_madlib.bytea8
-    - linregr_final:
-        rettype: schema_madlib.linregr_result
-        argument: schema_madlib.bytea8
-    - robust_linregr_final:
-        rettype: schema_madlib.robust_linregr_result
-        argument: schema_madlib.bytea8
-    - weighted_sample_final_int64:
-        rettype: bigint
-        argument: schema_madlib.bytea8
-    - weighted_sample_final_vector:
-        rettype: double precision[]
-        argument: schema_madlib.bytea8
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    - __filter_input_relation:
-        rettype: character varying
-        argument: character varying, character varying
-    - __lda_util_unnest:
-        rettype: SETOF bigint[]
-        argument: bigint[]
-    - matrix_block_mult:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text
-    - matrix_block_square:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - matrix_block_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - matrix_block_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, boolean
-    - matrix_blockize:
-        rettype: schema_madlib.matrix_result
-        argument: text, integer, integer, text
-    - matrix_densify:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text, text, text, boolean
-    - matrix_mult:
-        rettype: schema_madlib.matrix_result
-        argument: text, boolean, text, boolean, text
-    - matrix_mult:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text, text, boolean, text, text, text, text, boolean, text
-    - matrix_norm:
-        rettype: double precision
-        argument: text
-    - matrix_scale_and_add:
-        rettype: void
-        argument: text, text, double precision, text
-    - matrix_sparsify:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, boolean
-    - matrix_square:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - matrix_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, boolean
-    - matrix_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text, text, text, boolean
-    - matrix_unblockize:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    # ----------------- Changes from 1.8 to 1.9 ----------
-
-
-    - _dt_apply:
-       rettype: schema_madlib._tree_result_type
-       argument: schema_madlib.bytea8,schema_madlib.bytea8,schema_madlib.bytea8,smallint,smallint,smallint,boolean,integer
-
-    - internal_linear_svm_igd_result:
-       rettype: schema_madlib.linear_svm_result
-       argument: double precision[]
-
-    - _prune_and_cplist:
-       rettype: schema_madlib._prune_result_type
-       argument: schema_madlib.bytea8,double precision,boolean
-
-    - __array_elem_in:
-       rettype: boolean[]
-       argument: anyarray, anyarray
-
-    - __array_indexed_agg_ffunc:
-       rettype: double precision[]
-       argument: double precision[]
-
-    - __array_indexed_agg_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __array_indexed_agg_sfunc:
-       rettype: double precision[]
-       argument: double precision[], double precision, bigint, bigint
-
-    - __array_search:
-       rettype: boolean
-       argument: anyelement, anyarray
-
-    - __array_sort:
-       rettype: anyarray
-       argument: anyarray
-
-    - __assert:
-       rettype: void
-       argument: boolean, text
-
-    - __assert_table:
-       rettype: void
-       argument: text, boolean
-
-    - __best_scv_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __best_scv_sfunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[], integer, double precision
-
-    - __bigint_array_add:
-       rettype: bigint[]
-       argument: bigint[], bigint[]
-
-    - __breakup_table:
-       rettype: void
-       argument: text, text, text, text, text, text[], boolean[], integer, integer
-
-    - __check_dt_common_params:
-       rettype: void
-       argument: text, text, text, text, text, text, text, text, integer, double precision, double precision, integer, text
-
-    - __check_training_table:
-       rettype: void
-       argument: text, text[], text[], text, text, integer
-
-    - __column_exists:
-       rettype: boolean
-       argument: text, text
-
-    - __columns_in_table:
-       rettype: boolean
-       argument: text[], text
-
-    - __create_metatable:
-       rettype: void
-       argument: text
-
-    - __create_tree_tables:
-       rettype: void
-       argument: text
-
-    - __csvstr_to_array:
-       rettype: text[]
-       argument: text
-
-    - __display_node_sfunc:
-       rettype: text
-       argument: text, integer, boolean, text, text, double precision, double precision, text, integer
-
-    - __display_tree_no_ordered_aggr:
-       rettype: text
-       argument: text, integer, integer, integer, boolean, double precision, text, integer, integer
-
-    - __distinct_feature_value:
-       rettype: integer
-       argument: text, integer
-
-    - __drop_metatable:
-       rettype: void
-       argument: text
-
-    - __dt_acc_count_sfunc:
-       rettype: bigint[]
-       argument: bigint[], integer, bigint, integer
-
-    - __dt_get_node_split_fids:
-       rettype: integer[]
-       argument: integer, integer, integer, integer[]
-
-    - __ebp_calc_errors:
-       rettype: double precision
-       argument: double precision, double precision, double precision
-
-    - __ebp_prune_tree:
-       rettype: void
-       argument: text
-
-    - __encode_and_train:
-       rettype: record
-       argument: text, text, integer, integer, text, text, text, text, text, text, text, double precision, text, integer, double precision, boolean, double precision, double precision, text, integer
-
-    - __encode_columns:
-       rettype: void
-       argument: text, text, integer, integer
-
-    - __encode_table:
-       rettype: void
-       argument: text, text, text, integer, integer
-
-    - __encode_table:
-       rettype: void
-       argument: text, text, text[], text, text[], text, text, integer, integer
-
-    - __find_best_split:
-       rettype: void
-       argument: text, double precision, text, integer, integer, text, integer, integer
-
-    - __format:
-       rettype: text
-       argument: text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text[]
-
-    - __gen_acc:
-       rettype: __gen_acc_time
-       argument: text, text, text, text, text, integer, integer, boolean, integer
-
-    - __gen_enc_meta_names:
-       rettype: text[]
-       argument: text, text
-
-    - __gen_horizontal_encoded_table:
-       rettype: void
-       argument: text, text, integer, integer
-
-    - __gen_vertical_encoded_table:
-       rettype: void
-       argument: text, text, text, boolean, integer
-
-    - __generate_final_tree:
-       rettype: void
-       argument: text
-
-    - __get_class_column_name:
-       rettype: text
-       argument: text
-
-    - __get_class_value:
-       rettype: text
-       argument: integer, text
-
-    - __get_classtable_name:
-       rettype: text
-       argument: text
-
-    - __get_column_value:
-       rettype: text
-       argument: integer, integer, character, text
-
-    - __get_feature_name:
-       rettype: text
-       argument: integer, text
-
-    - __get_feature_value:
-       rettype: text
-       argument: integer, integer, text
-
-    - __get_features_of_nodes:
-       rettype: text
-       argument: text, text, integer, integer, integer
-
-    - __get_id_column_name:
-       rettype: text
-       argument: text
-
-    - __get_schema_name:
-       rettype: text
-       argument: text
-
-    - __get_table_name:
-       rettype: text
-       argument: text
-
-    - __insert_into_metatable:
-       rettype: void
-       argument: text, integer, text, character, boolean, text, integer
-
-    - __is_valid_enc_table:
-       rettype: boolean
-       argument: text
-
-    - __num_of_class:
-       rettype: integer
-       argument: text
-
-    - __num_of_columns:
-       rettype: integer
-       argument: text
-
-    - __num_of_feature:
-       rettype: integer
-       argument: text
-
-    - __regclass_to_text:
-       rettype: text
-       argument: regclass
-
-    - __rename_table:
-       rettype: void
-       argument: text, text
-
-    - __rep_aggr_class_count_ffunc:
-       rettype: bigint[]
-       argument: bigint[]
-
-    - __rep_aggr_class_count_sfunc:
-       rettype: bigint[]
-       argument: bigint[], integer, integer, integer
-
-    - __rep_prune_tree:
-       rettype: void
-       argument: text, text, integer
-
-    - __sample_with_replacement:
-       rettype: void
-       argument: integer, bigint, text, text
-
-    - __sample_within_range:
-       rettype: SETOF bigint
-       argument: bigint, bigint, bigint
-
-    - __scv_aggr_ffunc:
-       rettype: double precision[]
-       argument: double precision[]
-
-    - __scv_aggr_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __scv_aggr_sfunc:
-       rettype: double precision[]
-       argument: double precision[], integer, boolean, integer, double precision[], double precision[], bigint
-
-    - __strip_schema_name:
-       rettype: text
-       argument: text
-
-    - __svm_random_ind2:
-       rettype: double precision[]
-       argument: integer
-
-    - __svm_random_ind:
-       rettype: double precision[]
-       argument: integer
-
-    - __svm_target_cl_func:
-       rettype: double precision
-       argument: double precision[]
-
-    - __svm_target_reg_func:
-       rettype: double precision
-       argument: double precision[]
-
-    - __table_exists:
-       rettype: boolean
-       argument: text
-
-    - __train_tree:
-       rettype: __train_result
-       argument: text, integer, integer, text, text, text, text, text, text, double precision, integer, double precision, double precision, double precision, boolean, integer, integer
-
-    - __treemodel_classify_internal:
-       rettype: text[]
-       argument: text, text, integer
-
-    - __treemodel_classify_internal_serial:
-       rettype: text[]
-       argument: text, text, integer
-
-    - __treemodel_display_no_ordered_aggr:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - __treemodel_display_with_ordered_aggr:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - __treemodel_get_vote_result:
-       rettype: void
-       argument: text, text
-
-    - __treemodel_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - __validate_input_table:
-       rettype: void
-       argument: text, text[], text, text
-
-    - __validate_metatable:
-       rettype: void
-       argument: text
-
-    - c45_classify:
-       rettype: c45_classify_result
-       argument: text, text, text
-
-    - c45_classify:
-       rettype: c45_classify_result
-       argument: text, text, text, integer
-
-    - c45_clean:
-       rettype: boolean
-       argument: text
-
-    - c45_display:
-       rettype: SETOF text
-       argument: text
-
-    - c45_display:
-       rettype: SETOF text
-       argument: text, integer
-
-    - c45_genrule:
-       rettype: SETOF text
-       argument: text
-
-    - c45_genrule:
-       rettype: SETOF text
-       argument: text, integer
-
-    - c45_score:
-       rettype: double precision
-       argument: text, text
-
-    - c45_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text, text, text, text, text, text, double precision, text
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text, text, text, text, text, text, double precision, text, integer, double precision, double precision, integer
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying, character varying
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying, character varying, boolean
-
-    - linear_svm_igd_transition:
-       rettype: double precision[]
-       argument: double precision[], double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean, boolean, double precision, double precision
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean, boolean, double precision, double precision, integer
-
-    - lsvm_predict:
-       rettype: double precision
-       argument: double precision[], double precision[]
-
-    - lsvm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text
-
-    - lsvm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text, boolean
-
-    - matrix_block_trans:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_densify:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_sparsify:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_trans:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text, boolean, integer
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text, integer
-
-    - rf_clean:
-       rettype: boolean
-       argument: text
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text, integer[]
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - rf_score:
-       rettype: double precision
-       argument: text, text
-
-    - rf_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - rf_train:
-       rettype: rf_train_result
-       argument: text, text, text
-
-    - rf_train:
-       rettype: rf_train_result
-       argument: text, text, text, integer, integer, double precision, text, text, text, text, text, integer, double precision, double precision, integer
-
-    - svdmf_run:
-       rettype: text
-       argument: text, text, text, text, integer
-
-    - svdmf_run:
-       rettype: text
-       argument: text, text, text, text, integer, integer, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, boolean, double precision, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, double precision
-
-    - svm_cls_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision, double precision
-
-    - svm_data_normalization:
-       rettype: void
-       argument: text
-
-    - svm_dot:
-       rettype: double precision
-       argument: double precision[], double precision[]
-
-    - svm_dot:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_drop_model:
-       rettype: void
-       argument: text
-
-    - svm_gaussian:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_generate_cls_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_generate_nd_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_generate_reg_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_nd_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], text, double precision, double precision, double precision
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text, boolean, double precision, double precision
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_polynomial:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_predict:
-       rettype: double precision
-       argument: schema_madlib.svm_model_rec, double precision[], text, double precision
-
-    - svm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text, boolean
-
-    - svm_predict_sub:
-       rettype: double precision
-       argument: integer, integer, double precision[], double precision[], double precision[], text, double precision
-
-    - svm_reg_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision, double precision, double precision
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision, double precision
-
-    - svm_store_model:
-       rettype: void
-       argument: text, text, text
-
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    - array_collapse:
-        rettype: anyarray
-        argument: anyarray
-    - linear_svm_igd_transition:
-        rettype: double precision[]
-        argument: double precision[], double precision[], double precision, double precision[], integer, double precision, double precision, boolean, integer, double precision, boolean
-    - profile:
-        rettype: SETOF schema_madlib.profile_result
-        argument: text
-    - profile_full:
-        rettype: SETOF schema_madlib.profile_result
-        argument: text, integer
-    - profile:
-        rettype: schema_madlib.profile_result
-        argument: text
-    - profile_full:
-        rettype: schema_madlib.profile_result
-        argument: text, integer
-    - quantile:
-        rettype: double precision
-        argument: text, text, double precision
-    - quantile_big:
-        rettype: double precision
-        argument: text, text, double precision
-
-# Changes to aggregates (UDA) including removal and modification
-# Overloaded functions should be mentioned separately
-uda:
-    # ----------------- Changes from 1.2 to 1.3 -----------------
-    - linregr:
-        rettype: schema_madlib.linregr_result
-        argument: double precision, double precision[]
-    - cox_prop_hazards_step:
-         rettype: double precision[]
-         argument: double precision[], double precision, boolean, double precision, double precision[], double precision[], double precision[]
-    # ----------------- Changes from 1.3 to 1.4 -----------------
-    - __lda_count_topic_agg:
-        rettype: integer[]
-        argument: integer[], integer[], integer[], integer, integer
-    - __lda_perplexity_agg:
-        rettype: double precision
-        argument: integer[], integer[], integer[], integer[], double precision, double precision, integer, integer
-    # ----------------- Changes from 1.4 to 1.4.1 ---------------
-    # ----------------- Changes from 1.4.1 to 1.5 ---------------
-    - lsvm_sgd_agg:
-        rettype: schema_madlib.lsvm_sgd_model_rec
-        argument: double precision[], double precision, double precision, double precision
-    - svm_cls_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision
-    - svm_nd_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], text, double precision, double precision
-    - svm_reg_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision, double precision
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    # - coxph_step: not exists in v1.2
-    # - coxph_strata_step_inner: not exists in v1.2
-    # - coxph_strata_step_outer: not exists in v1.2
-    # return type change
-    # - linregr: appeared before
-    # initcond change
-    - __mlogregr_irls_step:
-        rettype: double precision[]
-        argument: integer, integer, integer, double precision[], double precision[]
-    # make-ups from upgrade to v1.6
-    - marginal_logregr:
-        rettype: schema_madlib.marginal_logregr_result
-        argument: boolean, double precision[], double precision[]
-    - marginal_mlogregr:
-        rettype: schema_madlib.marginal_mlogregr_result
-        argument: integer, integer, integer, double precision[], double precision[]
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    - lmf_igd_step:
-        rettype: double precision[]
-        argument: smallint, smallint, double precision, double precision[], smallint, smallint, smallint, double precision, double precision
-    - __clustered_err_lin_step:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: double precision, double precision[], double precision[]
-    - __clustered_err_log_step:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: boolean, double precision[], double precision[]
-    - __clustered_err_mlog_step:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: integer, double precision[], double precision[], integer, integer
-    - dense_residual_norm:
-        rettype: schema_madlib.residual_norm_result
-        argument: double precision[], double precision, double precision[]
-    - heteroskedasticity_test_linregr:
-        rettype: schema_madlib.heteroskedasticity_test_result
-        argument: double precision, double precision[], double precision[]
-    - linregr:
-        rettype: schema_madlib.linregr_result
-        argument: double precision, double precision[]
-    - robust_linregr:
-        rettype: schema_madlib.robust_linregr_result
-        argument: double precision, double precision[], double precision[]
-    - weighted_sample:
-        rettype: double precision[]
-        argument: double precision[], double precision
-    - weighted_sample:
-        rettype: bigint
-        argument: bigint, double precision
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    - discrete_distribution_agg:
-        rettype: double precision[]
-        argument: integer, double precision, integer
-    - vectorized_distribution_agg:
-        rettype: double precision[]
-        argument: integer[], integer[]
-    # ----------------- Changes from 1.8 to 1.9 ----------
-
-    - __array_indexed_agg:
-        rettype: double precision[]
-        argument: double precision, bigint, bigint
-
-    - __best_scv_aggr:
-        rettype: double precision[]
-        argument: double precision[], integer, double precision
-
-    - __bigint_array_sum:
-        rettype: bigint[]
-        argument: bigint[]
-
-    - __display_tree_aggr:
-        rettype: text
-        argument: integer, boolean, text, text, double precision, double precision, text, integer
-
-    - __dt_acc_count_aggr:
-        rettype: bigint[]
-        argument: integer, bigint, integer
-
-    - __rep_aggr_class_count:
-        rettype: bigint[]
-        argument: integer, integer, integer
-
-    - __scv_aggr:
-        rettype: double precision[]
-        argument: integer, boolean, integer, double precision[], double precision[], bigint
-
-    - linear_svm_igd_step:
-        rettype: double precision[]
-        argument: double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - linear_svm_igd_step_serial:
-        rettype: double precision[]
-        argument: double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - svm_cls_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision, double precision
-
-    - svm_nd_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], text, double precision, double precision, double precision
-
-    - svm_reg_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision, double precision, double precision
-
-    - __svm_random_ind2:
-        rettype: double precision[]
-        argument: integer
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    - array_agg:
-        rettype: anyarray
-        argument: anyelement
-    - linear_svm_igd_step:
-       rettype: double precision[]
-       argument: double precision[], double precision, double precision[], integer, double precision, double precision, boolean, integer, double precision, boolean
-
-# Casts (UDC) updated/removed
-udc:
-    # ----------------- Changes from 1.2 to 1.3 -----------------
-    # ----------------- Changes from 1.3 to 1.4 -----------------
-    # ----------------- Changes from 1.4 to 1.4.1 ---------------
-    # ----------------- Changes from 1.4.1 to 1.5 ---------------
-    bool2text:
-        sourcetype: boolean
-        targettype: text
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-
-# Operators (UDO) removed/updated
-udo:
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    - '<':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '<=':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '<>':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '==':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '>=':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '>':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-
-# Operator Classes (UDOC) removed/updated
-udoc:
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    # removed
-    - svec_l2_ops:
-        index: btree
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------

[08/50] [abbrv] incubator-madlib git commit: Madpack: Disable psqlrc when executing queries

Posted by ri...@apache.org.

Madpack: Disable psqlrc when executing queries

JIRA: MADLIB-1053


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/e0439ed8
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/e0439ed8
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/e0439ed8

Branch: refs/heads/latest_release
Commit: e0439ed8d715c4a72dfbf2e450f7467925bf8784
Parents: c564e31
Author: Rahul Iyer <ri...@apache.org>
Authored: Fri Jan 13 10:37:22 2017 -0800
Committer: Rahul Iyer <ri...@apache.org>
Committed: Fri Jan 13 10:37:22 2017 -0800

----------------------------------------------------------------------
 src/madpack/madpack.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/e0439ed8/src/madpack/madpack.py
----------------------------------------------------------------------
diff --git a/src/madpack/madpack.py b/src/madpack/madpack.py
index e08c1c0..6348899 100755
--- a/src/madpack/madpack.py
+++ b/src/madpack/madpack.py
@@ -127,11 +127,13 @@ def run_query(sql, show_error, con_args=con_args):
               '-U', con_args['user'],
               '-F', delimiter,
               '--no-password',
-              '-Ac', "set CLIENT_MIN_MESSAGES=error; " + sql]
+              '--no-psqlrc',
+              '--no-align',
+              '-c', sql]
     runenv = os.environ
     if 'password' in con_args:
         runenv["PGPASSWORD"] = con_args['password']
-    runenv["PGOPTIONS"] = '-c search_path=public -c client_min_messages=notice'
+    runenv["PGOPTIONS"] = '-c search_path=public -c client_min_messages=error'
     std, err = subprocess.Popen(runcmd, env=runenv, stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE).communicate()

[12/50] [abbrv] incubator-madlib git commit: RF: Fixes the online help and example

Posted by ri...@apache.org.

RF: Fixes the online help and example


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/e384c1fc
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/e384c1fc
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/e384c1fc

Branch: refs/heads/latest_release
Commit: e384c1fc7bb27b7c2401b17b6049cee1374fee1a
Parents: 498c559
Author: Orhan Kislal <ok...@pivotal.io>
Authored: Mon Jan 23 15:45:08 2017 -0800
Committer: Orhan Kislal <ok...@pivotal.io>
Committed: Mon Jan 23 15:45:08 2017 -0800

----------------------------------------------------------------------
 .../recursive_partitioning/random_forest.py_in  | 83 +++++++++++---------
 1 file changed, 47 insertions(+), 36 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/e384c1fc/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
index e006a34..0eb5985 100644
--- a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
@@ -103,6 +103,10 @@ SELECT {schema_madlib}.forest_train(
                                 is an positive integer with the default 0.
     verbose,                -- Boolean, whether to print more info,
                               default is False
+    sample_ratio            -- Double precision, in the range of (0, 1], default: 1
+                                If sample_ratio is less than 1, a bootstrap sample
+                                size smaller than the data table is expected to be
+                                used for training each tree in the forest.
 );
 
 ------------------------------------------------------------
@@ -175,44 +179,51 @@ it has the following columns:
 ------------------------------------------------------------
                         EXAMPLE
 ------------------------------------------------------------
-DROP TABLE IF EXISTS dummy_dt_con_src CASCADE;
-CREATE TABLE dummy_dt_con_src (
-    id  INTEGER,
-    cat INTEGER[],
-    con FLOAT8[],
-    y   FLOAT8
+DROP TABLE IF EXISTS dt_golf;
+CREATE TABLE dt_golf (
+    id integer NOT NULL,
+    "OUTLOOK" text,
+    temperature double precision,
+    humidity double precision,
+    windy text,
+    class text
 );
 
-INSERT INTO dummy_dt_src VALUES
-(1, '{0}'::INTEGER[], ARRAY[0], 0.5),
-(2, '{0}'::INTEGER[], ARRAY[1], 0.5),
-(3, '{0}'::INTEGER[], ARRAY[4], 0.5),
-(4, '{0}'::INTEGER[], ARRAY[4], 0.5),
-(5, '{0}'::INTEGER[], ARRAY[4], 0.5),
-(6, '{0}'::INTEGER[], ARRAY[5], 0.1),
-(7, '{0}'::INTEGER[], ARRAY[6], 0.1),
-(8, '{1}'::INTEGER[], ARRAY[9], 0.1);
-(9, '{1}'::INTEGER[], ARRAY[9], 0.1);
-(10, '{1}'::INTEGER[], ARRAY[9], 0.1);
-(11, '{1}'::INTEGER[], ARRAY[9], 0.1);
-
-DROP TABLE IF EXISTS forest_out, forest_out_summary;
-SELECT madlib.forest_train(
-    'dummy_dt_src',
-    'forest_out',
-    'id',
-    'y',
-    'cat, con',
-    '',
-    'mse',
-    NULL::Text,
-    NULL::Text,
-    3,
-    2,
-    1,
-    5);
-
-SELECT madlib.forest_display('forest_out');
+INSERT INTO dt_golf (id,"OUTLOOK",temperature,humidity,windy,class) VALUES
+(1, 'sunny', 85, 85, 'false', 'Don''t Play'),
+(2, 'sunny', 80, 90, 'true', 'Don''t Play'),
+(3, 'overcast', 83, 78, 'false', 'Play'),
+(4, 'rain', 70, 96, 'false', 'Play'),
+(5, 'rain', 68, 80, 'false', 'Play'),
+(6, 'rain', 65, 70, 'true', 'Don''t Play'),
+(7, 'overcast', 64, 65, 'true', 'Play'),
+(8, 'sunny', 72, 95, 'false', 'Don''t Play'),
+(9, 'sunny', 69, 70, 'false', 'Play'),
+(10, 'rain', 75, 80, 'false', 'Play'),
+(11, 'sunny', 75, 70, 'true', 'Play'),
+(12, 'overcast', 72, 90, 'true', 'Play'),
+(13, 'overcast', 81, 75, 'false', 'Play'),
+(14, 'rain', 71, 80, 'true', 'Don''t Play');
+
+DROP TABLE IF EXISTS train_output, train_output_group, train_output_summary;
+SELECT madlib.forest_train('dt_golf',         -- source table
+    'train_output',    -- output model table
+    'id',              -- id column
+    'class',           -- response
+    '"OUTLOOK", temperature, humidity, windy',   -- features
+    NULL,              -- exclude columns
+    NULL,              -- grouping columns
+    20::integer,       -- number of trees
+    2::integer,        -- number of random features
+    TRUE::boolean,     -- variable importance
+    1::integer,        -- num_permutations
+    8::integer,        -- max depth
+    3::integer,        -- min split
+    1::integer,        -- min bucket
+    10::integer        -- number of splits per continuous variable
+);
+SELECT madlib.get_tree('train_output',1,2,FALSE);
+
         """
     else:
         help_string = "No such option. Use {schema_madlib}.forest_train('usage')"

[45/50] [abbrv] incubator-madlib git commit: Release v1.10.0:

Posted by ri...@apache.org.

Release v1.10.0:

Adds Apache license declaration to the new files.
Fixes pom.xml to pass mvn apache-rat:check.

Closes #100


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/ea17530b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/ea17530b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/ea17530b

Branch: refs/heads/latest_release
Commit: ea17530bfe22a1fde173d7fa83508cbcd9924c20
Parents: 97e795d
Author: Satoshi Nagayasu <sa...@gmail.com>
Authored: Sun Feb 12 15:55:31 2017 -0800
Committer: Orhan Kislal <ok...@pivotal.io>
Committed: Sun Feb 12 15:58:17 2017 -0800

----------------------------------------------------------------------
 configure                                       |  2 +-
 pom.xml                                         |  6 +++---
 src/madpack/changelist_1.9_1.10.yaml            | 19 +++++++++++++++++++
 src/ports/greenplum/5/CMakeLists.txt            | 18 ++++++++++++++++++
 src/ports/greenplum/cmake/FindGreenplum_5.cmake | 17 +++++++++++++++++
 5 files changed, 58 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ea17530b/configure
----------------------------------------------------------------------
diff --git a/configure b/configure
index 7164a94..be4126f 100755
--- a/configure
+++ b/configure
@@ -36,7 +36,7 @@ if [ -e META.json ]; then
 	cat <<MAKEFILE >>Makefile
 install: extension-install
 # keep a tab to do nothing for install
-	
+
 extension-install:
 	\$(MAKE) -C build \$@
 MAKEFILE

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ea17530b/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 20d2e0d..3971820 100644
--- a/pom.xml
+++ b/pom.xml
@@ -22,7 +22,7 @@
 
   <groupId>org.apache.madlib</groupId>
   <artifactId>madlib</artifactId>
-  <version>1.9</version>
+  <version>1.10</version>
   <packaging>pom</packaging>
 
   <build>
@@ -643,8 +643,8 @@
               <exclude>src/ports/postgres/modules/utilities/admin.py_in</exclude>
               <exclude>src/ports/postgres/modules/utilities/control.py_in</exclude>
               <exclude>src/ports/postgres/modules/utilities/control_composite.py_in</exclude>
-              <exclude>src/ports/postgres/modules/utilities/data_preparation.py_in</exclude>
-              <exclude>src/ports/postgres/modules/utilities/data_preparation.sql_in</exclude>
+              <exclude>src/ports/postgres/modules/utilities/create_indicators.py_in</exclude>
+              <exclude>src/ports/postgres/modules/utilities/create_indicators.sql_in</exclude>
               <exclude>src/ports/postgres/modules/utilities/group_control.py_in</exclude>
               <exclude>src/ports/postgres/modules/utilities/in_mem_group_control.py_in</exclude>
               <exclude>src/ports/postgres/modules/utilities/math_utils.py_in</exclude>

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ea17530b/src/madpack/changelist_1.9_1.10.yaml
----------------------------------------------------------------------
diff --git a/src/madpack/changelist_1.9_1.10.yaml b/src/madpack/changelist_1.9_1.10.yaml
index 41082ad..8d1a773 100644
--- a/src/madpack/changelist_1.9_1.10.yaml
+++ b/src/madpack/changelist_1.9_1.10.yaml
@@ -1,3 +1,22 @@
+# ------------------------------------------------------------------------------
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# ------------------------------------------------------------------------------
+
 # Changelist for MADlib version 1.9 to 1.10
 
 # This file contains all changes that were introduced in a new version of

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ea17530b/src/ports/greenplum/5/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/ports/greenplum/5/CMakeLists.txt b/src/ports/greenplum/5/CMakeLists.txt
index 015d76e..b93df9e 100644
--- a/src/ports/greenplum/5/CMakeLists.txt
+++ b/src/ports/greenplum/5/CMakeLists.txt
@@ -1 +1,19 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
 add_current_greenplum_version()

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/ea17530b/src/ports/greenplum/cmake/FindGreenplum_5.cmake
----------------------------------------------------------------------
diff --git a/src/ports/greenplum/cmake/FindGreenplum_5.cmake b/src/ports/greenplum/cmake/FindGreenplum_5.cmake
index fe15861..c5ea28f 100644
--- a/src/ports/greenplum/cmake/FindGreenplum_5.cmake
+++ b/src/ports/greenplum/cmake/FindGreenplum_5.cmake
@@ -1,2 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 set(_FIND_PACKAGE_FILE "${CMAKE_CURRENT_LIST_FILE}")
 include("${CMAKE_CURRENT_LIST_DIR}/FindGreenplum.cmake")

[07/50] [abbrv] incubator-madlib git commit: Build: Update madpack versioning to include _ and +

Posted by ri...@apache.org.

Build: Update madpack versioning to include _ and +


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/c564e31d
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/c564e31d
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/c564e31d

Branch: refs/heads/latest_release
Commit: c564e31d70c0848cc2a2786dca5c6f1a90c2f20f
Parents: 3cf3f67
Author: Rahul Iyer <ri...@apache.org>
Authored: Thu Jan 12 12:08:37 2017 -0800
Committer: Rahul Iyer <ri...@apache.org>
Committed: Thu Jan 12 12:08:37 2017 -0800

----------------------------------------------------------------------
 RELEASE_NOTES                          |  3 +--
 ReadMe.txt                             |  4 ++--
 deploy/PGXN/META.json.in               |  8 ++++----
 deploy/gppkg/gppkg_spec.yml.in         |  2 +-
 deploy/hawq_install.sh                 |  2 +-
 doc/etc/header.html                    |  4 ++--
 src/madpack/madpack.py                 | 30 ++++++++++++++++-------------
 src/modules/stats/chi_squared_test.cpp |  3 +--
 8 files changed, 29 insertions(+), 27 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c564e31d/RELEASE_NOTES
----------------------------------------------------------------------
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index 420c421..6eb5c22 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -7,8 +7,7 @@ with most recent versions listed at the top.
 A complete list of changes for each release can be obtained by viewing the git
 commit history located at https://github.com/madlib/madlib/commits/master.
 
-Current list of bugs and issues can be found at http://jira.madlib.net.
-
+Current list of bugs and issues can be found at https://issues.apache.org/jira/browse/MADLIB.
 —-------------------------------------------------------------------------
 MADlib v1.9.1
 

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c564e31d/ReadMe.txt
----------------------------------------------------------------------
diff --git a/ReadMe.txt b/ReadMe.txt
index 3b1a0e0..2116d7f 100644
--- a/ReadMe.txt
+++ b/ReadMe.txt
@@ -5,13 +5,13 @@ MADlib is an open-source library for scalable in-database analytics.
 It provides data-parallel implementations of mathematical, statistical
 and machine learning methods for structured and unstructured data.
 
-See the project web site located at http://madlib.net for links to the latest
+See the project web site located at http://madlib.incubator.apache.org/ for links to the latest
 binary and source packages.
 
 For installation and contribution guides, please see the MADlib wiki at
 https://github.com/madlib/madlib/wiki.
 
-The latest documentation of MADlib modules can be found at http://doc.madlib.net
+The latest documentation of MADlib modules can be found at http://madlib.incubator.apache.org/docs
 or can be accessed directly from the MADlib installation directory by opening
 doc/user/html/index.html.
 

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c564e31d/deploy/PGXN/META.json.in
----------------------------------------------------------------------
diff --git a/deploy/PGXN/META.json.in b/deploy/PGXN/META.json.in
index 0e027eb..47dd9cb 100644
--- a/deploy/PGXN/META.json.in
+++ b/deploy/PGXN/META.json.in
@@ -13,13 +13,13 @@
         }
     },
     "resources": {
-        "homepage": "http://madlib.net/",
+        "homepage": "http://madlib.incubator.apache.org/",
         "bugtracker": {
-            "web": "http://jira.madlib.net/"
+            "web": "https://issues.apache.org/jira/browse/MADLIB/"
         },
         "repository": {
-            "url":  "https://github.com/madlib/madlib.git",
-            "web":  "https://github.com/madlib/madlib",
+            "url":  "https://github.com/apache/incubator-madlib.git",
+            "web":  "https://github.com/apache/incubator-madlib",
             "type": "git"
         }
     },

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c564e31d/deploy/gppkg/gppkg_spec.yml.in
----------------------------------------------------------------------
diff --git a/deploy/gppkg/gppkg_spec.yml.in b/deploy/gppkg/gppkg_spec.yml.in
index 6ecf6c5..1212a75 100644
--- a/deploy/gppkg/gppkg_spec.yml.in
+++ b/deploy/gppkg/gppkg_spec.yml.in
@@ -16,4 +16,4 @@ PostInstall:
            echo '       instead of \"install\" ';
            echo 'For additional options run:';
            echo '$ madpack --help';
-           echo 'Release notes and additional documentation can be found at http://madlib.net';"
+           echo 'Release notes and additional documentation can be found at http://madlib.incubator.apache.org';"

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c564e31d/deploy/hawq_install.sh
----------------------------------------------------------------------
diff --git a/deploy/hawq_install.sh b/deploy/hawq_install.sh
index 23220e1..3ac7985 100755
--- a/deploy/hawq_install.sh
+++ b/deploy/hawq_install.sh
@@ -195,5 +195,5 @@ echo "        This will install MADlib objects into a Greenplum database named \
 echo "        running on server \"mdw\" on port 5432. Installer will try to login as \"gpadmin\""
 echo "        and will prompt for password. The target schema will be \"madlib\"."
 echo "For additional options run: madpack --help"
-echo "Release notes and additional documentation can be found at http://madlib.net"
+echo "Release notes and additional documentation can be found at http://madlib.incubator.apache.org/"
 

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c564e31d/doc/etc/header.html
----------------------------------------------------------------------
diff --git a/doc/etc/header.html b/doc/etc/header.html
index c9b93b7..679bfca 100644
--- a/doc/etc/header.html
+++ b/doc/etc/header.html
@@ -27,7 +27,7 @@ $extrastylesheet
   m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
   })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
 
-  ga('create', 'UA-45382226-1', 'madlib.net');
+  ga('create', 'UA-45382226-1', 'madlib.incubator.apache.org');
   ga('send', 'pageview');
 
 </script>
@@ -41,7 +41,7 @@ $extrastylesheet
  <tbody>
  <tr style="height: 56px;">
   <!--BEGIN PROJECT_LOGO-->
-  <td id="projectlogo"><a href="http://madlib.net"><img alt="Logo" src="$relpath^$projectlogo" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
+  <td id="projectlogo"><a href="http://madlib.incubator.apache.org"><img alt="Logo" src="$relpath^$projectlogo" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
   <!--END PROJECT_LOGO-->
   <!--BEGIN PROJECT_NAME-->
   <td style="padding-left: 0.5em;">

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c564e31d/src/madpack/madpack.py
----------------------------------------------------------------------
diff --git a/src/madpack/madpack.py b/src/madpack/madpack.py
index 732eabd..e08c1c0 100755
--- a/src/madpack/madpack.py
+++ b/src/madpack/madpack.py
@@ -26,8 +26,8 @@ py_min_ver = [2, 6]
 
 # Check python version
 if sys.version_info[:2] < py_min_ver:
-    print "ERROR: python version too old (%s). You need %s or greater." \
-          % ('.'.join(str(i) for i in sys.version_info[:3]), '.'.join(str(i) for i in py_min_ver))
+    print("ERROR: python version too old (%s). You need %s or greater." %
+          ('.'.join(str(i) for i in sys.version_info[:3]), '.'.join(str(i) for i in py_min_ver)))
     exit(1)
 
 # Find MADlib root directory. This file is installed to
@@ -417,21 +417,19 @@ def _get_rev_num(rev):
                 Valid inputs:
                     1.9.0, 1.10.0, 2.5.0
                     1.0.0-alpha, 1.0.0-alpha.1, 1.0.0-0.3.7, 1.0.0-x.7.z.92
-
+                    1.0.0+20130313144700, 1.0.0-beta+exp.sha.5114f85
     """
     try:
-        rev_parts = rev.split('-')  # text to the right of - is treated as single str
+        rev_parts = re.split('[-+_]', rev)
         # get numeric part of the version string
         num = [int(i) for i in rev_parts[0].split('.')]
         num += [0] * (3 - len(num))  # normalize num to be of length 3
         # get identifier part of the version string
         if len(rev_parts) > 1:
-            num.append(str(rev_parts[1]))
-
-        if num:
-            return num
-        else:
-            return [0]
+            num.extend(map(str, rev_parts[1:]))
+        if not num:
+            num = [0]
+        return num
     except:
         # invalid revision
         return [0]
@@ -653,7 +651,7 @@ def _db_upgrade(schema, dbrev):
                 _info("""Dependency on 'linregr_result' could be due to objects
                         created from the output of the aggregate 'linregr'.
                         Please refer to the Linear Regression documentation
-                        <http://doc.madlib.net/latest/group__grp__linreg.html#warning>
+                        <http://madlib.incubator.apache.org/docs/latest/group__grp__linreg.html#warning>
                         for the recommended solution.
                         """, False)
             abort = True
@@ -1443,7 +1441,10 @@ class RevTest(unittest.TestCase):
         self.assertTrue(_get_rev_num('4.3.10') >= _get_rev_num('4.3.5'))
         self.assertTrue(_get_rev_num('1.9.10-dev') >= _get_rev_num('1.9.9'))
         self.assertNotEqual(_get_rev_num('1.9.10-dev'), _get_rev_num('1.9.10'))
-        self.assertEqual(_get_rev_num('1.9.10'), _get_rev_num('1.9.10'))
+        self.assertEqual(_get_rev_num('1.9.10'), [1, 9, 10])
+        self.assertEqual(_get_rev_num('1.0.0+20130313144700'), [1, 0, 0, '20130313144700'])
+        self.assertNotEqual(_get_rev_num('1.0.0+20130313144700'),
+                            _get_rev_num('1.0.0-beta+exp.sha.5114f85'))
 
     def test_is_rev_gte(self):
         # 1.0.0-alpha < 1.0.0-alpha.1 < 1.0.0-alpha.beta <
@@ -1457,7 +1458,7 @@ class RevTest(unittest.TestCase):
         self.assertTrue(_is_rev_gte(_get_rev_num('1.9.1'), _get_rev_num('1.9.0')))
         self.assertTrue(_is_rev_gte(_get_rev_num('1.9.1'), _get_rev_num('1.9')))
         self.assertTrue(_is_rev_gte(_get_rev_num('1.9.0'), _get_rev_num('1.9.0-dev')))
-        self.assertTrue(_is_rev_gte(_get_rev_num('1.9.1'), _get_rev_num('1.9.0-dev')))
+        self.assertTrue(_is_rev_gte(_get_rev_num('1.9.1'), _get_rev_num('1.9-dev')))
         self.assertTrue(_is_rev_gte(_get_rev_num('1.9.0-dev'), _get_rev_num('1.9.0-dev')))
         self.assertTrue(_is_rev_gte([1, 9, 'rc', 1], [1, 9, 'dev', 0]))
 
@@ -1472,6 +1473,9 @@ class RevTest(unittest.TestCase):
         self.assertFalse(_is_rev_gte([1, 9, '0.2'], [1, 9, '0.3']))
         self.assertFalse(_is_rev_gte([1, 9, 'build2'], [1, 9, 'build3']))
 
+        self.assertFalse(_is_rev_gte(_get_rev_num('1.0.0+20130313144700'),
+                                     _get_rev_num('1.0.0-beta+exp.sha.5114f85')))
+
 
 # ------------------------------------------------------------------------------
 # Start Here

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c564e31d/src/modules/stats/chi_squared_test.cpp
----------------------------------------------------------------------
diff --git a/src/modules/stats/chi_squared_test.cpp b/src/modules/stats/chi_squared_test.cpp
index e6c8105..f87f9b9 100644
--- a/src/modules/stats/chi_squared_test.cpp
+++ b/src/modules/stats/chi_squared_test.cpp
@@ -67,7 +67,6 @@ updateSumSquaredDeviations(double &ioLeftNumRows, double &ioLeftSumExp,
         return;
 
     // FIXME: Use compensated sums for numerical stability
-    // http://jira.madlib.net/browse/MADLIB-501
     ioLeftSumSquaredDeviations
            += inRightSumSquaredDeviations
             + ioLeftSumExp * inRightSumObsSquareOverExp
@@ -98,7 +97,7 @@ chi2_gof_test_transition::run(AnyType &args) {
     if (observed < 0)
         throw std::invalid_argument("Number of observations must be "
             "nonnegative.");
-    else if (expected < 0) 
+    else if (expected < 0)
         throw std::invalid_argument("Value of expected (count or probability) "
          "must be nonnegative.");
     else if (df < 0)

[31/50] [abbrv] incubator-madlib git commit: Kmeans: Avoid schema name in type match

Posted by ri...@apache.org.

Kmeans: Avoid schema name in type match


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/735dc35c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/735dc35c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/735dc35c

Branch: refs/heads/latest_release
Commit: 735dc35c06a51a38949f68a58e190861c1791b0d
Parents: 8278da8
Author: Rahul Iyer <ri...@apache.org>
Authored: Mon Feb 6 13:34:46 2017 -0800
Committer: Rahul Iyer <ri...@apache.org>
Committed: Mon Feb 6 13:34:46 2017 -0800

----------------------------------------------------------------------
 src/ports/postgres/modules/kmeans/kmeans.py_in | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/735dc35c/src/ports/postgres/modules/kmeans/kmeans.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/kmeans/kmeans.py_in b/src/ports/postgres/modules/kmeans/kmeans.py_in
index e62aab5..d6101f2 100644
--- a/src/ports/postgres/modules/kmeans/kmeans.py_in
+++ b/src/ports/postgres/modules/kmeans/kmeans.py_in
@@ -49,10 +49,10 @@ def kmeans_validate_expr(schema_madlib, rel_source, expr_point, **kwargs):
     expr_type = get_expr_type(expr_point, rel_source).lower()
 
     # Both formats should return a numeric array type
-    if expr_type in ['smallint[]', 'integer[]', 'bigint[]', 'decimal[]',
-                     'numeric[]', 'real[]', 'double precision[]',
-                     'serial[]', 'bigserial[]', 'float8[]',
-                     schema_madlib + '.svec']:
+    if expr_type.endswith(('smallint[]', 'integer[]', 'bigint[]', 'decimal[]',
+                           'numeric[]', 'real[]', 'double precision[]',
+                           'serial[]', 'bigserial[]', 'float8[]',
+                           'svec')):
         if columns_exist_in_table(rel_source, [expr_point]):
             # An array expression would fail this check
             return False

[06/50] [abbrv] incubator-madlib git commit: Build: Exclude AggCheckCallContext for GPDB5

Posted by ri...@apache.org.

Build: Exclude AggCheckCallContext for GPDB5

- Adds build files to compile MADlib with GPDB5
- GPDB5 cherrypicked AggCheckCallContext, we have to exclude it for GPDB5 builds

Closes #83


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/3cf3f677
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/3cf3f677
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/3cf3f677

Branch: refs/heads/latest_release
Commit: 3cf3f6771ab51dd26605ce4d70cd70aee5d896dd
Parents: e75a944
Author: Dave Cramer <da...@gmail.com>
Authored: Wed Jan 11 15:17:01 2017 -0800
Committer: Rahul Iyer <ri...@apache.org>
Committed: Wed Jan 11 15:17:01 2017 -0800

----------------------------------------------------------------------
 src/ports/greenplum/5.0/CMakeLists.txt            | 1 +
 src/ports/greenplum/cmake/FindGreenplum_5_0.cmake | 2 ++
 src/ports/greenplum/dbconnector/Compatibility.hpp | 3 ++-
 3 files changed, 5 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/3cf3f677/src/ports/greenplum/5.0/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/ports/greenplum/5.0/CMakeLists.txt b/src/ports/greenplum/5.0/CMakeLists.txt
new file mode 100644
index 0000000..015d76e
--- /dev/null
+++ b/src/ports/greenplum/5.0/CMakeLists.txt
@@ -0,0 +1 @@
+add_current_greenplum_version()

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/3cf3f677/src/ports/greenplum/cmake/FindGreenplum_5_0.cmake
----------------------------------------------------------------------
diff --git a/src/ports/greenplum/cmake/FindGreenplum_5_0.cmake b/src/ports/greenplum/cmake/FindGreenplum_5_0.cmake
new file mode 100644
index 0000000..fe15861
--- /dev/null
+++ b/src/ports/greenplum/cmake/FindGreenplum_5_0.cmake
@@ -0,0 +1,2 @@
+set(_FIND_PACKAGE_FILE "${CMAKE_CURRENT_LIST_FILE}")
+include("${CMAKE_CURRENT_LIST_DIR}/FindGreenplum.cmake")

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/3cf3f677/src/ports/greenplum/dbconnector/Compatibility.hpp
----------------------------------------------------------------------
diff --git a/src/ports/greenplum/dbconnector/Compatibility.hpp b/src/ports/greenplum/dbconnector/Compatibility.hpp
index a9424b4..dcffdfb 100644
--- a/src/ports/greenplum/dbconnector/Compatibility.hpp
+++ b/src/ports/greenplum/dbconnector/Compatibility.hpp
@@ -27,6 +27,7 @@ namespace {
 	SearchSysCache(cacheId, key1, 0, 0, 0)
 #endif
 
+#if (GP_VERSION_NUM < 40399)
 /*
  * In commit 2d4db3675fa7a2f4831b755bc98242421901042f,
  * by Tom Lane <tg...@sss.pgh.pa.us> Wed, 6 Jun 2007 23:00:50 +0000,
@@ -75,7 +76,7 @@ AggCheckCallContext(FunctionCallInfo fcinfo, MemoryContext *aggcontext) {
 		*aggcontext = NULL;
 	return 0;
 }
-
+#endif // GP_VERSION_NUM < 40399
 } // namespace
 
 inline ArrayType* madlib_construct_array

[26/50] [abbrv] incubator-madlib git commit: Fixes the knn documentation location for doxygen

Posted by ri...@apache.org.

Fixes the knn documentation location for doxygen


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/afb0e236
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/afb0e236
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/afb0e236

Branch: refs/heads/latest_release
Commit: afb0e2365693d597a4a7527652cd16d71a31d550
Parents: 61f3c5f
Author: Orhan Kislal <ok...@pivotal.io>
Authored: Wed Feb 1 17:38:25 2017 -0800
Committer: Orhan Kislal <ok...@pivotal.io>
Committed: Wed Feb 1 17:38:25 2017 -0800

----------------------------------------------------------------------
 doc/mainpage.dox.in | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/afb0e236/doc/mainpage.dox.in
----------------------------------------------------------------------
diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in
index 85a5d8d..3b9c472 100644
--- a/doc/mainpage.dox.in
+++ b/doc/mainpage.dox.in
@@ -255,11 +255,10 @@ Interface and implementation are subject to change.
     @defgroup grp_sample Random Sampling
 
     @defgroup grp_nene Nearest Neighbors
-    @ingroup grp_super
-    @{A collection of methods to create nearest neigbor based models.@}
+    @{A collection of methods to create nearest neighbor based models.
 
         @defgroup grp_knn k-Nearest Neighbors
-        @ingroup grp_nene
+    @}
 @}
 
 @defgroup grp_deprecated Deprecated Modules

[11/50] [abbrv] incubator-madlib git commit: Graph: SSSP

Posted by ri...@apache.org.

Graph: SSSP

JIRA: MADLIB-992

- Introduces a new module: Graph.
- Implements the single source shortest path algorithm (Bellman-Ford).

Closes #78


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/498c5590
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/498c5590
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/498c5590

Branch: refs/heads/latest_release
Commit: 498c55906a56e4c6d3d561a088d9c113994368cf
Parents: 02a7ef4
Author: Orhan Kislal <ok...@pivotal.io>
Authored: Thu Jan 19 11:51:05 2017 -0800
Committer: Orhan Kislal <ok...@pivotal.io>
Committed: Thu Jan 19 13:25:08 2017 -0800

----------------------------------------------------------------------
 doc/design/design.tex                           |   1 +
 doc/design/modules/graph.tex                    |  71 +++
 doc/literature.bib                              |  15 +
 doc/mainpage.dox.in                             |   4 +
 src/config/Modules.yml                          |   1 +
 src/ports/postgres/modules/graph/__init__.py_in |   0
 src/ports/postgres/modules/graph/sssp.py_in     | 432 +++++++++++++++++++
 src/ports/postgres/modules/graph/sssp.sql_in    | 289 +++++++++++++
 .../postgres/modules/graph/test/sssp.sql_in     |  78 ++++
 .../postgres/modules/utilities/pivot.sql_in     |  84 ++--
 10 files changed, 933 insertions(+), 42 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/498c5590/doc/design/design.tex
----------------------------------------------------------------------
diff --git a/doc/design/design.tex b/doc/design/design.tex
index 516a23a..30e0ef7 100644
--- a/doc/design/design.tex
+++ b/doc/design/design.tex
@@ -229,6 +229,7 @@
 \input{modules/decision-trees}
 \input{modules/random-forests}
 \input{modules/SVM}
+\input{modules/graph}
 \printbibliography
 
 \end{document}

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/498c5590/doc/design/modules/graph.tex
----------------------------------------------------------------------
diff --git a/doc/design/modules/graph.tex b/doc/design/modules/graph.tex
new file mode 100644
index 0000000..223d8b5
--- /dev/null
+++ b/doc/design/modules/graph.tex
@@ -0,0 +1,71 @@
+% When using TeXShop on the Mac, let it know the root document. The following must be one of the first 20 lines.
+% !TEX root = ../design.tex
+
+\chapter[Graph]{Graph}
+
+\begin{moduleinfo}
+\item[Author] \href{mailto:okislal@pivotal.io}{Orhan Kislal}
+\item[History]
+	\begin{modulehistory}
+		\item[v0.1] Initial version, SSSP only.
+	\end{modulehistory}
+\end{moduleinfo}
+
+
+% Abstract. What is the problem we want to solve?
+
+This module implements various graph algorithms that are used in a number of applications such as social networks, telecommunications and road networks.
+
+% \section{Graph Representation} \label{sec:graph:rep}
+
+% Our graph representation depends on two structures, a \emph{vertex} table and an \emph{edge} table.
+
+\section{Single Source Shortest Path} \label{sec:graph:sssp}
+
+Given a graph and a source vertex, the single source shortest path (SSSP) algorithm finds a path from the source to every vertex such that the sum of the weights of its constituent edges is minimized.
+
+Shortest path is defined as follows. Let $e_{i,j}$ be the edge from vertex $i$ to vertex $j$ and $w_{i,j}$ be its weight. Given a graph $G$, the shortest path from $s$ to $d$ is the path $P = (v_1, v_2, \dots, v_n)$ (where $v_1=s$ and $v_n=d$) that, over all possible $n$, minimizes the sum $\sum_{i=1}^{n-1} w_{v_i,v_{i+1}}$.
+
+% \subsection{Bellman Ford Algorithm}
+
+Bellman-Ford Algorithm \cite{bellman1958routing,ford1956network} is based on the following idea: We start with a naive approximation for the cost of reaching every vertex. At each iteration, these values are refined based on the edge list and the existing approximations. If there are no refinements at any given step, the algorithm returns the calculated results. If the algorithm does not converge in $|V|-1$ iterations, this indicates the existence of a negative cycle in the graph.
+
+
+\begin{algorithm}[SSSP$(V,E,start)$] \label{alg:sssp}
+\alginput{Vertex set $V$, edge set $E$, starting vertex $start$}
+\algoutput{Distance and parent set for every vertex $cur$}
+\begin{algorithmic}[1]
+	\State $toupdate(0) \set (start,0,start)$
+	\For{every $i \in 0\dots|V|-1$}
+		\For{every tuple $t \in toupdate(i)$} \label{alg:sssp:update}
+			\For{every edge $e \mid e.src = t.id$}
+		 		\State $local \set e.val + t.val$
+		 		\If{$local < toupdate(i+1,e.dest).val$} \label{alg:sssp:single}
+		 			\State $toupdate(i+1,e.dest) \set (local,e.src)$
+		 		\EndIf
+			\EndFor
+		\EndFor
+		\For{every tuple $t \in toupdate(i+1)$}
+		 	\If{$t.val < cur(t.id).val$}
+		 		\State $cur(t.id) \set (t.val,t.parent)$
+		 	\EndIf
+		\EndFor
+	\EndFor
+\end{algorithmic}
+\end{algorithm}
+
+\begin{description}
+\item edge: (src,dest,val). The edges of the graph.
+\item cur: id -> (val,parent). The intermediate SSSP results.
+\item toupdate: iter -> (id -> (val,parent)). The set of updates.
+\end{description}
+
+Changes from the standard Bellman-Ford algorithm:
+
+\begin{description}
+\item Line~\ref{alg:sssp:update}: We only check the vertices that have been updated in the last iteration.
+\item Line~\ref{alg:sssp:single}: At each iteration, we update a given vertex only one time. This means the toupdate set cannot contain multiple records for the same vertex which requires the comparison with the existing value.
+\end{description}
+
+This is not a 1-to-1 pseudocode for the implementation since we don't compare the \texttt{toupdate} table records one by one but calculate the overall minimum. In addition, the comparison with \texttt{cur} values takes place earlier to reduce the number of tuples in the \texttt{toupdate} table.
+

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/498c5590/doc/literature.bib
----------------------------------------------------------------------
diff --git a/doc/literature.bib b/doc/literature.bib
index 64f5b0d..0353c23 100644
--- a/doc/literature.bib
+++ b/doc/literature.bib
@@ -892,3 +892,18 @@ Applied Survival Analysis},
  pages = {1443--1471},
 }
 
+@article{bellman1958routing,
+  title={On a routing problem},
+  author={Bellman, Richard},
+  journal={Quarterly of applied mathematics},
+  pages={87--90},
+  year={1958},
+  publisher={JSTOR}
+}
+
+@techreport{ford1956network,
+  title={Network flow theory},
+  author={Ford Jr, Lester R},
+  year={1956},
+  institution={DTIC Document}
+}

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/498c5590/doc/mainpage.dox.in
----------------------------------------------------------------------
diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in
index b85ef15..0e846e1 100644
--- a/doc/mainpage.dox.in
+++ b/doc/mainpage.dox.in
@@ -119,6 +119,10 @@ complete matrix stored as a distributed table.
 
         @defgroup grp_stemmer Stemming
         @ingroup grp_datatrans
+@defgroup grp_graph Graph
+@{Contains graph algorithms. @}
+    @defgroup grp_sssp Single Source Shortest Path
+    @ingroup grp_graph
 
 @defgroup grp_mdl Model Evaluation
 @{Contains functions for evaluating accuracy and validation of predictive methods. @}

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/498c5590/src/config/Modules.yml
----------------------------------------------------------------------
diff --git a/src/config/Modules.yml b/src/config/Modules.yml
index fd3c5e6..c3315b6 100644
--- a/src/config/Modules.yml
+++ b/src/config/Modules.yml
@@ -14,6 +14,7 @@ modules:
     - name: elastic_net
     - name: glm
       depends: ['utilities']
+    - name: graph
     - name: kmeans
       depends: ['array_ops', 'svec_util', 'sample']
     - name: lda

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/498c5590/src/ports/postgres/modules/graph/__init__.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/__init__.py_in b/src/ports/postgres/modules/graph/__init__.py_in
new file mode 100644
index 0000000..e69de29

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/498c5590/src/ports/postgres/modules/graph/sssp.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/sssp.py_in b/src/ports/postgres/modules/graph/sssp.py_in
new file mode 100644
index 0000000..558ec3d
--- /dev/null
+++ b/src/ports/postgres/modules/graph/sssp.py_in
@@ -0,0 +1,432 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Single Source Shortest Path
+
+# Please refer to the sssp.sql_in file for the documentation
+
+"""
+@file sssp.py_in
+
+@namespace graph
+"""
+
+import plpy
+from utilities.control import MinWarning
+from utilities.utilities import _assert
+from utilities.utilities import extract_keyvalue_params
+from utilities.utilities import unique_string
+from utilities.validate_args import get_cols
+from utilities.validate_args import unquote_ident
+from utilities.validate_args import table_exists
+from utilities.validate_args import columns_exist_in_table
+from utilities.validate_args import table_is_empty
+
+m4_changequote(`<!', `!>')
+
+def graph_sssp(schema_madlib, vertex_table, vertex_id, edge_table,
+		edge_args, source_vertex, out_table, **kwargs):
+	"""
+    Single source shortest path function for graphs using the Bellman-Ford
+    algorithm [1].
+    Args:
+        @param vertex_table     Name of the table that contains the vertex data.
+        @param vertex_id        Name of the column containing the vertex ids.
+        @param edge_table       Name of the table that contains the edge data.
+        @param edge_args        A comma-delimited string containing multiple
+        						named arguments of the form "name=value".
+        @param source_vertex    The source vertex id for the algorithm to start.
+        @param out_table   	    Name of the table to store the result of SSSP.
+
+    [1] https://en.wikipedia.org/wiki/Bellman-Ford_algorithm
+    """
+
+	with MinWarning("warning"):
+
+		INT_MAX = 2147483647
+		EPSILON = 0.000001
+
+		message = unique_string(desp='message')
+
+		oldupdate = unique_string(desp='oldupdate')
+		newupdate = unique_string(desp='newupdate')
+
+		params_types = {'src': str, 'dest': str, 'weight': str}
+		default_args = {'src': 'src', 'dest': 'dest', 'weight': 'weight'}
+		edge_params = extract_keyvalue_params(edge_args,
+                                            params_types,
+                                            default_args)
+		if vertex_id is None:
+			vertex_id = "id"
+
+		src = edge_params["src"]
+		dest = edge_params["dest"]
+		weight = edge_params["weight"]
+
+		distribution = m4_ifdef(<!__POSTGRESQL__!>, <!''!>,
+			<!"DISTRIBUTED BY ({0})".format(vertex_id)!>)
+		local_distribution = m4_ifdef(<!__POSTGRESQL__!>, <!''!>,
+			<!"DISTRIBUTED BY (id)"!>)
+
+		validate_graph_coding(vertex_table, vertex_id, edge_table,
+			edge_params, source_vertex, out_table)
+
+		plpy.execute(" DROP TABLE IF EXISTS {0},{1},{2}".format(
+			message,oldupdate,newupdate))
+
+		# We keep a table of every vertex, the minimum cost to that destination
+		# seen so far and the parent to this vertex in the associated shortest
+		# path. This table will be updated throughout the execution.
+		plpy.execute(
+			""" CREATE TABLE {out_table} AS
+				SELECT {vertex_id} AS {vertex_id},
+					CAST('Infinity' AS DOUBLE PRECISION) AS {weight},
+					NULL::INT AS parent
+				FROM {vertex_table}
+				WHERE {vertex_id} IS NOT NULL
+				{distribution} """.format(**locals()))
+
+		# We keep 2 update tables and alternate them during the execution.
+		# This is necessary since we need to know which vertices are updated in
+		# the previous iteration to calculate the next set of updates.
+		plpy.execute(
+			""" CREATE TEMP TABLE {oldupdate}(
+				id INT, val DOUBLE PRECISION, parent INT)
+				{local_distribution}
+				""".format(**locals()))
+		plpy.execute(
+			""" CREATE TEMP TABLE {newupdate}(
+				id INT, val DOUBLE PRECISION, parent INT)
+				{local_distribution}
+				""".format(**locals()))
+
+		# Since HAWQ does not allow us to update, we create a new table and
+		# rename at every iteration
+		temp_table = unique_string(desp='temp')
+		sql = m4_ifdef(<!__HAWQ__!>,
+			""" CREATE TABLE {temp_table} (
+					{vertex_id} INT, {weight} DOUBLE PRECISION, parent INT)
+					{distribution};
+			""",  <!''!>)
+		plpy.execute(sql.format(**locals()))
+
+		# GPDB and HAWQ have distributed by clauses to help them with indexing.
+		# For Postgres we add the indices manually.
+		sql_index = m4_ifdef(<!__POSTGRESQL__!>,
+			<!""" CREATE INDEX ON {out_table} ({vertex_id});
+				CREATE INDEX ON {oldupdate} (id);
+				CREATE INDEX ON {newupdate} (id);
+			""".format(**locals())!>,
+			<!''!>)
+		plpy.execute(sql_index)
+
+		# The source can be reached with 0 cost and it has itself as the parent.
+		plpy.execute(
+			""" INSERT INTO {oldupdate}
+				VALUES({source_vertex},0,{source_vertex})
+			""".format(**locals()))
+
+		v_cnt = plpy.execute(
+			"""SELECT count(*) FROM {vertex_table}
+			WHERE {vertex_id} IS NOT NULL""".format(**locals()))[0]['count']
+		for i in range(0,v_cnt+1):
+
+			# Apply the updates calculated in the last iteration
+			sql = m4_ifdef(<!__HAWQ__!>,
+				<!"""
+				TRUNCATE TABLE {temp_table};
+				INSERT INTO {temp_table}
+					SELECT *
+					FROM {out_table}
+					WHERE {out_table}.{vertex_id} NOT IN (
+						SELECT {oldupdate}.id FROM {oldupdate})
+					UNION
+					SELECT * FROM {oldupdate};
+				DROP TABLE {out_table};
+				ALTER TABLE {temp_table} RENAME TO {out_table};
+				CREATE TABLE {temp_table} (
+					{vertex_id} INT, {weight} DOUBLE PRECISION, parent INT)
+					{distribution};
+				"""!>,
+				<!"""
+				UPDATE {out_table} SET
+				{weight}=oldupdate.val,
+				parent=oldupdate.parent
+				FROM
+				{oldupdate} AS oldupdate
+				WHERE
+				{out_table}.{vertex_id}=oldupdate.id
+				"""!>)
+			plpy.execute(sql.format(**locals()))
+
+
+			plpy.execute("TRUNCATE TABLE {0}".format(newupdate))
+
+			# 'oldupdate' table has the update info from the last iteration
+
+			# Consider every edge that has an updated source
+			# From these edges:
+			# For every destination vertex, find the min total cost to reach.
+			# Note that, just calling an aggregate function with group by won't
+			# let us store the src field of the edge (needed for the parent).
+			# This is why we need the 'x'; it gives a list of destinations and
+			# associated min values. Using these values, we identify which edge
+			# is selected.
+
+			# Since using '=' with floats is dangerous we use an epsilon value
+			# for comparison.
+
+			# Once we have a list of edges and values (stored as 'message'),
+			# we check if these values are lower than the existing shortest path
+			# values.
+
+			sql = (""" INSERT INTO {newupdate}
+				SELECT DISTINCT ON (message.id) message.id AS id,
+					message.val AS val,
+					message.parent AS parent
+				FROM {out_table} AS out_table INNER JOIN
+					(
+						SELECT edge_table.{dest} AS id, x.val AS val,
+							oldupdate.id AS parent
+						FROM {oldupdate} AS oldupdate INNER JOIN
+							{edge_table} AS edge_table ON
+							(edge_table.{src} = oldupdate.id) INNER JOIN
+							(
+								SELECT edge_table.{dest} AS id,
+									min(oldupdate.val + edge_table.{weight})
+									AS val
+								FROM {oldupdate} AS oldupdate INNER JOIN
+									{edge_table} AS edge_table ON
+									(edge_table.{src}=oldupdate.id)
+								GROUP BY edge_table.{dest}
+							) x ON (edge_table.{dest} = x.id)
+						WHERE ABS(oldupdate.val + edge_table.{weight} - x.val)
+							< {EPSILON}
+					) AS message ON (message.id = out_table.{vertex_id})
+				WHERE message.val<out_table.{weight}
+				""".format(**locals()))
+
+			# If there are no updates, SSSP is finalized
+			ret = plpy.execute(sql)
+			if ret.nrows() == 0:
+				break
+
+			# Swap the update tables for the next iteration
+			tmp = oldupdate
+			oldupdate = newupdate
+			newupdate = tmp
+
+		# Bellman-Ford should converge in |V|-1 iterations.
+		if i == v_cnt:
+			plpy.execute("DROP TABLE IF EXISTS {out_table}".format(**locals()))
+			plpy.error("Graph SSSP: Detected a negative cycle in the graph.")
+
+		m4_ifdef(<!__HAWQ__!>,
+			plpy.execute("DROP TABLE {temp_table} ".format(**locals())), <!''!>)
+
+	return None
+
+def graph_sssp_get_path(schema_madlib, sssp_table, dest_vertex, **kwargs):
+	"""
+	Helper function that can be used to get the shortest path for a vertex
+    Args:
+    	@param sssp_table	Name of the table that contains the SSSP output.
+        @param dest_vertex	The vertex that will be the destination of the
+            				desired path.
+	"""
+
+	validate_get_path(sssp_table, dest_vertex)
+	cur = dest_vertex
+	cols = get_cols(sssp_table)
+	id = cols[0]
+	ret = [dest_vertex]
+	plan_name = unique_string(desp='plan')
+
+	# Follow the 'parent' chain until you reach the source.
+	# We don't need to know what the source is since it is the only vertex with
+	# itself as its parent
+	plpy.execute(""" PREPARE {plan_name} (int) AS
+		SELECT parent FROM {sssp_table} WHERE {id} = $1 LIMIT 1
+		""".format(**locals()))
+	sql = "EXECUTE {plan_name} ({cur})"
+	parent = plpy.execute(sql.format(**locals()))
+
+	if parent.nrows() == 0:
+		plpy.error(
+			"Graph SSSP: Vertex {0} is not present in the sssp table {1}".
+			format(dest_vertex,sssp_table))
+
+	while 1:
+		parent = parent[0]['parent']
+		if parent == cur:
+			ret.reverse()
+			return ret
+		else:
+			ret.append(parent)
+			cur = parent
+		parent = plpy.execute(sql.format(**locals()))
+
+	return None
+
+def validate_graph_coding(vertex_table, vertex_id, edge_table, edge_params,
+	source_vertex, out_table, **kwargs):
+
+	_assert(out_table and out_table.strip().lower() not in ('null', ''),
+		"Graph SSSP: Invalid output table name!")
+	_assert(not table_exists(out_table),
+		"Graph SSSP: Output table already exists!")
+
+	_assert(vertex_table and vertex_table.strip().lower() not in ('null', ''),
+		"Graph SSSP: Invalid vertex table name!")
+	_assert(table_exists(vertex_table),
+		"Graph SSSP: Vertex table ({0}) is missing!".format(vertex_table))
+	_assert(not table_is_empty(vertex_table),
+		"Graph SSSP: Vertex table ({0}) is empty!".format(vertex_table))
+
+	_assert(edge_table and edge_table.strip().lower() not in ('null', ''),
+		"Graph SSSP: Invalid edge table name!")
+	_assert(table_exists(edge_table),
+		"Graph SSSP: Edge table ({0}) is missing!".format(edge_table))
+	_assert(not table_is_empty(edge_table),
+		"Graph SSSP: Edge table ({0}) is empty!".format(edge_table))
+
+	existing_cols = set(unquote_ident(i) for i in get_cols(vertex_table))
+	_assert(vertex_id in existing_cols,
+		"""Graph SSSP: The vertex column {vertex_id} is not present in vertex
+		table ({vertex_table}) """.format(**locals()))
+	_assert(columns_exist_in_table(edge_table, edge_params.values()),
+		"Graph SSSP: Not all columns from {0} present in edge table ({1})".
+		format(edge_params.values(), edge_table))
+
+	_assert(isinstance(source_vertex,int),
+		"""Graph SSSP: Source vertex {source_vertex} has to be an integer """.
+		format(**locals()))
+	src_exists = plpy.execute("""
+		SELECT * FROM {vertex_table} WHERE {vertex_id}={source_vertex}
+		""".format(**locals()))
+
+	if src_exists.nrows() == 0:
+		plpy.error(
+			"""Graph SSSP: Source vertex {source_vertex} is not present in the
+			vertex table {vertex_table} """.format(**locals()))
+
+	vt_error = plpy.execute(
+		""" SELECT {vertex_id}
+			FROM {vertex_table}
+			WHERE {vertex_id} IS NOT NULL
+			GROUP BY {vertex_id}
+			HAVING count(*) > 1 """.format(**locals()))
+
+	if vt_error.nrows() != 0:
+		plpy.error(
+			"""Graph SSSP: Source vertex table {vertex_table}
+			contains duplicate vertex id's """.format(**locals()))
+
+	return None
+
+def validate_get_path(sssp_table, dest_vertex, **kwargs):
+
+	_assert(sssp_table and sssp_table.strip().lower() not in ('null', ''),
+		"Graph SSSP: Invalid SSSP table name!")
+	_assert(table_exists(sssp_table),
+		"Graph SSSP: SSSP table ({0}) is missing!".format(sssp_table))
+	_assert(not table_is_empty(sssp_table),
+		"Graph SSSP: SSSP table ({0}) is empty!".format(sssp_table))
+
+
+def graph_sssp_help(schema_madlib, message, **kwargs):
+    """
+    Help function for graph_sssp and graph_sssp_get_path
+
+    Args:
+        @param schema_madlib
+        @param message: string, Help message string
+        @param kwargs
+
+    Returns:
+        String. Help/usage information
+    """
+    if not message:
+        help_string = """
+-----------------------------------------------------------------------
+                            SUMMARY
+-----------------------------------------------------------------------
+
+Given a graph and a source vertex, single source shortest path (SSSP)
+algorithm finds a path for every vertex such that the sum of the
+weights of its constituent edges is minimized.
+
+For more details on function usage:
+    SELECT {schema_madlib}.graph_sssp('usage')
+            """
+    elif message in ['usage', 'help', '?']:
+        help_string = """
+----------------------------------------------------------------------------
+                            USAGE
+----------------------------------------------------------------------------
+ SELECT {schema_madlib}.graph_sssp(
+    vertex_table  TEXT, -- Name of the table that contains the vertex data.
+    vertex_id     TEXT, -- Name of the column containing the vertex ids.
+    edge_table    TEXT, -- Name of the table that contains the edge data.
+    edge_args     TEXT, -- A comma-delimited string containing multiple
+    			-- named arguments of the form "name=value".
+    source_vertex INT,  -- The source vertex id for the algorithm to start.
+    out_table     TEXT  -- Name of the table to store the result of SSSP.
+);
+
+The following parameters are supported for edge table arguments ('edge_args'
+	above):
+
+src (default = 'src')		: Name of the column containing the source
+				vertex ids in the edge table.
+dest (default = 'dest')		: Name of the column containing the destination
+				vertex ids in the edge table.
+weight (default = 'weight')	: Name of the column containing the weight of
+				edges in the edge table.
+
+To retrieve the path for a specific vertex:
+
+ SELECT {schema_madlib}.graph_sssp_get_path(
+    sssp_table	TEXT, -- Name of the table that contains the SSSP output.
+    dest_vertex	INT   -- The vertex that will be the destination of the
+    		  -- desired path.
+);
+
+----------------------------------------------------------------------------
+                            OUTPUT
+----------------------------------------------------------------------------
+The output table ('out_table' above) will contain a row for every vertex from
+vertex_table and have the following columns:
+
+vertex_id 	: The id for the destination. Will use the input parameter
+		(vertex_id) for column naming.
+weight 		: The total weight of the shortest path from the source vertex
+		to this particular vertex. Will use the input parameter (weight)
+		for column naming.
+parent 		: The parent of this vertex in the shortest path from source.
+		Will use "parent" for column naming.
+
+The graph_sssp_get_path function will return an INT array that contains the
+shortest path from the initial source vertex to the desired destination vertex.
+"""
+    else:
+        help_string = "No such option. Use {schema_madlib}.graph_sssp()"
+
+    return help_string.format(schema_madlib=schema_madlib)
+# ---------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/498c5590/src/ports/postgres/modules/graph/sssp.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/sssp.sql_in b/src/ports/postgres/modules/graph/sssp.sql_in
new file mode 100644
index 0000000..7534a75
--- /dev/null
+++ b/src/ports/postgres/modules/graph/sssp.sql_in
@@ -0,0 +1,289 @@
+/* ----------------------------------------------------------------------- *//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *
+ * @file sssp.sql_in
+ *
+ * @brief SQL functions for graph analytics
+ * @date Nov 2016
+ *
+ * @sa Provides various graph algorithms.
+ *
+ *//* ----------------------------------------------------------------------- */
+m4_include(`SQLCommon.m4')
+
+
+/**
+@addtogroup grp_sssp
+
+<div class="toc"><b>Contents</b>
+<ul>
+<li><a href="#sssp">SSSP</a></li>
+<li><a href="#notes">Notes</a></li>
+<li><a href="#examples">Examples</a></li>
+<li><a href="#literature">Literature</a></li>
+</ul>
+</div>
+
+@brief Finds the shortest path from a single source vertex to every other vertex in a given graph.
+
+Given a graph and a source vertex, the single source shortest path (SSSP) algorithm
+finds a path from the source vertex to every other vertex in the graph,
+such that the sum of the weights of the path edges is minimized.
+
+@anchor sssp
+@par SSSP
+<pre class="syntax">
+graph_sssp( vertex_table,
+            vertex_id,
+            edge_table,
+            edge_args,
+            source_vertex,
+            out_table
+          )
+</pre>
+
+\b Arguments
+<dl class="arglist">
+<dt>vertex_table</dt>
+<dd>TEXT. Name of the table containing the vertex data for the graph. Must contain the
+column specified in the 'vertex_id' parameter below.</dd>
+
+<dt>vertex_id</dt>
+<dd>TEXT, default = 'id'. Name of the column in 'vertex_table' containing
+vertex ids.  The vertex ids are of type INTEGER with no duplicates.
+They do not need to be contiguous.</dd>
+
+<dt>edge_table</dt>
+<dd>TEXT. Name of the table containing the edge data. The edge table must
+contain columns for source vertex, destination vertex and edge weight.
+Column naming convention is described below in the 'edge_args' parameter.</dd>
+
+<dt>edge_args</dt>
+<dd>TEXT. A comma-delimited string containing multiple named arguments of
+the form "name=value". The following parameters are supported for
+this string argument:
+  - src (INTEGER): Name of the column containing the source vertex ids in the edge table. Default column name is 'src'.
+  - dest (INTEGER): Name of the column containing the destination vertex ids in the edge table. Default column name is 'dest'.
+  - weight (FLOAT8): Name of the column containing the edge weights in the edge table. Default column name is 'weight'.</dd>
+
+<dt>source_vertex</dt>
+<dd>INTEGER. The source vertex id for the algorithm to start. This vertex id must
+exist in the 'vertex_id' column of 'vertex_table'.</dd>
+
+<dt>out_table</dt>
+<dd>TEXT. Name of the table to store the result of SSSP.
+It will contain a row for every vertex from 'vertex_table' and have
+the following columns:
+  - vertex_id : The id for the destination. Will use the input parameter 'vertex_id' for column naming.
+  - weight : The total weight of the shortest path from the source vertex to this particular vertex.
+  Will use the input parameter (weight) for column naming.
+  - parent : The parent of this vertex in the shortest path from source. Will use 'parent' for column naming.</dd>
+</dl>
+
+@par Path Retrieval
+
+The path retrieval function returns the shortest path from the
+source vertex to a specified destination vertex.
+
+<pre class="syntax">
+graph_sssp_get_path( sssp_table,
+                     dest_vertex
+                   )
+</pre>
+
+\b Arguments
+<dl class="arglist">
+<dt>sssp_table</dt>
+<dd>TEXT. Name of the table that contains the SSSP output.</dd>
+
+<dt>dest_vertex</dt>
+<dd>INTEGER. The vertex that will be the destination of the desired path.</dd>
+</dl>
+
+@anchor notes
+@par Notes
+
+The Bellman-Ford algorithm [1] is used to implement SSSP. This algorithm allows
+negative edges but not negative cycles. In the case of graphs with
+negative cycles, an error will be given and no output table will be generated.
+
+Also see the Grail project [2] for more background on graph analytics processing
+in relational databases.
+
+@anchor examples
+@examp
+
+-# Create vertex and edge tables to represent the graph:
+<pre class="syntax">
+DROP TABLE IF EXISTS vertex, edge;
+CREATE TABLE vertex(
+        id INTEGER
+        );
+CREATE TABLE edge(
+        src INTEGER,
+        dest INTEGER,
+        weight FLOAT8
+        );
+INSERT INTO vertex VALUES
+(0),
+(1),
+(2),
+(3),
+(4),
+(5),
+(6),
+(7);
+INSERT INTO edge VALUES
+(0, 1, 1.0),
+(0, 2, 1.0),
+(0, 4, 10.0),
+(1, 2, 2.0),
+(1, 3, 10.0),
+(2, 3, 1.0),
+(2, 5, 1.0),
+(2, 6, 3.0),
+(3, 0, 1.0),
+(4, 0, -2.0),
+(5, 6, 1.0),
+(6, 7, 1.0);
+</pre>
+
+-# Calculate the shortest paths from vertex 0:
+<pre class="syntax">
+DROP TABLE IF EXISTS out;
+SELECT madlib.graph_sssp(
+                         'vertex',      -- Vertex table
+                         NULL,          -- Vertex id column (NULL means use default naming)
+                         'edge',        -- Edge table
+                         NULL,          -- Edge arguments (NULL means use default naming)
+                         0,             -- Source vertex for path calculation
+                         'out');        -- Output table of shortest paths
+SELECT * FROM out ORDER BY id;
+</pre>
+<pre class="result">
+ id | weight | parent
+----+--------+--------
+  0 |      0 |      0
+  1 |      1 |      0
+  2 |      1 |      0
+  3 |      2 |      2
+  4 |     10 |      0
+  5 |      2 |      2
+  6 |      3 |      5
+  7 |      4 |      6
+(8 rows)
+</pre>
+
+-# Get the shortest path to vertex 6:
+<pre class="syntax">
+SELECT madlib.graph_sssp_get_path('out',6) AS spath;
+</pre>
+<pre class="result">
+   spath
+\-----------
+ {0,2,5,6}
+</pre>
+
+-# Now let's do a similar example except using
+different column names in the tables (i.e., not the defaults).
+Create the vertex and edge tables:
+<pre class="syntax">
+DROP TABLE IF EXISTS vertex_alt, edge_alt;
+CREATE TABLE vertex_alt AS SELECT id AS v_id FROM vertex;
+CREATE TABLE edge_alt AS SELECT src AS e_src, dest, weight AS e_weight FROM edge;
+</pre>
+
+-# Get the shortest path from vertex 1:
+<pre class="syntax">
+DROP TABLE IF EXISTS out_alt;
+SELECT madlib.graph_sssp(
+                         'vertex_alt',                  -- Vertex table
+                         'v_id',                        -- Vertex id column (NULL means use default naming)
+                         'edge_alt',                    -- Edge table
+                         'src=e_src, weight=e_weight',  -- Edge arguments (NULL means use default naming)
+                         1,                             -- Source vertex for path calculation
+                         'out_alt');                    -- Output table of shortest paths
+SELECT * FROM out_alt ORDER BY v_id;
+</pre>
+<pre class="result">
+ v_id | e_weight | parent
+------+----------+--------
+    0 |        4 |      3
+    1 |        0 |      1
+    2 |        2 |      1
+    3 |        3 |      2
+    4 |       14 |      0
+    5 |        3 |      2
+    6 |        4 |      5
+    7 |        5 |      6
+(8 rows)
+</pre>
+
+@anchor literature
+@par Literature
+
+[1] Bellman-Ford algorithm. https://en.wikipedia.org/wiki/Bellman%E2%80%93Ford_algorithm
+
+[2] The case against specialized graph analytics engines, J. Fan, G. Soosai Raj,
+and J. M. Patel. CIDR 2015. http://cidrdb.org/cidr2015/Papers/CIDR15_Paper20.pdf
+*/
+
+-------------------------------------------------------------------------
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_sssp(
+    vertex_table            TEXT,
+    vertex_id               TEXT,
+    edge_table              TEXT,
+    edge_args               TEXT,
+    source_vertex           INT,
+    out_table               TEXT
+
+) RETURNS VOID AS $$
+    PythonFunction(graph, sssp, graph_sssp)
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
+-------------------------------------------------------------------------
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_sssp_get_path(
+    sssp_table             TEXT,
+    dest_vertex            INT
+
+) RETURNS INT[] AS $$
+    PythonFunction(graph, sssp, graph_sssp_get_path)
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
+-------------------------------------------------------------------------
+
+-- Online help
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_sssp(
+    message VARCHAR
+) RETURNS VARCHAR AS $$
+    PythonFunction(graph, sssp, graph_sssp_help)
+$$ LANGUAGE plpythonu IMMUTABLE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
+
+--------------------------------------------------------------------------------
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_sssp()
+RETURNS VARCHAR AS $$
+    SELECT MADLIB_SCHEMA.graph_sssp('');
+$$ LANGUAGE sql IMMUTABLE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
+--------------------------------------------------------------------------------
+

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/498c5590/src/ports/postgres/modules/graph/test/sssp.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/test/sssp.sql_in b/src/ports/postgres/modules/graph/test/sssp.sql_in
new file mode 100644
index 0000000..e2342c5
--- /dev/null
+++ b/src/ports/postgres/modules/graph/test/sssp.sql_in
@@ -0,0 +1,78 @@
+/* ----------------------------------------------------------------------- *//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *//* ----------------------------------------------------------------------- */
+
+
+DROP TABLE IF EXISTS vertex,edge,out,vertex_alt,edge_alt,out_alt;
+
+
+CREATE TABLE vertex(
+                  id INTEGER
+                );
+
+CREATE TABLE edge(
+                  src INTEGER,
+                  dest INTEGER,
+                  weight INTEGER
+                );
+
+INSERT INTO vertex VALUES
+(0),
+(1),
+(2),
+(3),
+(4),
+(5),
+(6),
+(7)
+;
+INSERT INTO edge VALUES
+(0, 1, 1),
+(0, 2, 1),
+(0, 4, 10),
+(1, 2, 2),
+(1, 3, 10),
+(2, 3, 1),
+(2, 5, 1),
+(2, 6, 3),
+(3, 0, 1),
+(4, 0, -2),
+(5, 6, 1),
+(6, 7, 1)
+;
+
+SELECT graph_sssp('vertex',NULL,'edge',NULL,0,'out');
+
+SELECT * FROM out;
+
+SELECT assert(weight = 3, 'Wrong output in graph (SSSP)') FROM out WHERE id = 6;
+SELECT assert(parent = 5, 'Wrong parent in graph (SSSP)') FROM out WHERE id = 6;
+
+SELECT graph_sssp_get_path('out',6);
+
+CREATE TABLE vertex_alt AS SELECT id AS v_id FROM vertex;
+CREATE TABLE edge_alt AS SELECT src AS e_src, dest, weight AS e_weight FROM edge;
+
+SELECT graph_sssp('vertex_alt','v_id','edge_alt','src=e_src, weight=e_weight',1,'out_alt');
+
+SELECT * FROM out_alt;
+
+SELECT assert(e_weight = 4, 'Wrong output in graph (SSSP)') FROM out_alt WHERE v_id = 6;
+SELECT assert(parent = 5, 'Wrong parent in graph (SSSP)') FROM out_alt WHERE v_id = 6;

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/498c5590/src/ports/postgres/modules/utilities/pivot.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/pivot.sql_in b/src/ports/postgres/modules/utilities/pivot.sql_in
index 2cadab0..1c36cef 100644
--- a/src/ports/postgres/modules/utilities/pivot.sql_in
+++ b/src/ports/postgres/modules/utilities/pivot.sql_in
@@ -21,7 +21,7 @@
  * @file pivot.sql_in
  *
  * @brief SQL functions for pivoting
- * @date June 2014
+ * @date June 2016
  *
  * @sa Creates a pivot table for data summarization.
  *
@@ -34,7 +34,7 @@ m4_include(`SQLCommon.m4')
 
 <div class="toc"><b>Contents</b>
 <ul>
-<li><a href="#categorical">Pivoting</a></li>
+<li><a href="#pivoting">Pivoting</a></li>
 <li><a href="#notes">Notes</a></li>
 <li><a href="#examples">Examples</a></li>
 <li><a href="#literature">Literature</a></li>
@@ -43,7 +43,7 @@ m4_include(`SQLCommon.m4')
 
 @brief Provides pivoting functions helpful for data preparation before modeling
 
-@anchor categorical
+@anchor pivoting
 The goal of the MADlib pivot function is to provide a data summarization tool
 that can do basic OLAP type operations on data stored in one table and output
 the summarized data to a second table.
@@ -70,22 +70,22 @@ pivot(
     <dt>output_table</dt>
     <dd>VARCHAR. Name of output table that contains the pivoted data.
     The output table contains all the columns present in
-    the <em>'index'</em> column list, plus additional columns for each 
-    distinct value in <em>'pivot_cols'</em>.	
+    the <em>'index'</em> column list, plus additional columns for each
+    distinct value in <em>'pivot_cols'</em>.
 
     @note The names of the columns in the output table are auto-generated.
     Please see the examples section below to see how this works in practice.
-    The convention used is to concatenate the following strings and separate 
+    The convention used is to concatenate the following strings and separate
     each by an underscore '_' :
     - name of the value column <em>'pivot_values'</em>
     - aggregate function
     - name of the pivot column <em>'pivot_cols'</em>
     - values in the pivot column
-    
+
     </dd>
     <dt>index </dt>
     <dd>VARCHAR. Comma-separated columns that will form the index of the output
-    pivot table.  By index we mean the values to group by; these are the rows 
+    pivot table.  By index we mean the values to group by; these are the rows
     in the output pivot table.</dd>
     <dt>pivot_cols </dt>
     <dd>VARCHAR. Comma-separated columns that will form the columns of the
@@ -99,13 +99,13 @@ pivot(
     possible to assign a set of aggregates per value column. Please refer to the
      examples 12\-14 below for syntax details.</dd>
 
-     @note  Only aggregates with 
+     @note  Only aggregates with
      strict transition functions are permitted here.
-     A strict transition function means rows with null values are ignored; 
-     the function is not called and the previous state value is retained.  
+     A strict transition function means rows with null values are ignored;
+     the function is not called and the previous state value is retained.
      If you need some other behavior for null inputs, this should
      be done prior to calling the pivot function.
-     Aggregates with strict transition 
+     Aggregates with strict transition
      functions are described in [2,3].
 
     <dt>fill_value (optional)</dt>
@@ -133,7 +133,7 @@ pivot(
 @note
 - NULLs in the index column are treated like any other value.
 - NULLs in the pivot column are ignored unless keep_null is TRUE.
-- Only strict transition functions are 
+- Only strict transition functions are
 allowed so NULLs are ignored.
 - It is not allowed to set the fill_value parameter without setting the
 aggregate_func parameter due to possible ambiguity. Set
@@ -199,7 +199,7 @@ SELECT id,id2,piv,piv2,val,val2 FROM pivset_ext
 ORDER BY id,id2,piv,piv2,val,val2;
 </pre>
 <pre class="result">
- id | id2 | piv | piv2 | val | val2 
+ id | id2 | piv | piv2 | val | val2
 ----+-----+-----+------+-----+------
   0 |   0 |  10 |    0 |   1 |   11
   0 |   1 |  10 |  100 |   2 |   12
@@ -240,14 +240,14 @@ DROP AGGREGATE IF EXISTS array_accum1 (anyelement);
 CREATE AGGREGATE array_accum1 (anyelement) (
     sfunc = array_add1,
     stype = anyarray,
-    initcond = '{}'                                                                                                                                           
+    initcond = '{}'
 );
 DROP TABLE IF EXISTS pivout;
 SELECT madlib.pivot('pivset_ext', 'pivout', 'id', 'piv', 'val', 'array_accum1');
 SELECT * FROM pivout ORDER BY id;
 </pre>
 <pre class="result">
-  id | val_array_accum1_piv_10 | val_array_accum1_piv_20 | val_array_accum1_piv_30 
+  id | val_array_accum1_piv_10 | val_array_accum1_piv_20 | val_array_accum1_piv_30
 ----+-------------------------+-------------------------+-------------------------
   0 | {1,2}                   | {3}                     | {}
   1 | {7}                     | {4}                     | {5,6}
@@ -316,30 +316,30 @@ SELECT * FROM pivout ORDER BY id;
 id                      | 0
 val_avg_piv_10_piv2_0   | 1
 val_avg_piv_10_piv2_100 | 2
-val_avg_piv_10_piv2_200 | 
-val_avg_piv_10_piv2_300 | 
-val_avg_piv_20_piv2_0   | 
+val_avg_piv_10_piv2_200 |
+val_avg_piv_10_piv2_300 |
+val_avg_piv_20_piv2_0   |
 val_avg_piv_20_piv2_100 | 3
-val_avg_piv_20_piv2_200 | 
-val_avg_piv_20_piv2_300 | 
-val_avg_piv_30_piv2_0   | 
-val_avg_piv_30_piv2_100 | 
-val_avg_piv_30_piv2_200 | 
-val_avg_piv_30_piv2_300 | 
+val_avg_piv_20_piv2_200 |
+val_avg_piv_20_piv2_300 |
+val_avg_piv_30_piv2_0   |
+val_avg_piv_30_piv2_100 |
+val_avg_piv_30_piv2_200 |
+val_avg_piv_30_piv2_300 |
 -[ RECORD 2 ]-----------+----
 id                      | 1
-val_avg_piv_10_piv2_0   | 
-val_avg_piv_10_piv2_100 | 
+val_avg_piv_10_piv2_0   |
+val_avg_piv_10_piv2_100 |
 val_avg_piv_10_piv2_200 | 7
-val_avg_piv_10_piv2_300 | 
-val_avg_piv_20_piv2_0   | 
+val_avg_piv_10_piv2_300 |
+val_avg_piv_20_piv2_0   |
 val_avg_piv_20_piv2_100 | 4
-val_avg_piv_20_piv2_200 | 
-val_avg_piv_20_piv2_300 | 
-val_avg_piv_30_piv2_0   | 
-val_avg_piv_30_piv2_100 | 
+val_avg_piv_20_piv2_200 |
+val_avg_piv_20_piv2_300 |
+val_avg_piv_30_piv2_0   |
+val_avg_piv_30_piv2_100 |
 val_avg_piv_30_piv2_200 | 5.5
-val_avg_piv_30_piv2_300 | 
+val_avg_piv_30_piv2_300 |
 ...
 </pre>
 
@@ -354,10 +354,10 @@ SELECT * FROM pivout ORDER BY id;
 id              | 0
 val_avg_piv_10  | 1.5
 val_avg_piv_20  | 3
-val_avg_piv_30  | 
+val_avg_piv_30  |
 val2_avg_piv_10 | 11.5
 val2_avg_piv_20 | 13
-val2_avg_piv_30 | 
+val2_avg_piv_30 |
 -[ RECORD 2 ]---+-----
 id              | 1
 val_avg_piv_10  | 7
@@ -381,10 +381,10 @@ SELECT * FROM pivout ORDER BY id;
 id             | 0
 val_avg_piv_10 | 1.5
 val_avg_piv_20 | 3
-val_avg_piv_30 | 
+val_avg_piv_30 |
 val_sum_piv_10 | 3
 val_sum_piv_20 | 3
-val_sum_piv_30 | 
+val_sum_piv_30 |
 -[ RECORD 2 ]--+----
 id             | 1
 val_avg_piv_10 | 7
@@ -435,13 +435,13 @@ SELECT * FROM pivout ORDER BY id;
 id              | 0
 val_avg_piv_10  | 1.5
 val_avg_piv_20  | 3
-val_avg_piv_30  | 
+val_avg_piv_30  |
 val2_avg_piv_10 | 11.5
 val2_avg_piv_20 | 13
-val2_avg_piv_30 | 
+val2_avg_piv_30 |
 val2_sum_piv_10 | 23
 val2_sum_piv_20 | 13
-val2_sum_piv_30 | 
+val2_sum_piv_30 |
 -[ RECORD 2 ]---+-----
 id              | 1
 val_avg_piv_10  | 7
@@ -577,7 +577,7 @@ SELECT madlib.pivot('pivset_ext', 'pivout', 'id, id2', 'piv, piv2', 'val, val2',
 SELECT * FROM pivout_dictionary;
 </pre>
 <pre class="result">
-  __pivot_cid__ | pval | agg | piv | piv2 |           col_name           
+  __pivot_cid__ | pval | agg | piv | piv2 |           col_name
 ---------------+------+-----+-----+------+------------------------------
  __p_1__       | val  | avg |     |  100 | "val_avg_piv_null_piv2_100"
  __p_5__       | val  | avg |  10 |  100 | "val_avg_piv_10_piv2_100"

[21/50] [abbrv] incubator-madlib git commit: Build: Use relative path for installation using GPPKG

Posted by ri...@apache.org.

Build: Use relative path for installation using GPPKG

GPDB and HAWQ use a relative link outside GPHOME to indicate the current
version of the database. During a DB upgrade, this relative link is
updated to point to the right location. Installing MADlib using GPPKG
leads to madlib files within GPHOME, but the functions use the absolute
path instead of this symbolic link. This commit changes those locations
to employ the symbolic link, allowing easy DB upgrades.

Closes #94


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/071128d7
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/071128d7
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/071128d7

Branch: refs/heads/latest_release
Commit: 071128d7cbf8cefe3b3bb72fea3978bd4f5837dd
Parents: d65dca5
Author: Rahul Iyer <ri...@apache.org>
Authored: Tue Jan 24 16:59:50 2017 -0800
Committer: Rahul Iyer <ri...@apache.org>
Committed: Wed Feb 1 13:16:36 2017 -0800

----------------------------------------------------------------------
 src/madpack/madpack.py | 82 ++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 69 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/071128d7/src/madpack/madpack.py
----------------------------------------------------------------------
diff --git a/src/madpack/madpack.py b/src/madpack/madpack.py
index 8b0e64d..ddd75df 100755
--- a/src/madpack/madpack.py
+++ b/src/madpack/madpack.py
@@ -35,8 +35,7 @@ if sys.version_info[:2] < py_min_ver:
 # two levels up in the directory hierarchy. We use (a) os.path.realpath and
 # (b) __file__ (instead of sys.argv[0]) because madpack.py could be called
 # (a) through a symbolic link and (b) not as the main module.
-maddir = os.path.abspath(os.path.dirname(os.path.realpath(
-    __file__)) + "/..")   # MADlib root dir
+maddir = os.path.abspath(os.path.dirname(os.path.realpath(__file__)) + "/..")   # MADlib root dir
 sys.path.append(maddir + "/madpack")
 
 # Import MADlib python modules
@@ -57,6 +56,8 @@ portid_list = []
 for port in ports:
     portid_list.append(port)
 
+SUPPORTED_PORTS = ('postgres', 'greenplum', 'hawq')
+
 # Global variables
 portid = None       # Target port ID (eg: pg90, gp40)
 dbconn = None       # DB Connection object
@@ -182,10 +183,48 @@ def _internal_run_query(sql, show_error):
 # ------------------------------------------------------------------------------
 
 
+def _get_relative_maddir(maddir, port):
+    """ Return a relative path version of maddir
+
+    GPDB and HAWQ installations have a symlink outside of GPHOME that
+    links to the current GPHOME. After a DB upgrade, this symlink is updated to
+    the new GPHOME.
+
+    'maddir_lib', which uses the absolute path of GPHOME, is hardcoded into each
+    madlib function definition. Replacing the GPHOME path with the equivalent
+    relative path makes it simpler to perform DB upgrades without breaking MADlib.
+    """
+    if port not in ('greenplum', 'hawq'):
+        # do nothing for postgres
+        return maddir
+
+    # e.g. maddir_lib = $GPHOME/madlib/Versions/1.9/lib/libmadlib.so
+    # 'madlib' is supposed to be in this path, which is the default folder
+    # used by GPPKG to install madlib
+    try:
+        abs_gphome, tail = maddir.split('madlib/')
+    except ValueError:
+        return maddir
+
+    link_name = 'greenplum-db' if port == 'greenplum' else 'hawq'
+
+    # Check outside $GPHOME if there is a symlink to this absolute path
+    # os.pardir is equivalent to ..
+    # os.path.normpath removes the extraneous .. from that path
+    rel_gphome = os.path.normpath(os.path.join(abs_gphome, os.pardir, link_name))
+    if os.path.islink(rel_gphome) and os.path.realpath(rel_gphome) == os.path.realpath(abs_gphome):
+        # if the relative link exists and is pointing to current location
+        return os.path.join(rel_gphome, 'madlib', tail)
+    else:
+        return maddir
+# ------------------------------------------------------------------------------
+
+
 def _run_sql_file(schema, maddir_mod_py, module, sqlfile,
                   tmpfile, logfile, pre_sql, upgrade=False,
                   sc=None):
-    """Run SQL file
+    """
+        Run SQL file
             @param schema name of the target schema
             @param maddir_mod_py name of the module dir with Python code
             @param module  name of the module
@@ -955,8 +994,7 @@ def parseConnectionStr(connectionStr):
 # ------------------------------------------------------------------------------
 
 
-def main(argv):
-
+def parse_arguments():
     parser = argparse.ArgumentParser(
         prog="madpack",
         description='MADlib package manager (' + str(rev) + ')',
@@ -1018,7 +1056,12 @@ def main(argv):
                         help="Module names to test, comma separated. Effective only for install-check.")
 
     # Get the arguments
-    args = parser.parse_args()
+    return parser.parse_args()
+
+
+def main(argv):
+    args = parse_arguments()
+
     global verbose
     verbose = args.verbose
     _info("Arguments: " + str(args), verbose)
@@ -1040,9 +1083,6 @@ def main(argv):
 
     # Parse DB Platform (== PortID) and compare with Ports.yml
     global portid
-    global dbver
-    global is_hawq2
-
     if args.platform:
         try:
             # Get the DB platform name == DB port id
@@ -1060,7 +1100,7 @@ def main(argv):
         (c_user, c_pass, c_host, c_port, c_db) = parseConnectionStr(connStr)
 
         # Find the default values for PG and GP
-        if portid in ('postgres', 'greenplum', 'hawq'):
+        if portid in SUPPORTED_PORTS:
             if c_user is None:
                 c_user = os.environ.get('PGUSER', getpass.getuser())
             if c_pass is None:
@@ -1093,15 +1133,25 @@ def main(argv):
             _error('Failed to connect to database', True)
 
         # Get DB version
+        global dbver
         dbver = _get_dbver()
-
-        # Get MADlib version in DB
-        dbrev = _get_madlib_dbrev(schema)
+        global is_hawq2
+        if portid == "hawq" and _is_rev_gte(_get_rev_num(dbver), _get_rev_num('2.0')):
+            is_hawq2 = True
+        else:
+            is_hawq2 = False
 
         # HAWQ < 2.0 has hard-coded schema name 'madlib'
         if portid == 'hawq' and not is_hawq2 and schema.lower() != 'madlib':
             _error("*** Installation is currently restricted only to 'madlib' schema ***", True)
 
+        # update maddir to use a relative path if available
+        global maddir
+        maddir = _get_relative_maddir(maddir, portid)
+
+        # Get MADlib version in DB
+        dbrev = _get_madlib_dbrev(schema)
+
         portdir = os.path.join(maddir, "ports", portid)
         supportedVersions = [dirItem for dirItem in os.listdir(portdir)
                              if os.path.isdir(os.path.join(portdir, dirItem)) and
@@ -1151,11 +1201,17 @@ def main(argv):
         global maddir_conf
         if os.path.isdir(maddir + "/ports/" + portid + "/" + dbver + "/config"):
             maddir_conf = maddir + "/ports/" + portid + "/" + dbver + "/config"
+        else:
+            maddir_conf = maddir + "/config"
+
         global maddir_lib
         if os.path.isfile(maddir + "/ports/" + portid + "/" + dbver +
                           "/lib/libmadlib.so"):
             maddir_lib = maddir + "/ports/" + portid + "/" + dbver + \
                 "/lib/libmadlib.so"
+        else:
+            maddir_lib = maddir + "/lib/libmadlib.so"
+
         # Get the list of modules for this port
         global portspecs
         portspecs = configyml.get_modules(maddir_conf)

[20/50] [abbrv] incubator-madlib git commit: Build: Use only major version for GPDB 5, HAWQ 2

Posted by ri...@apache.org.

Build: Use only major version for GPDB 5, HAWQ 2

GPDB, starting 5.0, and HAWQ, starting 2.0, are using semantic
versioning for releases. This implies a binary compatibility between
same major versions. Hence, we only need to compile a single MADlib
binary for the same major version, with the folder name being just the
major version instead of 'major.minor'.

Closes #79, closes #91

0e00a27 closes #76


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/d65dca54
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/d65dca54
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/d65dca54

Branch: refs/heads/latest_release
Commit: d65dca5450c68df00a2350648f12a72151676692
Parents: 0e00a27
Author: Rahul Iyer <ri...@apache.org>
Authored: Mon Jan 23 16:58:29 2017 -0800
Committer: Rahul Iyer <ri...@apache.org>
Committed: Wed Feb 1 13:15:31 2017 -0800

----------------------------------------------------------------------
 src/madpack/madpack.py                          | 44 +++++++++++++-------
 src/ports/greenplum/5.0/CMakeLists.txt          |  1 -
 src/ports/greenplum/5/CMakeLists.txt            |  1 +
 src/ports/greenplum/cmake/FindGreenplum_5.cmake |  2 +
 .../greenplum/cmake/FindGreenplum_5_0.cmake     |  2 -
 src/ports/hawq/2.0/CMakeLists.txt               | 20 ---------
 src/ports/hawq/2/CMakeLists.txt                 | 19 +++++++++
 src/ports/hawq/CMakeLists.txt                   |  2 +-
 src/ports/hawq/cmake/FindHAWQ_2.cmake           | 21 ++++++++++
 src/ports/hawq/cmake/FindHAWQ_2_0.cmake         | 21 ----------
 src/ports/postgres/cmake/PostgreSQLUtils.cmake  | 36 ++++++++++------
 11 files changed, 96 insertions(+), 73 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/d65dca54/src/madpack/madpack.py
----------------------------------------------------------------------
diff --git a/src/madpack/madpack.py b/src/madpack/madpack.py
index e15bb4a..8b0e64d 100755
--- a/src/madpack/madpack.py
+++ b/src/madpack/madpack.py
@@ -317,13 +317,9 @@ def _get_dbver():
         if portid == 'postgres':
             match = re.search("PostgreSQL[a-zA-Z\s]*(\d+\.\d+)", versionStr)
         elif portid == 'greenplum':
-            match = re.search("Greenplum[a-zA-Z\s]*(\d+\.\d+)", versionStr)
-            # Due to the ABI incompatibility between 4.3.4 and 4.3.5,
-            # MADlib treat 4.3.5+ as DB version 4.3V2 that is different from 4.3
-            if match and match.group(1) == '4.3':
-                match_details = re.search("Greenplum[a-zA-Z\s]*(\d+\.\d+.\d+)", versionStr)
-                if _is_rev_gte(_get_rev_num(match_details.group(1)), _get_rev_num('4.3.5')):
-                    return '4.3ORCA'
+            # for Greenplum the 3rd digit is necessary to differentiate
+            # 4.3.5+ from versions < 4.3.5
+            match = re.search("Greenplum[a-zA-Z\s]*(\d+\.\d+\.\d+)", versionStr)
         elif portid == 'hawq':
             match = re.search("HAWQ[a-zA-Z\s]*(\d+\.\d+)", versionStr)
         return None if match is None else match.group(1)
@@ -1098,11 +1094,6 @@ def main(argv):
 
         # Get DB version
         dbver = _get_dbver()
-        portdir = os.path.join(maddir, "ports", portid)
-        if portid == "hawq" and _is_rev_gte(_get_rev_num(dbver), _get_rev_num('2.0')):
-            is_hawq2 = True
-        else:
-            is_hawq2 = False
 
         # Get MADlib version in DB
         dbrev = _get_madlib_dbrev(schema)
@@ -1111,12 +1102,14 @@ def main(argv):
         if portid == 'hawq' and not is_hawq2 and schema.lower() != 'madlib':
             _error("*** Installation is currently restricted only to 'madlib' schema ***", True)
 
+        portdir = os.path.join(maddir, "ports", portid)
         supportedVersions = [dirItem for dirItem in os.listdir(portdir)
                              if os.path.isdir(os.path.join(portdir, dirItem)) and
-                             re.match("^\d+\.\d+", dirItem)]
+                             re.match("^\d+", dirItem)]
         if dbver is None:
-            dbver = ".".join(map(str, max([map(int, versionStr.split('.'))
-                                           for versionStr in supportedVersions])))
+            dbver = ".".join(
+                map(str, max([versionStr.split('.')
+                              for versionStr in supportedVersions])))
             _info("Could not parse version string reported by {DBMS}. Will "
                   "default to newest supported version of {DBMS} "
                   "({version}).".format(DBMS=ports[portid]['name'],
@@ -1124,6 +1117,27 @@ def main(argv):
         else:
             _info("Detected %s version %s." % (ports[portid]['name'], dbver),
                   True)
+
+            if portid == "hawq":
+                # HAWQ (starting 2.0) and GPDB (starting 5.0) use semantic versioning,
+                # which implies all HAWQ 2.x or GPDB 5.x versions will have binary
+                # compatibility. Hence, we can keep single folder for all 2.X / 5.X.
+                if (_is_rev_gte(_get_rev_num(dbver), _get_rev_num('2.0')) and
+                        not _is_rev_gte(_get_rev_num(dbver), _get_rev_num('3.0'))):
+                    is_hawq2 = True
+                    dbver = '2'
+            elif portid == 'greenplum':
+                # similar to HAWQ above, collapse all 5.X versions
+                if (_is_rev_gte(_get_rev_num(dbver), _get_rev_num('5.0')) and
+                        not _is_rev_gte(_get_rev_num(dbver), _get_rev_num('6.0'))):
+                    dbver = '5'
+                # Due to the ABI incompatibility between 4.3.4 and 4.3.5,
+                # MADlib treats 4.3.5+ as DB version 4.3ORCA which is different
+                # from 4.3. The name is suffixed with ORCA since optimizer (ORCA) is
+                # 'on' by default in 4.3.5
+                elif _is_rev_gte(_get_rev_num(dbver), _get_rev_num('4.3.4')):
+                    dbver = '4.3ORCA'
+
             if not os.path.isdir(os.path.join(portdir, dbver)):
                 _error("This version is not among the %s versions for which "
                        "MADlib support files have been installed (%s)." %

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/d65dca54/src/ports/greenplum/5.0/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/ports/greenplum/5.0/CMakeLists.txt b/src/ports/greenplum/5.0/CMakeLists.txt
deleted file mode 100644
index 015d76e..0000000
--- a/src/ports/greenplum/5.0/CMakeLists.txt
+++ /dev/null
@@ -1 +0,0 @@
-add_current_greenplum_version()

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/d65dca54/src/ports/greenplum/5/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/ports/greenplum/5/CMakeLists.txt b/src/ports/greenplum/5/CMakeLists.txt
new file mode 100644
index 0000000..015d76e
--- /dev/null
+++ b/src/ports/greenplum/5/CMakeLists.txt
@@ -0,0 +1 @@
+add_current_greenplum_version()

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/d65dca54/src/ports/greenplum/cmake/FindGreenplum_5.cmake
----------------------------------------------------------------------
diff --git a/src/ports/greenplum/cmake/FindGreenplum_5.cmake b/src/ports/greenplum/cmake/FindGreenplum_5.cmake
new file mode 100644
index 0000000..fe15861
--- /dev/null
+++ b/src/ports/greenplum/cmake/FindGreenplum_5.cmake
@@ -0,0 +1,2 @@
+set(_FIND_PACKAGE_FILE "${CMAKE_CURRENT_LIST_FILE}")
+include("${CMAKE_CURRENT_LIST_DIR}/FindGreenplum.cmake")

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/d65dca54/src/ports/greenplum/cmake/FindGreenplum_5_0.cmake
----------------------------------------------------------------------
diff --git a/src/ports/greenplum/cmake/FindGreenplum_5_0.cmake b/src/ports/greenplum/cmake/FindGreenplum_5_0.cmake
deleted file mode 100644
index fe15861..0000000
--- a/src/ports/greenplum/cmake/FindGreenplum_5_0.cmake
+++ /dev/null
@@ -1,2 +0,0 @@
-set(_FIND_PACKAGE_FILE "${CMAKE_CURRENT_LIST_FILE}")
-include("${CMAKE_CURRENT_LIST_DIR}/FindGreenplum.cmake")

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/d65dca54/src/ports/hawq/2.0/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/ports/hawq/2.0/CMakeLists.txt b/src/ports/hawq/2.0/CMakeLists.txt
deleted file mode 100644
index 0a2eb24..0000000
--- a/src/ports/hawq/2.0/CMakeLists.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-# ------------------------------------------------------------------------------
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# ------------------------------------------------------------------------------
-
-add_current_hawq_version()

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/d65dca54/src/ports/hawq/2/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/ports/hawq/2/CMakeLists.txt b/src/ports/hawq/2/CMakeLists.txt
new file mode 100644
index 0000000..a024a53
--- /dev/null
+++ b/src/ports/hawq/2/CMakeLists.txt
@@ -0,0 +1,19 @@
+# ------------------------------------------------------------------------------
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# ------------------------------------------------------------------------------
+add_current_hawq_version()

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/d65dca54/src/ports/hawq/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/ports/hawq/CMakeLists.txt b/src/ports/hawq/CMakeLists.txt
index 8610238..2ba532d 100644
--- a/src/ports/hawq/CMakeLists.txt
+++ b/src/ports/hawq/CMakeLists.txt
@@ -85,7 +85,7 @@ add_sql_files(
     "../postgres/modules"
     "${CMAKE_CURRENT_BINARY_DIR}/modules"
 )
-# Add Greenplum-specific modules. Files will be appended to SQL_TARGET_FILES.
+# Add HAWQ-specific modules. Files will be appended to SQL_TARGET_FILES.
 add_sql_files(
     SQL_TARGET_FILES
     "modules"

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/d65dca54/src/ports/hawq/cmake/FindHAWQ_2.cmake
----------------------------------------------------------------------
diff --git a/src/ports/hawq/cmake/FindHAWQ_2.cmake b/src/ports/hawq/cmake/FindHAWQ_2.cmake
new file mode 100644
index 0000000..0cdbf63
--- /dev/null
+++ b/src/ports/hawq/cmake/FindHAWQ_2.cmake
@@ -0,0 +1,21 @@
+# ------------------------------------------------------------------------------
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# ------------------------------------------------------------------------------
+
+set(_FIND_PACKAGE_FILE "${CMAKE_CURRENT_LIST_FILE}")
+include("${CMAKE_CURRENT_LIST_DIR}/FindHAWQ.cmake")

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/d65dca54/src/ports/hawq/cmake/FindHAWQ_2_0.cmake
----------------------------------------------------------------------
diff --git a/src/ports/hawq/cmake/FindHAWQ_2_0.cmake b/src/ports/hawq/cmake/FindHAWQ_2_0.cmake
deleted file mode 100644
index 0cdbf63..0000000
--- a/src/ports/hawq/cmake/FindHAWQ_2_0.cmake
+++ /dev/null
@@ -1,21 +0,0 @@
-# ------------------------------------------------------------------------------
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# ------------------------------------------------------------------------------
-
-set(_FIND_PACKAGE_FILE "${CMAKE_CURRENT_LIST_FILE}")
-include("${CMAKE_CURRENT_LIST_DIR}/FindHAWQ.cmake")

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/d65dca54/src/ports/postgres/cmake/PostgreSQLUtils.cmake
----------------------------------------------------------------------
diff --git a/src/ports/postgres/cmake/PostgreSQLUtils.cmake b/src/ports/postgres/cmake/PostgreSQLUtils.cmake
index e5e4228..7acdf9a 100644
--- a/src/ports/postgres/cmake/PostgreSQLUtils.cmake
+++ b/src/ports/postgres/cmake/PostgreSQLUtils.cmake
@@ -4,7 +4,7 @@ function(define_postgresql_features IN_VERSION OUT_FEATURES)
     if(NOT ${IN_VERSION} VERSION_LESS "9.0")
         list(APPEND ${OUT_FEATURES} __HAS_ORDERED_AGGREGATES__)
     endif()
-    
+
     # Pass values to caller
     set(${OUT_FEATURES} "${${OUT_FEATURES}}" PARENT_SCOPE)
 endfunction(define_postgresql_features)
@@ -54,7 +54,7 @@ endfunction(cpack_add_version_component)
 #
 function(determine_target_versions OUT_VERSIONS)
     get_subdirectories("${CMAKE_CURRENT_SOURCE_DIR}" SUPPORTED_VERSIONS)
-    get_filtered_list(SUPPORTED_VERSIONS "^[0-9]+.[0-9]+.*$" ${SUPPORTED_VERSIONS})
+    get_filtered_list(SUPPORTED_VERSIONS "^[0-9]+.*$" ${SUPPORTED_VERSIONS})
 
     foreach(VERSION ${SUPPORTED_VERSIONS})
         string(REPLACE "." "_" VERSION_UNDERSCORE "${VERSION}")
@@ -64,18 +64,28 @@ function(determine_target_versions OUT_VERSIONS)
     endforeach(VERSION)
     if(NOT DEFINED ${OUT_VERSIONS})
         find_package(${PORT})
-
         if(${PORT_UC}_FOUND)
-            # Due to the ABI incompatibility between 4.3.4 and 4.3.5,
-            # MADlib treat 4.3.5+ as DB version that is different from 4.3
-            if(${PORT_UC} STREQUAL "GREENPLUM" AND
-                    ${${PORT_UC}_VERSION_MAJOR} EQUAL 4 AND
-                    ${${PORT_UC}_VERSION_MINOR} EQUAL 3 AND
-                    ${${PORT_UC}_VERSION_PATCH} GREATER 4)
-                set(VERSION "4.3ORCA")
-            else()
-                set(VERSION "${${PORT_UC}_VERSION_MAJOR}.${${PORT_UC}_VERSION_MINOR}")
+            set(VERSION "${${PORT_UC}_VERSION_MAJOR}.${${PORT_UC}_VERSION_MINOR}")
+            if(${PORT_UC} STREQUAL "GREENPLUM")
+                # Starting GPDB 5.0, semantic versioning will be followed,
+                # implying we only need 1 folder for same major versions
+                if(${${PORT_UC}_VERSION_MAJOR} EQUAL 5)
+                    set(VERSION "5")
+
+                # Due to the ABI incompatibility between 4.3.4 and 4.3.5,
+                # MADlib treats 4.3.5+ as a DB version that is different from 4.3
+                elseif(${${PORT_UC}_VERSION_MAJOR} EQUAL 4 AND
+                        ${${PORT_UC}_VERSION_MINOR} EQUAL 3 AND
+                        ${${PORT_UC}_VERSION_PATCH} GREATER 4)
+                    set(VERSION "4.3ORCA")
+                endif()
+            elseif(${PORT_UC} STREQUAL "HAWQ" AND
+                    ${${PORT_UC}_VERSION_MAJOR} EQUAL 2)
+                # Starting HAWQ 2.0, semantic versioning will be followed,
+                # implying we only need 1 folder for same major versions
+                set(VERSION "2")
             endif()
+
             list(FIND SUPPORTED_VERSIONS "${VERSION}" _POS)
             if(_POS EQUAL -1)
                 string(REPLACE ";" ", " _SUPPORTED_VERSIONS_STR "${SUPPORTED_VERSIONS}")
@@ -96,7 +106,7 @@ function(determine_target_versions OUT_VERSIONS)
             endif(_POS EQUAL -1)
         endif(${PORT_UC}_FOUND)
     endif(NOT DEFINED ${OUT_VERSIONS})
-    
+
     # Pass values to caller
     set(${OUT_VERSIONS} "${${OUT_VERSIONS}}" PARENT_SCOPE)
     # ${PORT_UC}_${_VERSION_UNDERSCORE}_PG_CONFIG might have been set earlier!

[09/50] [abbrv] incubator-madlib git commit: PCA: Add grouping support to PCA

Posted by ri...@apache.org.

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/02a7ef45/src/ports/postgres/modules/pca/pca_project.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/pca/pca_project.py_in b/src/ports/postgres/modules/pca/pca_project.py_in
index 98d9e3f..62bf2b1 100644
--- a/src/ports/postgres/modules/pca/pca_project.py_in
+++ b/src/ports/postgres/modules/pca/pca_project.py_in
@@ -19,6 +19,8 @@ from utilities.utilities import _array_to_string
 from utilities.validate_args import columns_exist_in_table
 from utilities.validate_args import table_exists
 from utilities.utilities import add_postfix
+from utilities.validate_args import get_cols, get_cols_and_types
+from utilities.control import MinWarning
 
 
 version_wrapper = __mad_version()
@@ -27,69 +29,149 @@ array_to_string = version_wrapper.select_vec_return()
 ZERO_THRESHOLD = 1e-6
 
 
-# Dense PCA help function
+# Dense PCA project help function
 def pca_project_help(schema_madlib, usage_string=None, **kwargs):
     """
     Given a usage string, give out function usage information.
     """
-    if usage_string is None:
-        usage_string = ''
-
-    if (usage_string.lower() == "usage"):
+    if usage_string is not None and \
+            usage_string.lower() in ("usage", "help", "?"):
         return """
-        ----------------------------------------------------------------
-                                Usage
-        ----------------------------------------------------------------
-        SELECT {schema_madlib}.pca_project (
-            'tbl_source',          -- Data table
-            'tbl_pc',              -- Table with principal componenents
-                                        (obtained as output from pca_train)
-            'tbl_result',          -- Result table
-            'row_id',              -- Name of the column containing the row_id
-            -- Optional Parameters
-            ----------------------------------------------------------------
-            'tbl_residual',        -- Residual table (Default: NULL)
-            'tbl_result_summary',  -- Result summary table (Default : NULL)
-        );
-
-                                Output Tables
-        --------------------------------------------------------------------
-        The output is divided into three tables (two of which are optional)
-
-        --------------------------------------------------------------------
-         The output table ('tbl_result' above) encodes a dense matrix
-         with the projection onto the principal components. The matrix contains
-         the following columns:
-
-         'row_id'        INTEGER,            -- Row id of the output matrix
-         'row_vec'       DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix
-
-        --------------------------------------------------------------------
-         The residual table ('tbl_residual' above) encodes a dense residual
-         matrix which has the following columns
-
-         'row_id'        INTEGER,            -- Row id of the output matrix
-         'row_vec'       DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix
-
-        --------------------------------------------------------------------
-         The result summary table ('tbl_result_summary' above) has the following columns
-
-          'exec_time'                INTEGER,            -- Wall clock time (ms) of the function.
-          'residual_norm'            DOUBLE PRECISION,   -- Absolute error of the residuals
-          'relative_residual_norm'   DOUBLE PRECISION    -- Relative error of the residuals
-        ----------------------------------------------------------------
+----------------------------------------------------------------
+                        Usage
+----------------------------------------------------------------
+SELECT {schema_madlib}.pca_project (
+    'tbl_source',          -- Data table
+    'pc_table',            -- Table with principal components
+                                (obtained as output from pca_train)
+    'tbl_result',          -- Result table
+    'row_id',              -- Name of the column containing the row_id
+    -- Optional Parameters
+    ----------------------------------------------------------------
+    'tbl_residual',        -- Residual table (Default: NULL)
+    'tbl_result_summary',  -- Result summary table (Default : NULL)
+);
+
+Note that if the principal components in pc_table were learnt using
+grouping_cols in {schema_madlib}.pca_train(), the tbl_source used
+here must also have those grouping columns. This will fail otherwise.
+
+                        Output Tables
+--------------------------------------------------------------------
+The output is divided into three tables (two of which are optional)
+
+--------------------------------------------------------------------
+ The output table ('tbl_result' above) encodes a dense matrix
+ with the projection onto the principal components. The matrix contains
+ the following columns:
+
+ 'row_id'        INTEGER,            -- Row id of the output matrix
+ 'row_vec'       DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix
+ grouping_col                        -- The grouping columns present in the 'pc_table', if any
+
+--------------------------------------------------------------------
+ The residual table ('tbl_residual' above) encodes a dense residual
+ matrix which has the following columns
+
+ 'row_id'        INTEGER,            -- Row id of the output matrix
+ 'row_vec'       DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix
+ grouping_col                        -- The grouping columns present in the 'pc_table', if any
+
+--------------------------------------------------------------------
+ The result summary table ('tbl_result_summary' above) has the following columns
+
+  'exec_time'                INTEGER,            -- Wall clock time (ms) of the function.
+  'residual_norm'            DOUBLE PRECISION,   -- Absolute error of the residuals
+  'relative_residual_norm'   DOUBLE PRECISION,   -- Relative error of the residuals
+  grouping_col                                   -- The grouping columns present in the 'pc_table', if any
+----------------------------------------------------------------
         """.format(schema_madlib=schema_madlib)
     else:
-        return """
-        ----------------------------------------------------------------
-                 Summary: PCA Projection
-        ----------------------------------------------------------------
-        PCA Projection: Projects a dataset to an already trained
-        space of principal components.
-        --
-        For function usage information, run
-        SELECT {schema_madlib}.pca_project('usage');
-        --
+        if usage_string is not None and \
+                usage_string.lower() in ("example", "examples"):
+            return """
+----------------------------------------------------------------
+                        Examples
+----------------------------------------------------------------
+-- Run pca_project() using a model table generated without grouping_cols.
+-- Create input table for pca_project()
+
+DROP TABLE IF EXISTS mat_proj;
+CREATE TABLE mat_proj (
+    row_id integer,
+    row_vec double precision[]
+);
+COPY mat_proj (row_id, row_vec) FROM stdin DELIMITER '|';
+1|{{1,2,3}}
+2|{{2,1,2}}
+3|{{3,2,1}}
+11|{{1,2,3}}
+21|{{2,1,2}}
+31|{{3,2,1}}
+41|{{1,2,4}}
+12|{{1,3,3}}
+\.
+
+-- NOTE: Use the 'result_table' created using the example shown in
+-- {schema_madlib}.pca_train('examples'), as the 'pc_table' parameter here.
+
+DROP TABLE IF EXISTS mat_proj_out;
+SELECT {schema_madlib}.pca_project(
+    'mat_proj',
+    'result_table',
+    'mat_proj_out',
+    'row_id'
+    );
+
+SELECT * FROM mat_proj_out;
+
+-----------------------------------------------------------------------
+
+-- Run pca_project() using a model table generated with grouping_cols.
+-- Create input table for pca_project(), with grouping
+
+DROP TABLE IF EXISTS mat_proj_grouped;
+CREATE TABLE mat_proj_grouped (
+    row_id integer,
+    row_vec double precision[],
+    matrix_id integer
+);
+COPY mat_proj_grouped (row_id, row_vec, matrix_id) FROM stdin DELIMITER '|';
+1|{{1,2,3}}|1
+2|{{2,1,2}}|1
+3|{{3,2,1}}|1
+4|{{1,2,3,4,5}}|2
+5|{{2,1,2,4,5}}|2
+6|{{3,2,1,4,5}}|2
+\.
+
+-- NOTE: Use the 'result_table_grp' created using the example shown
+-- in {schema_madlib}.pca_train('examples'), as the 'pc_table' parameter
+-- here. 'result_table_grp' was created with 'matrix_id' as the
+-- grouping column, and the table 'mat_proj_grouped' should also have the
+-- 'matrix_id' column in it.
+
+DROP TABLE IF EXISTS mat_proj_grouped_out;
+SELECT {schema_madlib}.pca_project(
+    'mat_proj_grouped',
+    'result_table_grp',
+    'mat_proj_grouped_out',
+    'row_id'
+    );
+
+SELECT * FROM mat_proj_grouped_out;
+            """.format(schema_madlib=schema_madlib)
+        else:
+            return """
+----------------------------------------------------------------
+         Summary: PCA Projection
+----------------------------------------------------------------
+PCA Projection: Projects a dataset to an already trained
+space of principal components.
+--
+For function usage information, run
+SELECT {schema_madlib}.pca_project('usage');
+--
         """.format(schema_madlib=schema_madlib)
 
 
@@ -99,73 +181,156 @@ def pca_sparse_project_help(schema_madlib, usage_string=None, **kwargs):
     """
     Given a usage string, give out function usage information.
     """
-    if usage_string is None:
-        usage_string = ''
-
-    if (usage_string.lower() == "usage"):
+    if usage_string is not None and \
+            usage_string.lower() in ("usage", "help", "?"):
         return """
-        ----------------------------------------------------------------
-                                Usage
-        ----------------------------------------------------------------
-        SELECT {schema_madlib}.pca_sparse_project (
-            'tbl_source',          -- Data table
-            'tbl_pc',              -- Table with principal componenents
-                                        (obtained as output from pca_train)
-            'tbl_result',          -- Result table
-            'row_id',              -- Name of the column containing the row_id
-            'col_id',              -- Name of the column containing the col_id
-            'val_id',              -- Name of the column containing the val_id
-            'row_dim'              -- Row dimension of the sparse matrix
-            'col_dim'              -- Column dimension of the sparse matrix
-            -- Optional Parameters
-            ----------------------------------------------------------------
-            'tbl_residual',        -- Residual table (Default: NULL)
-            'tbl_result_summary',  -- Result summary table (Default : NULL)
-        );
-
-                                Output Tables
-        ----------------------------------------------------------------
-        The output is divided into three tables (two of which are optional)
-
-        -----------------------------------------------------------------------------------------
-         The output table ('tbl_result' above) encodes a dense matrix
-         with the projection onto the principal components. The matrix contains
-         the following columns:
-
-         'row_id'        INTEGER,            -- Row id of the output matrix
-         'row_vec'       DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix
-
-        -----------------------------------------------------------------------------------------
-         The residual table ('tbl_residual' above) encodes a dense residual
-         matrix which has the following columns
-
-         'row_id'        INTEGER,            -- Row id of the output matrix
-         'row_vec'       DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix
-
-        -----------------------------------------------------------------------------------------
-         The result summary table ('tbl_result_summary' above) has the following columns
-
-          'exec_time'                INTEGER,            -- Wall clock time (ms) of the function.
-          'residual_norm'            DOUBLE PRECISION,   -- Absolute error of the residuals
-          'relative_residual_norm'   DOUBLE PRECISION    -- Relative error of the residuals
-        ----------------------------------------------------------------
+----------------------------------------------------------------
+                        Usage
+----------------------------------------------------------------
+SELECT {schema_madlib}.pca_sparse_project (
+    'tbl_source',          -- Data table
+    'pc_table',            -- Table with principal components
+                                (obtained as output from pca_train)
+    'tbl_result',          -- Result table
+    'row_id',              -- Name of the column containing the row_id
+    'col_id',              -- Name of the column containing the col_id
+    'val_id',              -- Name of the column containing the val_id
+    'row_dim',             -- Row dimension of the sparse matrix
+    'col_dim',             -- Column dimension of the sparse matrix
+    -- Optional Parameters
+    ----------------------------------------------------------------
+    'tbl_residual',        -- Residual table (Default: NULL)
+    'tbl_result_summary',  -- Result summary table (Default : NULL)
+);
+
+Note that if the principal components in 'pc_table' were learnt using
+grouping_cols in {schema_madlib}.pca_train(), the tbl_source used
+here must also have those grouping columns. This will fail otherwise.
+
+                        Output Tables
+----------------------------------------------------------------
+The output is divided into three tables (two of which are optional)
+
+-----------------------------------------------------------------------------------------
+ The output table ('tbl_result' above) encodes a dense matrix
+ with the projection onto the principal components. The matrix contains
+ the following columns:
+
+ 'row_id'        INTEGER,            -- Row id of the output matrix
+ 'row_vec'       DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix
+ grouping_col                        -- The grouping columns present in the 'pc_table', if any
+
+-----------------------------------------------------------------------------------------
+ The residual table ('tbl_residual' above) encodes a dense residual
+ matrix which has the following columns
+
+ 'row_id'        INTEGER,            -- Row id of the output matrix
+ 'row_vec'       DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix
+ grouping_col                        -- The grouping columns present in the 'pc_table', if any
+
+-----------------------------------------------------------------------------------------
+ The result summary table ('tbl_result_summary' above) has the following columns
+
+  'exec_time'                INTEGER,            -- Wall clock time (ms) of the function.
+  'residual_norm'            DOUBLE PRECISION,   -- Absolute error of the residuals
+  'relative_residual_norm'   DOUBLE PRECISION,   -- Relative error of the residuals
+  grouping_col                                   -- The grouping columns present in the 'pc_table', if any
+----------------------------------------------------------------
         """.format(schema_madlib=schema_madlib)
     else:
-        return """
-        ----------------------------------------------------------------
-                 Summary: PCA Projection
-        ----------------------------------------------------------------
-        PCA Projection: Projects a dataset to an already trained
-        space of principal components.
-        --
-        For function usage information, run
-        SELECT {schema_madlib}.pca_sparse_project('usage');
-        --
+        if usage_string is not None and \
+                usage_string.lower() in ("example", "examples"):
+            return """
+----------------------------------------------------------------
+                        Examples
+----------------------------------------------------------------
+-- Run pca_sparse_project() using a model table generated without grouping_cols.
+-- Create input table for pca_sparse_project()
+
+DROP TABLE IF EXISTS sparse_proj_mat;
+CREATE TABLE sparse_proj_mat (
+    row_id integer,
+    col_id integer,
+    val_id integer
+);
+COPY sparse_proj_mat (row_id, col_id, val_id) FROM stdin delimiter '|';
+1|2|4
+1|5|6
+3|8|4
+8|1|2
+8|7|2
+9|3|4
+9|8|2
+\.
+
+-- NOTE: Use the 'result_table_sparse' created using the example shown in
+-- {schema_madlib}.pca_sparse_train('examples'), as the 'pc_table' parameter here.
+
+SELECT {schema_madlib}.pca_sparse_project(
+    'sparse_proj_mat',
+    'result_table_sparse',
+    'sparse_proj_mat_out',
+    'row_id',
+    'col_id',
+    'val_id',
+    10,
+    10
+    );
+
+SELECT * FROM sparse_proj_mat_out;
+
+
+-- Run pca_sparse_project() using a model table generated with grouping_cols.
+-- Create input table for pca_sparse_project(), with grouping
+
+DROP TABLE IF EXISTS sparse_proj_mat_with_grouping;
+CREATE TABLE sparse_proj_mat_with_grouping (
+    row_id integer,
+    col_id integer,
+    val_id integer,
+    matrix_id integer
+);
+COPY sparse_proj_mat_with_grouping (row_id, col_id, val_id, matrix_id) FROM stdin delimiter '|';
+8|7|2|1
+9|3|4|1
+9|8|2|1
+1|2|4|2
+1|5|6|2
+6|6|12|2
+\.
+
+-- NOTE: Use the 'result_table_sparsed_grouped' created using the example shown
+-- in {schema_madlib}.pca_sparse_train('examples'), as the 'pc_table' parameter
+-- here. 'result_table_sparsed_grouped' was created with 'matrix_id' as the
+-- grouping column, and the table 'sparse_proj_mat_with_grouping' should also have
+-- the 'matrix_id' column in it.
+
+SELECT {schema_madlib}.pca_sparse_project(
+    'sparse_proj_mat_with_grouping',
+    'result_table_sparsed_grouped',
+    'sparse_proj_mat_with_grouping_out',
+    'row_id',
+    'col_id',
+    'val_id',
+    10,
+    10
+    );
+
+SELECT * FROM sparse_proj_mat_with_grouping_out;
+            """.format(schema_madlib=schema_madlib)
+        else:
+            return """
+----------------------------------------------------------------
+         Summary: PCA Projection
+----------------------------------------------------------------
+PCA Projection: Projects a dataset to an already trained
+space of principal components.
+--
+For function usage information, run:
+SELECT {schema_madlib}.pca_sparse_project('usage');
+--
         """.format(schema_madlib=schema_madlib)
 
-
-# Validate arguments: Same as pca
-# ------------------------------------------------------------------------
 def _validate_args(schema_madlib,
                    source_table,
                    pc_table,
@@ -249,9 +414,6 @@ def _validate_args(schema_madlib,
         _assert(row_dim > 0 and col_dim > 0,
                 "PCA error: row_dim/col_dim should be positive integer")
 
-        validate_sparse(source_table,
-                        {'row': row_id, 'col': col_id, 'val': val_id},
-                        check_col=False)
 # ------------------------------------------------------------------------
 
 
@@ -290,65 +452,9 @@ def pca_sparse_project(schema_madlib,
     Throws:
         plpy.error if any argument is invalid
     """
-
-    # Reset the message level to avoid random messages
-    old_msg_level = plpy.execute("""
-                                  SELECT setting
-                                  FROM pg_settings
-                                  WHERE name='client_min_messages'
-                                  """)[0]['setting']
-    plpy.execute('SET client_min_messages TO warning')
-
-    # Step 1: Validate the input arguments
-    _validate_args(schema_madlib,
-                   source_table,
-                   pc_table,
-                   out_table,
-                   row_id,
-                   col_id,
-                   val_id,
-                   row_dim,
-                   col_dim,
-                   residual_table,
-                   result_summary_table)
-
-    # Step 2: Create a copy of the sparse matrix and add row_dims and col_dims
-    # Warning: This changes the column names of the table
-    sparse_table_copy = "pg_temp." + unique_string() + "_sparse_table_copy"
-    create_temp_sparse_matrix_table_with_dims(source_table,
-                                              sparse_table_copy,
-                                              row_id,
-                                              col_id,
-                                              val_id,
-                                              row_dim,
-                                              col_dim)
-
-    # Step 3: Densify the input matrix
-    x_dense = "pg_temp." + unique_string() + "_dense"
-    plpy.execute("""
-        SELECT {schema_madlib}.matrix_densify(
-            '{sparse_table_copy}', 'row={row_id}, col={col_id}, val={val_id}',
-            '{x_dense}', 'row=row_id, col=col_id,val=row_vec')
-        """.format(**locals()))
-
-    # Step 4: Pass the densified matrix to regular PCA
-    pca_project(schema_madlib,
-                x_dense,
-                pc_table,
-                out_table,
-                'row_id',
-                residual_table,
-                result_summary_table)
-
-    # Step 4: Clean up
-    plpy.execute(
-        """
-        DROP TABLE IF EXISTS {x_dense};
-        DROP TABLE IF EXISTS {sparse_table_copy};
-        """.format(x_dense=x_dense,
-                   sparse_table_copy=sparse_table_copy))
-
-    plpy.execute("SET client_min_messages TO %s" % old_msg_level)
+    pca_project_wrap(schema_madlib, source_table, pc_table, out_table,
+                       row_id, residual_table, result_summary_table,
+                       True, col_id, val_id, row_dim, col_dim)
 
 
 # ------------------------------------------------------------------------
@@ -379,8 +485,21 @@ def pca_project(schema_madlib,
     Throws:
         plpy.error if any argument is invalid
     """
-    t0 = time.time()  # measure the starting time
+    pca_project_wrap(schema_madlib, source_table, pc_table, out_table,
+                       row_id, residual_table, result_summary_table)
 
+
+def pca_project_wrap(schema_madlib, source_table, pc_table, out_table,
+                       row_id, residual_table,
+                       result_summary_table, is_sparse=False,
+                       col_id=None, val_id=None, row_dim=None,
+                       col_dim=None, **kwargs):
+    """
+    This wrapper was added to support grouping columns. This
+    function does the necessary pre-processing for handling
+    grouping_cols, if set. It then constructs a single query that
+    includes a separate "madlib._pca_project_union(...)" call for each group.
+    """
     # Reset the message level to avoid random messages
     old_msg_level = plpy.execute("""
                                   SELECT setting
@@ -388,24 +507,269 @@ def pca_project(schema_madlib,
                                   WHERE name='client_min_messages'
                                   """)[0]['setting']
     plpy.execute('SET client_min_messages TO warning')
-
-    # Step 1: Validate the input arguments
-    _validate_args(schema_madlib, source_table, pc_table, out_table,
+    if is_sparse:
+        _validate_args(schema_madlib, source_table, pc_table, out_table,
+                   row_id, col_id, val_id, row_dim, col_dim, residual_table,
+                   result_summary_table)
+    else:
+        _validate_args(schema_madlib, source_table, pc_table, out_table,
                    row_id, None, None, None, None,
                    residual_table, result_summary_table)
+    # If we add new columns to the pca_train output table in the future, they should
+    # be included in this list:
+    pc_table_model_cols = ['row_id', 'principal_components', 'std_dev', 'proportion']
+    grouping_cols_list = [col for col in get_cols(pc_table) if col not in pc_table_model_cols]
+    grouping_cols = ''
+    if grouping_cols_list:
+        grouping_cols = ', '.join(grouping_cols_list)
+
+    other_columns_in_table = [col for col in get_cols(source_table) if col not in grouping_cols_list]
+    grouping_cols_clause = ''
+    if(grouping_cols):
+        # validate the grouping columns. We currently only support grouping_cols
+        # to be column names in the source_table, and not expressions!
+        _assert(columns_exist_in_table(source_table, grouping_cols_list, schema_madlib),
+                """PCA error: One or more grouping columns in {0} do not exist in {1}, but
+                the model in {2} was learnt with grouping!""".format(grouping_cols,
+                    source_table, pc_table))
+        distinct_grouping_values = plpy.execute("""
+                SELECT DISTINCT {grouping_cols} FROM {source_table}
+            """.format(grouping_cols=grouping_cols, source_table=source_table))
+        cols_names_types = get_cols_and_types(source_table)
+        grouping_cols_clause = ', ' + ', '.join([c_name+" "+c_type
+            for (c_name, c_type) in cols_names_types if c_name in grouping_cols_list])
+    # Create all output tables
+    plpy.execute("""
+            DROP TABLE IF EXISTS {0};
+            CREATE TABLE {0} (
+                row_id      INTEGER,
+                row_vec     double precision[]
+                {1}
+            ) """.format(out_table, grouping_cols_clause))
+    if result_summary_table:
+        plpy.execute(
+                """
+                DROP TABLE IF EXISTS {0};
+                CREATE TABLE {0} (
+                    exec_time               FLOAT8,
+                    residual_norm           FLOAT8,
+                    relative_residual_norm  FLOAT8
+                    {1}
+                ) """.format(result_summary_table, grouping_cols_clause))
+    else:
+        result_summary_table = ''
+    if residual_table:
+        plpy.execute("""
+            DROP TABLE IF EXISTS {0};
+            CREATE TABLE {0} (
+                row_id      INTEGER,
+                row_vec     double precision[]
+                {1}
+            ) """.format(residual_table, grouping_cols_clause))
+    if not residual_table:
+        residual_table = ''
+
+    # declare variables whose values will be different for each group, if
+    # grouping_cols is specified
+    grouping_where_clause = ''
+    sparse_where_condition = ''
+    select_grouping_cols = ''
+    grouping_cols_values = ''
+    result_summary_table_temp = ''
+    other_columns_in_pc_table = [col for col in get_cols(pc_table) if col not in grouping_cols_list]
+    temp_pc_table_columns = ', '.join(other_columns_in_pc_table)
+    original_row_id = row_id
+
+    other_columns_in_table.remove(row_id)
+    temp_source_table_columns = ','.join(other_columns_in_table)
+
+    pca_union_call_list = []
+    grp_id = 0
+    if not is_sparse:
+        col_id = 'NULL'
+        val_id = 'NULL'
+        row_dim = 0
+        col_dim = 0
+    while True:
+        if grouping_cols:
+            grp_value_dict = distinct_grouping_values[grp_id]
+            where_conditions = ' AND '.join([str(key)+"="+str(value) for (key, value) in grp_value_dict.items()])
+            sparse_where_condition = ' AND ' + where_conditions
+            grouping_where_clause = ' WHERE ' + where_conditions
+            select_grouping_cols = ', ' + ', '.join([str(value)+" AS "+key for (key, value) in grp_value_dict.items()])
+            grouping_cols_values = ', ' + ', '.join([str(value) for (key, value) in grp_value_dict.items()])
+
+        pca_union_call_list.append("""
+            {schema_madlib}._pca_project_union('{source_table}', '{pc_table}', '{out_table}',
+                '{row_id}', '{original_row_id}', '{grouping_cols}',
+                '{grouping_cols_clause}', '{residual_table}', '{result_summary_table}',
+                {grp_id}, '{grouping_where_clause}', '{sparse_where_condition}','{select_grouping_cols}',
+                '{grouping_cols_values}', '{temp_source_table_columns}', '{temp_pc_table_columns}',
+                {is_sparse}, '{col_id}', '{val_id}', {row_dim}, {col_dim})
+            """.format(schema_madlib=schema_madlib,
+                source_table=source_table, pc_table=pc_table,
+                out_table=out_table, row_id=row_id,
+                original_row_id=original_row_id,
+                grouping_cols=grouping_cols,
+                grouping_cols_clause=grouping_cols_clause,
+                residual_table=residual_table,
+                result_summary_table=result_summary_table,
+                grp_id=grp_id, grouping_where_clause=grouping_where_clause,
+                sparse_where_condition=sparse_where_condition,
+                select_grouping_cols=select_grouping_cols,
+                grouping_cols_values=grouping_cols_values,
+                temp_source_table_columns=temp_source_table_columns,
+                temp_pc_table_columns=temp_pc_table_columns, is_sparse=is_sparse,
+                col_id=col_id, val_id=val_id, row_dim=row_dim, col_dim=col_dim))
+        grp_id += 1
+        if not grouping_cols_list or len(distinct_grouping_values) == grp_id:
+            break
+    # "SELECT <query_1>, <query_2>, <query_3>, ..." is expected to run each
+    # <query_i> in parallel.
+    pca_union_call = 'SELECT ' + ', '.join(pca_union_call_list)
+    plpy.execute(pca_union_call)
+    plpy.execute("SET client_min_messages TO %s" % old_msg_level)
+
+def _pca_project_union(schema_madlib, source_table, pc_table, out_table,
+        row_id, original_row_id, grouping_cols, grouping_cols_clause,
+        residual_table, result_summary_table, grp_id, grouping_where_clause,
+        sparse_where_condition, select_grouping_cols, grouping_cols_values,
+        temp_source_table_columns, temp_pc_table_columns, is_sparse, col_id,
+        val_id, row_dim, col_dim, **kwargs):
+    """
+    The pca_project is performed over each group, if any.
+
+    Args:
+        @param schema_madlib -- madlib schema name
+        @param source_table -- Source table name (dense matrix)
+        @param pc_table -- Table containing the principal components (the pca_train output table)
+        @param out_table -- Output table name
+        @param row_id -- Column name for the ID for each row
+        @param original_row_id  -- copy of the row_id originally passed
+        @param grouping_cols -- Comma-separated list of grouping columns (Default: NULL)
+        @param grouping_cols_clause -- Part of the SQL query to be used with grouping_cols
+        @param residual_table -- Residual table name
+        @param result_summary_table -- Table name to store summary of results (Default: NULL)
+        @param grp_id -- a place holder id for each group
+        @param grouping_where_clause -- WHERE clause using grouping_cols
+        @param select_grouping_cols -- SELECT clause using grouping_cols
+        @param grouping_cols_values -- distinct values of the grouping_cols
+        @param temp_source_table_columns -- SELECT clause for creating temporary copy of the source_table
+        @param temp_pc_table_columns -- non grouping_cols of the pc_table
+        @param is_sparse -- specifies if the PCA call is for sparse or dense matrices
+        @param col_id -- sparse representation based detail
+        @param val_id -- sparse representation based detail
+        @param row_dim -- sparse representation based detail
+        @param col_dim -- sparse representation based detail
+
+    Returns:
+        None
+    """
+    out_table_grouped = "pg_temp." + unique_string() + "group_" + str(grp_id)
+    if grouping_cols:
+        pc_table_grouped = "pg_temp." + unique_string() + "group_" + str(grp_id)
+        plpy.execute("""
+                CREATE TABLE {pc_table_grouped} AS
+                SELECT {temp_pc_table_columns}
+                FROM {pc_table}
+                {grouping_where_clause}
+            """.format(pc_table_grouped=pc_table_grouped,
+                pc_table=pc_table, grouping_where_clause=grouping_where_clause,
+                temp_pc_table_columns=temp_pc_table_columns))
+    else:
+        pc_table_grouped = pc_table
 
+    t0 = time.time()  # measure the starting time
+    # Step 1: Validate the input arguments
+    if is_sparse:
+        # Step 1.1: Create a copy of the sparse matrix and add row_dims and col_dims
+        # Warning: This changes the column names of the table
+        sparse_table_copy = "pg_temp." + unique_string() + "_sparse_table_copy"
+        create_temp_sparse_matrix_table_with_dims(source_table, sparse_table_copy,
+                                                  row_id, col_id, val_id,
+                                                  row_dim, col_dim, sparse_where_condition)
+        validate_sparse(sparse_table_copy,
+                        {'row': row_id, 'col': col_id, 'val': val_id},
+                        check_col=False)
+        # Step 1.2: Densify the input matrix
+        x_dense = "pg_temp." + unique_string() + "_dense"
+        plpy.execute("""
+            SELECT {schema_madlib}.matrix_densify(
+                '{sparse_table_copy}', 'row={row_id}, col={col_id}, val={val_id}',
+                '{x_dense}', 'row=row_id, col=col_id,val=row_vec')
+            """.format(schema_madlib=schema_madlib,
+                sparse_table_copy=sparse_table_copy, row_id=row_id,
+                col_id=col_id, val_id=val_id, x_dense=x_dense))
+        plpy.execute("""
+            DROP TABLE IF EXISTS {0};
+            """.format(sparse_table_copy))
+        source_table_grouped = x_dense
+    else:
+        # For Dense matrix format only:
+        # We can now ignore the original row_id for all computations since we will
+        # create a new table with a row_id column that has no duplicates and ranges
+        # from 1 to number of rows in the group/table. This is to mainly support the
+        # grouping scenario where the row_id values might not range between 1 and
+        # number of rows in the group, for each group. Doing this also just extends
+        # this behavior for non-grouping scenarios too. If creating a new temp table
+        # that corrects the row_id column is not of much importance in non-grouping
+        # cases, we can avoid creating the temp table and save some computation time.
+        # But, at the moment, the code creates the temp table even for the non-grouping
+        # scenario.
+        # We don't need to do this for sparse representation because of the nature
+        # of its definition.
+
+        # Preserve the mapping between new row_id created and the original row_id. This is
+        # required only for dense input format.
+        temp_row_id = "original_row_id" + unique_string()
+        row_id_map_table = "rowid" + unique_string()
+        plpy.execute("""
+                CREATE TABLE {row_id_map_table} AS
+                SELECT
+                    {source_table}.{original_row_id} AS {temp_row_id},
+                    {select_clause}
+                FROM {source_table}
+                {grouping_where_clause}
+            """.format(row_id_map_table=row_id_map_table,
+                original_row_id=original_row_id,
+                temp_row_id=temp_row_id,
+                source_table=source_table,
+                select_clause=""" ROW_NUMBER() OVER() AS row_id """,
+                grouping_where_clause=grouping_where_clause))
+
+        # Creation of this temp table is unnecessary if the scenario does not involve
+        # grouping, and/or, the input table had perfect values for the row_id column.
+        # This temp table will ensure pca works even when row_id of the source_table
+        # does not have serially increasing numbers starting from 1;
+        source_table_grouped = "pg_temp." + unique_string() + "group_" + str(grp_id)
+        plpy.execute("""
+                    CREATE TABLE {source_table_grouped} AS
+                    SELECT {row_id_map_table}.row_id, {temp_source_table_columns}
+                    FROM
+                        (
+                            SELECT *
+                            FROM {source_table}
+                            {grouping_where_clause}
+                        ) t1
+                    INNER JOIN {row_id_map_table}
+                    ON {row_id_map_table}.{temp_row_id}=t1.{row_id}
+                """.format(source_table_grouped=source_table_grouped,
+                    temp_row_id=temp_row_id, row_id_map_table=row_id_map_table, row_id=row_id,
+                    source_table=source_table, grouping_where_clause=grouping_where_clause,
+                    temp_source_table_columns=temp_source_table_columns))
+
+    row_id = 'row_id'
     # Make sure that the table has row_id and row_vec
     source_table_copy = "pg_temp." + unique_string()
     need_new_column_names = cast_dense_input_table_to_correct_columns(
-        schema_madlib, source_table, source_table_copy, row_id)
+        schema_madlib, source_table_grouped, source_table_copy, row_id)
 
     if(need_new_column_names):
-        source_table = source_table_copy
-
-    [row_dim, col_dim] = get_dims(source_table,
+        source_table_grouped = source_table_copy
+    [row_dim, col_dim] = get_dims(source_table_grouped,
                                   {'row': 'row_id', 'col': 'col_id',
                                    'val': 'row_vec'})
-    validate_dense(source_table,
+    validate_dense(source_table_grouped,
                    {'row': 'row_id', 'col': 'col_id', 'val': 'row_vec'},
                    check_col=False, row_dim=row_dim)
 
@@ -426,49 +790,54 @@ def pca_project(schema_madlib,
                 row_id,
                 ({schema_madlib}.utils_normalize_data(
                                   row_vec,
-                                  (select column_mean from {pc_table_mean}),
+                                  (select column_mean from {pc_table_mean}
+                                  {grouping_where_clause}),
                                   '{x_std_str}'::double precision[]))
                     AS row_vec
-            FROM {source_table}
+            FROM {source_table_grouped}
         """.format(schema_madlib=schema_madlib,
                    pc_table_mean=pc_table_mean,
-                   source_table=source_table,
+                   source_table_grouped=source_table_grouped,
                    scaled_source_table=scaled_source_table,
+                   grouping_where_clause=grouping_where_clause,
                    x_std_str=x_std_str))
 
     plpy.execute(
         """
         SELECT {schema_madlib}.matrix_mult('{scaled_source_table}',
                                             'trans=false,row=row_id, col=col_id, val=row_vec',
-                                           '{pc_table}',
+                                           '{pc_table_grouped}',
                                             'trans=TRUE, row=row_id, col=col_id, val=principal_components',
-                                            '{out_table}',
+                                            '{out_table_grouped}',
                                             'row=row_id, col=col_id,val=row_vec');
         """.format(schema_madlib=schema_madlib,
                    scaled_source_table=scaled_source_table,
-                   pc_table=pc_table,
-                   out_table=out_table))
+                   pc_table_grouped=pc_table_grouped,
+                   out_table_grouped=out_table_grouped))
 
     # Step 3: Compute the Residual table (if required)
     # Residual table: res = mat - proj
+    create_residual_table = False
     if residual_table or result_summary_table:
+        residual_table_grouped = "pg_temp." + unique_string() + "_temp_residual"
         create_temp_residual_table = False
         if not residual_table:
             create_temp_residual_table = True
-            residual_table = "pg_temp." + unique_string() + "_temp_residual"
+        else:
+            create_residual_table = True
         approx_table = "pg_temp." + unique_string() + "_approx"
         # Build an approximate reconstruction of the data
         plpy.execute(
             """
-            SELECT {schema_madlib}.matrix_mult('{out_table}',
+            SELECT {schema_madlib}.matrix_mult('{out_table_grouped}',
                                                 'row=row_id, col=col_id, val=row_vec',
-                                               '{pc_table}',
+                                               '{pc_table_grouped}',
                                                 'row=row_id, col=col_id, val=principal_components',
                                                 '{approx_table}',
                                                 'row=row_id, col=col_id, val=row_vec');
             """.format(schema_madlib=schema_madlib,
-                       out_table=out_table,
-                       pc_table=pc_table,
+                       out_table_grouped=out_table_grouped,
+                       pc_table_grouped=pc_table_grouped,
                        approx_table=approx_table))
 
         # Compute the difference between the reconstruction and real data
@@ -481,12 +850,12 @@ def pca_project(schema_madlib,
                                         '{approx_table}',
                                         'row=row_id, col=col_id, val=row_vec',
                                         -1,
-                                        '{residual_table}',
+                                        '{residual_table_grouped}',
                                         'row=row_id, col=col_id, val=row_vec');
             """.format(schema_madlib=schema_madlib,
                        scaled_source_table=scaled_source_table,
                        approx_table=approx_table,
-                       residual_table=residual_table))
+                       residual_table_grouped=residual_table_grouped))
 
         # Step 4: Compute the results summary table (if required)
         # If the residual table is not asked by the user, but he does ask for
@@ -494,19 +863,19 @@ def pca_project(schema_madlib,
         if result_summary_table:
             source_table_norm = plpy.execute(
                 """
-                SELECT {schema_madlib}.matrix_norm('{source_table}',
+                SELECT {schema_madlib}.matrix_norm('{source_table_grouped}',
                                                    'row=row_id, col=col_id, val=row_vec') as r
                 """.format(schema_madlib=schema_madlib,
-                           source_table=source_table,
+                           source_table_grouped=source_table_grouped,
                            row_id=row_id))[0]['r']
 
             # Compute the norm of the residual table
             residual_norm = plpy.execute(
                 """
-                SELECT {schema_madlib}.matrix_norm('{residual_table_name}',
+                SELECT {schema_madlib}.matrix_norm('{residual_table_grouped}',
                                                    'row=row_id, col=col_id, val=row_vec') as r
                 """.format(schema_madlib=schema_madlib,
-                           residual_table_name=residual_table,
+                           residual_table_grouped=residual_table_grouped,
                            row_id=row_id))[0]['r']
             # Compute the relative error of the norm
             # Prevent division by zero
@@ -514,12 +883,6 @@ def pca_project(schema_madlib,
                 relative_residual_norm = residual_norm / source_table_norm
             else:
                 relative_residual_norm = 0
-            plpy.execute(
-                """
-                CREATE TABLE {result_summary_table} ( exec_time FLOAT8,
-                                                      residual_norm FLOAT8,
-                                                      relative_residual_norm FLOAT8);
-                """.format(result_summary_table=result_summary_table))
             # Compute the time in milli-seconds
             t1 = time.time()
             dt = (t1 - t0) * 1000.
@@ -529,19 +892,85 @@ def pca_project(schema_madlib,
                 INSERT INTO {result_summary_table} VALUES
                     ({dt},
                      {residual_norm}::double precision,
-                     {relative_residual_norm}::double precision);
-                """.format(dt=dt,
-                           residual_norm=residual_norm,
+                     {relative_residual_norm}::double precision
+                     {grouping_cols_values}
+                     );
+                """.format(dt=dt, residual_norm=residual_norm,
                            result_summary_table=result_summary_table,
-                           relative_residual_norm=relative_residual_norm))
+                           relative_residual_norm=relative_residual_norm,
+                           grouping_cols_values=grouping_cols_values))
 
             plpy.execute("""
                      DROP TABLE IF EXISTS {approx_table};
                     """.format(approx_table=approx_table))
             if create_temp_residual_table:
                 plpy.execute("""
-                     DROP TABLE IF EXISTS {residual_table};
-                     """.format(residual_table=residual_table))
+                     DROP TABLE IF EXISTS {0};
+                     """.format(residual_table_grouped))
+
+    if is_sparse:
+        ## We don't have to join based on row_id for sparse project.
+        if create_residual_table:
+            plpy.execute("""
+                    INSERT INTO {residual_table}
+                    SELECT * {select_grouping_cols}
+                    FROM {residual_table_grouped}
+                """.format(residual_table=residual_table,
+                    select_grouping_cols=select_grouping_cols,
+                    residual_table_grouped=residual_table_grouped))
+        plpy.execute("""
+                INSERT INTO {out_table}
+                SELECT * {select_grouping_cols}
+                FROM {out_table_grouped}
+            """.format(out_table=out_table,
+                    select_grouping_cols=select_grouping_cols,
+                    out_table_grouped=out_table_grouped))
+    else:
+        output_table_cols = get_cols(out_table_grouped)
+        output_table_cols.remove('row_id')
+        output_table_select_clause = """{row_id_map_table}.{temp_row_id},
+                {out_table_cols}
+                {select_grouping_cols}
+            """.format(row_id_map_table=row_id_map_table,
+                temp_row_id=temp_row_id,
+                out_table_cols=', '.join(output_table_cols),
+                select_grouping_cols=select_grouping_cols)
+        if create_residual_table:
+            plpy.execute("""
+                INSERT INTO {residual_table}
+                SELECT {select_clause}
+                FROM {residual_table_grouped}
+                INNER JOIN {row_id_map_table}
+                ON {row_id_map_table}.row_id={residual_table_grouped}.row_id
+                """.format(residual_table=residual_table,
+                    select_clause=output_table_select_clause,
+                    residual_table_grouped=residual_table_grouped,
+                    row_id_map_table=row_id_map_table))
+        plpy.execute("""
+                    INSERT INTO {out_table}
+                    SELECT {select_clause}
+                    FROM {out_table_grouped}
+                    INNER JOIN {row_id_map_table}
+                    ON {row_id_map_table}.row_id={out_table_grouped}.row_id
+                    """.format(out_table=out_table,
+                        select_clause=output_table_select_clause,
+                        out_table_grouped=out_table_grouped,
+                        row_id_map_table=row_id_map_table))
+        plpy.execute("""
+                DROP TABLE IF EXISTS {0};
+            """.format(row_id_map_table))
+    if residual_table or result_summary_table:
+        plpy.execute("""
+                DROP TABLE IF EXISTS {0}
+            """.format(residual_table_grouped))
+    plpy.execute("""
+            DROP TABLE IF EXISTS {0};
+            DROP TABLE IF EXISTS {1};
+            DROP TABLE IF EXISTS {2};
+        """.format(scaled_source_table,
+            source_table_grouped, out_table_grouped))
+    if grouping_cols:
+        plpy.execute("""
+            DROP TABLE IF EXISTS {0};
+        """.format(pc_table_grouped))
 
-    plpy.execute("DROP TABLE IF EXISTS {0}".format(scaled_source_table))
-    plpy.execute("SET client_min_messages TO %s" % old_msg_level)

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/02a7ef45/src/ports/postgres/modules/pca/pca_project.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/pca/pca_project.sql_in b/src/ports/postgres/modules/pca/pca_project.sql_in
index fde15d0..1023569 100644
--- a/src/ports/postgres/modules/pca/pca_project.sql_in
+++ b/src/ports/postgres/modules/pca/pca_project.sql_in
@@ -180,84 +180,297 @@ The <em>result_summary_table</em> contains information about the performance tim
 SELECT madlib.pca_project();
 </pre>
 
--# Create the sample data:
+-# Create sample data in dense matrix form:
 <pre class="example">
 DROP TABLE IF EXISTS mat;
-CREATE TABLE mat (
-    row_id integer,
-    row_vec double precision[]
-);
+CREATE TABLE mat (id integer,
+                  row_vec double precision[]
+                  );
 INSERT INTO mat VALUES
-(1, ARRAY[4,7,5]),
-(2, ARRAY[1,2,5]),
-(3, ARRAY[7,4,4]),
-(4, ARRAY[9,2,4]),
-(5, ARRAY[8,5,7]),
-(6, ARRAY[0,5,5]);
+(1, '{1,2,3}'),
+(2, '{2,1,2}'),
+(3, '{3,2,1}');
 </pre>
--# Run the PCA function and keep only the top two principle components:
+
+-# Run the PCA function for a specified number of principal components and view the results:
 <pre class="example">
 DROP TABLE IF EXISTS result_table, result_table_mean;
-SELECT madlib.pca_train ( 'mat',
-                        'result_table',
-                        'row_id',
-                        2
-                        );
-SELECT * FROM result_table;
+SELECT madlib.pca_train('mat',             -- Source table
+                        'result_table',    -- Output table
+                        'id',              -- Row id of source table
+                         2);               -- Number of principal components
+SELECT * FROM result_table ORDER BY row_id;
 </pre>
 <pre class="result">
- row_id |                    principal_components                     |     std_dev      |    proportion     
---------+-------------------------------------------------------------+------------------+-------------------
-      1 | {-0.995725178022077,0.0921925100100751,0.00564897786728818} |  3.7757341932744 | 0.745095924995985
-      2 | {0.0900421470942019,0.955231794466691,0.281823758023718}    | 1.97776947254051 | 0.204437565497132
+ row_id |                     principal_components                     |      std_dev      |    proportion     
+--------+--------------------------------------------------------------+-------------------+-------------------
+      1 | {0.707106781186547,-6.93889390390723e-18,-0.707106781186548} |  1.41421356237309 | 0.857142857142244
+      2 | {0,1,0}                                                      | 0.577350269189626 | 0.142857142857041
+(2 rows)
 </pre>
--# Project the original data into a low-dimensional representation:
+
+-# Project the original data to a lower dimensional representation and view the result of the projection:
 <pre class="example">
 DROP TABLE IF EXISTS residual_table, result_summary_table, out_table;
 SELECT madlib.pca_project( 'mat',
-                            'result_table',
-                            'out_table',
-                            'row_id',
-                            'residual_table',
-                            'result_summary_table'
-                            );
-</pre>
--# View dense matrix with the projection onto the principal components:
-<pre class="example">
+                           'result_table',
+                           'out_table',
+                           'id',
+                           'residual_table',
+                           'result_summary_table'
+                           );
 SELECT * FROM out_table ORDER BY row_id;
 </pre>
 <pre class="result">
- row_id |                row_vec                
---------+---------------------------------------
-      1 | {1.09098309337665,-2.63145496174091}
-      2 | {3.61719607739251,2.41483045187516}
-      3 | {-2.17841894858709,0.24593773840028}
-      4 | {-4.3542543246514,1.97631703314526}
-      5 | {-3.06500468299723,-1.64480747723177}
-      6 | {4.88949878544481,-0.360822784430716}
+ row_id |               row_vec                
+--------+--------------------------------------
+      1 | {-1.41421356237309,-0.33333333333}
+      2 | {2.77555756157677e-17,0.66666666667}
+      3 | {1.41421356237309,-0.33333333333}
+(3 rows)
 </pre>
--# Check the error in the projection:
+Check the error in the projection:
 <pre class="example">
 SELECT * FROM result_summary_table;
 </pre>
 <pre class="result">
-   exec_time   | residual_norm | relative_residual_norm 
----------------+---------------+------------------------
- 56.8881034851 | 2.19726255664 |         0.099262204234
+   exec_time   |   residual_norm   | relative_residual_norm 
+---------------+-------------------+------------------------
+ 331.792116165 | 5.89383520611e-16 |      9.68940539229e-17
+(1 row)
 </pre>
--# Check the residuals:
+Check the residuals:
 <pre class="example">
 SELECT * FROM residual_table ORDER BY row_id;
 </pre>
 <pre class="result">
- row_id |                          row_vec                           
---------+------------------------------------------------------------
-      1 | {0.0160441468047001,0.219103418411008,-0.747769465736052}
-      2 | {-0.0141636064722857,-0.193422226365899,0.660123132354738}
-      3 | {0.0197048332985021,0.269094791232932,-0.918383061897929}
-      4 | {0.00897783310632772,0.122603834748268,-0.418429820364218}
-      5 | {-0.0333376637524658,-0.455268589780183,1.55376831915842}
-      6 | {0.00277445701511336,0.037888771752409,-0.129309103509956}
+ row_id |                              row_vec                               
+--------+--------------------------------------------------------------------
+      1 | {-2.22044604925031e-16,-1.11022302462516e-16,3.33066907387547e-16}
+      2 | {-1.12243865646685e-18,0,4.7381731349413e-17}
+      3 | {2.22044604925031e-16,1.11022302462516e-16,-3.33066907387547e-16}
+(3 rows)
+</pre>
+
+-# Now we use grouping in dense form to learn different models for different groups.
+First, we create sample data in dense matrix form with a grouping column.
+Note we actually have different matrix sizes for the different groups, which 
+is allowed for dense:
+<pre class="example">
+DROP TABLE IF EXISTS mat_group;
+CREATE TABLE mat_group (
+    id integer,
+    row_vec double precision[],
+    matrix_id integer
+);
+INSERT INTO mat_group VALUES
+(1, '{1,2,3}', 1),
+(2, '{2,1,2}', 1),
+(3, '{3,2,1}', 1),
+(4, '{1,2,3,4,5}', 2),
+(5, '{2,5,2,4,1}', 2),
+(6, '{5,4,3,2,1}', 2);
+</pre>
+
+-# Run the PCA function with grouping for a specified proportion of variance and view the results:
+<pre class="example">
+DROP TABLE IF EXISTS result_table_group, result_table_group_mean;
+SELECT madlib.pca_train('mat_group',             -- Source table
+                        'result_table_group',    -- Output table
+                        'id',                    -- Row id of source table
+                         0.8,                    -- Proportion of variance
+                        'matrix_id');            -- Grouping column
+SELECT * FROM result_table_group ORDER BY matrix_id, row_id;
+</pre>
+<pre class="result">
+ row_id |                                      principal_components                                      |     std_dev     |    proportion     | matrix_id 
+--------+------------------------------------------------------------------------------------------------+-----------------+-------------------+-----------
+      1 | {0.707106781186548,0,-0.707106781186547}                                                       | 1.4142135623731 | 0.857142857142245 |         1
+      1 | {-0.555378486712784,-0.388303582074091,0.0442457354870796,0.255566375612852,0.688115693174023} | 3.2315220311722 | 0.764102534485173 |         2
+      2 | {0.587384101786277,-0.485138064894743,0.311532046315153,-0.449458074050715,0.347212037159181}  |  1.795531127192 | 0.235897465516047 |         2
+(3 rows)
+</pre>
+
+-# Run the PCA projection on subsets of an input table based on grouping columns. 
+Note that the parameter 'pc_table' used for projection must be generated in training 
+using the same grouping columns. 
+<pre class="example">
+DROP TABLE IF EXISTS mat_group_projected;
+SELECT madlib.pca_project('mat_group',
+                          'result_table_group',
+                          'mat_group_projected',
+                          'id');
+SELECT * FROM mat_group_projected ORDER BY matrix_id, row_id;
+</pre>
+<pre class="result">
+ row_id |                row_vec                | matrix_id 
+--------+---------------------------------------+-----------
+      1 | {1.4142135623731}                     |         1
+      2 | {7.40148683087139e-17}                |         1
+      3 | {-1.4142135623731}                    |         1
+      4 | {-3.59290479201926,0.559694003674779} |         2
+      5 | {0.924092949098971,-2.00871628417505} |         2
+      6 | {2.66881184290186,1.44902228049511}   |         2
+(6 rows)
+</pre>
+
+-# Now let's look at sparse matrices.  Create sample data in sparse matrix form:
+<pre class="example">
+DROP TABLE IF EXISTS mat_sparse;
+CREATE TABLE mat_sparse (
+    row_id integer,
+    col_id integer,
+    value double precision
+);
+INSERT INTO mat_sparse VALUES
+(1, 1, 1.0),
+(2, 2, 2.0),
+(3, 3, 3.0),
+(4, 4, 4.0),
+(1, 5, 5.0),
+(2, 4, 6.0),
+(3, 2, 7.0),
+(4, 3, 8.0);
+</pre>
+As an aside, this is what the sparse matrix above looks like when 
+put in dense form:
+<pre class="example">
+DROP TABLE IF EXISTS mat_dense;
+SELECT madlib.matrix_densify('mat_sparse', 
+                            'row=row_id, col=col_id, val=value', 
+                            'mat_dense');
+SELECT * FROM mat_dense ORDER BY row_id;
+</pre>
+<pre class="result">
+ row_id |    value    
+--------+-------------
+      1 | {1,0,0,0,5}
+      2 | {0,2,0,6,0}
+      3 | {0,7,3,0,0}
+      4 | {0,0,8,4,0}
+(4 rows)
+</pre>
+
+-# Run the PCA sparse function for a specified number of principal components and view the results:
+<pre class="example">DROP TABLE IF EXISTS result_table, result_table_mean;
+SELECT madlib.pca_sparse_train( 'mat_sparse',       -- Source table
+                                'result_table',     -- Output table
+                                'row_id',           -- Row id of source table
+                                'col_id',           -- Column id of source table
+                                'value',            -- Value of matrix at row_id, col_id
+                                4,                  -- Actual number of rows in the matrix
+                                5,                  -- Actual number of columns in the matrix
+                                3);                 -- Number of principal components                            
+SELECT * FROM result_table ORDER BY row_id;
+</pre>
+Result (with principal components truncated for readability):
+<pre class="result">
+ row_id |         principal_components                 |     std_dev      |    proportion     
+--------+----------------------------------------------+------------------+-------------------
+      1 | {-0.0876046030186158,-0.0968983772909994,... | 4.21362803829554 | 0.436590030617467
+      2 | {-0.0647272661608605,0.877639526308692,...   | 3.68408023747461 | 0.333748701544697
+      3 | {-0.0780380267884855,0.177956517174911,...   | 3.05606908060098 | 0.229661267837836
+(3 rows)
+</pre>
+
+-# Project the original sparse data to a low-dimensional representation:
+<pre class="example">
+DROP TABLE IF EXISTS mat_sparse_out;
+SELECT madlib.pca_sparse_project(
+                    'mat_sparse',
+                    'result_table',
+                    'mat_sparse_out',
+                    'row_id',
+                    'col_id',
+                    'value',
+                    4,
+                    5
+                    );
+SELECT * FROM mat_sparse_out ORDER BY row_id;
+</pre>
+<pre class="result">
+ row_id |                         row_vec                         
+--------+---------------------------------------------------------
+      1 | {4.66617015032369,-2.63552220635847,2.1865220849604}
+      2 | {0.228360685652383,-1.21616275892926,-4.46864627611561}
+      3 | {0.672067460100428,5.45249627172823,0.56445525585642}
+      4 | {-5.5665982960765,-1.6008113064405,1.71766893529879}
+(4 rows)
+</pre>
+
+-# Now we use grouping in sparse form to learn different models for different groups.
+First, we create sample data in sparse matrix form with a grouping column:
+<pre class="example">
+DROP TABLE IF EXISTS mat_sparse_group;
+CREATE TABLE mat_sparse_group (
+    row_id integer,
+    col_id integer,
+    value double precision,
+    matrix_id integer);
+INSERT INTO mat_sparse_group VALUES
+(1, 1, 1.0, 1),
+(2, 2, 2.0, 1),
+(3, 3, 3.0, 1),
+(4, 4, 4.0, 1),
+(1, 5, 5.0, 1),
+(2, 4, 6.0, 2),
+(3, 2, 7.0, 2),
+(4, 3, 8.0, 2);
+</pre>
+
+-# Run the PCA sparse function with grouping for a specified proportion of variance
+and view the results:
+<pre class="example">
+DROP TABLE IF EXISTS result_table_group, result_table_group_mean;
+SELECT madlib.pca_sparse_train( 'mat_sparse_group',       -- Source table
+                                'result_table_group',     -- Output table
+                                'row_id',           -- Row id of source table
+                                'col_id',           -- Column id of source table
+                                'value',            -- Value of matrix at row_id, col_id
+                                4,                 -- Actual number of rows in the matrix
+                                5,                 -- Actual number of columns in the matrix
+                                0.8,                 -- Proportion of variance
+                                'matrix_id');
+SELECT * FROM result_table_group ORDER BY matrix_id, row_id;
+</pre>
+Result (with principal components truncated for readability):
+<pre class="result">
+ row_id |           principal_components             |     std_dev      |    proportion     | matrix_id 
+--------+--------------------------------------------+------------------+-------------------+-----------
+      1 | {-0.17805696611353,0.0681313257646983,...  | 2.73659933165925 | 0.544652792875481 |         1
+      2 | {-0.0492086814863993,0.149371585357526,... | 2.06058314533194 | 0.308800210823714 |         1
+      1 | {0,-0.479486114660443,...                  | 4.40325305087975 | 0.520500333693473 |         2
+      2 | {0,0.689230898585949,...                   |  3.7435566458567 | 0.376220573442628 |         2
+(4 rows)
+</pre>
+
+-# Projection in sparse format with grouping:
+<pre class="example">
+DROP TABLE IF EXISTS mat_sparse_group_projected;
+SELECT madlib.pca_sparse_project(
+    'mat_sparse_group',
+    'result_table_group',
+    'mat_sparse_group_projected',
+    'row_id',
+    'col_id',
+    'value',
+    4,
+    5
+    );
+SELECT * FROM mat_sparse_group_projected ORDER BY matrix_id, row_id;
+</pre>
+<pre class="result">
+ row_id |                 row_vec                 | matrix_id 
+--------+-----------------------------------------+-----------
+      1 | {-4.00039298524261,-0.626820612715982}  |         1
+      2 | {0.765350785238575,0.951348276645455}   |         1
+      3 | {1.04951017256904,2.22388180170356}     |         1
+      4 | {2.185532027435,-2.54840946563303}      |         1
+      1 | {-0.627846810195469,-0.685031603549092} |         2
+      2 | {-1.64754249747757,-4.7662114622896}    |         2
+      3 | {-3.98424961281857,4.13958468655255}    |         2
+      4 | {6.25963892049161,1.31165837928614}     |         2
+(8 rows)
 </pre>
 
 @anchor notes
@@ -269,12 +482,14 @@ containing the principal components. If this table is not found by the MADlib
 projection function, it will trigger an error.  As long as the principal component
 tables are created with MADlib functions, then the column-means table will be
 automatically found by the MADlib projection functions.
+
 - Because of the centering step in PCA projection
 (see "Technical Background"), sparse matrices almost always
 become dense during the projection
 process.  Thus, this implementation automatically densifies sparse matrix input,
 and there should be no expected performance improvement in using sparse matrix
 input over dense matrix input.
+
 - Table names can be optionally schema qualified (current_schemas() is
 searched if a schema name is not provided) and all table and column names
 should follow case-sensitivity and quoting rules per the database.
@@ -282,6 +497,10 @@ should follow case-sensitivity and quoting rules per the database.
 If mixed-case or multi-byte characters are desired for entity names then the
 string should be double-quoted; in this case the input would be '"MyTable"').
 
+- If the input table for pca_project (pca_sparse_project) contains grouping columns,
+the same grouping columns must also have been used in the training function that
+generated the principal components.
+
 
 @anchor background_project
 @par Technical Background
@@ -333,7 +552,7 @@ MADLIB_SCHEMA.pca_project(
     out_table               TEXT,    -- Output table name for the principal components
     row_id                  TEXT,    -- Column name for the ID for each row
     residual_table          TEXT,    -- Residual table (Default: NULL)
-    result_summary_table    TEXT     -- Table name to store summary of results (Default: NULL)
+    result_summary_table    TEXT    -- Table name to store summary of results (Default: NULL)
 )
 RETURNS VOID AS $$
 PythonFunction(pca, pca_project, pca_project)
@@ -457,6 +676,34 @@ $$ LANGUAGE SQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
 
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._pca_project_union(
+    source_table                TEXT,    -- Source table name (dense matrix)
+    pc_table                    TEXT,    -- Table containing the principal components (generated in training)
+    out_table                   TEXT,    -- Output table name
+    row_id                      TEXT,    -- Column name for the ID for each row
+    original_row_id             TEXT,    -- copy of the row_id originally passed
+    grouping_cols               TEXT,    -- Comma-separated list of grouping columns (Default: NULL)
+    grouping_cols_clause        TEXT,    -- Part of the SQL query to be used with grouping_cols
+    residual_table              TEXT,    -- Residual table name
+    result_summary_table        TEXT,    -- Table name to store summary of results (Default: NULL)
+    grp_id                      INTEGER, -- a place holder id for each group
+    grouping_where_clause       TEXT,    -- WHERE clause using grouping_cols
+    sparse_where_condition      TEXT,   -- WHERE clause used when creating temp sparse matrix table with dims
+    select_grouping_cols        TEXT,    -- SELECT clause using grouping_cols
+    grouping_cols_values        TEXT,    -- distinct values of the grouping_cols
+    temp_source_table_columns   TEXT,    -- SELECT clause for creating temporary copy of the source_table
+    temp_pc_table_columns       TEXT,    -- non grouping_cols of the source_table
+    is_sparse                   BOOLEAN, -- specifies if the PCA call is for sparse or dense matrices
+    col_id                      TEXT,    -- sparse representation based detail
+    val_id                      TEXT,    -- sparse representation based detail
+    row_dim                     INTEGER, -- sparse representation based detail
+    col_dim                     INTEGER  -- sparse representation based detail
+)
+RETURNS VOID AS $$
+PythonFunction(pca, pca_project, _pca_project_union)
+$$ LANGUAGE plpythonu
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
 
 -- Help and usage functions
 -----------------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/02a7ef45/src/ports/postgres/modules/pca/test/pca.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/pca/test/pca.sql_in b/src/ports/postgres/modules/pca/test/pca.sql_in
index 8cff3db..12d8ab1 100644
--- a/src/ports/postgres/modules/pca/test/pca.sql_in
+++ b/src/ports/postgres/modules/pca/test/pca.sql_in
@@ -118,6 +118,86 @@ NULL, 5, FALSE, 'result_table_214712398172490838');
 select * from result_table_214712398172490837;
 select * from result_table_214712398172490838;
 
+-- Test dense data with grouping
+DROP TABLE IF EXISTS mat;
+CREATE TABLE mat (
+    id integer,
+    row_vec double precision[],
+    grp integer
+);
+
+COPY mat (id, row_vec, grp) FROM stdin delimiter '|';
+1|{396,840,353,446,318,886,15,584,159,383}|1
+2|{691,58,899,163,159,533,604,582,269,390}|1
+3|{293,742,298,75,404,857,941,662,846,2}|1
+4|{462,532,787,265,982,306,600,608,212,885}|1
+5|{304,151,337,387,643,753,603,531,459,652}|1
+6|{327,946,368,943,7,516,272,24,591,204}|1
+7|{877,59,260,302,891,498,710,286,864,675}|1
+8|{458,959,774,376,228,354,300,669,718,565}|2
+9|{824,390,818,844,180,943,424,520,65,913}|2
+10|{882,761,398,688,761,405,125,484,222,873}|2
+11|{528,1,860,18,814,242,314,965,935,809}|2
+12|{492,220,576,289,321,261,173,1,44,241}|2
+13|{415,701,221,503,67,393,479,218,219,916}|2
+14|{350,192,211,633,53,783,30,444,176,932}|2
+15|{909,472,871,695,930,455,398,893,693,838}|2
+16|{739,651,678,577,273,935,661,47,373,618}|2
+\.
+
+-- Learn individual PCA models based on grouping column (grp)
+drop table if exists result_table_214712398172490837;
+drop table if exists result_table_214712398172490837_mean;
+drop table if exists result_table_214712398172490838;
+select pca_train('mat', 'result_table_214712398172490837', 'id', 0.8,
+'grp', 5, FALSE, 'result_table_214712398172490838');
+select * from result_table_214712398172490837;
+select * from result_table_214712398172490838;
+
+-- Matrix in the column format
+DROP TABLE IF EXISTS cmat;
+CREATE TABLE cmat (
+    id  integer,
+    val0    float8,
+    val1    float8,
+    val2    float8,
+    val3    float8,
+    val4    float8,
+    val5    float8,
+    val6    float8,
+    val7    float8,
+    val8    float8,
+    val9    float8,
+    grp     integer
+);
+
+COPY cmat (id, val0, val1, val2, val3, val4, val5, val6, val7, val8, val9, grp) FROM stdin delimiter '|';
+1|396|840|353|446|318|886|15|584|159|383|1
+2|691|58|899|163|159|533|604|582|269|390|1
+3|293|742|298|75|404|857|941|662|846|2|1
+4|462|532|787|265|982|306|600|608|212|885|1
+5|304|151|337|387|643|753|603|531|459|652|1
+6|327|946|368|943|7|516|272|24|591|204|1
+7|877|59|260|302|891|498|710|286|864|675|2
+8|458|959|774|376|228|354|300|669|718|565|2
+9|824|390|818|844|180|943|424|520|65|913|2
+10|882|761|398|688|761|405|125|484|222|873|2
+11|528|1|860|18|814|242|314|965|935|809|2
+12|492|220|576|289|321|261|173|1|44|241|2
+13|415|701|221|503|67|393|479|218|219|916|2
+14|350|192|211|633|53|783|30|444|176|932|2
+15|909|472|871|695|930|455|398|893|693|838|2
+16|739|651|678|577|273|935|661|47|373|618|2
+\.
+-- Learn individual PCA models on the column-format matrix (cmat), grouped by grp
+drop table if exists result_table_214712398172490837;
+drop table if exists result_table_214712398172490837_mean;
+drop table if exists result_table_214712398172490838;
+select pca_train('cmat', 'result_table_214712398172490837', 'id', 0.8,
+'grp', 5, FALSE, 'result_table_214712398172490838');
+select * from result_table_214712398172490837;
+select * from result_table_214712398172490838;
+
 -- SPARSE PCA: Make sure all possible default calls for sparse PCA work
 -----------------------------------------------------------------------------
 
@@ -240,6 +320,32 @@ select pca_sparse_train('sparse_mat', 'result_table_214712398172490837',
 'rownr', 'colnr', 'vals', 10, 10, 10);
 select * from result_table_214712398172490837;
 
+-- Sparse input data with grouping column
+DROP TABLE IF EXISTS sparse_mat_grp;
+CREATE TABLE sparse_mat_grp (
+    id integer,
+    col_id integer,
+    val_id integer,
+    grp    integer
+);
+COPY sparse_mat_grp (id, col_id, val_id, grp) FROM stdin delimiter '|';
+1|2|4|1
+1|5|6|1
+3|8|4|1
+5|4|2|1
+1|2|4|2
+1|5|6|2
+3|8|4|2
+5|4|2|2
+\.
+-- Learn individual PCA models based on grouping column (grp)
+drop table if exists result_table_214712398172490837;
+drop table if exists result_table_214712398172490837_mean;
+drop table if exists result_table_214712398172490838;
+select pca_sparse_train('sparse_mat_grp', 'result_table_214712398172490837',
+'id', 'col_id', 'val_id', 10, 10, 0.8, 'grp', 0, FALSE, 'result_table_214712398172490838');
+select * from result_table_214712398172490837;
+select * from result_table_214712398172490838;
 
 -------------------------------------------------------------------------
 drop table if exists mat;

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/02a7ef45/src/ports/postgres/modules/pca/test/pca_project.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/pca/test/pca_project.sql_in b/src/ports/postgres/modules/pca/test/pca_project.sql_in
index 82c547a..beb0ef4 100644
--- a/src/ports/postgres/modules/pca/test/pca_project.sql_in
+++ b/src/ports/postgres/modules/pca/test/pca_project.sql_in
@@ -30,8 +30,6 @@ drop table if exists result_table_214712398172490837;
 drop table if exists result_table_214712398172490837_mean;
 select pca_sparse_train('sparse_mat', 'result_table_214712398172490837',
 'row_id', 'col_id', 'val_id', 10, 10, 10);
-select * from result_table_214712398172490837;
-
 
 drop table if exists out_table_214712398172490837;
 select pca_sparse_project( 'sparse_mat',
@@ -120,6 +118,7 @@ select pca_project( 'mat',
 
 drop table if exists out_table_214712398172490837;
 drop table if exists residual_table_214712398172490837;
+drop table if exists result_summary_table_214712398172490837;
 select pca_project( 'mat',
     'result_table_214712398172490837',
     'out_table_214712398172490837',
@@ -172,4 +171,129 @@ select pca_project( 'cmat',
                     'out_table_214712398172490837',
                     'row_id');
 
+-- Sparse matrix with grouping column
+DROP TABLE IF EXISTS sparse_mat;
+CREATE TABLE sparse_mat (
+    id integer,
+    col_id integer,
+    val_id integer,
+    grp    integer
+);
+COPY sparse_mat (id, col_id, val_id, grp) FROM stdin delimiter '|';
+1|2|4|1
+1|5|6|1
+3|8|4|1
+5|4|2|1
+6|6|12|2
+8|7|2|2
+8|1|2|2
+9|8|2|2
+9|3|4|2
+\.
+-- project sparse matrix using model learnt with grouping_cols
+drop table if exists result_table_214712398172490837;
+drop table if exists result_table_214712398172490837_mean;
+select pca_sparse_train('sparse_mat', 'result_table_214712398172490837',
+'id', 'col_id', 'val_id', 10, 10, 10, 'grp');
+
+drop table if exists out_table_214712398172490837;
+drop table if exists residual_table_214712398172490837;
+drop table if exists summary_table_214712398172490837;
+select pca_sparse_project( 'sparse_mat',
+                    'result_table_214712398172490837',
+                    'out_table_214712398172490837',
+                    'id',
+                    'col_id',
+                    'val_id',
+                    10,
+                    10,
+                    'residual_table_214712398172490837',
+                    'summary_table_214712398172490837');
+
+-- Test data (Identical to SVD) with grouping column
+DROP TABLE IF EXISTS mat;
+CREATE TABLE mat (
+    id integer,
+    row_vec double precision[],
+    grp integer
+);
+
+COPY mat (id, row_vec, grp) FROM stdin delimiter '|';
+1|{396,840,353,446,318,886,15,584,159,383}|1
+2|{691,58,899,163,159,533,604,582,269,390}|1
+3|{293,742,298,75,404,857,941,662,846,2}|1
+4|{462,532,787,265,982,306,600,608,212,885}|1
+5|{304,151,337,387,643,753,603,531,459,652}|1
+6|{327,946,368,943,7,516,272,24,591,204}|1
+7|{877,59,260,302,891,498,710,286,864,675}|1
+8|{458,959,774,376,228,354,300,669,718,565}|2
+9|{824,390,818,844,180,943,424,520,65,913}|2
+10|{882,761,398,688,761,405,125,484,222,873}|2
+11|{528,1,860,18,814,242,314,965,935,809}|2
+12|{492,220,576,289,321,261,173,1,44,241}|2
+13|{415,701,221,503,67,393,479,218,219,916}|2
+14|{350,192,211,633,53,783,30,444,176,932}|2
+15|{909,472,871,695,930,455,398,893,693,838}|2
+16|{739,651,678,577,273,935,661,47,373,618}|2
+\.
+-- project dense matrix using model learnt with grouping_cols
+drop table if exists result_table_214712398172490837;
+drop table if exists result_table_214712398172490837_mean;
+select pca_train('mat', 'result_table_214712398172490837', 'id', 5, 'grp');
+
+drop table if exists out_table_214712398172490837;
+drop table if exists residual_table_214712398172490837;
+drop table if exists result_summary_table_214712398172490837;
+select pca_project( 'mat',
+    'result_table_214712398172490837',
+    'out_table_214712398172490837',
+    'id',
+    'residual_table_214712398172490837',
+    'result_summary_table_214712398172490837');
+
+
+-- Matrix in the column format with grouping column
+DROP TABLE IF EXISTS cmat;
+CREATE TABLE cmat (
+    id  integer,
+    val0    float8,
+    val1    float8,
+    val2    float8,
+    val3    float8,
+    val4    float8,
+    val5    float8,
+    val6    float8,
+    val7    float8,
+    val8    float8,
+    val9    float8,
+    grp     integer
+);
+
+COPY cmat (id, val0, val1, val2, val3, val4, val5, val6, val7, val8, val9, grp) FROM stdin delimiter '|';
+1|396|840|353|446|318|886|15|584|159|383|1
+2|691|58|899|163|159|533|604|582|269|390|1
+3|293|742|298|75|404|857|941|662|846|2|1
+4|462|532|787|265|982|306|600|608|212|885|1
+5|304|151|337|387|643|753|603|531|459|652|1
+6|327|946|368|943|7|516|272|24|591|204|1
+7|877|59|260|302|891|498|710|286|864|675|2
+8|458|959|774|376|228|354|300|669|718|565|2
+9|824|390|818|844|180|943|424|520|65|913|2
+10|882|761|398|688|761|405|125|484|222|873|2
+11|528|1|860|18|814|242|314|965|935|809|2
+12|492|220|576|289|321|261|173|1|44|241|2
+13|415|701|221|503|67|393|479|218|219|916|2
+14|350|192|211|633|53|783|30|444|176|932|2
+15|909|472|871|695|930|455|398|893|693|838|2
+16|739|651|678|577|273|935|661|47|373|618|2
+\.
+-- project column-format matrix using model learnt with grouping_cols
+drop table if exists result_table_214712398172490837;
+drop table if exists result_table_214712398172490837_mean;
+select pca_train('cmat', 'result_table_214712398172490837', 'id', 5, 'grp');
 
+drop table if exists out_table_214712398172490837;
+select pca_project( 'cmat',
+                    'result_table_214712398172490837',
+                    'out_table_214712398172490837',
+                    'id');

[37/50] [abbrv] incubator-madlib git commit: Release v1.10:

Posted by ri...@apache.org.

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/90f4dc15/src/madpack/changelist_1.7_1.9.1.yaml
----------------------------------------------------------------------
diff --git a/src/madpack/changelist_1.7_1.9.1.yaml b/src/madpack/changelist_1.7_1.9.1.yaml
deleted file mode 100644
index 0134607..0000000
--- a/src/madpack/changelist_1.7_1.9.1.yaml
+++ /dev/null
@@ -1,836 +0,0 @@
-# Changelist for MADlib version 1.7 to 1.7.1
-
-# This file contains all changes that were introduced in a new version of
-# MADlib. This changelist is used by the upgrade script to detect what objects
-# should be upgraded (while retaining all other objects from the previous version)
-
-# New modules (actually .sql_in files) added in upgrade version
-# For these files the sql_in code is retained as is with the functions in the
-# file installed on the upgrade version. All other files (that don't have
-# updates), are cleaned up to remove object replacements
-new module:
-    # ----------------- Changes from 1.7 to 1.7.1 ----------
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    text_utilities:
-
-# Changes in the types (UDT) including removal and modification
-udt:
-
-    # ----------------- Changes from 1.7 to 1.7.1 ----------
-    # ----------------- Changes from 1.8 to 1.9 ----------
-    __enc_tbl_result:
-    __gen_acc_time:
-    __rep_type:
-    __train_result:
-    c45_classify_result:
-    c45_train_result:
-    correlation_result:
-    lsvm_sgd_model_rec:
-    lsvm_sgd_result:
-    rf_classify_result:
-    rf_train_result:
-    svm_cls_result:
-    svm_model_pr:
-    svm_model_rec:
-    svm_nd_result:
-    svm_reg_result:
-    svm_support_vector:
-    _prune_result_type:
-    _tree_result_type:
-    linear_svm_result:
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    profile_result:
-
-# List of the UDF changes that affect the user externally. This includes change
-# in function name, return type, argument order or types, or removal of
-# the function. In each case, the original function is as good as removed and a
-# new function is created. In such cases, we should abort the upgrade if there
-# are user views dependent on this function, since the original function will
-# not be present in the upgraded version.
-udf:
-    # ----------------- Changes from 1.7 to 1.7.1 ----------
-    - discrete_distribution_merge: # replaced by array_add
-        rettype: double precision[]
-        argument: double precision[], double precision[]
-    - _compute_leaf_stats_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8, integer[], double precision[], double precision, double precision, integer[], schema_madlib.bytea8, smallint
-    - _compute_surr_stats_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8, integer[], double precision[], integer[], schema_madlib.bytea8
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    - __filter_input_relation:
-        rettype: character varying
-        argument: character varying, character varying
-    - __lda_util_unnest:
-        rettype: SETOF bigint[]
-        argument: bigint[]
-    - matrix_block_mult:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text
-    - matrix_block_square:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - matrix_block_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - matrix_block_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, boolean
-    - matrix_blockize:
-        rettype: schema_madlib.matrix_result
-        argument: text, integer, integer, text
-    - matrix_densify:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text, text, text, boolean
-    - matrix_mult:
-        rettype: schema_madlib.matrix_result
-        argument: text, boolean, text, boolean, text
-    - matrix_mult:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text, text, boolean, text, text, text, text, boolean, text
-    - matrix_norm:
-        rettype: double precision
-        argument: text
-    - matrix_scale_and_add:
-        rettype: void
-        argument: text, text, double precision, text
-    - matrix_sparsify:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, boolean
-    - matrix_square:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - matrix_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, boolean
-    - matrix_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text, text, text, boolean
-    - matrix_unblockize:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    # ----------------- Changes from 1.8 to 1.9 ----------
-
-    - _dt_apply:
-       rettype: schema_madlib._tree_result_type
-       argument: schema_madlib.bytea8,schema_madlib.bytea8,schema_madlib.bytea8,smallint,smallint,smallint,boolean,integer
-
-    - internal_linear_svm_igd_result:
-       rettype: schema_madlib.linear_svm_result
-       argument: double precision[]
-
-    - _prune_and_cplist:
-       rettype: schema_madlib._prune_result_type
-       argument: schema_madlib.bytea8,double precision,boolean
-
-    - __array_elem_in:
-       rettype: boolean[]
-       argument: anyarray, anyarray
-
-    - __array_indexed_agg_ffunc:
-       rettype: double precision[]
-       argument: double precision[]
-
-    - __array_indexed_agg_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __array_indexed_agg_sfunc:
-       rettype: double precision[]
-       argument: double precision[], double precision, bigint, bigint
-
-    - __array_search:
-       rettype: boolean
-       argument: anyelement, anyarray
-
-    - __array_sort:
-       rettype: anyarray
-       argument: anyarray
-
-    - __assert:
-       rettype: void
-       argument: boolean, text
-
-    - __assert_table:
-       rettype: void
-       argument: text, boolean
-
-    - __best_scv_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __best_scv_sfunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[], integer, double precision
-
-    - __bigint_array_add:
-       rettype: bigint[]
-       argument: bigint[], bigint[]
-
-    - __breakup_table:
-       rettype: void
-       argument: text, text, text, text, text, text[], boolean[], integer, integer
-
-    - __check_dt_common_params:
-       rettype: void
-       argument: text, text, text, text, text, text, text, text, integer, double precision, double precision, integer, text
-
-    - __check_training_table:
-       rettype: void
-       argument: text, text[], text[], text, text, integer
-
-    - __column_exists:
-       rettype: boolean
-       argument: text, text
-
-    - __columns_in_table:
-       rettype: boolean
-       argument: text[], text
-
-    - __create_metatable:
-       rettype: void
-       argument: text
-
-    - __create_tree_tables:
-       rettype: void
-       argument: text
-
-    - __csvstr_to_array:
-       rettype: text[]
-       argument: text
-
-    - __display_node_sfunc:
-       rettype: text
-       argument: text, integer, boolean, text, text, double precision, double precision, text, integer
-
-    - __display_tree_no_ordered_aggr:
-       rettype: text
-       argument: text, integer, integer, integer, boolean, double precision, text, integer, integer
-
-    - __distinct_feature_value:
-       rettype: integer
-       argument: text, integer
-
-    - __drop_metatable:
-       rettype: void
-       argument: text
-
-    - __dt_acc_count_sfunc:
-       rettype: bigint[]
-       argument: bigint[], integer, bigint, integer
-
-    - __dt_get_node_split_fids:
-       rettype: integer[]
-       argument: integer, integer, integer, integer[]
-
-    - __ebp_calc_errors:
-       rettype: double precision
-       argument: double precision, double precision, double precision
-
-    - __ebp_prune_tree:
-       rettype: void
-       argument: text
-
-    - __encode_and_train:
-       rettype: record
-       argument: text, text, integer, integer, text, text, text, text, text, text, text, double precision, text, integer, double precision, boolean, double precision, double precision, text, integer
-
-    - __encode_columns:
-       rettype: void
-       argument: text, text, integer, integer
-
-    - __encode_table:
-       rettype: void
-       argument: text, text, text, integer, integer
-
-    - __encode_table:
-       rettype: void
-       argument: text, text, text[], text, text[], text, text, integer, integer
-
-    - __find_best_split:
-       rettype: void
-       argument: text, double precision, text, integer, integer, text, integer, integer
-
-    - __format:
-       rettype: text
-       argument: text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text[]
-
-    - __gen_acc:
-       rettype: __gen_acc_time
-       argument: text, text, text, text, text, integer, integer, boolean, integer
-
-    - __gen_enc_meta_names:
-       rettype: text[]
-       argument: text, text
-
-    - __gen_horizontal_encoded_table:
-       rettype: void
-       argument: text, text, integer, integer
-
-    - __gen_vertical_encoded_table:
-       rettype: void
-       argument: text, text, text, boolean, integer
-
-    - __generate_final_tree:
-       rettype: void
-       argument: text
-
-    - __get_class_column_name:
-       rettype: text
-       argument: text
-
-    - __get_class_value:
-       rettype: text
-       argument: integer, text
-
-    - __get_classtable_name:
-       rettype: text
-       argument: text
-
-    - __get_column_value:
-       rettype: text
-       argument: integer, integer, character, text
-
-    - __get_feature_name:
-       rettype: text
-       argument: integer, text
-
-    - __get_feature_value:
-       rettype: text
-       argument: integer, integer, text
-
-    - __get_features_of_nodes:
-       rettype: text
-       argument: text, text, integer, integer, integer
-
-    - __get_id_column_name:
-       rettype: text
-       argument: text
-
-    - __get_schema_name:
-       rettype: text
-       argument: text
-
-    - __get_table_name:
-       rettype: text
-       argument: text
-
-    - __insert_into_metatable:
-       rettype: void
-       argument: text, integer, text, character, boolean, text, integer
-
-    - __is_valid_enc_table:
-       rettype: boolean
-       argument: text
-
-    - __num_of_class:
-       rettype: integer
-       argument: text
-
-    - __num_of_columns:
-       rettype: integer
-       argument: text
-
-    - __num_of_feature:
-       rettype: integer
-       argument: text
-
-    - __regclass_to_text:
-       rettype: text
-       argument: regclass
-
-    - __rename_table:
-       rettype: void
-       argument: text, text
-
-    - __rep_aggr_class_count_ffunc:
-       rettype: bigint[]
-       argument: bigint[]
-
-    - __rep_aggr_class_count_sfunc:
-       rettype: bigint[]
-       argument: bigint[], integer, integer, integer
-
-    - __rep_prune_tree:
-       rettype: void
-       argument: text, text, integer
-
-    - __sample_with_replacement:
-       rettype: void
-       argument: integer, bigint, text, text
-
-    - __sample_within_range:
-       rettype: SETOF bigint
-       argument: bigint, bigint, bigint
-
-    - __scv_aggr_ffunc:
-       rettype: double precision[]
-       argument: double precision[]
-
-    - __scv_aggr_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __scv_aggr_sfunc:
-       rettype: double precision[]
-       argument: double precision[], integer, boolean, integer, double precision[], double precision[], bigint
-
-    - __strip_schema_name:
-       rettype: text
-       argument: text
-
-    - __svm_random_ind2:
-       rettype: double precision[]
-       argument: integer
-
-    - __svm_random_ind:
-       rettype: double precision[]
-       argument: integer
-
-    - __svm_target_cl_func:
-       rettype: double precision
-       argument: double precision[]
-
-    - __svm_target_reg_func:
-       rettype: double precision
-       argument: double precision[]
-
-    - __table_exists:
-       rettype: boolean
-       argument: text
-
-    - __train_tree:
-       rettype: __train_result
-       argument: text, integer, integer, text, text, text, text, text, text, double precision, integer, double precision, double precision, double precision, boolean, integer, integer
-
-    - __treemodel_classify_internal:
-       rettype: text[]
-       argument: text, text, integer
-
-    - __treemodel_classify_internal_serial:
-       rettype: text[]
-       argument: text, text, integer
-
-    - __treemodel_display_no_ordered_aggr:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - __treemodel_display_with_ordered_aggr:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - __treemodel_get_vote_result:
-       rettype: void
-       argument: text, text
-
-    - __treemodel_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - __validate_input_table:
-       rettype: void
-       argument: text, text[], text, text
-
-    - __validate_metatable:
-       rettype: void
-       argument: text
-
-    - c45_classify:
-       rettype: c45_classify_result
-       argument: text, text, text
-
-    - c45_classify:
-       rettype: c45_classify_result
-       argument: text, text, text, integer
-
-    - c45_clean:
-       rettype: boolean
-       argument: text
-
-    - c45_display:
-       rettype: SETOF text
-       argument: text
-
-    - c45_display:
-       rettype: SETOF text
-       argument: text, integer
-
-    - c45_genrule:
-       rettype: SETOF text
-       argument: text
-
-    - c45_genrule:
-       rettype: SETOF text
-       argument: text, integer
-
-    - c45_score:
-       rettype: double precision
-       argument: text, text
-
-    - c45_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text, text, text, text, text, text, double precision, text
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text, text, text, text, text, text, double precision, text, integer, double precision, double precision, integer
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying, character varying
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying, character varying, boolean
-
-    - linear_svm_igd_transition:
-       rettype: double precision[]
-       argument: double precision[], double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean, boolean, double precision, double precision
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean, boolean, double precision, double precision, integer
-
-    - lsvm_predict:
-       rettype: double precision
-       argument: double precision[], double precision[]
-
-    - lsvm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text
-
-    - lsvm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text, boolean
-
-    - matrix_block_trans:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_densify:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_sparsify:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_trans:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text, boolean, integer
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text, integer
-
-    - rf_clean:
-       rettype: boolean
-       argument: text
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text, integer[]
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - rf_score:
-       rettype: double precision
-       argument: text, text
-
-    - rf_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - rf_train:
-       rettype: rf_train_result
-       argument: text, text, text
-
-    - rf_train:
-       rettype: rf_train_result
-       argument: text, text, text, integer, integer, double precision, text, text, text, text, text, integer, double precision, double precision, integer
-
-    - svdmf_run:
-       rettype: text
-       argument: text, text, text, text, integer
-
-    - svdmf_run:
-       rettype: text
-       argument: text, text, text, text, integer, integer, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, boolean, double precision, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, double precision
-
-    - svm_cls_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision, double precision
-
-    - svm_data_normalization:
-       rettype: void
-       argument: text
-
-    - svm_dot:
-       rettype: double precision
-       argument: double precision[], double precision[]
-
-    - svm_dot:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_drop_model:
-       rettype: void
-       argument: text
-
-    - svm_gaussian:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_generate_cls_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_generate_nd_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_generate_reg_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_nd_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], text, double precision, double precision, double precision
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text, boolean, double precision, double precision
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_polynomial:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_predict:
-       rettype: double precision
-       argument: schema_madlib.svm_model_rec, double precision[], text, double precision
-
-    - svm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text, boolean
-
-    - svm_predict_sub:
-       rettype: double precision
-       argument: integer, integer, double precision[], double precision[], double precision[], text, double precision
-
-    - svm_reg_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision, double precision, double precision
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision, double precision
-
-    - svm_store_model:
-       rettype: void
-       argument: text, text, text
-
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    - array_collapse:
-        rettype: anyarray
-        argument: anyarray
-    - linear_svm_igd_transition:
-        rettype: double precision[]
-        argument: double precision[], double precision[], double precision, double precision[], integer, double precision, double precision, boolean, integer, double precision, boolean
-    - profile:
-        rettype: SETOF schema_madlib.profile_result
-        argument: text
-    - profile_full:
-        rettype: SETOF schema_madlib.profile_result
-        argument: text, integer
-    - profile:
-        rettype: schema_madlib.profile_result
-        argument: text
-    - profile_full:
-        rettype: schema_madlib.profile_result
-        argument: text, integer
-    - quantile:
-        rettype: double precision
-        argument: text, text, double precision
-    - quantile_big:
-        rettype: double precision
-        argument: text, text, double precision
-
-# Changes to aggregates (UDA) including removal and modification
-# Overloaded functions should be mentioned separately
-uda:
-    # ----------------- Changes from 1.7 to 1.7.1 ----------
-    - discrete_distribution_agg: # merge function replaced
-        rettype: double precision[]
-        argument: integer, double precision, integer
-    - _compute_leaf_stats: # dup_count added
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, integer[], double precision[], double precision, double precision, integer[], schema_madlib.bytea8, smallint
-    - _compute_surr_stats: # dup_count added
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, integer[], double precision[], integer[], schema_madlib.bytea8
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    - discrete_distribution_agg:
-        rettype: double precision[]
-        argument: integer, double precision, integer
-    - vectorized_distribution_agg:
-        rettype: double precision[]
-        argument: integer[], integer[]
-    # ----------------- Changes from 1.8 to 1.9 ----------
-
-    - __array_indexed_agg:
-        rettype: double precision[]
-        argument: double precision, bigint, bigint
-
-    - __best_scv_aggr:
-        rettype: double precision[]
-        argument: double precision[], integer, double precision
-
-    - __bigint_array_sum:
-        rettype: bigint[]
-        argument: bigint[]
-
-    - __display_tree_aggr:
-        rettype: text
-        argument: integer, boolean, text, text, double precision, double precision, text, integer
-
-    - __dt_acc_count_aggr:
-        rettype: bigint[]
-        argument: integer, bigint, integer
-
-    - __rep_aggr_class_count:
-        rettype: bigint[]
-        argument: integer, integer, integer
-
-    - __scv_aggr:
-        rettype: double precision[]
-        argument: integer, boolean, integer, double precision[], double precision[], bigint
-
-    - linear_svm_igd_step:
-        rettype: double precision[]
-        argument: double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - linear_svm_igd_step_serial:
-        rettype: double precision[]
-        argument: double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - svm_cls_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision, double precision
-
-    - svm_nd_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], text, double precision, double precision, double precision
-
-    - svm_reg_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision, double precision, double precision
-
-    - __svm_random_ind2:
-        rettype: double precision[]
-        argument: integer
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    - array_agg:
-        rettype: anyarray
-        argument: anyelement
-    - linear_svm_igd_step:
-       rettype: double precision[]
-       argument: double precision[], double precision, double precision[], integer, double precision, double precision, boolean, integer, double precision, boolean
-
-# Casts (UDC) updated/removed
-udc:
-    # ----------------- Changes from 1.7 to 1.7.1 ----------
-
-# Operators (UDO) removed/updated
-udo:
-    # ----------------- Changes from 1.7 to 1.7.1 ----------
-
-# Operator Classes (UDOC) removed/updated
-udoc:
-    # ----------------- Changes from 1.7 to 1.7.1 ----------

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/90f4dc15/src/madpack/changelist_1.8_1.10.yaml
----------------------------------------------------------------------
diff --git a/src/madpack/changelist_1.8_1.10.yaml b/src/madpack/changelist_1.8_1.10.yaml
new file mode 100644
index 0000000..d85877b
--- /dev/null
+++ b/src/madpack/changelist_1.8_1.10.yaml
@@ -0,0 +1,857 @@
+# ------------------------------------------------------------------------------
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# ------------------------------------------------------------------------------
+
+# Changelist for MADlib version 1.8 to 1.10
+
+# This file contains all changes that were introduced in a new version of
+# MADlib. This changelist is used by the upgrade script to detect what objects
+# should be upgraded (while retaining all other objects from the previous version)
+
+# New modules (actually .sql_in files) added in upgrade version
+# For these files the sql_in code is retained as is with the functions in the
+# file installed on the upgrade version. All other files (that don't have
+# updates), are cleaned up to remove object replacements
+new module:
+    # ----------------- Changes from 1.9.1 to 1.10 ----------
+    sssp:
+    encode_categorical:
+    knn:
+
+# Changes in the types (UDT) including removal and modification
+udt:
+
+    # ----------------- Changes from 1.8 to 1.9 ----------
+    __enc_tbl_result:
+    __gen_acc_time:
+    __rep_type:
+    __train_result:
+    c45_classify_result:
+    c45_train_result:
+    correlation_result:
+    lsvm_sgd_model_rec:
+    lsvm_sgd_result:
+    rf_classify_result:
+    rf_train_result:
+    svm_cls_result:
+    svm_model_pr:
+    svm_model_rec:
+    svm_nd_result:
+    svm_reg_result:
+    svm_support_vector:
+    _prune_result_type:
+    _tree_result_type:
+    linear_svm_result:
+    # ----------------- Changes from 1.9 to 1.9.1 ----------
+    profile_result:
+        # ----------------- Changes from 1.9.1 to 1.10 ----------
+    _tree_result_type:
+    _prune_result_type:
+    kmeans_result:
+    kmeans_state:
+
+# List of the UDF changes that affect the user externally. This includes change
+# in function name, return type, argument order or types, or removal of
+# the function. In each case, the original function is as good as removed and a
+# new function is created. In such cases, we should abort the upgrade if there
+# are user views dependent on this function, since the original function will
+# not be present in the upgraded version.
+udf:
+
+    # ----------------- Changes from 1.8 to 1.9 ----------
+    - _dt_apply:
+       rettype: schema_madlib._tree_result_type
+       argument: schema_madlib.bytea8,schema_madlib.bytea8,schema_madlib.bytea8,smallint,smallint,smallint,boolean,integer
+
+    - internal_linear_svm_igd_result:
+       rettype: schema_madlib.linear_svm_result
+       argument: double precision[]
+
+    - _prune_and_cplist:
+       rettype: schema_madlib._prune_result_type
+       argument: schema_madlib.bytea8,double precision,boolean
+
+    - __array_elem_in:
+       rettype: boolean[]
+       argument: anyarray, anyarray
+
+    - __array_indexed_agg_ffunc:
+       rettype: double precision[]
+       argument: double precision[]
+
+    - __array_indexed_agg_prefunc:
+       rettype: double precision[]
+       argument: double precision[], double precision[]
+
+    - __array_indexed_agg_sfunc:
+       rettype: double precision[]
+       argument: double precision[], double precision, bigint, bigint
+
+    - __array_search:
+       rettype: boolean
+       argument: anyelement, anyarray
+
+    - __array_sort:
+       rettype: anyarray
+       argument: anyarray
+
+    - __assert:
+       rettype: void
+       argument: boolean, text
+
+    - __assert_table:
+       rettype: void
+       argument: text, boolean
+
+    - __best_scv_prefunc:
+       rettype: double precision[]
+       argument: double precision[], double precision[]
+
+    - __best_scv_sfunc:
+       rettype: double precision[]
+       argument: double precision[], double precision[], integer, double precision
+
+    - __bigint_array_add:
+       rettype: bigint[]
+       argument: bigint[], bigint[]
+
+    - __breakup_table:
+       rettype: void
+       argument: text, text, text, text, text, text[], boolean[], integer, integer
+
+    - __check_dt_common_params:
+       rettype: void
+       argument: text, text, text, text, text, text, text, text, integer, double precision, double precision, integer, text
+
+    - __check_training_table:
+       rettype: void
+       argument: text, text[], text[], text, text, integer
+
+    - __column_exists:
+       rettype: boolean
+       argument: text, text
+
+    - __columns_in_table:
+       rettype: boolean
+       argument: text[], text
+
+    - __create_metatable:
+       rettype: void
+       argument: text
+
+    - __create_tree_tables:
+       rettype: void
+       argument: text
+
+    - __csvstr_to_array:
+       rettype: text[]
+       argument: text
+
+    - __display_node_sfunc:
+       rettype: text
+       argument: text, integer, boolean, text, text, double precision, double precision, text, integer
+
+    - __display_tree_no_ordered_aggr:
+       rettype: text
+       argument: text, integer, integer, integer, boolean, double precision, text, integer, integer
+
+    - __distinct_feature_value:
+       rettype: integer
+       argument: text, integer
+
+    - __drop_metatable:
+       rettype: void
+       argument: text
+
+    - __dt_acc_count_sfunc:
+       rettype: bigint[]
+       argument: bigint[], integer, bigint, integer
+
+    - __dt_get_node_split_fids:
+       rettype: integer[]
+       argument: integer, integer, integer, integer[]
+
+    - __ebp_calc_errors:
+       rettype: double precision
+       argument: double precision, double precision, double precision
+
+    - __ebp_prune_tree:
+       rettype: void
+       argument: text
+
+    - __encode_and_train:
+       rettype: record
+       argument: text, text, integer, integer, text, text, text, text, text, text, text, double precision, text, integer, double precision, boolean, double precision, double precision, text, integer
+
+    - __encode_columns:
+       rettype: void
+       argument: text, text, integer, integer
+
+    - __encode_table:
+       rettype: void
+       argument: text, text, text, integer, integer
+
+    - __encode_table:
+       rettype: void
+       argument: text, text, text[], text, text[], text, text, integer, integer
+
+    - __find_best_split:
+       rettype: void
+       argument: text, double precision, text, integer, integer, text, integer, integer
+
+    - __format:
+       rettype: text
+       argument: text, text
+
+    - __format:
+       rettype: text
+       argument: text, text, text
+
+    - __format:
+       rettype: text
+       argument: text, text, text, text
+
+    - __format:
+       rettype: text
+       argument: text, text, text, text, text
+
+    - __format:
+       rettype: text
+       argument: text, text[]
+
+    - __gen_acc:
+       rettype: __gen_acc_time
+       argument: text, text, text, text, text, integer, integer, boolean, integer
+
+    - __gen_enc_meta_names:
+       rettype: text[]
+       argument: text, text
+
+    - __gen_horizontal_encoded_table:
+       rettype: void
+       argument: text, text, integer, integer
+
+    - __gen_vertical_encoded_table:
+       rettype: void
+       argument: text, text, text, boolean, integer
+
+    - __generate_final_tree:
+       rettype: void
+       argument: text
+
+    - __get_class_column_name:
+       rettype: text
+       argument: text
+
+    - __get_class_value:
+       rettype: text
+       argument: integer, text
+
+    - __get_classtable_name:
+       rettype: text
+       argument: text
+
+    - __get_column_value:
+       rettype: text
+       argument: integer, integer, character, text
+
+    - __get_feature_name:
+       rettype: text
+       argument: integer, text
+
+    - __get_feature_value:
+       rettype: text
+       argument: integer, integer, text
+
+    - __get_features_of_nodes:
+       rettype: text
+       argument: text, text, integer, integer, integer
+
+    - __get_id_column_name:
+       rettype: text
+       argument: text
+
+    - __get_schema_name:
+       rettype: text
+       argument: text
+
+    - __get_table_name:
+       rettype: text
+       argument: text
+
+    - __insert_into_metatable:
+       rettype: void
+       argument: text, integer, text, character, boolean, text, integer
+
+    - __is_valid_enc_table:
+       rettype: boolean
+       argument: text
+
+    - __num_of_class:
+       rettype: integer
+       argument: text
+
+    - __num_of_columns:
+       rettype: integer
+       argument: text
+
+    - __num_of_feature:
+       rettype: integer
+       argument: text
+
+    - __regclass_to_text:
+       rettype: text
+       argument: regclass
+
+    - __rename_table:
+       rettype: void
+       argument: text, text
+
+    - __rep_aggr_class_count_ffunc:
+       rettype: bigint[]
+       argument: bigint[]
+
+    - __rep_aggr_class_count_sfunc:
+       rettype: bigint[]
+       argument: bigint[], integer, integer, integer
+
+    - __rep_prune_tree:
+       rettype: void
+       argument: text, text, integer
+
+    - __sample_with_replacement:
+       rettype: void
+       argument: integer, bigint, text, text
+
+    - __sample_within_range:
+       rettype: SETOF bigint
+       argument: bigint, bigint, bigint
+
+    - __scv_aggr_ffunc:
+       rettype: double precision[]
+       argument: double precision[]
+
+    - __scv_aggr_prefunc:
+       rettype: double precision[]
+       argument: double precision[], double precision[]
+
+    - __scv_aggr_sfunc:
+       rettype: double precision[]
+       argument: double precision[], integer, boolean, integer, double precision[], double precision[], bigint
+
+    - __strip_schema_name:
+       rettype: text
+       argument: text
+
+    - __svm_random_ind2:
+       rettype: double precision[]
+       argument: integer
+
+    - __svm_random_ind:
+       rettype: double precision[]
+       argument: integer
+
+    - __svm_target_cl_func:
+       rettype: double precision
+       argument: double precision[]
+
+    - __svm_target_reg_func:
+       rettype: double precision
+       argument: double precision[]
+
+    - __table_exists:
+       rettype: boolean
+       argument: text
+
+    - __train_tree:
+       rettype: __train_result
+       argument: text, integer, integer, text, text, text, text, text, text, double precision, integer, double precision, double precision, double precision, boolean, integer, integer
+
+    - __treemodel_classify_internal:
+       rettype: text[]
+       argument: text, text, integer
+
+    - __treemodel_classify_internal_serial:
+       rettype: text[]
+       argument: text, text, integer
+
+    - __treemodel_display_no_ordered_aggr:
+       rettype: SETOF text
+       argument: text, integer[], integer
+
+    - __treemodel_display_with_ordered_aggr:
+       rettype: SETOF text
+       argument: text, integer[], integer
+
+    - __treemodel_get_vote_result:
+       rettype: void
+       argument: text, text
+
+    - __treemodel_score:
+       rettype: double precision
+       argument: text, text, integer
+
+    - __validate_input_table:
+       rettype: void
+       argument: text, text[], text, text
+
+    - __validate_metatable:
+       rettype: void
+       argument: text
+
+    - c45_classify:
+       rettype: c45_classify_result
+       argument: text, text, text
+
+    - c45_classify:
+       rettype: c45_classify_result
+       argument: text, text, text, integer
+
+    - c45_clean:
+       rettype: boolean
+       argument: text
+
+    - c45_display:
+       rettype: SETOF text
+       argument: text
+
+    - c45_display:
+       rettype: SETOF text
+       argument: text, integer
+
+    - c45_genrule:
+       rettype: SETOF text
+       argument: text
+
+    - c45_genrule:
+       rettype: SETOF text
+       argument: text, integer
+
+    - c45_score:
+       rettype: double precision
+       argument: text, text
+
+    - c45_score:
+       rettype: double precision
+       argument: text, text, integer
+
+    - c45_train:
+       rettype: c45_train_result
+       argument: text, text, text
+
+    - c45_train:
+       rettype: c45_train_result
+       argument: text, text, text, text, text, text, text, text, double precision, text
+
+    - c45_train:
+       rettype: c45_train_result
+       argument: text, text, text, text, text, text, text, text, double precision, text, integer, double precision, double precision, integer
+
+    - correlation:
+       rettype: correlation_result
+       argument: character varying, character varying
+
+    - correlation:
+       rettype: correlation_result
+       argument: character varying, character varying, character varying
+
+    - correlation:
+       rettype: correlation_result
+       argument: character varying, character varying, character varying, boolean
+
+    - linear_svm_igd_transition:
+       rettype: double precision[]
+       argument: double precision[], double precision[], boolean, double precision[], integer, double precision, double precision
+
+    - lsvm_classification:
+       rettype: SETOF lsvm_sgd_result
+       argument: text, text
+
+    - lsvm_classification:
+       rettype: SETOF lsvm_sgd_result
+       argument: text, text, boolean
+
+    - lsvm_classification:
+       rettype: SETOF lsvm_sgd_result
+       argument: text, text, boolean, boolean, double precision, double precision
+
+    - lsvm_classification:
+       rettype: SETOF lsvm_sgd_result
+       argument: text, text, boolean, boolean, double precision, double precision, integer
+
+    - lsvm_predict:
+       rettype: double precision
+       argument: double precision[], double precision[]
+
+    - lsvm_predict_batch:
+       rettype: text
+       argument: text, text, text, text, text
+
+    - lsvm_predict_batch:
+       rettype: text
+       argument: text, text, text, text, text, boolean
+
+    - matrix_block_trans:
+       rettype: matrix_result
+       argument: text, text, text, text, boolean
+
+    - matrix_densify:
+       rettype: matrix_result
+       argument: text, text, text, text, boolean
+
+    - matrix_sparsify:
+       rettype: matrix_result
+       argument: text, text, text, text, boolean
+
+    - matrix_trans:
+       rettype: matrix_result
+       argument: text, text, text, text, boolean
+
+    - rf_classify:
+       rettype: rf_classify_result
+       argument: text, text, text
+
+    - rf_classify:
+       rettype: rf_classify_result
+       argument: text, text, text, boolean, integer
+
+    - rf_classify:
+       rettype: rf_classify_result
+       argument: text, text, text, integer
+
+    - rf_clean:
+       rettype: boolean
+       argument: text
+
+    - rf_display:
+       rettype: SETOF text
+       argument: text
+
+    - rf_display:
+       rettype: SETOF text
+       argument: text, integer[]
+
+    - rf_display:
+       rettype: SETOF text
+       argument: text, integer[], integer
+
+    - rf_score:
+       rettype: double precision
+       argument: text, text
+
+    - rf_score:
+       rettype: double precision
+       argument: text, text, integer
+
+    - rf_train:
+       rettype: rf_train_result
+       argument: text, text, text
+
+    - rf_train:
+       rettype: rf_train_result
+       argument: text, text, text, integer, integer, double precision, text, text, text, text, text, integer, double precision, double precision, integer
+
+    - svdmf_run:
+       rettype: text
+       argument: text, text, text, text, integer
+
+    - svdmf_run:
+       rettype: text
+       argument: text, text, text, text, integer, integer, double precision
+
+    - svm_classification:
+       rettype: SETOF svm_cls_result
+       argument: text, text, boolean, text
+
+    - svm_classification:
+       rettype: SETOF svm_cls_result
+       argument: text, text, boolean, text, boolean, double precision, double precision
+
+    - svm_classification:
+       rettype: SETOF svm_cls_result
+       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
+
+    - svm_classification:
+       rettype: SETOF svm_cls_result
+       argument: text, text, boolean, text, double precision
+
+    - svm_cls_update:
+       rettype: schema_madlib.svm_model_rec
+       argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision, double precision
+
+    - svm_data_normalization:
+       rettype: void
+       argument: text
+
+    - svm_dot:
+       rettype: double precision
+       argument: double precision[], double precision[]
+
+    - svm_dot:
+       rettype: double precision
+       argument: double precision[], double precision[], double precision
+
+    - svm_drop_model:
+       rettype: void
+       argument: text
+
+    - svm_gaussian:
+       rettype: double precision
+       argument: double precision[], double precision[], double precision
+
+    - svm_generate_cls_data:
+       rettype: void
+       argument: text, integer, integer
+
+    - svm_generate_nd_data:
+       rettype: void
+       argument: text, integer, integer
+
+    - svm_generate_reg_data:
+       rettype: void
+       argument: text, integer, integer
+
+    - svm_nd_update:
+       rettype: schema_madlib.svm_model_rec
+       argument: schema_madlib.svm_model_rec, double precision[], text, double precision, double precision, double precision
+
+    - svm_novelty_detection:
+       rettype: SETOF schema_madlib.svm_nd_result
+       argument: text, text, boolean, text
+
+    - svm_novelty_detection:
+       rettype: SETOF schema_madlib.svm_nd_result
+       argument: text, text, boolean, text, boolean, double precision, double precision
+
+    - svm_novelty_detection:
+       rettype: SETOF schema_madlib.svm_nd_result
+       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
+
+    - svm_polynomial:
+       rettype: double precision
+       argument: double precision[], double precision[], double precision
+
+    - svm_predict:
+       rettype: double precision
+       argument: schema_madlib.svm_model_rec, double precision[], text, double precision
+
+    - svm_predict_batch:
+       rettype: text
+       argument: text, text, text, text, text, boolean
+
+    - svm_predict_sub:
+       rettype: double precision
+       argument: integer, integer, double precision[], double precision[], double precision[], text, double precision
+
+    - svm_reg_update:
+       rettype: schema_madlib.svm_model_rec
+       argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision, double precision, double precision
+
+    - svm_regression:
+       rettype: SETOF svm_reg_result
+       argument: text, text, boolean, text
+
+    - svm_regression:
+       rettype: SETOF svm_reg_result
+       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
+
+    - svm_regression:
+       rettype: SETOF svm_reg_result
+       argument: text, text, boolean, text, boolean, double precision, double precision, double precision, double precision
+
+    - svm_store_model:
+       rettype: void
+       argument: text, text, text
+
+    # ----------------- Changes from 1.9 to 1.9.1 ----------
+    - array_collapse:
+        rettype: anyarray
+        argument: anyarray
+    - linear_svm_igd_transition:
+        rettype: double precision[]
+        argument: double precision[], double precision[], double precision, double precision[], integer, double precision, double precision, boolean, integer, double precision, boolean
+    - profile:
+        rettype: SETOF schema_madlib.profile_result
+        argument: text
+    - profile_full:
+        rettype: SETOF schema_madlib.profile_result
+        argument: text, integer
+    - profile:
+        rettype: schema_madlib.profile_result
+        argument: text
+    - profile_full:
+        rettype: schema_madlib.profile_result
+        argument: text, integer
+    - quantile:
+        rettype: double precision
+        argument: text, text, double precision
+    - quantile_big:
+        rettype: double precision
+        argument: text, text, double precision
+
+    # ----------------- Changes from 1.9.1 to 1.10 ----------
+    - _dt_apply:
+        rettype: schema_madlib._tree_result_type
+        argument: schema_madlib.bytea8, schema_madlib.bytea8, schema_madlib.bytea8, smallint, smallint, smallint, boolean, integer
+    - _prune_and_cplist:
+        rettype: schema_madlib._prune_result_type
+        argument: schema_madlib.bytea8, double precision, boolean
+    - kmeans:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, double precision[], character varying, character varying, integer, double precision
+    - kmeans:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, double precision[], character varying, character varying, integer
+    - kmeans:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, double precision[], character varying, character varying
+    - kmeans:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, double precision[], character varying
+    - kmeans:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, double precision[]
+    - kmeans:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, character varying, character varying, character varying, character varying, integer, double precision
+    - kmeans:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, character varying, character varying, character varying, character varying, integer
+    - kmeans:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, character varying, character varying, character varying, character varying
+    - kmeans:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, character varying, character varying, character varying
+    - kmeans:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, character varying, character varying
+    - kmeanspp:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, integer, character varying, character varying, integer, double precision, double precision
+    - kmeanspp:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, integer, character varying, character varying, integer, double precision
+    - kmeanspp:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, integer, character varying, character varying, integer
+    - kmeanspp:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, integer, character varying, character varying
+    - kmeanspp:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, integer, character varying
+    - kmeanspp:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, integer
+    - kmeans_random:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, integer, character varying, character varying, integer, double precision
+    - kmeans_random:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, integer, character varying, character varying, integer
+    - kmeans_random:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, integer, character varying, character varying
+    - kmeans_random:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, integer, character varying
+    - kmeans_random:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, integer
+    - internal_execute_using_kmeans_args:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, character varying, character varying, character varying, integer, double precision
+
+
+# Changes to aggregates (UDA) including removal and modification
+# Overloaded functions should be mentioned separately
+uda:
+
+    # ----------------- Changes from 1.8 to 1.9 ----------
+    - __array_indexed_agg:
+        rettype: double precision[]
+        argument: double precision, bigint, bigint
+
+    - __best_scv_aggr:
+        rettype: double precision[]
+        argument: double precision[], integer, double precision
+
+    - __bigint_array_sum:
+        rettype: bigint[]
+        argument: bigint[]
+
+    - __display_tree_aggr:
+        rettype: text
+        argument: integer, boolean, text, text, double precision, double precision, text, integer
+
+    - __dt_acc_count_aggr:
+        rettype: bigint[]
+        argument: integer, bigint, integer
+
+    - __rep_aggr_class_count:
+        rettype: bigint[]
+        argument: integer, integer, integer
+
+    - __scv_aggr:
+        rettype: double precision[]
+        argument: integer, boolean, integer, double precision[], double precision[], bigint
+
+    - linear_svm_igd_step:
+        rettype: double precision[]
+        argument: double precision[], boolean, double precision[], integer, double precision, double precision
+
+    - linear_svm_igd_step_serial:
+        rettype: double precision[]
+        argument: double precision[], boolean, double precision[], integer, double precision, double precision
+
+    - svm_cls_agg:
+        rettype: schema_madlib.svm_model_rec
+        argument: double precision[], double precision, text, double precision, double precision, double precision
+
+    - svm_nd_agg:
+        rettype: schema_madlib.svm_model_rec
+        argument: double precision[], text, double precision, double precision, double precision
+
+    - svm_reg_agg:
+        rettype: schema_madlib.svm_model_rec
+        argument: double precision[], double precision, text, double precision, double precision, double precision, double precision
+
+    - __svm_random_ind2:
+        rettype: double precision[]
+        argument: integer
+
+    # ----------------- Changes from 1.9 to 1.9.1 ----------
+    - array_agg:
+        rettype: anyarray
+        argument: anyelement
+    - linear_svm_igd_step:
+       rettype: double precision[]
+       argument: double precision[], double precision, double precision[], integer, double precision, double precision, boolean, integer, double precision, boolean
+
+# Casts (UDC) updated/removed
+udc:
+    # ----------------- Changes from 1.8 to 1.9 ----------
+
+# Operators (UDO) removed/updated
+udo:
+    # ----------------- Changes from 1.8 to 1.9 ----------
+
+# Operator Classes (UDOC) removed/updated
+udoc:
+    # ----------------- Changes from 1.8 to 1.9 ----------

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/90f4dc15/src/madpack/changelist_1.8_1.9.1.yaml
----------------------------------------------------------------------
diff --git a/src/madpack/changelist_1.8_1.9.1.yaml b/src/madpack/changelist_1.8_1.9.1.yaml
deleted file mode 100644
index 2a50cdb..0000000
--- a/src/madpack/changelist_1.8_1.9.1.yaml
+++ /dev/null
@@ -1,772 +0,0 @@
-# ------------------------------------------------------------------------------
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# ------------------------------------------------------------------------------
-
-# Changelist for MADlib version 1.8 to 1.9.1
-
-# This file contains all changes that were introduced in a new version of
-# MADlib. This changelist is used by the upgrade script to detect what objects
-# should be upgraded (while retaining all other objects from the previous version)
-
-# New modules (actually .sql_in files) added in upgrade version
-# For these files the sql_in code is retained as is with the functions in the
-# file installed on the upgrade version. All other files (that don't have
-# updates), are cleaned up to remove object replacements
-new module:
-# Changes in the types (UDT) including removal and modification
-udt:
-
-    # ----------------- Changes from 1.8 to 1.9 ----------
-    __enc_tbl_result:
-    __gen_acc_time:
-    __rep_type:
-    __train_result:
-    c45_classify_result:
-    c45_train_result:
-    correlation_result:
-    lsvm_sgd_model_rec:
-    lsvm_sgd_result:
-    rf_classify_result:
-    rf_train_result:
-    svm_cls_result:
-    svm_model_pr:
-    svm_model_rec:
-    svm_nd_result:
-    svm_reg_result:
-    svm_support_vector:
-    _prune_result_type:
-    _tree_result_type:
-    linear_svm_result:
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    profile_result:
-
-# List of the UDF changes that affect the user externally. This includes change
-# in function name, return type, argument order or types, or removal of
-# the function. In each case, the original function is as good as removed and a
-# new function is created. In such cases, we should abort the upgrade if there
-# are user views dependent on this function, since the original function will
-# not be present in the upgraded version.
-udf:
-
-    # ----------------- Changes from 1.8 to 1.9 ----------
-    - _dt_apply:
-       rettype: schema_madlib._tree_result_type
-       argument: schema_madlib.bytea8,schema_madlib.bytea8,schema_madlib.bytea8,smallint,smallint,smallint,boolean,integer
-
-    - internal_linear_svm_igd_result:
-       rettype: schema_madlib.linear_svm_result
-       argument: double precision[]
-
-    - _prune_and_cplist:
-       rettype: schema_madlib._prune_result_type
-       argument: schema_madlib.bytea8,double precision,boolean
-
-    - __array_elem_in:
-       rettype: boolean[]
-       argument: anyarray, anyarray
-
-    - __array_indexed_agg_ffunc:
-       rettype: double precision[]
-       argument: double precision[]
-
-    - __array_indexed_agg_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __array_indexed_agg_sfunc:
-       rettype: double precision[]
-       argument: double precision[], double precision, bigint, bigint
-
-    - __array_search:
-       rettype: boolean
-       argument: anyelement, anyarray
-
-    - __array_sort:
-       rettype: anyarray
-       argument: anyarray
-
-    - __assert:
-       rettype: void
-       argument: boolean, text
-
-    - __assert_table:
-       rettype: void
-       argument: text, boolean
-
-    - __best_scv_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __best_scv_sfunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[], integer, double precision
-
-    - __bigint_array_add:
-       rettype: bigint[]
-       argument: bigint[], bigint[]
-
-    - __breakup_table:
-       rettype: void
-       argument: text, text, text, text, text, text[], boolean[], integer, integer
-
-    - __check_dt_common_params:
-       rettype: void
-       argument: text, text, text, text, text, text, text, text, integer, double precision, double precision, integer, text
-
-    - __check_training_table:
-       rettype: void
-       argument: text, text[], text[], text, text, integer
-
-    - __column_exists:
-       rettype: boolean
-       argument: text, text
-
-    - __columns_in_table:
-       rettype: boolean
-       argument: text[], text
-
-    - __create_metatable:
-       rettype: void
-       argument: text
-
-    - __create_tree_tables:
-       rettype: void
-       argument: text
-
-    - __csvstr_to_array:
-       rettype: text[]
-       argument: text
-
-    - __display_node_sfunc:
-       rettype: text
-       argument: text, integer, boolean, text, text, double precision, double precision, text, integer
-
-    - __display_tree_no_ordered_aggr:
-       rettype: text
-       argument: text, integer, integer, integer, boolean, double precision, text, integer, integer
-
-    - __distinct_feature_value:
-       rettype: integer
-       argument: text, integer
-
-    - __drop_metatable:
-       rettype: void
-       argument: text
-
-    - __dt_acc_count_sfunc:
-       rettype: bigint[]
-       argument: bigint[], integer, bigint, integer
-
-    - __dt_get_node_split_fids:
-       rettype: integer[]
-       argument: integer, integer, integer, integer[]
-
-    - __ebp_calc_errors:
-       rettype: double precision
-       argument: double precision, double precision, double precision
-
-    - __ebp_prune_tree:
-       rettype: void
-       argument: text
-
-    - __encode_and_train:
-       rettype: record
-       argument: text, text, integer, integer, text, text, text, text, text, text, text, double precision, text, integer, double precision, boolean, double precision, double precision, text, integer
-
-    - __encode_columns:
-       rettype: void
-       argument: text, text, integer, integer
-
-    - __encode_table:
-       rettype: void
-       argument: text, text, text, integer, integer
-
-    - __encode_table:
-       rettype: void
-       argument: text, text, text[], text, text[], text, text, integer, integer
-
-    - __find_best_split:
-       rettype: void
-       argument: text, double precision, text, integer, integer, text, integer, integer
-
-    - __format:
-       rettype: text
-       argument: text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text[]
-
-    - __gen_acc:
-       rettype: __gen_acc_time
-       argument: text, text, text, text, text, integer, integer, boolean, integer
-
-    - __gen_enc_meta_names:
-       rettype: text[]
-       argument: text, text
-
-    - __gen_horizontal_encoded_table:
-       rettype: void
-       argument: text, text, integer, integer
-
-    - __gen_vertical_encoded_table:
-       rettype: void
-       argument: text, text, text, boolean, integer
-
-    - __generate_final_tree:
-       rettype: void
-       argument: text
-
-    - __get_class_column_name:
-       rettype: text
-       argument: text
-
-    - __get_class_value:
-       rettype: text
-       argument: integer, text
-
-    - __get_classtable_name:
-       rettype: text
-       argument: text
-
-    - __get_column_value:
-       rettype: text
-       argument: integer, integer, character, text
-
-    - __get_feature_name:
-       rettype: text
-       argument: integer, text
-
-    - __get_feature_value:
-       rettype: text
-       argument: integer, integer, text
-
-    - __get_features_of_nodes:
-       rettype: text
-       argument: text, text, integer, integer, integer
-
-    - __get_id_column_name:
-       rettype: text
-       argument: text
-
-    - __get_schema_name:
-       rettype: text
-       argument: text
-
-    - __get_table_name:
-       rettype: text
-       argument: text
-
-    - __insert_into_metatable:
-       rettype: void
-       argument: text, integer, text, character, boolean, text, integer
-
-    - __is_valid_enc_table:
-       rettype: boolean
-       argument: text
-
-    - __num_of_class:
-       rettype: integer
-       argument: text
-
-    - __num_of_columns:
-       rettype: integer
-       argument: text
-
-    - __num_of_feature:
-       rettype: integer
-       argument: text
-
-    - __regclass_to_text:
-       rettype: text
-       argument: regclass
-
-    - __rename_table:
-       rettype: void
-       argument: text, text
-
-    - __rep_aggr_class_count_ffunc:
-       rettype: bigint[]
-       argument: bigint[]
-
-    - __rep_aggr_class_count_sfunc:
-       rettype: bigint[]
-       argument: bigint[], integer, integer, integer
-
-    - __rep_prune_tree:
-       rettype: void
-       argument: text, text, integer
-
-    - __sample_with_replacement:
-       rettype: void
-       argument: integer, bigint, text, text
-
-    - __sample_within_range:
-       rettype: SETOF bigint
-       argument: bigint, bigint, bigint
-
-    - __scv_aggr_ffunc:
-       rettype: double precision[]
-       argument: double precision[]
-
-    - __scv_aggr_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __scv_aggr_sfunc:
-       rettype: double precision[]
-       argument: double precision[], integer, boolean, integer, double precision[], double precision[], bigint
-
-    - __strip_schema_name:
-       rettype: text
-       argument: text
-
-    - __svm_random_ind2:
-       rettype: double precision[]
-       argument: integer
-
-    - __svm_random_ind:
-       rettype: double precision[]
-       argument: integer
-
-    - __svm_target_cl_func:
-       rettype: double precision
-       argument: double precision[]
-
-    - __svm_target_reg_func:
-       rettype: double precision
-       argument: double precision[]
-
-    - __table_exists:
-       rettype: boolean
-       argument: text
-
-    - __train_tree:
-       rettype: __train_result
-       argument: text, integer, integer, text, text, text, text, text, text, double precision, integer, double precision, double precision, double precision, boolean, integer, integer
-
-    - __treemodel_classify_internal:
-       rettype: text[]
-       argument: text, text, integer
-
-    - __treemodel_classify_internal_serial:
-       rettype: text[]
-       argument: text, text, integer
-
-    - __treemodel_display_no_ordered_aggr:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - __treemodel_display_with_ordered_aggr:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - __treemodel_get_vote_result:
-       rettype: void
-       argument: text, text
-
-    - __treemodel_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - __validate_input_table:
-       rettype: void
-       argument: text, text[], text, text
-
-    - __validate_metatable:
-       rettype: void
-       argument: text
-
-    - c45_classify:
-       rettype: c45_classify_result
-       argument: text, text, text
-
-    - c45_classify:
-       rettype: c45_classify_result
-       argument: text, text, text, integer
-
-    - c45_clean:
-       rettype: boolean
-       argument: text
-
-    - c45_display:
-       rettype: SETOF text
-       argument: text
-
-    - c45_display:
-       rettype: SETOF text
-       argument: text, integer
-
-    - c45_genrule:
-       rettype: SETOF text
-       argument: text
-
-    - c45_genrule:
-       rettype: SETOF text
-       argument: text, integer
-
-    - c45_score:
-       rettype: double precision
-       argument: text, text
-
-    - c45_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text, text, text, text, text, text, double precision, text
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text, text, text, text, text, text, double precision, text, integer, double precision, double precision, integer
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying, character varying
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying, character varying, boolean
-
-    - linear_svm_igd_transition:
-       rettype: double precision[]
-       argument: double precision[], double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean, boolean, double precision, double precision
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean, boolean, double precision, double precision, integer
-
-    - lsvm_predict:
-       rettype: double precision
-       argument: double precision[], double precision[]
-
-    - lsvm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text
-
-    - lsvm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text, boolean
-
-    - matrix_block_trans:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_densify:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_sparsify:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_trans:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text, boolean, integer
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text, integer
-
-    - rf_clean:
-       rettype: boolean
-       argument: text
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text, integer[]
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - rf_score:
-       rettype: double precision
-       argument: text, text
-
-    - rf_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - rf_train:
-       rettype: rf_train_result
-       argument: text, text, text
-
-    - rf_train:
-       rettype: rf_train_result
-       argument: text, text, text, integer, integer, double precision, text, text, text, text, text, integer, double precision, double precision, integer
-
-    - svdmf_run:
-       rettype: text
-       argument: text, text, text, text, integer
-
-    - svdmf_run:
-       rettype: text
-       argument: text, text, text, text, integer, integer, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, boolean, double precision, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, double precision
-
-    - svm_cls_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision, double precision
-
-    - svm_data_normalization:
-       rettype: void
-       argument: text
-
-    - svm_dot:
-       rettype: double precision
-       argument: double precision[], double precision[]
-
-    - svm_dot:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_drop_model:
-       rettype: void
-       argument: text
-
-    - svm_gaussian:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_generate_cls_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_generate_nd_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_generate_reg_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_nd_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], text, double precision, double precision, double precision
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text, boolean, double precision, double precision
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_polynomial:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_predict:
-       rettype: double precision
-       argument: schema_madlib.svm_model_rec, double precision[], text, double precision
-
-    - svm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text, boolean
-
-    - svm_predict_sub:
-       rettype: double precision
-       argument: integer, integer, double precision[], double precision[], double precision[], text, double precision
-
-    - svm_reg_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision, double precision, double precision
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision, double precision
-
-    - svm_store_model:
-       rettype: void
-       argument: text, text, text
-
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    - array_collapse:
-        rettype: anyarray
-        argument: anyarray
-    - linear_svm_igd_transition:
-        rettype: double precision[]
-        argument: double precision[], double precision[], double precision, double precision[], integer, double precision, double precision, boolean, integer, double precision, boolean
-    - profile:
-        rettype: SETOF schema_madlib.profile_result
-        argument: text
-    - profile_full:
-        rettype: SETOF schema_madlib.profile_result
-        argument: text, integer
-    - profile:
-        rettype: schema_madlib.profile_result
-        argument: text
-    - profile_full:
-        rettype: schema_madlib.profile_result
-        argument: text, integer
-    - quantile:
-        rettype: double precision
-        argument: text, text, double precision
-    - quantile_big:
-        rettype: double precision
-        argument: text, text, double precision
-
-# Changes to aggregates (UDA) including removal and modification
-# Overloaded functions should be mentioned separately
-uda:
-
-    # ----------------- Changes from 1.8 to 1.9 ----------
-    - __array_indexed_agg:
-        rettype: double precision[]
-        argument: double precision, bigint, bigint
-
-    - __best_scv_aggr:
-        rettype: double precision[]
-        argument: double precision[], integer, double precision
-
-    - __bigint_array_sum:
-        rettype: bigint[]
-        argument: bigint[]
-
-    - __display_tree_aggr:
-        rettype: text
-        argument: integer, boolean, text, text, double precision, double precision, text, integer
-
-    - __dt_acc_count_aggr:
-        rettype: bigint[]
-        argument: integer, bigint, integer
-
-    - __rep_aggr_class_count:
-        rettype: bigint[]
-        argument: integer, integer, integer
-
-    - __scv_aggr:
-        rettype: double precision[]
-        argument: integer, boolean, integer, double precision[], double precision[], bigint
-
-    - linear_svm_igd_step:
-        rettype: double precision[]
-        argument: double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - linear_svm_igd_step_serial:
-        rettype: double precision[]
-        argument: double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - svm_cls_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision, double precision
-
-    - svm_nd_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], text, double precision, double precision, double precision
-
-    - svm_reg_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision, double precision, double precision
-
-    - __svm_random_ind2:
-        rettype: double precision[]
-        argument: integer
-
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    - array_agg:
-        rettype: anyarray
-        argument: anyelement
-    - linear_svm_igd_step:
-       rettype: double precision[]
-       argument: double precision[], double precision, double precision[], integer, double precision, double precision, boolean, integer, double precision, boolean
-
-# Casts (UDC) updated/removed
-udc:
-    # ----------------- Changes from 1.8 to 1.9 ----------
-
-# Operators (UDO) removed/updated
-udo:
-    # ----------------- Changes from 1.8 to 1.9 ----------
-
-# Operator Classes (UDOC) removed/updated
-udoc:
-    # ----------------- Changes from 1.8 to 1.9 ----------

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/90f4dc15/src/madpack/changelist_1.9_1.10.yaml
----------------------------------------------------------------------
diff --git a/src/madpack/changelist_1.9_1.10.yaml b/src/madpack/changelist_1.9_1.10.yaml
new file mode 100644
index 0000000..41082ad
--- /dev/null
+++ b/src/madpack/changelist_1.9_1.10.yaml
@@ -0,0 +1,156 @@
+# Changelist for MADlib version 1.9 to 1.10
+
+# This file contains all changes that were introduced in a new version of
+# MADlib. This changelist is used by the upgrade script to detect what objects
+# should be upgraded (while retaining all other objects from the previous version)
+
+# New modules (actually .sql_in files) added in upgrade version
+# For these files the sql_in code is retained as is with the functions in the
+# file installed on the upgrade version. All other files (that don't have
+# updates), are cleaned up to remove object replacements
+new module:
+    # ----------------- Changes from 1.9.1 to 1.10 ----------
+    sssp:
+    encode_categorical:
+    knn:
+
+# Changes in the types (UDT) including removal and modification
+udt:
+    # ----------------- Changes from 1.9 to 1.9.1 ----------
+    profile_result:
+    # ----------------- Changes from 1.9.1 to 1.10 ----------
+    _tree_result_type:
+    _prune_result_type:
+    kmeans_result:
+    kmeans_state:
+
+# List of the UDF changes that affect the user externally. This includes change
+# in function name, return type, argument order or types, or removal of
+# the function. In each case, the original function is as good as removed and a
+# new function is created. In such cases, we should abort the upgrade if there
+# are user views dependent on this function, since the original function will
+# not be present in the upgraded version.
+udf:
+    # ----------------- Changes from 1.9 to 1.9.1 ----------
+    - array_collapse:
+        rettype: anyarray
+        argument: anyarray
+    - linear_svm_igd_transition:
+        rettype: double precision[]
+        argument: double precision[], double precision[], double precision, double precision[], integer, double precision, double precision, boolean, integer, double precision, boolean
+    - profile:
+        rettype: SETOF schema_madlib.profile_result
+        argument: text
+    - profile_full:
+        rettype: SETOF schema_madlib.profile_result
+        argument: text, integer
+    - profile:
+        rettype: schema_madlib.profile_result
+        argument: text
+    - profile_full:
+        rettype: schema_madlib.profile_result
+        argument: text, integer
+    - quantile:
+        rettype: double precision
+        argument: text, text, double precision
+    - quantile_big:
+        rettype: double precision
+        argument: text, text, double precision
+    # ----------------- Changes from 1.9.1 to 1.10 ----------
+    - _dt_apply:
+        rettype: schema_madlib._tree_result_type
+        argument: schema_madlib.bytea8, schema_madlib.bytea8, schema_madlib.bytea8, smallint, smallint, smallint, boolean, integer
+    - _prune_and_cplist:
+        rettype: schema_madlib._prune_result_type
+        argument: schema_madlib.bytea8, double precision, boolean
+    - kmeans:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, double precision[], character varying, character varying, integer, double precision
+    - kmeans:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, double precision[], character varying, character varying, integer
+    - kmeans:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, double precision[], character varying, character varying
+    - kmeans:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, double precision[], character varying
+    - kmeans:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, double precision[]
+    - kmeans:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, character varying, character varying, character varying, character varying, integer, double precision
+    - kmeans:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, character varying, character varying, character varying, character varying, integer
+    - kmeans:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, character varying, character varying, character varying, character varying
+    - kmeans:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, character varying, character varying, character varying
+    - kmeans:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, character varying, character varying
+    - kmeanspp:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, integer, character varying, character varying, integer, double precision, double precision
+    - kmeanspp:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, integer, character varying, character varying, integer, double precision
+    - kmeanspp:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, integer, character varying, character varying, integer
+    - kmeanspp:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, integer, character varying, character varying
+    - kmeanspp:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, integer, character varying
+    - kmeanspp:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, integer
+    - kmeans_random:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, integer, character varying, character varying, integer, double precision
+    - kmeans_random:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, integer, character varying, character varying, integer
+    - kmeans_random:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, integer, character varying, character varying
+    - kmeans_random:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, integer, character varying
+    - kmeans_random:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, integer
+    - internal_execute_using_kmeans_args:
+        rettype: schema_madlib.kmeans_result
+        argument: character varying, character varying, character varying, character varying, character varying, integer, double precision
+
+
+# Changes to aggregates (UDA) including removal and modification
+# Overloaded functions should be mentioned separately
+uda:
+    # ----------------- Changes from 1.9 to 1.9.1 ----------
+    - array_agg:
+        rettype: anyarray
+        argument: anyelement
+    - linear_svm_igd_step:
+       rettype: double precision[]
+       argument: double precision[], double precision, double precision[], integer, double precision, double precision, boolean, integer, double precision, boolean
+
+
+# Casts (UDC) updated/removed
+udc:
+    # ----------------- Changes from 1.9 to 1.9.1 ----------
+
+# Operators (UDO) removed/updated
+udo:
+    # ----------------- Changes from 1.9 to 1.9.1 ----------
+
+# Operator Classes (UDOC) removed/updated
+udoc:
+    # ----------------- Changes from 1.9 to 1.9.1 ----------

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/90f4dc15/src/madpack/diff_udf.sql
----------------------------------------------------------------------
diff --git a/src/madpack/diff_udf.sql b/src/madpack/diff_udf.sql
index d00f964..da345b5 100644
--- a/src/madpack/diff_udf.sql
+++ b/src/madpack/diff_udf.sql
@@ -139,7 +139,7 @@ $$ LANGUAGE plpythonu;
 DROP TABLE IF EXISTS functions_madlib_old_version;
 DROP TABLE IF EXISTS functions_madlib_new_version;
 
-SELECT get_functions('madlib_v19');
+SELECT get_functions('madlib_old_vers');
 
 SELECT
     --'\t-' || name || ':' || '\n\t\t-rettype: ' || retype || '\n\t\t-argument: ' || argtypes

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/90f4dc15/src/madpack/diff_udt.sql
----------------------------------------------------------------------
diff --git a/src/madpack/diff_udt.sql b/src/madpack/diff_udt.sql
index 5cbf1e7..e521a09 100644
--- a/src/madpack/diff_udt.sql
+++ b/src/madpack/diff_udt.sql
@@ -66,44 +66,44 @@ $$ LANGUAGE plpythonu;
 
 -- Get UDTs
 DROP TABLE IF EXISTS types_madlib;
-DROP TABLE IF EXISTS types_madlib_v19;
+DROP TABLE IF EXISTS types_madlib_old_vers;
 SELECT get_types('madlib');
-SELECT get_types('madlib_v19');
+SELECT get_types('madlib_old_vers');
 
 --SELECT name FROM types_madlib;
 --SELECT name FROM types_madlib_v15;
 
 --Dropped
 SELECT
-    v19.name AS "Dropped UDTs"
+    old_vers.name AS "Dropped UDTs"
 FROM
-    types_madlib_v19 AS v19
+    types_madlib_old_vers AS old_vers
     LEFT JOIN
-    types_madlib AS v191
+    types_madlib AS new_vers
     USING (name)
-WHERE v191.name IS NULL;
+WHERE new_vers.name IS NULL;
 
 --Added
 SELECT
-     v191.name AS "Added UDTs"
+     new_vers.name AS "Added UDTs"
 FROM
-     types_madlib_v19 AS v19
+     types_madlib_old_vers AS old_vers
      RIGHT JOIN
-     types_madlib AS v191
+     types_madlib AS new_vers
      USING (name)
-WHERE v19.name IS NULL;
+WHERE old_vers.name IS NULL;
 
 --Common
 DROP TABLE IF EXISTS types_common;
 CREATE TABLE types_common AS
 SELECT
-    v19.name, v19.typrelid AS old_relid, v191.typrelid AS new_relid
+    old_vers.name, old_vers.typrelid AS old_relid, new_vers.typrelid AS new_relid
 FROM
-    types_madlib_v19 AS v19
+    types_madlib_old_vers AS old_vers
     JOIN
-    types_madlib AS v191
+    types_madlib AS new_vers
     USING (name)
-WHERE v19.typrelid <> 0; -- 0 means base type
+WHERE old_vers.typrelid <> 0; -- 0 means base type
 
 SELECT
     array_upper(detect_changed_types('types_common'), 1) AS N,

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/90f4dc15/src/madpack/madpack.py
----------------------------------------------------------------------
diff --git a/src/madpack/madpack.py b/src/madpack/madpack.py
index 2e21fa4..3b66b17 100755
--- a/src/madpack/madpack.py
+++ b/src/madpack/madpack.py
@@ -657,6 +657,13 @@ def _db_upgrade(schema, dbrev):
         _info("Current MADlib version already up to date.", True)
         return
 
+    if _is_rev_gte([1,7,1],_get_rev_num(dbrev)):
+        _error("""
+            MADlib versions prior to v1.8 are not supported for upgrade.
+            Please try upgrading to v1.9.1 and then upgrade to this version.
+            """, True)
+        return
+
     _info("Upgrading MADlib into %s schema..." % schema.upper(), True)
     _info("\tDetecting dependencies...", True)

[33/50] [abbrv] incubator-madlib git commit: Adds k-means doc example for array input Updates KNN and cat vars docs

Posted by ri...@apache.org.

Adds k-means doc example for array input
Updates KNN and cat vars docs

Closes #99


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/8e7c6ebf
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/8e7c6ebf
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/8e7c6ebf

Branch: refs/heads/latest_release
Commit: 8e7c6ebfeefede6f64c7f449f8f2f1a75a8dbe73
Parents: 2d5a5ed
Author: Frank McQuillan <fm...@pivotal.io>
Authored: Wed Feb 8 15:08:05 2017 -0800
Committer: Orhan Kislal <ok...@pivotal.io>
Committed: Wed Feb 8 15:10:29 2017 -0800

----------------------------------------------------------------------
 src/ports/postgres/modules/kmeans/kmeans.sql_in | 154 +++++++++----
 src/ports/postgres/modules/knn/knn.sql_in       | 190 ++++++++--------
 .../modules/utilities/encode_categorical.sql_in | 228 +++++++++----------
 3 files changed, 324 insertions(+), 248 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8e7c6ebf/src/ports/postgres/modules/kmeans/kmeans.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/kmeans/kmeans.sql_in b/src/ports/postgres/modules/kmeans/kmeans.sql_in
index 16014ef..f689dd6 100644
--- a/src/ports/postgres/modules/kmeans/kmeans.sql_in
+++ b/src/ports/postgres/modules/kmeans/kmeans.sql_in
@@ -218,43 +218,45 @@ closest_column( m, x )
 @anchor examples
 @examp
 
--#  Prepare some input data.
+Note: Your results may not be exactly the same as below due to the nature of the 
+k-means algorithm.
+
+-#  Prepare some input data:
 <pre class="example">
-CREATE TABLE public.km_sample(pid int, points double precision[]);
-COPY km_sample (pid, points) FROM stdin DELIMITER '|';
-1 | {14.23, 1.71, 2.43, 15.6, 127, 2.8, 3.0600, 0.2800, 2.29, 5.64, 1.04, 3.92, 1065}
-2 | {13.2, 1.78, 2.14, 11.2, 1, 2.65, 2.76, 0.26, 1.28, 4.38, 1.05, 3.49, 1050}
-3 | {13.16, 2.36,  2.67, 18.6, 101, 2.8,  3.24, 0.3, 2.81, 5.6799, 1.03, 3.17, 1185}
-4 | {14.37, 1.95, 2.5, 16.8, 113, 3.85, 3.49, 0.24, 2.18, 7.8, 0.86, 3.45, 1480}
-5 | {13.24, 2.59, 2.87, 21, 118, 2.8, 2.69, 0.39, 1.82, 4.32, 1.04, 2.93, 735}
-6 | {14.2, 1.76, 2.45, 15.2, 112, 3.27, 3.39, 0.34, 1.97, 6.75, 1.05, 2.85, 1450}
-7 | {14.39, 1.87, 2.45, 14.6, 96, 2.5, 2.52, 0.3, 1.98, 5.25, 1.02, 3.58, 1290}
-8 | {14.06, 2.15, 2.61, 17.6, 121, 2.6, 2.51, 0.31, 1.25, 5.05, 1.06, 3.58, 1295}
-9 | {14.83, 1.64, 2.17, 14, 97, 2.8, 2.98, 0.29, 1.98, 5.2, 1.08, 2.85, 1045}
-10 | {13.86, 1.35, 2.27, 16, 98, 2.98, 3.15, 0.22, 1.8500, 7.2199, 1.01, 3.55, 1045}
-\\.
+DROP TABLE IF EXISTS km_sample;
+CREATE TABLE km_sample(pid int, points double precision[]);
+INSERT INTO km_sample VALUES
+(1,  '{14.23, 1.71, 2.43, 15.6, 127, 2.8, 3.0600, 0.2800, 2.29, 5.64, 1.04, 3.92, 1065}'),
+(2,  '{13.2, 1.78, 2.14, 11.2, 1, 2.65, 2.76, 0.26, 1.28, 4.38, 1.05, 3.49, 1050}'),
+(3,  '{13.16, 2.36,  2.67, 18.6, 101, 2.8,  3.24, 0.3, 2.81, 5.6799, 1.03, 3.17, 1185}'),
+(4,  '{14.37, 1.95, 2.5, 16.8, 113, 3.85, 3.49, 0.24, 2.18, 7.8, 0.86, 3.45, 1480}'),
+(5,  '{13.24, 2.59, 2.87, 21, 118, 2.8, 2.69, 0.39, 1.82, 4.32, 1.04, 2.93, 735}'),
+(6,  '{14.2, 1.76, 2.45, 15.2, 112, 3.27, 3.39, 0.34, 1.97, 6.75, 1.05, 2.85, 1450}'),
+(7,  '{14.39, 1.87, 2.45, 14.6, 96, 2.5, 2.52, 0.3, 1.98, 5.25, 1.02, 3.58, 1290}'),
+(8,  '{14.06, 2.15, 2.61, 17.6, 121, 2.6, 2.51, 0.31, 1.25, 5.05, 1.06, 3.58, 1295}'),
+(9,  '{14.83, 1.64, 2.17, 14, 97, 2.8, 2.98, 0.29, 1.98, 5.2, 1.08, 2.85, 1045}'),
+(10, '{13.86, 1.35, 2.27, 16, 98, 2.98, 3.15, 0.22, 1.8500, 7.2199, 1.01, 3.55, 1045}');
 </pre>
-
 -#  Run k-means clustering using kmeans++ for centroid seeding:
 <pre class="example">
 \\x on;
-SELECT * FROM madlib.kmeanspp( 'km_sample',
-                               'points',
-                               2,
-                               'madlib.squared_dist_norm2',
-                               'madlib.avg',
-                               20,
-                               0.001
+SELECT * FROM madlib.kmeanspp( 'km_sample',   -- Table of source data
+                               'points',      -- Column containing point co-ordinates 
+                               2,             -- Number of centroids to calculate
+                               'madlib.squared_dist_norm2',   -- Distance function
+                               'madlib.avg',  -- Aggregate function
+                               20,            -- Number of iterations
+                               0.001          -- Fraction of centroids reassigned to keep iterating 
                              );
 </pre>
 Result:
 <pre class="result">
-centroids        | {{13.872,1.814,2.376,15.56,88.2,2.806,2.928,0.288,1.844,5.35198,1.044,3.348,988},
-                   {14.036,2.018,2.536,16.56,108.6,3.004,3.03,0.298,2.038,6.10598,1.004,3.326,1340}}
-cluster_variance | {90512.324426408,60672.638245208}
-objective_fn     | 151184.962672
+centroids        | {{13.7533333333333,1.905,2.425,16.0666666666667,90.3333333333333,2.805,2.98,0.29,2.005,5.40663333333333,1.04166666666667,3.31833333333333,1020.83333333333},
+                   {14.255,1.9325,2.5025,16.05,110.5,3.055,2.9775,0.2975,1.845,6.2125,0.9975,3.365,1378.75}}
+cluster_variance | {122999.110416013,30561.74805}
+objective_fn     | 153560.858466013
 frac_reassigned  | 0
-num_iterations   | 2
+num_iterations   | 3
 </pre>
 -# Calculate the simplified silhouette coefficient:
 <pre class="example">
@@ -273,33 +275,103 @@ SELECT * FROM madlib.simple_silhouette( 'km_sample',
 </pre>
 Result:
 <pre class="result">
-simple_silhouette | 0.68978804882941
+simple_silhouette | 0.686314347664694
 </pre>
 
--#  Find the cluster assignment for each point
+-#  Find the cluster assignment for each point:
 <pre class="example">
 \\x off;
-SELECT data.*, (madlib.closest_column(centroids, points)).column_id as cluster_id
-FROM public.km_sample as data,
-     (SELECT centroids
-      FROM madlib.kmeanspp('km_sample', 'points', 2,
+DROP TABLE IF EXISTS km_result;
+-- Run kmeans algorithm
+CREATE TABLE km_result AS
+SELECT * FROM madlib.kmeanspp('km_sample', 'points', 2,
                            'madlib.squared_dist_norm2',
-                           'madlib.avg', 20, 0.001)) as centroids
+                           'madlib.avg', 20, 0.001); 
+-- Get point assignment
+SELECT data.*,  (madlib.closest_column(centroids, points)).column_id as cluster_id
+FROM km_sample as data, km_result
 ORDER BY data.pid;
 </pre>
 <pre class="result">
- pid |                               points                               | cluster_id
+ pid |                               points                               | cluster_id 
 -----+--------------------------------------------------------------------+------------
    1 | {14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065}  |          0
    2 | {13.2,1.78,2.14,11.2,1,2.65,2.76,0.26,1.28,4.38,1.05,3.49,1050}    |          0
-   3 | {13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.6799,1.03,3.17,1185} |          1
-   4 | {14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480}   |          1
-   5 | {13.24,2.59,2.87,21,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735}     |          0
-   6 | {14.2,1.76,2.45,15.2,112,3.27,3.39,0.34,1.97,6.75,1.05,2.85,1450}  |          1
-   7 | {14.39,1.87,2.45,14.6,96,2.5,2.52,0.3,1.98,5.25,1.02,3.58,1290}    |          1
-   8 | {14.06,2.15,2.61,17.6,121,2.6,2.51,0.31,1.25,5.05,1.06,3.58,1295}  |          1
+   3 | {13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.6799,1.03,3.17,1185} |          0
+   4 | {14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480}   |          0
+   5 | {13.24,2.59,2.87,21,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735}     |          1
+   6 | {14.2,1.76,2.45,15.2,112,3.27,3.39,0.34,1.97,6.75,1.05,2.85,1450}  |          0
+   7 | {14.39,1.87,2.45,14.6,96,2.5,2.52,0.3,1.98,5.25,1.02,3.58,1290}    |          0
+   8 | {14.06,2.15,2.61,17.6,121,2.6,2.51,0.31,1.25,5.05,1.06,3.58,1295}  |          0
    9 | {14.83,1.64,2.17,14,97,2.8,2.98,0.29,1.98,5.2,1.08,2.85,1045}      |          0
   10 | {13.86,1.35,2.27,16,98,2.98,3.15,0.22,1.85,7.2199,1.01,3.55,1045}  |          0
+(10 rows)
+</pre>
+
+-#  Run the same example as above, but using array input.  Create the input table:
+<pre class="example">
+DROP TABLE IF EXISTS km_arrayin CASCADE;
+CREATE TABLE km_arrayin(pid int, 
+                        p1 float, 
+                        p2 float, 
+                        p3 float,
+                        p4 float, 
+                        p5 float, 
+                        p6 float,
+                        p7 float, 
+                        p8 float, 
+                        p9 float,
+                        p10 float, 
+                        p11 float, 
+                        p12 float,
+                        p13 float);
+INSERT INTO km_arrayin VALUES
+(1,  14.23, 1.71, 2.43, 15.6, 127, 2.8, 3.0600, 0.2800, 2.29, 5.64, 1.04, 3.92, 1065),
+(2,  13.2, 1.78, 2.14, 11.2, 1, 2.65, 2.76, 0.26, 1.28, 4.38, 1.05, 3.49, 1050),
+(3,  13.16, 2.36,  2.67, 18.6, 101, 2.8,  3.24, 0.3, 2.81, 5.6799, 1.03, 3.17, 1185),
+(4,  14.37, 1.95, 2.5, 16.8, 113, 3.85, 3.49, 0.24, 2.18, 7.8, 0.86, 3.45, 1480),
+(5,  13.24, 2.59, 2.87, 21, 118, 2.8, 2.69, 0.39, 1.82, 4.32, 1.04, 2.93, 735),
+(6,  14.2, 1.76, 2.45, 15.2, 112, 3.27, 3.39, 0.34, 1.97, 6.75, 1.05, 2.85, 1450),
+(7,  14.39, 1.87, 2.45, 14.6, 96, 2.5, 2.52, 0.3, 1.98, 5.25, 1.02, 3.58, 1290),
+(8,  14.06, 2.15, 2.61, 17.6, 121, 2.6, 2.51, 0.31, 1.25, 5.05, 1.06, 3.58, 1295),
+(9,  14.83, 1.64, 2.17, 14, 97, 2.8, 2.98, 0.29, 1.98, 5.2, 1.08, 2.85, 1045),
+(10, 13.86, 1.35, 2.27, 16, 98, 2.98, 3.15, 0.22, 1.8500, 7.2199, 1.01, 3.55, 1045);
+</pre>
+Now find the cluster assignment for each point:
+<pre class="example">
+DROP TABLE IF EXISTS km_result;
+-- Run kmeans algorithm
+CREATE TABLE km_result AS
+SELECT * FROM madlib.kmeans_random('km_arrayin', 
+                                'ARRAY[p1, p2, p3, p4, p5, p6, 
+                                      p7, p8, p9, p10, p11, p12, p13]', 
+                                2,
+                                'madlib.squared_dist_norm2',
+                                'madlib.avg', 
+                                20, 
+                                0.001);
+-- Get point assignment
+SELECT data.*,  (madlib.closest_column(centroids, 
+                                       ARRAY[p1, p2, p3, p4, p5, p6, 
+                                      p7, p8, p9, p10, p11, p12, p13])).column_id as cluster_id                                    
+FROM km_arrayin as data, km_result
+ORDER BY data.pid;
+</pre> 
+This produces the result in column format:
+<pre class="result">
+ pid |  p1   |  p2  |  p3  |  p4  | p5  |  p6  |  p7  |  p8  |  p9  |  p10   | p11  | p12  | p13  | cluster_id 
+-----+-------+------+------+------+-----+------+------+------+------+--------+------+------+------+------------
+   1 | 14.23 | 1.71 | 2.43 | 15.6 | 127 |  2.8 | 3.06 | 0.28 | 2.29 |   5.64 | 1.04 | 3.92 | 1065 |          0
+   2 |  13.2 | 1.78 | 2.14 | 11.2 |   1 | 2.65 | 2.76 | 0.26 | 1.28 |   4.38 | 1.05 | 3.49 | 1050 |          0
+   3 | 13.16 | 2.36 | 2.67 | 18.6 | 101 |  2.8 | 3.24 |  0.3 | 2.81 | 5.6799 | 1.03 | 3.17 | 1185 |          0
+   4 | 14.37 | 1.95 |  2.5 | 16.8 | 113 | 3.85 | 3.49 | 0.24 | 2.18 |    7.8 | 0.86 | 3.45 | 1480 |          1
+   5 | 13.24 | 2.59 | 2.87 |   21 | 118 |  2.8 | 2.69 | 0.39 | 1.82 |   4.32 | 1.04 | 2.93 |  735 |          0
+   6 |  14.2 | 1.76 | 2.45 | 15.2 | 112 | 3.27 | 3.39 | 0.34 | 1.97 |   6.75 | 1.05 | 2.85 | 1450 |          1
+   7 | 14.39 | 1.87 | 2.45 | 14.6 |  96 |  2.5 | 2.52 |  0.3 | 1.98 |   5.25 | 1.02 | 3.58 | 1290 |          1
+   8 | 14.06 | 2.15 | 2.61 | 17.6 | 121 |  2.6 | 2.51 | 0.31 | 1.25 |   5.05 | 1.06 | 3.58 | 1295 |          1
+   9 | 14.83 | 1.64 | 2.17 |   14 |  97 |  2.8 | 2.98 | 0.29 | 1.98 |    5.2 | 1.08 | 2.85 | 1045 |          0
+  10 | 13.86 | 1.35 | 2.27 |   16 |  98 | 2.98 | 3.15 | 0.22 | 1.85 | 7.2199 | 1.01 | 3.55 | 1045 |          0
+(10 rows)
 </pre>
 
 @anchor notes

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8e7c6ebf/src/ports/postgres/modules/knn/knn.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/knn/knn.sql_in b/src/ports/postgres/modules/knn/knn.sql_in
index 526c8dd..d3c1929 100644
--- a/src/ports/postgres/modules/knn/knn.sql_in
+++ b/src/ports/postgres/modules/knn/knn.sql_in
@@ -44,29 +44,27 @@ m4_include(`SQLCommon.m4')
 <li class="level1"><a href="#examples">Examples</a></li>
 <li class="level1"><a href="#background">Technical Background</a></li>
 <li class="level1"><a href="#literature">Literature</a></li>
-<li class="level1"><a href="#related">Related Topics</a></li>
 </ul>
 </div>
 
-@brief Finds k nearest data points to the given data point and outputs majority vote value of output classes in case of classification and average value of target values for regression task.
+@brief Finds k nearest data points to the given data point and outputs majority vote value of output classes for classification, and average value of target values for regression.
 
 \warning <em> This MADlib method is still in early stage development. There may be some
 issues that will be addressed in a future version. Interface and implementation
-is subject to change. </em>
+are subject to change. </em>
 
 @anchor knn
 
-k-Nearest Neighbors is a method for finding k closest points to a
-given data point in terms of a given metric. Its input consist of
-data points as features from testing examples. For a given k, it
-looks for k closest points in training set for each of the data
-points in test set. Algorithm generates one output per testing example.
-The output of KNN depends on the type of task:
-For Classification, the output is majority vote of the classes of
-the k nearest data points. The testing example gets assigned the
-most popular class among nearest neighbors.
-For Regression, the output is average of the values of k nearest
-neighbors of the given testing example.
+K-nearest neighbors is a method for finding the k closest points to a
+given data point in terms of a given metric. Its input consists of
+data points as features from testing examples, and it
+looks for the k closest points in the training set for each of the data
+points in the test set.  The output of KNN depends on the type of task.
+For classification, the output is the majority vote of the classes of
+the k nearest data points. That is, the testing example gets assigned the
+most popular class from the nearest neighbors.
+For regression, the output is the average of the values of the k nearest
+neighbors of the given test point.
 
 @anchor usage
 @par Usage
@@ -86,27 +84,27 @@ knn( point_source,
 \b Arguments
 <dl class="arglist">
 <dt>point_source</dt>
-<dd>TEXT. The name of the table containing the training data points.
+<dd>TEXT. Name of the table containing the training data points.
 
-Training data points are expected to be stored row-wise,
+Training data points are expected to be stored row-wise
 in a column of type <tt>DOUBLE PRECISION[]</tt>.
 </dd>
 
 <dt>point_column_name</dt>
-<dd>TEXT. The name of the column with training data points.</dd>
+<dd>TEXT. Name of the column with training data points.</dd>
 
 <dt>label_column_name</dt>
-<dd>TEXT. The name of the column with labels/values of training data points.</dd>
+<dd>TEXT. Name of the column with labels/values of training data points.</dd>
 
 <dt>test_source</dt>
-<dd>TEXT. The name of the table containing the test data points.
+<dd>TEXT. Name of the table containing the test data points.
 
-Testing data points are expected to be stored row-wise,
+Testing data points are expected to be stored row-wise
 in a column of type <tt>DOUBLE PRECISION[]</tt>.
 </dd>
 
 <dt>test_column_name</dt>
-<dd>TEXT. The name of the column with testing data points.</dd>
+<dd>TEXT. Name of the column with testing data points.</dd>
 
 <dt>id_column_name</dt>
 <dd>TEXT. Name of the column having ids of data points in test data table.</dd>
@@ -115,10 +113,11 @@ in a column of type <tt>DOUBLE PRECISION[]</tt>.
 <dd>TEXT. Name of the table to store final results.</dd>
 
 <dt>operation</dt>
-<dd>TEXT. the type of task; r for regression and c for classification.</dd>
+<dd>TEXT. Type of task: 'r' for regression and 'c' for classification.</dd>
 
 <dt>k (optional)</dt>
-<dd>INTEGER. default: 1. The number of nearest neighbors to consider.</dd>
+<dd>INTEGER. default: 1. Number of nearest neighbors to consider.
+For classification, k should be an odd number to help avoid ties in the majority vote.</dd>
 
 </dl>
 
@@ -138,7 +137,7 @@ The output of the KNN module is a table with the following columns:
     </tr>
     <tr>
         <th>prediction</th>
-        <td>INTEGER. The output of KNN- label in case of classification, average value in case of regression.</td>
+        <td>INTEGER. Label in case of classification, average value in case of regression.</td>
     </tr>
 </table>
 
@@ -146,89 +145,100 @@ The output of the KNN module is a table with the following columns:
 @anchor examples
 @examp
 
--#  Prepare some training data.
+-#  Prepare some training data:
 <pre class="example">
-CREATE TABLE knn_train_data (id integer, data integer[], label float);
-COPY knn_train_data (id, data, label) from stdin delimiter '|';
-1|{1,1}|1.0
-2|{2,2}|1.0
-3|{3,3}|1.0
-4|{4,4}|1.0
-5|{4,5}|1.0
-6|{20,50}|0.0
-7|{10,31}|0.0
-8|{81,13}|0.0
-9|{1,111}|0.0
-\\.
+DROP TABLE IF EXISTS knn_train_data;
+CREATE TABLE knn_train_data (
+                    id integer, 
+                    data integer[], 
+                    label float
+                    );
+INSERT INTO knn_train_data VALUES
+(1, '{1,1}', 1.0),
+(2, '{2,2}', 1.0),
+(3, '{3,3}', 1.0),
+(4, '{4,4}', 1.0),
+(5, '{4,5}', 1.0),
+(6, '{20,50}', 0.0),
+(7, '{10,31}', 0.0),
+(8, '{81,13}', 0.0),
+(9, '{1,111}', 0.0);
 </pre>
 
--#  Prepare some testing data.
+-#  Prepare some testing data:
 <pre class="example">
-CREATE TABLE knn_test_data (id integer, data integer[]);
-COPY knn_test_data (id, data) from stdin delimiter '|';
-1|{2,1}
-2|{2,6}
-3|{15,40}
-4|{12,1}
-5|{2,90}
-6|{50,45}
-\\.
+DROP TABLE IF EXISTS knn_test_data;
+CREATE TABLE knn_test_data (
+                    id integer, 
+                    data integer[]
+                    );
+INSERT INTO knn_test_data VALUES
+(1, '{2,1}'),
+(2, '{2,6}'),
+(3, '{15,40}'),
+(4, '{12,1}'),
+(5, '{2,90}'),
+(6, '{50,45}');
 </pre>
 
 -#  Run KNN for classification:
 <pre class="example">
-SELECT * FROM madlib.knn( 'knn_train_data',
-                               'data',
-                               'label',
-                               'knn_test_data',
-                               'data',
-                               'id',
-                               'madlib_knn_result_classification',
-                               'c',
-                               3
-                             );
-SELECT * from madlib_knn_result_classification;
+DROP TABLE IF EXISTS madlib_knn_result_classification;
+SELECT * FROM madlib.knn( 
+                'knn_train_data',      -- Table of training data
+                'data',                -- Col name of training data
+                'label',               -- Training labels
+                'knn_test_data',       -- Table of test data
+                'data',                -- Col name of test data
+                'id',                  -- Col name of id in test data 
+                'madlib_knn_result_classification',  -- Output table
+                'c',                   -- Classification
+                 3                     -- Number of nearest neighbors
+                );
+SELECT * from madlib_knn_result_classification ORDER BY id;
 </pre>
 Result:
 <pre class="result">
-  id |   data   | prediction
------+----------+-----------
-   1 | {2,1}    |       1
-   2 | {2,6}    |       1
-   3 | {15,40}  |       0
-   4 | {12,1}   |       1
-   5 | {2,90}   |       0
-   6 | {50,45}  |       0
+ id |  data   | prediction 
+----+---------+------------
+  1 | {2,1}   |          1
+  2 | {2,6}   |          1
+  3 | {15,40} |          0
+  4 | {12,1}  |          1
+  5 | {2,90}  |          0
+  6 | {50,45} |          0
+(6 rows)
 </pre>
 
 -#  Run KNN for regression:
 <pre class="example">
-SELECT * FROM madlib.knn( 'knn_train_data',
-                               'data',
-                               'label',
-                               'knn_test_data',
-                               'data',
-                               'id',
-                               'madlib_knn_result_regression',
-                               'r',
-                               3
-                             );
-SELECT * from madlib_knn_result_regression;
+DROP TABLE IF EXISTS madlib_knn_result_regression;
+SELECT * FROM madlib.knn( 
+                'knn_train_data',      -- Table of training data
+                'data',                -- Col name of training data
+                'label',               -- Training labels
+                'knn_test_data',       -- Table of test data
+                'data',                -- Col name of test data
+                'id',                  -- Col name of id in test data 
+                'madlib_knn_result_regression',  -- Output table
+                'r',                   -- Regression
+                 3                     -- Number of nearest neighbors
+                );
+SELECT * from madlib_knn_result_regression ORDER BY id;
 </pre>
 Result:
 <pre class="result">
-  id |   data   | prediction
------+----------+-----------
-   1 | {2,1}    |      1
-   2 | {2,6}    |      1
-   3 | {15,40}  |      0.5
-   4 | {12,1}   |      1
-   5 | {2,90}   |      0.25
-   6 | {50,45}  |      0.25
+ id |  data   |    prediction     
+----+---------+-------------------
+  1 | {2,1}   |                 1
+  2 | {2,6}   |                 1
+  3 | {15,40} | 0.333333333333333
+  4 | {12,1}  |                 1
+  5 | {2,90}  |                 0
+  6 | {50,45} |                 0
+(6 rows)
 </pre>
 
-
-
 @anchor background
 @par Technical Background
 
@@ -260,12 +270,6 @@ Other distance metrics will be added in a future release of this module.
 [3] Gongde Guo1, Hui Wang, David Bell, Yaxin Bi, Kieran Greer: KNN Model-Based Approach in Classification,
     https://ai2-s2-pdfs.s3.amazonaws.com/a7e2/814ec5db800d2f8c4313fd436e9cf8273821.pdf
 
-
-@anchor related
-@par Related Topics
-
-File knn.sql_in documenting the knn SQL functions
-
 @internal
 @sa namespace knn (documenting the implementation in Python)
 @endinternal

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8e7c6ebf/src/ports/postgres/modules/utilities/encode_categorical.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/encode_categorical.sql_in b/src/ports/postgres/modules/utilities/encode_categorical.sql_in
index a14337c..d36ef2b 100644
--- a/src/ports/postgres/modules/utilities/encode_categorical.sql_in
+++ b/src/ports/postgres/modules/utilities/encode_categorical.sql_in
@@ -337,28 +337,28 @@ SELECT madlib.encode_categorical_variables (
 SELECT * FROM abalone_out ORDER BY id;
 </pre>
 <pre class="result">
- id | sex_F | sex_I | sex_M | rings_10 | rings_11 | rings_12 | rings_14 | rings_15 | rings_16 | rings_19 | rings_20 | rings_7 | rings_8 | rings_9
-----+-------+-------+-------+----------+----------+----------+----------+----------+----------+----------+----------+---------+---------+---------
-  1 |     0 |     0 |     1 |        0 |        0 |        0 |        0 |        1 |        0 |        0 |        0 |       0 |       0 |       0
-  2 |     0 |     0 |     1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       1 |       0 |       0
-  3 |     1 |     0 |     0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       0 |       0 |       1
-  4 |     0 |     0 |     1 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       0 |       0 |       0
-  5 |     0 |     1 |     0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       1 |       0 |       0
-  6 |     0 |     1 |     0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       0 |       1 |       0
-  7 |     1 |     0 |     0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        1 |       0 |       0 |       0
-  8 |     1 |     0 |     0 |        0 |        0 |        0 |        0 |        0 |        1 |        0 |        0 |       0 |       0 |       0
-  9 |     0 |     0 |     1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       0 |       0 |       1
- 10 |     0 |     0 |     0 |        0 |        0 |        0 |        0 |        0 |        0 |        1 |        0 |       0 |       0 |       0
- 11 |     1 |     0 |     0 |        0 |        0 |        0 |        1 |        0 |        0 |        0 |        0 |       0 |       0 |       0
- 12 |     0 |     0 |     1 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       0 |       0 |       0
- 13 |     0 |     0 |     1 |        0 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |       0 |       0 |       0
- 14 |     1 |     0 |     0 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       0 |       0 |       0
- 15 |     1 |     0 |     0 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       0 |       0 |       0
- 16 |     0 |     0 |     1 |        0 |        0 |        1 |        0 |        0 |        0 |        0 |        0 |       0 |       0 |       0
- 17 |     0 |     1 |     0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       1 |       0 |       0
- 18 |     1 |     0 |     0 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       0 |       0 |       0
- 19 |     0 |     0 |     1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       1 |       0 |       0
- 20 |     0 |     0 |     0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       0 |       0 |       1
+ id | sex_F | sex_I | sex_M | rings_7 | rings_8 | rings_9 | rings_10 | rings_11 | rings_12 | rings_14 | rings_15 | rings_16 | rings_19 | rings_20 
+----+-------+-------+-------+---------+---------+---------+----------+----------+----------+----------+----------+----------+----------+----------
+  1 |     0 |     0 |     1 |       0 |       0 |       0 |        0 |        0 |        0 |        0 |        1 |        0 |        0 |        0
+  2 |     0 |     0 |     1 |       1 |       0 |       0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+  3 |     1 |     0 |     0 |       0 |       0 |       1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+  4 |     0 |     0 |     1 |       0 |       0 |       0 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+  5 |     0 |     1 |     0 |       1 |       0 |       0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+  6 |     0 |     1 |     0 |       0 |       1 |       0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+  7 |     1 |     0 |     0 |       0 |       0 |       0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        1
+  8 |     1 |     0 |     0 |       0 |       0 |       0 |        0 |        0 |        0 |        0 |        0 |        1 |        0 |        0
+  9 |     0 |     0 |     1 |       0 |       0 |       1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+ 10 |     0 |     0 |     0 |       0 |       0 |       0 |        0 |        0 |        0 |        0 |        0 |        0 |        1 |        0
+ 11 |     1 |     0 |     0 |       0 |       0 |       0 |        0 |        0 |        0 |        1 |        0 |        0 |        0 |        0
+ 12 |     0 |     0 |     1 |       0 |       0 |       0 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+ 13 |     0 |     0 |     1 |       0 |       0 |       0 |        0 |        1 |        0 |        0 |        0 |        0 |        0 |        0
+ 14 |     1 |     0 |     0 |       0 |       0 |       0 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+ 15 |     1 |     0 |     0 |       0 |       0 |       0 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+ 16 |     0 |     0 |     1 |       0 |       0 |       0 |        0 |        0 |        1 |        0 |        0 |        0 |        0 |        0
+ 17 |     0 |     1 |     0 |       1 |       0 |       0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+ 18 |     1 |     0 |     0 |       0 |       0 |       0 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+ 19 |     0 |     0 |     1 |       1 |       0 |       0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+ 20 |     0 |     0 |     0 |       0 |       0 |       1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
 (20 rows)
 </pre>
 
@@ -379,7 +379,7 @@ SELECT madlib.encode_categorical_variables (
 SELECT * FROM abalone_out ORDER BY id;
 </pre>
 <pre class="result">
- id | sex_M | sex_F | sex__MISC__ | rings_10 | rings_7 | rings_9 | rings__MISC__
+ id | sex_M | sex_F | sex__MISC__ | rings_10 | rings_7 | rings_9 | rings__MISC__ 
 ----+-------+-------+-------------+----------+---------+---------+---------------
   1 |     1 |     0 |           0 |        0 |       0 |       0 |             1
   2 |     1 |     0 |           0 |        0 |       1 |       0 |             0
@@ -421,28 +421,28 @@ SELECT madlib.encode_categorical_variables (
 SELECT * FROM abalone_out ORDER BY id;
 </pre>
 <pre class="result">
- id | sex | rings | sex_F | sex_I | sex_M | rings_10 | rings_11 | rings_12 | rings_14 | rings_15 | rings_16 | rings_19 | rings_20 | rings_7 | rings_8 | rings_9
-----+-----+-------+-------+-------+-------+----------+----------+----------+----------+----------+----------+----------+----------+---------+---------+---------
-  1 | M   |    15 |     0 |     0 |     1 |        0 |        0 |        0 |        0 |        1 |        0 |        0 |        0 |       0 |       0 |       0
-  2 | M   |     7 |     0 |     0 |     1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       1 |       0 |       0
-  3 | F   |     9 |     1 |     0 |     0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       0 |       0 |       1
-  4 | M   |    10 |     0 |     0 |     1 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       0 |       0 |       0
-  5 | I   |     7 |     0 |     1 |     0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       1 |       0 |       0
-  6 | I   |     8 |     0 |     1 |     0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       0 |       1 |       0
-  7 | F   |    20 |     1 |     0 |     0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        1 |       0 |       0 |       0
-  8 | F   |    16 |     1 |     0 |     0 |        0 |        0 |        0 |        0 |        0 |        1 |        0 |        0 |       0 |       0 |       0
-  9 | M   |     9 |     0 |     0 |     1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       0 |       0 |       1
- 10 |     |    19 |     0 |     0 |     0 |        0 |        0 |        0 |        0 |        0 |        0 |        1 |        0 |       0 |       0 |       0
- 11 | F   |    14 |     1 |     0 |     0 |        0 |        0 |        0 |        1 |        0 |        0 |        0 |        0 |       0 |       0 |       0
- 12 | M   |    10 |     0 |     0 |     1 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       0 |       0 |       0
- 13 | M   |    11 |     0 |     0 |     1 |        0 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |       0 |       0 |       0
- 14 | F   |    10 |     1 |     0 |     0 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       0 |       0 |       0
- 15 | F   |    10 |     1 |     0 |     0 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       0 |       0 |       0
- 16 | M   |    12 |     0 |     0 |     1 |        0 |        0 |        1 |        0 |        0 |        0 |        0 |        0 |       0 |       0 |       0
- 17 | I   |     7 |     0 |     1 |     0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       1 |       0 |       0
- 18 | F   |    10 |     1 |     0 |     0 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       0 |       0 |       0
- 19 | M   |     7 |     0 |     0 |     1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       1 |       0 |       0
- 20 |     |     9 |     0 |     0 |     0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |       0 |       0 |       1
+ id | sex | rings | sex_F | sex_I | sex_M | rings_7 | rings_8 | rings_9 | rings_10 | rings_11 | rings_12 | rings_14 | rings_15 | rings_16 | rings_19 | rings_20 
+----+-----+-------+-------+-------+-------+---------+---------+---------+----------+----------+----------+----------+----------+----------+----------+----------
+  1 | M   |    15 |     0 |     0 |     1 |       0 |       0 |       0 |        0 |        0 |        0 |        0 |        1 |        0 |        0 |        0
+  2 | M   |     7 |     0 |     0 |     1 |       1 |       0 |       0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+  3 | F   |     9 |     1 |     0 |     0 |       0 |       0 |       1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+  4 | M   |    10 |     0 |     0 |     1 |       0 |       0 |       0 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+  5 | I   |     7 |     0 |     1 |     0 |       1 |       0 |       0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+  6 | I   |     8 |     0 |     1 |     0 |       0 |       1 |       0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+  7 | F   |    20 |     1 |     0 |     0 |       0 |       0 |       0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        1
+  8 | F   |    16 |     1 |     0 |     0 |       0 |       0 |       0 |        0 |        0 |        0 |        0 |        0 |        1 |        0 |        0
+  9 | M   |     9 |     0 |     0 |     1 |       0 |       0 |       1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+ 10 |     |    19 |     0 |     0 |     0 |       0 |       0 |       0 |        0 |        0 |        0 |        0 |        0 |        0 |        1 |        0
+ 11 | F   |    14 |     1 |     0 |     0 |       0 |       0 |       0 |        0 |        0 |        0 |        1 |        0 |        0 |        0 |        0
+ 12 | M   |    10 |     0 |     0 |     1 |       0 |       0 |       0 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+ 13 | M   |    11 |     0 |     0 |     1 |       0 |       0 |       0 |        0 |        1 |        0 |        0 |        0 |        0 |        0 |        0
+ 14 | F   |    10 |     1 |     0 |     0 |       0 |       0 |       0 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+ 15 | F   |    10 |     1 |     0 |     0 |       0 |       0 |       0 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+ 16 | M   |    12 |     0 |     0 |     1 |       0 |       0 |       0 |        0 |        0 |        1 |        0 |        0 |        0 |        0 |        0
+ 17 | I   |     7 |     0 |     1 |     0 |       1 |       0 |       0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+ 18 | F   |    10 |     1 |     0 |     0 |       0 |       0 |       0 |        1 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+ 19 | M   |     7 |     0 |     0 |     1 |       1 |       0 |       0 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
+ 20 |     |     9 |     0 |     0 |     0 |       0 |       0 |       1 |        0 |        0 |        0 |        0 |        0 |        0 |        0 |        0
 (20 rows)
 </pre>
 
@@ -499,33 +499,33 @@ SELECT madlib.encode_categorical_variables (
         NULL,                        -- Top values
         NULL,                        -- Value to drop for dummy encoding
         NULL,                        -- Encode nulls
-        TRUE                         -- Array output
+        'array'                      -- Array output type
         );
 SELECT * FROM abalone_out ORDER BY id;
 </pre>
 <pre class="result">
- id |     __encoded_variables__
+ id |     __encoded_variables__     
 ----+-------------------------------
-  1 | {0,0,1,0,0,0,0,1,0,0,0,0,0,0}
-  2 | {0,0,1,0,0,0,0,0,0,0,0,1,0,0}
-  3 | {1,0,0,0,0,0,0,0,0,0,0,0,0,1}
-  4 | {0,0,1,1,0,0,0,0,0,0,0,0,0,0}
-  5 | {0,1,0,0,0,0,0,0,0,0,0,1,0,0}
-  6 | {0,1,0,0,0,0,0,0,0,0,0,0,1,0}
-  7 | {1,0,0,0,0,0,0,0,0,0,1,0,0,0}
-  8 | {1,0,0,0,0,0,0,0,1,0,0,0,0,0}
-  9 | {0,0,1,0,0,0,0,0,0,0,0,0,0,1}
- 10 | {0,0,0,0,0,0,0,0,0,1,0,0,0,0}
- 11 | {1,0,0,0,0,0,1,0,0,0,0,0,0,0}
- 12 | {0,0,1,1,0,0,0,0,0,0,0,0,0,0}
- 13 | {0,0,1,0,1,0,0,0,0,0,0,0,0,0}
- 14 | {1,0,0,1,0,0,0,0,0,0,0,0,0,0}
- 15 | {1,0,0,1,0,0,0,0,0,0,0,0,0,0}
- 16 | {0,0,1,0,0,1,0,0,0,0,0,0,0,0}
- 17 | {0,1,0,0,0,0,0,0,0,0,0,1,0,0}
- 18 | {1,0,0,1,0,0,0,0,0,0,0,0,0,0}
- 19 | {0,0,1,0,0,0,0,0,0,0,0,1,0,0}
- 20 | {0,0,0,0,0,0,0,0,0,0,0,0,0,1}
+  1 | {0,0,1,0,0,0,0,0,0,0,1,0,0,0}
+  2 | {0,0,1,1,0,0,0,0,0,0,0,0,0,0}
+  3 | {1,0,0,0,0,1,0,0,0,0,0,0,0,0}
+  4 | {0,0,1,0,0,0,1,0,0,0,0,0,0,0}
+  5 | {0,1,0,1,0,0,0,0,0,0,0,0,0,0}
+  6 | {0,1,0,0,1,0,0,0,0,0,0,0,0,0}
+  7 | {1,0,0,0,0,0,0,0,0,0,0,0,0,1}
+  8 | {1,0,0,0,0,0,0,0,0,0,0,1,0,0}
+  9 | {0,0,1,0,0,1,0,0,0,0,0,0,0,0}
+ 10 | {0,0,0,0,0,0,0,0,0,0,0,0,1,0}
+ 11 | {1,0,0,0,0,0,0,0,0,1,0,0,0,0}
+ 12 | {0,0,1,0,0,0,1,0,0,0,0,0,0,0}
+ 13 | {0,0,1,0,0,0,0,1,0,0,0,0,0,0}
+ 14 | {1,0,0,0,0,0,1,0,0,0,0,0,0,0}
+ 15 | {1,0,0,0,0,0,1,0,0,0,0,0,0,0}
+ 16 | {0,0,1,0,0,0,0,0,1,0,0,0,0,0}
+ 17 | {0,1,0,1,0,0,0,0,0,0,0,0,0,0}
+ 18 | {1,0,0,0,0,0,1,0,0,0,0,0,0,0}
+ 19 | {0,0,1,1,0,0,0,0,0,0,0,0,0,0}
+ 20 | {0,0,0,0,0,1,0,0,0,0,0,0,0,0}
 (20 rows)
 </pre>
 View the dictionary table that gives the index into the array:
@@ -533,22 +533,22 @@ View the dictionary table that gives the index into the array:
 SELECT * FROM abalone_out_dictionary;
 </pre>
 <pre class="result">
-  encoded_column_name  | index | variable | value
+  encoded_column_name  | index | variable | value 
 -----------------------+-------+----------+-------
  __encoded_variables__ |     1 | sex      | F
  __encoded_variables__ |     2 | sex      | I
  __encoded_variables__ |     3 | sex      | M
- __encoded_variables__ |     4 | rings    | 10
- __encoded_variables__ |     5 | rings    | 11
- __encoded_variables__ |     6 | rings    | 12
- __encoded_variables__ |     7 | rings    | 14
- __encoded_variables__ |     8 | rings    | 15
- __encoded_variables__ |     9 | rings    | 16
- __encoded_variables__ |    10 | rings    | 19
- __encoded_variables__ |    11 | rings    | 20
- __encoded_variables__ |    12 | rings    | 7
- __encoded_variables__ |    13 | rings    | 8
- __encoded_variables__ |    14 | rings    | 9
+ __encoded_variables__ |     4 | rings    | 7
+ __encoded_variables__ |     5 | rings    | 8
+ __encoded_variables__ |     6 | rings    | 9
+ __encoded_variables__ |     7 | rings    | 10
+ __encoded_variables__ |     8 | rings    | 11
+ __encoded_variables__ |     9 | rings    | 12
+ __encoded_variables__ |    10 | rings    | 14
+ __encoded_variables__ |    11 | rings    | 15
+ __encoded_variables__ |    12 | rings    | 16
+ __encoded_variables__ |    13 | rings    | 19
+ __encoded_variables__ |    14 | rings    | 20
 (14 rows)
 </pre>
 -# Create a dictionary output:
@@ -563,34 +563,34 @@ SELECT madlib.encode_categorical_variables (
         NULL,                        -- Top values
         NULL,                        -- Value to drop for dummy encoding
         NULL,                        -- Encode nulls
-        FALSE,                       -- Array output
+        NULL,                        -- Output type
         TRUE                         -- Dictionary output
         );
 SELECT * FROM abalone_out ORDER BY id;
 </pre>
 <pre class="result">
-  id | sex_1 | sex_2 | sex_3 | rings_1 | rings_2 | rings_3 | rings_4 | rings_5 | rings_6 | rings_7 | rings_8 | rings_9 | rings_10 | rings_11
+ id | sex_1 | sex_2 | sex_3 | rings_1 | rings_2 | rings_3 | rings_4 | rings_5 | rings_6 | rings_7 | rings_8 | rings_9 | rings_10 | rings_11 
 ----+-------+-------+-------+---------+---------+---------+---------+---------+---------+---------+---------+---------+----------+----------
-  1 |     0 |     0 |     1 |       0 |       0 |       0 |       0 |       1 |       0 |       0 |       0 |       0 |        0 |        0
-  2 |     0 |     0 |     1 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       1 |        0 |        0
-  3 |     1 |     0 |     0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |        0 |        1
-  4 |     0 |     0 |     1 |       1 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |        0 |        0
-  5 |     0 |     1 |     0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       1 |        0 |        0
-  6 |     0 |     1 |     0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |        1 |        0
-  7 |     1 |     0 |     0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       1 |       0 |        0 |        0
-  8 |     1 |     0 |     0 |       0 |       0 |       0 |       0 |       0 |       1 |       0 |       0 |       0 |        0 |        0
-  9 |     0 |     0 |     1 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |        0 |        1
- 10 |     0 |     0 |     0 |       0 |       0 |       0 |       0 |       0 |       0 |       1 |       0 |       0 |        0 |        0
- 11 |     1 |     0 |     0 |       0 |       0 |       0 |       1 |       0 |       0 |       0 |       0 |       0 |        0 |        0
- 12 |     0 |     0 |     1 |       1 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |        0 |        0
- 13 |     0 |     0 |     1 |       0 |       1 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |        0 |        0
- 14 |     1 |     0 |     0 |       1 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |        0 |        0
- 15 |     1 |     0 |     0 |       1 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |        0 |        0
- 16 |     0 |     0 |     1 |       0 |       0 |       1 |       0 |       0 |       0 |       0 |       0 |       0 |        0 |        0
- 17 |     0 |     1 |     0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       1 |        0 |        0
- 18 |     1 |     0 |     0 |       1 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |        0 |        0
- 19 |     0 |     0 |     1 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       1 |        0 |        0
- 20 |     0 |     0 |     0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |        0 |        1
+  1 |     0 |     0 |     1 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       1 |       0 |        0 |        0
+  2 |     0 |     0 |     1 |       1 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |        0 |        0
+  3 |     1 |     0 |     0 |       0 |       0 |       1 |       0 |       0 |       0 |       0 |       0 |       0 |        0 |        0
+  4 |     0 |     0 |     1 |       0 |       0 |       0 |       1 |       0 |       0 |       0 |       0 |       0 |        0 |        0
+  5 |     0 |     1 |     0 |       1 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |        0 |        0
+  6 |     0 |     1 |     0 |       0 |       1 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |        0 |        0
+  7 |     1 |     0 |     0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |        0 |        1
+  8 |     1 |     0 |     0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       1 |        0 |        0
+  9 |     0 |     0 |     1 |       0 |       0 |       1 |       0 |       0 |       0 |       0 |       0 |       0 |        0 |        0
+ 10 |     0 |     0 |     0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |        1 |        0
+ 11 |     1 |     0 |     0 |       0 |       0 |       0 |       0 |       0 |       0 |       1 |       0 |       0 |        0 |        0
+ 12 |     0 |     0 |     1 |       0 |       0 |       0 |       1 |       0 |       0 |       0 |       0 |       0 |        0 |        0
+ 13 |     0 |     0 |     1 |       0 |       0 |       0 |       0 |       1 |       0 |       0 |       0 |       0 |        0 |        0
+ 14 |     1 |     0 |     0 |       0 |       0 |       0 |       1 |       0 |       0 |       0 |       0 |       0 |        0 |        0
+ 15 |     1 |     0 |     0 |       0 |       0 |       0 |       1 |       0 |       0 |       0 |       0 |       0 |        0 |        0
+ 16 |     0 |     0 |     1 |       0 |       0 |       0 |       0 |       0 |       1 |       0 |       0 |       0 |        0 |        0
+ 17 |     0 |     1 |     0 |       1 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |        0 |        0
+ 18 |     1 |     0 |     0 |       0 |       0 |       0 |       1 |       0 |       0 |       0 |       0 |       0 |        0 |        0
+ 19 |     0 |     0 |     1 |       1 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |       0 |        0 |        0
+ 20 |     0 |     0 |     0 |       0 |       0 |       1 |       0 |       0 |       0 |       0 |       0 |       0 |        0 |        0
 (20 rows)
 </pre>
 View the dictionary table that defines the numerical columns in the output table:
@@ -598,19 +598,19 @@ View the dictionary table that defines the numerical columns in the output table
 SELECT * FROM abalone_out_dictionary ORDER BY encoded_column_name;
 </pre>
 <pre class="result">
-  encoded_column_name | index | variable | value
+ encoded_column_name | index | variable | value 
 ---------------------+-------+----------+-------
- "rings_1"           |     1 | rings    | 10
- "rings_10"          |    10 | rings    | 8
- "rings_11"          |    11 | rings    | 9
- "rings_2"           |     2 | rings    | 11
- "rings_3"           |     3 | rings    | 12
- "rings_4"           |     4 | rings    | 14
- "rings_5"           |     5 | rings    | 15
- "rings_6"           |     6 | rings    | 16
- "rings_7"           |     7 | rings    | 19
- "rings_8"           |     8 | rings    | 20
- "rings_9"           |     9 | rings    | 7
+ "rings_1"           |     1 | rings    | 7
+ "rings_10"          |    10 | rings    | 19
+ "rings_11"          |    11 | rings    | 20
+ "rings_2"           |     2 | rings    | 8
+ "rings_3"           |     3 | rings    | 9
+ "rings_4"           |     4 | rings    | 10
+ "rings_5"           |     5 | rings    | 11
+ "rings_6"           |     6 | rings    | 12
+ "rings_7"           |     7 | rings    | 14
+ "rings_8"           |     8 | rings    | 15
+ "rings_9"           |     9 | rings    | 16
  "sex_1"             |     1 | sex      | F
  "sex_2"             |     2 | sex      | I
  "sex_3"             |     3 | sex      | M
@@ -629,7 +629,7 @@ SELECT madlib.encode_categorical_variables (
         NULL,                        -- Top values
         NULL,                        -- Value to drop for dummy encoding
         NULL,                        -- Encode nulls
-        NULL,                        -- Array output
+        NULL,                        -- Output type
         NULL,                        -- Dictionary output
         'RANDOMLY'                   -- Distribution policy
         );

[04/50] [abbrv] incubator-madlib git commit: CV: Fix order of validation output table columns

Posted by ri...@apache.org.

CV: Fix order of validation output table columns


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/6f12264c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/6f12264c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/6f12264c

Branch: refs/heads/latest_release
Commit: 6f12264c3ef34345b0b9b812afbd9bee5f6b815b
Parents: e1f37bb
Author: Rahul Iyer <ri...@apache.org>
Authored: Wed Jan 11 15:04:53 2017 -0800
Committer: Rahul Iyer <ri...@apache.org>
Committed: Wed Jan 11 15:08:32 2017 -0800

----------------------------------------------------------------------
 .../validation/internal/cross_validation.py_in  | 25 +++++++-------------
 1 file changed, 8 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/6f12264c/src/ports/postgres/modules/validation/internal/cross_validation.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/validation/internal/cross_validation.py_in b/src/ports/postgres/modules/validation/internal/cross_validation.py_in
index a79b45a..c1b2561 100644
--- a/src/ports/postgres/modules/validation/internal/cross_validation.py_in
+++ b/src/ports/postgres/modules/validation/internal/cross_validation.py_in
@@ -139,30 +139,21 @@ class ValidationResult(object):
         if not tbl_name or not str(tbl_name).strip():
             return
 
-        cv_history_f = self._flatten()
-        header = cv_history_f[0].keys()
-        # assuming all keys are string
-        header_str = ','.join(header)
-        # assuming all values are double precision
-        header_with_type_str = ','.join([c + ' double precision'
-                                        for c in header])
-        plpy.execute("""
-                     DROP TABLE IF EXISTS {tbl_name};
-                     CREATE TABLE {tbl_name} ({header})
-                     """.format(tbl_name=tbl_name,
-                                header=header_with_type_str))
+        header = self._cv_history[0]['sub_args'].keys() + ['mean', 'std']
+        header_str = ','.join(map(str, header))
 
         data = []
-        for h in cv_history_f:
+        for h in self._flatten():
             values = ','.join([str(h[k]) for k in header])
             data.append("({0})".format(values))
         data = ','.join(data)
 
         plpy.execute("""
-                     INSERT INTO {tbl_name}({header}) VALUES
-                     {data}""".format(data=data,
-                                      header=header_str,
-                                      tbl_name=tbl_name))
+                     CREATE TABLE {tbl_name} ({header_str}) AS
+                     VALUES
+                        {data}
+                     """.
+                     format(tbl_name=tbl_name, header_str=header_str, data=data))
 
 
 class _ValidationArgs(object):

[42/50] [abbrv] incubator-madlib git commit: Release v1.10:

Posted by ri...@apache.org.

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/90f4dc15/src/madpack/changelist_1.1_1.9.1.yaml
----------------------------------------------------------------------
diff --git a/src/madpack/changelist_1.1_1.9.1.yaml b/src/madpack/changelist_1.1_1.9.1.yaml
deleted file mode 100644
index 3a26793..0000000
--- a/src/madpack/changelist_1.1_1.9.1.yaml
+++ /dev/null
@@ -1,1385 +0,0 @@
-# Changelist for MADlib version 1.1 to 1.7
-
-# This file contains all changes that were introduced in a new version of
-# MADlib. This changelist is used by the upgrade script to detect what objects
-# should be upgraded (while retaining all other objects from the previous version)
-
-# New modules (actually .sql_in files) added in upgrade version
-# For these files the sql_in code is retained as is with the functions in the
-# file installed on the upgrade version. All other files (that don't have
-# updates), are cleaned up to remove object replacements
-new module:
-    # ----------------- Changes from 1.1 to 1.2 -----------------
-    arima:
-    arima_forecast:
-    # ----------------- Changes from 1.2 to 1.3 -----------------
-    # ----------------- Changes from 1.3 to 1.4 -----------------
-    robust_variance_coxph:
-    clustered_variance_coxph:
-    # ----------------- Changes from 1.4 to 1.4.1 ---------------
-    # ----------------- Changes from 1.4.1 to 1.5 ---------------
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    table_to_pmml:
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    glm:
-    multiresponseglm:
-    ordinal:
-    decision_tree:
-    random_forest:
-    distribution:
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    text_utilities:
-
-# Changes in the types (UDT) including removal and modification
-udt:
-
-    # ----------------- Changes from 1.1 to 1.2 -----------------
-    summary_result:
-    # ----------------- Changes from 1.2 to 1.3 -----------------
-    __logregr_result:
-    linregr_result:
-    # ----------------- Changes from 1.3 to 1.4 -----------------
-    # coxph_result: not exists in 1.1
-    mlogregr_result:
-    marginal_logregr_result:
-    marginal_mlogregr_result:
-    # ----------------- Changes from 1.4 to 1.4.1 ---------------
-    # ----------------- Changes from 1.4.1 to 1.5 ---------------
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    # __logregr_result: appeared before
-    # coxph_result: not exists in 1.1
-    # linregr_result: appeared before
-    # mlogregr_result: appeared before
-    # some types missed before upgrade to v1.6
-    intermediate_cox_prop_hazards_result:
-    __utils_scaled_data:
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    bytea8:
-    # ----------------- Changes from 1.8 to 1.9 ----------
-    __enc_tbl_result:
-    __gen_acc_time:
-    __rep_type:
-    __train_result:
-    c45_classify_result:
-    c45_train_result:
-    correlation_result:
-    lsvm_sgd_model_rec:
-    lsvm_sgd_result:
-    rf_classify_result:
-    rf_train_result:
-    svm_cls_result:
-    svm_model_pr:
-    svm_model_rec:
-    svm_nd_result:
-    svm_reg_result:
-    svm_support_vector:
-    _prune_result_type:
-    _tree_result_type:
-    linear_svm_result:
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    profile_result:
-
-# List of the UDF changes that affect the user externally.  This includes change
-# in function name, change in argument order or argument types, and removal of
-# the function. In each case, the original function is as good as removed and a
-# new function is created. In such cases, we should abort the upgrade if there
-# are user views dependent on this function, since the original function will
-# not be present in the upgraded version.
-udf:
-    # ----------------- Changes from 1.1 to 1.2 -----------------
-    - summary:
-        rettype: schema_madlib.summary_result
-        argument: text, text
-    - summary:
-        rettype: schema_madlib.summary_result
-        argument: text, text, text
-    - summary:
-        rettype: schema_madlib.summary_result
-        argument: text, text, text, text
-    - summary:
-        rettype: schema_madlib.summary_result
-        argument: text, text, text, text, boolean
-    - summary:
-        rettype: schema_madlib.summary_result
-        argument: text, text, text, text, boolean, boolean
-    - summary:
-        rettype: schema_madlib.summary_result
-        argument: text, text, text, text, boolean, boolean, double precision[]
-    - summary:
-        rettype: schema_madlib.summary_result
-        argument: text, text, text, text, boolean, boolean, double precision[], integer
-    - summary:
-        rettype: schema_madlib.summary_result
-        argument: text, text, text, text, boolean, boolean, double precision[], integer, boolean
-
-    # ----------------- Changes from 1.2 to 1.3 -----------------
-    # linear regression: 'num_processed' added in 'linregr_result'
-    - linregr_final:
-        rettype: schema_madlib.linregr_result
-        argument: schema_madlib.bytea8
-    - linregr_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - linregr_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[]
-
-    # logistic regression: 'num_processed' added in '__logregr_result'
-    - __logregr_cg_result:
-        rettype: schema_madlib.__logregr_result
-        argument: double precision[]
-
-    - __logregr_irls_result:
-        rettype: schema_madlib.__logregr_result
-        argument: double precision[]
-
-    - __logregr_igd_result:
-        rettype: schema_madlib.__logregr_result
-        argument: double precision[]
-
-    # ----------------- Changes from 1.3 to 1.4 -----------------
-    - __internal_get_robust_linregr_insert_string:
-        rettype: character varying
-        argument: schema_madlib.robust_linregr_result, double precision[], text
-    - __internal_get_robust_linregr_result:
-        rettype: schema_madlib.robust_linregr_result
-        argument: character varying, character varying, character varying, double precision[]
-    - __internal_get_robust_logregr_insert_string:
-        rettype: character varying
-        argument: schema_madlib.robust_logregr_result, text
-    - __internal_get_robust_logregr_result:
-        rettype: schema_madlib.robust_logregr_result
-        argument: character varying, character varying, character varying, double precision[]
-    - __internal_get_robust_mlogregr_insert_string:
-        rettype: character varying
-        argument: schema_madlib.robust_mlogregr_result, text
-    - __lda_count_topic_prefunc:
-        rettype: integer[]
-        argument: integer[], integer[]
-    - __lda_count_topic_sfunc:
-        rettype: integer[]
-        argument: integer[], integer[], integer[], integer[], integer, integer
-    - __lda_gibbs_sample:
-        rettype: integer[]
-        argument: integer[], integer[], integer[], integer[], double precision, double precision, integer, integer, integer
-    - __lda_perplexity_ffunc:
-        rettype: double precision
-        argument: integer[]
-    - __lda_perplexity_prefunc:
-        rettype: integer[]
-        argument: integer[], integer[]
-    - __lda_perplexity_sfunc:
-        rettype: integer[]
-        argument: integer[], integer[], integer[], integer[], integer[], double precision, double precision, integer, integer
-    - __lda_util_transpose:
-        rettype: integer[]
-        argument: integer[]
-    - __lda_util_unnest:
-        rettype: SETOF integer[]
-        argument: integer[]
-    - clustered_variance_mlogregr:
-        rettype: void
-        argument: text, text, text, text, text, integer, text, integer, text, double precision
-    - clustered_variance_mlogregr:
-        rettype: void
-        argument: text, text, text, text, text, integer, text, integer, text
-    - clustered_variance_mlogregr:
-        rettype: void
-        argument: text, text, text, text, text, integer, text, integer, text, double precision, boolean
-    - robust_input_checking:
-        rettype: void
-        argument: character varying, character varying, character varying, character varying
-
-    # ----------------- Changes from 1.4 to 1.4.1 ---------------
-    # ----------------- Changes from 1.4.1 to 1.5 ---------------
-    - __cmsketch_final:
-        rettype: bytea
-        argument: bytea
-    - __delete_traininginfo:
-        rettype: void
-        argument: text
-    - __get_encode_table_name:
-        rettype: text
-        argument: text
-    - __get_metatable_name:
-        rettype: text
-        argument: text
-    - __get_routine_id:
-        rettype: integer
-        argument: text
-    - __get_routine_name:
-        rettype: text
-        argument: text
-    - __get_tree_table_name:
-        rettype: text
-        argument: text
-    - __insert_into_traininginfo:
-        rettype: void
-        argument: text, text, text, text, text, text, text, text, double precision, integer, integer
-    - __treemodel_clean:
-        rettype: boolean
-        argument: text
-    - compute_lincrf:
-        rettype: integer
-        argument: character varying, character varying, character varying, character varying, character varying, integer, integer
-    - create_nb_classify_fn:
-        rettype: void
-        argument: character varying, character varying, integer, character varying
-    - create_nb_classify_fn:
-        rettype: void
-        argument: character varying, character varying, character varying, integer, character varying
-    - create_table_as:
-        rettype: void
-        argument: character varying, character varying
-    - create_table_as:
-        rettype: void
-        argument: character varying, character varying, character varying
-    - create_temp_table_as:
-        rettype: void
-        argument: character varying, character varying, character varying
-    - create_temp_table_as:
-        rettype: void
-        argument: character varying, character varying
-    - create_temporary_table_as:
-        rettype: void
-        argument: character varying, character varying, character varying
-    - create_temporary_table_as:
-        rettype: void
-        argument: character varying, character varying
-    - crf_train_fgen:
-        rettype: void
-        argument: text, text, text, text, text
-    - insert_into:
-        rettype: void
-        argument: character varying, character varying
-    - internal_create_table_as:
-        rettype: void
-        argument: boolean, character varying, character varying, character varying
-    - internal_execute_using_kmeans_args:
-        rettype: void
-        argument: character varying, double precision[], regproc, integer, double precision
-    - internal_execute_using_kmeanspp_seeding_args:
-        rettype: void
-        argument: character varying, integer, regproc, double precision[]
-    - internal_execute_using_silhouette_args:
-        rettype: double precision
-        argument: character varying, double precision[], regproc
-    - lincrf:
-        rettype: integer
-        argument: character varying, character varying, character varying, character varying, character varying, integer, character varying, character varying, integer
-    - lincrf:
-        rettype: integer
-        argument: character varying, character varying, character varying, character varying, character varying, integer, character varying, character varying
-    - lsvm_predict:
-        rettype: double precision
-        argument: text, double precision[]
-    - lsvm_predict_combo:
-        rettype: SETOF schema_madlib.svm_model_pr
-        argument: text, double precision[]
-    - lsvm_sgd_update:
-        rettype: schema_madlib.lsvm_sgd_model_rec
-        argument: schema_madlib.lsvm_sgd_model_rec, double precision[], double precision, double precision, double precision
-    - svm_cls_update:
-        rettype: schema_madlib.svm_model_rec
-        argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision
-    - svm_nd_update:
-        rettype: schema_madlib.svm_model_rec
-        argument: schema_madlib.svm_model_rec, double precision[], text, double precision, double precision
-    - svm_predict:
-        rettype: double precision
-        argument: schema_madlib.svm_model_rec, double precision[], text
-    - svm_predict:
-        rettype: double precision
-        argument: text, double precision[]
-    - svm_predict_combo:
-        rettype: SETOF schema_madlib.svm_model_pr
-        argument: text, double precision[]
-    - svm_predict_sub:
-        rettype: double precision
-        argument: integer, integer, double precision[], double precision[], double precision[], text
-    - svm_reg_update:
-        rettype: schema_madlib.svm_model_rec
-        argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision, double precision
-    - utils_normalize_data:
-        rettype: schema_madlib.__utils_scaled_data
-        argument: double precision[], double precision[], double precision[]
-    - vcrf_top1_label:
-        rettype: integer[]
-        argument: integer[], integer[], integer
-    - vcrf_top1_view:
-        rettype: text
-        argument: text, text, text, text
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    # Removed functions
-    - array_contains_null:
-        rettype: boolean
-        argument: double precision[]
-    - array_sqrt:
-        rettype: anyarray
-        argument: anyarray
-    - coxph_step_final:
-        rettype: double precision[]
-        argument: double precision[]
-    - coxph_step_strata_final:
-        rettype: double precision[]
-        argument: double precision[]
-    - coxph_step_transition:
-        rettype: double precision[]
-        argument: double precision[], double precision[], double precision, boolean, double precision[]
-    - internal_coxph_result:
-        rettype: schema_madlib.coxph_result
-        argument: double precision[]
-    - internal_coxph_step_distance:
-        rettype: double precision
-        argument: double precision[], double precision[]
-    - normalize:
-        rettype: double precision[]
-        argument: double precision[]
-    # Changed functions (return type)
-    # These functions can be recreated correctly even if we don't add them here.
-    # But the view dependency checker needs the information.
-    - __internal_mlogregr_irls_result:
-        rettype: schema_madlib.mlogregr_result
-        argument: double precision[]
-    - __logregr_cg_result:
-        rettype: schema_madlib.__logregr_result
-        argument: double precision[]
-    - __logregr_igd_result:
-        rettype: schema_madlib.__logregr_result
-        argument: double precision[]
-    - __logregr_irls_result:
-        rettype: schema_madlib.__logregr_result
-        argument: double precision[]
-    - linregr_final:
-        rettype: schema_madlib.linregr_result
-        argument: schema_madlib.bytea8
-    - mlogregr:
-        rettype: schema_madlib.mlogregr_result
-        argument: character varying, character varying, character varying, integer, character varying, double precision, integer
-    - mlogregr:
-        rettype: schema_madlib.mlogregr_result
-        argument: character varying, character varying, character varying, integer, character varying
-    - mlogregr:
-        rettype: schema_madlib.mlogregr_result
-        argument: character varying, character varying, character varying, integer
-    - mlogregr:
-        rettype: schema_madlib.mlogregr_result
-        argument: character varying, character varying, character varying
-    # make-ups from upgrade to v1.6
-    - __internal_get_cox_prop_hazards_insert_string:
-        rettype: character varying
-        argument: schema_madlib.cox_prop_hazards_result, text
-    - __internal_get_cox_prop_hazards_result:
-        rettype: schema_madlib.cox_prop_hazards_result
-        argument: character varying, character varying, character varying, character varying
-    - __internal_get_hsk_result:
-        rettype: schema_madlib.heteroskedasticity_test_result
-        argument: character varying, character varying, character varying, double precision[]
-    - __internal_get_linreg_result:
-        rettype: schema_madlib.linregr_result
-        argument: character varying, character varying, character varying
-    - __internal_get_linregr_insert_string:
-        rettype: character varying
-        argument: schema_madlib.linregr_result, text
-    - __internal_linregr_train_hetero:
-        rettype: void
-        argument: character varying, character varying, character varying, character varying, boolean
-    - compute_cox_prop_hazards_regr:
-        rettype: integer
-        argument: character varying, character varying, character varying, character varying, integer, character varying, double precision
-    - cox_prop_hazards_step_final:
-        rettype: double precision[]
-        argument: double precision[]
-    - cox_prop_hazards_step_transition:
-        rettype: double precision[]
-        argument: double precision[], double precision[], double precision, boolean, double precision, double precision[], double precision[], double precision[]
-    - intermediate_cox_prop_hazards:
-        rettype: schema_madlib.intermediate_cox_prop_hazards_result
-        argument: double precision[], boolean, double precision[]
-    - internal_cox_prop_hazards_result:
-        rettype: schema_madlib.cox_prop_hazards_result
-        argument: double precision[]
-    - internal_cox_prop_hazards_step_distance:
-        rettype: double precision
-        argument: double precision[], double precision[]
-    - marginal_logregr_step_final:
-        rettype: schema_madlib.marginal_logregr_result
-        argument: double precision[]
-    - mlogregr_marginal_step_final:
-        rettype: schema_madlib.marginal_mlogregr_result
-        argument: double precision[]
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    - cross_validation_general:   # change in name of argument "fold_num"
-        rettype: void
-        argument: character varying, character varying[], character varying[], character varying, character varying[], character varying, character varying[], character varying[], character varying, character varying[], character varying[], character varying, character varying, boolean, character varying, character varying[], integer
-    - lmf_igd_transition:
-        rettype: double precision[]
-        argument: double precision[], smallint, smallint, double precision, double precision[], smallint, smallint, smallint, double precision, double precision
-    # depending on bytea8
-    # return type is bytea8
-    - __clustered_err_lin_merge:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - __clustered_err_lin_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[], double precision[]
-    - __clustered_err_log_merge:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - __clustered_err_log_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, boolean, double precision[], double precision[]
-    - __clustered_err_mlog_merge:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - __clustered_err_mlog_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, integer, double precision[], double precision[], integer, integer
-    - dense_residual_norm_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - dense_residual_norm_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision[], double precision, double precision[]
-    - hetero_linregr_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - hetero_linregr_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[], double precision[]
-    - linregr_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - linregr_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[]
-    - robust_linregr_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - robust_linregr_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[], double precision[]
-    - weighted_sample_merge_int64:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - weighted_sample_merge_vector:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - weighted_sample_transition_int64:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, bigint, double precision
-    - weighted_sample_transition_vector:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision[], double precision
-    # argument type bytea8
-    - __clustered_err_lin_final:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: schema_madlib.bytea8
-    - __clustered_err_log_final:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: schema_madlib.bytea8
-    - __clustered_err_mlog_final:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: schema_madlib.bytea8
-    - dense_residual_norm_final:
-        rettype: schema_madlib.residual_norm_result
-        argument: schema_madlib.bytea8
-    - hetero_linregr_final:
-        rettype: schema_madlib.heteroskedasticity_test_result
-        argument: schema_madlib.bytea8
-    - linregr_final:
-        rettype: schema_madlib.linregr_result
-        argument: schema_madlib.bytea8
-    - robust_linregr_final:
-        rettype: schema_madlib.robust_linregr_result
-        argument: schema_madlib.bytea8
-    - weighted_sample_final_int64:
-        rettype: bigint
-        argument: schema_madlib.bytea8
-    - weighted_sample_final_vector:
-        rettype: double precision[]
-        argument: schema_madlib.bytea8
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    - __filter_input_relation:
-        rettype: character varying
-        argument: character varying, character varying
-    - __lda_util_unnest:
-        rettype: SETOF bigint[]
-        argument: bigint[]
-    - matrix_block_mult:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text
-    - matrix_block_square:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - matrix_block_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - matrix_block_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, boolean
-    - matrix_blockize:
-        rettype: schema_madlib.matrix_result
-        argument: text, integer, integer, text
-    - matrix_densify:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text, text, text, boolean
-    - matrix_mult:
-        rettype: schema_madlib.matrix_result
-        argument: text, boolean, text, boolean, text
-    - matrix_mult:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text, text, boolean, text, text, text, text, boolean, text
-    - matrix_norm:
-        rettype: double precision
-        argument: text
-    - matrix_scale_and_add:
-        rettype: void
-        argument: text, text, double precision, text
-    - matrix_sparsify:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, boolean
-    - matrix_square:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - matrix_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, boolean
-    - matrix_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text, text, text, boolean
-    - matrix_unblockize:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    # ----------------- Changes from 1.8 to 1.9 ----------
-
-
-    - _dt_apply:
-       rettype: schema_madlib._tree_result_type
-       argument: schema_madlib.bytea8,schema_madlib.bytea8,schema_madlib.bytea8,smallint,smallint,smallint,boolean,integer
-
-    - internal_linear_svm_igd_result:
-       rettype: schema_madlib.linear_svm_result
-       argument: double precision[]
-
-    - _prune_and_cplist:
-       rettype: schema_madlib._prune_result_type
-       argument: schema_madlib.bytea8,double precision,boolean
-
-    - __array_elem_in:
-       rettype: boolean[]
-       argument: anyarray, anyarray
-
-    - __array_indexed_agg_ffunc:
-       rettype: double precision[]
-       argument: double precision[]
-
-    - __array_indexed_agg_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __array_indexed_agg_sfunc:
-       rettype: double precision[]
-       argument: double precision[], double precision, bigint, bigint
-
-    - __array_search:
-       rettype: boolean
-       argument: anyelement, anyarray
-
-    - __array_sort:
-       rettype: anyarray
-       argument: anyarray
-
-    - __assert:
-       rettype: void
-       argument: boolean, text
-
-    - __assert_table:
-       rettype: void
-       argument: text, boolean
-
-    - __best_scv_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __best_scv_sfunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[], integer, double precision
-
-    - __bigint_array_add:
-       rettype: bigint[]
-       argument: bigint[], bigint[]
-
-    - __breakup_table:
-       rettype: void
-       argument: text, text, text, text, text, text[], boolean[], integer, integer
-
-    - __check_dt_common_params:
-       rettype: void
-       argument: text, text, text, text, text, text, text, text, integer, double precision, double precision, integer, text
-
-    - __check_training_table:
-       rettype: void
-       argument: text, text[], text[], text, text, integer
-
-    - __column_exists:
-       rettype: boolean
-       argument: text, text
-
-    - __columns_in_table:
-       rettype: boolean
-       argument: text[], text
-
-    - __create_metatable:
-       rettype: void
-       argument: text
-
-    - __create_tree_tables:
-       rettype: void
-       argument: text
-
-    - __csvstr_to_array:
-       rettype: text[]
-       argument: text
-
-    - __display_node_sfunc:
-       rettype: text
-       argument: text, integer, boolean, text, text, double precision, double precision, text, integer
-
-    - __display_tree_no_ordered_aggr:
-       rettype: text
-       argument: text, integer, integer, integer, boolean, double precision, text, integer, integer
-
-    - __distinct_feature_value:
-       rettype: integer
-       argument: text, integer
-
-    - __drop_metatable:
-       rettype: void
-       argument: text
-
-    - __dt_acc_count_sfunc:
-       rettype: bigint[]
-       argument: bigint[], integer, bigint, integer
-
-    - __dt_get_node_split_fids:
-       rettype: integer[]
-       argument: integer, integer, integer, integer[]
-
-    - __ebp_calc_errors:
-       rettype: double precision
-       argument: double precision, double precision, double precision
-
-    - __ebp_prune_tree:
-       rettype: void
-       argument: text
-
-    - __encode_and_train:
-       rettype: record
-       argument: text, text, integer, integer, text, text, text, text, text, text, text, double precision, text, integer, double precision, boolean, double precision, double precision, text, integer
-
-    - __encode_columns:
-       rettype: void
-       argument: text, text, integer, integer
-
-    - __encode_table:
-       rettype: void
-       argument: text, text, text, integer, integer
-
-    - __encode_table:
-       rettype: void
-       argument: text, text, text[], text, text[], text, text, integer, integer
-
-    - __find_best_split:
-       rettype: void
-       argument: text, double precision, text, integer, integer, text, integer, integer
-
-    - __format:
-       rettype: text
-       argument: text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text[]
-
-    - __gen_acc:
-       rettype: __gen_acc_time
-       argument: text, text, text, text, text, integer, integer, boolean, integer
-
-    - __gen_enc_meta_names:
-       rettype: text[]
-       argument: text, text
-
-    - __gen_horizontal_encoded_table:
-       rettype: void
-       argument: text, text, integer, integer
-
-    - __gen_vertical_encoded_table:
-       rettype: void
-       argument: text, text, text, boolean, integer
-
-    - __generate_final_tree:
-       rettype: void
-       argument: text
-
-    - __get_class_column_name:
-       rettype: text
-       argument: text
-
-    - __get_class_value:
-       rettype: text
-       argument: integer, text
-
-    - __get_classtable_name:
-       rettype: text
-       argument: text
-
-    - __get_column_value:
-       rettype: text
-       argument: integer, integer, character, text
-
-    - __get_feature_name:
-       rettype: text
-       argument: integer, text
-
-    - __get_feature_value:
-       rettype: text
-       argument: integer, integer, text
-
-    - __get_features_of_nodes:
-       rettype: text
-       argument: text, text, integer, integer, integer
-
-    - __get_id_column_name:
-       rettype: text
-       argument: text
-
-    - __get_schema_name:
-       rettype: text
-       argument: text
-
-    - __get_table_name:
-       rettype: text
-       argument: text
-
-    - __insert_into_metatable:
-       rettype: void
-       argument: text, integer, text, character, boolean, text, integer
-
-    - __is_valid_enc_table:
-       rettype: boolean
-       argument: text
-
-    - __num_of_class:
-       rettype: integer
-       argument: text
-
-    - __num_of_columns:
-       rettype: integer
-       argument: text
-
-    - __num_of_feature:
-       rettype: integer
-       argument: text
-
-    - __regclass_to_text:
-       rettype: text
-       argument: regclass
-
-    - __rename_table:
-       rettype: void
-       argument: text, text
-
-    - __rep_aggr_class_count_ffunc:
-       rettype: bigint[]
-       argument: bigint[]
-
-    - __rep_aggr_class_count_sfunc:
-       rettype: bigint[]
-       argument: bigint[], integer, integer, integer
-
-    - __rep_prune_tree:
-       rettype: void
-       argument: text, text, integer
-
-    - __sample_with_replacement:
-       rettype: void
-       argument: integer, bigint, text, text
-
-    - __sample_within_range:
-       rettype: SETOF bigint
-       argument: bigint, bigint, bigint
-
-    - __scv_aggr_ffunc:
-       rettype: double precision[]
-       argument: double precision[]
-
-    - __scv_aggr_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __scv_aggr_sfunc:
-       rettype: double precision[]
-       argument: double precision[], integer, boolean, integer, double precision[], double precision[], bigint
-
-    - __strip_schema_name:
-       rettype: text
-       argument: text
-
-    - __svm_random_ind2:
-       rettype: double precision[]
-       argument: integer
-
-    - __svm_random_ind:
-       rettype: double precision[]
-       argument: integer
-
-    - __svm_target_cl_func:
-       rettype: double precision
-       argument: double precision[]
-
-    - __svm_target_reg_func:
-       rettype: double precision
-       argument: double precision[]
-
-    - __table_exists:
-       rettype: boolean
-       argument: text
-
-    - __train_tree:
-       rettype: __train_result
-       argument: text, integer, integer, text, text, text, text, text, text, double precision, integer, double precision, double precision, double precision, boolean, integer, integer
-
-    - __treemodel_classify_internal:
-       rettype: text[]
-       argument: text, text, integer
-
-    - __treemodel_classify_internal_serial:
-       rettype: text[]
-       argument: text, text, integer
-
-    - __treemodel_display_no_ordered_aggr:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - __treemodel_display_with_ordered_aggr:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - __treemodel_get_vote_result:
-       rettype: void
-       argument: text, text
-
-    - __treemodel_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - __validate_input_table:
-       rettype: void
-       argument: text, text[], text, text
-
-    - __validate_metatable:
-       rettype: void
-       argument: text
-
-    - c45_classify:
-       rettype: c45_classify_result
-       argument: text, text, text
-
-    - c45_classify:
-       rettype: c45_classify_result
-       argument: text, text, text, integer
-
-    - c45_clean:
-       rettype: boolean
-       argument: text
-
-    - c45_display:
-       rettype: SETOF text
-       argument: text
-
-    - c45_display:
-       rettype: SETOF text
-       argument: text, integer
-
-    - c45_genrule:
-       rettype: SETOF text
-       argument: text
-
-    - c45_genrule:
-       rettype: SETOF text
-       argument: text, integer
-
-    - c45_score:
-       rettype: double precision
-       argument: text, text
-
-    - c45_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text, text, text, text, text, text, double precision, text
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text, text, text, text, text, text, double precision, text, integer, double precision, double precision, integer
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying, character varying
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying, character varying, boolean
-
-    - linear_svm_igd_transition:
-       rettype: double precision[]
-       argument: double precision[], double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean, boolean, double precision, double precision
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean, boolean, double precision, double precision, integer
-
-    - lsvm_predict:
-       rettype: double precision
-       argument: double precision[], double precision[]
-
-    - lsvm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text
-
-    - lsvm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text, boolean
-
-    - matrix_block_trans:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_densify:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_sparsify:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_trans:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text, boolean, integer
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text, integer
-
-    - rf_clean:
-       rettype: boolean
-       argument: text
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text, integer[]
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - rf_score:
-       rettype: double precision
-       argument: text, text
-
-    - rf_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - rf_train:
-       rettype: rf_train_result
-       argument: text, text, text
-
-    - rf_train:
-       rettype: rf_train_result
-       argument: text, text, text, integer, integer, double precision, text, text, text, text, text, integer, double precision, double precision, integer
-
-    - svdmf_run:
-       rettype: text
-       argument: text, text, text, text, integer
-
-    - svdmf_run:
-       rettype: text
-       argument: text, text, text, text, integer, integer, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, boolean, double precision, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, double precision
-
-    - svm_cls_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision, double precision
-
-    - svm_data_normalization:
-       rettype: void
-       argument: text
-
-    - svm_dot:
-       rettype: double precision
-       argument: double precision[], double precision[]
-
-    - svm_dot:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_drop_model:
-       rettype: void
-       argument: text
-
-    - svm_gaussian:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_generate_cls_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_generate_nd_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_generate_reg_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_nd_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], text, double precision, double precision, double precision
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text, boolean, double precision, double precision
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_polynomial:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_predict:
-       rettype: double precision
-       argument: schema_madlib.svm_model_rec, double precision[], text, double precision
-
-    - svm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text, boolean
-
-    - svm_predict_sub:
-       rettype: double precision
-       argument: integer, integer, double precision[], double precision[], double precision[], text, double precision
-
-    - svm_reg_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision, double precision, double precision
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision, double precision
-
-    - svm_store_model:
-       rettype: void
-       argument: text, text, text
-
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    - array_collapse:
-        rettype: anyarray
-        argument: anyarray
-    - linear_svm_igd_transition:
-        rettype: double precision[]
-        argument: double precision[], double precision[], double precision, double precision[], integer, double precision, double precision, boolean, integer, double precision, boolean
-    - profile:
-        rettype: SETOF schema_madlib.profile_result
-        argument: text
-    - profile_full:
-        rettype: SETOF schema_madlib.profile_result
-        argument: text, integer
-    - profile:
-        rettype: schema_madlib.profile_result
-        argument: text
-    - profile_full:
-        rettype: schema_madlib.profile_result
-        argument: text, integer
-    - quantile:
-        rettype: double precision
-        argument: text, text, double precision
-    - quantile_big:
-        rettype: double precision
-        argument: text, text, double precision
-
-# Changes to aggregates (UDA) including removal and modification
-# Overloaded functions should be mentioned separately
-uda:
-    # ----------------- Changes from 1.1 to 1.2 -----------------
-    # ----------------- Changes from 1.2 to 1.3 -----------------
-    - linregr:
-        rettype: schema_madlib.linregr_result
-        argument: double precision, double precision[]
-    # ----------------- Changes from 1.3 to 1.4 -----------------
-    - cox_prop_hazards_step:
-         rettype: double precision[]
-         argument: double precision[], double precision, boolean, double precision, double precision[], double precision[], double precision[]
-    - __lda_count_topic_agg:
-        rettype: integer[]
-        argument: integer[], integer[], integer[], integer, integer
-    - __lda_perplexity_agg:
-        rettype: double precision
-        argument: integer[], integer[], integer[], integer[], double precision, double precision, integer, integer
-    # ----------------- Changes from 1.4 to 1.4.1 ---------------
-    # ----------------- Changes from 1.4.1 to 1.5 ---------------
-    - lsvm_sgd_agg:
-        rettype: schema_madlib.lsvm_sgd_model_rec
-        argument: double precision[], double precision, double precision, double precision
-    - svm_cls_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision
-    - svm_nd_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], text, double precision, double precision
-    - svm_reg_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision, double precision
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    # - coxph_step: not exists in v1.1
-    # - coxph_strata_step_inner: not exists in v1.1
-    # - coxph_strata_step_outer: not exists in v1.1
-    # return type change
-    # - linregr: appeared before
-    # initcond change
-    - __mlogregr_irls_step:
-        rettype: double precision[]
-        argument: integer, integer, integer, double precision[], double precision[]
-    # make-ups from upgrade to v1.6
-    - marginal_logregr:
-        rettype: schema_madlib.marginal_logregr_result
-        argument: boolean, double precision[], double precision[]
-    - marginal_mlogregr:
-        rettype: schema_madlib.marginal_mlogregr_result
-        argument: integer, integer, integer, double precision[], double precision[]
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    - lmf_igd_step:
-        rettype: double precision[]
-        argument: smallint, smallint, double precision, double precision[], smallint, smallint, smallint, double precision, double precision
-    - __clustered_err_lin_step:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: double precision, double precision[], double precision[]
-    - __clustered_err_log_step:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: boolean, double precision[], double precision[]
-    - __clustered_err_mlog_step:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: integer, double precision[], double precision[], integer, integer
-    - dense_residual_norm:
-        rettype: schema_madlib.residual_norm_result
-        argument: double precision[], double precision, double precision[]
-    - heteroskedasticity_test_linregr:
-        rettype: schema_madlib.heteroskedasticity_test_result
-        argument: double precision, double precision[], double precision[]
-    - linregr:
-        rettype: schema_madlib.linregr_result
-        argument: double precision, double precision[]
-    - robust_linregr:
-        rettype: schema_madlib.robust_linregr_result
-        argument: double precision, double precision[], double precision[]
-    - weighted_sample:
-        rettype: double precision[]
-        argument: double precision[], double precision
-    - weighted_sample:
-        rettype: bigint
-        argument: bigint, double precision
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    - discrete_distribution_agg:
-        rettype: double precision[]
-        argument: integer, double precision, integer
-    - vectorized_distribution_agg:
-        rettype: double precision[]
-        argument: integer[], integer[]
-    # ----------------- Changes from 1.8 to 1.9 ----------
-
-    - __array_indexed_agg:
-        rettype: double precision[]
-        argument: double precision, bigint, bigint
-
-    - __best_scv_aggr:
-        rettype: double precision[]
-        argument: double precision[], integer, double precision
-
-    - __bigint_array_sum:
-        rettype: bigint[]
-        argument: bigint[]
-
-    - __display_tree_aggr:
-        rettype: text
-        argument: integer, boolean, text, text, double precision, double precision, text, integer
-
-    - __dt_acc_count_aggr:
-        rettype: bigint[]
-        argument: integer, bigint, integer
-
-    - __rep_aggr_class_count:
-        rettype: bigint[]
-        argument: integer, integer, integer
-
-    - __scv_aggr:
-        rettype: double precision[]
-        argument: integer, boolean, integer, double precision[], double precision[], bigint
-
-    - linear_svm_igd_step:
-        rettype: double precision[]
-        argument: double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - linear_svm_igd_step_serial:
-        rettype: double precision[]
-        argument: double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - svm_cls_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision, double precision
-
-    - svm_nd_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], text, double precision, double precision, double precision
-
-    - svm_reg_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision, double precision, double precision
-
-    - __svm_random_ind2:
-        rettype: double precision[]
-        argument: integer
-
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    - array_agg:
-        rettype: anyarray
-        argument: anyelement
-    - linear_svm_igd_step:
-       rettype: double precision[]
-       argument: double precision[], double precision, double precision[], integer, double precision, double precision, boolean, integer, double precision, boolean
-
-# Casts (UDC) updated/removed
-udc:
-    # ----------------- Changes from 1.1 to 1.2 -----------------
-    # ----------------- Changes from 1.2 to 1.3 -----------------
-    # ----------------- Changes from 1.3 to 1.4 -----------------
-    # ----------------- Changes from 1.4 to 1.4.1 ---------------
-    # ----------------- Changes from 1.4.1 to 1.5 ---------------
-    bool2text:
-        sourcetype: boolean
-        targettype: text
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-
-# Operators (UDO) removed/updated
-udo:
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    - '<':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '<=':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '<>':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '==':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '>=':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '>':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-
-# Operator Classes (UDOC) removed/updated
-udoc:
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    # removed
-    - svec_l2_ops:
-        index: btree
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------

[19/50] [abbrv] incubator-madlib git commit: Include boost::format in MathToolkit_impl.hpp.

Posted by ri...@apache.org.

Include boost::format in MathToolkit_impl.hpp.

Without this include, compilation fails on macOS Sierra with the following
error:

In file included from /tmp/incubator-madlib-e1c99c1/src/dbal/BoostIntegration/BoostIntegration.hpp:25:
/tmp/incubator-madlib-e1c99c1/src/dbal/BoostIntegration/MathToolkit_impl.hpp:56:22: error: no member named 'io' in
      namespace 'boost'
            % boost::io::group(std::setprecision(prec), inVal)
              ~~~~~~~^
/tmp/incubator-madlib-e1c99c1/src/dbal/BoostIntegration/MathToolkit_impl.hpp:55:31: error: no member named 'format' in
      namespace 'boost'
    std::string msg = (boost::format(inMessage)
                       ~~~~~~~^
2 errors generated.
make[2]: *** [src/ports/postgres/9.6/CMakeFiles/madlib_postgresql_9_6.dir/__/__/__/modules/assoc_rules/assoc_rules.cpp.o] Error 1
make[1]: *** [src/ports/postgres/9.6/CMakeFiles/madlib_postgresql_9_6.dir/all] Error 2
make: *** [all] Error 2


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/0e00a27f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/0e00a27f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/0e00a27f

Branch: refs/heads/latest_release
Commit: 0e00a27f9f854e20c75ac015f247dcaa9e01b9f6
Parents: f7cb980
Author: Lifepillar <li...@lifepillar.me>
Authored: Thu Nov 24 21:15:39 2016 +0100
Committer: Rahul Iyer <ri...@apache.org>
Committed: Thu Jan 26 15:58:24 2017 -0800

----------------------------------------------------------------------
 src/dbal/BoostIntegration/MathToolkit_impl.hpp | 1 +
 1 file changed, 1 insertion(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/0e00a27f/src/dbal/BoostIntegration/MathToolkit_impl.hpp
----------------------------------------------------------------------
diff --git a/src/dbal/BoostIntegration/MathToolkit_impl.hpp b/src/dbal/BoostIntegration/MathToolkit_impl.hpp
index 2239f14..a83b421 100644
--- a/src/dbal/BoostIntegration/MathToolkit_impl.hpp
+++ b/src/dbal/BoostIntegration/MathToolkit_impl.hpp
@@ -11,6 +11,7 @@
 
 #include <iomanip>
 
+#include <boost/format.hpp>
 #include <boost/math/policies/error_handling.hpp>
 
 namespace boost {

[22/50] [abbrv] incubator-madlib git commit: K-means: support for array input

Posted by ri...@apache.org.

K-means: support for array input

JIRA: MADLIB-1018

Adds support for array expressions as input data points. The function
collates the columns into a single array column in a temporary view.


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/08294791
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/08294791
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/08294791

Branch: refs/heads/latest_release
Commit: 08294791fbfcfef053c7a752bc87a42aeba117e1
Parents: 071128d
Author: Orhan Kislal <ok...@pivotal.io>
Authored: Wed Feb 1 14:04:38 2017 -0800
Committer: Orhan Kislal <ok...@pivotal.io>
Committed: Wed Feb 1 14:04:38 2017 -0800

----------------------------------------------------------------------
 src/ports/postgres/modules/kmeans/kmeans.py_in  | 58 ++++++++++++++++++++
 src/ports/postgres/modules/kmeans/kmeans.sql_in | 32 +++++++----
 .../postgres/modules/kmeans/test/kmeans.sql_in  | 10 +++-
 3 files changed, 86 insertions(+), 14 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/08294791/src/ports/postgres/modules/kmeans/kmeans.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/kmeans/kmeans.py_in b/src/ports/postgres/modules/kmeans/kmeans.py_in
index d99ddd8..da75d78 100644
--- a/src/ports/postgres/modules/kmeans/kmeans.py_in
+++ b/src/ports/postgres/modules/kmeans/kmeans.py_in
@@ -12,11 +12,15 @@ m4_changequote(`<!', `!>')
 """
 
 import plpy
+import re
 
 from utilities.control import IterationController2D
 from utilities.control_composite import IterationControllerComposite
 from utilities.validate_args import table_exists
+from utilities.validate_args import columns_exist_in_table
 from utilities.validate_args import table_is_empty
+from utilities.validate_args import get_expr_type
+from utilities.utilities import unique_string
 
 STATE_IN_MEM = m4_ifdef(<!__HAWQ__!>, <!True!>, <!False!>)
 HAS_FUNCTION_PROPERTIES = m4_ifdef(<!__HAS_FUNCTION_PROPERTIES__!>, <!True!>, <!False!>)
@@ -34,6 +38,31 @@ def kmeans_validate_src(schema_madlib, rel_source, **kwargs):
 
 # ----------------------------------------------------------------------
 
+def kmeans_validate_expr(schema_madlib, rel_source, expr_point, **kwargs):
+    """
+    Validation function for the expr_point parameter
+    expr_point accepts 2 formats:
+        - A single column name of a numeric array
+        - A numeric array expression
+    """
+
+    expr_type = get_expr_type(expr_point,rel_source).lower()
+
+    # Both formats should return a numeric array type
+    if expr_type in ['smallint[]', 'integer[]', 'bigint[]', 'decimal[]',
+                        'numeric[]', 'real[]', 'double precision[]',
+                        'serial[]', 'bigserial[]', 'float8[]']:
+
+        # An array expression should fail this check
+        if columns_exist_in_table(rel_source, [expr_point]):
+            return False
+        return True
+    else:
+        plpy.error(
+            """Kmeans error: {expr_point} is not a valid column or array!
+            """.format(**locals()))
+
+# ----------------------------------------------------------------------
 
 def compute_kmeanspp_seeding(schema_madlib, rel_args, rel_state, rel_source,
                              expr_point, **kwargs):
@@ -54,6 +83,16 @@ def compute_kmeanspp_seeding(schema_madlib, rel_args, rel_state, rel_source,
     @return The iteration number (i.e., the key) with which to look up the
         result in \c rel_state
     """
+
+    if kmeans_validate_expr(schema_madlib, rel_source, expr_point):
+        view_name = unique_string('km_view')
+
+        plpy.execute(""" CREATE TEMP VIEW {view_name} AS
+            SELECT {expr_point} AS expr FROM {rel_source}
+            """.format(**locals()))
+        rel_source = view_name
+        expr_point = 'expr'
+
     fn_dist_name = plpy.execute("SELECT fn_dist_name FROM " + rel_args)[0]['fn_dist_name']
     iterationCtrl = IterationController2D(
         rel_args=rel_args,
@@ -139,6 +178,15 @@ def compute_kmeans_random_seeding(schema_madlib, rel_args, rel_state,
     @return The iteration number (i.e., the key) with which to look up the
         result in \c rel_state
     """
+    if kmeans_validate_expr(schema_madlib, rel_source, expr_point):
+        view_name = unique_string('km_view')
+
+        plpy.execute(""" CREATE TEMP VIEW {view_name} AS
+            SELECT {expr_point} AS expr FROM {rel_source}
+            """.format(**locals()))
+        rel_source = view_name
+        expr_point = 'expr'
+
     iterationCtrl = IterationController2D(
         rel_args=rel_args,
         rel_state=rel_state,
@@ -211,6 +259,16 @@ def compute_kmeans(schema_madlib, rel_args, rel_state, rel_source,
     @return The iteration number (i.e., the key) with which to look up the
         result in \c rel_state
     """
+
+    if kmeans_validate_expr(schema_madlib, rel_source, expr_point):
+        view_name = unique_string('km_view')
+
+        plpy.execute(""" CREATE TEMP VIEW {view_name} AS
+            SELECT {expr_point} AS expr FROM {rel_source}
+            """.format(**locals()))
+        rel_source = view_name
+        expr_point = 'expr'
+
     fn_dist_name = plpy.execute("SELECT fn_dist_name FROM " +
                                 rel_args)[0]['fn_dist_name']
     iterationCtrl = IterationControllerComposite(

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/08294791/src/ports/postgres/modules/kmeans/kmeans.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/kmeans/kmeans.sql_in b/src/ports/postgres/modules/kmeans/kmeans.sql_in
index 2352bc1..16014ef 100644
--- a/src/ports/postgres/modules/kmeans/kmeans.sql_in
+++ b/src/ports/postgres/modules/kmeans/kmeans.sql_in
@@ -105,7 +105,7 @@ are skipped during analysis.
 </dd>
 
 <dt>expr_point</dt>
-<dd>TEXT. The name of the column with point coordinates.</dd>
+<dd>TEXT. The name of the column with point coordinates or an array expression.</dd>
 
 <dt>k</dt>
 <dd>INTEGER. The number of centroids to calculate.</dd>
@@ -148,20 +148,11 @@ Note: the final K-means algorithm is run on the complete dataset. This parameter
 only builds a subsample for the seeding and is only available for kmeans++.
 
 <dt>rel_initial_centroids</dt>
-<dd>TEXT. The set of initial centroids. The centroid relation is
-expected to be of the following form:
-<pre>
-{TABLE|VIEW} rel_initial_centroids (
-    ...
-    expr_centroid DOUBLE PRECISION[],
-    ...
-)
-</pre>
-where <em>expr_centroid</em> is the name of a column with coordinates.
+<dd>TEXT. The set of initial centroids.
 </dd>
 
 <dt>expr_centroid</dt>
-<dd>TEXT. The name of the column in the <em>rel_initial_centroids</em> relation that contains the centroid coordinates.</dd>
+<dd>TEXT. The name of the column (or the array expression) in the <em>rel_initial_centroids</em> relation that contains the centroid coordinates.</dd>
 
 <dt>initial_centroids</dt>
 <dd>TEXT. A string containing a DOUBLE PRECISION array expression with the initial centroid coordinates.</dd>
@@ -501,6 +492,13 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_validate_src(
 $$ LANGUAGE plpythonu
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
 
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_validate_expr(
+    rel_source      VARCHAR,
+    expr_point      VARCHAR
+) RETURNS BOOLEAN AS $$
+    PythonFunction(kmeans, kmeans, kmeans_validate_expr)
+$$ LANGUAGE plpythonu
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
 
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__seeding_validate_args(
     rel_source VARCHAR,
@@ -516,6 +514,11 @@ DECLARE
   rel_source_regclass REGCLASS;
   rel_filtered VARCHAR;
 BEGIN
+
+    -- Validate the expr_point input. Since we don't need a view at this
+    -- point, the output is safe to ignore.
+    PERFORM MADLIB_SCHEMA.__kmeans_validate_expr(rel_source,expr_point);
+
     rel_source_regclass := rel_source;
 
     IF (initial_centroids IS NOT NULL) THEN
@@ -532,6 +535,7 @@ BEGIN
         Number of clusters k must be <= 32767 (for results to be returned in a
         reasonable amount of time).';
     END IF;
+
     EXECUTE $sql$ SELECT count(*)
                   FROM $sql$ || textin(regclassout(rel_source_regclass)) || $sql$
                   WHERE abs(coalesce(MADLIB_SCHEMA.svec_elsum($sql$ || expr_point || $sql$), 'Infinity'::FLOAT8)) < 'Infinity'::FLOAT8 $sql$
@@ -1571,6 +1575,10 @@ BEGIN
 
     PERFORM MADLIB_SCHEMA.__kmeans_validate_src(rel_source);
 
+    -- Validate the expr_point input. Since we don't need a view at this
+    -- point, the output is safe to ignore.
+    PERFORM MADLIB_SCHEMA.__kmeans_validate_expr(rel_source,expr_point);
+
     class_rel_source := rel_source;
 
     proc_fn_dist := fn_dist

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/08294791/src/ports/postgres/modules/kmeans/test/kmeans.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/kmeans/test/kmeans.sql_in b/src/ports/postgres/modules/kmeans/test/kmeans.sql_in
index b95693b..072ecb3 100644
--- a/src/ports/postgres/modules/kmeans/test/kmeans.sql_in
+++ b/src/ports/postgres/modules/kmeans/test/kmeans.sql_in
@@ -28,7 +28,7 @@ FROM (
 ) AS centroids, generate_series(1,100) i;
 
 CREATE TABLE centroids AS
-SELECT position
+SELECT x,y,position
 FROM kmeans_2d
 ORDER BY random()
 LIMIT 10;
@@ -82,8 +82,14 @@ COPY km_sample (pid, points) FROM stdin DELIMITER '|';
 10 | {13.86, 1.35, 2.27, 16, 98, 2.98, 3.15, 0.22, 1.8500, 7.2199, 1.01, NULL, 1045}
 \.
 
-DROP TABLE IF EXISTS centroids;
 
 SELECT * FROM kmeanspp('km_sample', 'points', 2,
                        'MADLIB_SCHEMA.squared_dist_norm2',
                        'MADLIB_SCHEMA.avg', 20, 0.001);
+
+
+SELECT * FROM kmeans('kmeans_2d', 'array[x,y]', 'centroids', 'array[x,y]');
+SELECT * FROM kmeanspp('kmeans_2d', 'array[x,y]', 10);
+SELECT * FROM kmeans_random('kmeans_2d', 'arRAy [ x,y]', 10);
+
+DROP TABLE IF EXISTS centroids;

[03/50] [abbrv] incubator-madlib git commit: Utilities: Fix incorrect flag for distribution

Posted by ri...@apache.org.

Utilities: Fix incorrect flag for distribution


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/e1f37bb7
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/e1f37bb7
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/e1f37bb7

Branch: refs/heads/latest_release
Commit: e1f37bb7fbe7671bed4bdbaa13d7b76f071c00f1
Parents: 02f4602
Author: Rahul Iyer <ri...@apache.org>
Authored: Tue Jan 10 13:34:49 2017 -0800
Committer: Rahul Iyer <ri...@apache.org>
Committed: Tue Jan 10 13:35:02 2017 -0800

----------------------------------------------------------------------
 src/ports/postgres/modules/utilities/create_indicators.py_in | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/e1f37bb7/src/ports/postgres/modules/utilities/create_indicators.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/create_indicators.py_in b/src/ports/postgres/modules/utilities/create_indicators.py_in
index 68cf0d5..dbbc923 100644
--- a/src/ports/postgres/modules/utilities/create_indicators.py_in
+++ b/src/ports/postgres/modules/utilities/create_indicators.py_in
@@ -20,7 +20,7 @@ from validate_args import _get_table_schema_names
 from validate_args import get_first_schema
 
 m4_changequote(`<!', `!>')
-IS_POSTGRES = m4_ifdef(<!__POSTGRESQL__!>, <!True!>, <!False!>)
+is_postgresql = m4_ifdef(<!__POSTGRESQL__!>, <!True!>, <!False!>)
 # -----------------------------------------------------------------------
 # Deprecated functions corresponding to "create_indicators.sql_in"
 # -----------------------------------------------------------------------
@@ -74,11 +74,11 @@ def create_indicator_variables(schema_madlib, source_table, out_table,
                                 "as \"{0}_NULL\"".format(col_no_quotes))
         sql_list.append(" FROM " + source_table + ") ")
 
-        if IS_POSTGRES:
+        if not is_postgresql:
             if distributed_by:
                 dist_str = distributed_by
             else:
-                dist_str = ','.join(['"%s"'%i
+                dist_str = ','.join(['"%s"' % i
                                      for i in get_distribution_policy(source_table)
                                      if i is not None])
             if dist_str:

[44/50] [abbrv] incubator-madlib git commit: Release: Updates the release notes for v1.10.0.

Posted by ri...@apache.org.

Release: Updates the release notes for v1.10.0.

Closes #97


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/97e795dd
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/97e795dd
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/97e795dd

Branch: refs/heads/latest_release
Commit: 97e795dd49be94871a076d1019ccb5c2513d27a2
Parents: 90f4dc1
Author: Orhan Kislal <ok...@pivotal.io>
Authored: Fri Feb 10 11:36:25 2017 -0800
Committer: Orhan Kislal <ok...@pivotal.io>
Committed: Fri Feb 10 11:36:25 2017 -0800

----------------------------------------------------------------------
 RELEASE_NOTES | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/97e795dd/RELEASE_NOTES
----------------------------------------------------------------------
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index 6eb5c22..29d850c 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -9,6 +9,41 @@ commit history located at https://github.com/madlib/madlib/commits/master.
 
 Current list of bugs and issues can be found at https://issues.apache.org/jira/browse/MADLIB.
 —-------------------------------------------------------------------------
+MADlib v1.10.0
+
+Release Date: 2017-February-17
+
+New features:
+* New module: Graph - Single Source Shortest Path (SSSP) (MADLIB-992)
+    - Calculate the shortest path from a given vertex to every vertex in the graph.
+* New module: Encode categorical variables (MADLIB-1038)
+    - Completely new version for dummy/one-hot encoding of categorical variables with new name and different arguments.
+    - Previous version has been deprecated.
+* New module (early stage): K-Nearest Neighbors (KNN) (MADLIB-927)
+    - Find the k nearest neighbors based on the squared_dist_norm2 metric.
+* Elastic Net: Add grouping support (MADLIB-950)
+    - Elastic net train for both Gaussian and Binomial models, with FISTA
+    and IGD optimizations support grouping.
+    - Use active sets for FISTA, but active sets are used only after the
+    log-likelihood of all the groups becomes 0.
+* Elastic Net: Add cross validation (MADLIB-996)
+* PCA: Add grouping support (MADLIB-947)
+* PCA: Removed column id restriction.
+* Kmeans: Cluster variance for PivotalR support.
+* Kmeans: Support for array input. (MADLIB-1018)
+* DT and RF: Verbose option for the dot output format. (MADLIB-1051)
+* Association Rules: Add rule counts and limit itemset size feature (MADLIB-1044, MADLIB-1031)
+* Boost library has been upgraded from 1.47 to 1.61
+* Multiple improvements to the build system (madpack, cmake etc.) to support Semantic versioning and various versions of GPDB and HAWQ.
+
+Bug fixes:
+    - Pivot: Adjust the warning level to remove redundant messages.
+    - RF: Fix the online help and examples.
+    - Utilities: Fix incorrect flag for distribution.
+    - Install check: Update date format and remove hardcoded schema names.
+    - Multiple user documentation improvements.
+
+—-------------------------------------------------------------------------
 MADlib v1.9.1
 
 Release Date: 2016-August-25

[47/50] [abbrv] incubator-madlib git commit: Build: Add error for missing server includedir

Posted by ri...@apache.org.

Build: Add error for missing server includedir

JIRA: MADLIB-1065

Missing server headers are a common problem for new contributors, and
they lead to an unrelated error from CMake. This commit fixes that by
explicitly asking the user to check the server includedir.

Closes #102


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/b3495c50
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/b3495c50
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/b3495c50

Branch: refs/heads/latest_release
Commit: b3495c50bf491139ac245a21d97963e81892c610
Parents: 7055dce
Author: Rahul Iyer <ri...@apache.org>
Authored: Fri Feb 17 15:50:58 2017 -0800
Committer: Orhan Kislal <ok...@pivotal.io>
Committed: Fri Feb 17 15:50:58 2017 -0800

----------------------------------------------------------------------
 src/ports/postgres/cmake/FindPostgreSQL.cmake | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/b3495c50/src/ports/postgres/cmake/FindPostgreSQL.cmake
----------------------------------------------------------------------
diff --git a/src/ports/postgres/cmake/FindPostgreSQL.cmake b/src/ports/postgres/cmake/FindPostgreSQL.cmake
index eda277b..0f9663d 100644
--- a/src/ports/postgres/cmake/FindPostgreSQL.cmake
+++ b/src/ports/postgres/cmake/FindPostgreSQL.cmake
@@ -86,7 +86,6 @@ if(${PKG_NAME}_PG_CONFIG)
         OUTPUT_VARIABLE ${PKG_NAME}_SERVER_INCLUDE_DIR
         OUTPUT_STRIP_TRAILING_WHITESPACE
     )
-
     execute_process(COMMAND ${${PKG_NAME}_PG_CONFIG} --includedir
         OUTPUT_VARIABLE ${PKG_NAME}_CLIENT_INCLUDE_DIR
         OUTPUT_STRIP_TRAILING_WHITESPACE
@@ -97,7 +96,7 @@ if(${PKG_NAME}_PG_CONFIG AND ${PKG_NAME}_SERVER_INCLUDE_DIR)
     set(${PKG_NAME}_VERSION_MAJOR 0)
     set(${PKG_NAME}_VERSION_MINOR 0)
     set(${PKG_NAME}_VERSION_PATCH 0)
-    
+
     set(CONFIG_FILE ${${PKG_NAME}_SERVER_INCLUDE_DIR}/pg_config.h)
 
     if(EXISTS ${CONFIG_FILE})
@@ -112,7 +111,6 @@ if(${PKG_NAME}_PG_CONFIG AND ${PKG_NAME}_SERVER_INCLUDE_DIR)
         else(${CMAKE_COMPILER_IS_GNUCC})
             file(READ ${CONFIG_FILE} _PG_CONFIG_HEADER_CONTENTS)
         endif(${CMAKE_COMPILER_IS_GNUCC})
-        
 
 		# Get PACKAGE_NAME
 		if (_PG_CONFIG_HEADER_CONTENTS MATCHES "#define PACKAGE_NAME \".*\"")
@@ -159,6 +157,10 @@ if(${PKG_NAME}_PG_CONFIG AND ${PKG_NAME}_SERVER_INCLUDE_DIR)
 			endif(_PG_CONFIG_HEADER_CONTENTS MATCHES "#define ${_PG_CONFIG_VERSION_MACRO} ([0-9]+).*")
 
         endif(${PKG_NAME}_VERSION_NUM MATCHES "^[0-9]+$")
+    else(EXISTS ${CONFIG_FILE})
+        message(FATAL_ERROR "Found pg_config (\"${${PKG_NAME}_PG_CONFIG}\"), "
+              "but pg_config.h file not present in the "
+              "server include dir (${${PKG_NAME}_SERVER_INCLUDE_DIR}).")
     endif(EXISTS ${CONFIG_FILE})
 
     if(_PACKAGE_NAME STREQUAL "${_NEEDED_PG_CONFIG_PACKAGE_NAME}")

[13/50] [abbrv] incubator-madlib git commit: Association Rules: Add rule counts and limit itemset size feature

Posted by ri...@apache.org.

Association Rules: Add rule counts and limit itemset size feature

JIRA: MADLIB-1044, MADLIB-1031

- The output table currently lists support, along with other columns.
The output table now contains a count column too, which is essentially
support*number_of_transactions for any rule. This could also have been
obtained by post-processing the original output table,
but listing it explicitly might be useful.
- Add a new optional parameter named max_itemset_size that determines
the maximum size of frequent itemsets that are used for generating
association rules. Must be 2 or more, and the default behavior is to
generate itemsets of all sizes. This parameter can be used to reduce
run time for data sets where itemset size is large.
- This commit also includes improved online and user documentation
with examples of usage.

Closes #87


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/8e5da2ff
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/8e5da2ff
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/8e5da2ff

Branch: refs/heads/latest_release
Commit: 8e5da2ff053ca49b3c08bdb5e540a3370f6e8d09
Parents: e384c1f
Author: Nandish Jayaram <nj...@users.noreply.github.com>
Authored: Wed Jan 25 16:44:58 2017 -0800
Committer: Nandish Jayaram <nj...@users.noreply.github.com>
Committed: Wed Jan 25 16:44:58 2017 -0800

----------------------------------------------------------------------
 .../modules/assoc_rules/assoc_rules.py_in       | 132 ++++++++++++-
 .../modules/assoc_rules/assoc_rules.sql_in      | 186 +++++++++++++++----
 .../modules/assoc_rules/test/assoc_rules.sql_in |   9 +
 3 files changed, 282 insertions(+), 45 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8e5da2ff/src/ports/postgres/modules/assoc_rules/assoc_rules.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/assoc_rules/assoc_rules.py_in b/src/ports/postgres/modules/assoc_rules/assoc_rules.py_in
index d5eee48..abc8b50 100644
--- a/src/ports/postgres/modules/assoc_rules/assoc_rules.py_in
+++ b/src/ports/postgres/modules/assoc_rules/assoc_rules.py_in
@@ -7,7 +7,6 @@
 
 @namespace assoc_rules
 """
-
 import time
 import plpy
 from utilities.validate_args import columns_exist_in_table
@@ -55,13 +54,19 @@ def __float_le(val1, val2):
 @param input_table     name of the table where the data is stored
 @param output_schema   name of the schema where the final results will be stored
 @param verbose         determining if output contains comments
+@param max_itemset_size determines the maximum size of frequent itemsets allowed
+                        to generate association rules from
 """
 def assoc_rules(madlib_schema, support, confidence, tid_col,
-                item_col, input_table, output_schema, verbose):
+                item_col, input_table, output_schema, verbose, max_itemset_size):
 
     begin_func_exec = time.time();
     begin_step_exec = time.time();
     cal_itemsets_time = 0;
+    if max_itemset_size is None:
+        max_itemset_size = float('inf')
+    elif max_itemset_size <= 1:
+        plpy.error("ERROR: max_itemset_size has to be greater than 1.")
 
     #check parameters
     __assert(
@@ -113,6 +118,7 @@ def assoc_rules(madlib_schema, support, confidence, tid_col,
             ruleId      INT,
             pre         TEXT[],
             post        TEXT[],
+            count       INT,
             support     FLOAT8,
             confidence  FLOAT8,
             lift        FLOAT8,
@@ -304,7 +310,7 @@ def assoc_rules(madlib_schema, support, confidence, tid_col,
 
     iter = 0;
 
-    while num_item_loop > 0 :
+    while num_item_loop > 0 and iter < max_itemset_size:
         begin_step_exec = time.time();
         iter = iter + 1;
 
@@ -459,14 +465,14 @@ def assoc_rules(madlib_schema, support, confidence, tid_col,
 
         plpy.execute("""
              INSERT INTO {0}.assoc_rules
-             SELECT t1.ruleId, t2.pre, t3.post, support,
+             SELECT t1.ruleId, t2.pre, t3.post, t1.support*{1}::INT AS count, support,
                     confidence, lift, conviction
              FROM
                 assoc_rules_aux_tmp t1,
                 pre_tmp_table t2,
                 post_tmp_table t3
              WHERE t1.ruleId = t2.ruleId AND t1.ruleId = t3.ruleId
-             """.format(output_schema)
+             """.format(output_schema, num_tranx)
              );
 
         # if in verbose mode, we will keep all the intermediate tables
@@ -504,3 +510,119 @@ def assoc_rules(madlib_schema, support, confidence, tid_col,
             total_rules,
             time.time() - begin_func_exec
            );
+
+def assoc_rules_help_message(schema_madlib, message=None, **kwargs):
+    """
+    Given a help string, provide usage information
+
+    Args:
+        @param schema_madlib Name of the MADlib schema
+        @param message  Helper message to print
+
+    Returns:
+        str, the usage, example, or overview text selected by message
+    """
+    if message is not None and \
+            message.lower() in ("usage", "help", "?"):
+        return """
+-----------------------------------------------------------------------
+                                USAGE
+-----------------------------------------------------------------------
+SELECT {schema_madlib}.assoc_rules(
+    support,            -- FLOAT8, minimum level of support needed for each itemset to be included in result
+    confidence,         -- FLOAT8, minimum level of confidence needed for each rule to be included in result
+    tid_col,            -- TEXT, name of the column storing the transaction ids
+    item_col,           -- TEXT, name of the column storing the products
+    input_table,        -- TEXT, name of the table containing the input data
+    output_schema,      -- TEXT, name of the schema where the final results will be stored.
+                                The schema must be created before calling the function.  Alternatively, use
+                                <tt>NULL</tt> to output to the current schema.
+    verbose,            -- BOOLEAN, (optional, default: False) determines if details are printed for each
+                                iteration as the algorithm progresses
+    max_itemset_size    -- INTEGER, (optional, default: itemsets of all sizes) determines the maximum size of frequent
+                                itemsets allowed that are used for generating association rules. Value less
+                                than 2 throws an error.
+);
+-------------------------------------------------------------------------
+                                OUTPUT TABLES
+-------------------------------------------------------------------------
+The output table "assoc_rules" in the "output_schema" contains a unique rule of the form "If X, then Y
+(i.e., X => Y)" in each row. X and Y are non-empty itemsets, called the antecedent and consequent, or
+the left-hand-side (LHS) and right-hand-side (RHS), of the rule respectively.
+
+Each row has the following columns:
+    ruleid,     -- INTEGER, row number
+    pre,        -- TEXT, specifies the antecedent, or the LHS of the rule
+    post,       -- TEXT, specifies the consequent, or the RHS of the rule
+    support,    -- DOUBLE, support of the frequent itemset X,Y
+    count,      -- INTEGER, number of transactions in the input table that contain X,Y
+    confidence, -- DOUBLE, the ratio of number of transactions that contain X,Y to the number of transactions
+                        that contain X
+    lift,       -- DOUBLE, the ratio of observed support of X,Y to the expected support of X,Y, assuming X and
+                        Y are independent.
+    conviction  -- DOUBLE, the ratio of expected support of X occurring without Y assuming X and Y are
+                        independent, to the observed support of X occurring without Y
+        """.format(schema_madlib=schema_madlib)
+    else:
+        if message.lower() in ("example", "examples"):
+            return """
+------------------------------------------------------------------------
+                                EXAMPLES
+------------------------------------------------------------------------
+DROP TABLE IF EXISTS test_data;
+CREATE TABLE test_data (
+    trans_id INT,
+    product TEXT
+);
+INSERT INTO test_data VALUES (1, 'beer');
+INSERT INTO test_data VALUES (1, 'diapers');
+INSERT INTO test_data VALUES (1, 'chips');
+INSERT INTO test_data VALUES (2, 'beer');
+INSERT INTO test_data VALUES (2, 'diapers');
+INSERT INTO test_data VALUES (3, 'beer');
+INSERT INTO test_data VALUES (3, 'diapers');
+INSERT INTO test_data VALUES (4, 'beer');
+INSERT INTO test_data VALUES (4, 'chips');
+INSERT INTO test_data VALUES (5, 'beer');
+INSERT INTO test_data VALUES (6, 'beer');
+INSERT INTO test_data VALUES (6, 'diapers');
+INSERT INTO test_data VALUES (6, 'chips');
+INSERT INTO test_data VALUES (7, 'beer');
+INSERT INTO test_data VALUES (7, 'diapers');
+
+Find all association rules with support and confidence values of
+at least 0.25 and 0.5 respectively.
+
+SELECT * FROM {schema_madlib}.assoc_rules( .25,
+                          .5,
+                          'trans_id',
+                          'product',
+                          'test_data',
+                          NULL,
+                          TRUE
+                        );
+
+View output results:
+SELECT * FROM assoc_rules;
+
+Find association rules generated from itemsets of size at most 2,
+with support and confidence values of at least 0.25 and 0.5 respectively.
+
+SELECT * FROM {schema_madlib}.assoc_rules( .25,
+                          .5,
+                          'trans_id',
+                          'product',
+                          'test_data',
+                          NULL,
+                          TRUE,
+                          2
+                        );
+
+View output results:
+SELECT * FROM assoc_rules;
+            """.format(schema_madlib=schema_madlib)
+        else:
+            return """
+For an overview on usage, run: SELECT {schema_madlib}.assoc_rules('usage');
+For an example of using assoc_rules, run: SELECT {schema_madlib}.assoc_rules('example');
+            """.format(schema_madlib=schema_madlib)

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8e5da2ff/src/ports/postgres/modules/assoc_rules/assoc_rules.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/assoc_rules/assoc_rules.sql_in b/src/ports/postgres/modules/assoc_rules/assoc_rules.sql_in
index 99b8f00..fc87b16 100644
--- a/src/ports/postgres/modules/assoc_rules/assoc_rules.sql_in
+++ b/src/ports/postgres/modules/assoc_rules/assoc_rules.sql_in
@@ -43,7 +43,7 @@ support and confidence values, this function generates all single and
 multidimensional association rules that meet the minimum thresholds.
 
 Association rule mining is a widely used technique for discovering relationships
-between variables in a large data set (e.g items in a store that are commonly
+between variables in a large data set (e.g., items in a store that are commonly
 purchased together). The classic market basket analysis example using
 association rules is the "beer and diapers" rule. According to data mining urban
 legend, a study of customer purchase behavior in a supermarket found that men
@@ -62,7 +62,7 @@ data is stored in two columns with one item and transaction id per row.
 Transactions with multiple items will span multiple rows with one row per item.
 
 <pre>
-     tran_id | product
+    trans_id | product
     ---------+---------
            1 | 1
            1 | 2
@@ -81,18 +81,28 @@ Transactions with multiple items will span multiple rows with one row per item.
 @par Rules
 
 Association rules take the form "If X, then Y", where X and Y are non-empty
-itemsets. X and Y are called the antecedent and consequent, or the left-hand-side 
+itemsets. X and Y are called the antecedent and consequent, or the left-hand-side
 and right-hand-side, of the rule respectively. Using our previous example,
 the association rule may state "If {diapers}, then {beer}" with .2 support and
 .85 confidence.
 
-Given any association rule "If X, then Y", the association rules function will
-also calculate the following metrics:
+The following metrics are defined for any given itemset "X".
+- Count: The number of transactions that contain X
+
 - Support: The ratio of transactions that contain X to all transactions, T
 \f[
 S (X) = \frac{Total X}{Total transactions}
 \f]
 
+Given any association rule "If X, then Y", the association rules function will
+also calculate the following metrics:
+- Count: The number of transactions that contain X,Y
+
+- Support: The ratio of transactions that contain X,Y to all transactions, T
+\f[
+S (X \Rightarrow Y) = \frac{Total(X \cup Y)}{Total transactions}
+\f]
+
 - Confidence: The ratio of transactions that contain \f$ X,Y \f$ to
 transactions that contain \f$ X \f$. One could view this metric as the
 conditional probability of \f$ Y \f$ , given \f$ X \f$ . \f$ P(Y|X) \f$
@@ -124,7 +134,7 @@ Conv (X \Rightarrow Y) = \frac{1 - S(Y)}{1 - C(X \Rightarrow Y)}
 @par Apriori Algorithm
 
 Although there are many algorithms that generate association rules, the classic
-algorithm used is called Apriori [1] which we have implemented in this module. It is a
+algorithm is called Apriori [1] which we have implemented in this module. It is a
 breadth-first search, as opposed to depth-first searches like Eclat. Frequent
 itemsets of order \f$ n \f$ are generated from sets of order \f$ n - 1 \f$.
 Using the downward closure property, all sets must have frequent subsets. There
@@ -143,7 +153,7 @@ itemsets of order \f$ n - 1 \f$.
 This is done by doing the union of two itemsets that have identical items except one.
 -# Eliminate itemsets that have (n-1) order subsets with insufficient support.
 -# Eliminate itemsets with insufficient support.
--# Repeat until itemsets cannot be generated.
+-# Repeat until itemsets cannot be generated, or maximum itemset size is exceeded.
 
 \e Association \e rule \e generation
 
@@ -162,7 +172,8 @@ assoc_rules( support,
              item_col,
              input_table,
              output_schema,
-             verbose
+             verbose,
+             max_itemset_size
            );</pre>
 This generates all association rules that satisfy the specified minimum
 <em>support</em> and <em>confidence</em>.
@@ -170,19 +181,19 @@ This generates all association rules that satisfy the specified minimum
 \b Arguments
 <dl class="arglist">
   <dt>support</dt>
-  <dd>The minimum level of support needed for each itemset to be included in result.</dd>
+  <dd>Minimum level of support needed for each itemset to be included in result.</dd>
 
   <dt>confidence</dt>
-  <dd>The minimum level of confidence needed for each rule to be included in result.</dd>
+  <dd>Minimum level of confidence needed for each rule to be included in result.</dd>
 
   <dt>tid_col</dt>
-  <dd>The name of the column storing the transaction ids.</dd>
+  <dd>Name of the column storing the transaction ids.</dd>
 
   <dt>item_col</dt>
-  <dd>The name of the column storing the products.</dd>
+  <dd>Name of the column storing the products.</dd>
 
   <dt>input_table</dt>
-  <dd>The name of the table containing the input data.
+  <dd>Name of the table containing the input data.
 
   The input data is expected to be of the following form:
 <pre>{TABLE|VIEW} <em>input_table</em> (
@@ -199,7 +210,7 @@ This generates all association rules that satisfy the specified minimum
   The schema must be created before calling the function.  Alternatively, use
   <tt>NULL</tt> to output to the current schema.
 
-  The results containing the rules, support, confidence, lift, and
+  The results containing the rules, support, count, confidence, lift, and
   conviction are stored in the table \c assoc_rules in the schema
   specified by \c output_schema.
 
@@ -218,6 +229,10 @@ This generates all association rules that satisfy the specified minimum
         <td>text</td>
       </tr>
       <tr>
+        <th>count</th>
+        <td>integer</td>
+      </tr>
+      <tr>
         <th>support</th>
         <td>double</td>
       </tr>
@@ -243,15 +258,20 @@ This generates all association rules that satisfy the specified minimum
   </dd>
 
   <dt>verbose</dt>
-  <dd>BOOLEAN, default FALSE. Determines if details are printed for each iteration
+  <dd>BOOLEAN, default: FALSE. Determines if details are printed for each iteration
   as the algorithm progresses.</dd>
+
+  <dt>max_itemset_size</dt>
+  <dd>INTEGER, default: generate itemsets of all sizes. Determines the maximum size of frequent
+  itemsets that are used for generating association rules. Must be 2 or more.
+  This parameter can be used to reduce run time for data sets where itemset size is large. </dd>
 </dl>
 
 
 @anchor examples
 @examp
 
-Let us take a look at some sample transactional data and generate association rules.
+Let's look at some sample transactional data and generate association rules.
 
 -# Create an input dataset:
 <pre class="example">
@@ -283,20 +303,20 @@ In this example we set verbose to
 TRUE so that we have some insight into progress of the function. We
 can now generate association rules as follows:
 <pre class="example">
-SELECT * FROM madlib.assoc_rules( .25,
-                                  .5,
-                                  'trans_id',
-                                  'product',
-                                  'test_data',
-                                  NULL,
-                                  TRUE
+SELECT * FROM madlib.assoc_rules( .25,            -- Support
+                                  .5,             -- Confidence
+                                  'trans_id',     -- Transaction id col
+                                  'product',      -- Product col
+                                  'test_data',    -- Input data
+                                  NULL,           -- Output schema
+                                  TRUE            -- Verbose output
                                 );
 </pre>
 Result (iteration details not shown):
 <pre class="result">
  output_schema | output_table | total_rules |   total_time    
 ---------------+--------------+-------------+-----------------
- public        | assoc_rules  |           7 | 00:00:00.028534
+ public        | assoc_rules  |           7 | 00:00:00.569254
 (1 row)
 </pre>
 The association rules are stored in the assoc_rules table:
@@ -306,18 +326,53 @@ ORDER BY support DESC, confidence DESC;
 </pre>
 Result:
 <pre class="result">
- ruleid |       pre       |      post      |      support      |    confidence     |       lift        |    conviction     
---------+-----------------+----------------+-------------------+-------------------+-------------------+-------------------
-      4 | {diapers}       | {beer}         | 0.714285714285714 |                 1 |                 1 |                 0
-      3 | {beer}          | {diapers}      | 0.714285714285714 | 0.714285714285714 |                 1 |                 1
-      1 | {chips}         | {beer}         | 0.428571428571429 |                 1 |                 1 |                 0
-      7 | {diapers,chips} | {beer}         | 0.285714285714286 |                 1 |                 1 |                 0
-      2 | {chips}         | {diapers}      | 0.285714285714286 | 0.666666666666667 | 0.933333333333333 | 0.857142857142857
-      5 | {chips}         | {beer,diapers} | 0.285714285714286 | 0.666666666666667 | 0.933333333333333 | 0.857142857142857
-      6 | {beer,chips}    | {diapers}      | 0.285714285714286 | 0.666666666666667 | 0.933333333333333 | 0.857142857142857
+ ruleid |       pre       |      post      | count |      support      |    confidence     |       lift        |    conviction     
+--------+-----------------+----------------+-------+-------------------+-------------------+-------------------+-------------------
+      2 | {diapers}       | {beer}         |     5 | 0.714285714285714 |                 1 |                 1 |                 0
+      6 | {beer}          | {diapers}      |     5 | 0.714285714285714 | 0.714285714285714 |                 1 |                 1
+      5 | {chips}         | {beer}         |     3 | 0.428571428571429 |                 1 |                 1 |                 0
+      4 | {chips,diapers} | {beer}         |     2 | 0.285714285714286 |                 1 |                 1 |                 0
+      1 | {chips}         | {diapers,beer} |     2 | 0.285714285714286 | 0.666666666666667 | 0.933333333333333 | 0.857142857142857
+      7 | {chips}         | {diapers}      |     2 | 0.285714285714286 | 0.666666666666667 | 0.933333333333333 | 0.857142857142857
+      3 | {beer,chips}    | {diapers}      |     2 | 0.285714285714286 | 0.666666666666667 | 0.933333333333333 | 0.857142857142857
 (7 rows)
 </pre>
 
+-# Limit association rules to those generated from itemsets of size at most 2:
+<pre class="example">
+SELECT * FROM madlib.assoc_rules( .25,            -- Support
+                                  .5,             -- Confidence
+                                  'trans_id',     -- Transaction id col
+                                  'product',      -- Product col
+                                  'test_data',    -- Input data
+                                  NULL,           -- Output schema
+                                  TRUE,           -- Verbose output
+                                  2               -- Max itemset size
+                                );
+</pre>
+Result (iteration details not shown):
+<pre class="result">
+ output_schema | output_table | total_rules |   total_time    
+---------------+--------------+-------------+-----------------
+ public        | assoc_rules  |           4 | 00:00:00.565176
+(1 row)
+</pre>
+The association rules are again stored in the assoc_rules table:
+<pre class="example">
+SELECT * FROM assoc_rules
+ORDER BY support DESC, confidence DESC;
+</pre>
+Result:
+<pre class="result">
+ ruleid |    pre    |   post    | count |      support      |    confidence     |       lift        |    conviction     
+--------+-----------+-----------+-------+-------------------+-------------------+-------------------+-------------------
+      1 | {diapers} | {beer}    |     5 | 0.714285714285714 |                 1 |                 1 |                 0
+      2 | {beer}    | {diapers} |     5 | 0.714285714285714 | 0.714285714285714 |                 1 |                 1
+      3 | {chips}   | {beer}    |     3 | 0.428571428571429 |                 1 |                 1 |                 0
+      4 | {chips}   | {diapers} |     2 | 0.285714285714286 | 0.666666666666667 | 0.933333333333333 | 0.857142857142857
+(4 rows)
+</pre>
+
 -# Post-processing can now be done on the output table in the case that
 you want to filter the results.  For example, if you want any single item on the left hand side 
 and a particular item on the right hand side:
@@ -326,10 +381,10 @@ SELECT * FROM assoc_rules WHERE array_upper(pre,1) = 1 AND post = array['beer'];
 </pre>
 Result:
 <pre class="result">
- ruleid |    pre    |  post  |      support      | confidence | lift | conviction 
---------+-----------+--------+-------------------+------------+------+------------
-      1 | {chips}   | {beer} | 0.428571428571429 |          1 |    1 |          0
-      4 | {diapers} | {beer} | 0.714285714285714 |          1 |    1 |          0
+ ruleid |    pre    |  post  | count |      support      | confidence | lift | conviction 
+--------+-----------+--------+-------+-------------------+------------+------+------------
+      1 | {diapers} | {beer} |     5 | 0.714285714285714 |          1 |    1 |          0
+      3 | {chips}   | {beer} |     3 | 0.428571428571429 |          1 |    1 |          0
 (2 rows)
 </pre>
 
@@ -433,7 +488,8 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.assoc_rules
     item_col TEXT,
     input_table TEXT,
     output_schema TEXT,
-    verbose BOOLEAN
+    verbose BOOLEAN,
+    max_itemset_size INTEGER
    )
 RETURNS MADLIB_SCHEMA.assoc_rules_results
 AS $$
@@ -451,7 +507,8 @@ AS $$
         item_col,
         input_table,
         output_schema,
-        verbose
+        verbose,
+        max_itemset_size
         );
 
 $$ LANGUAGE plpythonu
@@ -488,8 +545,57 @@ AS $$
         item_col,
         input_table,
         output_schema,
-        False
+        False,
+        'NULL'
+        );
+
+$$ LANGUAGE plpythonu
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.assoc_rules
+    (
+    support FLOAT8,
+    confidence FLOAT8,
+    tid_col TEXT,
+    item_col TEXT,
+    input_table TEXT,
+    output_schema TEXT,
+    verbose BOOLEAN
+    )
+RETURNS MADLIB_SCHEMA.assoc_rules_results
+AS $$
+
+    PythonFunctionBodyOnly(`assoc_rules', `assoc_rules')
+
+    plpy.execute("SET client_min_messages = error;")
+
+    # schema_madlib comes from PythonFunctionBodyOnly
+    return assoc_rules.assoc_rules(
+        schema_madlib,
+        support,
+        confidence,
+        tid_col,
+        item_col,
+        input_table,
+        output_schema,
+        verbose,
+        'NULL'
         );
 
 $$ LANGUAGE plpythonu
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+--------------------------------------------------------------------------
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.assoc_rules(message TEXT)
+RETURNS text AS $$
+PythonFunction(assoc_rules, assoc_rules, assoc_rules_help_message)
+$$ language plpythonu
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.assoc_rules()
+RETURNS text AS $$
+    SELECT MADLIB_SCHEMA.assoc_rules('');
+$$ language SQL
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8e5da2ff/src/ports/postgres/modules/assoc_rules/test/assoc_rules.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/assoc_rules/test/assoc_rules.sql_in b/src/ports/postgres/modules/assoc_rules/test/assoc_rules.sql_in
index 59aba05..5a6275e 100644
--- a/src/ports/postgres/modules/assoc_rules/test/assoc_rules.sql_in
+++ b/src/ports/postgres/modules/assoc_rules/test/assoc_rules.sql_in
@@ -28,6 +28,7 @@ declare
     result1        TEXT;
     result2        TEXT;
     result3        TEXT;
+    result_maxiter TEXT;
     res            MADLIB_SCHEMA.assoc_rules_results;
     output_schema  TEXT;
     output_table   TEXT;
@@ -137,6 +138,10 @@ begin
           abs(t1.support - t2.support) < 1E-10 AND
           abs(t1.confidence - t2.confidence) < 1E-10;
 
+    PERFORM MADLIB_SCHEMA.assoc_rules (.1, .5, 'trans_id', 'product', 'test_data2','madlib_installcheck_assoc_rules', false, 2);
+    SELECT INTO result_maxiter CASE WHEN count(*) = 4 then 'PASS' ELSE 'FAIL' END
+    FROM assoc_rules;
+
     DROP TABLE IF EXISTS test_data1;
     DROP TABLE IF EXISTS test_data2;
     DROP TABLE IF EXISTS test2_exp_result;
@@ -150,6 +155,10 @@ begin
         RAISE EXCEPTION 'Association rules mining failed. No results were returned.';
     END IF;
 
+    IF result_maxiter = 'FAIL' THEN
+        RAISE EXCEPTION 'Association rules mining error when max_itemset_size parameter is specified.';
+    END IF;
+
     RAISE INFO 'Association rules install check passed.';
     RETURN;

[15/50] [abbrv] incubator-madlib git commit: Update dateformat in multiple install-checks

Posted by ri...@apache.org.

Update dateformat in multiple install-checks


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/13203baa
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/13203baa
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/13203baa

Branch: refs/heads/latest_release
Commit: 13203baac10fe9b83536b983828f3ebf7cbc7e6f
Parents: 9d04b7d
Author: Rahul Iyer <ri...@apache.org>
Authored: Wed Jan 25 17:42:22 2017 -0800
Committer: Rahul Iyer <ri...@apache.org>
Committed: Wed Jan 25 17:42:57 2017 -0800

----------------------------------------------------------------------
 .../postgres/modules/utilities/test/path.sql_in | 76 ++++++++---------
 .../modules/utilities/test/sessionize.sql_in    | 87 ++++++++++----------
 2 files changed, 82 insertions(+), 81 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/13203baa/src/ports/postgres/modules/utilities/test/path.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/test/path.sql_in b/src/ports/postgres/modules/utilities/test/path.sql_in
index bd9e840..8397fca 100644
--- a/src/ports/postgres/modules/utilities/test/path.sql_in
+++ b/src/ports/postgres/modules/utilities/test/path.sql_in
@@ -39,41 +39,41 @@ CREATE TABLE "Weblog" (event_timestamp TIMESTAMP,
             "Margin" FLOAT);
 
 INSERT INTO "Weblog" VALUES
-('04/14/2012 23:43:00', 102201, 3, 3, 'Female', 'East', 3, 1, 1, 112, 36),
-('04/14/2012 23:56:00', 101881, 2, 4, 'Male', 'West', 5, 0, 0, 0, 0),
-('04/15/2012 01:04:00', 100821, 1, 4, 'Unknown', 'West', 3, 0, 0, 0, 0),
-('04/15/2012 01:15:00', 101121, 2, 2, 'Unknown', 'West', 4, 0, 0, 0, 0),
-('04/15/2012 02:53:00', 102201, 3, 3, 'Female', 'East', 3, 1, 1, 117, 28),
-('04/15/2012 04:11:00', 103711, 4, 3, 'Female', 'Central', 5, 0, 0, 0, 0),
-('04/15/2012 04:25:00', 100821, 1, 4, 'Unknown', 'West', 3, 1, 1, 91, 28),
-('04/15/2012 06:26:00', 102871, 3, 4, 'Female', 'Central', 5, 0, 0, 0, 0),
-('04/15/2012 06:32:00', 100821, 1, 4, 'Unknown', 'West', 3, 0, 0, 0, 0),
-('04/15/2012 07:02:00', 100821, 1, 4, 'Unknown', 'West', 3, 1, 1, 118, 39),
-('04/15/2012 08:51:00', 102201, 3, 3, 'Female', 'East', 3, 0, 0, 0, 0),
-('04/15/2012 09:28:00', 101121, 2, 2, 'Unknown', 'West', 4, 1, 1, 103, 32),
-('04/15/2012 10:19:00', 103711, 4, 3, 'Female', 'Central', 5, 0, 0, 0, 0),
-('04/15/2012 11:40:00', 100821, 1, 4, 'Unknown', 'West', 3, 0, 0, 0, 0),
-('04/15/2012 12:58:00', 101121, 2, 2, 'Unknown', 'West', 4, 1, 1, 148, 23),
-('04/15/2012 14:18:00', 101121, 2, 2, 'Unknown', 'West', 4, 1, 1, 113, 29),
-('04/15/2012 22:20:00', 101121, 2, 2, 'Unknown', 'West', 4, 1, 1, 108, 38),
-('04/15/2012 23:13:00', 102201, 3, 3, 'Female', 'East', 3, 0, 0, 0, 0),
-('04/15/2012 23:14:00', 103711, 4, 3, 'Female', 'Central', 5, 0, 0, 0, 0),
-('04/16/2012 01:55:00', 101121, 2, 2, 'Unknown', 'West', 4, 0, 0, 0, 0),
-('04/16/2012 02:12:00', 100821, 1, 4, 'Unknown', 'West', 3, 1, 1, 153, 26),
-('04/16/2012 04:20:00', 102201, 3, 3, 'Female', 'East', 3, 0, 0, 0, 0),
-('04/16/2012 05:38:00', 101121, 2, 2, 'Unknown', 'West', 4, 1, 0, 0, 0),
-('04/16/2012 05:44:00', 102201, 3, 3, 'Female', 'East', 3, 1, 0, 0, 0),
-('04/16/2012 05:59:00', 102871, 3, 4, 'Female', 'Central', 5, 1, 0, 0, 0),
-('04/16/2012 09:35:00', 102871, 3, 4, 'Female', 'Central', 5, 1, 0, 0, 0),
-('04/16/2012 10:40:00', 101331, 2, 4, 'Female', 'East', 5, 0, 0, 0, 0),
-('04/16/2012 14:23:00', 102871, 3, 4, 'Female', 'Central', 5, 0, 0, 0, 0),
-('04/16/2012 20:46:00', 101121, 2, 2, 'Unknown', 'West', 4, 1, 1, 131, 28),
-('04/16/2012 21:11:00', 101331, 2, 4, 'Female', 'East', 5, 1, 1, 127, 27),
-('04/16/2012 22:35:00', 101121, 2, 2, 'Unknown', 'West', 4, 0, 0, 0, 0),
-('04/16/2012 23:51:00', 101881, 2, 4, 'Male', 'West', 5, 0, 0, 0, 0),
-('04/16/2012 23:55:00', 101331, 2, 4, 'Female', 'East', 5, 0, 0, 0, 0),
-('04/16/2012 23:56:00', 101331, 2, 4, 'Female', 'East', 5, 1, 0, 0, 0),
-('04/16/2012 23:57:00', 101331, 2, 4, 'Female', 'East', 5, 1, 1, 456, 77);
+(to_timestamp('04/14/2012 23:43:00', 'MM/DD/YYYY HH24:MI:SS'), 102201, 3, 3, 'Female', 'East', 3, 1, 1, 112, 36),
+(to_timestamp('04/14/2012 23:56:00', 'MM/DD/YYYY HH24:MI:SS'), 101881, 2, 4, 'Male', 'West', 5, 0, 0, 0, 0),
+(to_timestamp('04/15/2012 01:04:00', 'MM/DD/YYYY HH24:MI:SS'), 100821, 1, 4, 'Unknown', 'West', 3, 0, 0, 0, 0),
+(to_timestamp('04/15/2012 01:15:00', 'MM/DD/YYYY HH24:MI:SS'), 101121, 2, 2, 'Unknown', 'West', 4, 0, 0, 0, 0),
+(to_timestamp('04/15/2012 02:53:00', 'MM/DD/YYYY HH24:MI:SS'), 102201, 3, 3, 'Female', 'East', 3, 1, 1, 117, 28),
+(to_timestamp('04/15/2012 04:11:00', 'MM/DD/YYYY HH24:MI:SS'), 103711, 4, 3, 'Female', 'Central', 5, 0, 0, 0, 0),
+(to_timestamp('04/15/2012 04:25:00', 'MM/DD/YYYY HH24:MI:SS'), 100821, 1, 4, 'Unknown', 'West', 3, 1, 1, 91, 28),
+(to_timestamp('04/15/2012 06:26:00', 'MM/DD/YYYY HH24:MI:SS'), 102871, 3, 4, 'Female', 'Central', 5, 0, 0, 0, 0),
+(to_timestamp('04/15/2012 06:32:00', 'MM/DD/YYYY HH24:MI:SS'), 100821, 1, 4, 'Unknown', 'West', 3, 0, 0, 0, 0),
+(to_timestamp('04/15/2012 07:02:00', 'MM/DD/YYYY HH24:MI:SS'), 100821, 1, 4, 'Unknown', 'West', 3, 1, 1, 118, 39),
+(to_timestamp('04/15/2012 08:51:00', 'MM/DD/YYYY HH24:MI:SS'), 102201, 3, 3, 'Female', 'East', 3, 0, 0, 0, 0),
+(to_timestamp('04/15/2012 09:28:00', 'MM/DD/YYYY HH24:MI:SS'), 101121, 2, 2, 'Unknown', 'West', 4, 1, 1, 103, 32),
+(to_timestamp('04/15/2012 10:19:00', 'MM/DD/YYYY HH24:MI:SS'), 103711, 4, 3, 'Female', 'Central', 5, 0, 0, 0, 0),
+(to_timestamp('04/15/2012 11:40:00', 'MM/DD/YYYY HH24:MI:SS'), 100821, 1, 4, 'Unknown', 'West', 3, 0, 0, 0, 0),
+(to_timestamp('04/15/2012 12:58:00', 'MM/DD/YYYY HH24:MI:SS'), 101121, 2, 2, 'Unknown', 'West', 4, 1, 1, 148, 23),
+(to_timestamp('04/15/2012 14:18:00', 'MM/DD/YYYY HH24:MI:SS'), 101121, 2, 2, 'Unknown', 'West', 4, 1, 1, 113, 29),
+(to_timestamp('04/15/2012 22:20:00', 'MM/DD/YYYY HH24:MI:SS'), 101121, 2, 2, 'Unknown', 'West', 4, 1, 1, 108, 38),
+(to_timestamp('04/15/2012 23:13:00', 'MM/DD/YYYY HH24:MI:SS'), 102201, 3, 3, 'Female', 'East', 3, 0, 0, 0, 0),
+(to_timestamp('04/15/2012 23:14:00', 'MM/DD/YYYY HH24:MI:SS'), 103711, 4, 3, 'Female', 'Central', 5, 0, 0, 0, 0),
+(to_timestamp('04/16/2012 01:55:00', 'MM/DD/YYYY HH24:MI:SS'), 101121, 2, 2, 'Unknown', 'West', 4, 0, 0, 0, 0),
+(to_timestamp('04/16/2012 02:12:00', 'MM/DD/YYYY HH24:MI:SS'), 100821, 1, 4, 'Unknown', 'West', 3, 1, 1, 153, 26),
+(to_timestamp('04/16/2012 04:20:00', 'MM/DD/YYYY HH24:MI:SS'), 102201, 3, 3, 'Female', 'East', 3, 0, 0, 0, 0),
+(to_timestamp('04/16/2012 05:38:00', 'MM/DD/YYYY HH24:MI:SS'), 101121, 2, 2, 'Unknown', 'West', 4, 1, 0, 0, 0),
+(to_timestamp('04/16/2012 05:44:00', 'MM/DD/YYYY HH24:MI:SS'), 102201, 3, 3, 'Female', 'East', 3, 1, 0, 0, 0),
+(to_timestamp('04/16/2012 05:59:00', 'MM/DD/YYYY HH24:MI:SS'), 102871, 3, 4, 'Female', 'Central', 5, 1, 0, 0, 0),
+(to_timestamp('04/16/2012 09:35:00', 'MM/DD/YYYY HH24:MI:SS'), 102871, 3, 4, 'Female', 'Central', 5, 1, 0, 0, 0),
+(to_timestamp('04/16/2012 10:40:00', 'MM/DD/YYYY HH24:MI:SS'), 101331, 2, 4, 'Female', 'East', 5, 0, 0, 0, 0),
+(to_timestamp('04/16/2012 14:23:00', 'MM/DD/YYYY HH24:MI:SS'), 102871, 3, 4, 'Female', 'Central', 5, 0, 0, 0, 0),
+(to_timestamp('04/16/2012 20:46:00', 'MM/DD/YYYY HH24:MI:SS'), 101121, 2, 2, 'Unknown', 'West', 4, 1, 1, 131, 28),
+(to_timestamp('04/16/2012 21:11:00', 'MM/DD/YYYY HH24:MI:SS'), 101331, 2, 4, 'Female', 'East', 5, 1, 1, 127, 27),
+(to_timestamp('04/16/2012 22:35:00', 'MM/DD/YYYY HH24:MI:SS'), 101121, 2, 2, 'Unknown', 'West', 4, 0, 0, 0, 0),
+(to_timestamp('04/16/2012 23:51:00', 'MM/DD/YYYY HH24:MI:SS'), 101881, 2, 4, 'Male', 'West', 5, 0, 0, 0, 0),
+(to_timestamp('04/16/2012 23:55:00', 'MM/DD/YYYY HH24:MI:SS'), 101331, 2, 4, 'Female', 'East', 5, 0, 0, 0, 0),
+(to_timestamp('04/16/2012 23:56:00', 'MM/DD/YYYY HH24:MI:SS'), 101331, 2, 4, 'Female', 'East', 5, 1, 0, 0, 0),
+(to_timestamp('04/16/2012 23:57:00', 'MM/DD/YYYY HH24:MI:SS'), 101331, 2, 4, 'Female', 'East', 5, 1, 1, 456, 77);
 
 /*
 SELECT * FROM weblog ORDER BY event_timestamp ASC;
@@ -115,9 +115,9 @@ SELECT assert(count::integer=3::integer, 'wrong results in path')
 FROM "Path_output";
 
 INSERT INTO "Weblog" VALUES
-('04/15/2012 02:15:00', 101331, 2, 4, 'Female', 'East', 5, 0, 0, 0, 0),
-('04/15/2012 02:59:00', 101331, 2, 4, 'Female', 'East', 5, 1, 0, 0, 0),
-('04/15/2012 04:32:00', 101331, 2, 4, 'Female', 'East', 5, 1, 1, 112, 36);
+(to_timestamp('04/15/2012 02:15:00', 'MM/DD/YYYY HH24:MI:SS'), 101331, 2, 4, 'Female', 'East', 5, 0, 0, 0, 0),
+(to_timestamp('04/15/2012 02:59:00', 'MM/DD/YYYY HH24:MI:SS'), 101331, 2, 4, 'Female', 'East', 5, 1, 0, 0, 0),
+(to_timestamp('04/15/2012 04:32:00', 'MM/DD/YYYY HH24:MI:SS'), 101331, 2, 4, 'Female', 'East', 5, 1, 1, 112, 36);
 
 DROP TABLE "Path_output", "Path_output_tuples";
 SELECT path(

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/13203baa/src/ports/postgres/modules/utilities/test/sessionize.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/test/sessionize.sql_in b/src/ports/postgres/modules/utilities/test/sessionize.sql_in
index 8e58d1b..845e572 100644
--- a/src/ports/postgres/modules/utilities/test/sessionize.sql_in
+++ b/src/ports/postgres/modules/utilities/test/sessionize.sql_in
@@ -25,7 +25,8 @@
  */
 /* ----------------------------------------------------------------------- */
 
-CREATE TABLE eventlog_installchk (event_timestamp TIMESTAMP,
+CREATE TABLE eventlog_installchk (
+            event_timestamp TIMESTAMP,
             "user id" INT,
             original_session_id INT,
             page TEXT,
@@ -33,48 +34,48 @@ CREATE TABLE eventlog_installchk (event_timestamp TIMESTAMP,
             row INT,
             part_expr BOOLEAN);
 INSERT INTO eventlog_installchk VALUES
-('04/15/2015 01:03:0.5', 100821, 1, 'LANDING', 0, 1, 'f'),
-('04/15/2015 01:05:00', 100821, 1, 'WINE', 0, 1, 'f'),
-('04/15/2015 01:07:00', 100821, 1, 'CHECKOUT', 39, 1, 'f'),
-('04/15/2015 02:06:00', 100821, 2, 'WINE', 0, 1, 'f'),
-('04/15/2015 02:07:00', 100821, 2, 'WINE', 0, 1, 'f'),
-('04/15/2015 01:15:00', 101121, 1, 'LANDING', 0, 1, 'f'),
-('04/15/2015 01:16:00', 101121, 1, 'WINE', 0, 1, 'f'),
-('04/15/2015 01:18:00', 101121, 1, 'CHECKOUT', 15, 1, 'f'),
-('04/15/2015 01:19:00', 101121, 1, 'LANDING', 0, 1, 'f'),
-('04/15/2015 01:21:00', 101121, 1, 'HELP', 0, 1, 'f'),
-(NULL, 101121, NULL, 'LANDING', 0, 1, 'f'),
-(NULL, 101121, NULL, 'HELP', 0, 1, 'f'),
-('04/15/2015 01:24:00', 101121, 1, 'WINE', 0, 1, 'f'),
-('04/15/2015 01:26:00', 101121, 1, 'CHECKOUT', 23, 1, 'f'),
-('04/15/2015 02:21:00', 101121, 2, 'HELP', 0, 1, 'f'),
-('04/15/2015 02:24:00', 101121, 2, 'WINE', 0, 1, 'f'),
-('04/15/2015 02:26:00', 101121, 2, 'CHECKOUT', 23, 1, 't'),
-('04/15/2015 02:15:00', 101331, 1, 'LANDING', 0, 1, 't'),
-('04/15/2015 02:16:0.56', 101331, 1, 'WINE', 0, 1, 't'),
-('04/15/2015 02:18:00', 101331, 1, 'HELP', 0, 1, 't'),
-('04/15/2015 02:20:00', 101331, 1, 'WINE', 0, 1, 't'),
-('04/15/2015 02:21:00', 101331, 1, 'CHECKOUT', 16, 1, 't'),
-('04/15/2015 02:22:00', 101443, 1, 'BEER', 0, 1, 't'),
-('04/15/2015 02:27:00', 101443, 2, 'CHECKOUT', 12, 1, 't'),
-('04/15/2015 02:29:00', 101881, 1, 'LANDING', 0, 1, 't'),
-('04/15/2015 02:30:00', 101881, 1, 'BEER', 0, 1, 't'),
-('04/15/2015 01:05:00', 102201, 1, 'LANDING', 0, 1, 't'),
-('04/15/2015 01:06:00', 102201, 1, 'HELP', 0, 1, 't'),
-('04/15/2015 01:10:00', 102201, 2, 'LANDING', 0, 1, 't'),
-('04/15/2015 02:15:00', 102201, 3, 'WINE', 0, 1, 't'),
-('04/15/2015 02:16:00', 102201, 3, 'BEER', 0, 1, 't'),
-('04/15/2015 02:17:00', 102201, 3, 'WINE', 0, 1, 't'),
-('04/15/2015 02:18:00', 102871, 1, 'BEER', 0, 1, 't'),
-('04/15/2015 02:19:00', 102871, 1, 'WINE', 0, 1, 't'),
-('04/15/2015 02:22:00', 102871, 1, 'CHECKOUT', 21, 1, 't'),
-('04/15/2015 02:25:00', 102871, 1, 'LANDING', 0, 1, 't'),
-(NULL, 103711, NULL, 'BEER', 0, 1, 't'),
-(NULL, 103711, NULL, 'LANDING', 0, 1, 't'),
-(NULL, 103711, NULL, 'WINE', 0, 1, 't'),
-('04/15/2016 02:17:00', 103711, 1, 'BEER', 0, 1, 't'),
-('04/15/2016 02:21:00', 103711, 2, 'LANDING', 0, 1, 't'),
-('04/15/2016 02:31:0.05', 103711, 3, 'WINE', 0, 1, 't');
+(to_timestamp('04/15/2015 01:03:00', 'MM/DD/YYYY HH:MI:SS'), 100821, 1, 'LANDING', 0, 1, 'f'),
+(to_timestamp('04/15/2015 01:05:00', 'MM/DD/YYYY HH:MI:SS'), 100821, 1, 'WINE', 0, 1, 'f'),
+(to_timestamp('04/15/2015 01:07:00', 'MM/DD/YYYY HH:MI:SS'), 100821, 1, 'CHECKOUT', 39, 1, 'f'),
+(to_timestamp('04/15/2015 02:06:00', 'MM/DD/YYYY HH:MI:SS'), 100821, 2, 'WINE', 0, 1, 'f'),
+(to_timestamp('04/15/2015 02:07:00', 'MM/DD/YYYY HH:MI:SS'), 100821, 2, 'WINE', 0, 1, 'f'),
+(to_timestamp('04/15/2015 01:15:00', 'MM/DD/YYYY HH:MI:SS'), 101121, 1, 'LANDING', 0, 1, 'f'),
+(to_timestamp('04/15/2015 01:16:00', 'MM/DD/YYYY HH:MI:SS'), 101121, 1, 'WINE', 0, 1, 'f'),
+(to_timestamp('04/15/2015 01:18:00', 'MM/DD/YYYY HH:MI:SS'), 101121, 1, 'CHECKOUT', 15, 1, 'f'),
+(to_timestamp('04/15/2015 01:19:00', 'MM/DD/YYYY HH:MI:SS'), 101121, 1, 'LANDING', 0, 1, 'f'),
+(to_timestamp('04/15/2015 01:21:00', 'MM/DD/YYYY HH:MI:SS'), 101121, 1, 'HELP', 0, 1, 'f'),
+(to_timestamp(NULL, 'MM/DD/YYYY HH:MI:SS'), 101121, NULL, 'LANDING', 0, 1, 'f'),
+(to_timestamp(NULL, 'MM/DD/YYYY HH:MI:SS'), 101121, NULL, 'HELP', 0, 1, 'f'),
+(to_timestamp('04/15/2015 01:24:00', 'MM/DD/YYYY HH:MI:SS'), 101121, 1, 'WINE', 0, 1, 'f'),
+(to_timestamp('04/15/2015 01:26:00', 'MM/DD/YYYY HH:MI:SS'), 101121, 1, 'CHECKOUT', 23, 1, 'f'),
+(to_timestamp('04/15/2015 02:21:00', 'MM/DD/YYYY HH:MI:SS'), 101121, 2, 'HELP', 0, 1, 'f'),
+(to_timestamp('04/15/2015 02:24:00', 'MM/DD/YYYY HH:MI:SS'), 101121, 2, 'WINE', 0, 1, 'f'),
+(to_timestamp('04/15/2015 02:26:00', 'MM/DD/YYYY HH:MI:SS'), 101121, 2, 'CHECKOUT', 23, 1, 't'),
+(to_timestamp('04/15/2015 02:15:00', 'MM/DD/YYYY HH:MI:SS'), 101331, 1, 'LANDING', 0, 1, 't'),
+(to_timestamp('04/15/2015 02:16:06', 'MM/DD/YYYY HH:MI:SS'), 101331, 1, 'WINE', 0, 1, 't'),
+(to_timestamp('04/15/2015 02:18:00', 'MM/DD/YYYY HH:MI:SS'), 101331, 1, 'HELP', 0, 1, 't'),
+(to_timestamp('04/15/2015 02:20:00', 'MM/DD/YYYY HH:MI:SS'), 101331, 1, 'WINE', 0, 1, 't'),
+(to_timestamp('04/15/2015 02:21:00', 'MM/DD/YYYY HH:MI:SS'), 101331, 1, 'CHECKOUT', 16, 1, 't'),
+(to_timestamp('04/15/2015 02:22:00', 'MM/DD/YYYY HH:MI:SS'), 101443, 1, 'BEER', 0, 1, 't'),
+(to_timestamp('04/15/2015 02:27:00', 'MM/DD/YYYY HH:MI:SS'), 101443, 2, 'CHECKOUT', 12, 1, 't'),
+(to_timestamp('04/15/2015 02:29:00', 'MM/DD/YYYY HH:MI:SS'), 101881, 1, 'LANDING', 0, 1, 't'),
+(to_timestamp('04/15/2015 02:30:00', 'MM/DD/YYYY HH:MI:SS'), 101881, 1, 'BEER', 0, 1, 't'),
+(to_timestamp('04/15/2015 01:05:00', 'MM/DD/YYYY HH:MI:SS'), 102201, 1, 'LANDING', 0, 1, 't'),
+(to_timestamp('04/15/2015 01:06:00', 'MM/DD/YYYY HH:MI:SS'), 102201, 1, 'HELP', 0, 1, 't'),
+(to_timestamp('04/15/2015 01:10:00', 'MM/DD/YYYY HH:MI:SS'), 102201, 2, 'LANDING', 0, 1, 't'),
+(to_timestamp('04/15/2015 02:15:00', 'MM/DD/YYYY HH:MI:SS'), 102201, 3, 'WINE', 0, 1, 't'),
+(to_timestamp('04/15/2015 02:16:00', 'MM/DD/YYYY HH:MI:SS'), 102201, 3, 'BEER', 0, 1, 't'),
+(to_timestamp('04/15/2015 02:17:00', 'MM/DD/YYYY HH:MI:SS'), 102201, 3, 'WINE', 0, 1, 't'),
+(to_timestamp('04/15/2015 02:18:00', 'MM/DD/YYYY HH:MI:SS'), 102871, 1, 'BEER', 0, 1, 't'),
+(to_timestamp('04/15/2015 02:19:00', 'MM/DD/YYYY HH:MI:SS'), 102871, 1, 'WINE', 0, 1, 't'),
+(to_timestamp('04/15/2015 02:22:00', 'MM/DD/YYYY HH:MI:SS'), 102871, 1, 'CHECKOUT', 21, 1, 't'),
+(to_timestamp('04/15/2015 02:25:00', 'MM/DD/YYYY HH:MI:SS'), 102871, 1, 'LANDING', 0, 1, 't'),
+(to_timestamp(NULL, 'MM/DD/YYYY HH:MI:SS'), 103711, NULL, 'BEER', 0, 1, 't'),
+(to_timestamp(NULL, 'MM/DD/YYYY HH:MI:SS'), 103711, NULL, 'LANDING', 0, 1, 't'),
+(to_timestamp(NULL, 'MM/DD/YYYY HH:MI:SS'), 103711, NULL, 'WINE', 0, 1, 't'),
+(to_timestamp('04/15/2016 02:17:00', 'MM/DD/YYYY HH:MI:SS'), 103711, 1, 'BEER', 0, 1, 't'),
+(to_timestamp('04/15/2016 02:21:00', 'MM/DD/YYYY HH:MI:SS'), 103711, 2, 'LANDING', 0, 1, 't'),
+(to_timestamp('04/15/2016 02:31:00', 'MM/DD/YYYY HH:MI:SS'), 103711, 3, 'WINE', 0, 1, 't');
 
 
 SELECT sessionize(

[38/50] [abbrv] incubator-madlib git commit: Release v1.10:

Posted by ri...@apache.org.

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/90f4dc15/src/madpack/changelist_1.6.0S_1.9.1.yaml
----------------------------------------------------------------------
diff --git a/src/madpack/changelist_1.6.0S_1.9.1.yaml b/src/madpack/changelist_1.6.0S_1.9.1.yaml
deleted file mode 100644
index 6330591..0000000
--- a/src/madpack/changelist_1.6.0S_1.9.1.yaml
+++ /dev/null
@@ -1,945 +0,0 @@
-# Changelist for MADlib version 1.6.0S to 1.7
-
-# This file contains all changes that were introduced in a new version of
-# MADlib. This changelist is used by the upgrade script to detect what objects
-# should be upgraded (while retaining all other objects from the previous version)
-
-# New modules (actually .sql_in files) added in upgrade version
-# For these files the sql_in code is retained as is with the functions in the
-# file installed on the upgrade version. All other files (that don't have
-# updates), are cleaned up to remove object replacements
-new module:
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    glm:
-    multiresponseglm:
-    ordinal:
-    decision_tree:
-    random_forest:
-    distribution:
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    text_utilities:
-
-# Changes in the types (UDT) including removal and modification
-udt:
-
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    bytea8:
-    # ----------------- Changes from 1.8 to 1.9 ----------
-    __enc_tbl_result:
-    __gen_acc_time:
-    __rep_type:
-    __train_result:
-    c45_classify_result:
-    c45_train_result:
-    correlation_result:
-    lsvm_sgd_model_rec:
-    lsvm_sgd_result:
-    rf_classify_result:
-    rf_train_result:
-    svm_cls_result:
-    svm_model_pr:
-    svm_model_rec:
-    svm_nd_result:
-    svm_reg_result:
-    svm_support_vector:
-    _prune_result_type:
-    _tree_result_type:
-    linear_svm_result:
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    profile_result:
-
-# List of the UDF changes that affect the user externally. This includes change
-# in function name, return type, argument order or types, or removal of
-# the function. In each case, the original function is as good as removed and a
-# new function is created. In such cases, we should abort the upgrade if there
-# are user views dependent on this function, since the original function will
-# not be present in the upgraded version.
-udf:
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    - cross_validation_general:   # change in name of argument "fold_num"
-        rettype: void
-        argument: character varying, character varying[], character varying[], character varying, character varying[], character varying, character varying[], character varying[], character varying, character varying[], character varying[], character varying, character varying, boolean, character varying, character varying[], integer
-    - lmf_igd_transition:
-        rettype: double precision[]
-        argument: double precision[], smallint, smallint, double precision, double precision[], smallint, smallint, smallint, double precision, double precision
-    # depending on bytea8
-    # return type is bytea8
-    - __clustered_err_lin_merge:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - __clustered_err_lin_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[], double precision[]
-    - __clustered_err_log_merge:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - __clustered_err_log_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, boolean, double precision[], double precision[]
-    - __clustered_err_mlog_merge:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - __clustered_err_mlog_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, integer, double precision[], double precision[], integer, integer
-    - dense_residual_norm_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - dense_residual_norm_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision[], double precision, double precision[]
-    - hetero_linregr_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - hetero_linregr_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[], double precision[]
-    - linregr_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - linregr_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[]
-    - robust_linregr_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - robust_linregr_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[], double precision[]
-    - weighted_sample_merge_int64:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - weighted_sample_merge_vector:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - weighted_sample_transition_int64:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, bigint, double precision
-    - weighted_sample_transition_vector:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision[], double precision
-    # argument type bytea8
-    - __clustered_err_lin_final:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: schema_madlib.bytea8
-    - __clustered_err_log_final:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: schema_madlib.bytea8
-    - __clustered_err_mlog_final:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: schema_madlib.bytea8
-    - dense_residual_norm_final:
-        rettype: schema_madlib.residual_norm_result
-        argument: schema_madlib.bytea8
-    - hetero_linregr_final:
-        rettype: schema_madlib.heteroskedasticity_test_result
-        argument: schema_madlib.bytea8
-    - linregr_final:
-        rettype: schema_madlib.linregr_result
-        argument: schema_madlib.bytea8
-    - robust_linregr_final:
-        rettype: schema_madlib.robust_linregr_result
-        argument: schema_madlib.bytea8
-    - weighted_sample_final_int64:
-        rettype: bigint
-        argument: schema_madlib.bytea8
-    - weighted_sample_final_vector:
-        rettype: double precision[]
-        argument: schema_madlib.bytea8
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    - __filter_input_relation:
-        rettype: character varying
-        argument: character varying, character varying
-    - __lda_util_unnest:
-        rettype: SETOF bigint[]
-        argument: bigint[]
-    - matrix_block_mult:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text
-    - matrix_block_square:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - matrix_block_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - matrix_block_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, boolean
-    - matrix_blockize:
-        rettype: schema_madlib.matrix_result
-        argument: text, integer, integer, text
-    - matrix_densify:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text, text, text, boolean
-    - matrix_mult:
-        rettype: schema_madlib.matrix_result
-        argument: text, boolean, text, boolean, text
-    - matrix_mult:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text, text, boolean, text, text, text, text, boolean, text
-    - matrix_norm:
-        rettype: double precision
-        argument: text
-    - matrix_scale_and_add:
-        rettype: void
-        argument: text, text, double precision, text
-    - matrix_sparsify:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, boolean
-    - matrix_square:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - matrix_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, boolean
-    - matrix_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text, text, text, boolean
-    - matrix_unblockize:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    # ----------------- Changes from 1.8 to 1.9 ----------
-
-    - _dt_apply:
-       rettype: schema_madlib._tree_result_type
-       argument: schema_madlib.bytea8,schema_madlib.bytea8,schema_madlib.bytea8,smallint,smallint,smallint,boolean,integer
-
-    - internal_linear_svm_igd_result:
-       rettype: schema_madlib.linear_svm_result
-       argument: double precision[]
-
-    - _prune_and_cplist:
-       rettype: schema_madlib._prune_result_type
-       argument: schema_madlib.bytea8,double precision,boolean
-
-    - __array_elem_in:
-       rettype: boolean[]
-       argument: anyarray, anyarray
-
-    - __array_indexed_agg_ffunc:
-       rettype: double precision[]
-       argument: double precision[]
-
-    - __array_indexed_agg_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __array_indexed_agg_sfunc:
-       rettype: double precision[]
-       argument: double precision[], double precision, bigint, bigint
-
-    - __array_search:
-       rettype: boolean
-       argument: anyelement, anyarray
-
-    - __array_sort:
-       rettype: anyarray
-       argument: anyarray
-
-    - __assert:
-       rettype: void
-       argument: boolean, text
-
-    - __assert_table:
-       rettype: void
-       argument: text, boolean
-
-    - __best_scv_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __best_scv_sfunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[], integer, double precision
-
-    - __bigint_array_add:
-       rettype: bigint[]
-       argument: bigint[], bigint[]
-
-    - __breakup_table:
-       rettype: void
-       argument: text, text, text, text, text, text[], boolean[], integer, integer
-
-    - __check_dt_common_params:
-       rettype: void
-       argument: text, text, text, text, text, text, text, text, integer, double precision, double precision, integer, text
-
-    - __check_training_table:
-       rettype: void
-       argument: text, text[], text[], text, text, integer
-
-    - __column_exists:
-       rettype: boolean
-       argument: text, text
-
-    - __columns_in_table:
-       rettype: boolean
-       argument: text[], text
-
-    - __create_metatable:
-       rettype: void
-       argument: text
-
-    - __create_tree_tables:
-       rettype: void
-       argument: text
-
-    - __csvstr_to_array:
-       rettype: text[]
-       argument: text
-
-    - __display_node_sfunc:
-       rettype: text
-       argument: text, integer, boolean, text, text, double precision, double precision, text, integer
-
-    - __display_tree_no_ordered_aggr:
-       rettype: text
-       argument: text, integer, integer, integer, boolean, double precision, text, integer, integer
-
-    - __distinct_feature_value:
-       rettype: integer
-       argument: text, integer
-
-    - __drop_metatable:
-       rettype: void
-       argument: text
-
-    - __dt_acc_count_sfunc:
-       rettype: bigint[]
-       argument: bigint[], integer, bigint, integer
-
-    - __dt_get_node_split_fids:
-       rettype: integer[]
-       argument: integer, integer, integer, integer[]
-
-    - __ebp_calc_errors:
-       rettype: double precision
-       argument: double precision, double precision, double precision
-
-    - __ebp_prune_tree:
-       rettype: void
-       argument: text
-
-    - __encode_and_train:
-       rettype: record
-       argument: text, text, integer, integer, text, text, text, text, text, text, text, double precision, text, integer, double precision, boolean, double precision, double precision, text, integer
-
-    - __encode_columns:
-       rettype: void
-       argument: text, text, integer, integer
-
-    - __encode_table:
-       rettype: void
-       argument: text, text, text, integer, integer
-
-    - __encode_table:
-       rettype: void
-       argument: text, text, text[], text, text[], text, text, integer, integer
-
-    - __find_best_split:
-       rettype: void
-       argument: text, double precision, text, integer, integer, text, integer, integer
-
-    - __format:
-       rettype: text
-       argument: text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text[]
-
-    - __gen_acc:
-       rettype: __gen_acc_time
-       argument: text, text, text, text, text, integer, integer, boolean, integer
-
-    - __gen_enc_meta_names:
-       rettype: text[]
-       argument: text, text
-
-    - __gen_horizontal_encoded_table:
-       rettype: void
-       argument: text, text, integer, integer
-
-    - __gen_vertical_encoded_table:
-       rettype: void
-       argument: text, text, text, boolean, integer
-
-    - __generate_final_tree:
-       rettype: void
-       argument: text
-
-    - __get_class_column_name:
-       rettype: text
-       argument: text
-
-    - __get_class_value:
-       rettype: text
-       argument: integer, text
-
-    - __get_classtable_name:
-       rettype: text
-       argument: text
-
-    - __get_column_value:
-       rettype: text
-       argument: integer, integer, character, text
-
-    - __get_feature_name:
-       rettype: text
-       argument: integer, text
-
-    - __get_feature_value:
-       rettype: text
-       argument: integer, integer, text
-
-    - __get_features_of_nodes:
-       rettype: text
-       argument: text, text, integer, integer, integer
-
-    - __get_id_column_name:
-       rettype: text
-       argument: text
-
-    - __get_schema_name:
-       rettype: text
-       argument: text
-
-    - __get_table_name:
-       rettype: text
-       argument: text
-
-    - __insert_into_metatable:
-       rettype: void
-       argument: text, integer, text, character, boolean, text, integer
-
-    - __is_valid_enc_table:
-       rettype: boolean
-       argument: text
-
-    - __num_of_class:
-       rettype: integer
-       argument: text
-
-    - __num_of_columns:
-       rettype: integer
-       argument: text
-
-    - __num_of_feature:
-       rettype: integer
-       argument: text
-
-    - __regclass_to_text:
-       rettype: text
-       argument: regclass
-
-    - __rename_table:
-       rettype: void
-       argument: text, text
-
-    - __rep_aggr_class_count_ffunc:
-       rettype: bigint[]
-       argument: bigint[]
-
-    - __rep_aggr_class_count_sfunc:
-       rettype: bigint[]
-       argument: bigint[], integer, integer, integer
-
-    - __rep_prune_tree:
-       rettype: void
-       argument: text, text, integer
-
-    - __sample_with_replacement:
-       rettype: void
-       argument: integer, bigint, text, text
-
-    - __sample_within_range:
-       rettype: SETOF bigint
-       argument: bigint, bigint, bigint
-
-    - __scv_aggr_ffunc:
-       rettype: double precision[]
-       argument: double precision[]
-
-    - __scv_aggr_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __scv_aggr_sfunc:
-       rettype: double precision[]
-       argument: double precision[], integer, boolean, integer, double precision[], double precision[], bigint
-
-    - __strip_schema_name:
-       rettype: text
-       argument: text
-
-    - __svm_random_ind2:
-       rettype: double precision[]
-       argument: integer
-
-    - __svm_random_ind:
-       rettype: double precision[]
-       argument: integer
-
-    - __svm_target_cl_func:
-       rettype: double precision
-       argument: double precision[]
-
-    - __svm_target_reg_func:
-       rettype: double precision
-       argument: double precision[]
-
-    - __table_exists:
-       rettype: boolean
-       argument: text
-
-    - __train_tree:
-       rettype: __train_result
-       argument: text, integer, integer, text, text, text, text, text, text, double precision, integer, double precision, double precision, double precision, boolean, integer, integer
-
-    - __treemodel_classify_internal:
-       rettype: text[]
-       argument: text, text, integer
-
-    - __treemodel_classify_internal_serial:
-       rettype: text[]
-       argument: text, text, integer
-
-    - __treemodel_display_no_ordered_aggr:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - __treemodel_display_with_ordered_aggr:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - __treemodel_get_vote_result:
-       rettype: void
-       argument: text, text
-
-    - __treemodel_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - __validate_input_table:
-       rettype: void
-       argument: text, text[], text, text
-
-    - __validate_metatable:
-       rettype: void
-       argument: text
-
-    - c45_classify:
-       rettype: c45_classify_result
-       argument: text, text, text
-
-    - c45_classify:
-       rettype: c45_classify_result
-       argument: text, text, text, integer
-
-    - c45_clean:
-       rettype: boolean
-       argument: text
-
-    - c45_display:
-       rettype: SETOF text
-       argument: text
-
-    - c45_display:
-       rettype: SETOF text
-       argument: text, integer
-
-    - c45_genrule:
-       rettype: SETOF text
-       argument: text
-
-    - c45_genrule:
-       rettype: SETOF text
-       argument: text, integer
-
-    - c45_score:
-       rettype: double precision
-       argument: text, text
-
-    - c45_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text, text, text, text, text, text, double precision, text
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text, text, text, text, text, text, double precision, text, integer, double precision, double precision, integer
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying, character varying
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying, character varying, boolean
-
-    - linear_svm_igd_transition:
-       rettype: double precision[]
-       argument: double precision[], double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean, boolean, double precision, double precision
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean, boolean, double precision, double precision, integer
-
-    - lsvm_predict:
-       rettype: double precision
-       argument: double precision[], double precision[]
-
-    - lsvm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text
-
-    - lsvm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text, boolean
-
-    - matrix_block_trans:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_densify:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_sparsify:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_trans:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text, boolean, integer
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text, integer
-
-    - rf_clean:
-       rettype: boolean
-       argument: text
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text, integer[]
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - rf_score:
-       rettype: double precision
-       argument: text, text
-
-    - rf_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - rf_train:
-       rettype: rf_train_result
-       argument: text, text, text
-
-    - rf_train:
-       rettype: rf_train_result
-       argument: text, text, text, integer, integer, double precision, text, text, text, text, text, integer, double precision, double precision, integer
-
-    - svdmf_run:
-       rettype: text
-       argument: text, text, text, text, integer
-
-    - svdmf_run:
-       rettype: text
-       argument: text, text, text, text, integer, integer, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, boolean, double precision, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, double precision
-
-    - svm_cls_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision, double precision
-
-    - svm_data_normalization:
-       rettype: void
-       argument: text
-
-    - svm_dot:
-       rettype: double precision
-       argument: double precision[], double precision[]
-
-    - svm_dot:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_drop_model:
-       rettype: void
-       argument: text
-
-    - svm_gaussian:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_generate_cls_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_generate_nd_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_generate_reg_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_nd_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], text, double precision, double precision, double precision
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text, boolean, double precision, double precision
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_polynomial:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_predict:
-       rettype: double precision
-       argument: schema_madlib.svm_model_rec, double precision[], text, double precision
-
-    - svm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text, boolean
-
-    - svm_predict_sub:
-       rettype: double precision
-       argument: integer, integer, double precision[], double precision[], double precision[], text, double precision
-
-    - svm_reg_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision, double precision, double precision
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision, double precision
-
-    - svm_store_model:
-       rettype: void
-       argument: text, text, text
-
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    - array_collapse:
-        rettype: anyarray
-        argument: anyarray
-    - linear_svm_igd_transition:
-        rettype: double precision[]
-        argument: double precision[], double precision[], double precision, double precision[], integer, double precision, double precision, boolean, integer, double precision, boolean
-    - profile:
-        rettype: SETOF schema_madlib.profile_result
-        argument: text
-    - profile_full:
-        rettype: SETOF schema_madlib.profile_result
-        argument: text, integer
-    - profile:
-        rettype: schema_madlib.profile_result
-        argument: text
-    - profile_full:
-        rettype: schema_madlib.profile_result
-        argument: text, integer
-    - quantile:
-        rettype: double precision
-        argument: text, text, double precision
-    - quantile_big:
-        rettype: double precision
-        argument: text, text, double precision
-
-# Changes to aggregates (UDA) including removal and modification
-# Overloaded functions should be mentioned separately
-uda:
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    - lmf_igd_step:
-        rettype: double precision[]
-        argument: smallint, smallint, double precision, double precision[], smallint, smallint, smallint, double precision, double precision
-    - __clustered_err_lin_step:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: double precision, double precision[], double precision[]
-    - __clustered_err_log_step:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: boolean, double precision[], double precision[]
-    - __clustered_err_mlog_step:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: integer, double precision[], double precision[], integer, integer
-    - dense_residual_norm:
-        rettype: schema_madlib.residual_norm_result
-        argument: double precision[], double precision, double precision[]
-    - heteroskedasticity_test_linregr:
-        rettype: schema_madlib.heteroskedasticity_test_result
-        argument: double precision, double precision[], double precision[]
-    - linregr:
-        rettype: schema_madlib.linregr_result
-        argument: double precision, double precision[]
-    - robust_linregr:
-        rettype: schema_madlib.robust_linregr_result
-        argument: double precision, double precision[], double precision[]
-    - weighted_sample:
-        rettype: double precision[]
-        argument: double precision[], double precision
-    - weighted_sample:
-        rettype: bigint
-        argument: bigint, double precision
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    - discrete_distribution_agg:
-        rettype: double precision[]
-        argument: integer, double precision, integer
-    - vectorized_distribution_agg:
-        rettype: double precision[]
-        argument: integer[], integer[]
-    # ----------------- Changes from 1.8 to 1.9 ----------
-
-    - __array_indexed_agg:
-        rettype: double precision[]
-        argument: double precision, bigint, bigint
-
-    - __best_scv_aggr:
-        rettype: double precision[]
-        argument: double precision[], integer, double precision
-
-    - __bigint_array_sum:
-        rettype: bigint[]
-        argument: bigint[]
-
-    - __display_tree_aggr:
-        rettype: text
-        argument: integer, boolean, text, text, double precision, double precision, text, integer
-
-    - __dt_acc_count_aggr:
-        rettype: bigint[]
-        argument: integer, bigint, integer
-
-    - __rep_aggr_class_count:
-        rettype: bigint[]
-        argument: integer, integer, integer
-
-    - __scv_aggr:
-        rettype: double precision[]
-        argument: integer, boolean, integer, double precision[], double precision[], bigint
-
-    - linear_svm_igd_step:
-        rettype: double precision[]
-        argument: double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - linear_svm_igd_step_serial:
-        rettype: double precision[]
-        argument: double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - svm_cls_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision, double precision
-
-    - svm_nd_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], text, double precision, double precision, double precision
-
-    - svm_reg_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision, double precision, double precision
-
-    - __svm_random_ind2:
-        rettype: double precision[]
-        argument: integer
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    - array_agg:
-        rettype: anyarray
-        argument: anyelement
-    - linear_svm_igd_step:
-       rettype: double precision[]
-       argument: double precision[], double precision, double precision[], integer, double precision, double precision, boolean, integer, double precision, boolean
-
-# Casts (UDC) updated/removed
-udc:
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-
-# Operators (UDO) removed/updated
-udo:
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-
-# Operator Classes (UDOC) removed/updated
-udoc:
-    # ----------------- Changes from 1.6.0S to 1.7 ----------

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/90f4dc15/src/madpack/changelist_1.6_1.9.1.yaml
----------------------------------------------------------------------
diff --git a/src/madpack/changelist_1.6_1.9.1.yaml b/src/madpack/changelist_1.6_1.9.1.yaml
deleted file mode 100644
index dccdf79..0000000
--- a/src/madpack/changelist_1.6_1.9.1.yaml
+++ /dev/null
@@ -1,951 +0,0 @@
-# Changelist for MADlib version 1.6 to 1.7
-
-# This file contains all changes that were introduced in a new version of
-# MADlib. This changelist is used by the upgrade script to detect what objects
-# should be upgraded (while retaining all other objects from the previous version)
-
-# New modules (actually .sql_in files) added in upgrade version
-# For these files the sql_in code is retained as is with the functions in the
-# file installed on the upgrade version. All other files (that don't have
-# updates), are cleaned up to remove object replacements
-new module:
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    glm:
-    multiresponseglm:
-    ordinal:
-    decision_tree:
-    random_forest:
-    distribution:
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    text_utilities:
-
-# Changes in the types (UDT) including removal and modification
-udt:
-
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    bytea8:
-    # ----------------- Changes from 1.8 to 1.9 ----------
-    __enc_tbl_result:
-    __gen_acc_time:
-    __rep_type:
-    __train_result:
-    c45_classify_result:
-    c45_train_result:
-    correlation_result:
-    lsvm_sgd_model_rec:
-    lsvm_sgd_result:
-    rf_classify_result:
-    rf_train_result:
-    svm_cls_result:
-    svm_model_pr:
-    svm_model_rec:
-    svm_nd_result:
-    svm_reg_result:
-    svm_support_vector:
-    _prune_result_type:
-    _tree_result_type:
-    linear_svm_result:
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    profile_result:
-
-# List of the UDF changes that affect the user externally. This includes change
-# in function name, return type, argument order or types, or removal of
-# the function. In each case, the original function is as good as removed and a
-# new function is created. In such cases, we should abort the upgrade if there
-# are user views dependent on this function, since the original function will
-# not be present in the upgraded version.
-udf:
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    - cross_validation_general:   # change in name of argument "fold_num"
-        rettype: void
-        argument: character varying, character varying[], character varying[], character varying, character varying[], character varying, character varying[], character varying[], character varying, character varying[], character varying[], character varying, character varying, boolean, character varying, character varying[], integer
-    - lmf_igd_transition:
-        rettype: double precision[]
-        argument: double precision[], smallint, smallint, double precision, double precision[], smallint, smallint, smallint, double precision, double precision
-    # depending on bytea8
-    # return type is bytea8
-    - __clustered_err_lin_merge:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - __clustered_err_lin_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[], double precision[]
-    - __clustered_err_log_merge:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - __clustered_err_log_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, boolean, double precision[], double precision[]
-    - __clustered_err_mlog_merge:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - __clustered_err_mlog_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, integer, double precision[], double precision[], integer, integer
-    - dense_residual_norm_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - dense_residual_norm_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision[], double precision, double precision[]
-    - hetero_linregr_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - hetero_linregr_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[], double precision[]
-    - linregr_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - linregr_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[]
-    - robust_linregr_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - robust_linregr_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[], double precision[]
-    - weighted_sample_merge_int64:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - weighted_sample_merge_vector:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - weighted_sample_transition_int64:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, bigint, double precision
-    - weighted_sample_transition_vector:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision[], double precision
-    # argument type bytea8
-    - __clustered_err_lin_final:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: schema_madlib.bytea8
-    - __clustered_err_log_final:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: schema_madlib.bytea8
-    - __clustered_err_mlog_final:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: schema_madlib.bytea8
-    - dense_residual_norm_final:
-        rettype: schema_madlib.residual_norm_result
-        argument: schema_madlib.bytea8
-    - hetero_linregr_final:
-        rettype: schema_madlib.heteroskedasticity_test_result
-        argument: schema_madlib.bytea8
-    - linregr_final:
-        rettype: schema_madlib.linregr_result
-        argument: schema_madlib.bytea8
-    - robust_linregr_final:
-        rettype: schema_madlib.robust_linregr_result
-        argument: schema_madlib.bytea8
-    - weighted_sample_final_int64:
-        rettype: bigint
-        argument: schema_madlib.bytea8
-    - weighted_sample_final_vector:
-        rettype: double precision[]
-        argument: schema_madlib.bytea8
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    - __filter_input_relation:
-        rettype: character varying
-        argument: character varying, character varying
-    - __lda_util_unnest:
-        rettype: SETOF bigint[]
-        argument: bigint[]
-    - matrix_block_mult:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text
-    - matrix_block_square:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - matrix_block_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - matrix_block_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, boolean
-    - matrix_blockize:
-        rettype: schema_madlib.matrix_result
-        argument: text, integer, integer, text
-    - matrix_densify:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text, text, text, boolean
-    - matrix_mult:
-        rettype: schema_madlib.matrix_result
-        argument: text, boolean, text, boolean, text
-    - matrix_mult:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text, text, boolean, text, text, text, text, boolean, text
-    - matrix_norm:
-        rettype: double precision
-        argument: text
-    - matrix_scale_and_add:
-        rettype: void
-        argument: text, text, double precision, text
-    - matrix_sparsify:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, boolean
-    - matrix_square:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - matrix_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, boolean
-    - matrix_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text, text, text, boolean
-    - matrix_unblockize:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    # ----------------- Changes from 1.8 to 1.9 ----------
-
-    - _dt_apply:
-       rettype: schema_madlib._tree_result_type
-       argument: schema_madlib.bytea8,schema_madlib.bytea8,schema_madlib.bytea8,smallint,smallint,smallint,boolean,integer
-
-    - internal_linear_svm_igd_result:
-       rettype: schema_madlib.linear_svm_result
-       argument: double precision[]
-
-    - _prune_and_cplist:
-       rettype: schema_madlib._prune_result_type
-       argument: schema_madlib.bytea8,double precision,boolean
-
-    - __array_elem_in:
-       rettype: boolean[]
-       argument: anyarray, anyarray
-
-    - __array_indexed_agg_ffunc:
-       rettype: double precision[]
-       argument: double precision[]
-
-    - __array_indexed_agg_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __array_indexed_agg_sfunc:
-       rettype: double precision[]
-       argument: double precision[], double precision, bigint, bigint
-
-    - __array_search:
-       rettype: boolean
-       argument: anyelement, anyarray
-
-    - __array_sort:
-       rettype: anyarray
-       argument: anyarray
-
-    - __assert:
-       rettype: void
-       argument: boolean, text
-
-    - __assert_table:
-       rettype: void
-       argument: text, boolean
-
-    - __best_scv_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __best_scv_sfunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[], integer, double precision
-
-    - __bigint_array_add:
-       rettype: bigint[]
-       argument: bigint[], bigint[]
-
-    - __breakup_table:
-       rettype: void
-       argument: text, text, text, text, text, text[], boolean[], integer, integer
-
-    - __check_dt_common_params:
-       rettype: void
-       argument: text, text, text, text, text, text, text, text, integer, double precision, double precision, integer, text
-
-    - __check_training_table:
-       rettype: void
-       argument: text, text[], text[], text, text, integer
-
-    - __column_exists:
-       rettype: boolean
-       argument: text, text
-
-    - __columns_in_table:
-       rettype: boolean
-       argument: text[], text
-
-    - __create_metatable:
-       rettype: void
-       argument: text
-
-    - __create_tree_tables:
-       rettype: void
-       argument: text
-
-    - __csvstr_to_array:
-       rettype: text[]
-       argument: text
-
-    - __display_node_sfunc:
-       rettype: text
-       argument: text, integer, boolean, text, text, double precision, double precision, text, integer
-
-    - __display_tree_no_ordered_aggr:
-       rettype: text
-       argument: text, integer, integer, integer, boolean, double precision, text, integer, integer
-
-    - __distinct_feature_value:
-       rettype: integer
-       argument: text, integer
-
-    - __drop_metatable:
-       rettype: void
-       argument: text
-
-    - __dt_acc_count_sfunc:
-       rettype: bigint[]
-       argument: bigint[], integer, bigint, integer
-
-    - __dt_get_node_split_fids:
-       rettype: integer[]
-       argument: integer, integer, integer, integer[]
-
-    - __ebp_calc_errors:
-       rettype: double precision
-       argument: double precision, double precision, double precision
-
-    - __ebp_prune_tree:
-       rettype: void
-       argument: text
-
-    - __encode_and_train:
-       rettype: record
-       argument: text, text, integer, integer, text, text, text, text, text, text, text, double precision, text, integer, double precision, boolean, double precision, double precision, text, integer
-
-    - __encode_columns:
-       rettype: void
-       argument: text, text, integer, integer
-
-    - __encode_table:
-       rettype: void
-       argument: text, text, text, integer, integer
-
-    - __encode_table:
-       rettype: void
-       argument: text, text, text[], text, text[], text, text, integer, integer
-
-    - __find_best_split:
-       rettype: void
-       argument: text, double precision, text, integer, integer, text, integer, integer
-
-    - __format:
-       rettype: text
-       argument: text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text[]
-
-    - __gen_acc:
-       rettype: __gen_acc_time
-       argument: text, text, text, text, text, integer, integer, boolean, integer
-
-    - __gen_enc_meta_names:
-       rettype: text[]
-       argument: text, text
-
-    - __gen_horizontal_encoded_table:
-       rettype: void
-       argument: text, text, integer, integer
-
-    - __gen_vertical_encoded_table:
-       rettype: void
-       argument: text, text, text, boolean, integer
-
-    - __generate_final_tree:
-       rettype: void
-       argument: text
-
-    - __get_class_column_name:
-       rettype: text
-       argument: text
-
-    - __get_class_value:
-       rettype: text
-       argument: integer, text
-
-    - __get_classtable_name:
-       rettype: text
-       argument: text
-
-    - __get_column_value:
-       rettype: text
-       argument: integer, integer, character, text
-
-    - __get_feature_name:
-       rettype: text
-       argument: integer, text
-
-    - __get_feature_value:
-       rettype: text
-       argument: integer, integer, text
-
-    - __get_features_of_nodes:
-       rettype: text
-       argument: text, text, integer, integer, integer
-
-    - __get_id_column_name:
-       rettype: text
-       argument: text
-
-    - __get_schema_name:
-       rettype: text
-       argument: text
-
-    - __get_table_name:
-       rettype: text
-       argument: text
-
-    - __insert_into_metatable:
-       rettype: void
-       argument: text, integer, text, character, boolean, text, integer
-
-    - __is_valid_enc_table:
-       rettype: boolean
-       argument: text
-
-    - __num_of_class:
-       rettype: integer
-       argument: text
-
-    - __num_of_columns:
-       rettype: integer
-       argument: text
-
-    - __num_of_feature:
-       rettype: integer
-       argument: text
-
-    - __regclass_to_text:
-       rettype: text
-       argument: regclass
-
-    - __rename_table:
-       rettype: void
-       argument: text, text
-
-    - __rep_aggr_class_count_ffunc:
-       rettype: bigint[]
-       argument: bigint[]
-
-    - __rep_aggr_class_count_sfunc:
-       rettype: bigint[]
-       argument: bigint[], integer, integer, integer
-
-    - __rep_prune_tree:
-       rettype: void
-       argument: text, text, integer
-
-    - __sample_with_replacement:
-       rettype: void
-       argument: integer, bigint, text, text
-
-    - __sample_within_range:
-       rettype: SETOF bigint
-       argument: bigint, bigint, bigint
-
-    - __scv_aggr_ffunc:
-       rettype: double precision[]
-       argument: double precision[]
-
-    - __scv_aggr_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __scv_aggr_sfunc:
-       rettype: double precision[]
-       argument: double precision[], integer, boolean, integer, double precision[], double precision[], bigint
-
-    - __strip_schema_name:
-       rettype: text
-       argument: text
-
-    - __svm_random_ind2:
-       rettype: double precision[]
-       argument: integer
-
-    - __svm_random_ind:
-       rettype: double precision[]
-       argument: integer
-
-    - __svm_target_cl_func:
-       rettype: double precision
-       argument: double precision[]
-
-    - __svm_target_reg_func:
-       rettype: double precision
-       argument: double precision[]
-
-    - __table_exists:
-       rettype: boolean
-       argument: text
-
-    - __train_tree:
-       rettype: __train_result
-       argument: text, integer, integer, text, text, text, text, text, text, double precision, integer, double precision, double precision, double precision, boolean, integer, integer
-
-    - __treemodel_classify_internal:
-       rettype: text[]
-       argument: text, text, integer
-
-    - __treemodel_classify_internal_serial:
-       rettype: text[]
-       argument: text, text, integer
-
-    - __treemodel_display_no_ordered_aggr:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - __treemodel_display_with_ordered_aggr:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - __treemodel_get_vote_result:
-       rettype: void
-       argument: text, text
-
-    - __treemodel_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - __validate_input_table:
-       rettype: void
-       argument: text, text[], text, text
-
-    - __validate_metatable:
-       rettype: void
-       argument: text
-
-    - c45_classify:
-       rettype: c45_classify_result
-       argument: text, text, text
-
-    - c45_classify:
-       rettype: c45_classify_result
-       argument: text, text, text, integer
-
-    - c45_clean:
-       rettype: boolean
-       argument: text
-
-    - c45_display:
-       rettype: SETOF text
-       argument: text
-
-    - c45_display:
-       rettype: SETOF text
-       argument: text, integer
-
-    - c45_genrule:
-       rettype: SETOF text
-       argument: text
-
-    - c45_genrule:
-       rettype: SETOF text
-       argument: text, integer
-
-    - c45_score:
-       rettype: double precision
-       argument: text, text
-
-    - c45_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text, text, text, text, text, text, double precision, text
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text, text, text, text, text, text, double precision, text, integer, double precision, double precision, integer
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying, character varying
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying, character varying, boolean
-
-    - linear_svm_igd_transition:
-       rettype: double precision[]
-       argument: double precision[], double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean, boolean, double precision, double precision
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean, boolean, double precision, double precision, integer
-
-    - lsvm_predict:
-       rettype: double precision
-       argument: double precision[], double precision[]
-
-    - lsvm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text
-
-    - lsvm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text, boolean
-
-    - matrix_block_trans:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_densify:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_sparsify:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_trans:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text, boolean, integer
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text, integer
-
-    - rf_clean:
-       rettype: boolean
-       argument: text
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text, integer[]
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - rf_score:
-       rettype: double precision
-       argument: text, text
-
-    - rf_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - rf_train:
-       rettype: rf_train_result
-       argument: text, text, text
-
-    - rf_train:
-       rettype: rf_train_result
-       argument: text, text, text, integer, integer, double precision, text, text, text, text, text, integer, double precision, double precision, integer
-
-    - svdmf_run:
-       rettype: text
-       argument: text, text, text, text, integer
-
-    - svdmf_run:
-       rettype: text
-       argument: text, text, text, text, integer, integer, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, boolean, double precision, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, double precision
-
-    - svm_cls_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision, double precision
-
-    - svm_data_normalization:
-       rettype: void
-       argument: text
-
-    - svm_dot:
-       rettype: double precision
-       argument: double precision[], double precision[]
-
-    - svm_dot:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_drop_model:
-       rettype: void
-       argument: text
-
-    - svm_gaussian:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_generate_cls_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_generate_nd_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_generate_reg_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_nd_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], text, double precision, double precision, double precision
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text, boolean, double precision, double precision
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_polynomial:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_predict:
-       rettype: double precision
-       argument: schema_madlib.svm_model_rec, double precision[], text, double precision
-
-    - svm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text, boolean
-
-    - svm_predict_sub:
-       rettype: double precision
-       argument: integer, integer, double precision[], double precision[], double precision[], text, double precision
-
-    - svm_reg_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision, double precision, double precision
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision, double precision
-
-    - svm_store_model:
-       rettype: void
-       argument: text, text, text
-
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    - array_collapse:
-        rettype: anyarray
-        argument: anyarray
-    - linear_svm_igd_transition:
-        rettype: double precision[]
-        argument: double precision[], double precision[], double precision, double precision[], integer, double precision, double precision, boolean, integer, double precision, boolean
-    - profile:
-        rettype: SETOF schema_madlib.profile_result
-        argument: text
-    - profile_full:
-        rettype: SETOF schema_madlib.profile_result
-        argument: text, integer
-    - profile:
-        rettype: schema_madlib.profile_result
-        argument: text
-    - profile_full:
-        rettype: schema_madlib.profile_result
-        argument: text, integer
-    - quantile:
-        rettype: double precision
-        argument: text, text, double precision
-    - quantile_big:
-        rettype: double precision
-        argument: text, text, double precision
-
-# Changes to aggregates (UDA) including removal and modification
-# Overloaded functions should be mentioned separately
-uda:
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    - lmf_igd_step:
-        rettype: double precision[]
-        argument: smallint, smallint, double precision, double precision[], smallint, smallint, smallint, double precision, double precision
-    - __clustered_err_lin_step:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: double precision, double precision[], double precision[]
-    - __clustered_err_log_step:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: boolean, double precision[], double precision[]
-    - __clustered_err_mlog_step:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: integer, double precision[], double precision[], integer, integer
-    - dense_residual_norm:
-        rettype: schema_madlib.residual_norm_result
-        argument: double precision[], double precision, double precision[]
-    - heteroskedasticity_test_linregr:
-        rettype: schema_madlib.heteroskedasticity_test_result
-        argument: double precision, double precision[], double precision[]
-    - linregr:
-        rettype: schema_madlib.linregr_result
-        argument: double precision, double precision[]
-    - robust_linregr:
-        rettype: schema_madlib.robust_linregr_result
-        argument: double precision, double precision[], double precision[]
-    - weighted_sample:
-        rettype: double precision[]
-        argument: double precision[], double precision
-    - weighted_sample:
-        rettype: bigint
-        argument: bigint, double precision
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    - discrete_distribution_agg:
-        rettype: double precision[]
-        argument: integer, double precision, integer
-    - vectorized_distribution_agg:
-        rettype: double precision[]
-        argument: integer[], integer[]
-    # ----------------- Changes from 1.8 to 1.9 ----------
-    - __array_indexed_agg:
-        rettype: double precision[]
-        argument: double precision, bigint, bigint
-
-    - __best_scv_aggr:
-        rettype: double precision[]
-        argument: double precision[], integer, double precision
-
-    - __bigint_array_sum:
-        rettype: bigint[]
-        argument: bigint[]
-
-    - __display_tree_aggr:
-        rettype: text
-        argument: integer, boolean, text, text, double precision, double precision, text, integer
-
-    - __dt_acc_count_aggr:
-        rettype: bigint[]
-        argument: integer, bigint, integer
-
-    - __rep_aggr_class_count:
-        rettype: bigint[]
-        argument: integer, integer, integer
-
-    - __scv_aggr:
-        rettype: double precision[]
-        argument: integer, boolean, integer, double precision[], double precision[], bigint
-
-    - linear_svm_igd_step:
-        rettype: double precision[]
-        argument: double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - linear_svm_igd_step_serial:
-        rettype: double precision[]
-        argument: double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - svm_cls_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision, double precision
-
-    - svm_nd_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], text, double precision, double precision, double precision
-
-    - svm_reg_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision, double precision, double precision
-
-    - __svm_random_ind2:
-        rettype: double precision[]
-        argument: integer
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    - array_agg:
-        rettype: anyarray
-        argument: anyelement
-    - linear_svm_igd_step:
-       rettype: double precision[]
-       argument: double precision[], double precision, double precision[], integer, double precision, double precision, boolean, integer, double precision, boolean
-
-# Casts (UDC) updated/removed
-udc:
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-
-# Operators (UDO) removed/updated
-udo:
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-
-# Operator Classes (UDOC) removed/updated
-udoc:
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/90f4dc15/src/madpack/changelist_1.7.1_1.9.1.yaml
----------------------------------------------------------------------
diff --git a/src/madpack/changelist_1.7.1_1.9.1.yaml b/src/madpack/changelist_1.7.1_1.9.1.yaml
deleted file mode 100644
index 0e989c1..0000000
--- a/src/madpack/changelist_1.7.1_1.9.1.yaml
+++ /dev/null
@@ -1,815 +0,0 @@
-# Changelist for MADlib version 1.7.1 to 1.8
-
-# This file contains all changes that were introduced in a new version of
-# MADlib. This changelist is used by the upgrade script to detect what objects
-# should be upgraded (while retaining all other objects from the previous version)
-
-# New modules (actually .sql_in files) added in upgrade version
-# For these files the sql_in code is retained as is with the functions in the
-# file installed on the upgrade version. All other files (that don't have
-# updates), are cleaned up to remove object replacements
-new module:
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    text_utilities:
-
-# Changes in the types (UDT) including removal and modification
-udt:
-
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    # ----------------- Changes from 1.8 to 1.9 ----------
-    __enc_tbl_result:
-    __gen_acc_time:
-    __rep_type:
-    __train_result:
-    c45_classify_result:
-    c45_train_result:
-    correlation_result:
-    lsvm_sgd_model_rec:
-    lsvm_sgd_result:
-    rf_classify_result:
-    rf_train_result:
-    svm_cls_result:
-    svm_model_pr:
-    svm_model_rec:
-    svm_nd_result:
-    svm_reg_result:
-    svm_support_vector:
-    _prune_result_type:
-    _tree_result_type:
-    linear_svm_result:
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    profile_result:
-
-# List of the UDF changes that affect the user externally. This includes change
-# in function name, return type, argument order or types, or removal of
-# the function. In each case, the original function is as good as removed and a
-# new function is created. In such cases, we should abort the upgrade if there
-# are user views dependent on this function, since the original function will
-# not be present in the upgraded version.
-udf:
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    - __filter_input_relation:
-        rettype: character varying
-        argument: character varying, character varying
-    - __lda_util_unnest:
-        rettype: SETOF bigint[]
-        argument: bigint[]
-    - matrix_block_mult:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text
-    - matrix_block_square:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - matrix_block_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - matrix_block_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, boolean
-    - matrix_blockize:
-        rettype: schema_madlib.matrix_result
-        argument: text, integer, integer, text
-    - matrix_densify:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text, text, text, boolean
-    - matrix_mult:
-        rettype: schema_madlib.matrix_result
-        argument: text, boolean, text, boolean, text
-    - matrix_mult:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text, text, boolean, text, text, text, text, boolean, text
-    - matrix_norm:
-        rettype: double precision
-        argument: text
-    - matrix_scale_and_add:
-        rettype: void
-        argument: text, text, double precision, text
-    - matrix_sparsify:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, boolean
-    - matrix_square:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - matrix_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, boolean
-    - matrix_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text, text, text, boolean
-    - matrix_unblockize:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    # ----------------- Changes from 1.8 to 1.9 ----------
-
-    - _dt_apply:
-       rettype: schema_madlib._tree_result_type
-       argument: schema_madlib.bytea8,schema_madlib.bytea8,schema_madlib.bytea8,smallint,smallint,smallint,boolean,integer
-
-    - internal_linear_svm_igd_result:
-       rettype: schema_madlib.linear_svm_result
-       argument: double precision[]
-
-    - _prune_and_cplist:
-       rettype: schema_madlib._prune_result_type
-       argument: schema_madlib.bytea8,double precision,boolean
-
-    - __array_elem_in:
-       rettype: boolean[]
-       argument: anyarray, anyarray
-
-    - __array_indexed_agg_ffunc:
-       rettype: double precision[]
-       argument: double precision[]
-
-    - __array_indexed_agg_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __array_indexed_agg_sfunc:
-       rettype: double precision[]
-       argument: double precision[], double precision, bigint, bigint
-
-    - __array_search:
-       rettype: boolean
-       argument: anyelement, anyarray
-
-    - __array_sort:
-       rettype: anyarray
-       argument: anyarray
-
-    - __assert:
-       rettype: void
-       argument: boolean, text
-
-    - __assert_table:
-       rettype: void
-       argument: text, boolean
-
-    - __best_scv_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __best_scv_sfunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[], integer, double precision
-
-    - __bigint_array_add:
-       rettype: bigint[]
-       argument: bigint[], bigint[]
-
-    - __breakup_table:
-       rettype: void
-       argument: text, text, text, text, text, text[], boolean[], integer, integer
-
-    - __check_dt_common_params:
-       rettype: void
-       argument: text, text, text, text, text, text, text, text, integer, double precision, double precision, integer, text
-
-    - __check_training_table:
-       rettype: void
-       argument: text, text[], text[], text, text, integer
-
-    - __column_exists:
-       rettype: boolean
-       argument: text, text
-
-    - __columns_in_table:
-       rettype: boolean
-       argument: text[], text
-
-    - __create_metatable:
-       rettype: void
-       argument: text
-
-    - __create_tree_tables:
-       rettype: void
-       argument: text
-
-    - __csvstr_to_array:
-       rettype: text[]
-       argument: text
-
-    - __display_node_sfunc:
-       rettype: text
-       argument: text, integer, boolean, text, text, double precision, double precision, text, integer
-
-    - __display_tree_no_ordered_aggr:
-       rettype: text
-       argument: text, integer, integer, integer, boolean, double precision, text, integer, integer
-
-    - __distinct_feature_value:
-       rettype: integer
-       argument: text, integer
-
-    - __drop_metatable:
-       rettype: void
-       argument: text
-
-    - __dt_acc_count_sfunc:
-       rettype: bigint[]
-       argument: bigint[], integer, bigint, integer
-
-    - __dt_get_node_split_fids:
-       rettype: integer[]
-       argument: integer, integer, integer, integer[]
-
-    - __ebp_calc_errors:
-       rettype: double precision
-       argument: double precision, double precision, double precision
-
-    - __ebp_prune_tree:
-       rettype: void
-       argument: text
-
-    - __encode_and_train:
-       rettype: record
-       argument: text, text, integer, integer, text, text, text, text, text, text, text, double precision, text, integer, double precision, boolean, double precision, double precision, text, integer
-
-    - __encode_columns:
-       rettype: void
-       argument: text, text, integer, integer
-
-    - __encode_table:
-       rettype: void
-       argument: text, text, text, integer, integer
-
-    - __encode_table:
-       rettype: void
-       argument: text, text, text[], text, text[], text, text, integer, integer
-
-    - __find_best_split:
-       rettype: void
-       argument: text, double precision, text, integer, integer, text, integer, integer
-
-    - __format:
-       rettype: text
-       argument: text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text[]
-
-    - __gen_acc:
-       rettype: __gen_acc_time
-       argument: text, text, text, text, text, integer, integer, boolean, integer
-
-    - __gen_enc_meta_names:
-       rettype: text[]
-       argument: text, text
-
-    - __gen_horizontal_encoded_table:
-       rettype: void
-       argument: text, text, integer, integer
-
-    - __gen_vertical_encoded_table:
-       rettype: void
-       argument: text, text, text, boolean, integer
-
-    - __generate_final_tree:
-       rettype: void
-       argument: text
-
-    - __get_class_column_name:
-       rettype: text
-       argument: text
-
-    - __get_class_value:
-       rettype: text
-       argument: integer, text
-
-    - __get_classtable_name:
-       rettype: text
-       argument: text
-
-    - __get_column_value:
-       rettype: text
-       argument: integer, integer, character, text
-
-    - __get_feature_name:
-       rettype: text
-       argument: integer, text
-
-    - __get_feature_value:
-       rettype: text
-       argument: integer, integer, text
-
-    - __get_features_of_nodes:
-       rettype: text
-       argument: text, text, integer, integer, integer
-
-    - __get_id_column_name:
-       rettype: text
-       argument: text
-
-    - __get_schema_name:
-       rettype: text
-       argument: text
-
-    - __get_table_name:
-       rettype: text
-       argument: text
-
-    - __insert_into_metatable:
-       rettype: void
-       argument: text, integer, text, character, boolean, text, integer
-
-    - __is_valid_enc_table:
-       rettype: boolean
-       argument: text
-
-    - __num_of_class:
-       rettype: integer
-       argument: text
-
-    - __num_of_columns:
-       rettype: integer
-       argument: text
-
-    - __num_of_feature:
-       rettype: integer
-       argument: text
-
-    - __regclass_to_text:
-       rettype: text
-       argument: regclass
-
-    - __rename_table:
-       rettype: void
-       argument: text, text
-
-    - __rep_aggr_class_count_ffunc:
-       rettype: bigint[]
-       argument: bigint[]
-
-    - __rep_aggr_class_count_sfunc:
-       rettype: bigint[]
-       argument: bigint[], integer, integer, integer
-
-    - __rep_prune_tree:
-       rettype: void
-       argument: text, text, integer
-
-    - __sample_with_replacement:
-       rettype: void
-       argument: integer, bigint, text, text
-
-    - __sample_within_range:
-       rettype: SETOF bigint
-       argument: bigint, bigint, bigint
-
-    - __scv_aggr_ffunc:
-       rettype: double precision[]
-       argument: double precision[]
-
-    - __scv_aggr_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __scv_aggr_sfunc:
-       rettype: double precision[]
-       argument: double precision[], integer, boolean, integer, double precision[], double precision[], bigint
-
-    - __strip_schema_name:
-       rettype: text
-       argument: text
-
-    - __svm_random_ind2:
-       rettype: double precision[]
-       argument: integer
-
-    - __svm_random_ind:
-       rettype: double precision[]
-       argument: integer
-
-    - __svm_target_cl_func:
-       rettype: double precision
-       argument: double precision[]
-
-    - __svm_target_reg_func:
-       rettype: double precision
-       argument: double precision[]
-
-    - __table_exists:
-       rettype: boolean
-       argument: text
-
-    - __train_tree:
-       rettype: __train_result
-       argument: text, integer, integer, text, text, text, text, text, text, double precision, integer, double precision, double precision, double precision, boolean, integer, integer
-
-    - __treemodel_classify_internal:
-       rettype: text[]
-       argument: text, text, integer
-
-    - __treemodel_classify_internal_serial:
-       rettype: text[]
-       argument: text, text, integer
-
-    - __treemodel_display_no_ordered_aggr:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - __treemodel_display_with_ordered_aggr:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - __treemodel_get_vote_result:
-       rettype: void
-       argument: text, text
-
-    - __treemodel_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - __validate_input_table:
-       rettype: void
-       argument: text, text[], text, text
-
-    - __validate_metatable:
-       rettype: void
-       argument: text
-
-    - c45_classify:
-       rettype: c45_classify_result
-       argument: text, text, text
-
-    - c45_classify:
-       rettype: c45_classify_result
-       argument: text, text, text, integer
-
-    - c45_clean:
-       rettype: boolean
-       argument: text
-
-    - c45_display:
-       rettype: SETOF text
-       argument: text
-
-    - c45_display:
-       rettype: SETOF text
-       argument: text, integer
-
-    - c45_genrule:
-       rettype: SETOF text
-       argument: text
-
-    - c45_genrule:
-       rettype: SETOF text
-       argument: text, integer
-
-    - c45_score:
-       rettype: double precision
-       argument: text, text
-
-    - c45_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text, text, text, text, text, text, double precision, text
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text, text, text, text, text, text, double precision, text, integer, double precision, double precision, integer
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying, character varying
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying, character varying, boolean
-
-    - linear_svm_igd_transition:
-       rettype: double precision[]
-       argument: double precision[], double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean, boolean, double precision, double precision
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean, boolean, double precision, double precision, integer
-
-    - lsvm_predict:
-       rettype: double precision
-       argument: double precision[], double precision[]
-
-    - lsvm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text
-
-    - lsvm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text, boolean
-
-    - matrix_block_trans:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_densify:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_sparsify:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_trans:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text, boolean, integer
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text, integer
-
-    - rf_clean:
-       rettype: boolean
-       argument: text
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text, integer[]
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - rf_score:
-       rettype: double precision
-       argument: text, text
-
-    - rf_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - rf_train:
-       rettype: rf_train_result
-       argument: text, text, text
-
-    - rf_train:
-       rettype: rf_train_result
-       argument: text, text, text, integer, integer, double precision, text, text, text, text, text, integer, double precision, double precision, integer
-
-    - svdmf_run:
-       rettype: text
-       argument: text, text, text, text, integer
-
-    - svdmf_run:
-       rettype: text
-       argument: text, text, text, text, integer, integer, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, boolean, double precision, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, double precision
-
-    - svm_cls_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision, double precision
-
-    - svm_data_normalization:
-       rettype: void
-       argument: text
-
-    - svm_dot:
-       rettype: double precision
-       argument: double precision[], double precision[]
-
-    - svm_dot:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_drop_model:
-       rettype: void
-       argument: text
-
-    - svm_gaussian:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_generate_cls_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_generate_nd_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_generate_reg_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_nd_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], text, double precision, double precision, double precision
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text, boolean, double precision, double precision
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_polynomial:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_predict:
-       rettype: double precision
-       argument: schema_madlib.svm_model_rec, double precision[], text, double precision
-
-    - svm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text, boolean
-
-    - svm_predict_sub:
-       rettype: double precision
-       argument: integer, integer, double precision[], double precision[], double precision[], text, double precision
-
-    - svm_reg_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision, double precision, double precision
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision, double precision
-
-    - svm_store_model:
-       rettype: void
-       argument: text, text, text
-
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    - array_collapse:
-        rettype: anyarray
-        argument: anyarray
-    - linear_svm_igd_transition:
-        rettype: double precision[]
-        argument: double precision[], double precision[], double precision, double precision[], integer, double precision, double precision, boolean, integer, double precision, boolean
-    - profile:
-        rettype: SETOF schema_madlib.profile_result
-        argument: text
-    - profile_full:
-        rettype: SETOF schema_madlib.profile_result
-        argument: text, integer
-    - profile:
-        rettype: schema_madlib.profile_result
-        argument: text
-    - profile_full:
-        rettype: schema_madlib.profile_result
-        argument: text, integer
-    - quantile:
-        rettype: double precision
-        argument: text, text, double precision
-    - quantile_big:
-        rettype: double precision
-        argument: text, text, double precision
-
-# Changes to aggregates (UDA) including removal and modification
-# Overloaded functions should be mentioned separately
-uda:
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    - discrete_distribution_agg:
-        rettype: double precision[]
-        argument: integer, double precision, integer
-    - vectorized_distribution_agg:
-        rettype: double precision[]
-        argument: integer[], integer[]
-    # ----------------- Changes from 1.8 to 1.9 ----------
-
-    - __array_indexed_agg:
-        rettype: double precision[]
-        argument: double precision, bigint, bigint
-
-    - __best_scv_aggr:
-        rettype: double precision[]
-        argument: double precision[], integer, double precision
-
-    - __bigint_array_sum:
-        rettype: bigint[]
-        argument: bigint[]
-
-    - __display_tree_aggr:
-        rettype: text
-        argument: integer, boolean, text, text, double precision, double precision, text, integer
-
-    - __dt_acc_count_aggr:
-        rettype: bigint[]
-        argument: integer, bigint, integer
-
-    - __rep_aggr_class_count:
-        rettype: bigint[]
-        argument: integer, integer, integer
-
-    - __scv_aggr:
-        rettype: double precision[]
-        argument: integer, boolean, integer, double precision[], double precision[], bigint
-
-    - linear_svm_igd_step:
-        rettype: double precision[]
-        argument: double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - linear_svm_igd_step_serial:
-        rettype: double precision[]
-        argument: double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - svm_cls_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision, double precision
-
-    - svm_nd_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], text, double precision, double precision, double precision
-
-    - svm_reg_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision, double precision, double precision
-
-    - __svm_random_ind2:
-        rettype: double precision[]
-        argument: integer
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    - array_agg:
-        rettype: anyarray
-        argument: anyelement
-    - linear_svm_igd_step:
-       rettype: double precision[]
-       argument: double precision[], double precision, double precision[], integer, double precision, double precision, boolean, integer, double precision, boolean
-
-# Casts (UDC) updated/removed
-udc:
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-
-# Operators (UDO) removed/updated
-udo:
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-
-# Operator Classes (UDOC) removed/updated
-udoc:
-    # ----------------- Changes from 1.7.1 to 1.8 ----------

[36/50] [abbrv] incubator-madlib git commit: Release v1.10:

Posted by ri...@apache.org.

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/90f4dc15/src/madpack/upgrade_util.py
----------------------------------------------------------------------
diff --git a/src/madpack/upgrade_util.py b/src/madpack/upgrade_util.py
index 67227f7..21ddd55 100644
--- a/src/madpack/upgrade_util.py
+++ b/src/madpack/upgrade_util.py
@@ -140,55 +140,16 @@ class ChangeHandler(UpgradeBase):
         """
         @brief Load the configuration file
         """
-        # _mad_dbrev = 1.0
-        if self._mad_dbrev.split('.') < '1.1'.split('.'):
-            filename = os.path.join(self._maddir, 'madpack',
-                                    'changelist_1.0_1.9.1.yaml')
-        # _mad_dbrev = 1.1
-        elif self._mad_dbrev.split('.') < '1.2'.split('.'):
-            filename = os.path.join(self._maddir, 'madpack',
-                                    'changelist_1.1_1.9.1.yaml')
-        # _mad_dbrev = 1.2
-        elif self._mad_dbrev.split('.') < '1.3'.split('.'):
-            filename = os.path.join(self._maddir, 'madpack',
-                                    'changelist_1.2_1.9.1.yaml')
-        # _mad_dbrev = 1.3
-        elif self._mad_dbrev.split('.') < '1.4'.split('.'):
-            filename = os.path.join(self._maddir, 'madpack',
-                                    'changelist_1.3_1.9.1.yaml')
-        # _mad_dbrev = 1.4
-        elif self._mad_dbrev.split('.') < '1.4.1'.split('.'):
-            filename = os.path.join(self._maddir, 'madpack',
-                                    'changelist_1.4_1.9.1.yaml')
-        # _mad_dbrev = 1.4.1
-        elif self._mad_dbrev.split('.') < '1.5'.split('.'):
-            filename = os.path.join(self._maddir, 'madpack',
-                                    'changelist_1.4.1_1.9.1.yaml')
-        # _mad_dbrev = 1.5
-        elif self._mad_dbrev.split('.') < '1.6'.split('.'):
-            filename = os.path.join(self._maddir, 'madpack',
-                                    'changelist_1.5_1.9.1.yaml')
-        # _mad_dbrev = 1.6
-        elif self._mad_dbrev.split('.') < '1.6.0S'.split('.'):
-            filename = os.path.join(self._maddir, 'madpack',
-                                    'changelist_1.6_1.9.1.yaml')
-        # _mad_dbrev = 1.6.0S
-        elif self._mad_dbrev.split('.') < '1.7'.split('.'):
-            filename = os.path.join(self._maddir, 'madpack',
-                                    'changelist_1.6_1.9.1.yaml')
-        # _mad_dbrev = 1.7
-        elif self._mad_dbrev.split('.') < '1.7.1'.split('.'):
-            filename = os.path.join(self._maddir, 'madpack',
-                                    'changelist_1.7_1.9.1.yaml')
-        # _mad_dbrev = 1.7.1
-        elif self._mad_dbrev.split('.') < '1.8'.split('.'):
-            filename = os.path.join(self._maddir, 'madpack',
-                                    'changelist_1.7.1_1.9.1.yaml')
+
         # _mad_dbrev = 1.8
-        elif self._mad_dbrev.split('.') < '1.9'.split('.'):
+        if self._mad_dbrev.split('.') < '1.9'.split('.'):
             filename = os.path.join(self._maddir, 'madpack',
-                                    'changelist_1.8_1.9.1.yaml')
+                                    'changelist_1.8_1.10.yaml')
         # _mad_dbrev = 1.9
+        elif self._mad_dbrev.split('.') < '1.9.1'.split('.'):
+            filename = os.path.join(self._maddir, 'madpack',
+                                    'changelist_1.9_1.10.yaml')
+        # _mad_dbrev = 1.9.1
         else:
             filename = os.path.join(self._maddir, 'madpack',
                                     'changelist.yaml')

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/90f4dc15/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in b/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in
index 743bdfe..e91c71a 100644
--- a/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in
@@ -277,7 +277,7 @@ INSERT INTO rf_gr_test (id,gr,f1,f2,f3,cl) VALUES
 (5,2,5,4,1,1);
 
 DROP TABLE IF EXISTS train_output, train_output_summary, train_output_group;
-SELECT madlib.forest_train(
+SELECT forest_train(
                   'rf_gr_test'::TEXT,         -- source table
                   'train_output'::TEXT,    -- output model table
                   'id'::TEXT,              -- id column

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/90f4dc15/src/ports/postgres/modules/svm/test/svm.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/svm/test/svm.sql_in b/src/ports/postgres/modules/svm/test/svm.sql_in
index 2ea711a..60d280e 100644
--- a/src/ports/postgres/modules/svm/test/svm.sql_in
+++ b/src/ports/postgres/modules/svm/test/svm.sql_in
@@ -795,7 +795,7 @@ COPY svm_unbalanced (index, x1, x2, y) FROM stdin delimiter '|';
 
 
 DROP TABLE IF EXISTS svm_out, svm_out_summary;
-SELECT madlib.svm_classification(
+SELECT svm_classification(
     'svm_unbalanced',
     'svm_out',
     'y',
@@ -807,7 +807,7 @@ SELECT madlib.svm_classification(
     );
 
 DROP TABLE IF EXISTS svm_predict_out;
-SELECT madlib.svm_predict('svm_out', 'svm_unbalanced', 'index', 'svm_predict_out');
+SELECT svm_predict('svm_out', 'svm_unbalanced', 'index', 'svm_predict_out');
 
 -- we check if the accuracy in prediction the unbalanced class is relatively
 -- good. Without the class weight, this can go as low as 50%.

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/90f4dc15/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in b/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in
index 79cfba9..5a3dd75 100644
--- a/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in
+++ b/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in
@@ -58,16 +58,16 @@ F|0.550|0.415|0.135|2
 SELECT * FROM abalone;
 
 -- default test
-select madlib.encode_categorical_variables('abalone', 'abalone_out1', 'sex');
+select encode_categorical_variables('abalone', 'abalone_out1', 'sex');
 select * from abalone_out1;
 
 -- ignoring numeric columns
-select madlib.encode_categorical_variables('abalone', 'abalone_out2', 'sex, length');
+select encode_categorical_variables('abalone', 'abalone_out2', 'sex, length');
 select * from abalone_out2;
 
 -- row_id showing multiple columns,
 -- top and value_to_drop able to work together with unquoted column names
-select madlib.encode_categorical_variables('abalone', 'abalone_out3',
+select encode_categorical_variables('abalone', 'abalone_out3',
                                            'sex, "Class"', 'class',
                                            'id, sex, "Class"', '2', 'sex=M, Class=1',
                                            true, 'column', false
@@ -75,7 +75,7 @@ select madlib.encode_categorical_variables('abalone', 'abalone_out3',
 select * from abalone_out3;
 
 -- * working, exclude working, global value_to_drop working
-select madlib.encode_categorical_variables('abalone', 'abalone_out4',
+select encode_categorical_variables('abalone', 'abalone_out4',
                                            '*', '"Class"',
                                            'id', '2', 'M',
                                            true, 'column', false
@@ -84,7 +84,7 @@ select * from abalone_out4;
 
 -- array output working with dictionary output,
 -- top with percent input, global value_to_drop
-select madlib.encode_categorical_variables('abalone', 'abalone_out5',
+select encode_categorical_variables('abalone', 'abalone_out5',
                                            'sex, "Class"', '',
                                            'id', '0.5', 'M',
                                            true, 'array', false
@@ -93,7 +93,7 @@ select * from abalone_out5;
 select * from abalone_out5_dictionary order by index;
 
 -- dictionary working, top with more than possible values working
-select madlib.encode_categorical_variables('abalone', 'abalone_out6',
+select encode_categorical_variables('abalone', 'abalone_out6',
                                            'sex, "Class"', '',
                                            'id', '3', 'class=1',
                                            true, 'svec', true

[28/50] [abbrv] incubator-madlib git commit: Fix headerguard for MallocAllocator

Posted by ri...@apache.org.

Fix headerguard for MallocAllocator

The headerguard for MallocAllocator is defining a different macro
than what it's testing for. Fix the defined macro to match the
check and filename.


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/078ba9eb
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/078ba9eb
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/078ba9eb

Branch: refs/heads/latest_release
Commit: 078ba9eb8860f313a7d14e426decc39627a6043a
Parents: 153037a
Author: Daniel Gustafsson <da...@yesql.se>
Authored: Fri Feb 3 10:36:02 2017 +0100
Committer: Daniel Gustafsson <da...@yesql.se>
Committed: Fri Feb 3 10:36:02 2017 +0100

----------------------------------------------------------------------
 src/utils/MallocAllocator.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/078ba9eb/src/utils/MallocAllocator.hpp
----------------------------------------------------------------------
diff --git a/src/utils/MallocAllocator.hpp b/src/utils/MallocAllocator.hpp
index 2c6ae83..5f245ba 100644
--- a/src/utils/MallocAllocator.hpp
+++ b/src/utils/MallocAllocator.hpp
@@ -5,7 +5,7 @@
  *//* ----------------------------------------------------------------------- */
 
 #ifndef MADLIB_MALLOC_ALLOCATOR_HPP
-#define MADLIB_NALLOC_ALLOCATOR_HPP
+#define MADLIB_MALLOC_ALLOCATOR_HPP
 
 namespace madlib {

[32/50] [abbrv] incubator-madlib git commit: knn: Fix input validation issues

Posted by ri...@apache.org.

knn: Fix input validation issues

- Some missing input validation cases are included now.
- Remove redundant test cases from sql, since they are all now
handled in python code.
- There was still a bug wrt ambiguous references to column names
in the query that computes the squared_dist_norm2 in knn.sql_in.
We now use unique strings for variables in that query, which fixes it.
- Handle boolean values for classification. MADlib's mode()
function does not handle boolean values, so they have to be converted
to integer before using it with mode().

Closes #98


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/2d5a5edb
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/2d5a5edb
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/2d5a5edb

Branch: refs/heads/latest_release
Commit: 2d5a5edb995758d0f8667c2c6c9cf58d9e802d50
Parents: 735dc35
Author: Nandish Jayaram <nj...@apache.org>
Authored: Tue Feb 7 11:06:09 2017 -0800
Committer: Nandish Jayaram <nj...@apache.org>
Committed: Tue Feb 7 11:06:09 2017 -0800

----------------------------------------------------------------------
 src/ports/postgres/modules/knn/knn.py_in       | 128 ++++++++++----------
 src/ports/postgres/modules/knn/knn.sql_in      |  97 ++++++++-------
 src/ports/postgres/modules/knn/test/knn.sql_in |  16 +--
 3 files changed, 128 insertions(+), 113 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/2d5a5edb/src/ports/postgres/modules/knn/knn.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/knn/knn.py_in b/src/ports/postgres/modules/knn/knn.py_in
index da7f9d6..c0d9cd7 100644
--- a/src/ports/postgres/modules/knn/knn.py_in
+++ b/src/ports/postgres/modules/knn/knn.py_in
@@ -44,86 +44,88 @@ UDF_ON_SEGMENT_NOT_ALLOWED = m4_ifdef(<!__UDF_ON_SEGMENT_NOT_ALLOWED__!>, <!True
 # ----------------------------------------------------------------------
 
 
-def knn_validate_src(schema_madlib, **kwargs):
-    trainingSource = kwargs['trainingSource']
-    if not trainingSource:
-        plpy.error("knn error: Invalid training table name!")
-    if not table_exists(trainingSource):
-        plpy.error("knn error: Training table {0} does not exist!".format(trainingSource))
-    if table_is_empty(trainingSource):
-        plpy.error("knn error: Training table {0} is empty!".format(trainingSource))
-
-    testSource = kwargs['testSource']
-    if not testSource:
-        plpy.error("knn error: Invalid test table name!")
-    if not table_exists(testSource):
-        plpy.error("knn error: Test table {0} does not exist!".format(testSource))
-    if table_is_empty(testSource):
-        plpy.error("knn error: Test table {0} is empty!".format(testSource))
-
-    trainingClassColumn = kwargs['trainingClassColumn']
-    trainingFeatureColumn = kwargs['trainingFeatureColumn']
-    for c in (trainingClassColumn, trainingFeatureColumn):
+def knn_validate_src(schema_madlib, point_source, point_column_name, label_column_name,
+    test_source, test_column_name, id_column_name, output_table, operation, k, **kwargs):
+    if not operation or operation not in ['c', 'r']:
+        plpy.error("kNN Error: operation='{0}' is an invalid value, has to be 'r' for regression OR 'c' for classification.".format(operation))
+    if not point_source:
+        plpy.error("kNN Error: Invalid training table name.")
+    if not table_exists(point_source):
+        plpy.error("kNN Error: Training table '{0}' does not exist.".format(point_source))
+    if table_is_empty(point_source):
+        plpy.error("kNN Error: Training table '{0}' is empty.".format(point_source))
+
+    if not test_source:
+        plpy.error("kNN Error: Invalid test table name.")
+    if not table_exists(test_source):
+        plpy.error("kNN Error: Test table '{0}' does not exist.".format(test_source))
+    if table_is_empty(test_source):
+        plpy.error("kNN Error: Test table '{0}' is empty.".format(test_source))
+
+    for c in (label_column_name, point_column_name):
         if not c:
-            plpy.error("knn error: Invalid column name in training table!")
-        if not columns_exist_in_table(trainingSource, [c]):
-            plpy.error("knn error: " + \
-                    "Column '{0}' does not exist in {1}!".format(c, trainingSource))
-
-    testingFeatureColumn = kwargs['testingFeatureColumn']
-    testingIdColumn = kwargs['testingIdColumn']
-    for c in (testingFeatureColumn, testingIdColumn):
+            plpy.error("kNN Error: Invalid column name in training table.")
+        if not columns_exist_in_table(point_source, [c]):
+            plpy.error("kNN Error: " + \
+                    "Column '{0}' does not exist in {1}.".format(c, point_source))
+
+    for c in (test_column_name, id_column_name):
         if not c:
-            plpy.error("knn error: Invalid column name in test table!")
-        if not columns_exist_in_table(testSource, [c]):
-            plpy.error("knn error: " + \
-                    "Column '{0}' does not exist in {1}!".format(c, testSource))
-
-    if not is_col_array(trainingSource, trainingFeatureColumn):
-        plpy.error("knn error:" + \
-                    "'Feature column {0} in train table is not an array!".format(str(trainingFeatureColumn)))
-    if not is_col_array(testSource, testingFeatureColumn):
-        plpy.error("knn error:" + \
-                    "'Feature column {0} in test table is not an array!".format(str(testingFeatureColumn)))
-
-    if not array_col_has_no_null(trainingSource, trainingFeatureColumn):
-        plpy.error("knn error:" + \
-                    "'Feature column {0} in train table has some NULL values!".format(str(trainingFeatureColumn)))
-    if not array_col_has_no_null(testSource, testingFeatureColumn):
-        plpy.error("knn error:" + \
-                    "'Feature column {0} in test table has some NULL values!".format(str(testingFeatureColumn)))
-
-    k = int(kwargs['K'])
+            plpy.error("kNN Error: Invalid column name in test table.")
+        if not columns_exist_in_table(test_source, [c]):
+            plpy.error("kNN Error: " + \
+                    "Column '{0}' does not exist in {1}.".format(c, test_source))
+
+    if not is_col_array(point_source, point_column_name):
+        plpy.error("kNN Error: " + \
+                    "Feature column '{0}' in train table is not an array.".format(point_column_name))
+    if not is_col_array(test_source, test_column_name):
+        plpy.error("kNN Error: " + \
+                    "Feature column '{0}' in test table is not an array.".format(test_column_name))
+
+    if not array_col_has_no_null(point_source, point_column_name):
+        plpy.error("kNN Error: " + \
+                    "Feature column '{0}' in train table has some NULL values.".format(point_column_name))
+    if not array_col_has_no_null(test_source, test_column_name):
+        plpy.error("kNN Error: " + \
+                    "Feature column '{0}' in test table has some NULL values.".format(test_column_name))
+
+    if not output_table:
+        plpy.error("kNN Error: Invalid output table name")
+    if table_exists(output_table):
+        plpy.error("kNN Error: Table '{0}' already exists, cannot use it as output table.".format(output_table))
+
+    if k is None:
+        k = 1
     if k<=0:
-        plpy.error("knn error:" + \
-                    "'k' {0} is not valid for knn!".format(str(k)))
+        plpy.error("kNN Error: k='{0}' is an invalid value, must be greater than 0.".format(k))
     bound = plpy.execute("""SELECT {k} <= count(*)
-            AS bound FROM {tbl}""".format(k=str(k),
-            trainingFeatureColumn=trainingFeatureColumn, tbl=trainingSource))[0]['bound']
+            AS bound FROM {tbl}""".format(k=k,
+            point_column_name=point_column_name, tbl=point_source))[0]['bound']
     if not bound:
-        plpy.error("knn error:" + \
-                "'k' {0} is greater than number of rows in training table!".format(str(k)))
+        plpy.error("kNN Error: " + \
+                "k='{0}' is greater than number of rows in training table.".format(k))
 
-    colTypesList = get_cols_and_types(trainingSource)
+    colTypesList = get_cols_and_types(point_source)
     colType = ''
     for type in colTypesList:
-        if type[0] == trainingClassColumn:
+        if type[0] == label_column_name:
             colType = type[1]
             break
     if colType not in ['INTEGER','integer','double precision','DOUBLE PRECISION','float','FLOAT','boolean','BOOLEAN'] :
-        plpy.error("knn error:" + \
-                    "Data type {0} is not valid as label for scope of knn!".format(str(colType)))
+        plpy.error("kNN Error: " + \
+                    "Data type '{0}' is not a valid type for column '{1}' in table '{2}'.".format(colType, label_column_name, point_source))
 
-    colTypesTestList = get_cols_and_types(testSource)
+    colTypesTestList = get_cols_and_types(test_source)
     colType = ''
     for type in colTypesTestList:
-        if type[0] == testingIdColumn:
+        if type[0] == id_column_name:
             colType = type[1]
             break
     if colType not in ['INTEGER','integer'] :
-        plpy.error("knn error:" + \
-                    "Data type {0} is not valid as Id in test table!".format(str(colType)))
-
+        plpy.error("kNN Error: " + \
+                    "Data type '{0}' is not a valid type for column '{1}' in table '{2}'.".format(colType, id_column_name, test_source))
+    return k
 
 # ----------------------------------------------------------------------
 m4_changequote(<!`!>, <!'!>)

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/2d5a5edb/src/ports/postgres/modules/knn/knn.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/knn/knn.sql_in b/src/ports/postgres/modules/knn/knn.sql_in
index 7ee736b..526c8dd 100644
--- a/src/ports/postgres/modules/knn/knn.sql_in
+++ b/src/ports/postgres/modules/knn/knn.sql_in
@@ -271,19 +271,32 @@ File knn.sql_in documenting the knn SQL functions
 @endinternal
 */
 
-
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__knn_validate_src(
-"trainingSource" VARCHAR,
-"trainingClassColumn" VARCHAR,
-"trainingFeatureColumn" VARCHAR,
-"testSource" VARCHAR,
-"testingIdColumn" VARCHAR,
-"testingFeatureColumn" VARCHAR,
-"K" INTEGER
-) RETURNS VOID AS $$
-    PythonFunction(knn, knn, knn_validate_src)
-$$ LANGUAGE plpythonu
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
+    point_source VARCHAR,
+    point_column_name VARCHAR,
+    label_column_name VARCHAR,
+    test_source VARCHAR,
+    test_column_name VARCHAR,
+    id_column_name VARCHAR,
+    output_table VARCHAR,
+    operation VARCHAR,
+    k INTEGER
+) RETURNS INTEGER AS $$
+    PythonFunctionBodyOnly(`knn', `knn')
+    return knn.knn_validate_src(
+        schema_madlib,
+        point_source,
+        point_column_name,
+        label_column_name,
+        test_source,
+        test_column_name,
+        id_column_name,
+        output_table,
+        operation,
+        k
+    )
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
 
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
@@ -353,10 +366,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
     k INTEGER
 ) RETURNS VARCHAR AS $$
 DECLARE
-    class_test_source REGCLASS;
-    class_point_source REGCLASS;
     l FLOAT;
-    outputTableFlag INTEGER;
     id INTEGER;
     vector DOUBLE PRECISION[];
     cur_pid integer;
@@ -364,29 +374,24 @@ DECLARE
     returnstring VARCHAR;
     x_temp_table VARCHAR;
     y_temp_table VARCHAR;
+    k_val INTEGER;
+    label_column_name_unique VARCHAR;
+    test_id VARCHAR;
+    convert_boolean_to_int VARCHAR;
 BEGIN
     oldClientMinMessages :=
         (SELECT setting FROM pg_settings WHERE name = 'client_min_messages');
     EXECUTE 'SET client_min_messages TO warning';
-    PERFORM MADLIB_SCHEMA.__knn_validate_src(point_source, label_column_name, point_column_name, test_source, id_column_name, test_column_name,k);
-    class_test_source := test_source;
-    class_point_source := point_source;
-    --checks
-    IF (k <= 0) THEN
-        RAISE EXCEPTION 'KNN error: Number of neighbors k must be a positive integer.';
-    END IF;
-    IF (operation != 'c' AND operation != 'r') THEN
-        RAISE EXCEPTION 'KNN error: The operation has to be r for regression OR c for classification.';
-    END IF;
+    SELECT * FROM MADLIB_SCHEMA.__knn_validate_src(point_source, point_column_name, label_column_name, test_source, test_column_name, id_column_name, output_table, operation, k) INTO k_val;
     PERFORM MADLIB_SCHEMA.create_schema_pg_temp();
     x_temp_table := 'knn_'||md5('knn_'||now()::text||random()::text)||'_temp';
     y_temp_table := 'knn_'||md5('knn_'||now()::text||random()::text)||'_temp';
+    label_column_name_unique := 'label'||md5('knn_'||now()::text||random()::text)||'_name';
+    test_id := 'id'||md5('knn_'||now()::text||random()::text)||'_name';
 
-    EXECUTE
-	$sql$
-	SELECT count(*) FROM information_schema.tables WHERE table_name = '$sql$ || output_table || $sql$'$sql$ into outputTableFlag;
-    IF (outputTableFlag != 0) THEN
-	RAISE Exception 'KNN error: Output table % already exists.', output_table;
+    convert_boolean_to_int := '';
+    IF (operation = 'c') THEN
+        convert_boolean_to_int := '::INTEGER';
     END IF;
 
     EXECUTE
@@ -396,30 +401,38 @@ BEGIN
 	SELECT *
     FROM
         (
-        SELECT row_number() over (partition by test_id order by dist) as r, $sql$ || x_temp_table || $sql$.*
+        SELECT row_number() over (partition by $sql$ || test_id || $sql$ order by dist) AS r, $sql$ || x_temp_table || $sql$.*
         FROM
             (
-                SELECT test. $sql$ || id_column_name || $sql$ as test_id, MADLIB_SCHEMA.squared_dist_norm2(train.$sql$ || point_column_name || $sql$,test.$sql$ || test_column_name || $sql$) as dist, $sql$ || label_column_name || $sql$ from $sql$ || textin(regclassout(point_source)) || $sql$ AS train, $sql$ || textin(regclassout(test_source)) || $sql$ AS test
+                SELECT test.$sql$ || id_column_name || $sql$ AS $sql$ || test_id || $sql$, MADLIB_SCHEMA.squared_dist_norm2(train.$sql$ || point_column_name || $sql$,test.$sql$ || test_column_name || $sql$) AS dist, train.$sql$ || label_column_name || $sql$ $sql$ || convert_boolean_to_int || $sql$ AS $sql$ || label_column_name_unique || $sql$
+                FROM $sql$ || textin(regclassout(point_source)) || $sql$ AS train, $sql$ || textin(regclassout(test_source)) || $sql$ AS test
             )$sql$ || x_temp_table || $sql$
         )$sql$ || y_temp_table || $sql$
-    WHERE $sql$ || y_temp_table || $sql$.r <= $sql$ || k;
-	IF (operation = 'c') THEN
+    WHERE $sql$ || y_temp_table || $sql$.r <= $sql$ || k_val;
+
+    IF (operation = 'c') THEN
     	EXECUTE
         $sql$
-	CREATE TABLE $sql$ || output_table || $sql$ AS
-    SELECT test_id as id, $sql$ || test_column_name || $sql$, MADLIB_SCHEMA.mode($sql$ || label_column_name || $sql$) as prediction from pg_temp.madlib_knn_interm join $sql$ || textin(regclassout(test_source)) || $sql$  on test_id=$sql$ || id_column_name || $sql$ group by test_id, $sql$ || test_column_name;
-        ELSE
+    	CREATE TABLE $sql$ || output_table || $sql$ AS
+        SELECT $sql$ || test_id || $sql$ AS id, $sql$ || test_column_name || $sql$, MADLIB_SCHEMA.mode($sql$ || label_column_name_unique || $sql$) AS prediction
+        FROM pg_temp.madlib_knn_interm join $sql$ || textin(regclassout(test_source)) || $sql$ ON $sql$ || test_id || $sql$=$sql$ || id_column_name || $sql$
+        GROUP BY $sql$ || test_id || $sql$, $sql$ || test_column_name;
+    ELSE
         EXECUTE
         $sql$
-	CREATE TABLE $sql$ || output_table || $sql$ AS
-        SELECT test_id as id, $sql$ || test_column_name || $sql$ ,avg($sql$ || label_column_name || $sql$) as prediction from pg_temp.madlib_knn_interm join $sql$ || textin(regclassout(test_source)) || $sql$  on test_id=$sql$ || id_column_name || $sql$ group by test_id, $sql$ || test_column_name || $sql$ order by test_id $sql$;
-        END IF;
+	    CREATE TABLE $sql$ || output_table || $sql$ AS
+        SELECT $sql$ || test_id || $sql$ AS id, $sql$ || test_column_name || $sql$, avg($sql$ || label_column_name_unique || $sql$) AS prediction
+        FROM
+            pg_temp.madlib_knn_interm join $sql$ || textin(regclassout(test_source)) || $sql$ on $sql$ || test_id || $sql$=$sql$ || id_column_name || $sql$
+        GROUP BY $sql$ || test_id || $sql$, $sql$ || test_column_name || $sql$
+        ORDER BY $sql$ || test_id || $sql$ $sql$;
+    END IF;
 
    EXECUTE 'SET client_min_messages TO ' || oldClientMinMessages;
    IF (operation = 'c') THEN
-   	returnstring := 'The classification results have been written to table';
+   	returnstring := 'The classification results have been written to output table '||output_table;
    ELSE
-        returnstring := 'The regression results have been written to table';
+        returnstring := 'The regression results have been written to output table '||output_table;
    END IF;
    DROP TABLE pg_temp.madlib_knn_interm;
    RETURN returnstring;

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/2d5a5edb/src/ports/postgres/modules/knn/test/knn.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/knn/test/knn.sql_in b/src/ports/postgres/modules/knn/test/knn.sql_in
index 3c730ee..1bf6b57 100644
--- a/src/ports/postgres/modules/knn/test/knn.sql_in
+++ b/src/ports/postgres/modules/knn/test/knn.sql_in
@@ -55,16 +55,16 @@ copy knn_test_data (id, data) from stdin delimiter '|';
 6|{50,45}
 \.
 drop table if exists madlib_knn_result_classification;
-select madlib.knn('knn_train_data','data','label','knn_test_data','data','id','madlib_knn_result_classification','c',3);
-select madlib.assert(array_agg(prediction order by id)='{1,1,0,1,0,0}', 'Wrong output in classification with k=3') from madlib_knn_result_classification;
+select knn('knn_train_data','data','label','knn_test_data','data','id','madlib_knn_result_classification','c',3);
+select assert(array_agg(prediction order by id)='{1,1,0,1,0,0}', 'Wrong output in classification with k=3') from madlib_knn_result_classification;
 
 drop table if exists madlib_knn_result_regression;
-select madlib.knn('knn_train_data','data','label','knn_test_data','data','id','madlib_knn_result_regression','r',4);
-select madlib.assert(array_agg(prediction order by id)='{1,1,0.5,1,0.25,0.25}', 'Wrong output in regression') from madlib_knn_result_regression;
+select knn('knn_train_data','data','label','knn_test_data','data','id','madlib_knn_result_regression','r',4);
+select assert(array_agg(prediction order by id)='{1,1,0.5,1,0.25,0.25}', 'Wrong output in regression') from madlib_knn_result_regression;
 
 drop table if exists madlib_knn_result_classification;
-select madlib.knn('knn_train_data','data','label','knn_test_data','data','id','madlib_knn_result_classification','c');
-select madlib.assert(array_agg(prediction order by id)='{1,1,0,1,0,0}', 'Wrong output in classification with k=1') from madlib_knn_result_classification;
+select knn('knn_train_data','data','label','knn_test_data','data','id','madlib_knn_result_classification','c');
+select assert(array_agg(prediction order by id)='{1,1,0,1,0,0}', 'Wrong output in classification with k=1') from madlib_knn_result_classification;
 
-select madlib.knn();
-select madlib.knn('help');
+select knn();
+select knn('help');

[39/50] [abbrv] incubator-madlib git commit: Release v1.10:

Posted by ri...@apache.org.

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/90f4dc15/src/madpack/changelist_1.4_1.9.1.yaml
----------------------------------------------------------------------
diff --git a/src/madpack/changelist_1.4_1.9.1.yaml b/src/madpack/changelist_1.4_1.9.1.yaml
deleted file mode 100644
index f3e4117..0000000
--- a/src/madpack/changelist_1.4_1.9.1.yaml
+++ /dev/null
@@ -1,1220 +0,0 @@
-# Changelist for MADlib version 1.4 to 1.9.1
-
-# This file contains all changes that were introduced in a new version of
-# MADlib. This changelist is used by the upgrade script to detect what objects
-# should be upgraded (while retaining all other objects from the previous version)
-
-# New modules (actually .sql_in files) added in upgrade version
-# For these files the sql_in code is retained as is, with the functions in the
-# file installed on the upgrade version. All other files (those that don't have
-# updates) are cleaned up to remove object replacements.
-new module:
-    # ----------------- Changes from 1.4 to 1.4.1 ---------------
-    # ----------------- Changes from 1.4.1 to 1.5 ---------------
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    table_to_pmml:
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    glm:
-    multiresponseglm:
-    ordinal:
-    decision_tree:
-    random_forest:
-    distribution:
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    text_utilities:
-
-# Changes in the types (UDT) including removal and modification
-udt:
-
-    # ----------------- Changes from 1.4 to 1.4.1 ---------------
-    # ----------------- Changes from 1.4.1 to 1.5 ---------------
-    __utils_scaled_data:
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    __logregr_result:
-    coxph_result:
-    linregr_result:
-    mlogregr_result:
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    bytea8:
-    # ----------------- Changes from 1.8 to 1.9 ----------
-    __enc_tbl_result:
-    __gen_acc_time:
-    __rep_type:
-    __train_result:
-    c45_classify_result:
-    c45_train_result:
-    correlation_result:
-    lsvm_sgd_model_rec:
-    lsvm_sgd_result:
-    rf_classify_result:
-    rf_train_result:
-    svm_cls_result:
-    svm_model_pr:
-    svm_model_rec:
-    svm_nd_result:
-    svm_reg_result:
-    svm_support_vector:
-    _prune_result_type:
-    _tree_result_type:
-    linear_svm_result:
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    profile_result:
-
-# List of the UDF changes that affect the user externally.  This includes change
-# in function name, change in argument order or argument types, and removal of
-# the function. In each case, the original function is as good as removed and a
-# new function is created. In such cases, we should abort the upgrade if there
-# are user views dependent on this function, since the original function will
-# not be present in the upgraded version.
-udf:
-    # ----------------- Changes from 1.4 to 1.4.1 ---------------
-    # ----------------- Changes from 1.4.1 to 1.5 ---------------
-    - __cmsketch_final:
-        rettype: bytea
-        argument: bytea
-    - __delete_traininginfo:
-        rettype: void
-        argument: text
-    - __get_encode_table_name:
-        rettype: text
-        argument: text
-    - __get_metatable_name:
-        rettype: text
-        argument: text
-    - __get_routine_id:
-        rettype: integer
-        argument: text
-    - __get_routine_name:
-        rettype: text
-        argument: text
-    - __get_tree_table_name:
-        rettype: text
-        argument: text
-    - __insert_into_traininginfo:
-        rettype: void
-        argument: text, text, text, text, text, text, text, text, double precision, integer, integer
-    - __treemodel_clean:
-        rettype: boolean
-        argument: text
-    - compute_lincrf:
-        rettype: integer
-        argument: character varying, character varying, character varying, character varying, character varying, integer, integer
-    - create_nb_classify_fn:
-        rettype: void
-        argument: character varying, character varying, integer, character varying
-    - create_nb_classify_fn:
-        rettype: void
-        argument: character varying, character varying, character varying, integer, character varying
-    - create_table_as:
-        rettype: void
-        argument: character varying, character varying
-    - create_table_as:
-        rettype: void
-        argument: character varying, character varying, character varying
-    - create_temp_table_as:
-        rettype: void
-        argument: character varying, character varying, character varying
-    - create_temp_table_as:
-        rettype: void
-        argument: character varying, character varying
-    - create_temporary_table_as:
-        rettype: void
-        argument: character varying, character varying, character varying
-    - create_temporary_table_as:
-        rettype: void
-        argument: character varying, character varying
-    - crf_train_fgen:
-        rettype: void
-        argument: text, text, text, text, text
-    - insert_into:
-        rettype: void
-        argument: character varying, character varying
-    - internal_create_table_as:
-        rettype: void
-        argument: boolean, character varying, character varying, character varying
-    - internal_execute_using_kmeans_args:
-        rettype: void
-        argument: character varying, double precision[], regproc, integer, double precision
-    - internal_execute_using_kmeanspp_seeding_args:
-        rettype: void
-        argument: character varying, integer, regproc, double precision[]
-    - internal_execute_using_silhouette_args:
-        rettype: double precision
-        argument: character varying, double precision[], regproc
-    - lincrf:
-        rettype: integer
-        argument: character varying, character varying, character varying, character varying, character varying, integer, character varying, character varying, integer
-    - lincrf:
-        rettype: integer
-        argument: character varying, character varying, character varying, character varying, character varying, integer, character varying, character varying
-    - lsvm_predict:
-        rettype: double precision
-        argument: text, double precision[]
-    - lsvm_predict_combo:
-        rettype: SETOF schema_madlib.svm_model_pr
-        argument: text, double precision[]
-    - lsvm_sgd_update:
-        rettype: schema_madlib.lsvm_sgd_model_rec
-        argument: schema_madlib.lsvm_sgd_model_rec, double precision[], double precision, double precision, double precision
-    - svm_cls_update:
-        rettype: schema_madlib.svm_model_rec
-        argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision
-    - svm_nd_update:
-        rettype: schema_madlib.svm_model_rec
-        argument: schema_madlib.svm_model_rec, double precision[], text, double precision, double precision
-    - svm_predict:
-        rettype: double precision
-        argument: schema_madlib.svm_model_rec, double precision[], text
-    - svm_predict:
-        rettype: double precision
-        argument: text, double precision[]
-    - svm_predict_combo:
-        rettype: SETOF schema_madlib.svm_model_pr
-        argument: text, double precision[]
-    - svm_predict_sub:
-        rettype: double precision
-        argument: integer, integer, double precision[], double precision[], double precision[], text
-    - svm_reg_update:
-        rettype: schema_madlib.svm_model_rec
-        argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision, double precision
-    - utils_normalize_data:
-        rettype: schema_madlib.__utils_scaled_data
-        argument: double precision[], double precision[], double precision[]
-    - vcrf_top1_label:
-        rettype: integer[]
-        argument: integer[], integer[], integer
-    - vcrf_top1_view:
-        rettype: text
-        argument: text, text, text, text
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    # Removed functions
-    - array_contains_null:
-        rettype: boolean
-        argument: double precision[]
-    - array_sqrt:
-        rettype: anyarray
-        argument: anyarray
-    - coxph_step_final:
-        rettype: double precision[]
-        argument: double precision[]
-    - coxph_step_strata_final:
-        rettype: double precision[]
-        argument: double precision[]
-    - coxph_step_transition:
-        rettype: double precision[]
-        argument: double precision[], double precision[], double precision, boolean, double precision[]
-    - internal_coxph_result:
-        rettype: schema_madlib.coxph_result
-        argument: double precision[]
-    - internal_coxph_step_distance:
-        rettype: double precision
-        argument: double precision[], double precision[]
-    - normalize:
-        rettype: double precision[]
-        argument: double precision[]
-    # Changed functions (return type)
-    # These functions can be recreated correctly even if we don't add them here.
-    # But the view dependency checker needs the information.
-    - __internal_mlogregr_irls_result:
-        rettype: schema_madlib.mlogregr_result
-        argument: double precision[]
-    - __logregr_cg_result:
-        rettype: schema_madlib.__logregr_result
-        argument: double precision[]
-    - __logregr_igd_result:
-        rettype: schema_madlib.__logregr_result
-        argument: double precision[]
-    - __logregr_irls_result:
-        rettype: schema_madlib.__logregr_result
-        argument: double precision[]
-    - linregr_final:
-        rettype: schema_madlib.linregr_result
-        argument: schema_madlib.bytea8
-    - mlogregr:
-        rettype: schema_madlib.mlogregr_result
-        argument: character varying, character varying, character varying, integer, character varying, double precision, integer
-    - mlogregr:
-        rettype: schema_madlib.mlogregr_result
-        argument: character varying, character varying, character varying, integer, character varying
-    - mlogregr:
-        rettype: schema_madlib.mlogregr_result
-        argument: character varying, character varying, character varying, integer
-    - mlogregr:
-        rettype: schema_madlib.mlogregr_result
-        argument: character varying, character varying, character varying
-    # name of rettype changed
-    - compute_coxph_result:
-        rettype: schema_madlib.coxph_new_result
-        argument: double precision[], double precision, double precision[], integer, double precision[]
-    # argument changed
-    - coxph_train:
-        rettype: void
-        argument: text
-    - coxph_train:
-        rettype: void
-        argument: text, text, text, text
-    - coxph_train:
-        rettype: void
-        argument: text, text, text, text, text
-    - coxph_train:
-        rettype: void
-        argument: text, text, text, text, text, text
-    - coxph_train:
-        rettype: void
-        argument: text, text, text, text, text, text, text
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    - cross_validation_general:   # change in name of argument "fold_num"
-        rettype: void
-        argument: character varying, character varying[], character varying[], character varying, character varying[], character varying, character varying[], character varying[], character varying, character varying[], character varying[], character varying, character varying, boolean, character varying, character varying[], integer
-    - lmf_igd_transition:
-        rettype: double precision[]
-        argument: double precision[], smallint, smallint, double precision, double precision[], smallint, smallint, smallint, double precision, double precision
-    # depending on bytea8
-    # return type is bytea8
-    - __clustered_err_lin_merge:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - __clustered_err_lin_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[], double precision[]
-    - __clustered_err_log_merge:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - __clustered_err_log_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, boolean, double precision[], double precision[]
-    - __clustered_err_mlog_merge:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - __clustered_err_mlog_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, integer, double precision[], double precision[], integer, integer
-    - dense_residual_norm_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - dense_residual_norm_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision[], double precision, double precision[]
-    - hetero_linregr_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - hetero_linregr_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[], double precision[]
-    - linregr_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - linregr_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[]
-    - robust_linregr_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - robust_linregr_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[], double precision[]
-    - weighted_sample_merge_int64:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - weighted_sample_merge_vector:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - weighted_sample_transition_int64:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, bigint, double precision
-    - weighted_sample_transition_vector:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision[], double precision
-    # argument type bytea8
-    - __clustered_err_lin_final:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: schema_madlib.bytea8
-    - __clustered_err_log_final:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: schema_madlib.bytea8
-    - __clustered_err_mlog_final:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: schema_madlib.bytea8
-    - dense_residual_norm_final:
-        rettype: schema_madlib.residual_norm_result
-        argument: schema_madlib.bytea8
-    - hetero_linregr_final:
-        rettype: schema_madlib.heteroskedasticity_test_result
-        argument: schema_madlib.bytea8
-    - linregr_final:
-        rettype: schema_madlib.linregr_result
-        argument: schema_madlib.bytea8
-    - robust_linregr_final:
-        rettype: schema_madlib.robust_linregr_result
-        argument: schema_madlib.bytea8
-    - weighted_sample_final_int64:
-        rettype: bigint
-        argument: schema_madlib.bytea8
-    - weighted_sample_final_vector:
-        rettype: double precision[]
-        argument: schema_madlib.bytea8
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    - __filter_input_relation:
-        rettype: character varying
-        argument: character varying, character varying
-    - __lda_util_unnest:
-        rettype: SETOF bigint[]
-        argument: bigint[]
-    - matrix_block_mult:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text
-    - matrix_block_square:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - matrix_block_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - matrix_block_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, boolean
-    - matrix_blockize:
-        rettype: schema_madlib.matrix_result
-        argument: text, integer, integer, text
-    - matrix_densify:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text, text, text, boolean
-    - matrix_mult:
-        rettype: schema_madlib.matrix_result
-        argument: text, boolean, text, boolean, text
-    - matrix_mult:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text, text, boolean, text, text, text, text, boolean, text
-    - matrix_norm:
-        rettype: double precision
-        argument: text
-    - matrix_scale_and_add:
-        rettype: void
-        argument: text, text, double precision, text
-    - matrix_sparsify:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, boolean
-    - matrix_square:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - matrix_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, boolean
-    - matrix_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text, text, text, boolean
-    - matrix_unblockize:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    # ----------------- Changes from 1.8 to 1.9 ----------
-
-    - _dt_apply:
-       rettype: schema_madlib._tree_result_type
-       argument: schema_madlib.bytea8,schema_madlib.bytea8,schema_madlib.bytea8,smallint,smallint,smallint,boolean,integer
-
-    - internal_linear_svm_igd_result:
-       rettype: schema_madlib.linear_svm_result
-       argument: double precision[]
-
-    - _prune_and_cplist:
-       rettype: schema_madlib._prune_result_type
-       argument: schema_madlib.bytea8,double precision,boolean
-
-    - __array_elem_in:
-       rettype: boolean[]
-       argument: anyarray, anyarray
-
-    - __array_indexed_agg_ffunc:
-       rettype: double precision[]
-       argument: double precision[]
-
-    - __array_indexed_agg_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __array_indexed_agg_sfunc:
-       rettype: double precision[]
-       argument: double precision[], double precision, bigint, bigint
-
-    - __array_search:
-       rettype: boolean
-       argument: anyelement, anyarray
-
-    - __array_sort:
-       rettype: anyarray
-       argument: anyarray
-
-    - __assert:
-       rettype: void
-       argument: boolean, text
-
-    - __assert_table:
-       rettype: void
-       argument: text, boolean
-
-    - __best_scv_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __best_scv_sfunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[], integer, double precision
-
-    - __bigint_array_add:
-       rettype: bigint[]
-       argument: bigint[], bigint[]
-
-    - __breakup_table:
-       rettype: void
-       argument: text, text, text, text, text, text[], boolean[], integer, integer
-
-    - __check_dt_common_params:
-       rettype: void
-       argument: text, text, text, text, text, text, text, text, integer, double precision, double precision, integer, text
-
-    - __check_training_table:
-       rettype: void
-       argument: text, text[], text[], text, text, integer
-
-    - __column_exists:
-       rettype: boolean
-       argument: text, text
-
-    - __columns_in_table:
-       rettype: boolean
-       argument: text[], text
-
-    - __create_metatable:
-       rettype: void
-       argument: text
-
-    - __create_tree_tables:
-       rettype: void
-       argument: text
-
-    - __csvstr_to_array:
-       rettype: text[]
-       argument: text
-
-    - __display_node_sfunc:
-       rettype: text
-       argument: text, integer, boolean, text, text, double precision, double precision, text, integer
-
-    - __display_tree_no_ordered_aggr:
-       rettype: text
-       argument: text, integer, integer, integer, boolean, double precision, text, integer, integer
-
-    - __distinct_feature_value:
-       rettype: integer
-       argument: text, integer
-
-    - __drop_metatable:
-       rettype: void
-       argument: text
-
-    - __dt_acc_count_sfunc:
-       rettype: bigint[]
-       argument: bigint[], integer, bigint, integer
-
-    - __dt_get_node_split_fids:
-       rettype: integer[]
-       argument: integer, integer, integer, integer[]
-
-    - __ebp_calc_errors:
-       rettype: double precision
-       argument: double precision, double precision, double precision
-
-    - __ebp_prune_tree:
-       rettype: void
-       argument: text
-
-    - __encode_and_train:
-       rettype: record
-       argument: text, text, integer, integer, text, text, text, text, text, text, text, double precision, text, integer, double precision, boolean, double precision, double precision, text, integer
-
-    - __encode_columns:
-       rettype: void
-       argument: text, text, integer, integer
-
-    - __encode_table:
-       rettype: void
-       argument: text, text, text, integer, integer
-
-    - __encode_table:
-       rettype: void
-       argument: text, text, text[], text, text[], text, text, integer, integer
-
-    - __find_best_split:
-       rettype: void
-       argument: text, double precision, text, integer, integer, text, integer, integer
-
-    - __format:
-       rettype: text
-       argument: text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text[]
-
-    - __gen_acc:
-       rettype: __gen_acc_time
-       argument: text, text, text, text, text, integer, integer, boolean, integer
-
-    - __gen_enc_meta_names:
-       rettype: text[]
-       argument: text, text
-
-    - __gen_horizontal_encoded_table:
-       rettype: void
-       argument: text, text, integer, integer
-
-    - __gen_vertical_encoded_table:
-       rettype: void
-       argument: text, text, text, boolean, integer
-
-    - __generate_final_tree:
-       rettype: void
-       argument: text
-
-    - __get_class_column_name:
-       rettype: text
-       argument: text
-
-    - __get_class_value:
-       rettype: text
-       argument: integer, text
-
-    - __get_classtable_name:
-       rettype: text
-       argument: text
-
-    - __get_column_value:
-       rettype: text
-       argument: integer, integer, character, text
-
-    - __get_feature_name:
-       rettype: text
-       argument: integer, text
-
-    - __get_feature_value:
-       rettype: text
-       argument: integer, integer, text
-
-    - __get_features_of_nodes:
-       rettype: text
-       argument: text, text, integer, integer, integer
-
-    - __get_id_column_name:
-       rettype: text
-       argument: text
-
-    - __get_schema_name:
-       rettype: text
-       argument: text
-
-    - __get_table_name:
-       rettype: text
-       argument: text
-
-    - __insert_into_metatable:
-       rettype: void
-       argument: text, integer, text, character, boolean, text, integer
-
-    - __is_valid_enc_table:
-       rettype: boolean
-       argument: text
-
-    - __num_of_class:
-       rettype: integer
-       argument: text
-
-    - __num_of_columns:
-       rettype: integer
-       argument: text
-
-    - __num_of_feature:
-       rettype: integer
-       argument: text
-
-    - __regclass_to_text:
-       rettype: text
-       argument: regclass
-
-    - __rename_table:
-       rettype: void
-       argument: text, text
-
-    - __rep_aggr_class_count_ffunc:
-       rettype: bigint[]
-       argument: bigint[]
-
-    - __rep_aggr_class_count_sfunc:
-       rettype: bigint[]
-       argument: bigint[], integer, integer, integer
-
-    - __rep_prune_tree:
-       rettype: void
-       argument: text, text, integer
-
-    - __sample_with_replacement:
-       rettype: void
-       argument: integer, bigint, text, text
-
-    - __sample_within_range:
-       rettype: SETOF bigint
-       argument: bigint, bigint, bigint
-
-    - __scv_aggr_ffunc:
-       rettype: double precision[]
-       argument: double precision[]
-
-    - __scv_aggr_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __scv_aggr_sfunc:
-       rettype: double precision[]
-       argument: double precision[], integer, boolean, integer, double precision[], double precision[], bigint
-
-    - __strip_schema_name:
-       rettype: text
-       argument: text
-
-    - __svm_random_ind2:
-       rettype: double precision[]
-       argument: integer
-
-    - __svm_random_ind:
-       rettype: double precision[]
-       argument: integer
-
-    - __svm_target_cl_func:
-       rettype: double precision
-       argument: double precision[]
-
-    - __svm_target_reg_func:
-       rettype: double precision
-       argument: double precision[]
-
-    - __table_exists:
-       rettype: boolean
-       argument: text
-
-    - __train_tree:
-       rettype: __train_result
-       argument: text, integer, integer, text, text, text, text, text, text, double precision, integer, double precision, double precision, double precision, boolean, integer, integer
-
-    - __treemodel_classify_internal:
-       rettype: text[]
-       argument: text, text, integer
-
-    - __treemodel_classify_internal_serial:
-       rettype: text[]
-       argument: text, text, integer
-
-    - __treemodel_display_no_ordered_aggr:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - __treemodel_display_with_ordered_aggr:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - __treemodel_get_vote_result:
-       rettype: void
-       argument: text, text
-
-    - __treemodel_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - __validate_input_table:
-       rettype: void
-       argument: text, text[], text, text
-
-    - __validate_metatable:
-       rettype: void
-       argument: text
-
-    - c45_classify:
-       rettype: c45_classify_result
-       argument: text, text, text
-
-    - c45_classify:
-       rettype: c45_classify_result
-       argument: text, text, text, integer
-
-    - c45_clean:
-       rettype: boolean
-       argument: text
-
-    - c45_display:
-       rettype: SETOF text
-       argument: text
-
-    - c45_display:
-       rettype: SETOF text
-       argument: text, integer
-
-    - c45_genrule:
-       rettype: SETOF text
-       argument: text
-
-    - c45_genrule:
-       rettype: SETOF text
-       argument: text, integer
-
-    - c45_score:
-       rettype: double precision
-       argument: text, text
-
-    - c45_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text, text, text, text, text, text, double precision, text
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text, text, text, text, text, text, double precision, text, integer, double precision, double precision, integer
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying, character varying
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying, character varying, boolean
-
-    - linear_svm_igd_transition:
-       rettype: double precision[]
-       argument: double precision[], double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean, boolean, double precision, double precision
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean, boolean, double precision, double precision, integer
-
-    - lsvm_predict:
-       rettype: double precision
-       argument: double precision[], double precision[]
-
-    - lsvm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text
-
-    - lsvm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text, boolean
-
-    - matrix_block_trans:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_densify:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_sparsify:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_trans:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text, boolean, integer
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text, integer
-
-    - rf_clean:
-       rettype: boolean
-       argument: text
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text, integer[]
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - rf_score:
-       rettype: double precision
-       argument: text, text
-
-    - rf_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - rf_train:
-       rettype: rf_train_result
-       argument: text, text, text
-
-    - rf_train:
-       rettype: rf_train_result
-       argument: text, text, text, integer, integer, double precision, text, text, text, text, text, integer, double precision, double precision, integer
-
-    - svdmf_run:
-       rettype: text
-       argument: text, text, text, text, integer
-
-    - svdmf_run:
-       rettype: text
-       argument: text, text, text, text, integer, integer, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, boolean, double precision, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, double precision
-
-    - svm_cls_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision, double precision
-
-    - svm_data_normalization:
-       rettype: void
-       argument: text
-
-    - svm_dot:
-       rettype: double precision
-       argument: double precision[], double precision[]
-
-    - svm_dot:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_drop_model:
-       rettype: void
-       argument: text
-
-    - svm_gaussian:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_generate_cls_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_generate_nd_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_generate_reg_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_nd_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], text, double precision, double precision, double precision
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text, boolean, double precision, double precision
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_polynomial:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_predict:
-       rettype: double precision
-       argument: schema_madlib.svm_model_rec, double precision[], text, double precision
-
-    - svm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text, boolean
-
-    - svm_predict_sub:
-       rettype: double precision
-       argument: integer, integer, double precision[], double precision[], double precision[], text, double precision
-
-    - svm_reg_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision, double precision, double precision
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision, double precision
-
-    - svm_store_model:
-       rettype: void
-       argument: text, text, text
-
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    - array_collapse:
-        rettype: anyarray
-        argument: anyarray
-    - linear_svm_igd_transition:
-        rettype: double precision[]
-        argument: double precision[], double precision[], double precision, double precision[], integer, double precision, double precision, boolean, integer, double precision, boolean
-    - profile:
-        rettype: SETOF schema_madlib.profile_result
-        argument: text
-    - profile_full:
-        rettype: SETOF schema_madlib.profile_result
-        argument: text, integer
-    - profile:
-        rettype: schema_madlib.profile_result
-        argument: text
-    - profile_full:
-        rettype: schema_madlib.profile_result
-        argument: text, integer
-    - quantile:
-        rettype: double precision
-        argument: text, text, double precision
-    - quantile_big:
-        rettype: double precision
-        argument: text, text, double precision
-
-# Changes to aggregates (UDA) including removal and modification
-# Overloaded functions should be mentioned separately
-uda:
-    # ----------------- Changes from 1.4 to 1.4.1 ---------------
-    # ----------------- Changes from 1.4.1 to 1.5 ---------------
-    - lsvm_sgd_agg:
-        rettype: schema_madlib.lsvm_sgd_model_rec
-        argument: double precision[], double precision, double precision, double precision
-    - svm_cls_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision
-    - svm_nd_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], text, double precision, double precision
-    - svm_reg_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision, double precision
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    - coxph_step:
-        rettype: double precision[]
-        argument: double precision[], double precision, boolean, double precision[]
-    - coxph_strata_step_inner:
-        rettype: double precision[]
-        argument: double precision[], double precision, boolean, double precision[]
-    - coxph_strata_step_outer:
-        rettype: double precision[]
-        argument: double precision[]
-    # return type change
-    - linregr:
-        rettype: schema_madlib.linregr_result
-        argument: double precision, double precision[]
-    # initcond change
-    - __mlogregr_irls_step:
-        rettype: double precision[]
-        argument: integer, integer, integer, double precision[], double precision[]
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    - lmf_igd_step:
-        rettype: double precision[]
-        argument: smallint, smallint, double precision, double precision[], smallint, smallint, smallint, double precision, double precision
-    - __clustered_err_lin_step:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: double precision, double precision[], double precision[]
-    - __clustered_err_log_step:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: boolean, double precision[], double precision[]
-    - __clustered_err_mlog_step:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: integer, double precision[], double precision[], integer, integer
-    - dense_residual_norm:
-        rettype: schema_madlib.residual_norm_result
-        argument: double precision[], double precision, double precision[]
-    - heteroskedasticity_test_linregr:
-        rettype: schema_madlib.heteroskedasticity_test_result
-        argument: double precision, double precision[], double precision[]
-    - linregr:
-        rettype: schema_madlib.linregr_result
-        argument: double precision, double precision[]
-    - robust_linregr:
-        rettype: schema_madlib.robust_linregr_result
-        argument: double precision, double precision[], double precision[]
-    - weighted_sample:
-        rettype: double precision[]
-        argument: double precision[], double precision
-    - weighted_sample:
-        rettype: bigint
-        argument: bigint, double precision
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    - discrete_distribution_agg:
-        rettype: double precision[]
-        argument: integer, double precision, integer
-    - vectorized_distribution_agg:
-        rettype: double precision[]
-        argument: integer[], integer[]
-    # ----------------- Changes from 1.8 to 1.9 ----------
-
-    - __array_indexed_agg:
-        rettype: double precision[]
-        argument: double precision, bigint, bigint
-
-    - __best_scv_aggr:
-        rettype: double precision[]
-        argument: double precision[], integer, double precision
-
-    - __bigint_array_sum:
-        rettype: bigint[]
-        argument: bigint[]
-
-    - __display_tree_aggr:
-        rettype: text
-        argument: integer, boolean, text, text, double precision, double precision, text, integer
-
-    - __dt_acc_count_aggr:
-        rettype: bigint[]
-        argument: integer, bigint, integer
-
-    - __rep_aggr_class_count:
-        rettype: bigint[]
-        argument: integer, integer, integer
-
-    - __scv_aggr:
-        rettype: double precision[]
-        argument: integer, boolean, integer, double precision[], double precision[], bigint
-
-    - linear_svm_igd_step:
-        rettype: double precision[]
-        argument: double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - linear_svm_igd_step_serial:
-        rettype: double precision[]
-        argument: double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - svm_cls_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision, double precision
-
-    - svm_nd_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], text, double precision, double precision, double precision
-
-    - svm_reg_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision, double precision, double precision
-
-    - __svm_random_ind2:
-        rettype: double precision[]
-        argument: integer
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    - array_agg:
-        rettype: anyarray
-        argument: anyelement
-    - linear_svm_igd_step:
-       rettype: double precision[]
-       argument: double precision[], double precision, double precision[], integer, double precision, double precision, boolean, integer, double precision, boolean
-
-# Casts (UDC) updated/removed
-udc:
-    # ----------------- Changes from 1.4 to 1.4.1 ---------------
-    # ----------------- Changes from 1.4.1 to 1.5 ---------------
-    bool2text:
-        sourcetype: boolean
-        targettype: text
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-
-# Operators (UDO) removed/updated
-udo:
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    - '<':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '<=':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '<>':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '==':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '>=':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '>':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-
-# Operator Classes (UDOC) removed/updated
-udoc:
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    # removed
-    - svec_l2_ops:
-        index: btree
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/90f4dc15/src/madpack/changelist_1.5_1.9.1.yaml
----------------------------------------------------------------------
diff --git a/src/madpack/changelist_1.5_1.9.1.yaml b/src/madpack/changelist_1.5_1.9.1.yaml
deleted file mode 100644
index 0e3b9d6..0000000
--- a/src/madpack/changelist_1.5_1.9.1.yaml
+++ /dev/null
@@ -1,1077 +0,0 @@
-# Changelist for MADlib version 1.5 to 1.7
-
-# This file contains all changes that were introduced in a new version of
-# MADlib. This changelist is used by the upgrade script to detect what objects
-# should be upgraded (while retaining all other objects from the previous version)
-
-# New modules (actually .sql_in files) added in upgrade version
-# For these files the sql_in code is retained as is with the functions in the
-# file installed on the upgrade version. All other files (that don't have
-# updates), are cleaned up to remove object replacements
-new module:
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    table_to_pmml:
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    glm:
-    multiresponseglm:
-    ordinal:
-    decision_tree:
-    random_forest:
-    distribution:
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    text_utilities:
-
-# Changes in the types (UDT) including removal and modification
-udt:
-
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    __logregr_result:
-    coxph_result:
-    linregr_result:
-    mlogregr_result:
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    bytea8:
-    # ----------------- Changes from 1.8 to 1.9 ----------
-    __enc_tbl_result:
-    __gen_acc_time:
-    __rep_type:
-    __train_result:
-    c45_classify_result:
-    c45_train_result:
-    correlation_result:
-    lsvm_sgd_model_rec:
-    lsvm_sgd_result:
-    rf_classify_result:
-    rf_train_result:
-    svm_cls_result:
-    svm_model_pr:
-    svm_model_rec:
-    svm_nd_result:
-    svm_reg_result:
-    svm_support_vector:
-    _prune_result_type:
-    _tree_result_type:
-    linear_svm_result:
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    profile_result:
-
-# List of the UDF changes that affect the user externally. This includes change
-# in function name, return type, argument order or types, or removal of
-# the function. In each case, the original function is as good as removed and a
-# new function is created. In such cases, we should abort the upgrade if there
-# are user views dependent on this function, since the original function will
-# not be present in the upgraded version.
-udf:
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    # Removed functions
-    - array_contains_null:
-        rettype: boolean
-        argument: double precision[]
-    - array_sqrt:
-        rettype: anyarray
-        argument: anyarray
-    - coxph_step_final:
-        rettype: double precision[]
-        argument: double precision[]
-    - coxph_step_strata_final:
-        rettype: double precision[]
-        argument: double precision[]
-    - coxph_step_transition:
-        rettype: double precision[]
-        argument: double precision[], double precision[], double precision, boolean, double precision[]
-    - internal_coxph_result:
-        rettype: schema_madlib.coxph_result
-        argument: double precision[]
-    - internal_coxph_step_distance:
-        rettype: double precision
-        argument: double precision[], double precision[]
-    - normalize:
-        rettype: double precision[]
-        argument: double precision[]
-    # Changed functions (return type)
-    # If the typename does not change, functions can be recreated correctly even if we don't add them here.
-    # But the view dependency checker needs the information.
-    - __internal_mlogregr_irls_result:
-        rettype: schema_madlib.mlogregr_result
-        argument: double precision[]
-    - __logregr_cg_result:
-        rettype: schema_madlib.__logregr_result
-        argument: double precision[]
-    - __logregr_igd_result:
-        rettype: schema_madlib.__logregr_result
-        argument: double precision[]
-    - __logregr_irls_result:
-        rettype: schema_madlib.__logregr_result
-        argument: double precision[]
-    - linregr_final:
-        rettype: schema_madlib.linregr_result
-        argument: schema_madlib.bytea8
-    - mlogregr:
-        rettype: schema_madlib.mlogregr_result
-        argument: character varying, character varying, character varying, integer, character varying, double precision, integer
-    - mlogregr:
-        rettype: schema_madlib.mlogregr_result
-        argument: character varying, character varying, character varying, integer, character varying
-    - mlogregr:
-        rettype: schema_madlib.mlogregr_result
-        argument: character varying, character varying, character varying, integer
-    - mlogregr:
-        rettype: schema_madlib.mlogregr_result
-        argument: character varying, character varying, character varying
-    # name of rettype changed
-    - compute_coxph_result:
-        rettype: schema_madlib.coxph_new_result
-        argument: double precision[], double precision, double precision[], integer, double precision[]
-    # argument changed
-    - coxph_train:
-        rettype: void
-        argument: text
-    - coxph_train:
-        rettype: void
-        argument: text, text, text, text
-    - coxph_train:
-        rettype: void
-        argument: text, text, text, text, text
-    - coxph_train:
-        rettype: void
-        argument: text, text, text, text, text, text
-    - coxph_train:
-        rettype: void
-        argument: text, text, text, text, text, text, text
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    - cross_validation_general:   # change in name of argument "fold_num"
-        rettype: void
-        argument: character varying, character varying[], character varying[], character varying, character varying[], character varying, character varying[], character varying[], character varying, character varying[], character varying[], character varying, character varying, boolean, character varying, character varying[], integer
-    - lmf_igd_transition:
-        rettype: double precision[]
-        argument: double precision[], smallint, smallint, double precision, double precision[], smallint, smallint, smallint, double precision, double precision
-    # depending on bytea8
-    # return type is bytea8
-    - __clustered_err_lin_merge:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - __clustered_err_lin_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[], double precision[]
-    - __clustered_err_log_merge:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - __clustered_err_log_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, boolean, double precision[], double precision[]
-    - __clustered_err_mlog_merge:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - __clustered_err_mlog_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, integer, double precision[], double precision[], integer, integer
-    - dense_residual_norm_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - dense_residual_norm_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision[], double precision, double precision[]
-    - hetero_linregr_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - hetero_linregr_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[], double precision[]
-    - linregr_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - linregr_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[]
-    - robust_linregr_merge_states:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - robust_linregr_transition:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision, double precision[], double precision[]
-    - weighted_sample_merge_int64:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - weighted_sample_merge_vector:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, schema_madlib.bytea8
-    - weighted_sample_transition_int64:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, bigint, double precision
-    - weighted_sample_transition_vector:
-        rettype: schema_madlib.bytea8
-        argument: schema_madlib.bytea8, double precision[], double precision
-    # argument type bytea8
-    - __clustered_err_lin_final:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: schema_madlib.bytea8
-    - __clustered_err_log_final:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: schema_madlib.bytea8
-    - __clustered_err_mlog_final:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: schema_madlib.bytea8
-    - dense_residual_norm_final:
-        rettype: schema_madlib.residual_norm_result
-        argument: schema_madlib.bytea8
-    - hetero_linregr_final:
-        rettype: schema_madlib.heteroskedasticity_test_result
-        argument: schema_madlib.bytea8
-    - linregr_final:
-        rettype: schema_madlib.linregr_result
-        argument: schema_madlib.bytea8
-    - robust_linregr_final:
-        rettype: schema_madlib.robust_linregr_result
-        argument: schema_madlib.bytea8
-    - weighted_sample_final_int64:
-        rettype: bigint
-        argument: schema_madlib.bytea8
-    - weighted_sample_final_vector:
-        rettype: double precision[]
-        argument: schema_madlib.bytea8
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    - __filter_input_relation:
-        rettype: character varying
-        argument: character varying, character varying
-    - __lda_util_unnest:
-        rettype: SETOF bigint[]
-        argument: bigint[]
-    - matrix_block_mult:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text
-    - matrix_block_square:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - matrix_block_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - matrix_block_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, boolean
-    - matrix_blockize:
-        rettype: schema_madlib.matrix_result
-        argument: text, integer, integer, text
-    - matrix_densify:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text, text, text, boolean
-    - matrix_mult:
-        rettype: schema_madlib.matrix_result
-        argument: text, boolean, text, boolean, text
-    - matrix_mult:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text, text, boolean, text, text, text, text, boolean, text
-    - matrix_norm:
-        rettype: double precision
-        argument: text
-    - matrix_scale_and_add:
-        rettype: void
-        argument: text, text, double precision, text
-    - matrix_sparsify:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, boolean
-    - matrix_square:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    - matrix_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, boolean
-    - matrix_trans:
-        rettype: schema_madlib.matrix_result
-        argument: text, text, text, text, text, boolean
-    - matrix_unblockize:
-        rettype: schema_madlib.matrix_result
-        argument: text, text
-    # ----------------- Changes from 1.8 to 1.9 ----------
-
-    - _dt_apply:
-       rettype: schema_madlib._tree_result_type
-       argument: schema_madlib.bytea8,schema_madlib.bytea8,schema_madlib.bytea8,smallint,smallint,smallint,boolean,integer
-
-    - internal_linear_svm_igd_result:
-       rettype: schema_madlib.linear_svm_result
-       argument: double precision[]
-
-    - _prune_and_cplist:
-       rettype: schema_madlib._prune_result_type
-       argument: schema_madlib.bytea8,double precision,boolean
-
-    - __array_elem_in:
-       rettype: boolean[]
-       argument: anyarray, anyarray
-
-    - __array_indexed_agg_ffunc:
-       rettype: double precision[]
-       argument: double precision[]
-
-    - __array_indexed_agg_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __array_indexed_agg_sfunc:
-       rettype: double precision[]
-       argument: double precision[], double precision, bigint, bigint
-
-    - __array_search:
-       rettype: boolean
-       argument: anyelement, anyarray
-
-    - __array_sort:
-       rettype: anyarray
-       argument: anyarray
-
-    - __assert:
-       rettype: void
-       argument: boolean, text
-
-    - __assert_table:
-       rettype: void
-       argument: text, boolean
-
-    - __best_scv_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __best_scv_sfunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[], integer, double precision
-
-    - __bigint_array_add:
-       rettype: bigint[]
-       argument: bigint[], bigint[]
-
-    - __breakup_table:
-       rettype: void
-       argument: text, text, text, text, text, text[], boolean[], integer, integer
-
-    - __check_dt_common_params:
-       rettype: void
-       argument: text, text, text, text, text, text, text, text, integer, double precision, double precision, integer, text
-
-    - __check_training_table:
-       rettype: void
-       argument: text, text[], text[], text, text, integer
-
-    - __column_exists:
-       rettype: boolean
-       argument: text, text
-
-    - __columns_in_table:
-       rettype: boolean
-       argument: text[], text
-
-    - __create_metatable:
-       rettype: void
-       argument: text
-
-    - __create_tree_tables:
-       rettype: void
-       argument: text
-
-    - __csvstr_to_array:
-       rettype: text[]
-       argument: text
-
-    - __display_node_sfunc:
-       rettype: text
-       argument: text, integer, boolean, text, text, double precision, double precision, text, integer
-
-    - __display_tree_no_ordered_aggr:
-       rettype: text
-       argument: text, integer, integer, integer, boolean, double precision, text, integer, integer
-
-    - __distinct_feature_value:
-       rettype: integer
-       argument: text, integer
-
-    - __drop_metatable:
-       rettype: void
-       argument: text
-
-    - __dt_acc_count_sfunc:
-       rettype: bigint[]
-       argument: bigint[], integer, bigint, integer
-
-    - __dt_get_node_split_fids:
-       rettype: integer[]
-       argument: integer, integer, integer, integer[]
-
-    - __ebp_calc_errors:
-       rettype: double precision
-       argument: double precision, double precision, double precision
-
-    - __ebp_prune_tree:
-       rettype: void
-       argument: text
-
-    - __encode_and_train:
-       rettype: record
-       argument: text, text, integer, integer, text, text, text, text, text, text, text, double precision, text, integer, double precision, boolean, double precision, double precision, text, integer
-
-    - __encode_columns:
-       rettype: void
-       argument: text, text, integer, integer
-
-    - __encode_table:
-       rettype: void
-       argument: text, text, text, integer, integer
-
-    - __encode_table:
-       rettype: void
-       argument: text, text, text[], text, text[], text, text, integer, integer
-
-    - __find_best_split:
-       rettype: void
-       argument: text, double precision, text, integer, integer, text, integer, integer
-
-    - __format:
-       rettype: text
-       argument: text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text, text, text, text
-
-    - __format:
-       rettype: text
-       argument: text, text[]
-
-    - __gen_acc:
-       rettype: __gen_acc_time
-       argument: text, text, text, text, text, integer, integer, boolean, integer
-
-    - __gen_enc_meta_names:
-       rettype: text[]
-       argument: text, text
-
-    - __gen_horizontal_encoded_table:
-       rettype: void
-       argument: text, text, integer, integer
-
-    - __gen_vertical_encoded_table:
-       rettype: void
-       argument: text, text, text, boolean, integer
-
-    - __generate_final_tree:
-       rettype: void
-       argument: text
-
-    - __get_class_column_name:
-       rettype: text
-       argument: text
-
-    - __get_class_value:
-       rettype: text
-       argument: integer, text
-
-    - __get_classtable_name:
-       rettype: text
-       argument: text
-
-    - __get_column_value:
-       rettype: text
-       argument: integer, integer, character, text
-
-    - __get_feature_name:
-       rettype: text
-       argument: integer, text
-
-    - __get_feature_value:
-       rettype: text
-       argument: integer, integer, text
-
-    - __get_features_of_nodes:
-       rettype: text
-       argument: text, text, integer, integer, integer
-
-    - __get_id_column_name:
-       rettype: text
-       argument: text
-
-    - __get_schema_name:
-       rettype: text
-       argument: text
-
-    - __get_table_name:
-       rettype: text
-       argument: text
-
-    - __insert_into_metatable:
-       rettype: void
-       argument: text, integer, text, character, boolean, text, integer
-
-    - __is_valid_enc_table:
-       rettype: boolean
-       argument: text
-
-    - __num_of_class:
-       rettype: integer
-       argument: text
-
-    - __num_of_columns:
-       rettype: integer
-       argument: text
-
-    - __num_of_feature:
-       rettype: integer
-       argument: text
-
-    - __regclass_to_text:
-       rettype: text
-       argument: regclass
-
-    - __rename_table:
-       rettype: void
-       argument: text, text
-
-    - __rep_aggr_class_count_ffunc:
-       rettype: bigint[]
-       argument: bigint[]
-
-    - __rep_aggr_class_count_sfunc:
-       rettype: bigint[]
-       argument: bigint[], integer, integer, integer
-
-    - __rep_prune_tree:
-       rettype: void
-       argument: text, text, integer
-
-    - __sample_with_replacement:
-       rettype: void
-       argument: integer, bigint, text, text
-
-    - __sample_within_range:
-       rettype: SETOF bigint
-       argument: bigint, bigint, bigint
-
-    - __scv_aggr_ffunc:
-       rettype: double precision[]
-       argument: double precision[]
-
-    - __scv_aggr_prefunc:
-       rettype: double precision[]
-       argument: double precision[], double precision[]
-
-    - __scv_aggr_sfunc:
-       rettype: double precision[]
-       argument: double precision[], integer, boolean, integer, double precision[], double precision[], bigint
-
-    - __strip_schema_name:
-       rettype: text
-       argument: text
-
-    - __svm_random_ind2:
-       rettype: double precision[]
-       argument: integer
-
-    - __svm_random_ind:
-       rettype: double precision[]
-       argument: integer
-
-    - __svm_target_cl_func:
-       rettype: double precision
-       argument: double precision[]
-
-    - __svm_target_reg_func:
-       rettype: double precision
-       argument: double precision[]
-
-    - __table_exists:
-       rettype: boolean
-       argument: text
-
-    - __train_tree:
-       rettype: __train_result
-       argument: text, integer, integer, text, text, text, text, text, text, double precision, integer, double precision, double precision, double precision, boolean, integer, integer
-
-    - __treemodel_classify_internal:
-       rettype: text[]
-       argument: text, text, integer
-
-    - __treemodel_classify_internal_serial:
-       rettype: text[]
-       argument: text, text, integer
-
-    - __treemodel_display_no_ordered_aggr:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - __treemodel_display_with_ordered_aggr:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - __treemodel_get_vote_result:
-       rettype: void
-       argument: text, text
-
-    - __treemodel_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - __validate_input_table:
-       rettype: void
-       argument: text, text[], text, text
-
-    - __validate_metatable:
-       rettype: void
-       argument: text
-
-    - c45_classify:
-       rettype: c45_classify_result
-       argument: text, text, text
-
-    - c45_classify:
-       rettype: c45_classify_result
-       argument: text, text, text, integer
-
-    - c45_clean:
-       rettype: boolean
-       argument: text
-
-    - c45_display:
-       rettype: SETOF text
-       argument: text
-
-    - c45_display:
-       rettype: SETOF text
-       argument: text, integer
-
-    - c45_genrule:
-       rettype: SETOF text
-       argument: text
-
-    - c45_genrule:
-       rettype: SETOF text
-       argument: text, integer
-
-    - c45_score:
-       rettype: double precision
-       argument: text, text
-
-    - c45_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text, text, text, text, text, text, double precision, text
-
-    - c45_train:
-       rettype: c45_train_result
-       argument: text, text, text, text, text, text, text, text, double precision, text, integer, double precision, double precision, integer
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying, character varying
-
-    - correlation:
-       rettype: correlation_result
-       argument: character varying, character varying, character varying, boolean
-
-    - linear_svm_igd_transition:
-       rettype: double precision[]
-       argument: double precision[], double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean, boolean, double precision, double precision
-
-    - lsvm_classification:
-       rettype: SETOF lsvm_sgd_result
-       argument: text, text, boolean, boolean, double precision, double precision, integer
-
-    - lsvm_predict:
-       rettype: double precision
-       argument: double precision[], double precision[]
-
-    - lsvm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text
-
-    - lsvm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text, boolean
-
-    - matrix_block_trans:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_densify:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_sparsify:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - matrix_trans:
-       rettype: matrix_result
-       argument: text, text, text, text, boolean
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text, boolean, integer
-
-    - rf_classify:
-       rettype: rf_classify_result
-       argument: text, text, text, integer
-
-    - rf_clean:
-       rettype: boolean
-       argument: text
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text, integer[]
-
-    - rf_display:
-       rettype: SETOF text
-       argument: text, integer[], integer
-
-    - rf_score:
-       rettype: double precision
-       argument: text, text
-
-    - rf_score:
-       rettype: double precision
-       argument: text, text, integer
-
-    - rf_train:
-       rettype: rf_train_result
-       argument: text, text, text
-
-    - rf_train:
-       rettype: rf_train_result
-       argument: text, text, text, integer, integer, double precision, text, text, text, text, text, integer, double precision, double precision, integer
-
-    - svdmf_run:
-       rettype: text
-       argument: text, text, text, text, integer
-
-    - svdmf_run:
-       rettype: text
-       argument: text, text, text, text, integer, integer, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, boolean, double precision, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_classification:
-       rettype: SETOF svm_cls_result
-       argument: text, text, boolean, text, double precision
-
-    - svm_cls_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision, double precision
-
-    - svm_data_normalization:
-       rettype: void
-       argument: text
-
-    - svm_dot:
-       rettype: double precision
-       argument: double precision[], double precision[]
-
-    - svm_dot:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_drop_model:
-       rettype: void
-       argument: text
-
-    - svm_gaussian:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_generate_cls_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_generate_nd_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_generate_reg_data:
-       rettype: void
-       argument: text, integer, integer
-
-    - svm_nd_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], text, double precision, double precision, double precision
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text, boolean, double precision, double precision
-
-    - svm_novelty_detection:
-       rettype: SETOF schema_madlib.svm_nd_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_polynomial:
-       rettype: double precision
-       argument: double precision[], double precision[], double precision
-
-    - svm_predict:
-       rettype: double precision
-       argument: schema_madlib.svm_model_rec, double precision[], text, double precision
-
-    - svm_predict_batch:
-       rettype: text
-       argument: text, text, text, text, text, boolean
-
-    - svm_predict_sub:
-       rettype: double precision
-       argument: integer, integer, double precision[], double precision[], double precision[], text, double precision
-
-    - svm_reg_update:
-       rettype: schema_madlib.svm_model_rec
-       argument: schema_madlib.svm_model_rec, double precision[], double precision, text, double precision, double precision, double precision, double precision
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision
-
-    - svm_regression:
-       rettype: SETOF svm_reg_result
-       argument: text, text, boolean, text, boolean, double precision, double precision, double precision, double precision
-
-    - svm_store_model:
-       rettype: void
-       argument: text, text, text
-
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    - array_collapse:
-        rettype: anyarray
-        argument: anyarray
-    - linear_svm_igd_transition:
-        rettype: double precision[]
-        argument: double precision[], double precision[], double precision, double precision[], integer, double precision, double precision, boolean, integer, double precision, boolean
-    - profile:
-        rettype: SETOF schema_madlib.profile_result
-        argument: text
-    - profile_full:
-        rettype: SETOF schema_madlib.profile_result
-        argument: text, integer
-    - profile:
-        rettype: schema_madlib.profile_result
-        argument: text
-    - profile_full:
-        rettype: schema_madlib.profile_result
-        argument: text, integer
-    - quantile:
-        rettype: double precision
-        argument: text, text, double precision
-    - quantile_big:
-        rettype: double precision
-        argument: text, text, double precision
-
-# Changes to aggregates (UDA) including removal and modification
-# Overloaded functions should be mentioned separately
-uda:
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    - coxph_step:
-        rettype: double precision[]
-        argument: double precision[], double precision, boolean, double precision[]
-    - coxph_strata_step_inner:
-        rettype: double precision[]
-        argument: double precision[], double precision, boolean, double precision[]
-    - coxph_strata_step_outer:
-        rettype: double precision[]
-        argument: double precision[]
-    # return type change
-    - linregr:
-        rettype: schema_madlib.linregr_result
-        argument: double precision, double precision[]
-    # initcond change
-    - __mlogregr_irls_step:
-        rettype: double precision[]
-        argument: integer, integer, integer, double precision[], double precision[]
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-    - lmf_igd_step:
-        rettype: double precision[]
-        argument: smallint, smallint, double precision, double precision[], smallint, smallint, smallint, double precision, double precision
-    - __clustered_err_lin_step:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: double precision, double precision[], double precision[]
-    - __clustered_err_log_step:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: boolean, double precision[], double precision[]
-    - __clustered_err_mlog_step:
-        rettype: schema_madlib.__clustered_agg_result
-        argument: integer, double precision[], double precision[], integer, integer
-    - dense_residual_norm:
-        rettype: schema_madlib.residual_norm_result
-        argument: double precision[], double precision, double precision[]
-    - heteroskedasticity_test_linregr:
-        rettype: schema_madlib.heteroskedasticity_test_result
-        argument: double precision, double precision[], double precision[]
-    - linregr:
-        rettype: schema_madlib.linregr_result
-        argument: double precision, double precision[]
-    - robust_linregr:
-        rettype: schema_madlib.robust_linregr_result
-        argument: double precision, double precision[], double precision[]
-    - weighted_sample:
-        rettype: double precision[]
-        argument: double precision[], double precision
-    - weighted_sample:
-        rettype: bigint
-        argument: bigint, double precision
-    # ----------------- Changes from 1.7.1 to 1.8 ----------
-    - discrete_distribution_agg:
-        rettype: double precision[]
-        argument: integer, double precision, integer
-    - vectorized_distribution_agg:
-        rettype: double precision[]
-        argument: integer[], integer[]
-    # ----------------- Changes from 1.8 to 1.9 ----------
-
-    - __array_indexed_agg:
-        rettype: double precision[]
-        argument: double precision, bigint, bigint
-
-    - __best_scv_aggr:
-        rettype: double precision[]
-        argument: double precision[], integer, double precision
-
-    - __bigint_array_sum:
-        rettype: bigint[]
-        argument: bigint[]
-
-    - __display_tree_aggr:
-        rettype: text
-        argument: integer, boolean, text, text, double precision, double precision, text, integer
-
-    - __dt_acc_count_aggr:
-        rettype: bigint[]
-        argument: integer, bigint, integer
-
-    - __rep_aggr_class_count:
-        rettype: bigint[]
-        argument: integer, integer, integer
-
-    - __scv_aggr:
-        rettype: double precision[]
-        argument: integer, boolean, integer, double precision[], double precision[], bigint
-
-    - linear_svm_igd_step:
-        rettype: double precision[]
-        argument: double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - linear_svm_igd_step_serial:
-        rettype: double precision[]
-        argument: double precision[], boolean, double precision[], integer, double precision, double precision
-
-    - svm_cls_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision, double precision
-
-    - svm_nd_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], text, double precision, double precision, double precision
-
-    - svm_reg_agg:
-        rettype: schema_madlib.svm_model_rec
-        argument: double precision[], double precision, text, double precision, double precision, double precision, double precision
-
-    - __svm_random_ind2:
-        rettype: double precision[]
-        argument: integer
-    # ----------------- Changes from 1.9 to 1.9.1 ----------
-    - array_agg:
-        rettype: anyarray
-        argument: anyelement
-    - linear_svm_igd_step:
-       rettype: double precision[]
-       argument: double precision[], double precision, double precision[], integer, double precision, double precision, boolean, integer, double precision, boolean
-
-# Casts (UDC) updated/removed
-udc:
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-
-# Operators (UDO) removed/updated
-udo:
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    - '<':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '<=':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '<>':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '==':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '>=':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    - '>':
-        leftarg: schema_madlib.svec
-        rightarg: schema_madlib.svec
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------
-
-# Operator Classes (UDOC) removed/updated
-udoc:
-    # ----------------- Changes from 1.5 to 1.6 ---------------
-    # removed
-    - svec_l2_ops:
-        index: btree
-    # ----------------- Changes from 1.6 to 1.6.0S ----------
-    # ----------------- Changes from 1.6.0S to 1.7 ----------

[29/50] [abbrv] incubator-madlib git commit: Madpack: Retain only major.minor after processing DB version

Posted by ri...@apache.org.

Madpack: Retain only major.minor after processing DB version


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/d035faa1
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/d035faa1
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/d035faa1

Branch: refs/heads/latest_release
Commit: d035faa190ba55b4262d4832c97a234e7714cc34
Parents: 078ba9e
Author: Rahul Iyer <ri...@apache.org>
Authored: Mon Feb 6 12:38:28 2017 -0800
Committer: Rahul Iyer <ri...@apache.org>
Committed: Mon Feb 6 12:38:28 2017 -0800

----------------------------------------------------------------------
 src/madpack/madpack.py | 3 +++
 1 file changed, 3 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/d035faa1/src/madpack/madpack.py
----------------------------------------------------------------------
diff --git a/src/madpack/madpack.py b/src/madpack/madpack.py
index ddd75df..2e21fa4 100755
--- a/src/madpack/madpack.py
+++ b/src/madpack/madpack.py
@@ -1187,6 +1187,9 @@ def main(argv):
                 # 'on' by default in 4.3.5
                 elif _is_rev_gte(_get_rev_num(dbver), _get_rev_num('4.3.4')):
                     dbver = '4.3ORCA'
+                else:
+                    # only need the first two components (major.minor) for versions < 4.3.4
+                    dbver = '.'.join(dbver.split('.')[:2])
 
             if not os.path.isdir(os.path.join(portdir, dbver)):
                 _error("This version is not among the %s versions for which "

[16/50] [abbrv] incubator-madlib git commit: Reverses the changes to the madlib.mode function to maintain backwards compatibility

Posted by ri...@apache.org.

Reverses the changes to the madlib.mode function to maintain backwards compatibility


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/faec6bee
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/faec6bee
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/faec6bee

Branch: refs/heads/latest_release
Commit: faec6bee3e2ee098965a713f8884cc503bf1aabc
Parents: 13203ba
Author: Orhan Kislal <ok...@pivotal.io>
Authored: Thu Jan 26 13:07:54 2017 -0800
Committer: Orhan Kislal <ok...@pivotal.io>
Committed: Thu Jan 26 13:07:54 2017 -0800

----------------------------------------------------------------------
 .../postgres/modules/recursive_partitioning/random_forest.py_in  | 4 ++--
 src/ports/postgres/modules/utilities/utilities.sql_in            | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/faec6bee/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
index affa9f9..0eb5985 100644
--- a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
@@ -715,7 +715,7 @@ def forest_predict(schema_madlib, model, source, output, pred_type='response',
         majority_pred_expression = "avg(aggregated_prediction)"
     else:
         majority_pred_expression = """($sql${{ {dep_levels} }}$sql$::varchar[])[
-                                    {schema_madlib}.rf_mode(aggregated_prediction + 1)]::TEXT
+                                    {schema_madlib}.mode(aggregated_prediction + 1)]::TEXT
                                     """.format(**locals())
 
     if dep_type.lower() == "boolean":
@@ -1164,7 +1164,7 @@ def _calculate_oob_error(schema_madlib, oob_prediction_table, oob_error_table,
                     THEN 0.
                     ELSE 1.
                 END""".format(**locals())
-        forest_prediction_agg = "{schema_madlib}.rf_mode".format(**locals())
+        forest_prediction_agg = "{schema_madlib}.mode".format(**locals())
 
     sql_compute_oob_error = """
             CREATE TABLE {oob_error_table} AS

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/faec6bee/src/ports/postgres/modules/utilities/utilities.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/utilities.sql_in b/src/ports/postgres/modules/utilities/utilities.sql_in
index b2415a7..0ec864d 100644
--- a/src/ports/postgres/modules/utilities/utilities.sql_in
+++ b/src/ports/postgres/modules/utilities/utilities.sql_in
@@ -466,8 +466,8 @@ LANGUAGE 'sql' IMMUTABLE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
 
 -- Tell Postgres how to use our aggregate
-DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.rf_mode(double precision) CASCADE;
-CREATE AGGREGATE MADLIB_SCHEMA.rf_mode(double precision) (
+DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.mode(double precision) CASCADE;
+CREATE AGGREGATE MADLIB_SCHEMA.mode(double precision) (
   SFUNC=array_append, --Function to call for each row. Just builds the array
   STYPE=double precision[],
   FINALFUNC=MADLIB_SCHEMA._final_mode, --Function to call after everything has been added to array

[10/50] [abbrv] incubator-madlib git commit: PCA: Add grouping support to PCA

Posted by ri...@apache.org.

PCA: Add grouping support to PCA

JIRA: MADLIB-947

- PCA can now handle grouping columns. pca_train() with grouping_cols
parameter specified learns an independent model for each group in
the input table. New columns corresponding to the columns specified
in grouping_cols will be created in the output, mean and summary
tables.
- If pca_project() is called on an input table that has grouping_cols
in it, the pc_table passed in the parameter list must be a PCA model
table that was learnt with grouping_cols. If the input table for
pca_project() has grouping columns but the pc_table used does not
support grouping_cols, or vice versa, an error is thrown.
- Another important change: previously, the 'row_id' column in the
input tables had to be serially increasing, starting from 1. That
requirement is now relaxed, since this commit converts the given 'row_id'
to a new column that follows the rules laid out by the sparse and dense
matrix formats.
- Both the online and user docs are improved with more examples.


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/02a7ef45
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/02a7ef45
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/02a7ef45

Branch: refs/heads/latest_release
Commit: 02a7ef453aa16e19eaa1e044ba18fe955fde0bf0
Parents: e0439ed
Author: Nandish Jayaram <nj...@users.noreply.github.com>
Authored: Wed Dec 21 14:18:38 2016 -0800
Committer: Nandish Jayaram <nj...@users.noreply.github.com>
Committed: Thu Jan 19 12:00:02 2017 -0800

----------------------------------------------------------------------
 .../postgres/modules/linalg/matrix_ops.py_in    |   9 +-
 src/ports/postgres/modules/pca/pca.py_in        | 815 ++++++++++-------
 src/ports/postgres/modules/pca/pca.sql_in       | 316 ++++---
 .../postgres/modules/pca/pca_project.py_in      | 871 ++++++++++++++-----
 .../postgres/modules/pca/pca_project.sql_in     | 355 ++++++--
 src/ports/postgres/modules/pca/test/pca.sql_in  | 106 +++
 .../modules/pca/test/pca_project.sql_in         | 128 ++-
 7 files changed, 1912 insertions(+), 688 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/02a7ef45/src/ports/postgres/modules/linalg/matrix_ops.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/linalg/matrix_ops.py_in b/src/ports/postgres/modules/linalg/matrix_ops.py_in
index 51ae7e3..9f3215c 100644
--- a/src/ports/postgres/modules/linalg/matrix_ops.py_in
+++ b/src/ports/postgres/modules/linalg/matrix_ops.py_in
@@ -86,7 +86,8 @@ def _matrix_column_to_array_format(source_table, row_id, output_table,
 def create_temp_sparse_matrix_table_with_dims(source_table,
                                               out_table,
                                               row_id, col_id, value,
-                                              row_dim, col_dim):
+                                              row_dim, col_dim,
+                                              sparse_where_condition=None):
     """
     Make a copy of the input sparse table and add (row_dim, col_dim, NULL) to it
 
@@ -102,6 +103,8 @@ def create_temp_sparse_matrix_table_with_dims(source_table,
     Returns:
         None
     """
+    if not sparse_where_condition:
+        sparse_where_condition = ''
     plpy.execute("""
                  CREATE TABLE {out_table} as
                      SELECT
@@ -110,11 +113,13 @@ def create_temp_sparse_matrix_table_with_dims(source_table,
                          {value}
                      FROM {source_table}
                      WHERE {value} is not NULL
+                     {sparse_where_condition}
                  """.format(row_id=row_id,
                             col_id=col_id,
                             value=value,
                             source_table=source_table,
-                            out_table=out_table))
+                            out_table=out_table,
+                            sparse_where_condition=sparse_where_condition))
     res_row_dim, res_col_dim = get_dims(out_table, {'row': row_id,
                                                     'col': col_id,
                                                     'val': value})

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/02a7ef45/src/ports/postgres/modules/pca/pca.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/pca/pca.py_in b/src/ports/postgres/modules/pca/pca.py_in
index 327dfd7..196c558 100644
--- a/src/ports/postgres/modules/pca/pca.py_in
+++ b/src/ports/postgres/modules/pca/pca.py_in
@@ -16,12 +16,13 @@ from linalg.svd import _svd_upper_wrap
 from utilities.utilities import _array_to_string
 from utilities.utilities import add_postfix
 from utilities.utilities import __mad_version
-from utilities.utilities import unique_string
+from utilities.utilities import unique_string, split_quoted_delimited_str
 from utilities.utilities import _assert
+from utilities.validate_args import get_cols, get_cols_and_types
+from utilities.control import MinWarning
 from utilities.validate_args import columns_exist_in_table
 from utilities.validate_args import table_exists
 
-
 import time
 import plpy
 
@@ -29,16 +30,43 @@ version_wrapper = __mad_version()
 string_to_array = version_wrapper.select_vecfunc()
 array_to_string = version_wrapper.select_vec_return()
 
+
+def pca_sparse(schema_madlib, source_table, pc_table, row_id,
+               col_id, val_id, row_dim, col_dim, k, grouping_cols,
+               lanczos_iter, use_correlation, result_summary_table,
+               variance, **kwargs):
+    """
+    Args:
+        @param schema_madlib
+        @param source_table
+        @param pc_table
+        @param row_id
+        @param col_id
+        @param val_id
+        @param row_dim
+        @param col_dim
+        @param k
+        @param grouping_cols
+        @param lanczos_iter
+        @param use_correlation
+        @param result_summary_table
+        @param variance
+
+    Returns:
+        None
+
+    """
+    pca_wrap(schema_madlib, source_table, pc_table, row_id,
+        k, grouping_cols, lanczos_iter, use_correlation,
+        result_summary_table, variance, True, col_id,
+        val_id, row_dim, col_dim)
+# ------------------------------------------------------------------------
+
 # ========================================================================
 def pca(schema_madlib, source_table, pc_table, row_id,
         k, grouping_cols, lanczos_iter, use_correlation,
         result_summary_table, variance, **kwargs):
     """
-    Compute the PCA of the matrix in source_table.
-
-    This function is the specific call for dense matrices and creates three
-    tables corresponding to the three decomposition matrices.
-
     Args:
         @param schema_madlib
         @param source_table
@@ -55,7 +83,23 @@ def pca(schema_madlib, source_table, pc_table, row_id,
         None
 
     """
-    startTime = time.time()  # measure the starting time
+    pca_wrap(schema_madlib, source_table, pc_table, row_id,
+        k, grouping_cols, lanczos_iter, use_correlation,
+        result_summary_table, variance)
+# ------------------------------------------------------------------------
+
+
+def pca_wrap(schema_madlib, source_table, pc_table, row_id,
+        k, grouping_cols, lanczos_iter, use_correlation,
+        result_summary_table, variance, is_sparse=False, col_id=None,
+        val_id=None, row_dim=None, col_dim=None, **kwargs):
+    """
+    This wrapper was added to support grouping columns. This
+    function does the necessary pre-processing for handling
+    grouping_cols, if set. It then constructs a single query
+    that includes a separate "madlib._pca_union(...)" for each
+    group.
+    """
     # Reset the message level to avoid random messages
     old_msg_level = plpy.execute("""
                                   SELECT setting
@@ -63,25 +107,230 @@ def pca(schema_madlib, source_table, pc_table, row_id,
                                   WHERE name='client_min_messages'
                                   """)[0]['setting']
     plpy.execute('SET client_min_messages TO warning')
-
-    # Step 1: Validate the input arguments
-    _validate_args(schema_madlib, source_table, pc_table, k,
+    grouping_cols_list = []
+    if is_sparse:
+        _validate_args(schema_madlib, source_table, pc_table, k, row_id, col_id,
+                   val_id, row_dim, col_dim, lanczos_iter,
+                   use_correlation, result_summary_table, variance)
+    else:
+        _validate_args(schema_madlib, source_table, pc_table, k,
                    row_id, None, None, None, None,
-                   grouping_cols, lanczos_iter, use_correlation,
+                   lanczos_iter, use_correlation,
                    result_summary_table,variance)
+    if(grouping_cols):
+        # validate the grouping columns. We currently only support grouping_cols
+        # to be column names in the source_table, and not expressions!
+        grouping_cols_list = split_quoted_delimited_str(grouping_cols)
+        _assert(columns_exist_in_table(source_table, grouping_cols_list, schema_madlib),
+                "PCA error: One or more grouping columns in {0} do not exist!".format(grouping_cols))
+        distinct_grouping_values = plpy.execute("""
+                SELECT DISTINCT {grouping_cols} FROM {source_table}
+            """.format(grouping_cols=grouping_cols, source_table=source_table))
+    else:
+        grouping_cols = ''
+    other_columns_in_table = [col for col in get_cols(source_table) if col not in grouping_cols_list]
+    grouping_cols_clause = ''
+    if grouping_cols_list:
+        cols_names_types = get_cols_and_types(source_table)
+        grouping_cols_clause = ', ' + ', '.join([c_name+" "+c_type for (c_name, c_type) in cols_names_types if c_name in grouping_cols_list])
+    ## Create all output tables
+    plpy.execute("""
+        CREATE TABLE {pc_table} (
+            row_id               INTEGER,
+            principal_components double precision[],
+            std_dev              double precision,
+            proportion           double precision
+            {grouping_cols_clause}
+        )
+        """.format(pc_table=pc_table, grouping_cols_clause=grouping_cols_clause))
+    pc_table_mean = add_postfix(pc_table, "_mean")
+    plpy.execute("""
+        DROP TABLE IF EXISTS {pc_table_mean};
+        CREATE TABLE {pc_table_mean} (
+            column_mean     double precision[]
+            {grouping_cols_clause}
+        )
+        """.format(pc_table_mean=pc_table_mean, grouping_cols_clause=grouping_cols_clause))
+    if result_summary_table:
+        plpy.execute("""
+                DROP TABLE IF EXISTS {0};
+                CREATE TABLE {0} (
+                rows_used               INTEGER,
+                "exec_time (ms)"        numeric,
+                iter                    INTEGER,
+                recon_error             double precision,
+                relative_recon_error    double precision,
+                use_correlation         boolean
+                {1}
+                )
+            """.format(result_summary_table, grouping_cols_clause))
+    else:
+        result_summary_table = ''
+
+    # declare variables whose values will be different for each group, if
+    # grouping_cols is specified
+    grouping_where_clause = ''
+    sparse_where_condition = ''
+    select_grouping_cols = ''
+    temp_table_columns = ''
+    result_summary_table_temp = ''
+    # For Dense matrix format only:
+    # We can now ignore the original row_id for all computations since we will
+    # create a new table with a row_id column that has no duplicates and ranges
+    # from 1 to number of rows in the group/table. This is mainly to support the
+    # grouping scenario where the row_id values might not range between 1 and
+    # number of rows in the group, for each group. Doing this also just extends
+    # this behavior for non-grouping scenarios too. If creating a new temp table
+    # that corrects the row_id column is not of much importance in non-grouping
+    # cases, we can avoid creating the temp table and save some computation time.
+    # But, at the moment, the code creates the temp table even for the non-grouping
+    # scenario.
+    # We don't need to do this for sparse representation because of the nature
+    # of its definition.
+    other_columns_in_table.remove(row_id)
+    temp_table_columns = """ ROW_NUMBER() OVER() AS row_id, """ + ','.join(other_columns_in_table)
+
+    pca_union_call_list = []
+    grp_id = 0
+    if not is_sparse:
+        col_id = 'NULL'
+        val_id = 'NULL'
+        row_dim = 0
+        col_dim = 0
+    while True:
+        if result_summary_table:
+            result_summary_table_temp = "pg_temp." + unique_string() + "_" + str(grp_id)
+        if grouping_cols:
+            grp_value_dict = distinct_grouping_values[grp_id]
+            where_conditions = ' AND '.join([str(key)+"="+str(value) for (key, value) in grp_value_dict.items()])
+            sparse_where_condition = ' AND ' + where_conditions
+            grouping_where_clause = ' WHERE ' + where_conditions
+            select_grouping_cols = ', ' + ', '.join([str(value)+" AS "+key for (key, value) in grp_value_dict.items()])
+
+        pca_union_call_list.append("""
+            {schema_madlib}._pca_union('{source_table}', '{pc_table}', '{pc_table_mean}', '{row_id}',
+                {k}, '{grouping_cols}', {lanczos_iter}, {use_correlation},
+                '{result_summary_table}', '{result_summary_table_temp}', {variance},
+                {grp_id}, '{grouping_where_clause}', '{sparse_where_condition}',
+                '{select_grouping_cols}', '{temp_table_columns}', {is_sparse},
+                '{col_id}', '{val_id}', {row_dim}, {col_dim})
+            """.format(schema_madlib=schema_madlib,
+                source_table=source_table, pc_table=pc_table,
+                pc_table_mean=pc_table_mean, row_id=row_id,
+                k='NULL' if k is None else k, grouping_cols=grouping_cols,
+                lanczos_iter=lanczos_iter, use_correlation=use_correlation,
+                result_summary_table=result_summary_table,
+                result_summary_table_temp=result_summary_table_temp,
+                variance='NULL' if variance==None else variance,
+                grp_id=grp_id, grouping_where_clause=grouping_where_clause,
+                sparse_where_condition=sparse_where_condition,
+                select_grouping_cols=select_grouping_cols,
+                temp_table_columns=temp_table_columns, is_sparse=is_sparse,
+                col_id=col_id, val_id=val_id, row_dim=row_dim, col_dim=col_dim))
+        grp_id += 1
+        if not grouping_cols_list or len(distinct_grouping_values) == grp_id:
+            break
+    # "SELECT <query_1>, <query_2>, <query_3>, ..." is expected to run each
+    # <query_i> in parallel.
+    pca_union_call = 'SELECT ' + ', '.join(pca_union_call_list)
+    plpy.execute(pca_union_call)
+
+    plpy.execute("SET client_min_messages TO %s" % old_msg_level)
+
+
+def _pca_union(schema_madlib, source_table, pc_table, pc_table_mean,
+        row_id, k, grouping_cols, lanczos_iter, use_correlation,
+        result_summary_table, result_summary_table_temp, variance,
+        grp_id, grouping_where_clause, sparse_where_condition,
+        select_grouping_cols, temp_table_columns, is_sparse, col_id,
+        val_id, row_dim, col_dim, **kwargs):
+    """
+    This function does all the heavy lifting of PCA, for both pca and pca_sparse.
+    Compute the PCA of the matrix in source_table. This function is the specific
+    call for dense matrices and creates three tables corresponding to the three
+    decomposition matrices.
+
+    Args:
+        @param source_table          TEXT,    -- Source table name (dense matrix)
+        @param pc_table              TEXT,    -- Output table name for the principal components
+        @param pc_table_mean         TEXT,    -- Output table name for the column means
+        @param row_id                TEXT,    -- Column name for the ID for each row
+        @param k                     INTEGER, -- Number of principal components to compute
+        @param grouping_cols         TEXT,    -- Comma-separated list of grouping columns (Default: NULL)
+        @param lanczos_iter          INTEGER, -- The number of Lanczos iterations for the SVD calculation (Default: min(k+40, smallest Matrix dimension))
+        @param use_correlation       BOOLEAN, -- If True correlation matrix is used for principal components (Default: False)
+        @param result_summary_table  TEXT,    -- Table name to store summary of results (Default: NULL)
+        @param result_summary_table_temp  TEXT,    -- Temporary per-group summary table; its rows are inserted into result_summary_table and it is then dropped
+        @param variance              DOUBLE PRECISION,   -- The proportion of variance (Default: NULL)
+        @param grp_id                INTEGER, -- a place holder id for each group
+        @param grouping_where_clause TEXT,    -- WHERE clause using grouping_cols
+        @param select_grouping_cols  TEXT,    -- SELECT clause using grouping_cols
+        @param temp_table_columns    TEXT,    -- SELECT clause for creating temporary copy of the source_table
+        @param is_sparse             BOOLEAN, -- specifies if the PCA call is for sparse or dense matrices
+        @param col_id                TEXT,    -- sparse representation based detail
+        @param val_id                TEXT,    -- sparse representation based detail
+        @param row_dim               INTEGER, -- sparse representation based detail
+        @param col_dim               INTEGER  -- sparse representation based detail
 
+    Returns:
+        None
+    """
+    startTime = time.time()  # measure the starting time
+    # Step 1: Modify data format for sparse input
+    if is_sparse:
+        # Step 1.1: Densify the matrix for sparse input tables
+        # We densify the matrix because the recentering process will generate a
+        # dense matrix, so we just wrap around regular PCA.
+        # First we must copy the sparse matrix and add in the dimension information
+        sparse_temp = "pg_temp." + unique_string() + "_sparse"
+        # Add in the dimension information needed by the densifying process
+        create_temp_sparse_matrix_table_with_dims(source_table, sparse_temp,
+                                                  row_id, col_id, val_id,
+                                                  row_dim, col_dim, sparse_where_condition)
+        validate_sparse(sparse_temp,
+                        {'row': row_id, 'col': col_id, 'val': val_id},
+                        check_col=False)
+        # Step 1.2: Densify the input matrix
+        x_dense = "pg_temp." + unique_string() + "_dense"
+        plpy.execute("""
+            SELECT {schema_madlib}.matrix_densify(
+                '{sparse_temp}',
+                'row={row_id}, col={col_id}, val={val_id}',
+                '{x_dense}', 'row=row_id, val=row_vec')
+            """.format(schema_madlib=schema_madlib,
+                sparse_temp=sparse_temp, row_id=row_id,
+                col_id=col_id, val_id=val_id, x_dense=x_dense))
+        plpy.execute("""
+            DROP TABLE IF EXISTS {0};
+            """.format(sparse_temp))
+        source_table_grouped = x_dense
+    else:
+        # Creation of this temp table is unnecessary if the scenario does not involve
+        # grouping, and/or, the input table had perfect values for the row_id column.
+        # This temp table will ensure pca works even when the value of row_id column
+        # in dense matrix format does not have values ranging from 1 to number of rows.
+        source_table_grouped = "pg_temp." + unique_string() + "group_" + str(grp_id)
+        plpy.execute("""
+                    CREATE TABLE {source_table_grouped} AS
+                    SELECT {temp_table_columns}
+                    FROM {source_table}
+                    {grouping_where_clause}
+                """.format(source_table_grouped=source_table_grouped,
+                    source_table=source_table, grouping_where_clause=grouping_where_clause,
+                    temp_table_columns=temp_table_columns))
+    row_id = 'row_id'
     # Make sure that the table has row_id and row_vec
     source_table_copy = "pg_temp." + unique_string() + "_reformated_names"
     created_new_table = cast_dense_input_table_to_correct_columns(
-        schema_madlib, source_table, source_table_copy, row_id)
+        schema_madlib, source_table_grouped, source_table_copy, row_id)
 
     if(created_new_table):
-        source_table = source_table_copy
-
-    [row_dim, col_dim] = get_dims(source_table,
+        plpy.execute("DROP TABLE {0}".format(source_table_grouped))
+        source_table_grouped = source_table_copy
+    [row_dim, col_dim] = get_dims(source_table_grouped,
                                   {'row': 'row_id', 'col': 'col_id',
                                    'val': 'row_vec'})
-    validate_dense(source_table,
+    validate_dense(source_table_grouped,
                    {'row': 'row_id', 'val': 'row_vec'},
                    check_col=False, row_dim=row_dim)
     if k:
@@ -100,15 +349,13 @@ def pca(schema_madlib, source_table, pc_table, row_id,
     else:
         if variance: #lanczos_iter overrides the proportion default for k
             curK = lanczos_iter
-
     # Note: we currently don't support grouping columns or correlation matrices
-    if grouping_cols is None and not use_correlation:
-
+    if not use_correlation:
         # Step 2: Normalize the data (Column means)
         dimension = col_dim
         scaled_source_table = "pg_temp." + unique_string() + "_scaled_table"
         column_mean_str = _recenter_data(schema_madlib,
-                                         source_table,
+                                         source_table_grouped,
                                          scaled_source_table,
                                          'row_id',
                                          'row_vec',
@@ -116,19 +363,18 @@ def pca(schema_madlib, source_table, pc_table, row_id,
         # Step 3: Create temporary output & result summary table
         svd_output_temp_table = "pg_temp."+ unique_string()+ "_svd_out_tbl"
 
-        if result_summary_table is None:
+        if result_summary_table_temp is None:
             result_summary_table_string = ''
         else:
-            result_summary_table_string = ", '{0}'".format(result_summary_table)
-
+            result_summary_table_string = ", '{0}'".format(result_summary_table_temp)
         # Step 4: Perform SVD
         # Step 4.1: Perform upper part of SVD
-        if result_summary_table:
+        if result_summary_table_temp:
             t0 = time.time()
 
         (source_table_svd,bd_pref) = _svd_upper_wrap(schema_madlib,
             scaled_source_table, svd_output_temp_table,
-            row_id, curK, lanczos_iter, result_summary_table)
+            row_id, curK, lanczos_iter, result_summary_table_temp)
 
         # Calculate the sum of values for proportion
         svd_var_s = add_postfix(svd_output_temp_table, "_s")
@@ -141,7 +387,6 @@ def pca(schema_madlib, source_table, pc_table, row_id,
             )
             FROM {scaled_source_table}
             """.format(**locals()))[0]['array_sum']
-
         # Step 4.2: Adjust the k value
         if variance:
             variance_tmp_table = "pg_temp."+ unique_string()+ "_var_tmp"
@@ -165,7 +410,6 @@ def pca(schema_madlib, source_table, pc_table, row_id,
             plpy.execute("""
                 DROP TABLE IF EXISTS {variance_tmp_table}
                 """.format(variance_tmp_table=variance_tmp_table))
-
         # Step 4.3: Perform the lower part of SVD
         tmp_matrix_table = "temp_"+ unique_string()+ "_matrix"
         tmp_matrix_s_table = add_postfix(tmp_matrix_table, "_s")
@@ -193,11 +437,10 @@ def pca(schema_madlib, source_table, pc_table, row_id,
             tmp_matrix_table = svd_output_temp_table
             _svd_lower_wrap(schema_madlib, source_table_svd,
                 svd_output_temp_table, row_id, curK, lanczos_iter, bd_pref)
-
         # Step 4.4: Create the SVD result table
-        if result_summary_table:
+        if result_summary_table_temp:
             t1 = time.time()
-            [row_dim, col_dim] = get_dims(source_table,
+            [row_dim, col_dim] = get_dims(source_table_grouped,
                 {'row': 'row_id', 'col': 'col_id', 'val': 'row_vec'})
             arguments = {'schema_madlib': schema_madlib,
                          'source_table': scaled_source_table,
@@ -206,7 +449,7 @@ def pca(schema_madlib, source_table, pc_table, row_id,
                          'matrix_s': add_postfix(tmp_matrix_table, "_s"),
                          'row_dim': row_dim,
                          'col_dim': col_dim,
-                         'result_summary_table': result_summary_table,
+                         'result_summary_table': result_summary_table_temp,
                          'temp_prefix': "pg_temp." + unique_string(),
                          't0': t0, 't1': t1}
             create_summary_table(**arguments)
@@ -229,11 +472,12 @@ def pca(schema_madlib, source_table, pc_table, row_id,
         # Step 6: Insert the output of SVD into the PCA table
         plpy.execute(
             """
-            CREATE TABLE {pc_table} AS
+            INSERT INTO {pc_table}
             SELECT  {svd_v_transpose}.row_id,
                     row_vec AS principal_components,
                     value / sqrt({row_dim} - 1) AS std_dev,
                     ((value*value)/ {eigen_sum}) AS proportion
+                    {select_grouping_cols}
             FROM {svd_v_transpose},
                  {svd_output_temp_table_s}
             WHERE ({svd_v_transpose}.row_id = {svd_output_temp_table_s}.row_id)
@@ -244,30 +488,24 @@ def pca(schema_madlib, source_table, pc_table, row_id,
                        svd_v_transpose=svd_v_transpose,
                        pc_table=pc_table,
                        row_dim=row_dim,
-                       eigen_sum=eigen_sum))
+                       eigen_sum=eigen_sum,
+                       select_grouping_cols=select_grouping_cols))
         # Output the column mean
-        pc_table_mean = add_postfix(pc_table, "_mean")
         plpy.execute(
             """
-            DROP TABLE IF EXISTS {pc_table_mean};
-            CREATE TABLE {pc_table_mean} AS
+            INSERT INTO {pc_table_mean}
             SELECT '{column_mean_str}'::FLOAT8[] AS column_mean
+            {select_grouping_cols}
             """.format(pc_table_mean=pc_table_mean,
-                       column_mean_str=column_mean_str))
+                       column_mean_str=column_mean_str,
+                       select_grouping_cols=select_grouping_cols))
         # Step 7: Append to the SVD summary table to get the PCA summary table
-        if result_summary_table:
+        if result_summary_table_temp:
             stopTime = time.time()
             dt = (stopTime - startTime) * 1000.
-            summary_table_tmp_name = unique_string()
-            plpy.execute(
-                """
-                ALTER TABLE {result_summary_table}
-                RENAME TO {tmp_name};
-                """.format(result_summary_table=result_summary_table,
-                           tmp_name=summary_table_tmp_name))
             plpy.execute(
                 """
-                CREATE TABLE {result_summary_table} AS
+                INSERT INTO {result_summary_table}
                 SELECT
                     rows_used,
                     {dt} AS "exec_time (ms)",
@@ -275,13 +513,15 @@ def pca(schema_madlib, source_table, pc_table, row_id,
                     recon_error,
                     relative_recon_error,
                     {use_correlation} AS use_correlation
-                FROM {tmp_name};
+                    {select_grouping_cols}
+                FROM {result_summary_table_temp};
                 """.format(result_summary_table=result_summary_table,
                            dt=str(dt), iter=curK,
                            use_correlation=bool(use_correlation),
-                           tmp_name=summary_table_tmp_name))
-            plpy.execute("DROP TABLE {tmp_name};".format(
-                tmp_name=summary_table_tmp_name))
+                           result_summary_table_temp=result_summary_table_temp,
+                           select_grouping_cols=select_grouping_cols))
+            plpy.execute("DROP TABLE {result_summary_table_temp};".format(
+                result_summary_table_temp=result_summary_table_temp))
 
         # Step 8: Output handling & cleanup
         plpy.execute(
@@ -294,6 +534,7 @@ def pca(schema_madlib, source_table, pc_table, row_id,
             DROP TABLE IF EXISTS {svd_output_temp_table_u};
             DROP TABLE IF EXISTS {svd_output_temp_table_v};
             DROP TABLE IF EXISTS {scaled_source_table};
+            DROP TABLE IF EXISTS {source_table_grouped};
             """.format(svd_output_temp_table=svd_output_temp_table,
                        svd_output_temp_table_s=svd_output_temp_table_s,
                        svd_output_temp_table_u=svd_output_temp_table_u,
@@ -301,9 +542,8 @@ def pca(schema_madlib, source_table, pc_table, row_id,
                        scaled_source_table=scaled_source_table,
                        svd_v_transpose=svd_v_transpose,
                        source_table_copy=source_table_copy,
-                       tmp_matrix_s_table=tmp_matrix_s_table))
-
-    plpy.execute("SET client_min_messages TO %s" % old_msg_level)
+                       tmp_matrix_s_table=tmp_matrix_s_table,
+                       source_table_grouped=source_table_grouped))
 # ------------------------------------------------------------------------
 
 # ------------------------------------------------------------------------
@@ -318,7 +558,6 @@ def _validate_args(schema_madlib,
                    val_id=None,
                    row_dim=None,
                    col_dim=None,
-                   grouping_cols=None,
                    lanczos_iter=0,
                    use_correlation=False,
                    result_summary_table=None,
@@ -373,10 +612,6 @@ def _validate_args(schema_madlib,
             "PCA error: {1} column does not exist in {0}!".
             format(source_table, "NULL" if row_id is None else row_id))
 
-    if(grouping_cols):
-        plpy.error("PCA error: Grouping columns are not currently supported!\
-        This value must be set to NULL")
-
     if (lanczos_iter < 0):
         plpy.error("PCA error: lanczos_iter can't be negative! (Use zero for \
         default value)  The provided value is {0}".format(str(lanczos_iter)))
@@ -409,9 +644,6 @@ def _validate_args(schema_madlib,
         if col_dim <= 0:
             plpy.error("PCA error: The column dimension must be larger than 0!")
 
-        validate_sparse(source_table,
-                        {'row': row_id, 'col': col_id, 'val': val_id},
-                        check_col=False)
     if use_correlation:
         plpy.error("PCA error: Using the correlation matrix is not enabled! \
         This value must be set to FALSE")
@@ -476,123 +708,7 @@ def _recenter_data(schema_madlib, source_table, output_table, row_id,
     return x_mean_str
 # ------------------------------------------------------------------------
 
-
-def pca_sparse(schema_madlib,
-               source_table,
-               pc_table,
-               row_id,
-               col_id,
-               val_id,
-               row_dim,
-               col_dim,
-               k,
-               grouping_cols,
-               lanczos_iter,
-               use_correlation,
-               result_summary_table,
-               variance,
-               **kwargs):
-    """
-    Compute the PCA of a sparse matrix in source_table.
-
-    This function is the specific call for dense matrices and creates three
-    tables corresponding to the three decomposition matrices.
-
-    Args:
-        @param schema_madlib
-        @param source_table
-        @param pc_table
-        @param row_id
-        @param col_id
-        @param val_id
-        @param row_dim
-        @param col_dim
-        @param k
-        @param grouping_cols
-        @param lanczos_iter
-        @param use_correlation
-        @param result_summary_table
-        @param variance
-
-    Returns:
-        None
-
-    """
-    startTime = time.time()
-    # Reset the message level to avoid random messages
-    old_msg_level = plpy.execute("""
-                                  SELECT setting
-                                  FROM pg_settings
-                                  WHERE name='client_min_messages'
-                                  """)[0]['setting']
-    plpy.execute('SET client_min_messages TO warning')
-
-    # Step 1: Validate the input arguments
-    _validate_args(schema_madlib, source_table, pc_table, k, row_id, col_id,
-                   val_id, row_dim, col_dim, grouping_cols, lanczos_iter,
-                   use_correlation, result_summary_table, variance)
-
-    # Step 2: Densify the matrix
-    #  We densify the matrix because the recentering process will generate a
-    # dense matrix, so we just wrap around regular PCA.
-    # First we must copy the sparse matrix and add in the dimension information
-
-    sparse_temp = "pg_temp." + unique_string() + "_sparse"
-
-    # Add in the dimension information need by the densifying process
-    create_temp_sparse_matrix_table_with_dims(source_table, sparse_temp,
-                                              row_id, col_id, val_id,
-                                              row_dim, col_dim)
-
-    x_dense = "pg_temp." + unique_string() + "_dense"
-    plpy.execute("""
-        SELECT {schema_madlib}.matrix_densify(
-            '{sparse_temp}',
-            'row={row_id}, col={col_id}, val={val_id}',
-            '{x_dense}', 'row=row_id, val=row_vec')
-        """.format(**locals()))
-
-    # Step 3: Pass the densified matrix to regular PCA
-    pca(schema_madlib, x_dense, pc_table, 'row_id',
-        k, grouping_cols, lanczos_iter, use_correlation,
-        result_summary_table, variance)
-
-    # Step 4: Clean up
-    plpy.execute("""
-        DROP TABLE IF EXISTS {x_dense};
-        DROP TABLE IF EXISTS {sparse_temp};
-        """.format(x_dense=x_dense, sparse_temp=sparse_temp))
-
-    if result_summary_table:
-        stopTime = time.time()
-        dt = (stopTime - startTime) * 1000.
-        summary_table_tmp_name = unique_string()
-        plpy.execute(
-            """
-            ALTER TABLE {result_summary_table}
-            RENAME TO {tmp_name};
-            """.format(result_summary_table=result_summary_table,
-                       tmp_name=summary_table_tmp_name))
-        plpy.execute(
-            """
-            CREATE TABLE {result_summary_table} AS
-            SELECT
-                rows_used,
-                {dt} AS "exec_time (ms)",
-                iter,
-                recon_error,
-                relative_recon_error,
-                use_correlation
-            FROM {tmp_name};
-            """.format(result_summary_table=result_summary_table,
-                       dt=str(dt), tmp_name=summary_table_tmp_name))
-        plpy.execute("DROP TABLE {tmp_name};".format(
-            tmp_name=summary_table_tmp_name))
-
-    plpy.execute("SET client_min_messages TO %s" % old_msg_level)
-# ------------------------------------------------------------------------
-
-
+# Sparse PCA train help function
 def pca_sparse_help_message(schema_madlib, message=None, **kwargs):
     """
     Given a help string, provide usage information
@@ -607,82 +723,119 @@ def pca_sparse_help_message(schema_madlib, message=None, **kwargs):
     if message is not None and \
             message.lower() in ("usage", "help", "?"):
         return """
-        -----------------------------------------------------------------------
-                                    USAGE
-        -----------------------------------------------------------------------
-        SELECT {schema_madlib}.pca_sparse_train(
-            source_table        -- TEXT,    Name of data table
-            pc_table            -- TEXT,    Name of the table containing the principle components
-            row_id              -- TEXT,    Column name for the row coordinates.
-            col_id              -- TEXT,    Column name for the column coordinates.
-            val_id              -- TEXT,    Column name for the sparse values.
-            row_dim,            -- INTEGER, The number of rows in the sparse matrix
-            col_dim,            -- INTEGER, The number of columns in the sparse matrix
-            components_param    -- INTEGER OR FLOAT, The parameter to control the number of principal components to calculate from the input data.
-            [
-            grouping_cols       -- TEXT,    Comma-separated list of grouping columns
-                                            (Default: NULL)
-            lanczos_iter        -- INTEGER, The number of Lanczos iterations to use in the SVD calculation
-                                            (Default: minimum of of the smallest input
-                                                matrix dimension and k+40)
-            use_correlation     -- BOOLEAN, If True correlation matrix is used for principal components
-                                            (Default: False)
-            rslt_summary_table  -- TEXT,    Table name to store summary of results
-                                            (Default: NULL)
-            ]
-        );
-        If components_param is INTEGER it is used for denoting the number of principal components to compute.
-        If components_param is FLOAT it is used as the target proportion of variance.
-        -------------------------------------------------------------------------
-                                OUTPUT TABLES
-        -------------------------------------------------------------------------
-        The output table ("pc_table" above) has the following columns:
-            row_id              -- INTEGER, The ranking of the eigenvalues
-            prin_comp           -- FLOAT[], The principal components
-            eigen_values        -- FLOAT[]  The eigenvalues associated with each principal component
-
-        A secondary output table named "pc_table"_mean is also generated.
-        This table has only the single column:
-            column_mean         -- FLOAT[], The column means of the input data
-
-        -------------------------------------------------------------------------
-                            RESULT SUMMARY TABLE
-        -------------------------------------------------------------------------
-        The result summary table ("rslt_summary_table" above) has the following columns
-            rows_used              -- INTEGER,  Number of rows used in the PCA calculation
-            exec_time              -- FLOAT,    Number of milliseconds the PCA calculation took
-            use_correlation        -- BOOLEAN,  Value of parameter use_correlation
-            iter                   -- INTEGER,  Number of iterations the SVD took to converge
-            recon_error            -- FLOAT,    Absolute error in the approximation
-            relative_recon_error   -- FLOAT     Relative error in the approximation
+-----------------------------------------------------------------------
+                            USAGE
+-----------------------------------------------------------------------
+SELECT {schema_madlib}.pca_sparse_train(
+    source_table        -- TEXT,    Name of data table
+    pc_table            -- TEXT,    Name of the table containing the principal components
+    row_id              -- TEXT,    Column name for the row coordinates.
+    col_id              -- TEXT,    Column name for the column coordinates.
+    val_id              -- TEXT,    Column name for the sparse values.
+    row_dim,            -- INTEGER, The number of rows in the sparse matrix
+    col_dim,            -- INTEGER, The number of columns in the sparse matrix
+    components_param    -- INTEGER OR FLOAT, The parameter to control the number of
+                                    principal components to calculate from the input data.
+    grouping_cols       -- TEXT,    Comma-separated list of grouping columns
+                                    (Default: NULL)
+    lanczos_iter        -- INTEGER, The number of Lanczos iterations to use in the SVD calculation
+                                    (Default: minimum of the smallest input
+                                        matrix dimension and k+40)
+    use_correlation     -- BOOLEAN, If True correlation matrix is used for principal components
+                                    (Default: False)
+    rslt_summary_table  -- TEXT,    Table name to store summary of results
+                                    (Default: NULL)
+);
+If components_param is INTEGER it is used for denoting the number of principal components to compute.
+If components_param is FLOAT it is used as the target proportion of variance.
+-------------------------------------------------------------------------
+                        OUTPUT TABLES
+-------------------------------------------------------------------------
+A PCA model is created for each group, if grouping_cols is specified.
+The output table ("pc_table" above) has the following columns:
+    row_id              -- INTEGER, The ranking of the eigenvalues
+    prin_comp           -- FLOAT[], The principal components
+    eigen_values        -- FLOAT[]  The eigenvalues associated with each principal component
+    grouping_cols       -- The grouping columns (with their types), if any,
+                           specified in grouping_cols
+
+A secondary output table named "pc_table"_mean is also generated.
+This table has only the single column:
+    column_mean         -- FLOAT[], The column means of the input data
+
+-------------------------------------------------------------------------
+                    RESULT SUMMARY TABLE
+-------------------------------------------------------------------------
+The result summary table ("rslt_summary_table" above) has the following columns
+    rows_used              -- INTEGER,  Number of rows used in the PCA calculation
+    exec_time              -- FLOAT,    Number of milliseconds the PCA calculation took
+    use_correlation        -- BOOLEAN,  Value of parameter use_correlation
+    iter                   -- INTEGER,  Number of iterations the SVD took to converge
+    recon_error            -- FLOAT,    Absolute error in the approximation
+    relative_recon_error   -- FLOAT     Relative error in the approximation
+    grouping_cols          -- The grouping columns (with their types), if any,
+                           specified in grouping_cols
         """.format(schema_madlib=schema_madlib)
     else:
-        if message.lower() in ("example", "examples"):
+        if message is not None and \
+                message.lower() in ("example", "examples"):
             return """
-DROP TABLE IF EXISTS sparse_mat;
-CREATE TABLE sparse_mat (
+----------------------------------------------------------------
+                        Examples
+----------------------------------------------------------------
+DROP TABLE IF EXISTS mat_sparse;
+CREATE TABLE mat_sparse (
     row_id integer,
     col_id integer,
-    val_id integer
+    value double precision
 );
-COPY sparse_mat (row_id, col_id, val_id) FROM stdin delimiter '|';
-1|2|4
-1|5|6
-3|8|4
-5|4|2
-6|6|12
-8|1|2
-8|7|2
-9|3|4
-9|8|2
+INSERT INTO mat_sparse VALUES
+(1, 1, 1.0),
+(2, 2, 2.0),
+(3, 3, 3.0),
+(4, 4, 4.0),
+(1, 5, 5.0),
+(2, 4, 6.0),
+(3, 2, 7.0),
+(4, 3, 8.0);
 \.
-DROP TABLE IF EXISTS result_table;
-DROP TABLE IF EXISTS result_table_mean;
-SELECT pca_sparse_train('sparse_mat', 'result_table',
-'row_id', 'col_id', 'val_id', 10, 10, 10);
-            """
+
+DROP TABLE IF EXISTS result_table_sparse;
+DROP TABLE IF EXISTS result_table_sparse_mean;
+SELECT {schema_madlib}.pca_sparse_train('mat_sparse', 'result_table_sparse',
+'row_id', 'col_id', 'value', 4, 5, 3);
+
+SELECT * FROM result_table_sparse ORDER BY row_id;
+
+DROP TABLE IF EXISTS mat_sparse_group;
+CREATE TABLE mat_sparse_group (
+    row_id integer,
+    col_id integer,
+    value double precision,
+    matrix_id integer);
+INSERT INTO mat_sparse_group VALUES
+(1, 1, 1.0, 1),
+(2, 2, 2.0, 1),
+(3, 3, 3.0, 1),
+(4, 4, 4.0, 1),
+(1, 5, 5.0, 1),
+(2, 4, 6.0, 2),
+(3, 2, 7.0, 2),
+(4, 3, 8.0, 2);
+\.
+
+DROP TABLE IF EXISTS result_table_sparsed_grouped;
+DROP TABLE IF EXISTS result_table_sparsed_grouped_mean;
+SELECT {schema_madlib}.pca_sparse_train('mat_sparse_group', 'result_table_sparsed_grouped',
+'row_id', 'col_id', 'value', 4, 5, 0.8, 'matrix_id');
+
+SELECT * FROM result_table_sparsed_grouped ORDER BY matrix_id, row_id;
+            """.format(schema_madlib=schema_madlib)
         else:
             return """
+----------------------------------------------------------------
+         Summary: Sparse PCA Training
+----------------------------------------------------------------
 Principal component analysis (PCA) is a mathematical procedure that uses an
 orthogonal transformation to convert a set of observations of possibly
 correlated variables into a set of values of linearly uncorrelated variables
@@ -692,8 +845,10 @@ accounts for as much of the variability in the data as possible), and each
 succeeding component in turn has the highest variance possible under the
 constraint that it be orthogonal to (i.e., uncorrelated with) the preceding
 components.
-
-For an overview on usage, run: SELECT {schema_madlib}.pca_sparse_train('usage');
+--
+For an overview on usage, run:
+SELECT {schema_madlib}.pca_sparse_train('usage');
+--
         """.format(schema_madlib=schema_madlib)
 
 
@@ -711,75 +866,121 @@ def pca_help_message(schema_madlib, message=None, **kwargs):
     if message is not None and \
             message.lower() in ("usage", "help", "?"):
         return """
-        -----------------------------------------------------------------------
-                                    USAGE
-        -----------------------------------------------------------------------
-        SELECT {schema_madlib}.pca_train(
-            source_table        -- TEXT,    Name of data table
-            pc_table            -- TEXT,    Name of the table containing the principle components
-            row_id              -- TEXT,    Column name for the row coordinates.
-            components_param    -- INTEGER OR FLOAT, The parameter to control the number of principal components to calculate from the input data.
-            [
-            grouping_cols       -- TEXT,    Comma-separated list of grouping columns
-                                            (Default: NULL)
-            lanczos_iter        -- INTEGER, The number of Lanczos iterations to use in the SVD calculation
-                                            (Default: minimum of of the smallest input
-                                                matrix dimension and k+40)
-            use_correlation     -- BOOLEAN, If True correlation matrix is used for principal components
-                                            (Default: False)
-            rslt_summary_table  -- TEXT,    Table name to store summary of results
-                                            (Default: NULL)
-            variance            -- DOUBLE PRECISION, Proportion of variance
-                                            (Default: NULL)
-            ]
-        );
-        If components_param is INTEGER it is used for denoting the number of principal components to compute.
-        If components_param is FLOAT it is used as the target proportion of variance.
-        -------------------------------------------------------------------------
-                                OUTPUT TABLES
-        -------------------------------------------------------------------------
-        The output table ("pc_table" above) has the following columns:
-            row_id              -- INTEGER, The ranking of the eigenvalues
-            prin_comp           -- FLOAT[], The principal components
-            eigen_values        -- FLOAT[]  The eigenvalues associated with each principal component
-
-        A secondary output table named "pc_table"_mean is also generated.
-        This table has only the single column:
-            column_mean         -- FLOAT[], The column means of the input data
-        -------------------------------------------------------------------------
-                            RESULT SUMMARY TABLE
-        -------------------------------------------------------------------------
-        The result summary table ("rslt_summary_table" above) has the following columns
-            rows_used              -- INTEGER,  Number of rows used in the PCA calculation
-            exec_time              -- FLOAT,    Number of milliseconds the PCA calculation took
-            use_correlation        -- BOOLEAN,  Value of parameter use_correlation
-            iter                   -- INTEGER,  Number of iterations the SVD took to converge
-            recon_error            -- FLOAT,    Absolute error in the approximation
-            relative_recon_error   -- FLOAT     Relative error in the approximation
+-----------------------------------------------------------------------
+                            USAGE
+-----------------------------------------------------------------------
+SELECT {schema_madlib}.pca_train(
+    source_table        -- TEXT,    Name of data table
+    pc_table            -- TEXT,    Name of the table containing the principal components
+    row_id              -- TEXT,    Column name for the row coordinates.
+    components_param    -- INTEGER OR FLOAT, The parameter to control the number of
+                                             principal components to calculate from
+                                             the input data.
+
+    grouping_cols       -- TEXT,    Comma-separated list of grouping column names
+                                    (Default: NULL)
+    lanczos_iter        -- INTEGER, The number of Lanczos iterations to use in the SVD calculation
+                                    (Default: minimum of the smallest input
+                                        matrix dimension and k+40)
+    use_correlation     -- BOOLEAN, If True correlation matrix is used for principal components
+                                    (Default: False)
+    rslt_summary_table  -- TEXT,    Table name to store summary of results
+                                    (Default: NULL)
+    variance            -- DOUBLE PRECISION, Proportion of variance
+                                    (Default: NULL)
+);
+If components_param is INTEGER it is used for denoting the number of
+principal components to compute. If components_param is FLOAT it is used
+as the target proportion of variance.
+-------------------------------------------------------------------------
+                        OUTPUT TABLES
+-------------------------------------------------------------------------
+A PCA model is created for each group, if grouping_cols is specified.
+The output table ("pc_table" above) has the following columns:
+    row_id              -- INTEGER, The ranking of the eigenvalues
+    prin_comp           -- FLOAT[], The principal components
+    eigen_values        -- FLOAT[], The eigenvalues associated with each
+                                    principal component
+    grouping_cols       -- The grouping columns (with their types), if any,
+                           specified in grouping_cols
+
+A secondary output table named "pc_table"_mean is also generated.
+This table has the following columns:
+    column_mean         -- FLOAT[], The column means of the input data
+    grouping_cols       -- The grouping columns (with their types), if any,
+                           specified in grouping_cols
+-------------------------------------------------------------------------
+                    RESULT SUMMARY TABLE
+-------------------------------------------------------------------------
+The result summary table ("rslt_summary_table" above) has the following columns
+    rows_used              -- INTEGER,  Number of rows used in the PCA calculation
+    exec_time              -- FLOAT,    Number of milliseconds the PCA calculation took
+    use_correlation        -- BOOLEAN,  Value of parameter use_correlation
+    iter                   -- INTEGER,  Number of iterations the SVD took to converge
+    recon_error            -- FLOAT,    Absolute error in the approximation
+    relative_recon_error   -- FLOAT     Relative error in the approximation
+    grouping_cols          -- The grouping columns (with their types), if any,
+                              specified in grouping_cols
         """.format(schema_madlib=schema_madlib)
     else:
-        if message.lower() in ("example", "examples"):
+        if message is not None and \
+                message.lower() in ("example", "examples"):
             return """
+----------------------------------------------------------------
+                        Examples
+----------------------------------------------------------------
 DROP TABLE IF EXISTS mat;
 CREATE TABLE mat (
-    row_id integer,
+    id integer,
     row_vec double precision[]
 );
-COPY mat (row_id, row_vec) FROM stdin DELIMITER '|';
-1|{1,2,3}
-2|{2,1,2}
-3|{3,2,1}
+COPY mat (id, row_vec) FROM stdin DELIMITER '|';
+1|{{1,2,3}}
+2|{{2,1,2}}
+3|{{3,2,1}}
 \.
+
 DROP TABLE IF EXISTS result_table;
 DROP TABLE IF EXISTS result_table_mean;
-SELECT pca_train( 'mat',
-                  'result_table',
-                  'row_id',
-                  3
+SELECT {schema_madlib}.pca_train( 'mat',
+          'result_table',
+          'id',
+          3
     );
-            """
+    
+SELECT * FROM result_table ORDER BY row_id;
+
+DROP TABLE IF EXISTS mat_group;
+CREATE TABLE mat_group (
+    id integer,
+    row_vec double precision[],
+    matrix_id integer
+);
+INSERT INTO mat_group VALUES
+(1, '{{1,2,3}}', 1),
+(2, '{{2,1,2}}', 1),
+(3, '{{3,2,1}}', 1),
+(4, '{{1,2,3,4,5}}', 2),
+(5, '{{2,5,2,4,1}}', 2),
+(6, '{{5,4,3,2,1}}', 2);
+\.
+
+DROP TABLE IF EXISTS result_table_grp;
+DROP TABLE IF EXISTS result_table_grp_mean;
+SELECT {schema_madlib}.pca_train( 'mat_group',
+          'result_table_grp',
+          'id',
+          0.9,
+          'matrix_id'
+    );
+
+SELECT * FROM result_table_grp ORDER BY matrix_id, row_id;
+            """.format(schema_madlib=schema_madlib)
         else:
             return """
+----------------------------------------------------------------
+         Summary: PCA Training
+----------------------------------------------------------------
 Principal component analysis (PCA) is a mathematical procedure that uses an
 orthogonal transformation to convert a set of observations of possibly
 correlated variables into a set of values of linearly uncorrelated variables
@@ -789,6 +990,8 @@ accounts for as much of the variability in the data as possible), and each
 succeeding component in turn has the highest variance possible under the
 constraint that it be orthogonal to (i.e., uncorrelated with) the preceding
 components.
-
-For an overview on usage, run: SELECT {schema_madlib}.pca_train('usage');
+--
+For an overview on usage, run:
+SELECT {schema_madlib}.pca_train('usage');
+--
             """.format(schema_madlib=schema_madlib)

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/02a7ef45/src/ports/postgres/modules/pca/pca.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/pca/pca.sql_in b/src/ports/postgres/modules/pca/pca.sql_in
index 9f573f3..6bcce1c 100644
--- a/src/ports/postgres/modules/pca/pca.sql_in
+++ b/src/ports/postgres/modules/pca/pca.sql_in
@@ -208,20 +208,26 @@ variance feature was introduced.  A special case to be aware of:
 'components_param' = 1 (INTEGER) will return 1 principal
 component, but 'components_param' = 1.0 (FLOAT) will return all 
 principal components, i.e., proportion of variance of 100%.
+\n \n
+Also, please note that the number of principal components (<em>k</em>)
+is global, even in the case where grouping is used (see 'grouping_cols'
+below).  In the case of grouping, proportion of variance 
+might be a better choice; this could result in different numbers
+of principal components for different groups.
 
 <DT>grouping_cols (optional)</DT>
-<DD>TEXT, default: NULL.  
+<DD>TEXT, default: NULL. A comma-separated list of column names, with the
+source data grouped using the combination of all the columns. An independent
+PCA model will be computed for each combination of the grouping columns.</DD>
 
-@note <em>Not currently implemented. Any non-NULL value is ignored.
-   Grouping support will be added in a future release. </em> The parameter 
-   is planned to be implemented as a 
-   comma-separated list of column names, with the source data grouped using 
-   the combination of all the columns. An independent PCA model will be 
-   computed for each combination of the grouping columns.</DD>
+@note Dense matrices can be different sizes for different groups if desired.  
+Sparse matrices cannot be different sizes for different groups,
+because the 'row_dim' and 'col_dim' parameters used for sparse matrices 
+are global across all groups.
 
 <DT>lanczos_iter (optional)</DT>
 <DD>INTEGER, default: minimum of {<em>k+40</em>, smallest matrix dimension}
-where <em>k</em> is the number of principle components specified in the 
+where <em>k</em> is the number of principal components specified in the 
 parameter 'components_param'.  This parameter defines the 
 number of Lanczos iterations for the SVD calculation.
 The Lanczos iteration number roughly corresponds to the accuracy of the SVD
@@ -280,7 +286,7 @@ This sumary table has the following columns:
 @anchor examples
 @examp
 
--# View online help for the PCA training function:
+-# View online help for the PCA training functions:
 <pre class="example">
 SELECT madlib.pca_train();
 or
@@ -290,115 +296,192 @@ SELECT madlib.pca_sparse_train();
 -# Create sample data in dense matrix form:
 <pre class="example">
 DROP TABLE IF EXISTS mat;
-CREATE TABLE mat (
-    			row_id integer,
-    			row_vec double precision[]
-);
+CREATE TABLE mat (id integer,
+                  row_vec double precision[]
+                  );
 INSERT INTO mat VALUES
 (1, '{1,2,3}'),
 (2, '{2,1,2}'),
 (3, '{3,2,1}');
 </pre>
 
--# Run the PCA function for a specified number of principle components and view the results:
+-# Run the PCA function for a specified number of principal components and view the results:
 <pre class="example">
 DROP TABLE IF EXISTS result_table, result_table_mean;
-SELECT madlib.pca_train( 'mat',
-                        'result_table',
-                        'row_id',
-                         3);
-SELECT * FROM result_table;
+SELECT madlib.pca_train('mat',             -- Source table
+                        'result_table',    -- Output table
+                        'id',              -- Row id of source table
+                         2);               -- Number of principal components
+SELECT * FROM result_table ORDER BY row_id;
 </pre>
 <pre class="result">
- row_id |                     principal_components                     |       std_dev        |      proportion      
---------+--------------------------------------------------------------+----------------------+----------------------
-      1 | {-0.707106781186547,-1.6306400674182e-16,0.707106781186547}  |     1.41421356237309 |    0.857142857142245
-      2 | {-1.66533453693773e-16,1,5.55111512312578e-17}               |    0.577350269189626 |    0.142857142857041
-      3 | {-0.707106781186548,1.11022302462516e-16,-0.707106781186547} | 1.59506745224211e-16 | 1.09038864737157e-32
+ row_id |                     principal_components                     |      std_dev      |    proportion     
+--------+--------------------------------------------------------------+-------------------+-------------------
+      1 | {0.707106781186547,-6.93889390390723e-18,-0.707106781186548} |  1.41421356237309 | 0.857142857142244
+      2 | {0,1,0}                                                      | 0.577350269189626 | 0.142857142857041
+(2 rows)
 </pre>
 
 -# Run the PCA function for a specified proportion of variance and view the results:
 <pre class="example">
+%%sql
 DROP TABLE IF EXISTS result_table, result_table_mean;
-SELECT madlib.pca_train( 'mat',
-                         'result_table',
-                         'row_id',
-                          0.9);
-SELECT * FROM result_table;
+SELECT madlib.pca_train('mat',             -- Source table
+                        'result_table',    -- Output table
+                        'id',              -- Row id of source table
+                         0.9);             -- Proportion of variance
+SELECT * FROM result_table ORDER BY row_id;
 </pre>
 <pre class="result">
  row_id |                     principal_components                     |      std_dev      |    proportion     
 --------+--------------------------------------------------------------+-------------------+-------------------
-      1 | {-0.707106781186548,-3.46944695195361e-17,0.707106781186548} |   1.4142135623731 | 0.857142857142245
-      2 | {2.22044604925031e-16,-1,1.11022302462516e-16}               | 0.577350269189626 | 0.142857142857041
+      1 | {0.707106781186548,-2.77555756156289e-17,-0.707106781186548} |   1.4142135623731 | 0.857142857142245
+      2 | {-1.11022302462516e-16,-1,0}                                 | 0.577350269189626 | 0.142857142857041
+(2 rows)
 </pre>
 
--# Create sample data in sparse matrix form:
+-# Now we use grouping in dense form to learn different models for different groups.
+First, we create sample data in dense matrix form with a grouping column.
+Note we actually have different matrix sizes for the different groups, which 
+is allowed for dense:
 <pre class="example">
-DROP TABLE IF EXISTS sparse_mat;
-CREATE TABLE sparse_mat (
-                        row_id integer,
-                        col_id integer,
-                        val_id integer
-                        );
-INSERT INTO sparse_mat VALUES
-(1, 2, 4.0),
-(1, 5, 6.0),
-(3, 8, 4.0),
-(5, 4, 2.0),
-(6, 6, 12.0),
-(8, 1, 2.0),
-(8, 7, 2.0),
-(9, 3, 4.0),
-(9, 8, 2.0);
+DROP TABLE IF EXISTS mat_group;
+CREATE TABLE mat_group (
+    id integer,
+    row_vec double precision[],
+    matrix_id integer
+);
+INSERT INTO mat_group VALUES
+(1, '{1,2,3}', 1),
+(2, '{2,1,2}', 1),
+(3, '{3,2,1}', 1),
+(4, '{1,2,3,4,5}', 2),
+(5, '{2,5,2,4,1}', 2),
+(6, '{5,4,3,2,1}', 2);
 </pre>
 
--# This matrix is what this matrix looks like in dense form:
+-# Run the PCA function with grouping for a specified proportion of variance and view the results:
 <pre class="example">
-DROP TABLE IF EXISTS dense_mat;
-SELECT madlib.matrix_densify(
-							'sparse_mat', 
-							'row=row_id, col=col_id, val=val_id', 
-							'dense_mat');
-SELECT * FROM dense_mat order by row_id;
+DROP TABLE IF EXISTS result_table_group, result_table_group_mean;
+SELECT madlib.pca_train('mat_group',             -- Source table
+                        'result_table_group',    -- Output table
+                        'id',                    -- Row id of source table
+                         0.8,                    -- Proportion of variance
+                        'matrix_id');            -- Grouping column
+SELECT * FROM result_table_group ORDER BY matrix_id, row_id;
 </pre>
 <pre class="result">
- row_id |       val_id       
---------+--------------------
-      1 | {0,4,0,0,6,0,0,0}
-      2 | {0,0,0,0,0,0,0,0}
-      3 | {0,0,0,0,0,0,0,4}
-      4 | {0,0,0,0,0,0,0,0}
-      5 | {0,0,0,2,0,0,0,0}
-      6 | {0,0,0,0,0,12,0,0}
-      7 | {0,0,0,0,0,0,0,0}
-      8 | {2,0,0,0,0,0,2,0}
-      9 | {0,0,4,0,0,0,0,2}
+ row_id |                                      principal_components                                      |     std_dev     |    proportion     | matrix_id 
+--------+------------------------------------------------------------------------------------------------+-----------------+-------------------+-----------
+      1 | {0.707106781186548,0,-0.707106781186547}                                                       | 1.4142135623731 | 0.857142857142245 |         1
+      1 | {-0.555378486712784,-0.388303582074091,0.0442457354870796,0.255566375612852,0.688115693174023} | 3.2315220311722 | 0.764102534485173 |         2
+      2 | {0.587384101786277,-0.485138064894743,0.311532046315153,-0.449458074050715,0.347212037159181}  |  1.795531127192 | 0.235897465516047 |         2
+(3 rows)
 </pre>
 
--# Run the PCA sparse function for a specified number of principle components and view the results:
+-# Now let's look at sparse matrices.  Create sample data in sparse matrix form:
 <pre class="example">
-DROP TABLE IF EXISTS result_table, result_table_mean;
-SELECT madlib.pca_sparse_train(
-                                'sparse_mat', 
-                                'result_table',
-                                'row_id', 
-                                'col_id', 
-                                'val_id', 
-                                9, 
-                                8, 
-                                5);
-SELECT * FROM result_table;
+DROP TABLE IF EXISTS mat_sparse;
+CREATE TABLE mat_sparse (
+    row_id integer,
+    col_id integer,
+    value double precision
+);
+INSERT INTO mat_sparse VALUES
+(1, 1, 1.0),
+(2, 2, 2.0),
+(3, 3, 3.0),
+(4, 4, 4.0),
+(1, 5, 5.0),
+(2, 4, 6.0),
+(3, 2, 7.0),
+(4, 3, 8.0);
+</pre>
+As an aside, this is what the sparse matrix above looks like when 
+put in dense form:
+<pre class="example">
+DROP TABLE IF EXISTS mat_dense;
+SELECT madlib.matrix_densify('mat_sparse', 
+                            'row=row_id, col=col_id, val=value', 
+                            'mat_dense');
+SELECT * FROM mat_dense ORDER BY row_id;
+</pre>
+<pre class="result">
+ row_id |    value    
+--------+-------------
+      1 | {1,0,0,0,5}
+      2 | {0,2,0,6,0}
+      3 | {0,7,3,0,0}
+      4 | {0,0,8,4,0}
+(4 rows)
+</pre>
+
+-# Run the PCA sparse function for a specified number of principal components and view the results:
+<pre class="example">DROP TABLE IF EXISTS result_table, result_table_mean;
+SELECT madlib.pca_sparse_train( 'mat_sparse',       -- Source table
+                                'result_table',     -- Output table
+                                'row_id',           -- Row id of source table
+                                'col_id',           -- Column id of source table
+                                'value',            -- Value of matrix at row_id, col_id
+                                4,                  -- Actual number of rows in the matrix
+                                5,                  -- Actual number of columns in the matrix
+                                3);                 -- Number of principal components                            
+SELECT * FROM result_table ORDER BY row_id;
 </pre>
-Result (with principle components truncated for readability):
+Result (with principal components truncated for readability):
 <pre class="result">
-  row_id |      principal_components                   |      std_dev      |     proportion     
---------+----------------------------------------------------------------------------------------
-      1 | {0.0189854059340971,0.0593979357345431,…    |  4.03069474374092 |  0.604208682045711
-      2 | {0.0346801706473592,-0.536234300404824,…    |  2.42282285507368 |  0.218308410262949
-      3 | {0.166190350977087,-0.112693750915351,…     |  1.54680674776235 | 0.0889814051004931
-      4 | {-0.0699448377725649,0.00569475043252321,…  |  1.10233418049845 | 0.0451911810308358
-      5 | {0.645363366217337,0.0403370697192613,…     | 0.906957663197704 | 0.0305915282045503
+ row_id |         principal_components                 |     std_dev      |    proportion     
+--------+----------------------------------------------+------------------+-------------------
+      1 | {-0.0876046030186158,-0.0968983772909994,... | 4.21362803829554 | 0.436590030617467
+      2 | {-0.0647272661608605,0.877639526308692,...   | 3.68408023747461 | 0.333748701544697
+      3 | {-0.0780380267884855,0.177956517174911,...   | 3.05606908060098 | 0.229661267837836
+(3 rows)
+</pre>
+
+-# Now we use grouping in sparse form to learn different models for different groups.
+First, we create sample data in sparse matrix form with a grouping column:
+<pre class="example">
+DROP TABLE IF EXISTS mat_sparse_group;
+CREATE TABLE mat_sparse_group (
+    row_id integer,
+    col_id integer,
+    value double precision,
+    matrix_id integer);
+INSERT INTO mat_sparse_group VALUES
+(1, 1, 1.0, 1),
+(2, 2, 2.0, 1),
+(3, 3, 3.0, 1),
+(4, 4, 4.0, 1),
+(1, 5, 5.0, 1),
+(2, 4, 6.0, 2),
+(3, 2, 7.0, 2),
+(4, 3, 8.0, 2);
+</pre>
+
+-#  Run the PCA function with grouping for a specified proportion of variance
+and view the results:
+<pre class="example">
+DROP TABLE IF EXISTS result_table_group, result_table_group_mean;
+SELECT madlib.pca_sparse_train( 'mat_sparse_group',   -- Source table
+                                'result_table_group', -- Output table
+                                'row_id',             -- Row id of source table
+                                'col_id',             -- Column id of source table
+                                'value',              -- Value of matrix at row_id, col_id
+                                4,                    -- Actual number of rows in the matrix
+                                5,                    -- Actual number of columns in the matrix
+                                0.8,                  -- Proportion of variance
+                                'matrix_id');
+SELECT * FROM result_table_group ORDER BY matrix_id, row_id;
+</pre>
+Result (with principal components truncated for readability):
+<pre class="result">
+ row_id |           principal_components             |     std_dev      |    proportion     | matrix_id 
+--------+--------------------------------------------+------------------+-------------------+-----------
+      1 | {-0.17805696611353,0.0681313257646983,...  | 2.73659933165925 | 0.544652792875481 |         1
+      2 | {-0.0492086814863993,0.149371585357526,... | 2.06058314533194 | 0.308800210823714 |         1
+      1 | {0,-0.479486114660443,...                  | 4.40325305087975 | 0.520500333693473 |         2
+      2 | {0,0.689230898585949,...                   |  3.7435566458567 | 0.376220573442628 |         2
+(4 rows)
 </pre>
 
 @anchor notes
@@ -453,13 +536,13 @@ recovered as the entries of \f$ {\boldsymbol \Sigma}/(\sqrt{(N-1)} \f$, and the
 components are the rows of  \f$ {\boldsymbol V} \f$. The reasoning behind using N − 1 instead of N to calculate the covariance is <a href="https://en.wikipedia.org/wiki/Bessel%27s_correction">Bessel's correction</a>.
 
 
-It is important to note that the PCA implementation assumes that the user will
- use only the principal components that have non-zero eigenvalues.  The SVD
- calculation is done with the Lanczos method, with does not guarantee
- correctness for singular vectors with zero-valued eigenvalues.  Consequently,
-  principal components with zero-valued eigenvalues are not guaranteed to be correct.
- Generally, this will not be problem unless the user wants to use the
- principal components for the entire eigenspectrum.
+@note This PCA implementation assumes that the user will
+use only the principal components that have non-zero eigenvalues.  The SVD
+calculation is done with the Lanczos method, which does not guarantee
+correctness for singular vectors with zero-valued eigenvalues.  Consequently,
+principal components with zero-valued eigenvalues are not guaranteed to be correct.
+Generally, this will not be a problem unless the user wants to use the
+principal components for the entire eigenspectrum.
 
 
 @anchor literature
@@ -478,8 +561,6 @@ File pca.sql_in documenting the SQL functions
 
 \ref grp_pca_project
 
-
-
 */
 
 -- -----------------------------------------------------------------------
@@ -512,14 +593,14 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
 CREATE OR REPLACE FUNCTION
 MADLIB_SCHEMA.pca_train(
-    source_table    TEXT,   -- Source table name (dense matrix)
-    pc_table        TEXT,   -- Output table name for the principal components
-    row_id          TEXT,   -- Column name for the ID for each row
-    k               INTEGER,-- Number of principal components to compute
-    grouping_cols   TEXT,   -- Comma-separated list of grouping columns
-    lanczos_iter    INTEGER,-- The number of Lanczos iterations for the SVD calculation
-    use_correlation BOOLEAN, -- If True correlation matrix is used for principal components
-    result_summary_table  TEXT    -- Table name to store summary of results (Default: NULL)
+    source_table    TEXT,       -- Source table name (dense matrix)
+    pc_table        TEXT,       -- Output table name for the principal components
+    row_id          TEXT,       -- Column name for the ID for each row
+    k               INTEGER,    -- Number of principal components to compute
+    grouping_cols   TEXT,       -- Comma-separated list of grouping columns
+    lanczos_iter    INTEGER,    -- The number of Lanczos iterations for the SVD calculation
+    use_correlation BOOLEAN,    -- If True correlation matrix is used for principal components
+    result_summary_table  TEXT  -- Table name to store summary of results (Default: NULL)
 )
 RETURNS VOID AS $$
     SELECT MADLIB_SCHEMA.pca_train($1, $2, $3, $4, $5, $6, $7, $8, NULL)
@@ -913,3 +994,32 @@ BEGIN
 END;
 $$ LANGUAGE plpgsql IMMUTABLE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._pca_union(
+    source_table          TEXT,    -- Source table name (dense matrix)
+    pc_table              TEXT,    -- Output table name for the principal components
+    pc_table_mean         TEXT,    -- Output table name for the column means (NOTE(review): inferred from the name; confirm against caller)
+    row_id                TEXT,    -- Column name for the ID for each row
+    k                     INTEGER, -- Number of principal components to compute
+    grouping_cols         TEXT,    -- Comma-separated list of grouping columns (Default: NULL)
+    lanczos_iter          INTEGER, -- The number of Lanczos iterations for the SVD calculation (Default: min(k+40, smallest Matrix dimension))
+    use_correlation       BOOLEAN, -- If True correlation matrix is used for principal components (Default: False)
+    result_summary_table  TEXT,    -- Table name to store summary of results (Default: NULL)
+    result_summary_table_temp  TEXT,    -- Temporary table name to store summary of results (Default: NULL)
+    variance              DOUBLE PRECISION,   -- The proportion of variance (Default: NULL)
+    grp_id                INTEGER, -- A placeholder id for each group
+    grouping_where_clause TEXT,    -- WHERE clause using grouping_cols
+    sparse_where_condition TEXT,   -- WHERE clause used when creating temp sparse matrix table with dims
+    select_grouping_cols  TEXT,    -- SELECT clause using grouping_cols
+    temp_table_columns    TEXT,    -- SELECT clause for creating temporary copy of the source_table
+    is_sparse             BOOLEAN, -- Specifies if the PCA call is for sparse or dense matrices
+    col_id                TEXT,    -- Sparse representation detail (presumably the column-id column name; confirm)
+    val_id                TEXT,    -- Sparse representation detail (presumably the value column name; confirm)
+    row_dim               INTEGER, -- Sparse representation detail (presumably the number of matrix rows; confirm)
+    col_dim               INTEGER  -- Sparse representation detail (presumably the number of matrix columns; confirm)
+)
+RETURNS VOID AS $$
+PythonFunction(pca, pca, _pca_union)
+$$ LANGUAGE plpythonu
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');