You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by nj...@apache.org on 2018/04/14 00:22:36 UTC

[1/3] madlib git commit: Utilities: Add functions for postgres character/boolean type comparison.

Repository: madlib
Updated Branches:
  refs/heads/master b2efa7d0b -> 259e00416


Utilities: Add functions for postgres character/boolean type comparison.

This commit adds two functions to check if a given type matches
one of the predefined postgres character or boolean types.


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/c902cb60
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/c902cb60
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/c902cb60

Branch: refs/heads/master
Commit: c902cb60fcbc436fadee8902bb24cbbd02eaad0b
Parents: b2efa7d
Author: Nikhil Kak <nk...@pivotal.io>
Authored: Fri Apr 6 11:35:16 2018 -0700
Committer: Nandish Jayaram <nj...@apache.org>
Committed: Fri Apr 13 17:16:38 2018 -0700

----------------------------------------------------------------------
 .../test/unit_tests/test_utilities.py_in        | 20 ++++++++++++++++
 .../postgres/modules/utilities/utilities.py_in  | 24 ++++++++++++++++++++
 2 files changed, 44 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/madlib/blob/c902cb60/src/ports/postgres/modules/utilities/test/unit_tests/test_utilities.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/test/unit_tests/test_utilities.py_in b/src/ports/postgres/modules/utilities/test/unit_tests/test_utilities.py_in
index 0f38a05..7ad6253 100644
--- a/src/ports/postgres/modules/utilities/test/unit_tests/test_utilities.py_in
+++ b/src/ports/postgres/modules/utilities/test/unit_tests/test_utilities.py_in
@@ -196,5 +196,25 @@ class UtilitiesTestCase(unittest.TestCase):
                                               {'class': 'c'}]),
                          {'class': ['a', 'b', 'c']})
 
+    def test_is_psql_char_type(self):
+        self.assertTrue(self.subject.is_psql_char_type('text'))
+        self.assertTrue(self.subject.is_psql_char_type('varchar'))
+        self.assertTrue(self.subject.is_psql_char_type('character varying'))
+        self.assertTrue(self.subject.is_psql_char_type('char'))
+        self.assertTrue(self.subject.is_psql_char_type('character'))
+
+        self.assertFalse(self.subject.is_psql_char_type('c1har'))
+        self.assertFalse(self.subject.is_psql_char_type('varchar1'))
+        self.assertFalse(self.subject.is_psql_char_type('1character'))
+
+    def test_is_psql_char_type_excludes_list(self):
+        self.assertTrue(self.subject.is_psql_char_type('text', ['varchar','char']))
+        self.assertFalse(self.subject.is_psql_char_type('text', ['text','char']))
+        self.assertFalse(self.subject.is_psql_char_type('varchar', 'varchar'))
+
+    def test_is_psql_boolean_type(self):
+        self.assertTrue(self.subject.is_psql_boolean_type('boolean'))
+        self.assertFalse(self.subject.is_psql_boolean_type('not boolean'))
+
 if __name__ == '__main__':
     unittest.main()

http://git-wip-us.apache.org/repos/asf/madlib/blob/c902cb60/src/ports/postgres/modules/utilities/utilities.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/utilities.py_in b/src/ports/postgres/modules/utilities/utilities.py_in
index 324ed6d..4131a80 100644
--- a/src/ports/postgres/modules/utilities/utilities.py_in
+++ b/src/ports/postgres/modules/utilities/utilities.py_in
@@ -210,6 +210,30 @@ def is_psql_int_type(arg, exclude=None):
     return (arg in to_check_types)
 # -------------------------------------------------------------------------
 
+
+def is_psql_char_type(arg, exclude_list=[]):
+    """
+    This function checks if the given arg is one of the predefined postgres
+    character types
+    :param arg:
+    :param exclude: Optionally exclude one or more types from the comparison
+    :return: True if it is one of the character types, else False.
+
+    """
+    if not isinstance(exclude_list, list) :
+        exclude_list = [exclude_list]
+
+    text_types = set(['text', 'varchar', 'character varying', 'char', 'character'])
+    return arg in text_types - set(exclude_list)
+
+def is_psql_boolean_type(arg):
+        """
+        This function checks if the given arg is one of type postgres boolean
+        :param arg:
+        :return: True if it is boolean, else False.
+        """
+        return arg == 'boolean'
+
 def is_string_formatted_as_array_expression(string_to_match):
     """
     Return true if the string is formatted as array[<something>], else false


[3/3] madlib git commit: Minibatch Preprocessor: Create temp table for standardization.

Posted by nj...@apache.org.
Minibatch Preprocessor: Create temp table for standardization.

We ran a few experiments, and the results showed that creating a
temp table for standardization is faster than using a subquery.
This commit therefore creates a temp table for the standardization.
Before this commit, we called the `utils_normalize_data` function
inside the main query; now we create a temp table from the output of
`utils_normalize_data` and use that table in the main query.

Closes #260


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/259e0041
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/259e0041
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/259e0041

Branch: refs/heads/master
Commit: 259e00416a268512cee80513fb24bcb2ae9fb273
Parents: b886381
Author: Nikhil Kak <nk...@pivotal.io>
Authored: Fri Apr 6 13:55:46 2018 -0700
Committer: Nandish Jayaram <nj...@apache.org>
Committed: Fri Apr 13 17:16:50 2018 -0700

----------------------------------------------------------------------
 .../utilities/minibatch_preprocessing.py_in     | 70 ++++++++++++--------
 .../test_minibatch_preprocessing.py_in          |  2 +-
 2 files changed, 42 insertions(+), 30 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/madlib/blob/259e0041/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in
index 856c7e4..89eea6e 100644
--- a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in
+++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in
@@ -169,6 +169,7 @@ class MiniBatchPreProcessor:
         plpy.execute(sql)
 
         standardizer.create_output_standardization_table()
+        standardizer.drop_standardized_table()
         MiniBatchSummarizer.create_output_summary_table(
             self.output_summary_table,
             self.source_table,
@@ -365,6 +366,7 @@ class MiniBatchStandardizer:
         self.x_mean_table = unique_string(desp='x_mean_table')
         self.x_mean_str = None
         self.x_std_dev_str = None
+        self.standardized_table = unique_string(desp='std_table')
         self._calculate_mean_and_std_dev_str()
 
     def _calculate_mean_and_std_dev_str(self):
@@ -395,42 +397,49 @@ class MiniBatchStandardizer:
 
     def get_query_for_standardizing(self):
         if self.grouping_cols:
-            return self._get_query_for_standardizing_with_grouping()
+            query = self._get_query_for_standardizing_with_grouping()
         else:
-            return self._get_query_for_standardizing_without_grouping()
+            query = self._get_query_for_standardizing_without_grouping()
+        plpy.execute(query)
+
+        return "select * from {0}".format(self.standardized_table)
 
     def _get_query_for_standardizing_without_grouping(self):
         return """
-            SELECT
-                {self.dep_var_array_str} AS {dep_colname},
-                {self.schema_madlib}.utils_normalize_data(
-                    {self.indep_var_array_str},
-                    '{self.x_mean_str}'::double precision[],
-                    '{self.x_std_dev_str}'::double precision[]
-                ) AS {ind_colname}
-            FROM {self.source_table}
-        """.format(dep_colname=MINIBATCH_OUTPUT_DEPENDENT_COLNAME,
-                   ind_colname=MINIBATCH_OUTPUT_INDEPENDENT_COLNAME,
-                   self=self)
+          CREATE TEMP TABLE {self.standardized_table} AS
+          SELECT
+            {self.dep_var_array_str} AS {dep_colname},
+            {self.schema_madlib}.utils_normalize_data(
+              {self.indep_var_array_str},
+              '{self.x_mean_str}'::double precision[],
+              '{self.x_std_dev_str}'::double precision[]
+            ) AS {ind_colname}
+          FROM {self.source_table}
+          """.format(dep_colname=MINIBATCH_OUTPUT_DEPENDENT_COLNAME,
+                     ind_colname=MINIBATCH_OUTPUT_INDEPENDENT_COLNAME,
+                     self=self)
+
 
     def _get_query_for_standardizing_with_grouping(self):
         return """
-            SELECT
-                {self.dep_var_array_str} as {dep_colname},
-                {self.schema_madlib}.utils_normalize_data(
-                    {self.indep_var_array_str},
-                    __x__.mean::double precision[],
-                    __x__.std::double precision[]
-                ) AS {ind_colname},
-                {self.source_table}.{self.grouping_cols}
-            FROM
-                {self.source_table}
-                INNER JOIN
-                {self.x_mean_table} AS __x__
-                ON  {self.source_table}.{self.grouping_cols} = __x__.{self.grouping_cols}
-        """.format(self=self,
-                   dep_colname = MINIBATCH_OUTPUT_DEPENDENT_COLNAME,
-                   ind_colname = MINIBATCH_OUTPUT_INDEPENDENT_COLNAME)
+        CREATE TEMP TABLE {self.standardized_table} AS
+          SELECT
+            {self.dep_var_array_str} AS {dep_colname},
+            {self.schema_madlib}.utils_normalize_data(
+                {self.indep_var_array_str},
+                __x__.mean::double precision[],
+                __x__.std::double precision[]
+            ) AS {ind_colname},
+            {self.source_table}.{self.grouping_cols}
+        FROM
+          {self.source_table} 
+          INNER JOIN 
+          {self.x_mean_table} AS __x__ 
+          ON  {self.source_table}.{self.grouping_cols} = __x__.{self.grouping_cols}
+        """.format(
+            self=self,
+            dep_colname=MINIBATCH_OUTPUT_DEPENDENT_COLNAME,
+            ind_colname=MINIBATCH_OUTPUT_INDEPENDENT_COLNAME)
 
     def create_output_standardization_table(self):
         if self.grouping_cols:
@@ -446,6 +455,9 @@ class MiniBatchStandardizer:
             """.format(self=self)
         plpy.execute(query)
 
+    def drop_standardized_table(self):
+        plpy.execute("DROP TABLE IF EXISTS {0}".format(self.standardized_table))
+
 
 class MiniBatchSummarizer:
     @staticmethod

http://git-wip-us.apache.org/repos/asf/madlib/blob/259e0041/src/ports/postgres/modules/utilities/test/unit_tests/test_minibatch_preprocessing.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/test/unit_tests/test_minibatch_preprocessing.py_in b/src/ports/postgres/modules/utilities/test/unit_tests/test_minibatch_preprocessing.py_in
index f458303..75cc044 100644
--- a/src/ports/postgres/modules/utilities/test/unit_tests/test_minibatch_preprocessing.py_in
+++ b/src/ports/postgres/modules/utilities/test/unit_tests/test_minibatch_preprocessing.py_in
@@ -135,7 +135,7 @@ m4_changequote(`<!', `!>')
 #                                                              self.grouping_cols,
 #                                                              1)
 #         preprocessor_obj.minibatch_preprocessor()
-#         self.assert_(True)
+#         self.assertEqual(1, drop_table_mock.call_count)
 
 
 class MiniBatchQueryFormatterTestCase(unittest.TestCase):


[2/3] madlib git commit: MiniBatch Preprocessor: Check for all character types for dependent col

Posted by nj...@apache.org.
MiniBatch Preprocessor: Check for all character types for dependent col

This commit allows the dependent column to be of any postgres
character type instead of just `text`.


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/b8863813
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/b8863813
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/b8863813

Branch: refs/heads/master
Commit: b886381303c0cd4deff9468558a95455a64f4699
Parents: c902cb6
Author: Nikhil Kak <nk...@pivotal.io>
Authored: Fri Apr 6 11:42:41 2018 -0700
Committer: Nandish Jayaram <nj...@apache.org>
Committed: Fri Apr 13 17:16:50 2018 -0700

----------------------------------------------------------------------
 .../modules/utilities/minibatch_preprocessing.py_in       | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/madlib/blob/b8863813/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in
index 1c53a59..856c7e4 100644
--- a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in
+++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in
@@ -30,6 +30,8 @@ from utilities import add_postfix
 from utilities import _assert
 from utilities import get_seg_number
 from utilities import is_platform_pg
+from utilities import is_psql_boolean_type
+from utilities import is_psql_char_type
 from utilities import is_psql_numeric_type
 from utilities import is_psql_int_type
 from utilities import is_string_formatted_as_array_expression
@@ -288,7 +290,8 @@ class MiniBatchQueryFormatter:
         """
         dep_var_class_value_str = 'NULL::TEXT'
         is_dep_var_int_type = is_psql_int_type(dependent_var_dbtype)
-        to_one_hot_encode = (dependent_var_dbtype in ("text", "boolean") or
+        to_one_hot_encode = (is_psql_char_type(dependent_var_dbtype) or
+                             is_psql_boolean_type(dependent_var_dbtype) or
                                 (to_one_hot_encode_int and
                                     is_dep_var_int_type))
         if to_one_hot_encode:
@@ -314,8 +317,9 @@ class MiniBatchQueryFormatter:
         elif is_psql_numeric_type(dependent_var_dbtype):
             dep_var_array_str = 'ARRAY[{0}]'.format(dependent_varname)
         else:
-            plpy.error("Invalid dependent variable type. It should be text, "
-                       "boolean, numeric, or array.")
+            plpy.error("""Invalid dependent variable type. It should be character,
+                boolean, numeric, or array.""")
+
         return dep_var_array_str, dep_var_class_value_str
 
     def _get_one_hot_encoded_str(self, var_name, var_classes, to_quote=True):