You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by nj...@apache.org on 2018/04/14 00:22:36 UTC
[1/3] madlib git commit: Utilities: Add functions for postgres
character/boolean type comparison.
Repository: madlib
Updated Branches:
refs/heads/master b2efa7d0b -> 259e00416
Utilities: Add functions for postgres character/boolean type comparison.
This commit adds two functions to check if a given type matches
one of the predefined postgres character or boolean types.
Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/c902cb60
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/c902cb60
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/c902cb60
Branch: refs/heads/master
Commit: c902cb60fcbc436fadee8902bb24cbbd02eaad0b
Parents: b2efa7d
Author: Nikhil Kak <nk...@pivotal.io>
Authored: Fri Apr 6 11:35:16 2018 -0700
Committer: Nandish Jayaram <nj...@apache.org>
Committed: Fri Apr 13 17:16:38 2018 -0700
----------------------------------------------------------------------
.../test/unit_tests/test_utilities.py_in | 20 ++++++++++++++++
.../postgres/modules/utilities/utilities.py_in | 24 ++++++++++++++++++++
2 files changed, 44 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/madlib/blob/c902cb60/src/ports/postgres/modules/utilities/test/unit_tests/test_utilities.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/test/unit_tests/test_utilities.py_in b/src/ports/postgres/modules/utilities/test/unit_tests/test_utilities.py_in
index 0f38a05..7ad6253 100644
--- a/src/ports/postgres/modules/utilities/test/unit_tests/test_utilities.py_in
+++ b/src/ports/postgres/modules/utilities/test/unit_tests/test_utilities.py_in
@@ -196,5 +196,25 @@ class UtilitiesTestCase(unittest.TestCase):
{'class': 'c'}]),
{'class': ['a', 'b', 'c']})
+ def test_is_psql_char_type(self):
+ self.assertTrue(self.subject.is_psql_char_type('text'))
+ self.assertTrue(self.subject.is_psql_char_type('varchar'))
+ self.assertTrue(self.subject.is_psql_char_type('character varying'))
+ self.assertTrue(self.subject.is_psql_char_type('char'))
+ self.assertTrue(self.subject.is_psql_char_type('character'))
+
+ self.assertFalse(self.subject.is_psql_char_type('c1har'))
+ self.assertFalse(self.subject.is_psql_char_type('varchar1'))
+ self.assertFalse(self.subject.is_psql_char_type('1character'))
+
+ def test_is_psql_char_type_excludes_list(self):
+ self.assertTrue(self.subject.is_psql_char_type('text', ['varchar','char']))
+ self.assertFalse(self.subject.is_psql_char_type('text', ['text','char']))
+ self.assertFalse(self.subject.is_psql_char_type('varchar', 'varchar'))
+
+ def test_is_psql_boolean_type(self):
+ self.assertTrue(self.subject.is_psql_boolean_type('boolean'))
+ self.assertFalse(self.subject.is_psql_boolean_type('not boolean'))
+
if __name__ == '__main__':
unittest.main()
http://git-wip-us.apache.org/repos/asf/madlib/blob/c902cb60/src/ports/postgres/modules/utilities/utilities.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/utilities.py_in b/src/ports/postgres/modules/utilities/utilities.py_in
index 324ed6d..4131a80 100644
--- a/src/ports/postgres/modules/utilities/utilities.py_in
+++ b/src/ports/postgres/modules/utilities/utilities.py_in
@@ -210,6 +210,30 @@ def is_psql_int_type(arg, exclude=None):
return (arg in to_check_types)
# -------------------------------------------------------------------------
+
+def is_psql_char_type(arg, exclude_list=[]):
+ """
+ This function checks if the given arg is one of the predefined postgres
+ character types
+ :param arg:
+ :param exclude_list: Optionally exclude one or more types from the comparison
+ :return: True if it is one of the character types, else False.
+
+ """
+ if not isinstance(exclude_list, list) :
+ exclude_list = [exclude_list]
+
+ text_types = set(['text', 'varchar', 'character varying', 'char', 'character'])
+ return arg in text_types - set(exclude_list)
+
+def is_psql_boolean_type(arg):
+ """
+ This function checks if the given arg is the postgres boolean type
+ :param arg:
+ :return: True if it is boolean, else False.
+ """
+ return arg == 'boolean'
+
def is_string_formatted_as_array_expression(string_to_match):
"""
Return true if the string is formatted as array[<something>], else false
[3/3] madlib git commit: Minibatch Preprocessor: Create temp table
for standardization.
Posted by nj...@apache.org.
Minibatch Preprocessor: Create temp table for standardization.
We ran a few experiments, and the results showed that creating a
temp table for standardization is faster than using a subquery, so
this commit creates a temp table for the standardization.
Before this commit, we called the `utils_normalize_data` function
inside the main query; now we create a temp table from the output of
`utils_normalize_data` and use that table in the main query.
Closes #260
Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/259e0041
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/259e0041
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/259e0041
Branch: refs/heads/master
Commit: 259e00416a268512cee80513fb24bcb2ae9fb273
Parents: b886381
Author: Nikhil Kak <nk...@pivotal.io>
Authored: Fri Apr 6 13:55:46 2018 -0700
Committer: Nandish Jayaram <nj...@apache.org>
Committed: Fri Apr 13 17:16:50 2018 -0700
----------------------------------------------------------------------
.../utilities/minibatch_preprocessing.py_in | 70 ++++++++++++--------
.../test_minibatch_preprocessing.py_in | 2 +-
2 files changed, 42 insertions(+), 30 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/madlib/blob/259e0041/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in
index 856c7e4..89eea6e 100644
--- a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in
+++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in
@@ -169,6 +169,7 @@ class MiniBatchPreProcessor:
plpy.execute(sql)
standardizer.create_output_standardization_table()
+ standardizer.drop_standardized_table()
MiniBatchSummarizer.create_output_summary_table(
self.output_summary_table,
self.source_table,
@@ -365,6 +366,7 @@ class MiniBatchStandardizer:
self.x_mean_table = unique_string(desp='x_mean_table')
self.x_mean_str = None
self.x_std_dev_str = None
+ self.standardized_table = unique_string(desp='std_table')
self._calculate_mean_and_std_dev_str()
def _calculate_mean_and_std_dev_str(self):
@@ -395,42 +397,49 @@ class MiniBatchStandardizer:
def get_query_for_standardizing(self):
if self.grouping_cols:
- return self._get_query_for_standardizing_with_grouping()
+ query = self._get_query_for_standardizing_with_grouping()
else:
- return self._get_query_for_standardizing_without_grouping()
+ query = self._get_query_for_standardizing_without_grouping()
+ plpy.execute(query)
+
+ return "select * from {0}".format(self.standardized_table)
def _get_query_for_standardizing_without_grouping(self):
return """
- SELECT
- {self.dep_var_array_str} AS {dep_colname},
- {self.schema_madlib}.utils_normalize_data(
- {self.indep_var_array_str},
- '{self.x_mean_str}'::double precision[],
- '{self.x_std_dev_str}'::double precision[]
- ) AS {ind_colname}
- FROM {self.source_table}
- """.format(dep_colname=MINIBATCH_OUTPUT_DEPENDENT_COLNAME,
- ind_colname=MINIBATCH_OUTPUT_INDEPENDENT_COLNAME,
- self=self)
+ CREATE TEMP TABLE {self.standardized_table} AS
+ SELECT
+ {self.dep_var_array_str} AS {dep_colname},
+ {self.schema_madlib}.utils_normalize_data(
+ {self.indep_var_array_str},
+ '{self.x_mean_str}'::double precision[],
+ '{self.x_std_dev_str}'::double precision[]
+ ) AS {ind_colname}
+ FROM {self.source_table}
+ """.format(dep_colname=MINIBATCH_OUTPUT_DEPENDENT_COLNAME,
+ ind_colname=MINIBATCH_OUTPUT_INDEPENDENT_COLNAME,
+ self=self)
+
def _get_query_for_standardizing_with_grouping(self):
return """
- SELECT
- {self.dep_var_array_str} as {dep_colname},
- {self.schema_madlib}.utils_normalize_data(
- {self.indep_var_array_str},
- __x__.mean::double precision[],
- __x__.std::double precision[]
- ) AS {ind_colname},
- {self.source_table}.{self.grouping_cols}
- FROM
- {self.source_table}
- INNER JOIN
- {self.x_mean_table} AS __x__
- ON {self.source_table}.{self.grouping_cols} = __x__.{self.grouping_cols}
- """.format(self=self,
- dep_colname = MINIBATCH_OUTPUT_DEPENDENT_COLNAME,
- ind_colname = MINIBATCH_OUTPUT_INDEPENDENT_COLNAME)
+ CREATE TEMP TABLE {self.standardized_table} AS
+ SELECT
+ {self.dep_var_array_str} AS {dep_colname},
+ {self.schema_madlib}.utils_normalize_data(
+ {self.indep_var_array_str},
+ __x__.mean::double precision[],
+ __x__.std::double precision[]
+ ) AS {ind_colname},
+ {self.source_table}.{self.grouping_cols}
+ FROM
+ {self.source_table}
+ INNER JOIN
+ {self.x_mean_table} AS __x__
+ ON {self.source_table}.{self.grouping_cols} = __x__.{self.grouping_cols}
+ """.format(
+ self=self,
+ dep_colname=MINIBATCH_OUTPUT_DEPENDENT_COLNAME,
+ ind_colname=MINIBATCH_OUTPUT_INDEPENDENT_COLNAME)
def create_output_standardization_table(self):
if self.grouping_cols:
@@ -446,6 +455,9 @@ class MiniBatchStandardizer:
""".format(self=self)
plpy.execute(query)
+ def drop_standardized_table(self):
+ plpy.execute("DROP TABLE IF EXISTS {0}".format(self.standardized_table))
+
class MiniBatchSummarizer:
@staticmethod
http://git-wip-us.apache.org/repos/asf/madlib/blob/259e0041/src/ports/postgres/modules/utilities/test/unit_tests/test_minibatch_preprocessing.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/test/unit_tests/test_minibatch_preprocessing.py_in b/src/ports/postgres/modules/utilities/test/unit_tests/test_minibatch_preprocessing.py_in
index f458303..75cc044 100644
--- a/src/ports/postgres/modules/utilities/test/unit_tests/test_minibatch_preprocessing.py_in
+++ b/src/ports/postgres/modules/utilities/test/unit_tests/test_minibatch_preprocessing.py_in
@@ -135,7 +135,7 @@ m4_changequote(`<!', `!>')
# self.grouping_cols,
# 1)
# preprocessor_obj.minibatch_preprocessor()
-# self.assert_(True)
+# self.assertEqual(1, drop_table_mock.call_count)
class MiniBatchQueryFormatterTestCase(unittest.TestCase):
[2/3] madlib git commit: MiniBatch Preprocessor: Check for all
character types for dependent col
Posted by nj...@apache.org.
MiniBatch Preprocessor: Check for all character types for dependent col
This commit allows the dependent column to be of any of the
postgres character types instead of just `text`.
Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/b8863813
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/b8863813
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/b8863813
Branch: refs/heads/master
Commit: b886381303c0cd4deff9468558a95455a64f4699
Parents: c902cb6
Author: Nikhil Kak <nk...@pivotal.io>
Authored: Fri Apr 6 11:42:41 2018 -0700
Committer: Nandish Jayaram <nj...@apache.org>
Committed: Fri Apr 13 17:16:50 2018 -0700
----------------------------------------------------------------------
.../modules/utilities/minibatch_preprocessing.py_in | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/madlib/blob/b8863813/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in
index 1c53a59..856c7e4 100644
--- a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in
+++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in
@@ -30,6 +30,8 @@ from utilities import add_postfix
from utilities import _assert
from utilities import get_seg_number
from utilities import is_platform_pg
+from utilities import is_psql_boolean_type
+from utilities import is_psql_char_type
from utilities import is_psql_numeric_type
from utilities import is_psql_int_type
from utilities import is_string_formatted_as_array_expression
@@ -288,7 +290,8 @@ class MiniBatchQueryFormatter:
"""
dep_var_class_value_str = 'NULL::TEXT'
is_dep_var_int_type = is_psql_int_type(dependent_var_dbtype)
- to_one_hot_encode = (dependent_var_dbtype in ("text", "boolean") or
+ to_one_hot_encode = (is_psql_char_type(dependent_var_dbtype) or
+ is_psql_boolean_type(dependent_var_dbtype) or
(to_one_hot_encode_int and
is_dep_var_int_type))
if to_one_hot_encode:
@@ -314,8 +317,9 @@ class MiniBatchQueryFormatter:
elif is_psql_numeric_type(dependent_var_dbtype):
dep_var_array_str = 'ARRAY[{0}]'.format(dependent_varname)
else:
- plpy.error("Invalid dependent variable type. It should be text, "
- "boolean, numeric, or array.")
+ plpy.error("""Invalid dependent variable type. It should be character,
+ boolean, numeric, or array.""")
+
return dep_var_array_str, dep_var_class_value_str
def _get_one_hot_encoded_str(self, var_name, var_classes, to_quote=True):