You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2018/07/12 17:09:26 UTC
madlib git commit: Utilites: Add CTAS while dropping some columns
Repository: madlib
Updated Branches:
refs/heads/master 59ad96a04 -> 5f80ba978
Utilites: Add CTAS while dropping some columns
JIRA: MADLIB-1241
This commit adds function to create a new table from existing table
while dropping some of the columns of the original table.
Closes #282
Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/5f80ba97
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/5f80ba97
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/5f80ba97
Branch: refs/heads/master
Commit: 5f80ba9781efb3526e422a0867ae1d34b49c7ac8
Parents: 59ad96a
Author: Rahul Iyer <ri...@apache.org>
Authored: Thu Jul 12 10:08:52 2018 -0700
Committer: Rahul Iyer <ri...@apache.org>
Committed: Thu Jul 12 10:08:52 2018 -0700
----------------------------------------------------------------------
doc/mainpage.dox.in | 16 ++--
.../utilities/test/drop_madlib_temp.ic.sql_in | 23 ------
.../utilities/test/drop_madlib_temp.sql_in | 16 ----
.../modules/utilities/test/utilities.ic.sql_in | 58 +++++++++++++
.../modules/utilities/test/utilities.sql_in | 87 ++++++++++++++++++++
.../postgres/modules/utilities/utilities.py_in | 37 ++++++++-
.../postgres/modules/utilities/utilities.sql_in | 43 +++++++++-
7 files changed, 227 insertions(+), 53 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/madlib/blob/5f80ba97/doc/mainpage.dox.in
----------------------------------------------------------------------
diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in
index 8681eb4..e41e6c9 100644
--- a/doc/mainpage.dox.in
+++ b/doc/mainpage.dox.in
@@ -261,12 +261,9 @@ Contains graph algorithms.
@ingroup grp_topic_modelling
-@defgroup grp_utility_functions Utility Functions
- @defgroup @grp_utilities Developer Database Functions
- @ingroup grp_utility_functions
-
+@defgroup grp_other_functions Other Functions
@defgroup grp_linear_solver Linear Solvers
- @ingroup grp_utility_functions
+ @ingroup grp_other_functions
@{A collection of methods that implement solutions for systems of consistent linear equations. @}
@defgroup grp_dense_linear_solver Dense Linear Systems
@@ -276,13 +273,16 @@ Contains graph algorithms.
@ingroup grp_linear_solver
@defgroup grp_minibatch_preprocessing Mini-Batch Preprocessor
- @ingroup grp_utility_functions
+ @ingroup grp_other_functions
@defgroup grp_pmml PMML Export
- @ingroup grp_utility_functions
+ @ingroup grp_other_functions
@defgroup grp_text_utilities Term Frequency
- @ingroup grp_utility_functions
+ @ingroup grp_other_functions
+
+ @defgroup @grp_utilities Utilities
+ @ingroup grp_other_functions
@defgroup grp_early_stage Early Stage Development
@brief A collection of implementations which are in early stage of development.
http://git-wip-us.apache.org/repos/asf/madlib/blob/5f80ba97/src/ports/postgres/modules/utilities/test/drop_madlib_temp.ic.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/test/drop_madlib_temp.ic.sql_in b/src/ports/postgres/modules/utilities/test/drop_madlib_temp.ic.sql_in
deleted file mode 100644
index 7879385..0000000
--- a/src/ports/postgres/modules/utilities/test/drop_madlib_temp.ic.sql_in
+++ /dev/null
@@ -1,23 +0,0 @@
-/* ----------------------------------------------------------------------- *//**
- *
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- *//* ----------------------------------------------------------------------- */
-
--- cleanup
-SELECT cleanup_madlib_temp_tables(quote_ident(current_schema()));
http://git-wip-us.apache.org/repos/asf/madlib/blob/5f80ba97/src/ports/postgres/modules/utilities/test/drop_madlib_temp.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/test/drop_madlib_temp.sql_in b/src/ports/postgres/modules/utilities/test/drop_madlib_temp.sql_in
deleted file mode 100644
index f902361..0000000
--- a/src/ports/postgres/modules/utilities/test/drop_madlib_temp.sql_in
+++ /dev/null
@@ -1,16 +0,0 @@
-CREATE TABLE "__madlib_temp_Quoted"(b varchar);
-CREATE TABLE __madlib_temp_non_quoted(a text);
--- assert that madlib_temp tables are created
-SELECT assert(count(*) >= 2, 'Error setting up madlib_temp in schema ' || quote_ident(current_schema()))
-FROM pg_tables
-WHERE tablename LIKE '%madlib\_temp%'
- AND quote_ident(schemaname) = quote_ident(current_schema());
-
--- cleanup
-SELECT cleanup_madlib_temp_tables(quote_ident(current_schema()));
-
--- assert that madlib_temp tables are dropped
-SELECT assert(count(*) = 0, 'Error cleaning up madlib_temp in schema ' || quote_ident(current_schema()))
-FROM pg_tables
-WHERE tablename LIKE '%madlib\_temp%'
- AND quote_ident(schemaname) = quote_ident(current_schema());
http://git-wip-us.apache.org/repos/asf/madlib/blob/5f80ba97/src/ports/postgres/modules/utilities/test/utilities.ic.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/test/utilities.ic.sql_in b/src/ports/postgres/modules/utilities/test/utilities.ic.sql_in
new file mode 100644
index 0000000..832e1de
--- /dev/null
+++ b/src/ports/postgres/modules/utilities/test/utilities.ic.sql_in
@@ -0,0 +1,58 @@
+/* ----------------------------------------------------------------------- */
+/**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *//* ----------------------------------------------------------------------- */
+
+-- cleanup
+SELECT cleanup_madlib_temp_tables(quote_ident(current_schema()));
+
+-- test dropcols
+DROP TABLE IF EXISTS dt_golf CASCADE;
+CREATE TABLE dt_golf (
+ id integer NOT NULL,
+ id_2 integer,
+ "OUTLOOK" text,
+ temperature double precision,
+ humidity double precision,
+ "Cont,features" double precision[],
+ cat_features text[],
+ windy boolean,
+ class text
+) ;
+
+INSERT INTO dt_golf (id,"OUTLOOK",temperature,humidity,"Cont,features",cat_features, windy,class) VALUES
+(1, 'sunny', 85, 85,ARRAY[85, 85], ARRAY['a', 'b'], false, 'Don''t Play'),
+(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['a', 'b'], true, 'Don''t Play'),
+(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['a', 'b'], false, 'Play'),
+(4, 'rain', 70, NULL, ARRAY[70, 96], ARRAY['a', 'b'], false, 'Play'),
+(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['a', 'b'], false, 'Play'),
+(6, 'rain', NULL, 70, ARRAY[65, 70], ARRAY['a', 'b'], true, 'Don''t Play'),
+(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['c', 'b'], NULL , 'Play'),
+(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['a', 'b'], false, 'Don''t Play'),
+(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['a', 'b'], false, 'Play'),
+(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['a', 'b'], false, 'Play'),
+(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['a', 'd'], true, 'Play'),
+(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['c', 'b'], NULL, 'Play'),
+(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['a', 'b'], false, 'Play'),
+(15, NULL, 81, 75, ARRAY[81, 75], ARRAY['a', 'b'], false, 'Play'),
+(16, 'overcast', NULL, 75, ARRAY[81, 75], ARRAY['a', 'd'], false, 'Play'),
+(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['c', 'b'], true, 'Don''t Play');
+
+SELECT dropcols('dt_golf', 'dt_golf2', '"OUTLOOK", "Cont,features", cat_features');
http://git-wip-us.apache.org/repos/asf/madlib/blob/5f80ba97/src/ports/postgres/modules/utilities/test/utilities.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/test/utilities.sql_in b/src/ports/postgres/modules/utilities/test/utilities.sql_in
new file mode 100644
index 0000000..d3525ce
--- /dev/null
+++ b/src/ports/postgres/modules/utilities/test/utilities.sql_in
@@ -0,0 +1,87 @@
+/* ----------------------------------------------------------------------- */
+/**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+/* ----------------------------------------------------------------------- */
+
+CREATE TABLE "__madlib_temp_Quoted"(b varchar);
+CREATE TABLE __madlib_temp_non_quoted(a text);
+-- assert that madlib_temp tables are created
+SELECT assert(count(*) >= 2, 'Error setting up madlib_temp in schema ' || quote_ident(current_schema()))
+FROM pg_tables
+WHERE tablename LIKE '%madlib\_temp%'
+ AND quote_ident(schemaname) = quote_ident(current_schema());
+
+-- cleanup
+SELECT cleanup_madlib_temp_tables(quote_ident(current_schema()));
+
+-- assert that madlib_temp tables are dropped
+SELECT assert(count(*) = 0, 'Error cleaning up madlib_temp in schema ' || quote_ident(current_schema()))
+FROM pg_tables
+WHERE tablename LIKE '%madlib\_temp%'
+ AND quote_ident(schemaname) = quote_ident(current_schema());
+
+-- test dropcols
+DROP TABLE IF EXISTS dt_golf CASCADE;
+CREATE TABLE dt_golf (
+ id integer NOT NULL,
+ id_2 integer,
+ "OUTLOOK" text,
+ temperature double precision,
+ "len$$'%*()gth" double precision,
+ "Cont,features" double precision[],
+ cat_features text[],
+ windy boolean,
+ class text
+) ;
+
+INSERT INTO dt_golf (id,"OUTLOOK",temperature,"len$$'%*()gth","Cont,features",cat_features, windy,class) VALUES
+(1, 'sunny', 85, 85,ARRAY[85, 85], ARRAY['a', 'b'], false, 'Don''t Play'),
+(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['a', 'b'], true, 'Don''t Play'),
+(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['a', 'b'], false, 'Play'),
+(4, 'rain', 70, NULL, ARRAY[70, 96], ARRAY['a', 'b'], false, 'Play'),
+(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['a', 'b'], false, 'Play'),
+(6, 'rain', NULL, 70, ARRAY[65, 70], ARRAY['a', 'b'], true, 'Don''t Play'),
+(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['c', 'b'], NULL , 'Play'),
+(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['a', 'b'], false, 'Don''t Play'),
+(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['a', 'b'], false, 'Play'),
+(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['a', 'b'], false, 'Play'),
+(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['a', 'd'], true, 'Play'),
+(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['c', 'b'], NULL, 'Play'),
+(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['a', 'b'], false, 'Play'),
+(15, NULL, 81, 75, ARRAY[81, 75], ARRAY['a', 'b'], false, 'Play'),
+(16, 'overcast', NULL, 75, ARRAY[81, 75], ARRAY['a', 'd'], false, 'Play'),
+(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['c', 'b'], true, 'Don''t Play');
+
+SELECT dropcols('dt_golf', 'dt_golf2', '"OUTLOOK", "Cont,features", cat_features');
+
+-- test if columns have been dropped
+SELECT assert(n_cols = 0, 'Some of the columns have not been dropped')
+FROM (
+ SELECT count(*) AS n_cols
+ FROM information_schema.columns
+ WHERE table_name='dt_golf2' AND
+ (column_name in ('"OUTLOOK"', '"Cont,features"', 'cat_features'))
+) q;
+
+-- test if the retained columns are present in output table
+SELECT
+ id, id_2, temperature, "len$$'%*()gth", windy, class
+FROM dt_golf2;
http://git-wip-us.apache.org/repos/asf/madlib/blob/5f80ba97/src/ports/postgres/modules/utilities/utilities.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/utilities.py_in b/src/ports/postgres/modules/utilities/utilities.py_in
index 2bf08a7..55b6983 100644
--- a/src/ports/postgres/modules/utilities/utilities.py_in
+++ b/src/ports/postgres/modules/utilities/utilities.py_in
@@ -7,6 +7,7 @@ from distutils.util import strtobool
from validate_args import _get_table_schema_names
from validate_args import get_first_schema
+from validate_args import get_cols
from validate_args import cols_in_tbl_valid
from validate_args import does_exclude_reserved
from validate_args import explicit_bool_to_text
@@ -840,8 +841,8 @@ def get_grouping_col_str(schema_madlib, module_name, reserved_cols,
cols_in_tbl_valid(source_table, grouping_col_array, module_name)
does_exclude_reserved(grouping_col_array, reserved_cols)
grp_array_w_cast = explicit_bool_to_text(source_table,
- grouping_col_array,
- schema_madlib)
+ grouping_col_array,
+ schema_madlib)
grouping_str = ', '.join(i + "::text" for i in grp_array_w_cast)
else:
grouping_str = "Null"
@@ -911,3 +912,35 @@ def validate_module_input_params(source_table, output_table, independent_varname
grouping_cols=grouping_cols,
source_table=source_table))
# ------------------------------------------------------------------------
+
+
+def create_table_drop_cols(source_table, out_table, cols_to_drop, **kwargs):
+ """ Create copy of table while dropping some of the columns
+ Args:
+ @param source_table str. Name of the source table
+ @param out_table str. Name of the output table
+ @param cols_to_drop str. Comma-separated list of columns to drop
+ """
+ input_tbl_valid(source_table, 'Utilities')
+ output_tbl_valid(out_table, 'Utilities')
+ _assert(cols_to_drop and cols_to_drop.strip(),
+ "Utilities error: cols_to_drop cannot be empty or NULL")
+
+ source_table_cols = get_cols(source_table)
+ cols_to_drop_list = split_quoted_delimited_str(cols_to_drop)
+ cols_not_in_source = set(cols_to_drop_list) - set(source_table_cols)
+ _assert(not cols_not_in_source,
+ "Utilities error: Some column(s) in cols_to_drop are not present "
+ "in source table")
+
+ cols_to_retain = [c for c in source_table_cols if c not in cols_to_drop_list]
+ _assert(cols_to_retain,
+ "Utilities error: No valid columns for the output table")
+ plpy.execute("""
+ CREATE TABLE {out_table} AS
+ SELECT {cols}
+ FROM {source_table}
+ """.format(cols=', '.join(cols_to_retain),
+ out_table=out_table,
+ source_table=source_table))
+# ------------------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/madlib/blob/5f80ba97/src/ports/postgres/modules/utilities/utilities.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/utilities.sql_in b/src/ports/postgres/modules/utilities/utilities.sql_in
index 6fdb463..4993afe 100644
--- a/src/ports/postgres/modules/utilities/utilities.sql_in
+++ b/src/ports/postgres/modules/utilities/utilities.sql_in
@@ -1,4 +1,5 @@
-/* ----------------------------------------------------------------------- *//**
+/* ----------------------------------------------------------------------- */
+/**
*
* @file utilities.sql_in
*
@@ -7,7 +8,8 @@
* @sa For a brief overview of utility functions, see the
* module description \ref grp_utilities.
*
- *//* ----------------------------------------------------------------------- */
+**/
+/* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
@@ -72,6 +74,12 @@ while implementing new algorithms.
<td>Drop all tables matching pattern '%madlib_temp%' in a given schema.</td>
</tr>
+ <tr>
+ <th>dropcols()</th>
+ <td>Create a new table with a subset of the columns dropped from a source table.
+ </td>
+ </tr>
+
</table>
Note: If the function cleanup_madlib_temp_tables() gives an Out-of-memory error,
@@ -207,7 +215,7 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
*
* @param source_table Source table
* @param column_name Column name in the table
- * @returns \c TRUE if it exsists and FALSE if not
+ * @returns \c TRUE if it exists and FALSE if not
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.check_if_col_exists(
source_table TEXT,
@@ -409,7 +417,7 @@ $$m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
/**
* @brief Cast any value to text.
*
- * @param val A value with any specific type.
+ * @param val A value with any specific type.
*
* @return The text format string for the value.
*
@@ -474,3 +482,30 @@ CREATE AGGREGATE MADLIB_SCHEMA.mode(double precision) (
FINALFUNC=MADLIB_SCHEMA._final_mode, --Function to call after everything has been added to array
INITCOND='{}' --Initialize an empty array when starting
);
+
+
+------------------------------------------------------------------------
+/**
+ * @brief Creates a new table with a subset of columns dropped from another
+ * source table.
+ *
+ * @param source_table Name of the table containing the source data.
+ * @param out_table Name of the generated table containing the output.
+ If a table with the same name already exists, an error will be returned.
+ * @param cols_to_drop Comma-separated string of column names from the source
+ * table to drop. An error is returned if the output table does not
+ * contain any columns or if a requested column is not present in the
+ * source table.
+ */
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dropcols (
+ source_table VARCHAR,
+ out_table VARCHAR,
+ cols_to_drop VARCHAR
+)
+RETURNS void AS $$
+PythonFunctionBodyOnly(utilities, utilities)
+ from utilities import control
+ with control.MinWarning('error'):
+ return utilities.create_table_drop_cols(source_table, out_table, cols_to_drop)
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');