You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2018/07/12 17:09:26 UTC

madlib git commit: Utilites: Add CTAS while dropping some columns

Repository: madlib
Updated Branches:
  refs/heads/master 59ad96a04 -> 5f80ba978


Utilites: Add CTAS while dropping some columns

JIRA: MADLIB-1241

This commit adds function to create a new table from existing table
while dropping some of the columns of the original table.

Closes #282


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/5f80ba97
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/5f80ba97
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/5f80ba97

Branch: refs/heads/master
Commit: 5f80ba9781efb3526e422a0867ae1d34b49c7ac8
Parents: 59ad96a
Author: Rahul Iyer <ri...@apache.org>
Authored: Thu Jul 12 10:08:52 2018 -0700
Committer: Rahul Iyer <ri...@apache.org>
Committed: Thu Jul 12 10:08:52 2018 -0700

----------------------------------------------------------------------
 doc/mainpage.dox.in                             | 16 ++--
 .../utilities/test/drop_madlib_temp.ic.sql_in   | 23 ------
 .../utilities/test/drop_madlib_temp.sql_in      | 16 ----
 .../modules/utilities/test/utilities.ic.sql_in  | 58 +++++++++++++
 .../modules/utilities/test/utilities.sql_in     | 87 ++++++++++++++++++++
 .../postgres/modules/utilities/utilities.py_in  | 37 ++++++++-
 .../postgres/modules/utilities/utilities.sql_in | 43 +++++++++-
 7 files changed, 227 insertions(+), 53 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/madlib/blob/5f80ba97/doc/mainpage.dox.in
----------------------------------------------------------------------
diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in
index 8681eb4..e41e6c9 100644
--- a/doc/mainpage.dox.in
+++ b/doc/mainpage.dox.in
@@ -261,12 +261,9 @@ Contains graph algorithms.
         @ingroup grp_topic_modelling
 
 
-@defgroup grp_utility_functions Utility Functions
-    @defgroup @grp_utilities Developer Database Functions
-    @ingroup grp_utility_functions
-
+@defgroup grp_other_functions Other Functions
     @defgroup grp_linear_solver Linear Solvers
-    @ingroup grp_utility_functions
+    @ingroup grp_other_functions
     @{A collection of methods that implement solutions for systems of consistent linear equations. @}
 
         @defgroup grp_dense_linear_solver Dense Linear Systems
@@ -276,13 +273,16 @@ Contains graph algorithms.
         @ingroup grp_linear_solver
 
     @defgroup grp_minibatch_preprocessing Mini-Batch Preprocessor
-    @ingroup grp_utility_functions
+    @ingroup grp_other_functions
 
     @defgroup grp_pmml PMML Export
-    @ingroup grp_utility_functions
+    @ingroup grp_other_functions
 
     @defgroup grp_text_utilities Term Frequency
-    @ingroup grp_utility_functions
+    @ingroup grp_other_functions
+
+    @defgroup @grp_utilities Utilities
+    @ingroup grp_other_functions
 
 @defgroup grp_early_stage Early Stage Development
 @brief A collection of implementations which are in early stage of development.

http://git-wip-us.apache.org/repos/asf/madlib/blob/5f80ba97/src/ports/postgres/modules/utilities/test/drop_madlib_temp.ic.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/test/drop_madlib_temp.ic.sql_in b/src/ports/postgres/modules/utilities/test/drop_madlib_temp.ic.sql_in
deleted file mode 100644
index 7879385..0000000
--- a/src/ports/postgres/modules/utilities/test/drop_madlib_temp.ic.sql_in
+++ /dev/null
@@ -1,23 +0,0 @@
-/* ----------------------------------------------------------------------- *//**
- *
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- *//* ----------------------------------------------------------------------- */
-
--- cleanup
-SELECT cleanup_madlib_temp_tables(quote_ident(current_schema()));

http://git-wip-us.apache.org/repos/asf/madlib/blob/5f80ba97/src/ports/postgres/modules/utilities/test/drop_madlib_temp.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/test/drop_madlib_temp.sql_in b/src/ports/postgres/modules/utilities/test/drop_madlib_temp.sql_in
deleted file mode 100644
index f902361..0000000
--- a/src/ports/postgres/modules/utilities/test/drop_madlib_temp.sql_in
+++ /dev/null
@@ -1,16 +0,0 @@
-CREATE TABLE "__madlib_temp_Quoted"(b varchar);
-CREATE TABLE __madlib_temp_non_quoted(a text);
--- assert that madlib_temp tables are created
-SELECT assert(count(*) >= 2, 'Error setting up madlib_temp in schema ' || quote_ident(current_schema()))
-FROM pg_tables
-WHERE tablename LIKE '%madlib\_temp%'
-  AND quote_ident(schemaname) = quote_ident(current_schema());
-
--- cleanup
-SELECT cleanup_madlib_temp_tables(quote_ident(current_schema()));
-
--- assert that madlib_temp tables are dropped
-SELECT assert(count(*) = 0, 'Error cleaning up madlib_temp in schema ' || quote_ident(current_schema()))
-FROM pg_tables
-WHERE tablename LIKE '%madlib\_temp%'
-  AND quote_ident(schemaname) = quote_ident(current_schema());

http://git-wip-us.apache.org/repos/asf/madlib/blob/5f80ba97/src/ports/postgres/modules/utilities/test/utilities.ic.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/test/utilities.ic.sql_in b/src/ports/postgres/modules/utilities/test/utilities.ic.sql_in
new file mode 100644
index 0000000..832e1de
--- /dev/null
+++ b/src/ports/postgres/modules/utilities/test/utilities.ic.sql_in
@@ -0,0 +1,58 @@
+/* ----------------------------------------------------------------------- */
+/**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *//* ----------------------------------------------------------------------- */
+
+-- cleanup
+SELECT cleanup_madlib_temp_tables(quote_ident(current_schema()));
+
+-- test dropcols
+DROP TABLE IF EXISTS dt_golf CASCADE;
+CREATE TABLE dt_golf (
+    id integer NOT NULL,
+    id_2 integer,
+    "OUTLOOK" text,
+    temperature double precision,
+    humidity double precision,
+    "Cont,features" double precision[],
+    cat_features text[],
+    windy boolean,
+    class text
+) ;
+
+INSERT INTO dt_golf (id,"OUTLOOK",temperature,humidity,"Cont,features",cat_features, windy,class) VALUES
+(1, 'sunny', 85, 85,ARRAY[85, 85], ARRAY['a', 'b'], false, 'Don''t Play'),
+(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['a', 'b'], true, 'Don''t Play'),
+(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['a', 'b'], false, 'Play'),
+(4, 'rain', 70, NULL, ARRAY[70, 96], ARRAY['a', 'b'], false, 'Play'),
+(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['a', 'b'], false, 'Play'),
+(6, 'rain', NULL, 70, ARRAY[65, 70], ARRAY['a', 'b'], true, 'Don''t Play'),
+(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['c', 'b'], NULL , 'Play'),
+(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['a', 'b'], false, 'Don''t Play'),
+(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['a', 'b'], false, 'Play'),
+(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['a', 'b'], false, 'Play'),
+(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['a', 'd'], true, 'Play'),
+(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['c', 'b'], NULL, 'Play'),
+(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['a', 'b'], false, 'Play'),
+(15, NULL, 81, 75, ARRAY[81, 75], ARRAY['a', 'b'], false, 'Play'),
+(16, 'overcast', NULL, 75, ARRAY[81, 75], ARRAY['a', 'd'], false, 'Play'),
+(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['c', 'b'], true, 'Don''t Play');
+
+SELECT dropcols('dt_golf', 'dt_golf2', '"OUTLOOK", "Cont,features", cat_features');

http://git-wip-us.apache.org/repos/asf/madlib/blob/5f80ba97/src/ports/postgres/modules/utilities/test/utilities.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/test/utilities.sql_in b/src/ports/postgres/modules/utilities/test/utilities.sql_in
new file mode 100644
index 0000000..d3525ce
--- /dev/null
+++ b/src/ports/postgres/modules/utilities/test/utilities.sql_in
@@ -0,0 +1,87 @@
+/* ----------------------------------------------------------------------- */
+/**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+/* ----------------------------------------------------------------------- */
+
+CREATE TABLE "__madlib_temp_Quoted"(b varchar);
+CREATE TABLE __madlib_temp_non_quoted(a text);
+-- assert that madlib_temp tables are created
+SELECT assert(count(*) >= 2, 'Error setting up madlib_temp in schema ' || quote_ident(current_schema()))
+FROM pg_tables
+WHERE tablename LIKE '%madlib\_temp%'
+  AND quote_ident(schemaname) = quote_ident(current_schema());
+
+-- cleanup
+SELECT cleanup_madlib_temp_tables(quote_ident(current_schema()));
+
+-- assert that madlib_temp tables are dropped
+SELECT assert(count(*) = 0, 'Error cleaning up madlib_temp in schema ' || quote_ident(current_schema()))
+FROM pg_tables
+WHERE tablename LIKE '%madlib\_temp%'
+  AND quote_ident(schemaname) = quote_ident(current_schema());
+
+-- test dropcols
+DROP TABLE IF EXISTS dt_golf CASCADE;
+CREATE TABLE dt_golf (
+    id integer NOT NULL,
+    id_2 integer,
+    "OUTLOOK" text,
+    temperature double precision,
+    "len$$'%*()gth" double precision,
+    "Cont,features" double precision[],
+    cat_features text[],
+    windy boolean,
+    class text
+) ;
+
+INSERT INTO dt_golf (id,"OUTLOOK",temperature,"len$$'%*()gth","Cont,features",cat_features, windy,class) VALUES
+(1, 'sunny', 85, 85,ARRAY[85, 85], ARRAY['a', 'b'], false, 'Don''t Play'),
+(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['a', 'b'], true, 'Don''t Play'),
+(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['a', 'b'], false, 'Play'),
+(4, 'rain', 70, NULL, ARRAY[70, 96], ARRAY['a', 'b'], false, 'Play'),
+(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['a', 'b'], false, 'Play'),
+(6, 'rain', NULL, 70, ARRAY[65, 70], ARRAY['a', 'b'], true, 'Don''t Play'),
+(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['c', 'b'], NULL , 'Play'),
+(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['a', 'b'], false, 'Don''t Play'),
+(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['a', 'b'], false, 'Play'),
+(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['a', 'b'], false, 'Play'),
+(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['a', 'd'], true, 'Play'),
+(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['c', 'b'], NULL, 'Play'),
+(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['a', 'b'], false, 'Play'),
+(15, NULL, 81, 75, ARRAY[81, 75], ARRAY['a', 'b'], false, 'Play'),
+(16, 'overcast', NULL, 75, ARRAY[81, 75], ARRAY['a', 'd'], false, 'Play'),
+(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['c', 'b'], true, 'Don''t Play');
+
+SELECT dropcols('dt_golf', 'dt_golf2', '"OUTLOOK", "Cont,features", cat_features');
+
+-- test if columns have been dropped
+SELECT assert(n_cols = 0, 'Some of the columns have not been dropped')
+FROM (
+    SELECT count(*) AS n_cols
+    FROM information_schema.columns
+    WHERE table_name='dt_golf2' AND
+          (column_name in ('"OUTLOOK"', '"Cont,features"', 'cat_features'))
+) q;
+
+-- test if the retained columns are present in output table
+SELECT
+    id, id_2, temperature, "len$$'%*()gth", windy, class
+FROM dt_golf2;

http://git-wip-us.apache.org/repos/asf/madlib/blob/5f80ba97/src/ports/postgres/modules/utilities/utilities.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/utilities.py_in b/src/ports/postgres/modules/utilities/utilities.py_in
index 2bf08a7..55b6983 100644
--- a/src/ports/postgres/modules/utilities/utilities.py_in
+++ b/src/ports/postgres/modules/utilities/utilities.py_in
@@ -7,6 +7,7 @@ from distutils.util import strtobool
 
 from validate_args import _get_table_schema_names
 from validate_args import get_first_schema
+from validate_args import get_cols
 from validate_args import cols_in_tbl_valid
 from validate_args import does_exclude_reserved
 from validate_args import explicit_bool_to_text
@@ -840,8 +841,8 @@ def get_grouping_col_str(schema_madlib, module_name, reserved_cols,
         cols_in_tbl_valid(source_table, grouping_col_array, module_name)
         does_exclude_reserved(grouping_col_array, reserved_cols)
         grp_array_w_cast = explicit_bool_to_text(source_table,
-                                                grouping_col_array,
-                                                schema_madlib)
+                                                 grouping_col_array,
+                                                 schema_madlib)
         grouping_str = ', '.join(i + "::text" for i in grp_array_w_cast)
     else:
         grouping_str = "Null"
@@ -911,3 +912,35 @@ def validate_module_input_params(source_table, output_table, independent_varname
                                            grouping_cols=grouping_cols,
                                            source_table=source_table))
 # ------------------------------------------------------------------------
+
+
+def create_table_drop_cols(source_table, out_table, cols_to_drop, **kwargs):
+    """ Create copy of table while dropping some of the columns
+    Args:
+        @param source_table str. Name of the source table
+        @param out_table str. Name of the output table
+        @param cols_to_drop str. Comma-separated list of columns to drop
+    """
+    input_tbl_valid(source_table, 'Utilities')
+    output_tbl_valid(out_table, 'Utilities')
+    _assert(cols_to_drop and cols_to_drop.strip(),
+            "Utilities error: cols_to_drop cannot be empty or NULL")
+
+    source_table_cols = get_cols(source_table)
+    cols_to_drop_list = split_quoted_delimited_str(cols_to_drop)
+    cols_not_in_source = set(cols_to_drop_list) - set(source_table_cols)
+    _assert(not cols_not_in_source,
+            "Utilities error: Some column(s) in cols_to_drop are not present "
+            "in source table")
+
+    cols_to_retain = [c for c in source_table_cols if c not in cols_to_drop_list]
+    _assert(cols_to_retain,
+            "Utilities error: No valid columns for the output table")
+    plpy.execute("""
+        CREATE TABLE {out_table} AS
+        SELECT {cols}
+        FROM {source_table}
+        """.format(cols=', '.join(cols_to_retain),
+                   out_table=out_table,
+                   source_table=source_table))
+# ------------------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/madlib/blob/5f80ba97/src/ports/postgres/modules/utilities/utilities.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/utilities.sql_in b/src/ports/postgres/modules/utilities/utilities.sql_in
index 6fdb463..4993afe 100644
--- a/src/ports/postgres/modules/utilities/utilities.sql_in
+++ b/src/ports/postgres/modules/utilities/utilities.sql_in
@@ -1,4 +1,5 @@
-/* ----------------------------------------------------------------------- *//**
+/* ----------------------------------------------------------------------- */
+/**
  *
  * @file utilities.sql_in
  *
@@ -7,7 +8,8 @@
  * @sa For a brief overview of utility functions, see the
  *     module description \ref grp_utilities.
  *
- *//* ----------------------------------------------------------------------- */
+**/
+/* ----------------------------------------------------------------------- */
 
 m4_include(`SQLCommon.m4')
 
@@ -72,6 +74,12 @@ while implementing new algorithms.
     <td>Drop all tables matching pattern '%madlib_temp%' in a given schema.</td>
   </tr>
 
+  <tr>
+    <th>dropcols()</th>
+    <td>Create a new table with a subset of the columns dropped from a source table.
+    </td>
+  </tr>
+
 </table>
 
 Note: If the function cleanup_madlib_temp_tables() gives an Out-of-memory error,
@@ -207,7 +215,7 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
  *
  * @param source_table Source table
  * @param column_name  Column name in the table
- * @returns \c TRUE if it exsists and FALSE if not
+ * @returns \c TRUE if it exists and FALSE if not
  */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.check_if_col_exists(
     source_table TEXT,
@@ -409,7 +417,7 @@ $$m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
 /**
  * @brief Cast any value to text.
  *
- * @param val	A value with any specific type.
+ * @param val   A value with any specific type.
  *
  * @return The text format string for the value.
  *
@@ -474,3 +482,30 @@ CREATE AGGREGATE MADLIB_SCHEMA.mode(double precision) (
   FINALFUNC=MADLIB_SCHEMA._final_mode, --Function to call after everything has been added to array
   INITCOND='{}' --Initialize an empty array when starting
 );
+
+
+------------------------------------------------------------------------
+/**
+ * @brief Creates a new table with a subset of columns dropped from another
+ *  source table.
+ *
+ * @param source_table Name of the table containing the source data.
+ * @param out_table Name of the generated table containing the output.
+            If a table with the same name already exists, an error will be returned.
+ * @param cols_to_drop Comma-separated string of column names from the source
+ *          table to drop. An error is returned if the output table does not
+ *          contain any columns or if a requested column is not present in the
+ *          source table.
+ */
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dropcols (
+  source_table      VARCHAR,
+  out_table         VARCHAR,
+  cols_to_drop      VARCHAR
+)
+RETURNS void AS $$
+PythonFunctionBodyOnly(utilities, utilities)
+    from utilities import control
+    with control.MinWarning('error'):
+        return utilities.create_table_drop_cols(source_table, out_table, cols_to_drop)
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');