You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@madlib.apache.org by njayaram2 <gi...@git.apache.org> on 2018/07/24 00:22:08 UTC
[GitHub] madlib pull request #291: Feature: Vector-Column Transformations
Github user njayaram2 commented on a diff in the pull request:
https://github.com/apache/madlib/pull/291#discussion_r204589559
--- Diff: src/ports/postgres/modules/utilities/transform_vec_cols.py_in ---
@@ -0,0 +1,513 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import plpy
+from control import MinWarning
+from internal.db_utils import is_col_1d_array
+from internal.db_utils import quote_literal
+from utilities import _assert
+from utilities import add_postfix
+from utilities import ANY_ARRAY
+from utilities import is_psql_boolean_type
+from utilities import is_psql_char_type
+from utilities import is_psql_numeric_type
+from utilities import is_valid_psql_type
+from utilities import py_list_to_sql_string
+from utilities import split_quoted_delimited_str
+from validate_args import is_var_valid
+from validate_args import get_cols
+from validate_args import get_cols_and_types
+from validate_args import get_expr_type
+from validate_args import input_tbl_valid
+from validate_args import output_tbl_valid
+from validate_args import table_exists
+
+class vec_cols_helper:
+ def __init__(self):
+ self.all_cols = None
+
+ def get_cols_as_list(self, cols_to_process, source_table=None, exclude_cols=None):
+ """
+ Get a list of columns based on the value of cols_to_process
+ Args:
+ @param cols_to_process: str, Either a * or a comma-separated list of col names
+ @param source_table: str, optional. Source table name
+ @param exclude_cols: str, optional. Comma-separated list of the col(s) to exclude
+ from the source table, only used if cols_to_process is *
+ Returns:
+ A list of column names (or an empty list)
+ """
+ # If cols_to_process is empty/None, return empty list
+ if not cols_to_process:
+ return []
+ if cols_to_process.strip() != "*":
+ # If cols_to_process is a comma separated list of names, return list
+ # of column names in cols_to_process.
+ return [col for col in split_quoted_delimited_str(cols_to_process)
+ if col not in split_quoted_delimited_str(exclude_cols)]
+ if source_table:
+ if not self.all_cols:
+ self.all_cols = get_cols(source_table)
+ return [col for col in self.all_cols
+ if col not in split_quoted_delimited_str(exclude_cols)]
+ return []
+
+ def get_type_class(self, arg):
+ if is_psql_numeric_type(arg):
+ return "double precision"
+ elif is_psql_char_type(arg):
+ return "text"
+ else:
+ return arg
+
+class vec2cols:
+ def __init__(self):
+ self.get_cols_helper = vec_cols_helper()
+ self.module_name = self.__class__.__name__
+
+ def validate_args(self, source_table, output_table, vector_col, feature_names,
+ cols_to_output):
+ """
+ Validate args for vec2cols
+ """
+ input_tbl_valid(source_table, self.module_name)
+ output_tbl_valid(output_table, self.module_name)
+ is_var_valid(source_table, cols_to_output)
+ is_var_valid(source_table, vector_col)
+ _assert(is_valid_psql_type(get_expr_type(vector_col, source_table), ANY_ARRAY),
+ "{0}: vector_col should refer to an array.".format(self.module_name))
+ _assert(is_col_1d_array(source_table, vector_col),
+ "{0}: vector_col must be a 1-dimensional array.".format(self.module_name))
+
+ def get_names_for_split_output_cols(self, source_table, vector_col, feature_names):
+ """
+ Get list of names for the newly-split columns to include in the
+ output table.
+ Args:
+ @param: source_table, str. Source table
+ @param: vector_col, str. Column name containing the array input
+ @param: feature_names, list. Python list of the feature names to
+ use for the split elements in the vector_col array
+ """
+ query = """
+ SELECT array_upper({0}, 1) AS n_x
+ FROM {1}
+ LIMIT 1
+ """.format(vector_col, source_table)
+ result = plpy.execute(query)[0]["n_x"]
+ if not result:
+ plpy.error('{0}: Column to split ({1}) must not be an empty array'
+ .format(self.module_name, vector_col))
+ if not feature_names:
+ # Create custom col names for output columns, with prefix "f".
+ feature_names = ["f{0}".format(i+1) for i in range(result)]
+ else:
+ # Check if the array dimension is equal to the number of col names
+ # specified in feature_names.
+ _assert(result == len(feature_names),
+ "{0}: Mismatch between size of vector_col and number of "
+ "cols in feature_names.".format(self.module_name))
+ return feature_names
+
+ def validate_output_cols(self, features_to_unnest, cols_to_keep):
+ # If there are more than 1600 columns for the output table, we give a
+ # warning as it might give an error
+ MAX_OUTPUT_COLUMN_COUNT = 1600
+ _assert(len(features_to_unnest)+len(cols_to_keep) < MAX_OUTPUT_COLUMN_COUNT,
+ "{0}: The output exceeds the max number of columns that " +
+ "can be created ({1})".format(self.module_name, MAX_OUTPUT_COLUMN_COUNT))
+ # Check if newly created col names have the same name as existing cols
+ duplicate_col_names = set(features_to_unnest).intersection(set(cols_to_keep))
+ _assert(len(duplicate_col_names) == 0,
+ "{0}: Conflicting column names. Column names in source "
+ "table cannot be {1}".format(self.module_name,
+ list(duplicate_col_names)))
+
+ def vec2cols(self, schema_madlib, source_table, output_table,
+ vector_col, feature_names, cols_to_output, **kwargs):
+ """
+ Split up a column of array entries into multiple columns, each column
+ corresponding to one array position
+ Args:
+ @param: schema_madlib, str. The schema with madlib installed
+ @param: source_table, str. The source table
+ @param: output_table, str. The output table
+ @param: vector_col, str. The column with array entries to split up
+ @param: feature_names, list. Python list of the feature names to use
+ for the split elements in the vector_col array
+ @param: cols_to_output, str. Comma-separated list of the columns in
+ the source_table to include in the output_table
+ """
+ self.validate_args(source_table, output_table, vector_col, feature_names,
+ cols_to_output)
+
+ # Get names of columns to use for the split vector_col
+ features_to_unnest = self.get_names_for_split_output_cols(source_table,
+ vector_col, feature_names)
+ cols_to_keep = self.get_cols_helper.get_cols_as_list(cols_to_output,
+ source_table)
+
+ self.validate_output_cols(features_to_unnest, cols_to_keep)
+
+ # Construct the output query and populate the output table with all the
+ # correct parameters
+ select_new_cols = ', '.join(["{0}[{1}] AS {2}".format(vector_col,
+ i+1, features_to_unnest[i]) for i in range(len(features_to_unnest))])
+ cols_from_src_table = ', '.join(cols_to_keep)+', ' if cols_to_keep else ''
+ query = """
+ CREATE TABLE {output_table} AS
+ SELECT {cols_from_src_table} {select_new_cols}
+ FROM {source_table}
+ """.format(**locals())
+ plpy.execute(query)
+
+ def vec2cols_help_message(self, schema_madlib, message, **kwargs):
+ """
+ Help message for vec2cols function
+ """
+ summary_string = """
+-----------------------------------------------------------------------------------
+ SUMMARY
+-----------------------------------------------------------------------------------
+Functionality: Vector to Columns
+
+The MADlib vec2cols function enables the user to split up a single column into
+multiple columns, given that the input column contains array entries. For example,
+if the input column contained ARRAY[1, 2, 3] in one of its rows, the output table
+will contain 3 different columns, one for each element of the array.
+
+For more details on function usage:
+ SELECT {schema_madlib}.vec2cols('usage');
+
+For a small example on using the function:
+ SELECT {schema_madlib}.vec2cols('example');
+ """.format(schema_madlib=schema_madlib)
+
+ usage_string = """
+-----------------------------------------------------------------------------------
+ USAGE
+-----------------------------------------------------------------------------------
+SELECT {schema_madlib}.vec2cols(
+ 'source_table', -- str, Name of the source table that contains the data
+ 'output_table', -- str, Name of the output view or table
+ 'vector_col', -- str, Name of the array entry column to be split
+ 'feature_names', -- array, Optional parameter to provide a text array of
+ -- the feature names for the newly split columns (if not
+ -- provided, default names f0, f1, ... will be used)
+ 'cols_to_output' -- str, Optional parameter to specify any other columns
+ -- in the source_table to include in the output_table
+ -- (default none of them, also supports '*' as input)
+ """.format(schema_madlib=schema_madlib)
+
+ example_string = """
+-----------------------------------------------------------------------------------
+ EXAMPLE
+-----------------------------------------------------------------------------------
+-- Create an input data set:
+
+DROP TABLE IF EXISTS golf CASCADE;
+CREATE TABLE golf (
+ id integer NOT NULL,
+ "OUTLOOK" text,
+ temperature double precision,
+ humidity double precision,
+ "Temp_Humidity" double precision[],
+ clouds_airquality text[],
+ windy boolean,
+ class text,
+ observation_weight double precision
+);
+INSERT INTO golf VALUES
+(1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t Play', 5.0),
+(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t Play', 5.0),
+(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 'Play', 1.5),
+(4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 1.0),
+(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),
+(6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0),
+(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),
+(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 'Don''t Play', 5.0),
+(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play', 5.0),
+(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),
+(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play', 5.0),
+(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),
+(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 'Play', 1.5),
+(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0);
+
+-- Call the vec2cols function on the 'clouds_airquality' column, to split it up
+
+DROP TABLE IF EXISTS output_table;
+SELECT {schema_madlib}.vec2cols(
+ 'golf', -- source table
+ 'output_table', -- output table
+ 'clouds_airquality', -- column with array entries to split
+ ARRAY['a', 'b'], -- feature_names array (will use 'a' to name the first new column, and 'b' for the second)
+ '"OUTLOOK", id' -- columns to keep from source table (as a comma-separated list)
+);
+
+SELECT * FROM output_table ORDER BY id;
+ OUTLOOK | id | a | b
+----------+----+--------+-----------
+ sunny | 1 | none | unhealthy
+ sunny | 2 | none | moderate
+ overcast | 3 | low | moderate
+ rain | 4 | low | moderate
+ rain | 5 | medium | good
+ rain | 6 | low | unhealthy
+ overcast | 7 | medium | moderate
+ sunny | 8 | high | unhealthy
+ sunny | 9 | high | good
+ rain | 10 | medium | good
+ sunny | 11 | none | good
+ overcast | 12 | medium | moderate
+ overcast | 13 | medium | moderate
+ rain | 14 | low | unhealthy
+(14 rows)
+""".format(schema_madlib=schema_madlib)
+
+ if not message:
+ return summary_string
+ elif message.lower() in ('usage', 'help', '?'):
+ return usage_string
+ elif message.lower() in ('example', 'examples'):
+ return example_string
+ else:
+ return """
+No such option. Use "SELECT {schema_madlib}.vec2cols()" for help.
+ """.format(schema_madlib=schema_madlib)
+
+class cols2vec:
+ def __init__(self):
+ self.get_cols_helper = vec_cols_helper()
+ self.module_name = self.__class__.__name__
+
+ def validate_args(self, source_table, output_table,
+ list_of_features, list_of_features_to_exclude, cols_to_output):
+ """
+ Function to validate input parameters
+ """
+ input_tbl_valid(source_table, self.module_name)
+ output_tbl_valid(output_table, self.module_name)
+
+ _assert(list_of_features and list_of_features.strip(), "{0}: List of "
+ "features cannot be empty".format(self.module_name))
+ if list_of_features.strip() != '*':
+ is_var_valid(source_table, list_of_features)
+
+ if list_of_features_to_exclude:
+ if list_of_features_to_exclude.strip() == "*":
+ plpy.error("{0}: Cannot exclude all columns from being "
+ "features".format(self.module_name))
+ elif list_of_features.strip() != '*':
+ plpy.info("{0} NOTICE: will exclude given column(s) even though "
+ "list of features was not *".format(self.module_name))
+
+ is_var_valid(source_table, list_of_features_to_exclude)
+ is_var_valid(source_table, cols_to_output)
+
+ def get_and_validate_feature_types(self, source_table, features_to_nest):
+ """
+ This function will validate and return the appropriate type to cast
+ the final SQL array. Will fail if feature types do not belong to the
+ same group (numeric, text, etc.) or if any type is an array.
+
+ If all features to nest are of the same type, we just return that type
+ and do not cast. Else, if they only constitute integers and smallints,
+ we cast everything to an integer. Else, if they are all part of the same
+ type group, we return the most comprehensive type in that group. Else, throw an error.
+ """
+ all_cols_and_types = get_cols_and_types(source_table)
+ distinct_types = set([col_type[1] for col_type in all_cols_and_types
+ if col_type[0] in features_to_nest])
+ for expr_type in distinct_types:
+ _assert(not is_valid_psql_type(expr_type, ANY_ARRAY),
+ "{0}: Feature columns to nest cannot be of type array"
+ .format(self.module_name))
+ if len(distinct_types) > 1:
+ if distinct_types == {'integer', 'smallint'}:
--- End diff --
Is there a reason why `bigint` is not here? If the distinct types were `{integer, bigint}`, then this function would return back `double` right? It seems to be fine functionality-wise, in the sense that no precision is lost in the output array though.
I created a table named `t1` with the following schema:
```
madlib-pg94=# \d+ t1;
Table "public.t1"
Column | Type | Modifiers | Storage | Stats target | Description
--------+------------------+-------------------------------------------------+---------+--------------+-------------
id | integer | not null default nextval('t1_id_seq'::regclass) | plain | |
c1 | integer | | plain | |
c2 | bigint | | plain | |
c3 | smallint | | plain | |
c4 | double precision | | plain | |
```
Calling `select madlib.cols2vec('t1','t1_out','c1,c2');` resulted in an output array of type `double precision[]`. Is that the expected behavior?
---