You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ok...@apache.org on 2017/06/28 01:17:55 UTC
incubator-madlib git commit: Sample: Add stratified sampling
Repository: incubator-madlib
Updated Branches:
refs/heads/master a17d63c9b -> 0b5000ab0
Sample: Add stratified sampling
JIRA: MADLIB-986
Add stratified sampling with the following options.
- With or without grouping
- With or without replacement
- A specific set of target columns or all of them
Closes #143
Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/0b5000ab
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/0b5000ab
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/0b5000ab
Branch: refs/heads/master
Commit: 0b5000ab042e8346545cf1e9984b005869865ebf
Parents: a17d63c
Author: Orhan Kislal <ok...@pivotal.io>
Authored: Wed Jun 21 16:07:08 2017 -0700
Committer: Orhan Kislal <ok...@pivotal.io>
Committed: Tue Jun 27 18:14:34 2017 -0700
----------------------------------------------------------------------
doc/mainpage.dox.in | 3 +
src/config/Modules.yml | 1 +
.../postgres/modules/sample/__init__.py_in | 0
.../modules/sample/stratified_sample.py_in | 306 +++++++++++++++++++
.../modules/sample/stratified_sample.sql_in | 248 +++++++++++++++
.../sample/test/stratified_sample.sql_in | 75 +++++
6 files changed, 633 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/0b5000ab/doc/mainpage.dox.in
----------------------------------------------------------------------
diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in
index 407946e..274426a 100644
--- a/doc/mainpage.dox.in
+++ b/doc/mainpage.dox.in
@@ -248,6 +248,9 @@ complete matrix stored as a distributed table.
@defgroup grp_pmml PMML Export
@ingroup grp_utility_functions
+ @defgroup grp_strs Stratified Sampling
+ @ingroup grp_utility_functions
+
@defgroup grp_sessionize Sessionize
@ingroup grp_utility_functions
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/0b5000ab/src/config/Modules.yml
----------------------------------------------------------------------
diff --git a/src/config/Modules.yml b/src/config/Modules.yml
index d5c336e..b70c8bd 100644
--- a/src/config/Modules.yml
+++ b/src/config/Modules.yml
@@ -15,6 +15,7 @@ modules:
- name: glm
depends: ['utilities']
- name: graph
+ depends: ['utilities']
- name: kmeans
depends: ['array_ops', 'svec_util', 'sample']
- name: knn
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/0b5000ab/src/ports/postgres/modules/sample/__init__.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/sample/__init__.py_in b/src/ports/postgres/modules/sample/__init__.py_in
new file mode 100644
index 0000000..e69de29
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/0b5000ab/src/ports/postgres/modules/sample/stratified_sample.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/sample/stratified_sample.py_in b/src/ports/postgres/modules/sample/stratified_sample.py_in
new file mode 100644
index 0000000..e7762ef
--- /dev/null
+++ b/src/ports/postgres/modules/sample/stratified_sample.py_in
@@ -0,0 +1,306 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import plpy
+from utilities.control import MinWarning
+from utilities.utilities import _assert
+from utilities.utilities import extract_keyvalue_params
+from utilities.utilities import unique_string
+from utilities.utilities import split_quoted_delimited_str
+from utilities.validate_args import table_exists
+from utilities.validate_args import columns_exist_in_table
+from utilities.validate_args import table_is_empty
+from utilities.validate_args import get_expr_type
+from utilities.validate_args import get_cols
+from graph.graph_utils import _check_groups
+from graph.graph_utils import _grp_from_table
+
+m4_changequote(`<!', `!>')
+
+def stratified_sample(schema_madlib, source_table, output_table, proportion,
+ grouping_cols, target_cols, with_replacement, **kwargs):
+
+ """
+ Stratified sampling function
+ Args:
+ @param source_table Input table name.
+ @param output_table Output table name.
+ @param proportion The ratio of sample size to the number of
+ records.
+ @param grouping_cols (Default: NULL) The columns to distinguish
+ each strata.
+ @param target_cols (Default: NULL) The columns to include in
+ the output.
+ @param with_replacement (Default: FALSE) The sampling method.
+
+ """
+ with MinWarning("warning"):
+ label = unique_string(desp='label')
+ perc = unique_string(desp='perc')
+
+ checkg_lp = ""
+ window = ""
+ grp_by = ""
+ grp_from_perc = ""
+ grp_comma = ""
+ glist = None
+ if grouping_cols is not None:
+ glist = split_quoted_delimited_str(grouping_cols)
+ checkg_lp = " AND " + _check_groups(label,perc,glist)
+ window = "PARTITION BY {0}".format(grouping_cols)
+ grp_by = "GROUP BY {0}".format(grouping_cols)
+ grp_from_perc = _grp_from_table(perc,glist) + " , "
+ grp_comma = grouping_cols + " , "
+
+ validate_strs(source_table, output_table, proportion, glist, target_cols)
+
+ if target_cols is None or target_cols is '*':
+ cols = get_cols(source_table)
+ if grouping_cols is not None:
+ cols = [item for item in cols if item not in glist]
+ target_cols = " , ".join(cols)
+
+ plpy.execute("DROP TABLE IF EXISTS {0},{1}".format(label,perc))
+ if not with_replacement :
+ if grouping_cols:
+
+ # Create a random label for each record
+ sql1 = """ CREATE TEMP TABLE {label} AS (
+ SELECT {target_cols},{grouping_cols},random() AS __samp_out_label
+ FROM {source_table})""".format(**locals())
+ plpy.execute(sql1)
+
+ # Find the cut-off label for the given proportion
+ sql2 = """ CREATE TEMP TABLE {perc} AS (
+ SELECT {grouping_cols}, percentile_disc({proportion})
+ WITHIN GROUP (ORDER BY __samp_out_label) AS __samp_out_label
+ FROM {label} GROUP BY {grouping_cols})""".format(**locals())
+ plpy.execute(sql2)
+
+ # Select every record that has a label under the threshold
+ sql3 = """ CREATE TABLE {output_table} AS (
+ SELECT {grp_from_perc} {target_cols}
+ FROM {label} INNER JOIN {perc} ON (
+ {label}.__samp_out_label <= {perc}.__samp_out_label
+ {checkg_lp}) )""".format(**locals())
+ plpy.execute(sql3)
+ else:
+
+ # Find the number of records to select
+ count = plpy.execute("SELECT count(*) AS count FROM {0}".
+ format(source_table))[0]['count']
+ count = count * proportion
+
+ # Order randomly and select the required number of records
+ sql1 = """ CREATE TABLE {output_table} AS (
+ SELECT {target_cols}
+ FROM {source_table}
+ ORDER BY random()
+ LIMIT {count})""".format(**locals())
+ plpy.execute(sql1)
+ else:
+
+ # Set the row number as the label for each record
+ # OVER clause ensures that different groups have independent
+ # row_numbers
+ sql1 = """ CREATE TEMP TABLE {label} AS (
+ SELECT {grp_comma} {target_cols},
+ row_number() OVER ({window}) AS __samp_out_label
+ FROM {source_table})""".format(**locals())
+ plpy.execute(sql1)
+
+ # Generate a series of random values for each group based on their
+ # individual row counts.
+ # These random values are independent from each other and may have
+ # the same value.
+
+ sql2 = """ CREATE TEMP TABLE {perc} AS (
+ SELECT {grp_comma}
+ GENERATE_SERIES(0,(count*{proportion}-1)::int) AS __i,
+ ((random()*(count-1)+1)::int) AS __samp_out_label
+ FROM (
+ SELECT {grp_comma} count(*) AS count
+ FROM {source_table} {grp_by}) AS sub)
+ """.format(**locals())
+ plpy.execute(sql2)
+
+ # Join the two tables to get the selected samples.
+ # If a random value is generated twice, the join will ensure that
+ # the record is selected twice
+ sql3 = """ CREATE TABLE {output_table} AS (
+ SELECT {grp_from_perc} {target_cols}
+ FROM {label} INNER JOIN {perc} ON (
+ {label}.__samp_out_label = {perc}.__samp_out_label
+ {checkg_lp}) )""".format(**locals())
+ plpy.execute(sql3)
+ plpy.execute("DROP TABLE IF EXISTS {0},{1}".format(label,perc))
+
+ return
+
+def validate_strs (source_table, output_table, proportion, glist, target_cols):
+
+ _assert(output_table and output_table.strip().lower() not in ('null', ''),
+ "Sample: Invalid output table name {output_table}!".format(**locals()))
+ _assert(not table_exists(output_table),
+ "Sample: Output table already exists!".format(**locals()))
+
+ _assert(source_table and source_table.strip().lower() not in ('null', ''),
+ "Sample: Invalid Source table name!".format(**locals()))
+ _assert(table_exists(source_table),
+ "Sample: Source table ({source_table}) is missing!".format(**locals()))
+ _assert(not table_is_empty(source_table),
+ "Sample: Source table ({source_table}) is empty!".format(**locals()))
+
+ _assert(proportion > 0 and proportion < 1,
+ "Sample: Proportion isn't in the range (0,1)!")
+
+ if glist is not None:
+ _assert(columns_exist_in_table(source_table, glist),
+ ("""Sample: Not all columns from {glist} are present in source"""+
+ """ table ({source_table}).""").format(**locals()))
+
+ if not (target_cols is None or target_cols is '*'):
+ tlist = split_quoted_delimited_str(target_cols)
+ _assert(columns_exist_in_table(source_table, tlist),
+ ("""Sample: Not all columns from {target_cols} are present in"""+
+ """ edge table ({source_table})""").format(**locals()))
+ return
+
+
+def stratified_sample_help(schema_madlib, message, **kwargs):
+ """
+ Help function for stratified_sample
+
+ Args:
+ @param schema_madlib
+ @param message: string, Help message string
+ @param kwargs
+
+ Returns:
+ String. Help/usage information
+ """
+ if not message:
+ help_string = """
+-----------------------------------------------------------------------
+ SUMMARY
+-----------------------------------------------------------------------
+
+Given a table, stratified sampling returns a proportion of records for
+each group (strata). It is possible to use with or without replacement
+sampling methods, specify a set of target columns, and assume the
+whole table is a single strata.
+
+For more details on function usage:
+ SELECT {schema_madlib}.stratified_sample('usage');
+ SELECT {schema_madlib}.stratified_sample('example');
+ """
+ elif message.lower() in ['usage', 'help', '?']:
+ help_string = """
+
+Given a table, stratified sampling returns a proportion of records for
+each group (strata). It is possible to use with or without replacement
+sampling methods, specify a set of target columns, and assume the
+whole table is a single strata.
+
+----------------------------------------------------------------------------
+ USAGE
+----------------------------------------------------------------------------
+
+ SELECT {schema_madlib}.stratified_sample(
+ source_table TEXT, -- Input table name.
+ output_table TEXT, -- Output table name.
+ proportion FLOAT8, -- The ratio of sample size to the number of
+ -- records.
+ grouping_cols TEXT -- (Default: NULL) The columns to distinguish
+ -- each strata.
+ target_cols TEXT, -- (Default: NULL) The columns to include in
+ -- the output.
+ with_replacement BOOLEAN -- (Default: FALSE) The sampling method.
+
+If grouping_cols is NULL, the whole table is treated as a single group and
+sampled accordingly.
+
+If target_cols is NULL or '*', all of the columns will be included in the
+output table.
+
+If with_replacement is TRUE, each sample is independent (the same row may
+be selected in the sample set more than once). Else (if with_replacement
+is FALSE), a row can be selected at most once.
+);
+"""
+ elif message.lower() in ("example", "examples"):
+ help_string = """
+----------------------------------------------------------------------------
+ EXAMPLES
+----------------------------------------------------------------------------
+
+-- Create an input table
+DROP TABLE IF EXISTS test;
+
+CREATE TABLE test(
+ id1 INTEGER,
+ id2 INTEGER,
+ gr1 INTEGER,
+ gr2 INTEGER
+);
+
+INSERT INTO test VALUES
+(1,0,1,1),
+(2,0,1,1),
+(3,0,1,1),
+(4,0,1,1),
+(5,0,1,1),
+(6,0,1,1),
+(7,0,1,1),
+(8,0,1,1),
+(9,0,1,1),
+(9,0,1,1),
+(9,0,1,1),
+(9,0,1,1),
+(0,1,1,2),
+(0,2,1,2),
+(0,3,1,2),
+(0,4,1,2),
+(0,5,1,2),
+(0,6,1,2),
+(10,10,2,2),
+(20,20,2,2),
+(30,30,2,2),
+(40,40,2,2),
+(50,50,2,2),
+(60,60,2,2),
+(70,70,2,2)
+;
+
+-- Sample without replacement
+DROP TABLE IF EXISTS out;
+SELECT madlib.stratified_sample('test', 'out', 0.5, 'gr1,gr2', 'id1,id2',
+ FALSE);
+SELECT * FROM out;
+
+-- Sample with replacement
+DROP TABLE IF EXISTS out;
+SELECT madlib.stratified_sample('test', 'out', 0.5, 'gr1,gr2', 'id1,id2',
+ TRUE);
+SELECT * FROM out;
+"""
+ else:
+ help_string = "No such option. Use {schema_madlib}.graph_sssp()"
+
+ return help_string.format(schema_madlib=schema_madlib)
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/0b5000ab/src/ports/postgres/modules/sample/stratified_sample.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/sample/stratified_sample.sql_in b/src/ports/postgres/modules/sample/stratified_sample.sql_in
new file mode 100644
index 0000000..7327c26
--- /dev/null
+++ b/src/ports/postgres/modules/sample/stratified_sample.sql_in
@@ -0,0 +1,248 @@
+/* ----------------------------------------------------------------------- *//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *
+ * @file stratified_sample.sql_in
+ *
+ * @brief SQL functions for stratified sampling.
+ * @date 07/19/2017
+ *
+ * @sa Given a table, stratified sampling returns a proportion of records
+ * for each group (strata).
+ *
+ *//* ----------------------------------------------------------------------- */
+
+m4_include(`SQLCommon.m4')
+
+
+/**
+@addtogroup grp_strs
+
+<div class="toc"><b>Contents</b>
+<ul>
+<li><a href="#strs">Stratified Sampling</a></li>
+<li><a href="#notes">Notes</a></li>
+<li><a href="#examples">Examples</a></li>
+</ul>
+</div>
+
+@brief Provides stratified sampling function.
+
+Given a table, stratified sampling returns a proportion of records for
+each group (strata). It is possible to use with or without replacement
+sampling methods, specify a set of target columns, and assume the
+whole table is a single strata.
+
+@anchor strs
+@par Stratified Sampling
+
+<pre class="syntax">
+stratified_sample( source_table,
+ output_table,
+ proportion,
+ grouping_cols,
+ target_cols,
+ with_replacement
+ )
+</pre>
+
+\b Arguments
+<dl class="arglist">
+<dt>source_table</dt>
+<dd>TEXT. Name of the table containing the input data. Must contain the
+columns specified in the 'groupinhg_cols' and target_cols' parameters
+below.</dd>
+<dt>output_table</dt>
+<dd>TEXT. Name of the table to store the sampled records.</dd>
+<dt>proportion</dt>
+<dd>FLOAT8. The ratio of sample size to the number of records.</dd>
+<dt>grouping_cols</dt>
+<dd>TEXT. (Default: NULL) The columns to distinguish each strata.</dd>
+<dt>target_cols</dt>
+<dd>TEXT. (Default: NULL) The columns to include in the output.</dd>
+<dt>with_replacement</dt>
+<dd>BOOLEAN. (Default: FALSE) The sampling method.</dd>
+</dl>
+@anchor notes
+@par Notes
+
+If grouping_cols is NULL, the whole table is treated as a single group and
+sampled accordingly.
+
+If target_cols is NULL or '*', all of the columns will be included in the
+output table.
+
+If with_replacement is TRUE, each sample is independent (the same row may
+be selected in the sample set more than once). Else (if with_replacement
+is FALSE), a row can be selected at most once.
+);
+
+@anchor examples
+@par Examples
+
+Please note that, due to the random nature of sampling, the output records
+might differ.
+
+-# Create an input table:
+<pre class="syntax">
+DROP TABLE IF EXISTS test;
+CREATE TABLE test(
+ id1 INTEGER,
+ id2 INTEGER,
+ gr1 INTEGER,
+ gr2 INTEGER
+);
+INSERT INTO test VALUES
+(1,0,1,1),
+(2,0,1,1),
+(3,0,1,1),
+(4,0,1,1),
+(5,0,1,1),
+(6,0,1,1),
+(7,0,1,1),
+(8,0,1,1),
+(9,0,1,1),
+(9,0,1,1),
+(9,0,1,1),
+(9,0,1,1),
+(0,1,1,2),
+(0,2,1,2),
+(0,3,1,2),
+(0,4,1,2),
+(0,5,1,2),
+(0,6,1,2),
+(10,10,2,2),
+(20,20,2,2),
+(30,30,2,2),
+(40,40,2,2),
+(50,50,2,2),
+(60,60,2,2),
+(70,70,2,2);
+</pre>
+
+-# Sample without replacement
+<pre class="syntax">
+DROP TABLE IF EXISTS out;
+SELECT madlib.stratified_sample('test', 'out', 0.5, 'gr1,gr2', 'id1,id2', FALSE);
+SELECT * FROM out ORDER BY gr1,gr2,id1,id2;
+</pre>
+<pre class="result">
+ gr1 | gr2 | id1 | id2
+-----+-----+-----+-----
+ 1 | 1 | 2 | 0
+ 1 | 1 | 4 | 0
+ 1 | 1 | 7 | 0
+ 1 | 1 | 8 | 0
+ 1 | 1 | 9 | 0
+ 1 | 1 | 9 | 0
+ 1 | 2 | 0 | 2
+ 1 | 2 | 0 | 3
+ 1 | 2 | 0 | 4
+ 2 | 2 | 20 | 20
+ 2 | 2 | 30 | 30
+ 2 | 2 | 40 | 40
+ 2 | 2 | 60 | 60
+(13 rows)
+</pre>
+
+-# Sample with replacement
+<pre class="syntax">
+DROP TABLE IF EXISTS out;
+SELECT madlib.stratified_sample('test', 'out', 0.5, 'gr1,gr2', 'id1,id2', TRUE);
+SELECT * FROM out ORDER BY gr1,gr2,id1,id2;
+</pre>
+<pre class="result">
+ gr1 | gr2 | id1 | id2
+-------+-------+-------+-------
+ 1 | 1 | 3 | 0
+ 1 | 1 | 6 | 0
+ 1 | 1 | 6 | 0
+ 1 | 1 | 7 | 0
+ 1 | 1 | 7 | 0
+ 1 | 1 | 9 | 0
+ 1 | 2 | 0 | 1
+ 1 | 2 | 0 | 2
+ 1 | 2 | 0 | 6
+ 2 | 2 | 20 | 20
+ 2 | 2 | 30 | 30
+ 2 | 2 | 50 | 50
+ 2 | 2 | 50 | 50
+</pre>
+*/
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stratified_sample(
+ source_table TEXT,
+ output_table TEXT,
+ proportion FLOAT8,
+ grouping_cols TEXT,
+ target_cols TEXT,
+ with_replacement BOOLEAN
+) RETURNS VOID AS $$
+ PythonFunction(sample, stratified_sample, stratified_sample)
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
+
+-------------------------------------------------------------------------------
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stratified_sample(
+ source_table TEXT,
+ output_table TEXT,
+ proportion FLOAT8,
+ grouping_cols TEXT,
+ target_cols TEXT
+) RETURNS VOID AS $$
+ SELECT MADLIB_SCHEMA.stratified_sample($1, $2, $3, $4, $5, FALSE);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stratified_sample(
+ source_table TEXT,
+ output_table TEXT,
+ proportion FLOAT8,
+ grouping_cols TEXT
+) RETURNS VOID AS $$
+ SELECT MADLIB_SCHEMA.stratified_sample($1, $2, $3, $4, NULL, FALSE);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stratified_sample(
+ source_table TEXT,
+ output_table TEXT,
+ proportion FLOAT8
+) RETURNS VOID AS $$
+ SELECT MADLIB_SCHEMA.stratified_sample($1, $2, $3, NULL, NULL, FALSE);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
+
+-------------------------------------------------------------------------------
+
+-- Online help
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stratified_sample_help(
+ message VARCHAR
+) RETURNS VARCHAR AS $$
+ PythonFunction(sample, stratified_sample, stratified_sample_help)
+$$ LANGUAGE plpythonu IMMUTABLE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
+
+-------------------------------------------------------------------------------
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stratified_sample_help()
+RETURNS VARCHAR AS $$
+ SELECT MADLIB_SCHEMA.stratified_sample_help('');
+$$ LANGUAGE sql IMMUTABLE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
+-------------------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/0b5000ab/src/ports/postgres/modules/sample/test/stratified_sample.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/sample/test/stratified_sample.sql_in b/src/ports/postgres/modules/sample/test/stratified_sample.sql_in
new file mode 100644
index 0000000..b889260
--- /dev/null
+++ b/src/ports/postgres/modules/sample/test/stratified_sample.sql_in
@@ -0,0 +1,75 @@
+/* ----------------------------------------------------------------------- *//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *//* ----------------------------------------------------------------------- */
+
+DROP TABLE IF EXISTS test;
+
+CREATE TABLE test(
+ id1 INTEGER,
+ id2 INTEGER,
+ gr1 INTEGER,
+ gr2 INTEGER
+);
+
+INSERT INTO test VALUES
+(1,0,1,1),
+(2,0,1,1),
+(3,0,1,1),
+(4,0,1,1),
+(5,0,1,1),
+(6,0,1,1),
+(7,0,1,1),
+(8,0,1,1),
+(9,0,1,1),
+(9,0,1,1),
+(9,0,1,1),
+(9,0,1,1),
+(0,1,1,2),
+(0,2,1,2),
+(0,3,1,2),
+(0,4,1,2),
+(0,5,1,2),
+(0,6,1,2),
+(10,10,2,2),
+(20,20,2,2),
+(30,30,2,2),
+(40,40,2,2),
+(50,50,2,2),
+(60,60,2,2),
+(70,70,2,2)
+;
+
+DROP TABLE IF EXISTS out;
+SELECT stratified_sample('test', 'out', 0.5, NULL, 'id1,id2,gr1,gr2', FALSE);
+SELECT assert(count(*) = 13, 'Wrong number of samples') FROM out;
+
+DROP TABLE IF EXISTS out;
+SELECT stratified_sample('test', 'out', 0.5, 'gr1,gr2', 'id1,id2', FALSE);
+SELECT assert(count(DISTINCT (id1,id2)) = 3, 'Wrong number of samples')
+FROM out WHERE gr1 = 1 AND gr2 = 2;
+
+DROP TABLE IF EXISTS out;
+SELECT stratified_sample('test', 'out', 0.5, NULL, 'id1,id2,gr1,gr2', TRUE);
+SELECT assert(count(*) = 13, 'Wrong number of samples') FROM out;
+
+DROP TABLE IF EXISTS out;
+SELECT stratified_sample('test', 'out', 0.5, 'gr1,gr2', 'id1,id2', TRUE);
+SELECT assert(count(*) = 6, 'Wrong number of samples')
+FROM out WHERE gr1 = 1 AND gr2 = 1;