You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ok...@apache.org on 2023/04/03 14:06:33 UTC
[madlib] 04/08: Various fixes: Add gpdb7 specific checks
This is an automated email from the ASF dual-hosted git repository.
okislal pushed a commit to branch madlib2-master
in repository https://gitbox.apache.org/repos/asf/madlib.git
commit 149363a17ce9f3675d617b6abeb861542309d2ea
Author: Orhan Kislal <ok...@apache.org>
AuthorDate: Mon Nov 21 16:43:44 2022 +0300
Various fixes: Add gpdb7 specific checks
---
cmake/TestIfNoUTF8BOM.py | 2 +-
deploy/gppkg/madlib.spec.in | 1 +
deploy/madlib.spec.in | 1 +
methods/kmeans/src/pg_gp/kmeans.c | 5 +++
src/madpack/argparse.py | 23 --------------
src/madpack/madpack.py | 36 +++++++---------------
src/madpack/utilities.py | 2 --
src/ports/greenplum/dbconnector/dbconnector.hpp | 3 ++
.../dbconnector/SystemInformation_impl.hpp | 11 +++++++
src/ports/postgres/dbconnector/dbconnector.hpp | 3 --
src/ports/postgres/modules/dbscan/dbscan.py_in | 2 ++
.../deep_learning/input_data_preprocessor.py_in | 5 +--
.../modules/deep_learning/madlib_keras.py_in | 3 ++
.../madlib_keras_custom_function.py_in | 1 -
.../test/unit_tests/test_madlib_keras.py_in | 8 ++---
src/ports/postgres/modules/graph/wcc.py_in | 4 ---
src/ports/postgres/modules/graph/wcc.sql_in | 4 +--
.../postgres/modules/mxgboost/madlib_xgboost.py_in | 9 +++---
18 files changed, 49 insertions(+), 74 deletions(-)
diff --git a/cmake/TestIfNoUTF8BOM.py b/cmake/TestIfNoUTF8BOM.py
index ce9b7b02..1a2824f7 100755
--- a/cmake/TestIfNoUTF8BOM.py
+++ b/cmake/TestIfNoUTF8BOM.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python3
+#!/usr/bin/env python
import sys
def detectBOM(inFileName):
diff --git a/deploy/gppkg/madlib.spec.in b/deploy/gppkg/madlib.spec.in
index 78fdb908..b71bf98a 100644
--- a/deploy/gppkg/madlib.spec.in
+++ b/deploy/gppkg/madlib.spec.in
@@ -1,3 +1,4 @@
+%define _build_id_links none
%define _topdir @CMAKE_CURRENT_BINARY_DIR@/@GPDB_VERSION@
%define __os_install_post %{nil}
%define _rpmfilename @MADLIB_GPPKG_RPM_FILE_NAME@
diff --git a/deploy/madlib.spec.in b/deploy/madlib.spec.in
index 8be7c823..a92ebd5f 100644
--- a/deploy/madlib.spec.in
+++ b/deploy/madlib.spec.in
@@ -1,4 +1,5 @@
# -*- rpm-spec -*-
+%define _build_id_links none
%define _rpmdir @CPACK_RPM_DIRECTORY@
%define _rpmfilename @CPACK_RPM_FILE_NAME@
%define _unpackaged_files_terminate_build 0
diff --git a/methods/kmeans/src/pg_gp/kmeans.c b/methods/kmeans/src/pg_gp/kmeans.c
index d74f65b9..a9d0cc0d 100644
--- a/methods/kmeans/src/pg_gp/kmeans.c
+++ b/methods/kmeans/src/pg_gp/kmeans.c
@@ -86,7 +86,12 @@ compute_metric(PGFunction inMetricFn, MemoryContext inMemContext, Datum inVec1,
* The 50k bound here is arbitrary, and motivated by ResetExprContext()
* in execUtils.c
*/
+
+#if GP_VERSION_NUM >= 70000
+ if(inMemContext->mem_allocated > 50000)
+#else
if(inMemContext->allBytesAlloc - inMemContext->allBytesFreed > 50000)
+#endif
MemoryContextReset(inMemContext);
#else
/* PostgreSQL does not have the allBytesAlloc and allBytesFreed fields */
diff --git a/src/madpack/argparse.py b/src/madpack/argparse.py
index f006fd48..3a812c2a 100644
--- a/src/madpack/argparse.py
+++ b/src/madpack/argparse.py
@@ -90,29 +90,6 @@ import textwrap as _textwrap
from gettext import gettext as _
-try:
- set
-except NameError:
- # for python < 2.4 compatibility (sets module is there since 2.3):
- from sets import Set as set
-
-try:
- str
-except NameError:
- str = str
-
-try:
- sorted
-except NameError:
- # for python < 2.4 compatibility:
- def sorted(iterable, reverse=False):
- result = list(iterable)
- result.sort()
- if reverse:
- result.reverse()
- return result
-
-
def _callable(obj):
return hasattr(obj, '__call__') or hasattr(obj, '__bases__')
diff --git a/src/madpack/madpack.py b/src/madpack/madpack.py
index c662ae89..f6743483 100755
--- a/src/madpack/madpack.py
+++ b/src/madpack/madpack.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python3
+#!/usr/bin/env python
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Main Madpack installation executable.
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
@@ -27,13 +27,6 @@ from utilities import run_query
# Required Python version
py_min_ver = [2, 6]
-# XXX py3 Check python version
-#if sys.version_info[:2] < py_min_ver:
-# print(("ERROR: python version too old ({0}). You need {1} or greater.".
-# format('.'.join(map(str, sys.version_info[:3])),
-# '.'.join(map(str, py_min_ver)))))
-# exit(1)
-
# Find MADlib root directory. This file is installed to
# $MADLIB_ROOT/madpack/madpack.py, so to get $MADLIB_ROOT we need to go
# two levels up in the directory hierarchy. We use (a) os.path.realpath and
@@ -55,7 +48,6 @@ maddir_lib = "libmadlib.so" # C/C++ libraries
# Read the config files
ports = configyml.get_ports(maddir_conf) # object made of Ports.yml
-# XXX py3
new_madlib_ver = configyml.get_version(maddir_conf) # MADlib OS-level version
portid_list = []
for port in ports:
@@ -397,14 +389,11 @@ def _plpy_check(py_min_ver):
info_(this, "> PL/Python not installed", verbose)
info_(this, "> Creating language PL/Python...", True)
try:
- # XXX py3
_internal_run_query("CREATE LANGUAGE plpython3u;", True)
except:
error_(this, """Cannot create language plpython3u. Please check if you
have configured and installed portid (your platform) with
`--with-python` option. Stopping installation...""", False)
- # XXX py3
- #raise Exception
# Check PL/Python version
_internal_run_query("DROP FUNCTION IF EXISTS plpy_version_for_madlib();", False)
@@ -647,9 +636,6 @@ def _process_py_sql_files_in_modules(modset, args_dict):
else:
maddir_mod_py = maddir + "/modules"
- ### XXX PY3
- # info_(this, "\ncalling_operation: %s, %s" % (calling_operation, maddir_mod_py), verbose)
-
# Find the SQL module dir (platform specific or generic)
if os.path.isdir(maddir + "/ports/" + portid + "/modules/" + module):
maddir_mod_sql = maddir + "/ports/" + portid + "/modules"
@@ -762,16 +748,16 @@ def _execute_per_module_unit_test_algo(module, pyfile, cur_tmpdir):
# Run the python unit test file
runcmd = ["python3", pyfile]
# runenv = os.environ
- # export LD_LIBRARY_PATH="/usr/local/greenplum-db-devel/ext/python3.9/lib:$LD_LIBRARY_PATH"
- # export PATH="/usr/local/greenplum-db-devel/ext/python3.9/bin:$PATH"
- # export PYTHONHOME=/usr/local/greenplum-db-devel/ext/python3.9
- # export PYTHONPATH=/usr/local/greenplum-db-devel/ext/python3.9/lib
runenv = os.environ.copy()
- gphome = runenv["GPHOME"]
- runenv["LD_LIBRARY_PATH"] = "{0}/ext/python3.9/lib:".format(gphome) + runenv["LD_LIBRARY_PATH"]
- runenv["PATH"] = "{0}/ext/python3.9/bin:".format(gphome) + runenv["PATH"]
- runenv["PYTHONHOME"] = "{0}/ext/python3.9".format(gphome)
- runenv["PYTHONPATH"] = "{0}/ext/python3.9/lib".format(gphome)
+
+ # GPDB6 python3 support is provided by an additional package.
+ # To access it, we will have to set environment variables.
+ if dbver == '6':
+ gphome = runenv["GPHOME"]
+ runenv["LD_LIBRARY_PATH"] = "{0}/ext/python3.9/lib:".format(gphome) + runenv["LD_LIBRARY_PATH"]
+ runenv["PATH"] = "{0}/ext/python3.9/bin:".format(gphome) + runenv["PATH"]
+ runenv["PYTHONHOME"] = "{0}/ext/python3.9".format(gphome)
+ runenv["PYTHONPATH"] = "{0}/ext/python3.9/lib".format(gphome)
retval = subprocess.call(runcmd, env=runenv, stdout=log, stderr=log)
run_end = datetime.datetime.now()
milliseconds = round((run_end - run_start).seconds * 1000 +
@@ -1322,7 +1308,7 @@ def set_dynamic_library_path_in_database(dbver_split, madlib_library_path):
global dynamic_library_path
dynamic_library_path = _internal_run_query("SHOW dynamic_library_path", True)[0]['dynamic_library_path']
- # PG7 gpconfig messes up $libdir so we remove it for now
+ # GP7 gpconfig messes up $libdir so we remove it for now
paths = dynamic_library_path.split(":")
if madlib_library_path not in paths:
if '$libdir' in paths:
diff --git a/src/madpack/utilities.py b/src/madpack/utilities.py
index 04f6f9ba..6e9be610 100644
--- a/src/madpack/utilities.py
+++ b/src/madpack/utilities.py
@@ -118,7 +118,6 @@ def run_query(sql, con_args, show_error=True):
if err:
if show_error:
error_("SQL command failed: \nSQL: %s \n%s" % (sql, err), False)
- # XXX py3
if 'password' in err.decode():
raise EnvironmentError
else:
@@ -128,7 +127,6 @@ def run_query(sql, con_args, show_error=True):
results = [] # list of rows
i = 0
for line in std.splitlines():
- # XXX py3
line = line.decode()
if i == 0:
cols = [name for name in line.split(delimiter)]
diff --git a/src/ports/greenplum/dbconnector/dbconnector.hpp b/src/ports/greenplum/dbconnector/dbconnector.hpp
index 9c38ef66..4a71a1d1 100644
--- a/src/ports/greenplum/dbconnector/dbconnector.hpp
+++ b/src/ports/greenplum/dbconnector/dbconnector.hpp
@@ -22,6 +22,9 @@ extern "C" {
#include <utils/acl.h>
#include <utils/array.h>
#include <utils/builtins.h> // needed for format_procedure()
+#if GP_VERSION_NUM >= 70000
+ #include <utils/regproc.h> // needed for format_procedure()
+#endif
#include <utils/datum.h>
#include <utils/lsyscache.h> // for type lookup, e.g., type_is_rowtype
#include <utils/memutils.h>
diff --git a/src/ports/postgres/dbconnector/SystemInformation_impl.hpp b/src/ports/postgres/dbconnector/SystemInformation_impl.hpp
index f7190b45..979da045 100644
--- a/src/ports/postgres/dbconnector/SystemInformation_impl.hpp
+++ b/src/ports/postgres/dbconnector/SystemInformation_impl.hpp
@@ -4,6 +4,13 @@
*
*//* ----------------------------------------------------------------------- */
+#if GP_VERSION_NUM >= 70000
+extern "C"{
+ #include <common/hashfn.h>
+ extern uint32 uint32_hash(const void *key, Size keysize);
+}
+#endif
+
#ifndef MADLIB_POSTGRES_SYSTEMINFORMATION_IMPL_HPP
#define MADLIB_POSTGRES_SYSTEMINFORMATION_IMPL_HPP
@@ -27,7 +34,11 @@ initializeOidHashTable(HTAB*& ioHashTable, MemoryContext inCacheContext,
HASHCTL ctl;
ctl.keysize = sizeof(Oid);
ctl.entrysize = inEntrySize;
+#if GP_VERSION_NUM >= 70000
+ ctl.hash = uint32_hash;
+#else
ctl.hash = oid_hash;
+#endif
ctl.hcxt = inCacheContext;
ioHashTable = madlib_hash_create(
/* tabname -- a name for the table (for debugging purposes) */
diff --git a/src/ports/postgres/dbconnector/dbconnector.hpp b/src/ports/postgres/dbconnector/dbconnector.hpp
index 76eaf8ab..f5a3b0f2 100644
--- a/src/ports/postgres/dbconnector/dbconnector.hpp
+++ b/src/ports/postgres/dbconnector/dbconnector.hpp
@@ -19,7 +19,6 @@
// Since we don't need anything from ports.h we can cheat and say its already been declared.
// Warning: This could cause problems in the future...
#define PG_PORT_H
-
extern "C" {
#include <postgres.h>
#include <pg_config.h> // Use the macro defined in the header to detect the platform
@@ -31,9 +30,7 @@ extern "C" {
#include <utils/acl.h>
#include <utils/array.h>
#include <utils/builtins.h> // needed for format_procedure()
-#if PG_VERSION_NUM >= 100000
#include <utils/regproc.h> // needed for format_procedure() - PostgreSQL 10
-#endif
#include <utils/datum.h>
#include <utils/lsyscache.h> // for type lookup, e.g., type_is_rowtype
#include <utils/memutils.h>
diff --git a/src/ports/postgres/modules/dbscan/dbscan.py_in b/src/ports/postgres/modules/dbscan/dbscan.py_in
index 8f5a740a..ae356a88 100644
--- a/src/ports/postgres/modules/dbscan/dbscan.py_in
+++ b/src/ports/postgres/modules/dbscan/dbscan.py_in
@@ -1290,6 +1290,8 @@ def _dbscan_leaf(db_rec, eps, min_samples, metric, num_internal_points,
"""
global pstats
+ num_points = num_internal_points + num_external_points
+
db_rec = dbscan_record.from_dict(db_rec, eps)
id = db_rec.id
dist_id = db_rec.dist_id
diff --git a/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in
index f0905f1d..88648cc6 100644
--- a/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in
+++ b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in
@@ -24,7 +24,6 @@
"""
from math import ceil
import plpy
-import time
from internal.db_utils import get_distinct_col_levels
from internal.db_utils import quote_literal
@@ -52,12 +51,10 @@ from deep_learning.madlib_keras_helper import *
NUM_CLASSES_COLNAME = "num_classes"
-
class DistributionRulesOptions:
ALL_SEGMENTS = 'all_segments'
GPU_SEGMENTS = 'gpu_segments'
-
class InputDataPreprocessorDL(object):
def __init__(self, schema_madlib, source_table, output_table,
dependent_varname, independent_varname, buffer_size,
@@ -287,7 +284,7 @@ class InputDataPreprocessorDL(object):
{self.schema_madlib}.array_to_bytea({i}) AS {i}
""".format(**locals()))
- for i, j in zip(self.dependent_varname, dep_shape):
+ for i,j in zip(self.dependent_varname, dep_shape):
concat_sql.append("""
{self.schema_madlib}.agg_array_concat(ARRAY[{i}]) AS {i}
""".format(**locals()))
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
index 2fa02c51..5a99f129 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
@@ -250,6 +250,9 @@ def fit(schema_madlib, source_table, model, model_arch_table,
[serialized_weights, custom_function_map]
)[0]['iteration_result']
except plpy.SPIError as e:
+ # FIXME:
+ # The following message parsing doesn't work since python3 exception
+ # implementation is different.
# msg = e.args[0]
# if 'TransAggDetail' in msg:
# e.args[0], detail = msg.split('TransAggDetail')
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_custom_function.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_custom_function.py_in
index 557f616c..549b0c7c 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_custom_function.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_custom_function.py_in
@@ -33,7 +33,6 @@ from utilities.validate_args import table_exists
module_name = 'Keras Custom Function'
-
class CustomFunctionSchema:
"""Expected format of custom function table.
Example uses:
diff --git a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
index c59b47a6..ab8dc383 100644
--- a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
+++ b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
@@ -676,7 +676,7 @@ class MadlibKerasFitEvalTransitionTestCase(unittest.TestCase):
merged_state = self.subject.fit_merge(state1.tostring(),state2.tostring())
state = np.fromstring(merged_state, dtype=np.float32)
image_count_total = state[0]
- weights = np.rint(state[1:]).astype(np.int)
+ weights = np.rint(state[1:]).astype(int)
self.assertEqual( 2*image_count+30 , image_count_total )
self.assertTrue( (mult(5,self.model_weights) == weights).all())
@@ -689,7 +689,7 @@ class MadlibKerasFitEvalTransitionTestCase(unittest.TestCase):
merged_state = self.subject.fit_merge(None, input_state.tostring())
state = np.fromstring(merged_state, dtype=np.float32)
image_count_total = state[0]
- weights = np.rint(state[1:]).astype(np.int)
+ weights = np.rint(state[1:]).astype(int)
self.assertEqual(image_count, image_count_total)
self.assertTrue((self.model_weights == weights).all())
@@ -702,7 +702,7 @@ class MadlibKerasFitEvalTransitionTestCase(unittest.TestCase):
merged_state = self.subject.fit_merge(input_state.tostring(), None)
state = np.fromstring(merged_state, dtype=np.float32)
image_count_total = state[0]
- weights = np.rint(state[1:]).astype(np.int)
+ weights = np.rint(state[1:]).astype(int)
self.assertEqual(image_count, image_count_total)
self.assertTrue((self.model_weights == weights).all())
@@ -719,7 +719,7 @@ class MadlibKerasFitEvalTransitionTestCase(unittest.TestCase):
output_state = self.subject.fit_final(input_state.tostring())
output_state = np.fromstring(output_state, dtype=np.float32)
- weights = np.rint(output_state).astype(np.int)
+ weights = np.rint(output_state).astype(int)
self.assertTrue((self.model_weights == weights).all())
diff --git a/src/ports/postgres/modules/graph/wcc.py_in b/src/ports/postgres/modules/graph/wcc.py_in
index 571bf617..13ef5970 100644
--- a/src/ports/postgres/modules/graph/wcc.py_in
+++ b/src/ports/postgres/modules/graph/wcc.py_in
@@ -442,11 +442,8 @@ def wcc(schema_madlib, vertex_table, vertex_id, edge_table, edge_args,
SELECT COUNT(*) AS cnt_sum FROM {toupdate};
"""
-<<<<<<< HEAD
iteration_counter = 0
while nodes_to_update > 0 and iteration_counter < iteration_limit:
-=======
- while nodes_to_update is not None and nodes_to_update > 0:
# Look at all the neighbors of a node, and assign the smallest node id
# among the neighbors as its component_id. The next table starts off
# with very high component_id (BIGINT_MAX). The component_id of all nodes
@@ -454,7 +451,6 @@ def wcc(schema_madlib, vertex_table, vertex_id, edge_table, edge_args,
# updated in the next table. At every iteration update only those nodes
# whose component_id in the previous iteration are greater than what was
# found in the current iteration.
->>>>>>> c8773b74 (Add python3 support)
with SetGUC("dev_opt_unsafe_truncate_in_subtransaction", "on"):
nodes_to_update = plpy.execute(loop_sql.format(**locals()))[0]["cnt_sum"]
diff --git a/src/ports/postgres/modules/graph/wcc.sql_in b/src/ports/postgres/modules/graph/wcc.sql_in
index 2d375f98..9d9b4802 100644
--- a/src/ports/postgres/modules/graph/wcc.sql_in
+++ b/src/ports/postgres/modules/graph/wcc.sql_in
@@ -619,7 +619,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.weakly_connected_components(
warm_start BOOLEAN
) RETURNS VOID AS $$
PythonFunction(graph, wcc, wcc)
-$$ LANGUAGE plpythonu VOLATILE
+$$ LANGUAGE plpython3u VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
-------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.weakly_connected_components(
@@ -632,7 +632,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.weakly_connected_components(
iteration_limit INTEGER
) RETURNS VOID AS $$
PythonFunction(graph, wcc, wcc)
-$$ LANGUAGE plpythonu VOLATILE
+$$ LANGUAGE plpython3u VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
-------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.weakly_connected_components(
diff --git a/src/ports/postgres/modules/mxgboost/madlib_xgboost.py_in b/src/ports/postgres/modules/mxgboost/madlib_xgboost.py_in
index ee57afdf..3abf8594 100644
--- a/src/ports/postgres/modules/mxgboost/madlib_xgboost.py_in
+++ b/src/ports/postgres/modules/mxgboost/madlib_xgboost.py_in
@@ -154,10 +154,10 @@ def xgboost_train(schema_madlib, dframe, features_all, class_label, params,
#save off and remove the id_column for later output. Make sure to get rid of id_column from features!
- test_ids = X_test [:,len(features)-1]
- X_train = numpy.delete(X_train,len(features)-1,1)
- X_test = numpy.delete(X_test,len(features)-1,1)
- features = features[0:len(features)-1]
+ test_ids = X_test[:,0]
+ X_train = numpy.delete(X_train,0,1)
+ X_test = numpy.delete(X_test,0,1)
+ features = features[1:len(features)]
class_list_y_train = numpy.unique(y_train).tolist()
class_list_y_test = numpy.unique(y_test).tolist()
@@ -165,7 +165,6 @@ def xgboost_train(schema_madlib, dframe, features_all, class_label, params,
if (class_list != class_list_y_train) or (class_list != class_list_y_test):
plpy.error("Train test split caused a subset with missing classes.")
-
#run weights
sample_representation = y_train.value_counts()
total_samples = sum(sample_representation)