You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ok...@apache.org on 2023/04/03 14:06:33 UTC

[madlib] 04/08: Various fixes: Add gpdb7 specific checks

This is an automated email from the ASF dual-hosted git repository.

okislal pushed a commit to branch madlib2-master
in repository https://gitbox.apache.org/repos/asf/madlib.git

commit 149363a17ce9f3675d617b6abeb861542309d2ea
Author: Orhan Kislal <ok...@apache.org>
AuthorDate: Mon Nov 21 16:43:44 2022 +0300

    Various fixes: Add gpdb7 specific checks
---
 cmake/TestIfNoUTF8BOM.py                           |  2 +-
 deploy/gppkg/madlib.spec.in                        |  1 +
 deploy/madlib.spec.in                              |  1 +
 methods/kmeans/src/pg_gp/kmeans.c                  |  5 +++
 src/madpack/argparse.py                            | 23 --------------
 src/madpack/madpack.py                             | 36 +++++++---------------
 src/madpack/utilities.py                           |  2 --
 src/ports/greenplum/dbconnector/dbconnector.hpp    |  3 ++
 .../dbconnector/SystemInformation_impl.hpp         | 11 +++++++
 src/ports/postgres/dbconnector/dbconnector.hpp     |  3 --
 src/ports/postgres/modules/dbscan/dbscan.py_in     |  2 ++
 .../deep_learning/input_data_preprocessor.py_in    |  5 +--
 .../modules/deep_learning/madlib_keras.py_in       |  3 ++
 .../madlib_keras_custom_function.py_in             |  1 -
 .../test/unit_tests/test_madlib_keras.py_in        |  8 ++---
 src/ports/postgres/modules/graph/wcc.py_in         |  4 ---
 src/ports/postgres/modules/graph/wcc.sql_in        |  4 +--
 .../postgres/modules/mxgboost/madlib_xgboost.py_in |  9 +++---
 18 files changed, 49 insertions(+), 74 deletions(-)

diff --git a/cmake/TestIfNoUTF8BOM.py b/cmake/TestIfNoUTF8BOM.py
index ce9b7b02..1a2824f7 100755
--- a/cmake/TestIfNoUTF8BOM.py
+++ b/cmake/TestIfNoUTF8BOM.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python3
+#!/usr/bin/env python
 import sys
 
 def detectBOM(inFileName):
diff --git a/deploy/gppkg/madlib.spec.in b/deploy/gppkg/madlib.spec.in
index 78fdb908..b71bf98a 100644
--- a/deploy/gppkg/madlib.spec.in
+++ b/deploy/gppkg/madlib.spec.in
@@ -1,3 +1,4 @@
+%define _build_id_links none
 %define _topdir           @CMAKE_CURRENT_BINARY_DIR@/@GPDB_VERSION@
 %define __os_install_post %{nil}
 %define _rpmfilename      @MADLIB_GPPKG_RPM_FILE_NAME@
diff --git a/deploy/madlib.spec.in b/deploy/madlib.spec.in
index 8be7c823..a92ebd5f 100644
--- a/deploy/madlib.spec.in
+++ b/deploy/madlib.spec.in
@@ -1,4 +1,5 @@
 # -*- rpm-spec -*-
+%define _build_id_links none
 %define _rpmdir @CPACK_RPM_DIRECTORY@
 %define _rpmfilename @CPACK_RPM_FILE_NAME@
 %define _unpackaged_files_terminate_build 0
diff --git a/methods/kmeans/src/pg_gp/kmeans.c b/methods/kmeans/src/pg_gp/kmeans.c
index d74f65b9..a9d0cc0d 100644
--- a/methods/kmeans/src/pg_gp/kmeans.c
+++ b/methods/kmeans/src/pg_gp/kmeans.c
@@ -86,7 +86,12 @@ compute_metric(PGFunction inMetricFn, MemoryContext inMemContext, Datum inVec1,
      * The 50k bound here is arbitrary, and motivated by ResetExprContext()
      * in execUtils.c
      */
+
+#if GP_VERSION_NUM >= 70000
+    if(inMemContext->mem_allocated > 50000)
+#else
     if(inMemContext->allBytesAlloc - inMemContext->allBytesFreed > 50000)
+#endif
         MemoryContextReset(inMemContext);
 #else
     /* PostgreSQL does not have the allBytesAlloc and allBytesFreed fields */
diff --git a/src/madpack/argparse.py b/src/madpack/argparse.py
index f006fd48..3a812c2a 100644
--- a/src/madpack/argparse.py
+++ b/src/madpack/argparse.py
@@ -90,29 +90,6 @@ import textwrap as _textwrap
 
 from gettext import gettext as _
 
-try:
-    set
-except NameError:
-    # for python < 2.4 compatibility (sets module is there since 2.3):
-    from sets import Set as set
-
-try:
-    str
-except NameError:
-    str = str
-
-try:
-    sorted
-except NameError:
-    # for python < 2.4 compatibility:
-    def sorted(iterable, reverse=False):
-        result = list(iterable)
-        result.sort()
-        if reverse:
-            result.reverse()
-        return result
-
-
 def _callable(obj):
     return hasattr(obj, '__call__') or hasattr(obj, '__bases__')
 
diff --git a/src/madpack/madpack.py b/src/madpack/madpack.py
index c662ae89..f6743483 100755
--- a/src/madpack/madpack.py
+++ b/src/madpack/madpack.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python3
+#!/usr/bin/env python
 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 # Main Madpack installation executable.
 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
@@ -27,13 +27,6 @@ from utilities import run_query
 # Required Python version
 py_min_ver = [2, 6]
 
-# XXX py3 Check python version
-#if sys.version_info[:2] < py_min_ver:
-#    print(("ERROR: python version too old ({0}). You need {1} or greater.".
-#          format('.'.join(map(str, sys.version_info[:3])),
-#                 '.'.join(map(str, py_min_ver)))))
-#    exit(1)
-
 # Find MADlib root directory. This file is installed to
 # $MADLIB_ROOT/madpack/madpack.py, so to get $MADLIB_ROOT we need to go
 # two levels up in the directory hierarchy. We use (a) os.path.realpath and
@@ -55,7 +48,6 @@ maddir_lib = "libmadlib.so"  # C/C++ libraries
 
 # Read the config files
 ports = configyml.get_ports(maddir_conf)  # object made of Ports.yml
-# XXX py3
 new_madlib_ver = configyml.get_version(maddir_conf)  # MADlib OS-level version
 portid_list = []
 for port in ports:
@@ -397,14 +389,11 @@ def _plpy_check(py_min_ver):
         info_(this, "> PL/Python not installed", verbose)
         info_(this, "> Creating language PL/Python...", True)
         try:
-            # XXX py3
             _internal_run_query("CREATE LANGUAGE plpython3u;", True)
         except:
             error_(this, """Cannot create language plpython3u. Please check if you
                 have configured and installed portid (your platform) with
                 `--with-python` option. Stopping installation...""", False)
-            # XXX py3
-            #raise Exception
 
     # Check PL/Python version
     _internal_run_query("DROP FUNCTION IF EXISTS plpy_version_for_madlib();", False)
@@ -647,9 +636,6 @@ def _process_py_sql_files_in_modules(modset, args_dict):
         else:
             maddir_mod_py = maddir + "/modules"
 
-        ### XXX PY3
-        # info_(this, "\ncalling_operation: %s, %s" % (calling_operation, maddir_mod_py), verbose)
-
         # Find the SQL module dir (platform specific or generic)
         if os.path.isdir(maddir + "/ports/" + portid + "/modules/" + module):
             maddir_mod_sql = maddir + "/ports/" + portid + "/modules"
@@ -762,16 +748,16 @@ def _execute_per_module_unit_test_algo(module, pyfile, cur_tmpdir):
         # Run the python unit test file
         runcmd = ["python3", pyfile]
         # runenv = os.environ
-        # export LD_LIBRARY_PATH="/usr/local/greenplum-db-devel/ext/python3.9/lib:$LD_LIBRARY_PATH"
-        # export PATH="/usr/local/greenplum-db-devel/ext/python3.9/bin:$PATH"
-        # export PYTHONHOME=/usr/local/greenplum-db-devel/ext/python3.9
-        # export PYTHONPATH=/usr/local/greenplum-db-devel/ext/python3.9/lib
         runenv = os.environ.copy()
-        gphome = runenv["GPHOME"]
-        runenv["LD_LIBRARY_PATH"] = "{0}/ext/python3.9/lib:".format(gphome) + runenv["LD_LIBRARY_PATH"]
-        runenv["PATH"] = "{0}/ext/python3.9/bin:".format(gphome) + runenv["PATH"]
-        runenv["PYTHONHOME"] = "{0}/ext/python3.9".format(gphome)
-        runenv["PYTHONPATH"] = "{0}/ext/python3.9/lib".format(gphome)
+
+        # GPDB6 python3 support is provided by an additional package.
+        # To access it, we will have to set environment variables.
+        if dbver == '6':
+            gphome = runenv["GPHOME"]
+            runenv["LD_LIBRARY_PATH"] = "{0}/ext/python3.9/lib:".format(gphome) + runenv["LD_LIBRARY_PATH"]
+            runenv["PATH"] = "{0}/ext/python3.9/bin:".format(gphome) + runenv["PATH"]
+            runenv["PYTHONHOME"] = "{0}/ext/python3.9".format(gphome)
+            runenv["PYTHONPATH"] = "{0}/ext/python3.9/lib".format(gphome)
         retval = subprocess.call(runcmd, env=runenv, stdout=log, stderr=log)
         run_end = datetime.datetime.now()
         milliseconds = round((run_end - run_start).seconds * 1000 +
@@ -1322,7 +1308,7 @@ def set_dynamic_library_path_in_database(dbver_split, madlib_library_path):
 
     global dynamic_library_path
     dynamic_library_path = _internal_run_query("SHOW dynamic_library_path", True)[0]['dynamic_library_path']
-    # PG7 gpconfig messes up $libdir so we remove it for now
+    # GP7 gpconfig messes up $libdir so we remove it for now
     paths = dynamic_library_path.split(":")
     if madlib_library_path not in paths:
         if '$libdir' in paths:
diff --git a/src/madpack/utilities.py b/src/madpack/utilities.py
index 04f6f9ba..6e9be610 100644
--- a/src/madpack/utilities.py
+++ b/src/madpack/utilities.py
@@ -118,7 +118,6 @@ def run_query(sql, con_args, show_error=True):
     if err:
         if show_error:
             error_("SQL command failed: \nSQL: %s \n%s" % (sql, err), False)
-        # XXX py3
         if 'password' in err.decode():
             raise EnvironmentError
         else:
@@ -128,7 +127,6 @@ def run_query(sql, con_args, show_error=True):
     results = []  # list of rows
     i = 0
     for line in std.splitlines():
-        # XXX py3
         line = line.decode()
         if i == 0:
             cols = [name for name in line.split(delimiter)]
diff --git a/src/ports/greenplum/dbconnector/dbconnector.hpp b/src/ports/greenplum/dbconnector/dbconnector.hpp
index 9c38ef66..4a71a1d1 100644
--- a/src/ports/greenplum/dbconnector/dbconnector.hpp
+++ b/src/ports/greenplum/dbconnector/dbconnector.hpp
@@ -22,6 +22,9 @@ extern "C" {
     #include <utils/acl.h>
     #include <utils/array.h>
     #include <utils/builtins.h>    // needed for format_procedure()
+#if GP_VERSION_NUM >= 70000
+    #include <utils/regproc.h>    // needed for format_procedure()
+#endif
     #include <utils/datum.h>
     #include <utils/lsyscache.h>   // for type lookup, e.g., type_is_rowtype
     #include <utils/memutils.h>
diff --git a/src/ports/postgres/dbconnector/SystemInformation_impl.hpp b/src/ports/postgres/dbconnector/SystemInformation_impl.hpp
index f7190b45..979da045 100644
--- a/src/ports/postgres/dbconnector/SystemInformation_impl.hpp
+++ b/src/ports/postgres/dbconnector/SystemInformation_impl.hpp
@@ -4,6 +4,13 @@
  *
  *//* ----------------------------------------------------------------------- */
 
+#if GP_VERSION_NUM >= 70000
+extern "C"{
+    #include <common/hashfn.h>
+    extern uint32 uint32_hash(const void *key, Size keysize);
+}
+#endif
+
 #ifndef MADLIB_POSTGRES_SYSTEMINFORMATION_IMPL_HPP
 #define MADLIB_POSTGRES_SYSTEMINFORMATION_IMPL_HPP
 
@@ -27,7 +34,11 @@ initializeOidHashTable(HTAB*& ioHashTable, MemoryContext inCacheContext,
         HASHCTL ctl;
         ctl.keysize = sizeof(Oid);
         ctl.entrysize = inEntrySize;
+#if GP_VERSION_NUM >= 70000
+        ctl.hash = uint32_hash;
+#else
         ctl.hash = oid_hash;
+#endif
         ctl.hcxt = inCacheContext;
         ioHashTable = madlib_hash_create(
             /* tabname -- a name for the table (for debugging purposes) */
diff --git a/src/ports/postgres/dbconnector/dbconnector.hpp b/src/ports/postgres/dbconnector/dbconnector.hpp
index 76eaf8ab..f5a3b0f2 100644
--- a/src/ports/postgres/dbconnector/dbconnector.hpp
+++ b/src/ports/postgres/dbconnector/dbconnector.hpp
@@ -19,7 +19,6 @@
 // Since we don't need anything from ports.h we can cheat and say its already been declared.
 // Warning: This could cause problems in the future...
 #define PG_PORT_H
-
 extern "C" {
     #include <postgres.h>
     #include <pg_config.h>         // Use the macro defined in the header to detect the platform
@@ -31,9 +30,7 @@ extern "C" {
     #include <utils/acl.h>
     #include <utils/array.h>
     #include <utils/builtins.h>    // needed for format_procedure()
-#if PG_VERSION_NUM >= 100000
     #include <utils/regproc.h>     // needed for format_procedure() - PostgreSQL 10
-#endif
     #include <utils/datum.h>
     #include <utils/lsyscache.h>   // for type lookup, e.g., type_is_rowtype
     #include <utils/memutils.h>
diff --git a/src/ports/postgres/modules/dbscan/dbscan.py_in b/src/ports/postgres/modules/dbscan/dbscan.py_in
index 8f5a740a..ae356a88 100644
--- a/src/ports/postgres/modules/dbscan/dbscan.py_in
+++ b/src/ports/postgres/modules/dbscan/dbscan.py_in
@@ -1290,6 +1290,8 @@ def _dbscan_leaf(db_rec, eps, min_samples, metric, num_internal_points,
     """
     global pstats
 
+    num_points = num_internal_points + num_external_points
+
     db_rec = dbscan_record.from_dict(db_rec, eps)
     id = db_rec.id
     dist_id = db_rec.dist_id
diff --git a/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in
index f0905f1d..88648cc6 100644
--- a/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in
+++ b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in
@@ -24,7 +24,6 @@
 """
 from math import ceil
 import plpy
-import time
 
 from internal.db_utils import get_distinct_col_levels
 from internal.db_utils import quote_literal
@@ -52,12 +51,10 @@ from deep_learning.madlib_keras_helper import *
 
 NUM_CLASSES_COLNAME = "num_classes"
 
-
 class DistributionRulesOptions:
     ALL_SEGMENTS = 'all_segments'
     GPU_SEGMENTS = 'gpu_segments'
 
-
 class InputDataPreprocessorDL(object):
     def __init__(self, schema_madlib, source_table, output_table,
                  dependent_varname, independent_varname, buffer_size,
@@ -287,7 +284,7 @@ class InputDataPreprocessorDL(object):
                 {self.schema_madlib}.array_to_bytea({i}) AS {i}
                 """.format(**locals()))
 
-        for i, j in zip(self.dependent_varname, dep_shape):
+        for i,j in zip(self.dependent_varname, dep_shape):
             concat_sql.append("""
                 {self.schema_madlib}.agg_array_concat(ARRAY[{i}]) AS {i}
                 """.format(**locals()))
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
index 2fa02c51..5a99f129 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
@@ -250,6 +250,9 @@ def fit(schema_madlib, source_table, model, model_arch_table,
                                             [serialized_weights, custom_function_map]
                                             )[0]['iteration_result']
         except plpy.SPIError as e:
+            # FIXME:
+            # The following message parsing doesn't work since python3 exception
+            # implementation is different.
             # msg = e.args[0]
             # if 'TransAggDetail' in msg:
             #     e.args[0], detail = msg.split('TransAggDetail')
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_custom_function.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_custom_function.py_in
index 557f616c..549b0c7c 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_custom_function.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_custom_function.py_in
@@ -33,7 +33,6 @@ from utilities.validate_args import table_exists
 
 module_name = 'Keras Custom Function'
 
-
 class CustomFunctionSchema:
     """Expected format of custom function table.
        Example uses:
diff --git a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
index c59b47a6..ab8dc383 100644
--- a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
+++ b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
@@ -676,7 +676,7 @@ class MadlibKerasFitEvalTransitionTestCase(unittest.TestCase):
         merged_state = self.subject.fit_merge(state1.tostring(),state2.tostring())
         state = np.fromstring(merged_state, dtype=np.float32)
         image_count_total = state[0]
-        weights = np.rint(state[1:]).astype(np.int)
+        weights = np.rint(state[1:]).astype(int)
 
         self.assertEqual( 2*image_count+30 , image_count_total )
         self.assertTrue( (mult(5,self.model_weights) == weights).all())
@@ -689,7 +689,7 @@ class MadlibKerasFitEvalTransitionTestCase(unittest.TestCase):
         merged_state = self.subject.fit_merge(None, input_state.tostring())
         state = np.fromstring(merged_state, dtype=np.float32)
         image_count_total = state[0]
-        weights = np.rint(state[1:]).astype(np.int)
+        weights = np.rint(state[1:]).astype(int)
 
         self.assertEqual(image_count, image_count_total)
         self.assertTrue((self.model_weights == weights).all())
@@ -702,7 +702,7 @@ class MadlibKerasFitEvalTransitionTestCase(unittest.TestCase):
         merged_state = self.subject.fit_merge(input_state.tostring(), None)
         state = np.fromstring(merged_state, dtype=np.float32)
         image_count_total = state[0]
-        weights = np.rint(state[1:]).astype(np.int)
+        weights = np.rint(state[1:]).astype(int)
 
         self.assertEqual(image_count, image_count_total)
         self.assertTrue((self.model_weights == weights).all())
@@ -719,7 +719,7 @@ class MadlibKerasFitEvalTransitionTestCase(unittest.TestCase):
 
         output_state = self.subject.fit_final(input_state.tostring())
         output_state = np.fromstring(output_state, dtype=np.float32)
-        weights = np.rint(output_state).astype(np.int)
+        weights = np.rint(output_state).astype(int)
 
         self.assertTrue((self.model_weights == weights).all())
 
diff --git a/src/ports/postgres/modules/graph/wcc.py_in b/src/ports/postgres/modules/graph/wcc.py_in
index 571bf617..13ef5970 100644
--- a/src/ports/postgres/modules/graph/wcc.py_in
+++ b/src/ports/postgres/modules/graph/wcc.py_in
@@ -442,11 +442,8 @@ def wcc(schema_madlib, vertex_table, vertex_id, edge_table, edge_args,
 
         SELECT COUNT(*) AS cnt_sum FROM {toupdate};
     """
-<<<<<<< HEAD
     iteration_counter = 0
     while nodes_to_update > 0 and iteration_counter < iteration_limit:
-=======
-    while nodes_to_update is not None and nodes_to_update > 0:
         # Look at all the neighbors of a node, and assign the smallest node id
         # among the neighbors as its component_id. The next table starts off
         # with very high component_id (BIGINT_MAX). The component_id of all nodes
@@ -454,7 +451,6 @@ def wcc(schema_madlib, vertex_table, vertex_id, edge_table, edge_args,
         # updated in the next table. At every iteration update only those nodes
         # whose component_id in the previous iteration are greater than what was
         # found in the current iteration.
->>>>>>> c8773b74 (Add python3 support)
         with SetGUC("dev_opt_unsafe_truncate_in_subtransaction", "on"):
 
             nodes_to_update = plpy.execute(loop_sql.format(**locals()))[0]["cnt_sum"]
diff --git a/src/ports/postgres/modules/graph/wcc.sql_in b/src/ports/postgres/modules/graph/wcc.sql_in
index 2d375f98..9d9b4802 100644
--- a/src/ports/postgres/modules/graph/wcc.sql_in
+++ b/src/ports/postgres/modules/graph/wcc.sql_in
@@ -619,7 +619,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.weakly_connected_components(
     warm_start              BOOLEAN
 ) RETURNS VOID AS $$
     PythonFunction(graph, wcc, wcc)
-$$ LANGUAGE plpythonu VOLATILE
+$$ LANGUAGE plpython3u VOLATILE
 m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
 -------------------------------------------------------------------------
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.weakly_connected_components(
@@ -632,7 +632,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.weakly_connected_components(
     iteration_limit         INTEGER
 ) RETURNS VOID AS $$
     PythonFunction(graph, wcc, wcc)
-$$ LANGUAGE plpythonu VOLATILE
+$$ LANGUAGE plpython3u VOLATILE
 m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
 -------------------------------------------------------------------------
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.weakly_connected_components(
diff --git a/src/ports/postgres/modules/mxgboost/madlib_xgboost.py_in b/src/ports/postgres/modules/mxgboost/madlib_xgboost.py_in
index ee57afdf..3abf8594 100644
--- a/src/ports/postgres/modules/mxgboost/madlib_xgboost.py_in
+++ b/src/ports/postgres/modules/mxgboost/madlib_xgboost.py_in
@@ -154,10 +154,10 @@ def xgboost_train(schema_madlib, dframe, features_all, class_label, params,
 
     #save off and remove the id_column for later output. Make sure to get rid of id_column from features!
 
-    test_ids = X_test [:,len(features)-1]
-    X_train = numpy.delete(X_train,len(features)-1,1)
-    X_test = numpy.delete(X_test,len(features)-1,1)
-    features = features[0:len(features)-1]
+    test_ids = X_test[:,0]
+    X_train = numpy.delete(X_train,0,1)
+    X_test = numpy.delete(X_test,0,1)
+    features = features[1:len(features)]
 
     class_list_y_train = numpy.unique(y_train).tolist()
     class_list_y_test = numpy.unique(y_test).tolist()
@@ -165,7 +165,6 @@ def xgboost_train(schema_madlib, dframe, features_all, class_label, params,
     if (class_list != class_list_y_train) or (class_list != class_list_y_test):
         plpy.error("Train test split caused a subset with missing classes.")
 
-
     #run weights
     sample_representation = y_train.value_counts()
     total_samples = sum(sample_representation)