You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hawq.apache.org by nh...@apache.org on 2016/01/11 23:07:34 UTC
incubator-hawq git commit: HAWQ-324. Add minirepro management utility
Repository: incubator-hawq
Updated Branches:
refs/heads/master 127cac3e2 -> bee013732
HAWQ-324. Add minirepro management utility
Project: http://git-wip-us.apache.org/repos/asf/incubator-hawq/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hawq/commit/bee01373
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hawq/tree/bee01373
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hawq/diff/bee01373
Branch: refs/heads/master
Commit: bee0137323c275a634ff7104a4c5640a4e6114a1
Parents: 127cac3
Author: Haisheng Yuan <hy...@pivotal.io>
Authored: Thu Jan 7 13:33:02 2016 -0800
Committer: Haisheng Yuan <hy...@pivotal.io>
Committed: Mon Jan 11 14:05:03 2016 -0800
----------------------------------------------------------------------
src/backend/catalog/Makefile | 2 +
src/backend/catalog/gp_toolkit.sql.in | 20 ++
src/bin/Makefile | 2 +-
src/bin/gpoptutils/Makefile | 12 +
src/bin/gpoptutils/gpoptutils.c | 140 ++++++++++++
tools/bin/minirepro | 343 +++++++++++++++++++++++++++++
6 files changed, 518 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/bee01373/src/backend/catalog/Makefile
----------------------------------------------------------------------
diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile
index e066cff..54d26cb 100644
--- a/src/backend/catalog/Makefile
+++ b/src/backend/catalog/Makefile
@@ -51,6 +51,7 @@ JETPACK_PREFIX = $(JETPACK_SCHEMA).$(JETPACK_OBJECT_PREFIX)
JETPACK_PREFIX_AUX = $(JETPACK_SCHEMA).__$(JETPACK_OBJECT_PREFIX)
WORKFILE_PATHNAME = $$libdir\/gp_workfile_mgr
SESSION_STATE_PATHNAME = $$libdir\/gp_session_state
+OPTUTILS_PATHNAME = $$libdir\/gpoptutils
gp_toolkit.sql: gp_toolkit.sql.in
sed \
@@ -60,6 +61,7 @@ gp_toolkit.sql: gp_toolkit.sql.in
-e 's/%%JETPACK_PREFIX_AUX%%/$(JETPACK_PREFIX_AUX)/g' \
-e 's/%%WORKFILE_MODULE%%/$(WORKFILE_PATHNAME)/g' \
-e 's/%%SESSION_STATE_MODULE%%/$(SESSION_STATE_PATHNAME)/g' \
+ -e 's/%%OPTUTILS_MODULE%%/$(OPTUTILS_PATHNAME)/g' \
-e 's/ / /g' \
$< > $@
http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/bee01373/src/backend/catalog/gp_toolkit.sql.in
----------------------------------------------------------------------
diff --git a/src/backend/catalog/gp_toolkit.sql.in b/src/backend/catalog/gp_toolkit.sql.in
index 86269e5..ccd3212 100644
--- a/src/backend/catalog/gp_toolkit.sql.in
+++ b/src/backend/catalog/gp_toolkit.sql.in
@@ -2057,6 +2057,26 @@ LANGUAGE SQL;
REVOKE ALL ON FUNCTION %%JETPACK_PREFIX%%workfile_cache_clear() FROM public;
--------------------------------------------------------------------------------
+-- @function:
+-- %%JETPACK_PREFIX%%dump_query_oids(text)
+--
+-- @in:
+-- text - SQL text
+-- @out:
+-- text - serialized json string of oids
+--
+-- @doc:
+-- Dump query oids for a given SQL text
+--
+--------------------------------------------------------------------------------
+
+CREATE FUNCTION %%JETPACK_PREFIX%%dump_query_oids(text)
+RETURNS text
+AS '%%OPTUTILS_MODULE%%', 'gp_dump_query_oids' LANGUAGE C STRICT;
+
+GRANT EXECUTE ON FUNCTION %%JETPACK_PREFIX%%dump_query_oids(text) TO public;
+
+--------------------------------------------------------------------------------
-- Finalize install
COMMIT;
http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/bee01373/src/bin/Makefile
----------------------------------------------------------------------
diff --git a/src/bin/Makefile b/src/bin/Makefile
index 9d4919d..04d412d 100644
--- a/src/bin/Makefile
+++ b/src/bin/Makefile
@@ -16,7 +16,7 @@ include $(top_builddir)/src/Makefile.global
DIRS = initdb ipcclean pg_ctl pg_dump pgbench \
psql scripts pg_config pg_controldata pg_resetxlog \
gpfilesystem/hdfs gpmirrortransition gpupgrade \
- gpfusion gp_workfile_mgr gpcheckhdfs gpfdist
+ gpfusion gp_workfile_mgr gpcheckhdfs gpfdist gpoptutils
all install installdirs uninstall distprep clean distclean maintainer-clean:
@for dir in $(DIRS); do echo "build $$dir"; $(MAKE) -C $$dir $@ || exit; done
http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/bee01373/src/bin/gpoptutils/Makefile
----------------------------------------------------------------------
diff --git a/src/bin/gpoptutils/Makefile b/src/bin/gpoptutils/Makefile
new file mode 100644
index 0000000..3cc8be8
--- /dev/null
+++ b/src/bin/gpoptutils/Makefile
@@ -0,0 +1,12 @@
+MODULE_big = gpoptutils
+OBJS = gpoptutils.o
+
+ifdef USE_PGXS
+PGXS := $(shell pg_config --pgxs)
+include $(PGXS)
+else
+subdir = src/bin/gpoptutils
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/bee01373/src/bin/gpoptutils/gpoptutils.c
----------------------------------------------------------------------
diff --git a/src/bin/gpoptutils/gpoptutils.c b/src/bin/gpoptutils/gpoptutils.c
new file mode 100644
index 0000000..d1ae351
--- /dev/null
+++ b/src/bin/gpoptutils/gpoptutils.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2015 Pivotal Inc. All Rights Reserved
+ *
+ * ---------------------------------------------------------------------
+ *
+ * The dynamically linked library created from this source can be reference by
+ * creating a function in psql that references it. For example,
+ *
+ * CREATE FUNCTION gp_dump_query_oids(text)
+ * RETURNS text
+ * AS '$libdir/gpoptutils', 'gp_dump_query_oids'
+ * LANGUAGE C STRICT;
+ */
+
+#include "postgres.h"
+#include "funcapi.h"
+#include "utils/builtins.h"
+#include "gpopt/utils/nodeutils.h"
+#include "rewrite/rewriteHandler.h"
+#include "c.h"
+
+extern
+List *pg_parse_and_rewrite(const char *query_string, Oid *paramTypes, int iNumParams);
+
+extern
+List *QueryRewrite(Query *parsetree);
+
+static
+Query *parseSQL(char *szSqlText);
+
+static
+void traverseQueryRTEs(Query *pquery, HTAB *phtab, StringInfoData *buf);
+
+Datum gp_dump_query_oids(PG_FUNCTION_ARGS);
+
+#ifdef PG_MODULE_MAGIC
+PG_MODULE_MAGIC;
+#endif
+
+PG_FUNCTION_INFO_V1(gp_dump_query_oids);
+
+/*
+ * Parse a query given as SQL text.
+ */
+static Query *parseSQL(char *sqlText)
+{
+ Assert(sqlText);
+
+ List *queryTree = pg_parse_and_rewrite(sqlText, NULL, 0);
+
+ if (1 != list_length(queryTree))
+ {
+ elog(ERROR, "Cannot parse query. "
+ "Please make sure the input contains a single valid query. \n%s", sqlText);
+ }
+
+ Query *query = (Query *) lfirst(list_head(queryTree));
+
+ return query;
+}
+
+static void traverseQueryRTEs
+ (
+ Query *pquery,
+ HTAB *phtab,
+ StringInfoData *buf
+ )
+{
+ ListCell *plc;
+ bool found;
+ foreach (plc, pquery->rtable)
+ {
+ RangeTblEntry *rte = (RangeTblEntry *) lfirst(plc);
+
+ switch (rte->rtekind)
+ {
+ case RTE_RELATION:
+ {
+ hash_search(phtab, (void *)&rte->relid, HASH_ENTER, &found);
+ if (!found)
+ {
+ if (0 != buf->len)
+ appendStringInfo(buf, "%s", ", ");
+ appendStringInfo(buf, "%u", rte->relid);
+ }
+ }
+ break;
+ case RTE_SUBQUERY:
+ traverseQueryRTEs(rte->subquery, phtab, buf);
+ break;
+ default:
+ break;
+ }
+ }
+}
+
+/*
+ * Function dumping dependent relation oids for a given SQL text
+ */
+Datum
+gp_dump_query_oids(PG_FUNCTION_ARGS)
+{
+ char *szSqlText = text_to_cstring(PG_GETARG_TEXT_P(0));
+
+ Query *pquery = parseSQL(szSqlText);
+ if (CMD_UTILITY == pquery->commandType && T_ExplainStmt == pquery->utilityStmt->type)
+ {
+ Query *pqueryExplain = ((ExplainStmt *)pquery->utilityStmt)->query;
+ List *plQueryTree = QueryRewrite(pqueryExplain);
+ Assert(1 == list_length(plQueryTree));
+ pquery = (Query *) lfirst(list_head(plQueryTree));
+ }
+
+ typedef struct OidHashEntry
+ {
+ Oid key;
+ bool value;
+ } OidHashEntry;
+ HASHCTL ctl;
+ ctl.keysize = sizeof(Oid);
+ ctl.entrysize = sizeof(OidHashEntry);
+ ctl.hash = oid_hash;
+
+ StringInfoData buf;
+ initStringInfo(&buf);
+
+ HTAB *phtab = hash_create("relid hash table", 100, &ctl, HASH_ELEM | HASH_FUNCTION);
+ traverseQueryRTEs(pquery, phtab, &buf);
+ hash_destroy(phtab);
+
+ StringInfoData str;
+ initStringInfo(&str);
+ appendStringInfo(&str, "{\"relids\": [%s]}", buf.data);
+
+ text *result = cstring_to_text(str.data);
+ pfree(buf.data);
+ pfree(str.data);
+
+ PG_RETURN_TEXT_P(result);
+}
http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/bee01373/tools/bin/minirepro
----------------------------------------------------------------------
diff --git a/tools/bin/minirepro b/tools/bin/minirepro
new file mode 100755
index 0000000..7238cd3
--- /dev/null
+++ b/tools/bin/minirepro
@@ -0,0 +1,343 @@
+#!/usr/bin/env python
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#
+# Dump minimal schema objects and statistics for a given query
+#
+import os, sys, re, json, platform, subprocess
+from optparse import OptionParser
+from pygresql import pgdb
+from datetime import datetime
+
+prog_version = '%prog 1.0'
+PATH_PREFIX = '/tmp/'
+sysnslist = "('pg_toast', 'pg_bitmapindex', 'pg_catalog', 'information_schema', 'hawq_toolkit', 'pg_aoseg')"
+pgoptions = '-c gp_session_role=utility'
+
+class MRQuery(object):
+ def __init__(self):
+ self.relations = {}
+ self.funcs = {}
+ self.relids = []
+
+def E(query_str):
+ return pgdb.escape_string(query_str)
+
+def generate_timestamp():
+ timestamp = datetime.now()
+ return timestamp.strftime("%Y%m%d%H%M%S")
+
+def result_iter(cursor, arraysize=1000):
+ 'An iterator that uses fetchmany to keep memory usage down'
+ while True:
+ results = cursor.fetchmany(arraysize)
+ if not results:
+ break
+ for result in results:
+ yield result
+
+def get_version(cursor):
+ query = "select version()"
+ try:
+ cursor.execute(query)
+ vals = cursor.fetchone()
+ return vals[0]
+ except pgdb.DatabaseError as e:
+ sys.stderr.write('\nError while trying to find HAWQ/GPDB version.\n\n' + str(e) + '\n\n')
+ sys.exit(1)
+
+def parse_cmd_line():
+ p = OptionParser(usage='Usage: %prog <database> [options]', version=prog_version, conflict_handler="resolve")
+ p.add_option('-?', '--help', action='help', help='Show this help message and exit')
+ p.add_option('-h', '--host', action='store',
+ dest='host', help='Specify a remote host')
+ p.add_option('-p', '--port', action='store',
+ dest='port', help='Specify a port other than 5432')
+ p.add_option('-U', '--user', action='store', dest='user',
+ help='Connect as someone other than current user')
+ p.add_option('-q', action='store', dest='query_file',
+ help='file name that contains the query')
+ p.add_option('-f', action='store', dest='output_file',
+ help='minirepro output file name')
+ return p
+
+def dump_query(cursor, query_file):
+ print "Extracting metadata from query file %s ..." % query_file
+
+ with open(query_file, 'r') as query_f:
+ sql_text = query_f.read()
+ query = "select hawq_toolkit.hawq_dump_query_oids('%s')" % E(sql_text)
+
+ try:
+ cursor.execute(query)
+ vals = cursor.fetchone()
+ return vals[0]
+ except pgdb.DatabaseError as e:
+ sys.stderr.writelines('\nError while running hawq_toolkit.hawq_dump_query_oids(text).\nPlease make sure ' \
+ 'the function is installed and the query file contains single valid query.\n\n' + str(e) + '\n\n')
+ sys.exit(1)
+
+# relation oid will be extracted from the dump string
+def parse_oids(cursor, json_oids):
+ result = MRQuery()
+ oids = json.loads(json_oids)['relids']
+ result.relids = [str(x) for x in oids]
+
+ if len(result.relids) == 0:
+ result.relids.append('0')
+ oid_str = ','.join(result.relids)
+ cat_query = "SELECT nspname, relname FROM pg_class, pg_namespace WHERE pg_class.relnamespace = pg_namespace.oid " \
+ "AND pg_class.oid IN (%s)" % oid_str
+
+ cursor.execute(cat_query)
+
+ for vals in result_iter(cursor):
+ schm, relname = vals[0], vals[1]
+ if schm not in result.relations:
+ result.relations[schm] = [relname]
+ else:
+ result.relations[schm].append(relname)
+
+ return result
+
+def pg_dump_object(obj_dict, connectionInfo, envOpts):
+ for schema, table_list in obj_dict.iteritems():
+ out_file = PATH_PREFIX + schema + '.dp.sql'
+ dmp_cmd = 'pg_dump -h %s -p %s -U %s -sxO %s' % connectionInfo
+ dmp_cmd = "%s -t '%s.%s' -f %s" % (dmp_cmd, E(schema), E('|'.join(table_list)), E(out_file))
+ print dmp_cmd
+ p = subprocess.Popen(dmp_cmd, shell=True, stderr=subprocess.PIPE, env=envOpts)
+ if p.wait() is not 0:
+ sys.stderr.write('\nError while dumping schema.\n\n' + p.communicate()[1] + '\n\n')
+ sys.exit(1)
+
+def print_obj_ddl(filename, f_out):
+ if filename.endswith('.dp.sql'):
+ f_path = os.path.join(PATH_PREFIX, filename)
+ with open(f_path, 'r') as f_opened:
+ line_no = 1
+ for line in f_opened:
+ if line_no == 12 or line_no > 16:
+ f_out.writelines(line)
+ line_no += 1
+
+def dump_tuple_count(cur, oid_str, f_out):
+ stmt = "SELECT pgc.relname, pgn.nspname, pgc.relpages, pgc.reltuples FROM pg_class pgc, pg_namespace pgn " \
+ "WHERE pgc.relnamespace = pgn.oid and pgc.oid in (%s) and pgn.nspname NOT LIKE 'pg_temp_%%' " \
+ "and pgn.nspname NOT IN %s" % (oid_str, sysnslist)
+
+ templateStmt = '-- Table: {1}\n' \
+ 'UPDATE pg_class\nSET\n' \
+ '{0}\n' \
+ 'WHERE relname = \'{1}\' AND relnamespace = ' \
+ '(SELECT oid FROM pg_namespace WHERE nspname = \'{2}\');\n\n'
+
+ cur.execute(stmt)
+ columns = [x[0] for x in cur.description]
+ types = ['int', 'real']
+ for vals in result_iter(cur):
+ lines = []
+ for col, val, typ in zip(columns[2:], vals[2:], types):
+ # i.e. relpages = 1::int, reltuples = 1.0::real
+ lines.append('\t%s = %s::%s' % (col, val, typ))
+ updateStmt = templateStmt.format(E(',\n'.join(lines)), E(vals[0]), E(vals[1]))
+ f_out.writelines(updateStmt)
+
+def dump_stats(cur, oid_str, f_out):
+ query = 'SELECT pgc.relname, pgn.nspname, pga.attname, pgt.typname, pgs.* ' \
+ 'FROM pg_class pgc, pg_statistic pgs, pg_namespace pgn, pg_attribute pga, pg_type pgt ' \
+ 'WHERE pgc.relnamespace = pgn.oid and pgc.oid in (%s) and pgn.nspname NOT IN %s ' \
+ 'and pgn.nspname NOT LIKE \'pg_temp_%%\' ' \
+ 'and pgc.oid = pgs.starelid ' \
+ 'and pga.attrelid = pgc.oid ' \
+ 'and pga.attnum = pgs.staattnum ' \
+ 'and pga.atttypid = pgt.oid ' \
+ 'ORDER BY pgc.relname, pgs.staattnum' % (oid_str, sysnslist)
+
+ pstring = '--\n' \
+ '-- Table: {0}, Attribute: {1}\n' \
+ '--\n' \
+ 'INSERT INTO pg_statistic VALUES (\n' \
+ '{2});\n\n'
+ types = ['smallint', # staattnum
+ 'real',
+ 'integer',
+ 'real',
+ 'smallint',
+ 'smallint',
+ 'smallint',
+ 'smallint',
+ 'oid',
+ 'oid',
+ 'oid',
+ 'oid',
+ 'real[]',
+ 'real[]',
+ 'real[]',
+ 'real[]'
+ ]
+
+ cur.execute(query)
+
+ for vals in result_iter(cur):
+ rowVals = ["\t'%s.%s'::regclass" % (E(vals[1]), E(vals[0]))]
+
+ if vals[3][0] == '_':
+ rowTypes = types + [vals[3]] * 4
+ else:
+ rowTypes = types + [vals[3] + '[]'] * 4
+ for val, typ in zip(vals[5:], rowTypes):
+ if val is None:
+ val = 'NULL'
+ elif isinstance(val, (str, unicode)) and val[0] == '{':
+ val = "E'%s'" % val
+ rowVals.append('\t{0}::{1}'.format(val, typ))
+ f_out.writelines(pstring.format(E(vals[0]), E(vals[2]), ',\n'.join(rowVals)))
+
+def main():
+ parser = parse_cmd_line()
+ options, args = parser.parse_args()
+ if len(args) != 1:
+ parser.error("No database specified")
+ exit(1)
+
+ # OK - now let's setup all the arguments & options
+ envOpts = os.environ
+ db = args[0]
+ host = options.host or platform.node()
+ user = options.user or os.getlogin()
+ port = options.port or envOpts['PGPORT'] or '5432'
+ query_file = options.query_file
+ output_file = options.output_file
+
+ if query_file is None:
+ parser.error("No query file specified.")
+ exit(1)
+ if output_file is None:
+ parser.error("No output file specified.")
+ exit(1)
+ if not os.path.isfile(query_file):
+ parser.error('Query file %s does not exist.' % query_file)
+ exit(1)
+
+ timestamp = generate_timestamp()
+ global PATH_PREFIX
+ PATH_PREFIX = PATH_PREFIX + timestamp + '/'
+
+ # Create tmp dir if not already there
+ try:
+ os.stat(PATH_PREFIX)
+ except:
+ os.mkdir(PATH_PREFIX)
+
+ # setup the connection info tuple with options
+ connectionInfo = (host, port, user, db)
+ connectionString = ':'.join([host, port, db, user, '', pgoptions, ''])
+ print "Connecting to database: host=%s, port=%s, user=%s, db=%s ..." % connectionInfo
+ conn = pgdb.connect(connectionString)
+ cursor = conn.cursor()
+
+ # get server version, which is dumped to minirepro output file
+ version = get_version(cursor)
+
+ """
+ invoke hawq_toolkit UDF, dump relation oids as json text
+ input: query file name
+ output: json oids string
+ """
+ json_str = dump_query(cursor, query_file)
+
+ """
+ parse json oids string, collect all things that need to be dumped
+ input: json oids string
+ output: MRQuery class (self.relations, self.funcs, self.relids)
+ """
+ mr_query = parse_oids(cursor, json_str)
+
+ # dump tables and views
+ print "Invoking pg_dump to dump DDL ..."
+ pg_dump_object(mr_query.relations, connectionInfo, envOpts)
+
+ # dump functions
+ # TODO #108977046
+
+ ### start writing out to stdout ###
+ output_dir = os.path.dirname(output_file)
+ if not os.path.exists(output_dir):
+ os.makedirs(output_dir)
+ f_out = open(output_file, 'w')
+ ts = datetime.today()
+ f_out.writelines(['-- MiniRepro 1.0',
+ '\n-- Copyright 2016, The Apache Software Foundation'
+ '\n-- Database: ' + db,
+ '\n-- Date: ' + ts.date().isoformat(),
+ '\n-- Time: ' + ts.time().isoformat(),
+ '\n-- CmdLine: ' + ' '.join(sys.argv),
+ '\n-- Version: ' + version + '\n\n'])
+
+ # Now be sure that when we load the rest we are doing it in the right
+ # database
+ f_out.writelines('\\connect ' + db + '\n\n')
+
+ # first create schema DDLs
+ print "Writing schema DDLs ..."
+ table_schemas = ["CREATE SCHEMA %s;\n" % E(schema) for schema in mr_query.relations if schema != 'public']
+ f_out.writelines(table_schemas)
+
+ # write table & view DDLs
+ print "Writing table & view DDLs ..."
+ for f in os.listdir(PATH_PREFIX):
+ print_obj_ddl(f, f_out)
+
+ # Now we have to explicitly allow editing of these pg_class &
+ # pg_statistic tables
+ f_out.writelines(['\n-- ',
+ '\n-- Allow system table modifications',
+ '\n-- ',
+ '\nset allow_system_table_mods="DML";\n\n'])
+
+
+ # dump table stats
+ print "Writing table statistics ..."
+ oid_str = ','.join(mr_query.relids)
+ dump_tuple_count(cursor, oid_str, f_out)
+
+ # dump column stats
+ print "Writing column statistics ..."
+ dump_stats(cursor, oid_str, f_out)
+
+ cursor.close()
+ conn.close()
+
+ # attach query text
+ print "Attaching raw query text ..."
+ f_out.writelines(['\n-- ',
+ '\n-- Query text',
+ '\n-- \n\n'])
+
+ with open(query_file, 'r') as query_f:
+ for line in query_f:
+ f_out.writelines('-- ' + line)
+
+ f_out.writelines('\n-- MiniRepro completed.\n')
+ f_out.close()
+
+ print "--- MiniRepro completed! ---"
+
+if __name__ == "__main__":
+ main()