Posted to commits@hawq.apache.org by nh...@apache.org on 2016/01/11 23:07:34 UTC

incubator-hawq git commit: HAWQ-324. Add minirepro management utility

Repository: incubator-hawq
Updated Branches:
  refs/heads/master 127cac3e2 -> bee013732


HAWQ-324. Add minirepro management utility


Project: http://git-wip-us.apache.org/repos/asf/incubator-hawq/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hawq/commit/bee01373
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hawq/tree/bee01373
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hawq/diff/bee01373

Branch: refs/heads/master
Commit: bee0137323c275a634ff7104a4c5640a4e6114a1
Parents: 127cac3
Author: Haisheng Yuan <hy...@pivotal.io>
Authored: Thu Jan 7 13:33:02 2016 -0800
Committer: Haisheng Yuan <hy...@pivotal.io>
Committed: Mon Jan 11 14:05:03 2016 -0800

----------------------------------------------------------------------
 src/backend/catalog/Makefile          |   2 +
 src/backend/catalog/gp_toolkit.sql.in |  20 ++
 src/bin/Makefile                      |   2 +-
 src/bin/gpoptutils/Makefile           |  12 +
 src/bin/gpoptutils/gpoptutils.c       | 140 ++++++++++++
 tools/bin/minirepro                   | 343 +++++++++++++++++++++++++++++
 6 files changed, 518 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/bee01373/src/backend/catalog/Makefile
----------------------------------------------------------------------
diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile
index e066cff..54d26cb 100644
--- a/src/backend/catalog/Makefile
+++ b/src/backend/catalog/Makefile
@@ -51,6 +51,7 @@ JETPACK_PREFIX = $(JETPACK_SCHEMA).$(JETPACK_OBJECT_PREFIX)
 JETPACK_PREFIX_AUX = $(JETPACK_SCHEMA).__$(JETPACK_OBJECT_PREFIX)
 WORKFILE_PATHNAME = $$libdir\/gp_workfile_mgr
 SESSION_STATE_PATHNAME = $$libdir\/gp_session_state
+OPTUTILS_PATHNAME = $$libdir\/gpoptutils
 
 gp_toolkit.sql: gp_toolkit.sql.in
 	sed \
@@ -60,6 +61,7 @@ gp_toolkit.sql: gp_toolkit.sql.in
 	-e 's/%%JETPACK_PREFIX_AUX%%/$(JETPACK_PREFIX_AUX)/g' \
 	-e 's/%%WORKFILE_MODULE%%/$(WORKFILE_PATHNAME)/g' \
 	-e 's/%%SESSION_STATE_MODULE%%/$(SESSION_STATE_PATHNAME)/g' \
+	-e 's/%%OPTUTILS_MODULE%%/$(OPTUTILS_PATHNAME)/g' \
 	-e 's/	/    /g' \
 	$< > $@
 

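Note: Make collapses "$$" to "$" and sed strips the escaping backslash, so
%%OPTUTILS_MODULE%% expands to $libdir/gpoptutils in the generated
gp_toolkit.sql, following the same pattern as the workfile and
session-state modules above it. The registered function body in the
generated file therefore reads:

    AS '$libdir/gpoptutils', 'gp_dump_query_oids' LANGUAGE C STRICT;
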
http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/bee01373/src/backend/catalog/gp_toolkit.sql.in
----------------------------------------------------------------------
diff --git a/src/backend/catalog/gp_toolkit.sql.in b/src/backend/catalog/gp_toolkit.sql.in
index 86269e5..ccd3212 100644
--- a/src/backend/catalog/gp_toolkit.sql.in
+++ b/src/backend/catalog/gp_toolkit.sql.in
@@ -2057,6 +2057,26 @@ LANGUAGE SQL;
 REVOKE ALL ON FUNCTION %%JETPACK_PREFIX%%workfile_cache_clear() FROM public;
 
 --------------------------------------------------------------------------------
+-- @function:
+--		%%JETPACK_PREFIX%%dump_query_oids(text)
+--
+-- @in:
+--		text - SQL text
+-- @out:
+--		text - serialized json string of oids
+--
+-- @doc:
+--		Dump query oids for a given SQL text
+--
+--------------------------------------------------------------------------------
+
+CREATE FUNCTION %%JETPACK_PREFIX%%dump_query_oids(text)
+RETURNS text
+AS '%%OPTUTILS_MODULE%%', 'gp_dump_query_oids' LANGUAGE C STRICT;
+
+GRANT EXECUTE ON FUNCTION %%JETPACK_PREFIX%%dump_query_oids(text) TO public;
+
+--------------------------------------------------------------------------------
 
 -- Finalize install
 COMMIT;

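Once gp_toolkit.sql has been applied, the UDF is callable from any client.
A minimal sketch using PyGreSQL's pgdb driver, mirroring how the minirepro
script below invokes it; the connection string, database, table name, and
oid are illustrative (the JETPACK prefix expands to hawq_toolkit.hawq_ in
HAWQ, as the script assumes):

    from pygresql import pgdb

    # utility-mode connection, same DSN layout minirepro builds below
    conn = pgdb.connect('localhost:5432:mydb:gpadmin::-c gp_session_role=utility:')
    cur = conn.cursor()
    cur.execute("select hawq_toolkit.hawq_dump_query_oids('select * from t1')")
    print cur.fetchone()[0]    # e.g. {"relids": [16385]}
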
http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/bee01373/src/bin/Makefile
----------------------------------------------------------------------
diff --git a/src/bin/Makefile b/src/bin/Makefile
index 9d4919d..04d412d 100644
--- a/src/bin/Makefile
+++ b/src/bin/Makefile
@@ -16,7 +16,7 @@ include $(top_builddir)/src/Makefile.global
 DIRS = initdb ipcclean pg_ctl pg_dump pgbench \
 	psql scripts pg_config pg_controldata pg_resetxlog \
 	gpfilesystem/hdfs gpmirrortransition gpupgrade \
-	gpfusion gp_workfile_mgr gpcheckhdfs gpfdist
+	gpfusion gp_workfile_mgr gpcheckhdfs gpfdist gpoptutils
 
 all install installdirs uninstall distprep clean distclean maintainer-clean:
 	@for dir in $(DIRS); do echo "build $$dir"; $(MAKE) -C $$dir $@ || exit; done

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/bee01373/src/bin/gpoptutils/Makefile
----------------------------------------------------------------------
diff --git a/src/bin/gpoptutils/Makefile b/src/bin/gpoptutils/Makefile
new file mode 100644
index 0000000..3cc8be8
--- /dev/null
+++ b/src/bin/gpoptutils/Makefile
@@ -0,0 +1,12 @@
+MODULE_big = gpoptutils
+OBJS       = gpoptutils.o
+
+ifdef USE_PGXS
+PGXS := $(shell pg_config --pgxs)
+include $(PGXS)
+else
+subdir = src/bin/gpoptutils
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif

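Like the other contrib-style modules, gpoptutils can also be built out of
tree against an installed server via PGXS, e.g. (assuming pg_config is on
the PATH):

    make USE_PGXS=1 install
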
http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/bee01373/src/bin/gpoptutils/gpoptutils.c
----------------------------------------------------------------------
diff --git a/src/bin/gpoptutils/gpoptutils.c b/src/bin/gpoptutils/gpoptutils.c
new file mode 100644
index 0000000..d1ae351
--- /dev/null
+++ b/src/bin/gpoptutils/gpoptutils.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2015 Pivotal Inc. All Rights Reserved
+ *
+ * ---------------------------------------------------------------------
+ *
+ * The dynamically linked library created from this source can be referenced by
+ * creating a function in psql that references it. For example,
+ *
+ * CREATE FUNCTION gp_dump_query_oids(text)
+ *	RETURNS text
+ *	AS '$libdir/gpoptutils', 'gp_dump_query_oids'
+ *	LANGUAGE C STRICT;
+ */
+
+#include "postgres.h"
+#include "funcapi.h"
+#include "utils/builtins.h"
+#include "gpopt/utils/nodeutils.h"
+#include "rewrite/rewriteHandler.h"
+#include "c.h"
+
+extern
+List *pg_parse_and_rewrite(const char *query_string, Oid *paramTypes, int iNumParams);
+
+extern
+List *QueryRewrite(Query *parsetree);
+
+static
+Query *parseSQL(char *szSqlText);
+
+static
+void traverseQueryRTEs(Query *pquery, HTAB *phtab, StringInfoData *buf);
+
+Datum gp_dump_query_oids(PG_FUNCTION_ARGS);
+
+#ifdef PG_MODULE_MAGIC
+PG_MODULE_MAGIC;
+#endif
+
+PG_FUNCTION_INFO_V1(gp_dump_query_oids);
+
+/*
+ * Parse a query given as SQL text.
+ */
+static Query *parseSQL(char *sqlText)
+{
+	Assert(sqlText);
+
+	List *queryTree = pg_parse_and_rewrite(sqlText, NULL, 0);
+
+	if (1 != list_length(queryTree))
+	{
+		elog(ERROR, "Cannot parse query. "
+				"Please make sure the input contains a single valid query. \n%s", sqlText);
+	}
+
+	Query *query = (Query *) lfirst(list_head(queryTree));
+
+	return query;
+}
+
+static void traverseQueryRTEs
+	(
+	Query *pquery,
+	HTAB *phtab,
+	StringInfoData *buf
+	)
+{
+	ListCell *plc;
+	bool found;
+	foreach (plc, pquery->rtable)
+	{
+		RangeTblEntry *rte = (RangeTblEntry *) lfirst(plc);
+
+		switch (rte->rtekind)
+		{
+			case RTE_RELATION:
+			{
+				hash_search(phtab, (void *)&rte->relid, HASH_ENTER, &found);
+				if (!found)
+				{
+					if (0 != buf->len)
+						appendStringInfo(buf, "%s", ", ");
+					appendStringInfo(buf, "%u", rte->relid);
+				}
+			}
+				break;
+			case RTE_SUBQUERY:
+				traverseQueryRTEs(rte->subquery, phtab, buf);
+				break;
+			default:
+				break;
+		}
+	}
+}
+
+/*
+ * Function dumping dependent relation oids for a given SQL text
+ */
+Datum
+gp_dump_query_oids(PG_FUNCTION_ARGS)
+{
+	char *szSqlText = text_to_cstring(PG_GETARG_TEXT_P(0));
+
+	Query *pquery = parseSQL(szSqlText);
+	if (CMD_UTILITY == pquery->commandType && T_ExplainStmt == pquery->utilityStmt->type)
+	{
+		Query *pqueryExplain = ((ExplainStmt *)pquery->utilityStmt)->query;
+		List *plQueryTree = QueryRewrite(pqueryExplain);
+		Assert(1 == list_length(plQueryTree));
+		pquery = (Query *) lfirst(list_head(plQueryTree));
+	}
+
+	typedef struct OidHashEntry
+	{
+		Oid key;
+		bool value;
+	} OidHashEntry;
+	HASHCTL ctl;
+	ctl.keysize = sizeof(Oid);
+	ctl.entrysize = sizeof(OidHashEntry);
+	ctl.hash = oid_hash;
+
+	StringInfoData buf;
+	initStringInfo(&buf);
+
+	HTAB *phtab = hash_create("relid hash table", 100, &ctl, HASH_ELEM | HASH_FUNCTION);
+	traverseQueryRTEs(pquery, phtab, &buf);
+	hash_destroy(phtab);
+
+	StringInfoData str;
+	initStringInfo(&str);
+	appendStringInfo(&str, "{\"relids\": [%s]}", buf.data);
+
+	text *result = cstring_to_text(str.data);
+	pfree(buf.data);
+	pfree(str.data);
+
+	PG_RETURN_TEXT_P(result);
+}

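For readers following the traversal above: gp_dump_query_oids unwraps an
EXPLAIN down to its underlying query, walks the range table recursively,
and deduplicates relids via the hash table before serializing. A rough
Python analogue of the collection step, illustrative only (the dict-based
RTE shape is hypothetical, not a real API):

    def collect_relids(rtable, seen=None, ordered=None):
        # walk range table entries, recursing into subqueries and
        # recording each relation oid once, in first-seen order
        seen = set() if seen is None else seen
        ordered = [] if ordered is None else ordered
        for rte in rtable:
            if rte['kind'] == 'relation' and rte['relid'] not in seen:
                seen.add(rte['relid'])
                ordered.append(rte['relid'])
            elif rte['kind'] == 'subquery':
                collect_relids(rte['rtable'], seen, ordered)
        return '{"relids": [%s]}' % ', '.join(str(r) for r in ordered)
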
http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/bee01373/tools/bin/minirepro
----------------------------------------------------------------------
diff --git a/tools/bin/minirepro b/tools/bin/minirepro
new file mode 100755
index 0000000..7238cd3
--- /dev/null
+++ b/tools/bin/minirepro
@@ -0,0 +1,343 @@
+#!/usr/bin/env python
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#
+# Dump minimal schema objects and statistics for a given query
+#
+import os, sys, json, platform, subprocess
+from optparse import OptionParser
+from pygresql import pgdb
+from datetime import datetime
+
+prog_version = '%prog 1.0'
+PATH_PREFIX = '/tmp/'
+sysnslist = "('pg_toast', 'pg_bitmapindex', 'pg_catalog', 'information_schema', 'hawq_toolkit', 'pg_aoseg')"
+pgoptions = '-c gp_session_role=utility'
+
+class MRQuery(object):
+    def __init__(self):
+        self.relations = {}
+        self.funcs = {}
+        self.relids = []
+
+def E(query_str):
+    return pgdb.escape_string(query_str)
+
+def generate_timestamp():
+    timestamp = datetime.now()
+    return timestamp.strftime("%Y%m%d%H%M%S")
+
+def result_iter(cursor, arraysize=1000):
+    'An iterator that uses fetchmany to keep memory usage down'
+    while True:
+        results = cursor.fetchmany(arraysize)
+        if not results:
+            break
+        for result in results:
+            yield result
+
+def get_version(cursor):
+    query = "select version()"
+    try:
+        cursor.execute(query)
+        vals = cursor.fetchone()
+        return vals[0]
+    except pgdb.DatabaseError as e:
+        sys.stderr.write('\nError while trying to find HAWQ/GPDB version.\n\n' + str(e) + '\n\n')
+        sys.exit(1)
+
+def parse_cmd_line():
+    p = OptionParser(usage='Usage: %prog <database> [options]', version=prog_version, conflict_handler="resolve")
+    p.add_option('-?', '--help', action='help', help='Show this help message and exit')
+    p.add_option('-h', '--host', action='store',
+                 dest='host', help='Specify a remote host')
+    p.add_option('-p', '--port', action='store',
+                 dest='port', help='Specify a port other than 5432')
+    p.add_option('-U', '--user', action='store', dest='user',
+                 help='Connect as someone other than current user')
+    p.add_option('-q', action='store', dest='query_file',
+                 help='file name that contains the query')
+    p.add_option('-f', action='store', dest='output_file',
+                 help='minirepro output file name')
+    return p
+
+def dump_query(cursor, query_file):
+    print "Extracting metadata from query file %s ..." % query_file
+
+    with open(query_file, 'r') as query_f:
+        sql_text = query_f.read()
+    query = "select hawq_toolkit.hawq_dump_query_oids('%s')" % E(sql_text)
+
+    try:
+        cursor.execute(query)
+        vals = cursor.fetchone()
+        return vals[0]
+    except pgdb.DatabaseError as e:
+        sys.stderr.writelines('\nError while running hawq_toolkit.hawq_dump_query_oids(text).\nPlease make sure ' \
+            'the function is installed and the query file contains a single valid query.\n\n' + str(e) + '\n\n')
+        sys.exit(1)
+
+# relation oid will be extracted from the dump string
+def parse_oids(cursor, json_oids):
+    result = MRQuery()
+    oids = json.loads(json_oids)['relids']
+    result.relids = [str(x) for x in oids]
+
+    if len(result.relids) == 0:
+        result.relids.append('0')
+    oid_str = ','.join(result.relids)
+    cat_query = "SELECT nspname, relname FROM pg_class, pg_namespace WHERE pg_class.relnamespace = pg_namespace.oid " \
+                "AND pg_class.oid IN (%s)" % oid_str
+
+    cursor.execute(cat_query)
+
+    for vals in result_iter(cursor):
+        schm, relname = vals[0], vals[1]
+        if schm not in result.relations:
+            result.relations[schm] = [relname]
+        else:
+            result.relations[schm].append(relname)
+
+    return result
+
+def pg_dump_object(obj_dict, connectionInfo, envOpts):
+    for schema, table_list in obj_dict.iteritems():
+        out_file = PATH_PREFIX + schema + '.dp.sql'
+        dmp_cmd = 'pg_dump -h %s -p %s -U %s -sxO %s' % connectionInfo
+        dmp_cmd = "%s -t '%s.%s' -f %s" % (dmp_cmd, E(schema), E('|'.join(table_list)), E(out_file))
+        print dmp_cmd
+        p = subprocess.Popen(dmp_cmd, shell=True, stderr=subprocess.PIPE, env=envOpts)
+        err = p.communicate()[1]
+        if p.returncode != 0:
+            sys.stderr.write('\nError while dumping schema.\n\n' + err + '\n\n')
+            sys.exit(1)
+
+def print_obj_ddl(filename, f_out):
+    if filename.endswith('.dp.sql'):
+        f_path = os.path.join(PATH_PREFIX, filename)
+        with open(f_path, 'r') as f_opened:
+            line_no = 1
+            for line in f_opened:
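+                # positional filter: drop pg_dump's fixed preamble, keeping
+                # line 12 and the DDL that begins after line 16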
+                if line_no == 12 or line_no > 16:
+                    f_out.writelines(line)
+                line_no += 1
+
+def dump_tuple_count(cur, oid_str, f_out):
+    stmt = "SELECT pgc.relname, pgn.nspname, pgc.relpages, pgc.reltuples FROM pg_class pgc, pg_namespace pgn " \
+            "WHERE pgc.relnamespace = pgn.oid and pgc.oid in (%s) and pgn.nspname NOT LIKE 'pg_temp_%%' " \
+            "and pgn.nspname NOT IN %s" % (oid_str, sysnslist)
+
+    templateStmt = '-- Table: {1}\n' \
+        'UPDATE pg_class\nSET\n' \
+        '{0}\n' \
+        'WHERE relname = \'{1}\' AND relnamespace = ' \
+        '(SELECT oid FROM pg_namespace WHERE nspname = \'{2}\');\n\n'
+
+    cur.execute(stmt)
+    columns = [x[0] for x in cur.description]
+    types = ['int', 'real']
+    for vals in result_iter(cur):
+        lines = []
+        for col, val, typ in zip(columns[2:], vals[2:], types):
+            # i.e. relpages = 1::int, reltuples = 1.0::real
+            lines.append('\t%s = %s::%s' % (col, val, typ))
+        updateStmt = templateStmt.format(E(',\n'.join(lines)), E(vals[0]), E(vals[1]))
+        f_out.writelines(updateStmt)
+
+def dump_stats(cur, oid_str, f_out):
+    query = 'SELECT pgc.relname, pgn.nspname, pga.attname, pgt.typname, pgs.* ' \
+        'FROM pg_class pgc, pg_statistic pgs, pg_namespace pgn, pg_attribute pga, pg_type pgt ' \
+        'WHERE pgc.relnamespace = pgn.oid and pgc.oid in (%s) and pgn.nspname NOT IN %s ' \
+        'and pgn.nspname NOT LIKE \'pg_temp_%%\' ' \
+        'and pgc.oid = pgs.starelid ' \
+        'and pga.attrelid = pgc.oid ' \
+        'and pga.attnum = pgs.staattnum ' \
+        'and pga.atttypid = pgt.oid ' \
+        'ORDER BY pgc.relname, pgs.staattnum' % (oid_str, sysnslist)
+
+    pstring = '--\n' \
+        '-- Table: {0}, Attribute: {1}\n' \
+        '--\n' \
+        'INSERT INTO pg_statistic VALUES (\n' \
+        '{2});\n\n'
+    types = ['smallint',  # staattnum
+             'real',
+             'integer',
+             'real',
+             'smallint',
+             'smallint',
+             'smallint',
+             'smallint',
+             'oid',
+             'oid',
+             'oid',
+             'oid',
+             'real[]',
+             'real[]',
+             'real[]',
+             'real[]'
+             ]
+
+    cur.execute(query)
+
+    for vals in result_iter(cur):
+        rowVals = ["\t'%s.%s'::regclass" % (E(vals[1]), E(vals[0]))]
+
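+        # pg_type names array types with a leading underscore; cast the four
+        # trailing stavalues columns to the attribute's array type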
+        if vals[3][0] == '_':
+            rowTypes = types + [vals[3]] * 4
+        else:
+            rowTypes = types + [vals[3] + '[]'] * 4
+        for val, typ in zip(vals[5:], rowTypes):
+            if val is None:
+                val = 'NULL'
+            elif isinstance(val, (str, unicode)) and val[0] == '{':
+                val = "E'%s'" % val
+            rowVals.append('\t{0}::{1}'.format(val, typ))
+        f_out.writelines(pstring.format(E(vals[0]), E(vals[2]), ',\n'.join(rowVals)))
+
+def main():
+    parser = parse_cmd_line()
+    options, args = parser.parse_args()
+    if len(args) != 1:
+        parser.error("No database specified")
+        exit(1)
+
+    # OK - now let's setup all the arguments & options
+    envOpts = os.environ
+    db = args[0]
+    host = options.host or platform.node()
+    user = options.user or os.getlogin()
+    port = options.port or envOpts.get('PGPORT') or '5432'
+    query_file = options.query_file
+    output_file = options.output_file
+
+    if query_file is None:
+        parser.error("No query file specified.")
+        exit(1)
+    if output_file is None:
+        parser.error("No output file specified.")
+        exit(1)
+    if not os.path.isfile(query_file):
+        parser.error('Query file %s does not exist.' % query_file)
+        exit(1)
+
+    timestamp = generate_timestamp()
+    global PATH_PREFIX
+    PATH_PREFIX = PATH_PREFIX + timestamp + '/'
+
+    # Create tmp dir if not already there
+    try:
+        os.stat(PATH_PREFIX)
+    except OSError:
+        os.mkdir(PATH_PREFIX)
+
+    # setup the connection info tuple with options
+    connectionInfo = (host, port, user, db)
+    connectionString = ':'.join([host, port, db, user, '', pgoptions, ''])
+    print "Connecting to database: host=%s, port=%s, user=%s, db=%s ..." % connectionInfo
+    conn = pgdb.connect(connectionString)
+    cursor = conn.cursor()
+
+    # get server version, which is dumped to minirepro output file
+    version = get_version(cursor)
+
+    """
+    invoke hawq_toolkit UDF, dump relation oids as json text
+    input: query file name
+    output: json oids string
+    """
+    json_str = dump_query(cursor, query_file)
+
+    """
+    parse json oids string, collect all things that need to be dumped
+    input: json oids string
+    output: MRQuery class (self.relations, self.funcs, self.relids)
+    """
+    mr_query = parse_oids(cursor, json_str)
+
+    # dump tables and views
+    print "Invoking pg_dump to dump DDL ..."
+    pg_dump_object(mr_query.relations, connectionInfo, envOpts)
+
+    # dump functions
+    # TODO #108977046
+
+    ### start writing out to the output file ###
+    output_dir = os.path.dirname(output_file)
+    if output_dir and not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+    f_out = open(output_file, 'w')
+    ts = datetime.today()
+    f_out.writelines(['-- MiniRepro 1.0',
+                           '\n-- Copyright 2016, The Apache Software Foundation',
+                           '\n-- Database: ' + db,
+                           '\n-- Date:     ' + ts.date().isoformat(),
+                           '\n-- Time:     ' + ts.time().isoformat(),
+                           '\n-- CmdLine:  ' + ' '.join(sys.argv),
+                           '\n-- Version:  ' + version + '\n\n'])
+
+    # Now be sure that when we load the rest we are doing it in the right
+    # database
+    f_out.writelines('\\connect ' + db + '\n\n')
+
+    # first create schema DDLs
+    print "Writing schema DDLs ..."
+    table_schemas = ["CREATE SCHEMA %s;\n" % E(schema) for schema in mr_query.relations if schema != 'public']
+    f_out.writelines(table_schemas)
+
+    # write table & view DDLs
+    print "Writing table & view DDLs ..."
+    for f in os.listdir(PATH_PREFIX):
+        print_obj_ddl(f, f_out)
+
+    # Now we have to explicitly allow editing of these pg_class &
+    # pg_statistic tables
+    f_out.writelines(['\n-- ',
+                           '\n-- Allow system table modifications',
+                           '\n-- ',
+                           '\nset allow_system_table_mods="DML";\n\n'])
+
+
+    # dump table stats
+    print "Writing table statistics ..."
+    oid_str = ','.join(mr_query.relids)
+    dump_tuple_count(cursor, oid_str, f_out)
+
+    # dump column stats
+    print "Writing column statistics ..."
+    dump_stats(cursor, oid_str, f_out)
+
+    cursor.close()
+    conn.close()
+
+    # attach query text
+    print "Attaching raw query text ..."
+    f_out.writelines(['\n-- ',
+                       '\n-- Query text',
+                       '\n-- \n\n'])
+
+    with open(query_file, 'r') as query_f:
+        for line in query_f:
+            f_out.writelines('-- ' + line)
+
+    f_out.writelines('\n-- MiniRepro completed.\n')
+    f_out.close()
+
+    print "--- MiniRepro completed! ---"
+
+if __name__ == "__main__":
+    main()
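
A typical invocation, with illustrative host, database, and file names:

    minirepro mydb -h mdw -U gpadmin -q query.sql -f /tmp/minirepro_out.sql

The resulting file contains, in order: the version header and \connect, the
CREATE SCHEMA statements, the pg_dump DDL, the pg_class UPDATEs and
pg_statistic INSERTs (after enabling allow_system_table_mods), and the raw
query text as trailing comments.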