You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hawq.apache.org by iw...@apache.org on 2017/02/09 06:36:00 UTC

incubator-hawq git commit: HAWQ-1317. Port "Fix some regex issues with out-of-range characters and large char ranges" from pg

Repository: incubator-hawq
Updated Branches:
  refs/heads/master 35ed3ad38 -> 23c45c746


HAWQ-1317. Port "Fix some regex issues with out-of-range characters and large char ranges" from pg


Project: http://git-wip-us.apache.org/repos/asf/incubator-hawq/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hawq/commit/23c45c74
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hawq/tree/23c45c74
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hawq/diff/23c45c74

Branch: refs/heads/master
Commit: 23c45c746e6223e039cff29ca61d8ec5a2bb63bd
Parents: 35ed3ad
Author: amyrazz44 <ab...@pivotal.io>
Authored: Wed Feb 8 10:36:49 2017 +0800
Committer: ivan <iw...@pivotal.io>
Committed: Thu Feb 9 14:34:14 2017 +0800

----------------------------------------------------------------------
 src/backend/regex/regc_lex.c               |   8 +-
 src/backend/regex/regc_locale.c            |  55 ++++++---
 src/backend/regex/regcomp.c                |   3 +
 src/include/regex/regcustom.h              |   3 +-
 src/include/regex/regex.h                  |   2 +-
 src/test/feature/full_tests.txt            |   2 +-
 src/test/feature/regex/ans/regex_basic.ans | 144 ++++++++++++++++++++++++
 src/test/feature/regex/sql/regex_basic.sql |  41 +++++++
 src/test/feature/regex/test_regex.cpp      |  27 +++++
 9 files changed, 262 insertions(+), 23 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/23c45c74/src/backend/regex/regc_lex.c
----------------------------------------------------------------------
diff --git a/src/backend/regex/regc_lex.c b/src/backend/regex/regc_lex.c
index 782c008..4f6b8ea 100644
--- a/src/backend/regex/regc_lex.c
+++ b/src/backend/regex/regc_lex.c
@@ -792,13 +792,13 @@ lexescape(struct vars * v)
 			break;
 		case CHR('u'):
 			c = lexdigits(v, 16, 4, 4);
-			if (ISERR())
+			if (ISERR() || c < CHR_MIN || c > CHR_MAX)
 				FAILW(REG_EESCAPE);
 			RETV(PLAIN, c);
 			break;
 		case CHR('U'):
 			c = lexdigits(v, 16, 8, 8);
-			if (ISERR())
+			if (ISERR() || c < CHR_MIN || c > CHR_MAX) 
 				FAILW(REG_EESCAPE);
 			RETV(PLAIN, c);
 			break;
@@ -816,7 +816,7 @@ lexescape(struct vars * v)
 		case CHR('x'):
 			NOTE(REG_UUNPORT);
 			c = lexdigits(v, 16, 1, 255);		/* REs >255 long outside spec */
-			if (ISERR())
+			if (ISERR() || c < CHR_MIN || c > CHR_MAX)
 				FAILW(REG_EESCAPE);
 			RETV(PLAIN, c);
 			break;
@@ -872,6 +872,8 @@ lexescape(struct vars * v)
 
 /*
  * lexdigits - slurp up digits and return chr value
+ * This does not account for overflow; callers should range-check the result
+ * if maxlen is large enough to make that possible.
  */
 static chr						/* chr value; errors signalled via ERR */
 lexdigits(struct vars * v,

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/23c45c74/src/backend/regex/regc_locale.c
----------------------------------------------------------------------
diff --git a/src/backend/regex/regc_locale.c b/src/backend/regex/regc_locale.c
index 339380e..6ca59b2 100644
--- a/src/backend/regex/regc_locale.c
+++ b/src/backend/regex/regc_locale.c
@@ -471,8 +471,7 @@ range(struct vars * v,			/* context */
 	int			nchrs;
 	struct cvec *cv;
 	celt		c,
-				lc,
-				uc;
+				cc;
 
 	if (a != b && !before(a, b))
 	{
@@ -489,25 +488,47 @@ range(struct vars * v,			/* context */
 	}
 
 	/*
-	 * When case-independent, it's hard to decide when cvec ranges are usable,
-	 * so for now at least, we won't try.  We allocate enough space for two
-	 * case variants plus a little extra for the two title case variants.
-	 */
-
-	nchrs = (b - a + 1) * 2 + 4;
-
-	cv = getcvec(v, nchrs, 0);
+	* When case-independent, it's hard to decide when cvec ranges are usable,
+	* so for now at least, we won't try.  We use a range for the originally
+	* specified chrs and then add on any case-equivalents that are outside
+	* that range as individual chrs.
+	*
+	* To ensure sane behavior if someone specifies a very large range, limit
+	* the allocation size to 100000 chrs (arbitrary) and check for overrun
+	* inside the loop below.
+	*/
+
+	nchrs = b - a + 1;
+    
+	if (nchrs <= 0 || nchrs > 100000)
+		nchrs = 100000;
+
+	cv = getcvec(v, nchrs, 1);
 	NOERRN();
+	addrange(cv, a, b);
 
 	for (c = a; c <= b; c++)
 	{
-		addchr(cv, c);
-		lc = pg_wc_tolower((chr) c);
-		if (c != lc)
-			addchr(cv, lc);
-		uc = pg_wc_toupper((chr) c);
-		if (c != uc)
-			addchr(cv, uc);
+		cc = pg_wc_tolower((chr) c);
+		if (cc !=c && (before(cc, a) || before(b, cc)))
+		{
+			if (cv->nchrs >= cv->chrspace)
+			{
+				ERR(REG_ETOOBIG);
+				return NULL;
+			}
+			addchr(cv, cc);
+		}
+		cc = pg_wc_toupper((chr) c);
+		if (cc != c && (before(cc, a) || before(b, cc)))
+		{
+			if (cv->nchrs >= cv->chrspace)
+			{
+				ERR(REG_ETOOBIG);
+				return NULL;
+			}
+			addchr(cv, cc);
+		}
 	}
 
 	return cv;

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/23c45c74/src/backend/regex/regcomp.c
----------------------------------------------------------------------
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index 0c08237..35c4d99 100644
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -1509,6 +1509,7 @@ dovec(struct vars * v,
 	{
 		ch = *p;
 		newarc(v->nfa, PLAIN, subcolor(v->cm, ch), lp, rp);
+		NOERR();
 	}
 
 	/* and the ranges */
@@ -1518,6 +1519,7 @@ dovec(struct vars * v,
 		to = *(p + 1);
 		if (from <= to)
 			subrange(v, from, to, lp, rp);
+		NOERR();
 	}
 }
 
@@ -1844,6 +1846,7 @@ rfree(regex_t *re)
 	FREE(g);
 }
 
+
 #ifdef REG_DEBUG
 
 /*

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/23c45c74/src/include/regex/regcustom.h
----------------------------------------------------------------------
diff --git a/src/include/regex/regcustom.h b/src/include/regex/regcustom.h
index 269f926..cd43eca 100644
--- a/src/include/regex/regcustom.h
+++ b/src/include/regex/regcustom.h
@@ -54,7 +54,8 @@ typedef int celt;				/* type to hold chr, or NOCELT */
 #define DIGITVAL(c) ((c)-'0')	/* turn chr digit into its value */
 #define CHRBITS 32				/* bits in a chr; must not use sizeof */
 #define CHR_MIN 0x00000000		/* smallest and largest chr; the value */
-#define CHR_MAX 0xfffffffe		/* CHR_MAX-CHR_MIN+1 should fit in uchr */
+#define CHR_MAX 0x7ffffffe		/* CHR_MAX-CHR_MIN+1 must fit in an int, and
+                                 * CHR_MAX+1 must fit in both chr and celt */
 
 /* functions operating on chr */
 #define iscalnum(x) pg_wc_isalnum(x)

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/23c45c74/src/include/regex/regex.h
----------------------------------------------------------------------
diff --git a/src/include/regex/regex.h b/src/include/regex/regex.h
index abd90bc..154438a 100644
--- a/src/include/regex/regex.h
+++ b/src/include/regex/regex.h
@@ -151,7 +151,7 @@ typedef struct
 #define REG_INVARG	16			/* invalid argument to regex function */
 #define REG_MIXED	17			/* character widths of regex and string differ */
 #define REG_BADOPT	18			/* invalid embedded option */
-#define REG_ETOOBIG 19			/* nfa has too many states */
+#define REG_ETOOBIG 19			/* regular expression is too complex */
 /* two specials for debugging and testing */
 #define REG_ATOI	101			/* convert error-code name to number */
 #define REG_ITOA	102			/* convert error-code number to name */

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/23c45c74/src/test/feature/full_tests.txt
----------------------------------------------------------------------
diff --git a/src/test/feature/full_tests.txt b/src/test/feature/full_tests.txt
index 254f866..beda4e8 100644
--- a/src/test/feature/full_tests.txt
+++ b/src/test/feature/full_tests.txt
@@ -2,5 +2,5 @@
 #SERIAL=* are the serial tests to run, optional but should not be empty
 #you can have several PARALLEL or SRRIAL
 
-PARALLEL=TestErrorTable.*:TestPreparedStatement.*:TestUDF.*:TestAOSnappy.*:TestAlterOwner.*:TestAlterTable.*:TestCreateTable.*:TestGuc.*:TestType.*:TestDatabase.*:TestParquet.*:TestPartition.*:TestSubplan.*:TestAggregate.*:TestCreateTypeComposite.*:TestGpDistRandom.*:TestInformationSchema.*:TestQueryInsert.*:TestQueryNestedCaseNull.*:TestQueryPolymorphism.*:TestQueryPortal.*:TestQueryPrepare.*:TestQuerySequence.*:TestCommonLib.*:TestToast.*:TestTransaction.*:TestCommand.*:TestCopy.*:TestParser.*:TestHawqRegister.*
+PARALLEL=TestErrorTable.*:TestPreparedStatement.*:TestUDF.*:TestAOSnappy.*:TestAlterOwner.*:TestAlterTable.*:TestCreateTable.*:TestGuc.*:TestType.*:TestDatabase.*:TestParquet.*:TestPartition.*:TestSubplan.*:TestAggregate.*:TestCreateTypeComposite.*:TestGpDistRandom.*:TestInformationSchema.*:TestQueryInsert.*:TestQueryNestedCaseNull.*:TestQueryPolymorphism.*:TestQueryPortal.*:TestQueryPrepare.*:TestQuerySequence.*:TestCommonLib.*:TestToast.*:TestTransaction.*:TestCommand.*:TestCopy.*:TestParser.*:TestHawqRegister.*:TestRegex.*
 SERIAL=TestExternalOid.TestExternalOidAll:TestExternalTable.TestExternalTableAll:TestTemp.BasicTest:TestRowTypes.*

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/23c45c74/src/test/feature/regex/ans/regex_basic.ans
----------------------------------------------------------------------
diff --git a/src/test/feature/regex/ans/regex_basic.ans b/src/test/feature/regex/ans/regex_basic.ans
new file mode 100644
index 0000000..52a9077
--- /dev/null
+++ b/src/test/feature/regex/ans/regex_basic.ans
@@ -0,0 +1,144 @@
+-- start_ignore
+SET SEARCH_PATH=TestRegex_TestRegexBasic;
+SET
+-- end_ignore
+--
+-- Regular expression tests
+--
+-- Don't want to have to double backslashes in regexes
+set standard_conforming_strings = on;
+SET
+-- Test simple quantified backrefs
+select 'bbbbb' ~ '^([bc])\1*$' as t;
+ t 
+---
+ t
+(1 row)
+
+select 'ccc' ~ '^([bc])\1*$' as t;
+ t 
+---
+ t
+(1 row)
+
+select 'xxx' ~ '^([bc])\1*$' as f;
+ f 
+---
+ f
+(1 row)
+
+select 'b' ~ '^([bc])\1*$' as t;
+ t 
+---
+ t
+(1 row)
+
+-- Test lookahead constraints
+select regexp_matches('ab', 'a(?=b)b*');
+ regexp_matches 
+----------------
+ {ab}
+(1 row)
+
+select regexp_matches('a', 'a(?=b)b*');
+ regexp_matches 
+----------------
+(0 rows)
+
+select regexp_matches('abc', 'a(?=b)b*(?=c)c*');
+ regexp_matches 
+----------------
+ {abc}
+(1 row)
+
+select regexp_matches('ab', 'a(?=b)b*(?=c)c*');
+ regexp_matches 
+----------------
+(0 rows)
+
+select regexp_matches('ab', 'a(?!b)b*');
+ regexp_matches 
+----------------
+(0 rows)
+
+select regexp_matches('a', 'a(?!b)b*');
+ regexp_matches 
+----------------
+ {a}
+(1 row)
+
+select regexp_matches('b', '(?=b)b');
+ regexp_matches 
+----------------
+ {b}
+(1 row)
+
+select regexp_matches('a', '(?=b)b');
+ regexp_matches 
+----------------
+(0 rows)
+
+-- Test optimization of single-chr-or-bracket-expression lookaround constraints
+select 'xz' ~ 'x(?=[xy])';
+ ?column? 
+----------
+ f
+(1 row)
+
+select 'xy' ~ 'x(?=[xy])';
+ ?column? 
+----------
+ t
+(1 row)
+
+select 'xz' ~ 'x(?![xy])';
+ ?column? 
+----------
+ t
+(1 row)
+
+select 'xy' ~ 'x(?![xy])';
+ ?column? 
+----------
+ f
+(1 row)
+
+select 'x'  ~ 'x(?![xy])';
+ ?column? 
+----------
+ t
+(1 row)
+
+select 'zyy' ~ '(?<![xy])yy+';
+psql:/tmp/TestRegex_TestRegexBasic.sql:33: ERROR:  invalid regular expression: quantifier operand invalid
+-- Test for infinite loop in cfindloop with zero-length possible match
+-- but no actual match (can only happen in the presence of backrefs)
+select 'a' ~ '$()|^\1';
+ ?column? 
+----------
+ f
+(1 row)
+
+select 'a' ~ '.. ()|\1';
+ ?column? 
+----------
+ f
+(1 row)
+
+select 'a' ~ '()*\1';
+ ?column? 
+----------
+ t
+(1 row)
+
+select 'a' ~ '()+\1';
+ ?column? 
+----------
+ t
+(1 row)
+
+-- Error conditions
+select 'xyz' ~ 'x(\w)(?=\1)';  -- no backrefs in LACONs
+psql:/tmp/TestRegex_TestRegexBasic.sql:43: ERROR:  invalid regular expression: invalid backreference number
+select 'a' ~ '\x7fffffff';  -- invalid chr code
+psql:/tmp/TestRegex_TestRegexBasic.sql:44: ERROR:  invalid regular expression: invalid escape \ sequence

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/23c45c74/src/test/feature/regex/sql/regex_basic.sql
----------------------------------------------------------------------
diff --git a/src/test/feature/regex/sql/regex_basic.sql b/src/test/feature/regex/sql/regex_basic.sql
new file mode 100644
index 0000000..7dfad9c
--- /dev/null
+++ b/src/test/feature/regex/sql/regex_basic.sql
@@ -0,0 +1,41 @@
+--
+-- Regular expression tests
+--
+
+-- Don't want to have to double backslashes in regexes
+set standard_conforming_strings = on;
+
+-- Test simple quantified backrefs
+select 'bbbbb' ~ '^([bc])\1*$' as t;
+select 'ccc' ~ '^([bc])\1*$' as t;
+select 'xxx' ~ '^([bc])\1*$' as f;
+select 'b' ~ '^([bc])\1*$' as t;
+
+-- Test lookahead constraints
+select regexp_matches('ab', 'a(?=b)b*');
+select regexp_matches('a', 'a(?=b)b*');
+select regexp_matches('abc', 'a(?=b)b*(?=c)c*');
+select regexp_matches('ab', 'a(?=b)b*(?=c)c*');
+select regexp_matches('ab', 'a(?!b)b*');
+select regexp_matches('a', 'a(?!b)b*');
+select regexp_matches('b', '(?=b)b');
+select regexp_matches('a', '(?=b)b');
+
+-- Test optimization of single-chr-or-bracket-expression lookaround constraints
+select 'xz' ~ 'x(?=[xy])';
+select 'xy' ~ 'x(?=[xy])';
+select 'xz' ~ 'x(?![xy])';
+select 'xy' ~ 'x(?![xy])';
+select 'x'  ~ 'x(?![xy])';
+select 'zyy' ~ '(?<![xy])yy+';
+
+-- Test for infinite loop in cfindloop with zero-length possible match
+-- but no actual match (can only happen in the presence of backrefs)
+select 'a' ~ '$()|^\1';
+select 'a' ~ '.. ()|\1';
+select 'a' ~ '()*\1';
+select 'a' ~ '()+\1';
+
+-- Error conditions
+select 'xyz' ~ 'x(\w)(?=\1)';  -- no backrefs in LACONs
+select 'a' ~ '\x7fffffff';  -- invalid chr code

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/23c45c74/src/test/feature/regex/test_regex.cpp
----------------------------------------------------------------------
diff --git a/src/test/feature/regex/test_regex.cpp b/src/test/feature/regex/test_regex.cpp
new file mode 100644
index 0000000..5b08357
--- /dev/null
+++ b/src/test/feature/regex/test_regex.cpp
@@ -0,0 +1,27 @@
+#include <pwd.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <vector>
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <iostream>
+#include <string>
+
+#include "lib/sql_util.h"
+
+#include "gtest/gtest.h"
+
+class TestRegex : public ::testing::Test {
+ public:
+  TestRegex() {}
+  ~TestRegex() {}
+};
+
+
+TEST_F(TestRegex, TestRegexBasic) {
+  hawq::test::SQLUtility util;
+  util.execSQLFile("regex/sql/regex_basic.sql",
+		  	  	   "regex/ans/regex_basic.ans");
+}
+