You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by nk...@apache.org on 2024/03/02 00:23:02 UTC

(madlib) 08/09: PMML: Consider spaces when parsing the indep var

This is an automated email from the ASF dual-hosted git repository.

nkak pushed a commit to branch madlib2-master
in repository https://gitbox.apache.org/repos/asf/madlib.git

commit b944045e624e791b6c41bca4ef5d56ba54d4bb68
Author: Nikhil Kak <nk...@vmware.com>
AuthorDate: Tue Feb 20 15:06:46 2024 -0800

    PMML: Consider spaces when parsing the indep var
    
    JIRA: MADLIB-1517
    
    A previous commit (0cd28f9733927d63beaefc9488db7f8bfdb3bd80) added support for
    parsing the independent variable expression to determine whether an intercept
    was used during training. This commit improves the regex by adding support for
    spaces, and also adds a detailed explanation of the regex.
    
    This commit also fixes a warning that would get generated with the previous regex:
    ```
    re.compile(r'array[[]([0-1],|[0-1].0,)?(["a-z0-9_, .]+)[]]', flags=re.I)
    <stdin>:1: FutureWarning: Possible nested set at position 6
    ```
---
 src/ports/postgres/modules/pmml/formula.py_in      |  16 +-
 .../pmml/test/unit_tests/test_formula.py_in        | 177 +++++++++++++++++++++
 2 files changed, 191 insertions(+), 2 deletions(-)

diff --git a/src/ports/postgres/modules/pmml/formula.py_in b/src/ports/postgres/modules/pmml/formula.py_in
index 0d575315..5f97bb51 100644
--- a/src/ports/postgres/modules/pmml/formula.py_in
+++ b/src/ports/postgres/modules/pmml/formula.py_in
@@ -12,8 +12,20 @@ class Formula(object):
         :param coef_len: Length of all the coefficients including the
                          intercept's coefficient(if any)
         """
-        # TODO: Fix the nested warning and add explanation for the regex
-        self.array_expr = re.compile(r'array[[]([0-1],|[0-1].0,)?(["a-z0-9_, .]+)[]]', flags=re.I)
+
+        self.array_expr = re.compile(r'array\[(\s*?[0-1]\s*?,\s*?|\s*?[0-1].0\s*?,\s*?)?(["a-z0-9_, .]+)]',
+                                     flags=re.I)
+        # Regex explanation:
+        # array\[ matches the literal "array[" in any letter case (flags=re.I), e.g. "ARRAY["
+        # \s*? lazily matches zero or more whitespace characters
+        # | represents an OR between the two alternatives inside the first group
+        # \s*?[0-1]\s*?,\s*? matches "1," or "0,", allowing whitespace around the digit and after the comma
+        # \s*?[0-1].0\s*?,\s*? matches "1.0," or "0.0," with the same optional whitespace
+        #   NOTE: the "." is unescaped, so it matches any single character (except a newline), not only a dot
+        # ( ... )? captures the output of that group; the trailing ? makes the whole group optional
+        #   That's why we use ()? for the first capture group, i.e. "1,", "0,", "1.0," or "0.0,"
+        # (["a-z0-9_, .]+) captures one or more of the listed characters; the final ] matches the closing bracket
+
         self.non_array_expr = re.compile(r'["a-z0-9_]+', flags=re.I)
 
         self.intercept = self.has_intercept(x_str)
diff --git a/src/ports/postgres/modules/pmml/test/unit_tests/test_formula.py_in b/src/ports/postgres/modules/pmml/test/unit_tests/test_formula.py_in
index 6075edc4..2ce7b8ae 100644
--- a/src/ports/postgres/modules/pmml/test/unit_tests/test_formula.py_in
+++ b/src/ports/postgres/modules/pmml/test/unit_tests/test_formula.py_in
@@ -138,11 +138,22 @@ class FormulaTestCase(unittest.TestCase):
         self.assertEqual(f.intercept, True)
 
     def test_formula_array_with_invalid_intercept(self):
+        f = self.subject.Formula('baaz', 'ARRAY[0.1,foo,bar]', 3)
+        self.assertEqual(f.x, ['0.1', 'foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, False)
+
+
         f = self.subject.Formula('baaz', 'ARRAY[10,foo,bar]', 3)
         self.assertEqual(f.x, ['10', 'foo', 'bar'])
         self.assertEqual(f.y, "baaz")
         self.assertEqual(f.intercept, False)
 
+        f = self.subject.Formula('baaz', 'ARRAY[  10  , foo,bar]', 3)
+        self.assertEqual(f.x, ['10', 'foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, False)
+
         # A negative number shouldn't be allowed technically the train functions
         # don't error out, so adding this test for the sake of completeness
         f = self.subject.Formula('baaz', 'ARRAY[-2,foo,bar]', 3)
@@ -203,6 +214,172 @@ class FormulaTestCase(unittest.TestCase):
         self.assertEqual(f.y, "baaz")
         self.assertEqual(f.intercept, False)
 
+    def test_formula_array_with_spaces_with_intercept(self):
+        f = self.subject.Formula('baaz', 'ARRAY[1 ,foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[1  ,"1",bar]', 3)
+        self.assertEqual(f.x, ['1', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[ 1,foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[  1,foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+
+        f = self.subject.Formula('baaz', 'ARRAY[  1  , foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[1,  "1",bar]', 3)
+        self.assertEqual(f.x, ['1', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[  1,  foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[  1 ,  foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[  1 ,  foo  ,  bar  ]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[1 ,foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[1  ,"1",bar]', 3)
+        self.assertEqual(f.x, ['1', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[1.0 ,foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[1.0  ,"1",bar]', 3)
+        self.assertEqual(f.x, ['1', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[ 1.0,foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[  1.0,foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+
+        f = self.subject.Formula('baaz', 'ARRAY[  1.0  , foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[1.0,  "1.0",bar]', 3)
+        self.assertEqual(f.x, ['1.0', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[  1.0,  foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[  1.0 ,  foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[  1.0 ,  foo  ,  bar  ]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[1.0 ,foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[1.0  ,"1",bar]', 3)
+        self.assertEqual(f.x, ['1', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[ 0,foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[  0,foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+
+        f = self.subject.Formula('baaz', 'ARRAY[  0  ,foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[0,  "1",bar]', 3)
+        self.assertEqual(f.x, ['1', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[  0,  foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[  0 ,  foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[  0 ,  foo  ,  bar  ]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+    def test_formula_array_with_spaces_without_intercept(self):
+        f = self.subject.Formula('baaz', 'ARRAY[  foo,bar]', 2)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, False)
+
+        f = self.subject.Formula('baaz', 'ARRAY[  foo ,  bar ]', 2)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, False)
+
+        f = self.subject.Formula('baaz', 'ARRAY[foo  ,bar]', 2)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, False)
+
+
     def test_formula_nonarray(self):
         f = self.subject.Formula('baaz', 'foo', 3)
         self.assertEqual(f.x, ['foo[1]', 'foo[2]'])