You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@iceberg.apache.org by dw...@apache.org on 2019/09/10 22:57:51 UTC

[incubator-iceberg] branch master updated: [python] Adding a string to iceberg expression converter and tests (#426)

This is an automated email from the ASF dual-hosted git repository.

dweeks pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-iceberg.git


The following commit(s) were added to refs/heads/master by this push:
     new 776210e  [python] Adding a string to iceberg expression converter and tests (#426)
776210e is described below

commit 776210e3e49591a47c5d3d93ecd6d5c8b22e19b5
Author: TGooch44 <te...@gmail.com>
AuthorDate: Tue Sep 10 15:57:46 2019 -0700

    [python] Adding a string to iceberg expression converter and tests (#426)
---
 python/iceberg/api/expressions/expressions.py    |  88 ++++++++++++++++
 python/setup.py                                  |   2 +
 python/tests/api/expressions/test_str_to_expr.py | 128 +++++++++++++++++++++++
 3 files changed, 218 insertions(+)

diff --git a/python/iceberg/api/expressions/expressions.py b/python/iceberg/api/expressions/expressions.py
index 72bb562..190382d 100644
--- a/python/iceberg/api/expressions/expressions.py
+++ b/python/iceberg/api/expressions/expressions.py
@@ -15,6 +15,8 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import logging
+
 from .expression import (And,
                          FALSE,
                          Not,
@@ -25,6 +27,8 @@ from .predicate import (Predicate,
                         UnboundPredicate)
 from .reference import NamedReference
 
+_logger = logging.getLogger(__name__)
+
 
 class Expressions(object):
 
@@ -122,6 +126,90 @@ class Expressions(object):
     def ref(name):
         return NamedReference(name)
 
+    @staticmethod
+    def convert_string_to_expr(predicate_string):
+        """Parse a SQL WHERE-style predicate string into an iceberg expression."""
+        from moz_sql_parser import parse
+        from pyparsing import ParseException
+
+        expr_map = {"and": (Expressions.and_,),
+                    "eq": (Expressions.equal,),
+                    "exists": (Expressions.not_null,),
+                    "gt": (Expressions.greater_than,),
+                    "gte": (Expressions.greater_than_or_equal,),
+                    "lt": (Expressions.less_than,),
+                    "lte": (Expressions.less_than_or_equal,),
+                    "missing": (Expressions.is_null,),
+                    "neq": (Expressions.not_equal,),
+                    "not": (Expressions.not_,),
+                    "or": (Expressions.or_,)}
+        # wrap the predicate in a throwaway SELECT so the parser yields it under the "where" key
+        dummy_query = "SELECT * FROM tbl WHERE {}".format(predicate_string)  # nosec
+        try:
+            where_clause = Expressions._transform_between_op(parse(dummy_query)["where"])
+            expr = Expressions._transform_to_binary_tuples(where_clause)
+            return Expressions._get_expr(expr, expr_map)
+        except ParseException as pe:
+            _logger.error("Error parsing string expression into iceberg expression: %s", pe)
+            raise
+
+    @staticmethod
+    def _get_expr(node, expr_map):
+        """Recursively turn a parsed node into an expression; RuntimeError on unmapped op or node type."""
+        if isinstance(node, dict):
+            op = next(reversed(list(node)), None)  # last key names the op; None for an empty node
+            if op == "literal":
+                return node["literal"]
+            mapped_op = expr_map.get(op)  # None (not the map itself) when the op is unknown
+            if mapped_op is None:
+                raise RuntimeError("no mapping for op: %s" % op)
+            if len(mapped_op) == 1:
+                mapped_op = mapped_op[0]
+            if mapped_op in (Expressions.not_, Expressions.not_null, Expressions.is_null):
+                return mapped_op(Expressions._get_expr(node[op], expr_map))
+
+            return mapped_op(*Expressions._get_expr(node[op], expr_map))
+        elif isinstance(node, (list, tuple)):
+            return (Expressions._get_expr(item, expr_map) for item in node)
+        elif isinstance(node, (str, int, float)):
+            return node
+        else:
+            raise RuntimeError("unknown node type: %s" % type(node))
+
+    @staticmethod
+    def _transform_to_binary_tuples(expr):
+        """Rewrite n-ary operator nodes (in place) into right-nested two-operand nodes."""
+        if not isinstance(expr, dict):
+            return expr
+        for op in expr:
+            if op in ("exists", "literal", "missing"):
+                return expr
+            if op == "not":  # unary: recurse so n-ary ops under a NOT are binarized too
+                expr[op] = Expressions._transform_to_binary_tuples(expr[op])
+                return expr
+            operands = [Expressions._transform_to_binary_tuples(child) for child in expr[op]]
+            while len(operands) > 2:  # fold the last two operands into one nested node
+                operands[-2:] = [{op: operands[-2:]}]
+            expr[op] = operands
+        return expr
+
+    @staticmethod
+    def _transform_between_op(expr):
+        """Expand each {"between": [x, lo, hi]} node into {"and": [gte(x, lo), lte(x, hi)]}."""
+        if not isinstance(expr, dict):
+            return expr
+        for op, children in expr.items():
+            if op in ("exists", "literal", "missing"):
+                return expr
+            if op == "not":  # unary: recurse so a BETWEEN under a NOT is expanded too
+                expr[op] = Expressions._transform_between_op(children)
+                return expr
+            operands = [Expressions._transform_between_op(child) for child in children]
+            if op == "between":
+                return {"and": [{"gte": operands[:2]}, {"lte": [operands[0], operands[2]]}]}
+            expr[op] = operands
+        return expr
+
 
 class ExpressionVisitors(object):
 
diff --git a/python/setup.py b/python/setup.py
index 20f01f2..4ee888b 100755
--- a/python/setup.py
+++ b/python/setup.py
@@ -30,6 +30,8 @@ setup(
                       'boto3',
                       'fastavro',
                       'mmh3',
+                      'moz_sql_parser',
+                      'pyparsing',
                       'python-dateutil',
                       'pytz',
                       'requests',
diff --git a/python/tests/api/expressions/test_str_to_expr.py b/python/tests/api/expressions/test_str_to_expr.py
new file mode 100644
index 0000000..d44d52e
--- /dev/null
+++ b/python/tests/api/expressions/test_str_to_expr.py
@@ -0,0 +1,128 @@
+from iceberg.api.expressions import Expressions
+
+
+def test_equal():
+    """``col_a=1`` converts to an equal predicate."""
+    expected_expr = Expressions.equal("col_a", 1)
+    assert expected_expr == Expressions.convert_string_to_expr("col_a=1")
+
+
+def test_equal_alt_syntax():
+    """``==`` is accepted as another spelling of ``=``."""
+    expected_expr = Expressions.equal("col_a", 1)
+    assert expected_expr == Expressions.convert_string_to_expr("col_a==1")
+
+
+def test_gt():
+    """``>`` converts to a greater_than predicate."""
+    expected_expr = Expressions.greater_than("col_a", 1)
+    assert expected_expr == Expressions.convert_string_to_expr("col_a > 1")
+
+
+def test_gte():
+    """``>=`` converts to a greater_than_or_equal predicate."""
+    expected_expr = Expressions.greater_than_or_equal("col_a", 1)
+    assert expected_expr == Expressions.convert_string_to_expr("col_a >= 1")
+
+
+def test_lt():
+    """``<`` converts to a less_than predicate."""
+    expected_expr = Expressions.less_than("col_a", 1)
+    assert expected_expr == Expressions.convert_string_to_expr("col_a < 1")
+
+
+def test_lte():
+    """``<=`` converts to a less_than_or_equal predicate."""
+    expected_expr = Expressions.less_than_or_equal("col_a", 1)
+    assert expected_expr == Expressions.convert_string_to_expr("col_a <= 1")
+
+
+def test_and():
+    """Two comparisons joined by ``and`` convert to and_ of both predicates."""
+    expected_expr = Expressions.and_(Expressions.equal("col_a", 1), Expressions.equal("col_b", 2))
+    assert expected_expr == Expressions.convert_string_to_expr("col_a=1 and col_b=2")
+
+
+def test_or():
+    """Two comparisons joined by ``or`` convert to or_ of both predicates."""
+    expected_expr = Expressions.or_(Expressions.equal("col_a", 1), Expressions.equal("col_b", 2))
+    assert expected_expr == Expressions.convert_string_to_expr("col_a=1 or col_b=2")
+
+
+def test_between():
+    """``between lo and hi`` expands to ``>= lo`` and-ed with ``<= hi``."""
+    expected_expr = Expressions.and_(Expressions.greater_than_or_equal("col_a", 1),
+                                     Expressions.less_than_or_equal("col_a", 2))
+    assert expected_expr == Expressions.convert_string_to_expr("col_a between 1 and 2")
+
+
+def test_is_null():
+    """``is null`` converts to an is_null predicate."""
+    expected_expr = Expressions.is_null("col_a")
+    assert expected_expr == Expressions.convert_string_to_expr("col_a is null")
+
+
+def test_not_null():
+    """``is not null`` converts to a not_null predicate."""
+    expected_expr = Expressions.not_null("col_a")
+    assert expected_expr == Expressions.convert_string_to_expr("col_a is not null")
+
+
+def test_not():
+    """A bare ``not col_a`` converts to not_ applied to the column name."""
+    expected_expr = Expressions.not_("col_a")
+    assert expected_expr == Expressions.convert_string_to_expr("not col_a")
+
+
+def test_not_equal():
+    """``<>`` converts to a not_equal predicate."""
+    expected_expr = Expressions.not_equal("col_a", 7)
+    assert expected_expr == Expressions.convert_string_to_expr("col_a <> 7")
+
+
+def test_not_equal_alt_syntax():
+    """``!=`` is accepted as another spelling of ``<>``."""
+    expected_expr = Expressions.not_equal("col_a", 7)
+    assert expected_expr == Expressions.convert_string_to_expr("col_a != 7")
+
+
+def test_compound_not_equal():
+    """``not (...)`` around a comparison converts to not_ wrapping that predicate."""
+    expected_expr = Expressions.not_(Expressions.equal("col_a", 7))
+    assert expected_expr == Expressions.convert_string_to_expr("not (col_a = 7)")
+
+
+def test_ternary_condition():
+    """Three and-ed comparisons nest to the right: a and (b and c)."""
+    expected_expr = Expressions.and_(Expressions.equal("col_a", 1),
+                                     Expressions.and_(Expressions.equal("col_b", 2),
+                                                      Expressions.equal("col_c", 3)))
+
+    assert expected_expr == Expressions.convert_string_to_expr("col_a=1 and col_b=2 and col_c=3")
+
+
+def test_precedence():
+    """Pins ``a or b and c`` as (a or b) and c. NOTE(review): SQL binds AND tighter - confirm."""
+    expected_expr = Expressions.and_(Expressions.or_(Expressions.equal("col_a", 1),
+                                                     Expressions.equal("col_b", 2)),
+                                     Expressions.equal("col_c", 3))
+
+    assert expected_expr == Expressions.convert_string_to_expr("col_a=1 or col_b=2 and col_c=3")
+
+
+def test_precedence_with_between():
+    """The ``and`` inside a between stays with the between when it is followed by ``or``."""
+    expected_expr = Expressions.or_(Expressions.and_(Expressions.greater_than_or_equal("col_a", 1),
+                                                     Expressions.less_than_or_equal("col_a", 2)),
+                                    Expressions.equal("col_c", 3))
+
+    assert expected_expr == Expressions.convert_string_to_expr("col_a between 1 and 2 or col_c=3")
+
+
+def test_complex_expansion():
+    """A parenthesized three-way ``and`` is nested to the right and then or-ed with ``is null``."""
+    expected_expr = Expressions.or_(Expressions.and_(Expressions.equal("a", 1),
+                                                     Expressions.and_(Expressions.equal("b", 2),
+                                                                      Expressions.not_equal("c", 3))),
+                                    Expressions.is_null("d"))
+    assert expected_expr == Expressions.convert_string_to_expr("(a=1 and b=2 and c<>3) or d is null")