You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@superset.apache.org by el...@apache.org on 2023/12/02 01:20:58 UTC

(superset) 03/08: fix: improve upload ZIP file validation (#25658)

This is an automated email from the ASF dual-hosted git repository.

elizabeth pushed a commit to branch 2.1
in repository https://gitbox.apache.org/repos/asf/superset.git

commit 7c23cb0b3fd224c320b35f05e74b572033569154
Author: Daniel Vaz Gaspar <da...@gmail.com>
AuthorDate: Tue Oct 17 18:28:09 2023 +0100

    fix: improve upload ZIP file validation (#25658)
---
 superset/commands/importers/v1/utils.py |   2 +
 superset/config.py                      |   5 +
 superset/utils/core.py                  |  19 ++++
 tests/unit_tests/utils/test_core.py     | 179 +++++++++++++++++++++++++++++++-
 4 files changed, 203 insertions(+), 2 deletions(-)

diff --git a/superset/commands/importers/v1/utils.py b/superset/commands/importers/v1/utils.py
index 1e73886682..3df5fbf821 100644
--- a/superset/commands/importers/v1/utils.py
+++ b/superset/commands/importers/v1/utils.py
@@ -25,6 +25,7 @@ from marshmallow.exceptions import ValidationError
 from superset import db
 from superset.commands.importers.exceptions import IncorrectVersionError
 from superset.models.core import Database
+from superset.utils.core import check_is_safe_zip
 
 METADATA_FILE_NAME = "metadata.yaml"
 IMPORT_VERSION = "1.0.0"
@@ -147,6 +148,7 @@ def is_valid_config(file_name: str) -> bool:
 
 
 def get_contents_from_bundle(bundle: ZipFile) -> Dict[str, str]:
+    check_is_safe_zip(bundle)
     return {
         remove_root(file_name): bundle.read(file_name).decode()
         for file_name in bundle.namelist()
diff --git a/superset/config.py b/superset/config.py
index c089416505..f2d9fa5adf 100644
--- a/superset/config.py
+++ b/superset/config.py
@@ -1530,6 +1530,11 @@ WELCOME_PAGE_LAST_TAB: Union[
     Literal["examples", "all"], Tuple[str, List[Dict[str, Any]]]
 ] = "all"
 
+# Max allowed size for a zipped file
+ZIPPED_FILE_MAX_SIZE = 100 * 1024 * 1024  # 100MB
+# Max allowed compression ratio for a zipped file
+ZIP_FILE_MAX_COMPRESS_RATIO = 200.0
+
 # Configuration for environment tag shown on the navbar. Setting 'text' to '' will hide the tag.
 # 'color' can either be a hex color code, or a dot-indexed theme color (e.g. error.base)
 ENVIRONMENT_TAG_CONFIG = {
diff --git a/superset/utils/core.py b/superset/utils/core.py
index 8cf73b84aa..517ca6e21b 100644
--- a/superset/utils/core.py
+++ b/superset/utils/core.py
@@ -1998,6 +1998,25 @@ def create_zip(files: Dict[str, Any]) -> BytesIO:
     return buf
 
 
+def check_is_safe_zip(zip_file: ZipFile) -> None:
+    """
+    Checks whether a ZIP file is safe, raises SupersetException if not.
+
+    :param zip_file:
+    :return:
+    """
+    uncompress_size = 0
+    compress_size = 0
+    for zip_file_element in zip_file.infolist():
+        if zip_file_element.file_size > current_app.config["ZIPPED_FILE_MAX_SIZE"]:
+            raise SupersetException("Found file with size above allowed threshold")
+        uncompress_size += zip_file_element.file_size
+        compress_size += zip_file_element.compress_size
+    compress_ratio = uncompress_size / compress_size
+    if compress_ratio > current_app.config["ZIP_FILE_MAX_COMPRESS_RATIO"]:
+        raise SupersetException("Zip compress ratio above allowed threshold")
+
+
 def remove_extra_adhoc_filters(form_data: Dict[str, Any]) -> None:
     """
     Remove filters from slice data that originate from a filter box or native filter
diff --git a/tests/unit_tests/utils/test_core.py b/tests/unit_tests/utils/test_core.py
index 6845bb2fc1..d95b302c4f 100644
--- a/tests/unit_tests/utils/test_core.py
+++ b/tests/unit_tests/utils/test_core.py
@@ -15,11 +15,25 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-from typing import Any, Dict
+import os
+from dataclasses import dataclass
+from typing import Any, Dict, Optional
+from unittest.mock import MagicMock
 
+import pandas as pd
 import pytest
 
-from superset.utils.core import QueryObjectFilterClause, remove_extra_adhoc_filters
+from superset.exceptions import SupersetException
+from superset.utils.core import (
+    cast_to_boolean,
+    check_is_safe_zip,
+    DateColumn,
+    is_test,
+    normalize_dttm_col,
+    parse_boolean_string,
+    QueryObjectFilterClause,
+    remove_extra_adhoc_filters,
+)
 
 ADHOC_FILTER: QueryObjectFilterClause = {
     "col": "foo",
@@ -35,6 +49,12 @@ EXTRA_FILTER: QueryObjectFilterClause = {
 }
 
 
+@dataclass
+class MockZipInfo:
+    file_size: int
+    compress_size: int
+
+
 @pytest.mark.parametrize(
     "original,expected",
     [
@@ -84,3 +104,158 @@ def test_remove_extra_adhoc_filters(
 ) -> None:
     remove_extra_adhoc_filters(original)
     assert expected == original
+
+
+def test_is_test() -> None:
+    orig_value = os.getenv("SUPERSET_TESTENV")
+
+    os.environ["SUPERSET_TESTENV"] = "true"
+    assert is_test()
+    os.environ["SUPERSET_TESTENV"] = "false"
+    assert not is_test()
+    os.environ["SUPERSET_TESTENV"] = ""
+    assert not is_test()
+
+    if orig_value is not None:
+        os.environ["SUPERSET_TESTENV"] = orig_value
+
+
+@pytest.mark.parametrize(
+    "test_input,expected",
+    [
+        ("y", True),
+        ("Y", True),
+        ("yes", True),
+        ("True", True),
+        ("t", True),
+        ("true", True),
+        ("On", True),
+        ("on", True),
+        ("1", True),
+        ("n", False),
+        ("N", False),
+        ("no", False),
+        ("False", False),
+        ("f", False),
+        ("false", False),
+        ("Off", False),
+        ("off", False),
+        ("0", False),
+        ("foo", False),
+        (None, False),
+    ],
+)
+def test_parse_boolean_string(test_input: Optional[str], expected: bool) -> None:
+    assert parse_boolean_string(test_input) == expected
+
+
+def test_int_values() -> None:
+    assert cast_to_boolean(1) is True
+    assert cast_to_boolean(0) is False
+    assert cast_to_boolean(-1) is True
+    assert cast_to_boolean(42) is True
+    assert cast_to_boolean(0) is False
+
+
+def test_float_values() -> None:
+    assert cast_to_boolean(0.5) is True
+    assert cast_to_boolean(3.14) is True
+    assert cast_to_boolean(-2.71) is True
+    assert cast_to_boolean(0.0) is False
+
+
+def test_string_values() -> None:
+    assert cast_to_boolean("true") is True
+    assert cast_to_boolean("TruE") is True
+    assert cast_to_boolean("false") is False
+    assert cast_to_boolean("FaLsE") is False
+    assert cast_to_boolean("") is False
+
+
+def test_none_value() -> None:
+    assert cast_to_boolean(None) is None
+
+
+def test_boolean_values() -> None:
+    assert cast_to_boolean(True) is True
+    assert cast_to_boolean(False) is False
+
+
+def test_other_values() -> None:
+    assert cast_to_boolean([]) is False
+    assert cast_to_boolean({}) is False
+    assert cast_to_boolean(object()) is False
+
+
+def test_normalize_dttm_col() -> None:
+    """
+    Tests for the ``normalize_dttm_col`` function.
+
+    In particular, this covers a regression when Pandas was upgraded from 1.5.3 to
+    2.0.3 and the behavior of ``pd.to_datetime`` changed.
+    """
+    df = pd.DataFrame({"__time": ["2017-07-01T00:00:00.000Z"]})
+    assert (
+        df.to_markdown()
+        == """
+|    | __time                   |
+|---:|:-------------------------|
+|  0 | 2017-07-01T00:00:00.000Z |
+    """.strip()
+    )
+
+    # in 1.5.3 this would return a datetime64[ns] dtype, but in 2.0.3 we had to
+    # add ``exact=False`` since there is a leftover after parsing the format
+    dttm_cols = (DateColumn("__time", "%Y-%m-%d"),)
+
+    # the function modifies the dataframe in place
+    normalize_dttm_col(df, dttm_cols)
+
+    assert df["__time"].astype(str).tolist() == ["2017-07-01"]
+
+
+def test_check_if_safe_zip_success(app_context: None) -> None:
+    """
+    Test if ZIP files are safe
+    """
+    ZipFile = MagicMock()
+    ZipFile.infolist.return_value = [
+        MockZipInfo(file_size=1000, compress_size=10),
+        MockZipInfo(file_size=1000, compress_size=10),
+        MockZipInfo(file_size=1000, compress_size=10),
+        MockZipInfo(file_size=1000, compress_size=10),
+        MockZipInfo(file_size=1000, compress_size=10),
+    ]
+    check_is_safe_zip(ZipFile)
+
+
+def test_check_if_safe_zip_high_rate(app_context: None) -> None:
+    """
+    Test if ZIP files is not highly compressed
+    """
+    ZipFile = MagicMock()
+    ZipFile.infolist.return_value = [
+        MockZipInfo(file_size=1000, compress_size=1),
+        MockZipInfo(file_size=1000, compress_size=1),
+        MockZipInfo(file_size=1000, compress_size=1),
+        MockZipInfo(file_size=1000, compress_size=1),
+        MockZipInfo(file_size=1000, compress_size=1),
+    ]
+    with pytest.raises(SupersetException):
+        check_is_safe_zip(ZipFile)
+
+
+def test_check_if_safe_zip_hidden_bomb(app_context: None) -> None:
+    """
+    Test if ZIP file does not contain a big file highly compressed
+    """
+    ZipFile = MagicMock()
+    ZipFile.infolist.return_value = [
+        MockZipInfo(file_size=1000, compress_size=100),
+        MockZipInfo(file_size=1000, compress_size=100),
+        MockZipInfo(file_size=1000, compress_size=100),
+        MockZipInfo(file_size=1000, compress_size=100),
+        MockZipInfo(file_size=1000 * (1024 * 1024), compress_size=100),
+    ]
+    with pytest.raises(SupersetException):
+        check_is_safe_zip(ZipFile)