You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@iceberg.apache.org by fo...@apache.org on 2022/09/16 03:21:28 UTC
[iceberg] branch master updated: Python: Add CLI command to list files (#5690)

This is an automated email from the ASF dual-hosted git repository.

fokko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git


The following commit(s) were added to refs/heads/master by this push:
     new 76aaa6a4b5 Python: Add CLI command to list files (#5690)
76aaa6a4b5 is described below

commit 76aaa6a4b5c6ff0f512784e55954650a604dbca0
Author: Fokko Driesprong <fo...@apache.org>
AuthorDate: Thu Sep 15 20:21:22 2022 -0700

    Python: Add CLI command to list files (#5690)
    
    This makes it easy to check that the FileIO works, by listing the manifest list, manifests and data files behind a table's snapshots:
    
    ```
    > pyiceberg files nyc.taxis
    Snapshots: nyc.taxis
    └── Snapshot 5937117119577207079, schema 0: file:/Users/fokkodriesprong/Desktop/docker-spark-iceberg/wh/nyc.db/taxis/metadata/snap-5937117119577207079-1-94656c4f-4c66-4600-a4ca-f30377300527.avro
        └── Manifest: file:/Users/fokkodriesprong/Desktop/docker-spark-iceberg/wh/nyc.db/taxis/metadata/94656c4f-4c66-4600-a4ca-f30377300527-m0.avro
            └── Datafile: file:/Users/fokkodriesprong/Desktop/docker-spark-iceberg/wh/nyc.db/taxis/data/00003-4-a245d9ee-8462-4a08-8cbc-26b8b33b9377-00001.parquet
    ```
---
 python/pyiceberg/cli/console.py      |  15 ++
 python/pyiceberg/cli/output.py       |  82 ++++++----
 python/pyiceberg/manifest.py         |   6 +-
 python/pyiceberg/table/snapshots.py  |  15 +-
 python/tests/avro/test_reader.py     |   6 +-
 python/tests/conftest.py             |  16 +-
 python/tests/table/test_snapshots.py |  39 +++++
 python/tests/utils/test_manifest.py  | 293 ++++++++++++++++++++++++++++++++++-
 8 files changed, 437 insertions(+), 35 deletions(-)

diff --git a/python/pyiceberg/cli/console.py b/python/pyiceberg/cli/console.py
index 3a040159d8..e8234d7c7e 100644
--- a/python/pyiceberg/cli/console.py
+++ b/python/pyiceberg/cli/console.py
@@ -24,6 +24,7 @@ from click import Context
 from pyiceberg.catalog import Catalog, load_catalog
 from pyiceberg.cli.output import ConsoleOutput, JsonOutput, Output
 from pyiceberg.exceptions import NoSuchNamespaceError, NoSuchPropertyException, NoSuchTableError
+from pyiceberg.io import load_file_io
 
 
 def catch_exception():
@@ -136,6 +137,20 @@ def describe(ctx: Context, entity: Literal["name", "namespace", "table"], identi
         raise NoSuchTableError(f"Table or namespace does not exist: {identifier}")
 
 
+@run.command()
+@click.argument("identifier")
+@click.option("--history", is_flag=True)
+@click.pass_context
+@catch_exception()
+def files(ctx: Context, identifier: str, history: bool):
+    """Lists all the files of the table"""
+    catalog, output = _catalog_and_output(ctx)
+
+    catalog_table = catalog.load_table(identifier)
+    io = load_file_io({**catalog.properties, **catalog_table.metadata.properties})
+    output.files(catalog_table, io, history)
+
+
 @run.command()
 @click.argument("identifier")
 @click.pass_context
diff --git a/python/pyiceberg/cli/output.py b/python/pyiceberg/cli/output.py
index 270567930e..c7907c7e5a 100644
--- a/python/pyiceberg/cli/output.py
+++ b/python/pyiceberg/cli/output.py
@@ -23,6 +23,7 @@ from rich.console import Console
 from rich.table import Table as RichTable
 from rich.tree import Tree
 
+from pyiceberg.io import FileIO
 from pyiceberg.schema import Schema
 from pyiceberg.table import Table
 from pyiceberg.table.partitioning import PartitionSpec
@@ -33,35 +34,39 @@ class Output(ABC):
     """Output interface for exporting"""
 
     @abstractmethod
-    def exception(self, ex: Exception):
+    def exception(self, ex: Exception) -> None:
         ...
 
     @abstractmethod
-    def identifiers(self, identifiers: List[Identifier]):
+    def identifiers(self, identifiers: List[Identifier]) -> None:
         ...
 
     @abstractmethod
-    def describe_table(self, table):
+    def describe_table(self, table: Table) -> None:
         ...
 
     @abstractmethod
-    def describe_properties(self, properties: Properties):
+    def files(self, table: Table, io: FileIO, history: bool) -> None:
         ...
 
     @abstractmethod
-    def text(self, response: str):
+    def describe_properties(self, properties: Properties) -> None:
         ...
 
     @abstractmethod
-    def schema(self, schema: Schema):
+    def text(self, response: str) -> None:
         ...
 
     @abstractmethod
-    def spec(self, spec: PartitionSpec):
+    def schema(self, schema: Schema) -> None:
         ...
 
     @abstractmethod
-    def uuid(self, uuid: Optional[UUID]):
+    def spec(self, spec: PartitionSpec) -> None:
+        ...
+
+    @abstractmethod
+    def uuid(self, uuid: Optional[UUID]) -> None:
         ...
 
 
@@ -70,27 +75,27 @@ class ConsoleOutput(Output):
 
     verbose: bool
 
-    def __init__(self, **properties: Any):
+    def __init__(self, **properties: Any) -> None:
         self.verbose = properties.get("verbose", False)
 
     @property
     def _table(self) -> RichTable:
         return RichTable.grid(padding=(0, 2))
 
-    def exception(self, ex: Exception):
+    def exception(self, ex: Exception) -> None:
         if self.verbose:
             Console(stderr=True).print_exception()
         else:
             Console(stderr=True).print(ex)
 
-    def identifiers(self, identifiers: List[Identifier]):
+    def identifiers(self, identifiers: List[Identifier]) -> None:
         table = self._table
         for identifier in identifiers:
             table.add_row(".".join(identifier))
 
         Console().print(table)
 
-    def describe_table(self, table: Table):
+    def describe_table(self, table: Table) -> None:
         metadata = table.metadata
         table_properties = self._table
 
@@ -119,25 +124,47 @@ class ConsoleOutput(Output):
         output_table.add_row("Properties", table_properties)
         Console().print(output_table)
 
-    def describe_properties(self, properties: Properties):
+    def files(self, table: Table, io: FileIO, history: bool) -> None:
+        if history:
+            snapshots = table.metadata.snapshots
+        else:
+            if snapshot := table.current_snapshot():
+                snapshots = [snapshot]
+            else:
+                snapshots = []
+
+        snapshot_tree = Tree(f"Snapshots: {'.'.join(table.identifier)}")
+
+        for snapshot in snapshots:
+            manifest_list_str = f": {snapshot.manifest_list}" if snapshot.manifest_list else ""
+            list_tree = snapshot_tree.add(f"Snapshot {snapshot.snapshot_id}, schema {snapshot.schema_id}{manifest_list_str}")
+
+            manifest_list = snapshot.fetch_manifest_list(io)
+            for manifest in manifest_list:
+                manifest_tree = list_tree.add(f"Manifest: {manifest.manifest_path}")
+                for manifest_entry in manifest.fetch_manifest_entry(io):
+                    manifest_tree.add(f"Datafile: {manifest_entry.data_file.file_path}")
+        Console().print(snapshot_tree)
+
+    def describe_properties(self, properties: Properties) -> None:
         output_table = self._table
         for k, v in properties.items():
             output_table.add_row(k, v)
         Console().print(output_table)
 
-    def text(self, response: str):
+    def text(self, response: str) -> None:
         Console().print(response)
 
-    def schema(self, schema: Schema):
+    def schema(self, schema: Schema) -> None:
         output_table = self._table
         for field in schema.fields:
             output_table.add_row(field.name, str(field.field_type), field.doc or "")
         Console().print(output_table)
 
-    def spec(self, spec: PartitionSpec):
+    def spec(self, spec: PartitionSpec) -> None:
         Console().print(str(spec))
 
-    def uuid(self, uuid: Optional[UUID]):
+    def uuid(self, uuid: Optional[UUID]) -> None:
         Console().print(str(uuid) if uuid else "missing")
 
 
@@ -146,32 +173,35 @@ class JsonOutput(Output):
 
     verbose: bool
 
-    def __init__(self, **properties: Any):
+    def __init__(self, **properties: Any) -> None:
         self.verbose = properties.get("verbose", False)
 
     def _out(self, d: Any) -> None:
         print(json.dumps(d))
 
-    def exception(self, ex: Exception):
+    def exception(self, ex: Exception) -> None:
         self._out({"type": ex.__class__.__name__, "message": str(ex)})
 
-    def identifiers(self, identifiers: List[Identifier]):
+    def identifiers(self, identifiers: List[Identifier]) -> None:
         self._out([".".join(identifier) for identifier in identifiers])
 
-    def describe_table(self, table: Table):
+    def describe_table(self, table: Table) -> None:
         print(table.json())
 
-    def describe_properties(self, properties: Properties):
+    def describe_properties(self, properties: Properties) -> None:
         self._out(properties)
 
-    def text(self, response: str):
+    def text(self, response: str) -> None:
         print(json.dumps(response))
 
-    def schema(self, schema: Schema):
+    def schema(self, schema: Schema) -> None:
         print(schema.json())
 
-    def spec(self, spec: PartitionSpec):
+    def files(self, table: Table, io: FileIO, history: bool) -> None:
+        pass
+
+    def spec(self, spec: PartitionSpec) -> None:
         print(spec.json())
 
-    def uuid(self, uuid: Optional[UUID]):
+    def uuid(self, uuid: Optional[UUID]) -> None:
         self._out({"uuid": str(uuid) if uuid else "missing"})
diff --git a/python/pyiceberg/manifest.py b/python/pyiceberg/manifest.py
index 6079fcc336..751dfbef2a 100644
--- a/python/pyiceberg/manifest.py
+++ b/python/pyiceberg/manifest.py
@@ -29,7 +29,7 @@ from pydantic import Field
 
 from pyiceberg.avro.file import AvroFile
 from pyiceberg.avro.reader import AvroStruct
-from pyiceberg.io import InputFile
+from pyiceberg.io import FileIO, InputFile
 from pyiceberg.schema import Schema
 from pyiceberg.types import (
     IcebergType,
@@ -128,6 +128,10 @@ class ManifestFile(IcebergBaseModel):
     partitions: Optional[List[FieldSummary]] = Field()
     key_metadata: Optional[bytes] = Field()
 
+    def fetch_manifest_entry(self, io: FileIO) -> List[ManifestEntry]:
+        file = io.new_input(self.manifest_path)
+        return list(read_manifest_entry(file))
+
 
 def read_manifest_entry(input_file: InputFile) -> Iterator[ManifestEntry]:
     with AvroFile(input_file) as reader:
diff --git a/python/pyiceberg/table/snapshots.py b/python/pyiceberg/table/snapshots.py
index 849b2f3c06..35dd3c87e7 100644
--- a/python/pyiceberg/table/snapshots.py
+++ b/python/pyiceberg/table/snapshots.py
@@ -15,10 +15,17 @@
 # specific language governing permissions and limitations
 # under the License.
 from enum import Enum
-from typing import Dict, Optional, Union
+from typing import (
+    Dict,
+    List,
+    Optional,
+    Union,
+)
 
 from pydantic import Field, PrivateAttr, root_validator
 
+from pyiceberg.io import FileIO
+from pyiceberg.manifest import ManifestFile, read_manifest_list
 from pyiceberg.utils.iceberg_base_model import IcebergBaseModel
 
 OPERATION = "operation"
@@ -103,6 +110,12 @@ class Snapshot(IcebergBaseModel):
         result_str = f"{operation}id={self.snapshot_id}{parent_id}{schema_id}"
         return result_str
 
+    def fetch_manifest_list(self, io: FileIO) -> List[ManifestFile]:
+        if self.manifest_list is not None:
+            file = io.new_input(self.manifest_list)
+            return list(read_manifest_list(file))
+        return []
+
 
 class MetadataLogEntry(IcebergBaseModel):
     metadata_file: str = Field(alias="metadata-file")
diff --git a/python/tests/avro/test_reader.py b/python/tests/avro/test_reader.py
index e4b0cb665e..44505a6174 100644
--- a/python/tests/avro/test_reader.py
+++ b/python/tests/avro/test_reader.py
@@ -376,9 +376,10 @@ def test_read_manifest_file_file(generated_manifest_file_file: str):
         records = list(reader)
 
     assert len(records) == 1, f"Expected 1 records, got {len(records)}"
-    assert records[0] == AvroStruct(
+    actual = records[0]
+    expected = AvroStruct(
         _data=[
-            "/home/iceberg/warehouse/nyc/taxis_partitioned/metadata/0125c686-8aa6-4502-bdcc-b6d17ca41a3b-m0.avro",
+            actual.get(0),
             7989,
             0,
             9182715666859759686,
@@ -391,6 +392,7 @@ def test_read_manifest_file_file(generated_manifest_file_file: str):
             0,
         ]
     )
+    assert actual == expected
 
 
 def test_fixed_reader():
diff --git a/python/tests/conftest.py b/python/tests/conftest.py
index 4030fbe9f1..be51c90357 100644
--- a/python/tests/conftest.py
+++ b/python/tests/conftest.py
@@ -26,7 +26,12 @@ retrieved using `request.getfixturevalue(fixture_name)`.
 """
 import os
 from tempfile import TemporaryDirectory
-from typing import Any, Dict, Union
+from typing import (
+    Any,
+    Dict,
+    Generator,
+    Union,
+)
 from urllib.parse import urlparse
 
 import pytest
@@ -932,7 +937,7 @@ def LocalFileIOFixture():
 
 
 @pytest.fixture(scope="session")
-def generated_manifest_entry_file(avro_schema_manifest_entry):
+def generated_manifest_entry_file(avro_schema_manifest_entry: Dict[str, Any]) -> Generator[str, None, None]:
     from fastavro import parse_schema, writer
 
     parsed_schema = parse_schema(avro_schema_manifest_entry)
@@ -945,11 +950,16 @@ def generated_manifest_entry_file(avro_schema_manifest_entry):
 
 
 @pytest.fixture(scope="session")
-def generated_manifest_file_file(avro_schema_manifest_file):
+def generated_manifest_file_file(
+    avro_schema_manifest_file: Dict[str, Any], generated_manifest_entry_file: str
+) -> Generator[str, None, None]:
     from fastavro import parse_schema, writer
 
     parsed_schema = parse_schema(avro_schema_manifest_file)
 
+    # Make sure that a valid manifest_path is set
+    manifest_file_records[0]["manifest_path"] = generated_manifest_entry_file
+
     with TemporaryDirectory() as tmpdir:
         tmp_avro_file = tmpdir + "/manifest.avro"
         with open(tmp_avro_file, "wb") as out:
diff --git a/python/tests/table/test_snapshots.py b/python/tests/table/test_snapshots.py
index 5dc48807ba..60d3bfa226 100644
--- a/python/tests/table/test_snapshots.py
+++ b/python/tests/table/test_snapshots.py
@@ -17,6 +17,8 @@
 # pylint:disable=redefined-outer-name,eval-used
 import pytest
 
+from pyiceberg.io.pyarrow import PyArrowFileIO
+from pyiceberg.manifest import FieldSummary, ManifestContent, ManifestFile
 from pyiceberg.table.snapshots import Operation, Snapshot, Summary
 
 
@@ -119,3 +121,40 @@ def test_snapshot_with_properties_repr(snapshot_with_properties: Snapshot):
         == """Snapshot(snapshot_id=25, parent_snapshot_id=19, sequence_number=200, timestamp_ms=1602638573590, manifest_list='s3:/a/b/c.avro', summary=Summary(Operation.APPEND, **{'foo': 'bar'}), schema_id=3)"""
     )
     assert snapshot_with_properties == eval(repr(snapshot_with_properties))
+
+
+def test_fetch_manifest_list(generated_manifest_file_file: str):
+    snapshot = Snapshot(
+        snapshot_id=25,
+        parent_snapshot_id=19,
+        sequence_number=200,
+        timestamp_ms=1602638573590,
+        manifest_list=generated_manifest_file_file,
+        summary=Summary(Operation.APPEND),
+        schema_id=3,
+    )
+    io = PyArrowFileIO()
+    actual = snapshot.fetch_manifest_list(io)
+    assert actual == [
+        ManifestFile(
+            manifest_path=actual[0].manifest_path,  # Is a temp path that changes every time
+            manifest_length=7989,
+            partition_spec_id=0,
+            content=ManifestContent.DATA,
+            sequence_number=0,
+            min_sequence_number=0,
+            added_snapshot_id=9182715666859759686,
+            added_data_files_count=3,
+            existing_data_files_count=0,
+            deleted_data_files_count=0,
+            added_rows_count=237993,
+            existing_rows_counts=None,
+            deleted_rows_count=0,
+            partitions=[
+                FieldSummary(
+                    contains_null=True, contains_nan=False, lower_bound=b"\x01\x00\x00\x00", upper_bound=b"\x02\x00\x00\x00"
+                )
+            ],
+            key_metadata=None,
+        )
+    ]
diff --git a/python/tests/utils/test_manifest.py b/python/tests/utils/test_manifest.py
index 66b70a2dc0..ae19c5d242 100644
--- a/python/tests/utils/test_manifest.py
+++ b/python/tests/utils/test_manifest.py
@@ -14,15 +14,22 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+
+from pyiceberg.io import load_file_io
 from pyiceberg.manifest import (
     DataFile,
+    DataFileContent,
     FieldSummary,
     FileFormat,
+    ManifestContent,
     ManifestEntry,
+    ManifestEntryStatus,
     ManifestFile,
     read_manifest_entry,
     read_manifest_list,
 )
+from pyiceberg.table import Snapshot
+from pyiceberg.table.snapshots import Operation, Summary
 from tests.io.test_io import LocalInputFile
 
 
@@ -262,9 +269,10 @@ def test_read_manifest_entry(generated_manifest_entry_file: str):
 
 def test_read_manifest_list(generated_manifest_file_file: str):
     input_file = LocalInputFile(generated_manifest_file_file)
-    assert list(read_manifest_list(input_file)) == [
+    actual = list(read_manifest_list(input_file))
+    expected = [
         ManifestFile(
-            manifest_path="/home/iceberg/warehouse/nyc/taxis_partitioned/metadata/0125c686-8aa6-4502-bdcc-b6d17ca41a3b-m0.avro",
+            manifest_path=actual[0].manifest_path,
             manifest_length=7989,
             partition_spec_id=0,
             added_snapshot_id=9182715666859759686,
@@ -281,3 +289,284 @@ def test_read_manifest_list(generated_manifest_file_file: str):
             deleted_rows_count=0,
         )
     ]
+    assert actual == expected
+
+
+def test_read_manifest(generated_manifest_file_file: str, generated_manifest_entry_file: str):
+    io = load_file_io({})
+
+    snapshot = Snapshot(
+        snapshot_id=25,
+        parent_snapshot_id=19,
+        timestamp_ms=1602638573590,
+        manifest_list=generated_manifest_file_file,
+        summary=Summary(Operation.APPEND),
+        schema_id=3,
+    )
+    manifest_list = snapshot.fetch_manifest_list(io)
+
+    assert manifest_list == [
+        ManifestFile(
+            manifest_path=generated_manifest_entry_file,
+            manifest_length=7989,
+            partition_spec_id=0,
+            content=ManifestContent.DATA,
+            sequence_number=0,
+            min_sequence_number=0,
+            added_snapshot_id=9182715666859759686,
+            added_data_files_count=3,
+            existing_data_files_count=0,
+            deleted_data_files_count=0,
+            added_rows_count=237993,
+            existing_rows_counts=None,
+            deleted_rows_count=0,
+            partitions=[
+                FieldSummary(
+                    contains_null=True, contains_nan=False, lower_bound=b"\x01\x00\x00\x00", upper_bound=b"\x02\x00\x00\x00"
+                )
+            ],
+            key_metadata=None,
+        )
+    ]
+
+    actual = manifest_list[0].fetch_manifest_entry(io)
+    expected = [
+        ManifestEntry(
+            status=ManifestEntryStatus.ADDED,
+            snapshot_id=8744736658442914487,
+            sequence_number=None,
+            data_file=DataFile(
+                content=DataFileContent.DATA,
+                file_path="/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=null/00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00001.parquet",
+                file_format=FileFormat.PARQUET,
+                partition={"VendorID": None},
+                record_count=19513,
+                file_size_in_bytes=388872,
+                block_size_in_bytes=67108864,
+                column_sizes={
+                    1: 53,
+                    2: 98153,
+                    3: 98693,
+                    4: 53,
+                    5: 53,
+                    6: 53,
+                    7: 17425,
+                    8: 18528,
+                    9: 53,
+                    10: 44788,
+                    11: 35571,
+                    12: 53,
+                    13: 1243,
+                    14: 2355,
+                    15: 12750,
+                    16: 4029,
+                    17: 110,
+                    18: 47194,
+                    19: 2948,
+                },
+                value_counts={
+                    1: 19513,
+                    2: 19513,
+                    3: 19513,
+                    4: 19513,
+                    5: 19513,
+                    6: 19513,
+                    7: 19513,
+                    8: 19513,
+                    9: 19513,
+                    10: 19513,
+                    11: 19513,
+                    12: 19513,
+                    13: 19513,
+                    14: 19513,
+                    15: 19513,
+                    16: 19513,
+                    17: 19513,
+                    18: 19513,
+                    19: 19513,
+                },
+                null_value_counts={
+                    1: 19513,
+                    2: 0,
+                    3: 0,
+                    4: 19513,
+                    5: 19513,
+                    6: 19513,
+                    7: 0,
+                    8: 0,
+                    9: 19513,
+                    10: 0,
+                    11: 0,
+                    12: 19513,
+                    13: 0,
+                    14: 0,
+                    15: 0,
+                    16: 0,
+                    17: 0,
+                    18: 0,
+                    19: 0,
+                },
+                nan_value_counts={16: 0, 17: 0, 18: 0, 19: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0},
+                distinct_counts=None,
+                lower_bounds={
+                    2: b"2020-04-01 00:00",
+                    3: b"2020-04-01 00:12",
+                    7: b"\x03\x00\x00\x00",
+                    8: b"\x01\x00\x00\x00",
+                    10: b"\xf6(\\\x8f\xc2\x05S\xc0",
+                    11: b"\x00\x00\x00\x00\x00\x00\x00\x00",
+                    13: b"\x00\x00\x00\x00\x00\x00\x00\x00",
+                    14: b"\x00\x00\x00\x00\x00\x00\xe0\xbf",
+                    15: b")\\\x8f\xc2\xf5(\x08\xc0",
+                    16: b"\x00\x00\x00\x00\x00\x00\x00\x00",
+                    17: b"\x00\x00\x00\x00\x00\x00\x00\x00",
+                    18: b"\xf6(\\\x8f\xc2\xc5S\xc0",
+                    19: b"\x00\x00\x00\x00\x00\x00\x04\xc0",
+                },
+                upper_bounds={
+                    2: b"2020-04-30 23:5:",
+                    3: b"2020-05-01 00:41",
+                    7: b"\t\x01\x00\x00",
+                    8: b"\t\x01\x00\x00",
+                    10: b"\xcd\xcc\xcc\xcc\xcc,_@",
+                    11: b"\x1f\x85\xebQ\\\xe2\xfe@",
+                    13: b"\x00\x00\x00\x00\x00\x00\x12@",
+                    14: b"\x00\x00\x00\x00\x00\x00\xe0?",
+                    15: b"q=\n\xd7\xa3\xf01@",
+                    16: b"\x00\x00\x00\x00\x00`B@",
+                    17: b"333333\xd3?",
+                    18: b"\x00\x00\x00\x00\x00\x18b@",
+                    19: b"\x00\x00\x00\x00\x00\x00\x04@",
+                },
+                key_metadata=None,
+                split_offsets=[4],
+                equality_ids=None,
+                sort_order_id=0,
+            ),
+        ),
+        ManifestEntry(
+            status=ManifestEntryStatus.ADDED,
+            snapshot_id=8744736658442914487,
+            sequence_number=None,
+            data_file=DataFile(
+                content=DataFileContent.DATA,
+                file_path="/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=1/00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00002.parquet",
+                file_format=FileFormat.PARQUET,
+                partition={"VendorID": 1},
+                record_count=95050,
+                file_size_in_bytes=1265950,
+                block_size_in_bytes=67108864,
+                column_sizes={
+                    1: 318,
+                    2: 329806,
+                    3: 331632,
+                    4: 15343,
+                    5: 2351,
+                    6: 3389,
+                    7: 71269,
+                    8: 76429,
+                    9: 16383,
+                    10: 86992,
+                    11: 89608,
+                    12: 265,
+                    13: 19377,
+                    14: 1692,
+                    15: 76162,
+                    16: 4354,
+                    17: 759,
+                    18: 120650,
+                    19: 11804,
+                },
+                value_counts={
+                    1: 95050,
+                    2: 95050,
+                    3: 95050,
+                    4: 95050,
+                    5: 95050,
+                    6: 95050,
+                    7: 95050,
+                    8: 95050,
+                    9: 95050,
+                    10: 95050,
+                    11: 95050,
+                    12: 95050,
+                    13: 95050,
+                    14: 95050,
+                    15: 95050,
+                    16: 95050,
+                    17: 95050,
+                    18: 95050,
+                    19: 95050,
+                },
+                null_value_counts={
+                    1: 0,
+                    2: 0,
+                    3: 0,
+                    4: 0,
+                    5: 0,
+                    6: 0,
+                    7: 0,
+                    8: 0,
+                    9: 0,
+                    10: 0,
+                    11: 0,
+                    12: 95050,
+                    13: 0,
+                    14: 0,
+                    15: 0,
+                    16: 0,
+                    17: 0,
+                    18: 0,
+                    19: 0,
+                },
+                nan_value_counts={16: 0, 17: 0, 18: 0, 19: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0},
+                distinct_counts=None,
+                lower_bounds={
+                    1: b"\x01\x00\x00\x00",
+                    2: b"2020-04-01 00:00",
+                    3: b"2020-04-01 00:03",
+                    4: b"\x00\x00\x00\x00",
+                    5: b"\x01\x00\x00\x00",
+                    6: b"N",
+                    7: b"\x01\x00\x00\x00",
+                    8: b"\x01\x00\x00\x00",
+                    9: b"\x01\x00\x00\x00",
+                    10: b"\x00\x00\x00\x00\x00\x00\x00\x00",
+                    11: b"\x00\x00\x00\x00\x00\x00\x00\x00",
+                    13: b"\x00\x00\x00\x00\x00\x00\x00\x00",
+                    14: b"\x00\x00\x00\x00\x00\x00\x00\x00",
+                    15: b"\x00\x00\x00\x00\x00\x00\x00\x00",
+                    16: b"\x00\x00\x00\x00\x00\x00\x00\x00",
+                    17: b"\x00\x00\x00\x00\x00\x00\x00\x00",
+                    18: b"\x00\x00\x00\x00\x00\x00\x00\x00",
+                    19: b"\x00\x00\x00\x00\x00\x00\x00\x00",
+                },
+                upper_bounds={
+                    1: b"\x01\x00\x00\x00",
+                    2: b"2020-04-30 23:5:",
+                    3: b"2020-05-01 00:1:",
+                    4: b"\x06\x00\x00\x00",
+                    5: b"c\x00\x00\x00",
+                    6: b"Y",
+                    7: b"\t\x01\x00\x00",
+                    8: b"\t\x01\x00\x00",
+                    9: b"\x04\x00\x00\x00",
+                    10: b"\\\x8f\xc2\xf5(8\x8c@",
+                    11: b"\xcd\xcc\xcc\xcc\xcc,f@",
+                    13: b"\x00\x00\x00\x00\x00\x00\x1c@",
+                    14: b"\x9a\x99\x99\x99\x99\x99\xf1?",
+                    15: b"\x00\x00\x00\x00\x00\x00Y@",
+                    16: b"\x00\x00\x00\x00\x00\xb0X@",
+                    17: b"333333\xd3?",
+                    18: b"\xc3\xf5(\\\x8f:\x8c@",
+                    19: b"\x00\x00\x00\x00\x00\x00\x04@",
+                },
+                key_metadata=None,
+                split_offsets=[4],
+                equality_ids=None,
+                sort_order_id=0,
+            ),
+        ),
+    ]
+
+    assert actual == expected