Posted to commits@airflow.apache.org by GitBox <gi...@apache.org> on 2020/12/30 17:55:17 UTC

[GitHub] [airflow] potiuk commented on a change in pull request #13359: Add Parquet data type to BaseSQLToGCSOperator

potiuk commented on a change in pull request #13359:
URL: https://github.com/apache/airflow/pull/13359#discussion_r550275535



##########
File path: tests/providers/google/cloud/transfers/test_sql_to_gcs.py
##########
@@ -140,6 +141,7 @@ def test_exec(
 
         cursor_mock.__iter__ = Mock(return_value=iter(INPUT_DATA))
 
+        # Test JSON

Review comment:
       I think the test should be split into three tests now. One test should test one thing only.
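       As a minimal sketch of the split, reusing the constants and the
       DummySQLToGCSOperator already defined in this file (the
       _execute_operator helper name is hypothetical, and the mock patch
       decorators from the current test_exec are elided):

           def _execute_operator(self, export_format, **kwargs):
               # Build the operator for a single export format and run it once.
               operator = DummySQLToGCSOperator(
                   sql=SQL, bucket=BUCKET, filename=FILENAME, task_id=TASK_ID,
                   export_format=export_format, **kwargs
               )
               operator.execute(context=dict())

           def test_exec_json(self):
               self._execute_operator("json")

           def test_exec_csv(self):
               self._execute_operator("csv")

           def test_exec_parquet(self):
               self._execute_operator("parquet", schema=SCHEMA)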

##########
File path: tests/providers/google/cloud/transfers/test_sql_to_gcs.py
##########
@@ -160,6 +162,37 @@ def test_exec(
         mock_upload.assert_called_once_with(BUCKET, FILENAME, TMP_FILE_NAME, mime_type=APP_JSON, gzip=False)
         mock_close.assert_called_once()
 
+        mock_query.reset_mock()
+        mock_flush.reset_mock()
+        mock_upload.reset_mock()
+        mock_close.reset_mock()
+        cursor_mock.reset_mock()
+
+        cursor_mock.__iter__ = Mock(return_value=iter(INPUT_DATA))
+
+        # Test parquet
+        operator = DummySQLToGCSOperator(
+            sql=SQL, bucket=BUCKET, filename=FILENAME, task_id=TASK_ID, export_format="parquet", schema=SCHEMA
+        )
+        operator.execute(context=dict())
+
+        mock_query.assert_called_once()
+        mock_write.assert_has_calls(
+            [
+                mock.call(OUTPUT_DATA),
+                mock.call(b"\n"),
+                mock.call(OUTPUT_DATA),
+                mock.call(b"\n"),
+                mock.call(OUTPUT_DATA),
+                mock.call(b"\n"),
+            ]
+        )
+        mock_flush.assert_called_once()
+        mock_upload.assert_called_once_with(
+            BUCKET, FILENAME, TMP_FILE_NAME, mime_type='application/octet-stream', gzip=False
+        )
+        mock_close.assert_called_once()
+

Review comment:
       Is there any way we can verify that we are actually sending a Parquet file? Could we, for example, try to read it back and parse it as Parquet? I am not 100% sure, but it seems a bit suspicious that we see the same OUTPUT_DATA being written in both the CSV and Parquet cases. Maybe I do not understand it fully, but I'd expect the two formats to produce different output.
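       One rough way to check that, assuming the parquet test is changed to
       write into a real temporary file rather than mocking
       tmp_file_handle.write (the tmp_file_name variable below is
       hypothetical), is to read the file back with pyarrow:

           import pyarrow.parquet as pq

           # read_table raises pyarrow.lib.ArrowInvalid if the bytes are not
           # actually a valid Parquet file.
           table = pq.read_table(tmp_file_name)
           assert table.num_rows == len(INPUT_DATA)
           # Assumes SCHEMA is the list of {'name': ..., 'type': ...} dicts
           # used elsewhere in this test module.
           assert table.column_names == [field['name'] for field in SCHEMA]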

##########
File path: airflow/providers/google/cloud/transfers/sql_to_gcs.py
##########
@@ -198,6 +202,9 @@ def _write_local_data_files(self, cursor):
 
         if self.export_format == 'csv':
             csv_writer = self._configure_csv_file(tmp_file_handle, schema)
+        if self.export_format == 'parquet':
+            parquet_schema = self._convert_parquet_schema(cursor)
+            # parquet_writer = self._configure_parquet_file(tmp_file_handle, parquet_schema)

Review comment:
       Is this commented-out line deliberate?
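       For context, a rough sketch of what that call could wire up if
       uncommented, assuming pyarrow provides the writer (an illustration of
       the idea, not the PR's actual implementation):

           import pyarrow.parquet as pq

           def _configure_parquet_file(self, file_handle, parquet_schema):
               # Open a ParquetWriter that streams row batches into the
               # operator's temporary file.
               return pq.ParquetWriter(file_handle.name, parquet_schema)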




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org