Posted to dev@parquet.apache.org by "ASF GitHub Bot (JIRA)" <ji...@apache.org> on 2018/04/18 08:10:00 UTC

[jira] [Commented] (PARQUET-1273) [Python] Error writing to partitioned Parquet dataset

    [ https://issues.apache.org/jira/browse/PARQUET-1273?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16442096#comment-16442096 ] 

ASF GitHub Bot commented on PARQUET-1273:
-----------------------------------------

xhochy closed pull request #453: PARQUET-1273: Properly write dictionary values when writing in chunks
URL: https://github.com/apache/parquet-cpp/pull/453

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc b/src/parquet/arrow/arrow-reader-writer-test.cc
index 79a393f6..92b67353 100644
--- a/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -1726,6 +1726,69 @@ TEST(TestArrowReadWrite, TableWithDuplicateColumns) {
   CheckSimpleRoundtrip(table, table->num_rows());
 }
 
+TEST(TestArrowReadWrite, DictionaryColumnChunkedWrite) {
+  // This is a regression test for this:
+  //
+  // https://issues.apache.org/jira/browse/ARROW-1938
+  //
+  // As of the writing of this test, columns of type
+  // dictionary are written as their raw/expanded values.
+  // The regression was that the whole column was being
+  // written for each chunk.
+  using ::arrow::ArrayFromVector;
+
+  std::vector<std::string> values = {"first", "second", "third"};
+  auto type = ::arrow::utf8();
+  std::shared_ptr<Array> dict_values;
+  ArrayFromVector<::arrow::StringType, std::string>(values, &dict_values);
+
+  auto dict_type = ::arrow::dictionary(::arrow::int32(), dict_values);
+  auto f0 = field("dictionary", dict_type);
+  std::vector<std::shared_ptr<::arrow::Field>> fields;
+  fields.emplace_back(f0);
+  auto schema = ::arrow::schema(fields);
+
+  std::shared_ptr<Array> f0_values, f1_values;
+  ArrayFromVector<::arrow::Int32Type, int32_t>({0, 1, 0, 2, 1}, &f0_values);
+  ArrayFromVector<::arrow::Int32Type, int32_t>({2, 0, 1, 0, 2}, &f1_values);
+  ::arrow::ArrayVector dict_arrays = {
+      std::make_shared<::arrow::DictionaryArray>(dict_type, f0_values),
+      std::make_shared<::arrow::DictionaryArray>(dict_type, f1_values)};
+
+  std::vector<std::shared_ptr<::arrow::Column>> columns;
+  auto column = MakeColumn("column", dict_arrays, false);
+  columns.emplace_back(column);
+
+  auto table = Table::Make(schema, columns);
+
+  std::shared_ptr<Table> result;
+  DoSimpleRoundtrip(table, 1,
+                    // Just need to make sure that we make
+                    // a chunk size that is smaller than the
+                    // total number of values
+                    2, {}, &result);
+
+  std::vector<std::string> expected_values = {"first",  "second", "first", "third",
+                                              "second", "third",  "first", "second",
+                                              "first",  "third"};
+  columns.clear();
+
+  std::shared_ptr<Array> expected_array;
+  ArrayFromVector<::arrow::StringType, std::string>(expected_values, &expected_array);
+
+  // The column name gets changed on output to the name of the
+  // field, and it also turns into a nullable column
+  columns.emplace_back(MakeColumn("dictionary", expected_array, true));
+
+  fields.clear();
+  fields.emplace_back(::arrow::field("dictionary", ::arrow::utf8()));
+  schema = ::arrow::schema(fields);
+
+  auto expected_table = Table::Make(schema, columns);
+
+  AssertTablesEqual(*expected_table, *result, false);
+}
+
 TEST(TestArrowWrite, CheckChunkSize) {
   const int num_columns = 2;
   const int num_rows = 128;
diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc
index 5040e0cc..ce05ef0b 100644
--- a/src/parquet/arrow/writer.cc
+++ b/src/parquet/arrow/writer.cc
@@ -962,7 +962,7 @@ class FileWriter::Impl {
       ::arrow::compute::Datum cast_output;
       RETURN_NOT_OK(Cast(&ctx, cast_input, dict_type.dictionary()->type(), CastOptions(),
                          &cast_output));
-      return WriteColumnChunk(cast_output.chunked_array(), 0, data->length());
+      return WriteColumnChunk(cast_output.chunked_array(), offset, size);
     }
 
     ColumnWriter* column_writer;
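
For readers coming to this from the Python side, the change above can be exercised end to end with pyarrow. The snippet below is a hedged analogue of the new DictionaryColumnChunkedWrite test, not code from the PR: it assumes a recent pyarrow build that includes this fix, and the construction helpers and file path are illustrative.

import pyarrow as pa
import pyarrow.parquet as pq

# Two dictionary-encoded chunks that share the same dictionary values,
# mirroring the two DictionaryArray chunks in the C++ test above.
dictionary = pa.array(["first", "second", "third"])
chunk1 = pa.DictionaryArray.from_arrays(pa.array([0, 1, 0, 2, 1]), dictionary)
chunk2 = pa.DictionaryArray.from_arrays(pa.array([2, 0, 1, 0, 2]), dictionary)
table = pa.Table.from_arrays([pa.chunked_array([chunk1, chunk2])], names=["dictionary"])

# A row group size smaller than the total row count forces the chunked write
# path that previously re-wrote the entire expanded column for every chunk.
pq.write_table(table, "/tmp/dict_chunks.parquet", row_group_size=2)

result = pq.read_table("/tmp/dict_chunks.parquet")
assert result.num_rows == table.num_rows  # 10 rows in, 10 rows out

Before the change, the dictionary branch always passed 0 and data->length() to WriteColumnChunk, so every chunked write re-emitted the whole expanded column; passing offset and size through writes only the slice that belongs to the current chunk.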


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> [Python] Error writing to partitioned Parquet dataset
> -----------------------------------------------------
>
>                 Key: PARQUET-1273
>                 URL: https://issues.apache.org/jira/browse/PARQUET-1273
>             Project: Parquet
>          Issue Type: Bug
>          Components: parquet-cpp
>         Environment: Linux (Ubuntu 16.04)
>            Reporter: Robert Dailey
>            Assignee: Joshua Storck
>            Priority: Major
>              Labels: pull-request-available
>             Fix For: cpp-1.5.0
>
>         Attachments: ARROW-1938-test-data.csv.gz, ARROW-1938.py, pyarrow_dataset_error.png
>
>
> I receive the following error after upgrading to pyarrow 0.8.0 when writing to a dataset:
> * ArrowIOError: Column 3 had 187374 while previous column had 10000
> The command was:
> write_table_values = {'row_group_size': 10000}
> pq.write_to_dataset(pa.Table.from_pandas(df, preserve_index=True), '/logs/parsed/test', partition_cols=['Product', 'year', 'month', 'day', 'hour'], **write_table_values)
> I've also tried write_table_values = {'chunk_size': 10000} and received the same error.
> This same command works in version 0.7.1. I am trying to troubleshoot the problem but wanted to submit a ticket.
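
A hedged reproduction along the lines of the report is sketched below. The reporter's DataFrame is not shown inline, so the columns here are illustrative; the essential ingredients are a pandas categorical column (which arrives in Arrow as a dictionary column) and a row_group_size smaller than the number of rows written to a partition.

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Illustrative data only: the reporter's actual DataFrame is not in the ticket.
df = pd.DataFrame({
    'Product': 'widget', 'year': 2018, 'month': 4, 'day': 18, 'hour': 8,  # partition columns
    'status': pd.Categorical(['ok', 'warn', 'error'] * 10000),            # dictionary-encoded data column
    'value': range(30000),
})

write_table_values = {'row_group_size': 10000}
# On pyarrow 0.8.0, a write like this could fail with an ArrowIOError of the
# shape reported above ("Column N had X while previous column had Y"),
# because each row group re-wrote the whole expanded dictionary column;
# with the fix in this PR it completes normally.
pq.write_to_dataset(pa.Table.from_pandas(df, preserve_index=True),
                    '/tmp/parsed/test',
                    partition_cols=['Product', 'year', 'month', 'day', 'hour'],
                    **write_table_values)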



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)