You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/11/13 06:14:01 UTC

[GitHub] [arrow] emkornfield commented on a change in pull request #8632: ARROW-10426: [C++] Allow writing large strings to Parquet

emkornfield commented on a change in pull request #8632:
URL: https://github.com/apache/arrow/pull/8632#discussion_r522673324



##########
File path: cpp/src/parquet/encoding.cc
##########
@@ -127,6 +129,21 @@ class PlainEncoder : public EncoderImpl, virtual public TypedEncoder<DType> {
   }
 
  protected:
+  template <typename ArrayType>
+  void PutBinaryArray(const ArrayType& array) {
+    const int64_t total_bytes =
+        array.value_offset(array.length()) - array.value_offset(0);
+    PARQUET_THROW_NOT_OK(sink_.Reserve(total_bytes + array.length() * sizeof(uint32_t)));

Review comment:
     is a check for int32 overflow needed here?

##########
File path: cpp/src/parquet/arrow/arrow_reader_writer_test.cc
##########
@@ -1135,14 +1172,50 @@ TEST_F(TestStringParquetIO, EmptyStringColumnRequiredWrite) {
   ASSERT_NO_FATAL_FAILURE(this->ReaderFromSink(&reader));
   ASSERT_NO_FATAL_FAILURE(this->ReadTableFromFile(std::move(reader), &out));
   ASSERT_EQ(1, out->num_columns());
-  ASSERT_EQ(100, out->num_rows());
+  ASSERT_EQ(table->num_rows(), out->num_rows());
 
   std::shared_ptr<ChunkedArray> chunked_array = out->column(0);
   ASSERT_EQ(1, chunked_array->num_chunks());
 
   AssertArraysEqual(*values, *chunked_array->chunk(0));
 }
 
+using TestLargeBinaryParquetIO = TestParquetIO<::arrow::LargeBinaryType>;
+
+TEST_F(TestLargeBinaryParquetIO, Basics) {
+  const char* json = "[\"foo\", \"\", null, \"\xff\"]";
+
+  const auto large_type = ::arrow::large_binary();
+  const auto narrow_type = ::arrow::binary();
+  const auto large_array = ::arrow::ArrayFromJSON(large_type, json);
+  const auto narrow_array = ::arrow::ArrayFromJSON(narrow_type, json);
+
+  this->RoundTripSingleColumn(large_array, narrow_array,
+                              default_arrow_writer_properties());
+  const auto arrow_properties =
+      ::parquet::ArrowWriterProperties::Builder().store_schema()->build();
+
+  this->RoundTripSingleColumn(large_array, large_array, arrow_properties);
+}
+
+using TestLargeStringParquetIO = TestParquetIO<::arrow::LargeStringType>;
+
+TEST_F(TestLargeStringParquetIO, Basics) {
+  const char* json = R"(["foo", "", null, "bar"])";
+
+  const auto large_type = ::arrow::large_utf8();
+  const auto narrow_type = ::arrow::utf8();
+  const auto large_array = ::arrow::ArrayFromJSON(large_type, json);
+  const auto narrow_array = ::arrow::ArrayFromJSON(narrow_type, json);
+
+  this->RoundTripSingleColumn(large_array, narrow_array,

Review comment:
     is this intended to be two different arrays, same as above? Maybe add a comment on what you are testing here (the lack of a stored schema to read back?)

##########
File path: cpp/src/parquet/statistics.cc
##########
@@ -691,9 +634,60 @@ void TypedStatisticsImpl<ByteArrayType>::PlainDecode(const std::string& src,
   dst->ptr = reinterpret_cast<const uint8_t*>(src.c_str());
 }
 
+}  // namespace
+
 // ----------------------------------------------------------------------
 // Public factory functions
 
+std::shared_ptr<Comparator> Comparator::Make(Type::type physical_type,
+                                             SortOrder::type sort_order,
+                                             int type_length) {
+  if (SortOrder::SIGNED == sort_order) {

Review comment:
       is this just moving code?

##########
File path: cpp/src/parquet/encoding.cc
##########
@@ -127,6 +129,21 @@ class PlainEncoder : public EncoderImpl, virtual public TypedEncoder<DType> {
   }
 
  protected:
+  template <typename ArrayType>

Review comment:
       i'm not very familiar with this code, could you let me know what these changes are intended to do?

##########
File path: cpp/src/parquet/encoding.cc
##########
@@ -592,6 +597,29 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder<DType> {
   /// Indices that have not yet be written out by WriteIndices().
   ArrowPoolVector<int32_t> buffered_indices_;
 
+  template <typename ArrayType>
+  void PutBinaryArray(const ArrayType& array) {
+    ::arrow::VisitArrayDataInline<typename ArrayType::TypeClass>(
+        *array.data(),
+        [&](::arrow::util::string_view view) {
+          // XXX check length

Review comment:
       implement the TODO?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org