Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/05/11 18:23:31 UTC

[GitHub] [arrow] lidavidm commented on a change in pull request #10230: ARROW-12512: [C++][Python][Dataset] Create CSV writer class and add Datasets support

lidavidm commented on a change in pull request #10230:
URL: https://github.com/apache/arrow/pull/10230#discussion_r630421629



##########
File path: cpp/src/arrow/csv/writer.cc
##########
@@ -282,65 +283,79 @@ Result<std::unique_ptr<ColumnPopulator>> MakePopulator(const Field& field, char
   return std::unique_ptr<ColumnPopulator>(factory.populator);
 }
 
-class CSVConverter {
+class CSVConverter : public ipc::RecordBatchWriter {
  public:
-  static Result<std::unique_ptr<CSVConverter>> Make(std::shared_ptr<Schema> schema,
-                                                    MemoryPool* pool) {
+  static Result<std::shared_ptr<CSVConverter>> Make(
+      io::OutputStream* sink, std::shared_ptr<io::OutputStream> owned_sink,
+      std::shared_ptr<Schema> schema, MemoryPool* pool, const WriteOptions& options) {
+    if (!pool) pool = default_memory_pool();
     std::vector<std::unique_ptr<ColumnPopulator>> populators(schema->num_fields());
     for (int col = 0; col < schema->num_fields(); col++) {
       char end_char = col < schema->num_fields() - 1 ? ',' : '\n';
       ASSIGN_OR_RAISE(populators[col],
                       MakePopulator(*schema->field(col), end_char, pool));
     }
-    return std::unique_ptr<CSVConverter>(
-        new CSVConverter(std::move(schema), std::move(populators), pool));
+    auto writer = std::shared_ptr<CSVConverter>(
+        new CSVConverter(sink, std::move(owned_sink), std::move(schema),
+                         std::move(populators), pool, options));
+    if (options.include_header) {
+      RETURN_NOT_OK(writer->PrepareForContentsWrite());
+      RETURN_NOT_OK(writer->WriteHeader());
+    }
+    return writer;
   }
 
-  Status WriteCSV(const RecordBatch& batch, const WriteOptions& options,
-                  io::OutputStream* out) {
-    RETURN_NOT_OK(PrepareForContentsWrite(options, out));
-    RecordBatchIterator iterator = RecordBatchSliceIterator(batch, options.batch_size);
+  Status WriteRecordBatch(const RecordBatch& batch) override {
+    RETURN_NOT_OK(PrepareForContentsWrite());
+    RecordBatchIterator iterator = RecordBatchSliceIterator(batch, options_.batch_size);
     for (auto maybe_slice : iterator) {
       ASSIGN_OR_RAISE(std::shared_ptr<RecordBatch> slice, maybe_slice);
       RETURN_NOT_OK(TranslateMinimalBatch(*slice));
-      RETURN_NOT_OK(out->Write(data_buffer_));
+      RETURN_NOT_OK(sink_->Write(data_buffer_));
+      stats_.num_record_batches++;
     }
     return Status::OK();
   }
 
-  Status WriteCSV(const Table& table, const WriteOptions& options,
-                  io::OutputStream* out) {
+  Status WriteTable(const Table& table, int64_t max_chunksize) override {
     TableBatchReader reader(table);
-    reader.set_chunksize(options.batch_size);
-    RETURN_NOT_OK(PrepareForContentsWrite(options, out));
+    reader.set_chunksize(max_chunksize > 0 ? max_chunksize : options_.batch_size);

Review comment:
       There's a bit of an impedance mismatch here because I elected to reuse the ipc::RecordBatchWriter interface, which includes parameters like max_chunksize in its API. I could at least introduce an overload that doesn't require specifying it, for convenience.
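
       For illustration, a minimal sketch of what such a convenience overload could look like (not code from this PR; it assumes a non-positive sentinel falls through to options_.batch_size via the ternary in the hunk above):

           // Hypothetical convenience overload: callers that don't care about
           // chunking fall back to the batch size configured in WriteOptions.
           // A non-positive max_chunksize triggers the fallback in the
           // two-argument WriteTable shown in the diff.
           Status WriteTable(const Table& table) {
             return WriteTable(table, /*max_chunksize=*/-1);
           }

       If memory serves, the base ipc::RecordBatchWriter already declares a one-argument WriteTable(const Table&) that forwards -1, so callers going through the base interface may get this for free; note that overriding the two-argument version hides the one-argument overload on the derived class, so a using-declaration (using ipc::RecordBatchWriter::WriteTable;) would be needed to keep it visible there.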




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org