You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/01/14 13:21:16 UTC
[GitHub] [arrow] lidavidm commented on a change in pull request #12150: ARROW-15332: [C++] Add new cases and fix issues in IPC read/write benchmark

lidavidm commented on a change in pull request #12150:
URL: https://github.com/apache/arrow/pull/12150#discussion_r784832743



##########
File path: cpp/src/arrow/ipc/read_write_benchmark.cc
##########
@@ -202,60 +227,139 @@ static void DecodeStream(benchmark::State& state) {  // NOLINT non-const referen
   {                                                                               \
     auto record_batch = MakeRecordBatch(kBatchSize, state.range(0));              \
     auto writer = *ipc::MakeFileWriter(sink, record_batch->schema(), options);    \
-    ABORT_NOT_OK(writer->WriteRecordBatch(*record_batch));                        \
+    for (int64_t i = 0; i < kBatches; i++) {                                      \
+      ABORT_NOT_OK(writer->WriteRecordBatch(*record_batch));                      \
+    }                                                                             \
     ABORT_NOT_OK(writer->Close());                                                \
     ABORT_NOT_OK(sink->Close());                                                  \
-  }
+  }                                                                               \
+  constexpr int64_t total_size = kBatchSize * kBatches;
+
+// Note: When working with real files we ensure each array is at least 4 MB.
+// This slows things down considerably, but using smaller arrays would cause
+// the I/O to bottleneck on partial reads, which is not what we are trying to
+// measure here (although this may be interesting to optimize someday).
+#define GENERATE_DATA_REAL_FILE()                                                 \
+  constexpr int64_t kArraySize = (1 << 19) * sizeof(int64_t); /* 4 MB */          \
+  constexpr int64_t kBatches = 4;                                                 \
+  auto num_fields = state.range(0);                                               \
+  auto options = ipc::IpcWriteOptions::Defaults();                                \
+  ASSIGN_OR_ABORT(auto sink, io::FileOutputStream::Open("/tmp/benchmark.arrow")); \
+  {                                                                               \
+    auto batch_size = kArraySize * num_fields;                                    \
+    auto record_batch = MakeRecordBatch(batch_size, num_fields);                  \
+    auto writer = *ipc::MakeFileWriter(sink, record_batch->schema(), options);    \
+    for (int64_t i = 0; i < kBatches; i++) {                                      \
+      ABORT_NOT_OK(writer->WriteRecordBatch(*record_batch));                      \
+    }                                                                             \
+    ABORT_NOT_OK(writer->Close());                                                \
+    ABORT_NOT_OK(sink->Close());                                                  \
+  }                                                                               \
+  int64_t total_size = kArraySize * kBatches * num_fields;
 
 #define READ_DATA_IN_MEMORY() auto input = std::make_shared<io::BufferReader>(buffer);
 #define READ_DATA_TEMP_FILE() \
   ASSIGN_OR_ABORT(auto input, io::ReadableFile::Open("/tmp/benchmark.arrow"));
+// This will not be correct if your system mounts /tmp in RAM (using tmpfs
+// or ramfs).

Review comment:
       How are our benchmark machines set up? (Trying to find out now)




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org