You are viewing a plain text version of this content. The canonical link for it is here.

Posted to github@arrow.apache.org by "paleolimbot (via GitHub)" <gi...@apache.org> on 2023/11/29 20:52:52 UTC

[PR] feat: Add batch reader and data file read/write to/from ArrowArrayStream [arrow-nanoarrow]

paleolimbot opened a new pull request, #328:
URL: https://github.com/apache/arrow-nanoarrow/pull/328

   (no comment)


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

Re: [PR] feat: Add batch reader and data file read/write to/from ArrowArrayStream [arrow-nanoarrow]

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.

paleolimbot commented on code in PR #328:
URL: https://github.com/apache/arrow-nanoarrow/pull/328#discussion_r1411423947


##########
src/nanoarrow/nanoarrow_testing.hpp:
##########
@@ -46,6 +46,53 @@ namespace testing {
 /// \brief Writer for the Arrow integration testing JSON format
 class TestingJSONWriter {
  public:
+  /// \brief Write an ArrowArrayStream as a data file to out
+  ///
+  /// Creates output like `{"schema": {...}, "batches": [...], ...}`.
+  ArrowErrorCode WriteDataFile(std::ostream& out, ArrowArrayStream* stream) {
+    if (stream == nullptr || stream->release == nullptr) {
+      return EINVAL;
+    }
+
+    out << R"({"schema": )";
+
+    nanoarrow::UniqueSchema schema;
+    NANOARROW_RETURN_NOT_OK(stream->get_schema(stream, schema.get()));
+    NANOARROW_RETURN_NOT_OK(WriteSchema(out, schema.get()));
+
+    nanoarrow::UniqueArrayView array_view;
+    NANOARROW_RETURN_NOT_OK(
+        ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), nullptr));
+
+    out << R"(, "batches": [)";
+
+    nanoarrow::UniqueArray array;
+    NANOARROW_RETURN_NOT_OK(stream->get_next(stream, array.get()));
+    NANOARROW_RETURN_NOT_OK(
+        ArrowArrayViewSetArray(array_view.get(), array.get(), nullptr));
+    NANOARROW_RETURN_NOT_OK(WriteBatch(out, schema.get(), array_view.get()));
+
+    while (true) {
+      array.reset();
+      NANOARROW_RETURN_NOT_OK(stream->get_next(stream, array.get()));
+      if (array->release == nullptr) {
+        break;
+      }
+
+      out << ", ";
+      NANOARROW_RETURN_NOT_OK(
+          ArrowArrayViewSetArray(array_view.get(), array.get(), nullptr));
+      NANOARROW_RETURN_NOT_OK(WriteBatch(out, schema.get(), array_view.get()));
+    }
+
+    out << "]";
+
+    // Dictionaries errored at the schema stage

Review Comment:
   Right! Sorry, I missed that part of the comment 😬 



##########
src/nanoarrow/nanoarrow_testing.hpp:
##########
@@ -46,6 +46,53 @@ namespace testing {
 /// \brief Writer for the Arrow integration testing JSON format
 class TestingJSONWriter {
  public:
+  /// \brief Write an ArrowArrayStream as a data file to out
+  ///
+  /// Creates output like `{"schema": {...}, "batches": [...], ...}`.
+  ArrowErrorCode WriteDataFile(std::ostream& out, ArrowArrayStream* stream) {
+    if (stream == nullptr || stream->release == nullptr) {
+      return EINVAL;
+    }
+
+    out << R"({"schema": )";
+
+    nanoarrow::UniqueSchema schema;
+    NANOARROW_RETURN_NOT_OK(stream->get_schema(stream, schema.get()));
+    NANOARROW_RETURN_NOT_OK(WriteSchema(out, schema.get()));
+
+    nanoarrow::UniqueArrayView array_view;
+    NANOARROW_RETURN_NOT_OK(
+        ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), nullptr));
+
+    out << R"(, "batches": [)";
+
+    nanoarrow::UniqueArray array;
+    NANOARROW_RETURN_NOT_OK(stream->get_next(stream, array.get()));
+    NANOARROW_RETURN_NOT_OK(
+        ArrowArrayViewSetArray(array_view.get(), array.get(), nullptr));
+    NANOARROW_RETURN_NOT_OK(WriteBatch(out, schema.get(), array_view.get()));
+
+    while (true) {
+      array.reset();
+      NANOARROW_RETURN_NOT_OK(stream->get_next(stream, array.get()));
+      if (array->release == nullptr) {
+        break;
+      }
+
+      out << ", ";
+      NANOARROW_RETURN_NOT_OK(
+          ArrowArrayViewSetArray(array_view.get(), array.get(), nullptr));
+      NANOARROW_RETURN_NOT_OK(WriteBatch(out, schema.get(), array_view.get()));
+    }
+
+    out << "]";
+
+    // Dictionaries errored at the schema stage

Review Comment:
   Right! Sorry, I missed that part of the comment 😬 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

Re: [PR] feat: Add batch reader and data file read/write to/from ArrowArrayStream [arrow-nanoarrow]

Posted by "codecov-commenter (via GitHub)" <gi...@apache.org>.

codecov-commenter commented on PR #328:
URL: https://github.com/apache/arrow-nanoarrow/pull/328#issuecomment-1832685679

   ## [Codecov](https://app.codecov.io/gh/apache/arrow-nanoarrow/pull/328?src=pr&el=h1&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache) Report
   All modified and coverable lines are covered by tests :white_check_mark:
   > Comparison is base [(`e0329f4`)](https://app.codecov.io/gh/apache/arrow-nanoarrow/commit/e0329f4e9188d198680439cba74006173b7c02aa?el=desc&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache) 87.83% compared to head [(`2311cab`)](https://app.codecov.io/gh/apache/arrow-nanoarrow/pull/328?src=pr&el=desc&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache) 89.10%.
   > Report is 1 commit behind head on main.
   
   
   <details><summary>Additional details and impacted files</summary>
   
   
   ```diff
   @@            Coverage Diff             @@
   ##             main     #328      +/-   ##
   ==========================================
   + Coverage   87.83%   89.10%   +1.27%     
   ==========================================
     Files          72        4      -68     
     Lines       11114      101   -11013     
   ==========================================
   - Hits         9762       90    -9672     
   + Misses       1352       11    -1341     
   ```
   
   
   
   </details>
   
   [:umbrella: View full report in Codecov by Sentry](https://app.codecov.io/gh/apache/arrow-nanoarrow/pull/328?src=pr&el=continue&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache).   
   :loudspeaker: Have feedback on the report? [Share it here](https://about.codecov.io/codecov-pr-comment-feedback/?utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache).
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

Re: [PR] feat: Add batch reader and data file read/write to/from ArrowArrayStream [arrow-nanoarrow]

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.

paleolimbot merged PR #328:
URL: https://github.com/apache/arrow-nanoarrow/pull/328


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

Re: [PR] feat: Add batch reader and data file read/write to/from ArrowArrayStream [arrow-nanoarrow]

Posted by "EpsilonPrime (via GitHub)" <gi...@apache.org>.

EpsilonPrime commented on code in PR #328:
URL: https://github.com/apache/arrow-nanoarrow/pull/328#discussion_r1411087920


##########
src/nanoarrow/nanoarrow_testing.hpp:
##########
@@ -46,6 +46,53 @@ namespace testing {
 /// \brief Writer for the Arrow integration testing JSON format
 class TestingJSONWriter {
  public:
+  /// \brief Write an ArrowArrayStream as a data file to out
+  ///
+  /// Creates output like `{"schema": {...}, "batches": [...], ...}`.
+  ArrowErrorCode WriteDataFile(std::ostream& out, ArrowArrayStream* stream) {
+    if (stream == nullptr || stream->release == nullptr) {
+      return EINVAL;
+    }
+
+    out << R"({"schema": )";
+
+    nanoarrow::UniqueSchema schema;
+    NANOARROW_RETURN_NOT_OK(stream->get_schema(stream, schema.get()));
+    NANOARROW_RETURN_NOT_OK(WriteSchema(out, schema.get()));
+
+    nanoarrow::UniqueArrayView array_view;
+    NANOARROW_RETURN_NOT_OK(
+        ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), nullptr));
+
+    out << R"(, "batches": [)";
+
+    nanoarrow::UniqueArray array;
+    NANOARROW_RETURN_NOT_OK(stream->get_next(stream, array.get()));
+    NANOARROW_RETURN_NOT_OK(
+        ArrowArrayViewSetArray(array_view.get(), array.get(), nullptr));
+    NANOARROW_RETURN_NOT_OK(WriteBatch(out, schema.get(), array_view.get()));
+
+    while (true) {
+      array.reset();
+      NANOARROW_RETURN_NOT_OK(stream->get_next(stream, array.get()));
+      if (array->release == nullptr) {
+        break;
+      }
+
+      out << ", ";
+      NANOARROW_RETURN_NOT_OK(
+          ArrowArrayViewSetArray(array_view.get(), array.get(), nullptr));
+      NANOARROW_RETURN_NOT_OK(WriteBatch(out, schema.get(), array_view.get()));
+    }
+
+    out << "]";
+
+    // Dictionaries errored at the schema stage

Review Comment:
   Not sure what this comment means. Is this a failure case handled elsewhere? Are the dictionaries always going to be empty, even for valid data?



##########
src/nanoarrow/nanoarrow_testing.hpp:
##########
@@ -46,6 +46,53 @@ namespace testing {
 /// \brief Writer for the Arrow integration testing JSON format
 class TestingJSONWriter {
  public:
+  /// \brief Write an ArrowArrayStream as a data file to out

Review Comment:
   as a JSON data file?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

Re: [PR] feat: Add batch reader and data file read/write to/from ArrowArrayStream [arrow-nanoarrow]

Posted by "bkietz (via GitHub)" <gi...@apache.org>.

bkietz commented on code in PR #328:
URL: https://github.com/apache/arrow-nanoarrow/pull/328#discussion_r1411268745


##########
src/nanoarrow/nanoarrow_testing.hpp:
##########
@@ -46,6 +46,53 @@ namespace testing {
 /// \brief Writer for the Arrow integration testing JSON format
 class TestingJSONWriter {
  public:
+  /// \brief Write an ArrowArrayStream as a data file to out
+  ///
+  /// Creates output like `{"schema": {...}, "batches": [...], ...}`.
+  ArrowErrorCode WriteDataFile(std::ostream& out, ArrowArrayStream* stream) {
+    if (stream == nullptr || stream->release == nullptr) {
+      return EINVAL;
+    }
+
+    out << R"({"schema": )";
+
+    nanoarrow::UniqueSchema schema;
+    NANOARROW_RETURN_NOT_OK(stream->get_schema(stream, schema.get()));
+    NANOARROW_RETURN_NOT_OK(WriteSchema(out, schema.get()));
+
+    nanoarrow::UniqueArrayView array_view;
+    NANOARROW_RETURN_NOT_OK(
+        ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), nullptr));
+
+    out << R"(, "batches": [)";
+
+    nanoarrow::UniqueArray array;
+    NANOARROW_RETURN_NOT_OK(stream->get_next(stream, array.get()));
+    NANOARROW_RETURN_NOT_OK(
+        ArrowArrayViewSetArray(array_view.get(), array.get(), nullptr));
+    NANOARROW_RETURN_NOT_OK(WriteBatch(out, schema.get(), array_view.get()));
+
+    while (true) {
+      array.reset();
+      NANOARROW_RETURN_NOT_OK(stream->get_next(stream, array.get()));
+      if (array->release == nullptr) {
+        break;
+      }
+
+      out << ", ";
+      NANOARROW_RETURN_NOT_OK(
+          ArrowArrayViewSetArray(array_view.get(), array.get(), nullptr));
+      NANOARROW_RETURN_NOT_OK(WriteBatch(out, schema.get(), array_view.get()));
+    }
+
+    out << "]";
+
+    // Dictionaries errored at the schema stage

Review Comment:
   Alright, but since there aren't any dictionaries, we should probably leave the `dictionaries` entry absent instead of empty.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

Re: [PR] feat: Add batch reader and data file read/write to/from ArrowArrayStream [arrow-nanoarrow]

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.

paleolimbot commented on code in PR #328:
URL: https://github.com/apache/arrow-nanoarrow/pull/328#discussion_r1411428757


##########
src/nanoarrow/nanoarrow_testing.hpp:
##########
@@ -660,6 +758,34 @@ class TestingJSONReader {
     }
   }
 
+  /// \brief Read JSON representing a RecordBatch
+  ///
+  /// Read a JSON object in the form `{"count": 123, "columns": [...]}`, propagating `out`
+  /// on success.
+  ArrowErrorCode ReadBatch(const std::string& batch_json, const ArrowSchema* schema,

Review Comment:
   I think you are right! I am going to defer reorganizing until I get closer to actual integration tests...it is not the only organizational change that needs to happen (e.g., it also maybe shouldn't be a 1700 line header) and I would like to get a few more pieces in place before the final shuffle 🙂 



##########
src/nanoarrow/nanoarrow_testing.hpp:
##########
@@ -660,6 +758,34 @@ class TestingJSONReader {
     }
   }
 
+  /// \brief Read JSON representing a RecordBatch
+  ///
+  /// Read a JSON object in the form `{"count": 123, "columns": [...]}`, propagating `out`
+  /// on success.
+  ArrowErrorCode ReadBatch(const std::string& batch_json, const ArrowSchema* schema,

Review Comment:
   I think you are right! I am going to defer reorganizing until I get closer to actual integration tests...it is not the only organizational change that needs to happen (e.g., it also maybe shouldn't be a 1700 line header) and I would like to get a few more pieces in place before the final shuffle 🙂 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

Re: [PR] feat: Add batch reader and data file read/write to/from ArrowArrayStream [arrow-nanoarrow]

Posted by "bkietz (via GitHub)" <gi...@apache.org>.

bkietz commented on code in PR #328:
URL: https://github.com/apache/arrow-nanoarrow/pull/328#discussion_r1411265082


##########
src/nanoarrow/nanoarrow_testing.hpp:
##########
@@ -660,6 +758,34 @@ class TestingJSONReader {
     }
   }
 
+  /// \brief Read JSON representing a RecordBatch
+  ///
+  /// Read a JSON object in the form `{"count": 123, "columns": [...]}`, propagating `out`
+  /// on success.
+  ArrowErrorCode ReadBatch(const std::string& batch_json, const ArrowSchema* schema,

Review Comment:
   Then perhaps this should be moved into nanoarrow_testing_test.cc?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

Re: [PR] feat: Add batch reader and data file read/write to/from ArrowArrayStream [arrow-nanoarrow]

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.

paleolimbot commented on code in PR #328:
URL: https://github.com/apache/arrow-nanoarrow/pull/328#discussion_r1411178536


##########
src/nanoarrow/nanoarrow_testing.hpp:
##########
@@ -46,6 +46,53 @@ namespace testing {
 /// \brief Writer for the Arrow integration testing JSON format
 class TestingJSONWriter {
  public:
+  /// \brief Write an ArrowArrayStream as a data file to out

Review Comment:
   Hopefully this is more clear now! The integration test JSON docs don't have a nice name for the JSON object that sits at the root of what they call a "data file" (whereas they do for things like RecordBatch and Schema).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

Re: [PR] feat: Add batch reader and data file read/write to/from ArrowArrayStream [arrow-nanoarrow]

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.

paleolimbot commented on code in PR #328:
URL: https://github.com/apache/arrow-nanoarrow/pull/328#discussion_r1411173376


##########
src/nanoarrow/nanoarrow_testing.hpp:
##########
@@ -660,6 +758,34 @@ class TestingJSONReader {
     }
   }
 
+  /// \brief Read JSON representing a RecordBatch
+  ///
+  /// Read a JSON object in the form `{"count": 123, "columns": [...]}`, propagating `out`
+  /// on success.
+  ArrowErrorCode ReadBatch(const std::string& batch_json, const ArrowSchema* schema,

Review Comment:
   It exists pretty much 100% to test `SetArrayBatch()`, although it's true that there is less to test in `SetArrayBatch()` than in (say) `SetArrayColumn()`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

Re: [PR] feat: Add batch reader and data file read/write to/from ArrowArrayStream [arrow-nanoarrow]

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.

paleolimbot commented on code in PR #328:
URL: https://github.com/apache/arrow-nanoarrow/pull/328#discussion_r1411170852


##########
src/nanoarrow/nanoarrow_testing.hpp:
##########
@@ -46,6 +46,53 @@ namespace testing {
 /// \brief Writer for the Arrow integration testing JSON format
 class TestingJSONWriter {
  public:
+  /// \brief Write an ArrowArrayStream as a data file to out
+  ///
+  /// Creates output like `{"schema": {...}, "batches": [...], ...}`.
+  ArrowErrorCode WriteDataFile(std::ostream& out, ArrowArrayStream* stream) {
+    if (stream == nullptr || stream->release == nullptr) {
+      return EINVAL;
+    }
+
+    out << R"({"schema": )";
+
+    nanoarrow::UniqueSchema schema;
+    NANOARROW_RETURN_NOT_OK(stream->get_schema(stream, schema.get()));
+    NANOARROW_RETURN_NOT_OK(WriteSchema(out, schema.get()));
+
+    nanoarrow::UniqueArrayView array_view;
+    NANOARROW_RETURN_NOT_OK(
+        ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), nullptr));
+
+    out << R"(, "batches": [)";
+
+    nanoarrow::UniqueArray array;
+    NANOARROW_RETURN_NOT_OK(stream->get_next(stream, array.get()));
+    NANOARROW_RETURN_NOT_OK(
+        ArrowArrayViewSetArray(array_view.get(), array.get(), nullptr));
+    NANOARROW_RETURN_NOT_OK(WriteBatch(out, schema.get(), array_view.get()));
+
+    while (true) {
+      array.reset();
+      NANOARROW_RETURN_NOT_OK(stream->get_next(stream, array.get()));
+      if (array->release == nullptr) {
+        break;
+      }
+
+      out << ", ";
+      NANOARROW_RETURN_NOT_OK(
+          ArrowArrayViewSetArray(array_view.get(), array.get(), nullptr));
+      NANOARROW_RETURN_NOT_OK(WriteBatch(out, schema.get(), array_view.get()));
+    }

Review Comment:
   Good catch 😬 ! (Test added!)



##########
src/nanoarrow/nanoarrow_testing.hpp:
##########
@@ -46,6 +46,53 @@ namespace testing {
 /// \brief Writer for the Arrow integration testing JSON format
 class TestingJSONWriter {
  public:
+  /// \brief Write an ArrowArrayStream as a data file to out
+  ///
+  /// Creates output like `{"schema": {...}, "batches": [...], ...}`.
+  ArrowErrorCode WriteDataFile(std::ostream& out, ArrowArrayStream* stream) {
+    if (stream == nullptr || stream->release == nullptr) {
+      return EINVAL;
+    }
+
+    out << R"({"schema": )";
+
+    nanoarrow::UniqueSchema schema;
+    NANOARROW_RETURN_NOT_OK(stream->get_schema(stream, schema.get()));
+    NANOARROW_RETURN_NOT_OK(WriteSchema(out, schema.get()));
+
+    nanoarrow::UniqueArrayView array_view;
+    NANOARROW_RETURN_NOT_OK(
+        ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), nullptr));
+
+    out << R"(, "batches": [)";
+
+    nanoarrow::UniqueArray array;
+    NANOARROW_RETURN_NOT_OK(stream->get_next(stream, array.get()));
+    NANOARROW_RETURN_NOT_OK(
+        ArrowArrayViewSetArray(array_view.get(), array.get(), nullptr));
+    NANOARROW_RETURN_NOT_OK(WriteBatch(out, schema.get(), array_view.get()));
+
+    while (true) {
+      array.reset();
+      NANOARROW_RETURN_NOT_OK(stream->get_next(stream, array.get()));
+      if (array->release == nullptr) {
+        break;
+      }
+
+      out << ", ";
+      NANOARROW_RETURN_NOT_OK(
+          ArrowArrayViewSetArray(array_view.get(), array.get(), nullptr));
+      NANOARROW_RETURN_NOT_OK(WriteBatch(out, schema.get(), array_view.get()));
+    }

Review Comment:
   Good catch 😬 ! (Test added!)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

Re: [PR] feat: Add batch reader and data file read/write to/from ArrowArrayStream [arrow-nanoarrow]

Posted by "bkietz (via GitHub)" <gi...@apache.org>.

bkietz commented on code in PR #328:
URL: https://github.com/apache/arrow-nanoarrow/pull/328#discussion_r1411088734


##########
src/nanoarrow/nanoarrow_testing.hpp:
##########
@@ -46,6 +46,53 @@ namespace testing {
 /// \brief Writer for the Arrow integration testing JSON format
 class TestingJSONWriter {
  public:
+  /// \brief Write an ArrowArrayStream as a data file to out
+  ///
+  /// Creates output like `{"schema": {...}, "batches": [...], ...}`.
+  ArrowErrorCode WriteDataFile(std::ostream& out, ArrowArrayStream* stream) {
+    if (stream == nullptr || stream->release == nullptr) {
+      return EINVAL;
+    }
+
+    out << R"({"schema": )";
+
+    nanoarrow::UniqueSchema schema;
+    NANOARROW_RETURN_NOT_OK(stream->get_schema(stream, schema.get()));
+    NANOARROW_RETURN_NOT_OK(WriteSchema(out, schema.get()));
+
+    nanoarrow::UniqueArrayView array_view;
+    NANOARROW_RETURN_NOT_OK(
+        ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), nullptr));
+
+    out << R"(, "batches": [)";
+
+    nanoarrow::UniqueArray array;
+    NANOARROW_RETURN_NOT_OK(stream->get_next(stream, array.get()));
+    NANOARROW_RETURN_NOT_OK(
+        ArrowArrayViewSetArray(array_view.get(), array.get(), nullptr));
+    NANOARROW_RETURN_NOT_OK(WriteBatch(out, schema.get(), array_view.get()));
+
+    while (true) {
+      array.reset();
+      NANOARROW_RETURN_NOT_OK(stream->get_next(stream, array.get()));
+      if (array->release == nullptr) {
+        break;
+      }
+
+      out << ", ";
+      NANOARROW_RETURN_NOT_OK(
+          ArrowArrayViewSetArray(array_view.get(), array.get(), nullptr));
+      NANOARROW_RETURN_NOT_OK(WriteBatch(out, schema.get(), array_view.get()));
+    }
+
+    out << "]";
+
+    // Dictionaries errored at the schema stage

Review Comment:
   I'm confused by this comment. Are you leaving dictionaries as a TODO?
   
   Also, although I'm sure an empty dictionaries entry would be accepted by any reader, the spec says it will only be present [if there are dictionary fields in the schema](https://github.com/apache/arrow/blob/e9730f5/docs/source/format/Integration.rst?plain=1#L180).



##########
src/nanoarrow/nanoarrow_testing.hpp:
##########
@@ -660,6 +758,34 @@ class TestingJSONReader {
     }
   }
 
+  /// \brief Read JSON representing a RecordBatch
+  ///
+  /// Read a JSON object in the form `{"count": 123, "columns": [...]}`, propagating `out`
+  /// on success.
+  ArrowErrorCode ReadBatch(const std::string& batch_json, const ArrowSchema* schema,

Review Comment:
   When would this be used? When reading a json file the parsing will all be done up front and we won't see individual batch strings, so won't SetArrayBatch be sufficient?



##########
src/nanoarrow/nanoarrow_testing.hpp:
##########
@@ -46,6 +46,53 @@ namespace testing {
 /// \brief Writer for the Arrow integration testing JSON format
 class TestingJSONWriter {
  public:
+  /// \brief Write an ArrowArrayStream as a data file to out
+  ///
+  /// Creates output like `{"schema": {...}, "batches": [...], ...}`.
+  ArrowErrorCode WriteDataFile(std::ostream& out, ArrowArrayStream* stream) {
+    if (stream == nullptr || stream->release == nullptr) {
+      return EINVAL;
+    }
+
+    out << R"({"schema": )";
+
+    nanoarrow::UniqueSchema schema;
+    NANOARROW_RETURN_NOT_OK(stream->get_schema(stream, schema.get()));
+    NANOARROW_RETURN_NOT_OK(WriteSchema(out, schema.get()));
+
+    nanoarrow::UniqueArrayView array_view;
+    NANOARROW_RETURN_NOT_OK(
+        ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), nullptr));
+
+    out << R"(, "batches": [)";
+
+    nanoarrow::UniqueArray array;
+    NANOARROW_RETURN_NOT_OK(stream->get_next(stream, array.get()));
+    NANOARROW_RETURN_NOT_OK(
+        ArrowArrayViewSetArray(array_view.get(), array.get(), nullptr));
+    NANOARROW_RETURN_NOT_OK(WriteBatch(out, schema.get(), array_view.get()));
+
+    while (true) {
+      array.reset();
+      NANOARROW_RETURN_NOT_OK(stream->get_next(stream, array.get()));
+      if (array->release == nullptr) {
+        break;
+      }
+
+      out << ", ";
+      NANOARROW_RETURN_NOT_OK(
+          ArrowArrayViewSetArray(array_view.get(), array.get(), nullptr));
+      NANOARROW_RETURN_NOT_OK(WriteBatch(out, schema.get(), array_view.get()));
+    }

Review Comment:
   IIUC, we need to check if the first batch is released too. As a nit, I've folded this into a single loop:
   ```suggestion
       std::string sep;
       do {
         NANOARROW_RETURN_NOT_OK(stream->get_next(stream, array.get()));
         if (array->release == nullptr) {
           break;
         }
         NANOARROW_RETURN_NOT_OK(
             ArrowArrayViewSetArray(array_view.get(), array.get(), nullptr));
         out << sep;
         sep = ", ";
         NANOARROW_RETURN_NOT_OK(WriteBatch(out, schema.get(), array_view.get()));
         array.reset();
       } while (true);
   ```



##########
src/nanoarrow/nanoarrow_testing.hpp:
##########
@@ -46,6 +46,53 @@ namespace testing {
 /// \brief Writer for the Arrow integration testing JSON format
 class TestingJSONWriter {
  public:
+  /// \brief Write an ArrowArrayStream as a data file to out
+  ///
+  /// Creates output like `{"schema": {...}, "batches": [...], ...}`.
+  ArrowErrorCode WriteDataFile(std::ostream& out, ArrowArrayStream* stream) {
+    if (stream == nullptr || stream->release == nullptr) {
+      return EINVAL;
+    }
+
+    out << R"({"schema": )";
+
+    nanoarrow::UniqueSchema schema;
+    NANOARROW_RETURN_NOT_OK(stream->get_schema(stream, schema.get()));
+    NANOARROW_RETURN_NOT_OK(WriteSchema(out, schema.get()));
+
+    nanoarrow::UniqueArrayView array_view;
+    NANOARROW_RETURN_NOT_OK(
+        ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), nullptr));
+
+    out << R"(, "batches": [)";
+
+    nanoarrow::UniqueArray array;
+    NANOARROW_RETURN_NOT_OK(stream->get_next(stream, array.get()));
+    NANOARROW_RETURN_NOT_OK(
+        ArrowArrayViewSetArray(array_view.get(), array.get(), nullptr));
+    NANOARROW_RETURN_NOT_OK(WriteBatch(out, schema.get(), array_view.get()));
+
+    while (true) {
+      array.reset();
+      NANOARROW_RETURN_NOT_OK(stream->get_next(stream, array.get()));
+      if (array->release == nullptr) {
+        break;
+      }
+
+      out << ", ";
+      NANOARROW_RETURN_NOT_OK(
+          ArrowArrayViewSetArray(array_view.get(), array.get(), nullptr));
+      NANOARROW_RETURN_NOT_OK(WriteBatch(out, schema.get(), array_view.get()));
+    }

Review Comment:
   I'd recommend adding a test for empty streams too



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

Re: [PR] feat: Add batch reader and data file read/write to/from ArrowArrayStream [arrow-nanoarrow]

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.

paleolimbot commented on code in PR #328:
URL: https://github.com/apache/arrow-nanoarrow/pull/328#discussion_r1411177718


##########
src/nanoarrow/nanoarrow_testing.hpp:
##########
@@ -46,6 +46,53 @@ namespace testing {
 /// \brief Writer for the Arrow integration testing JSON format
 class TestingJSONWriter {
  public:
+  /// \brief Write an ArrowArrayStream as a data file to out
+  ///
+  /// Creates output like `{"schema": {...}, "batches": [...], ...}`.
+  ArrowErrorCode WriteDataFile(std::ostream& out, ArrowArrayStream* stream) {
+    if (stream == nullptr || stream->release == nullptr) {
+      return EINVAL;
+    }
+
+    out << R"({"schema": )";
+
+    nanoarrow::UniqueSchema schema;
+    NANOARROW_RETURN_NOT_OK(stream->get_schema(stream, schema.get()));
+    NANOARROW_RETURN_NOT_OK(WriteSchema(out, schema.get()));
+
+    nanoarrow::UniqueArrayView array_view;
+    NANOARROW_RETURN_NOT_OK(
+        ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), nullptr));
+
+    out << R"(, "batches": [)";
+
+    nanoarrow::UniqueArray array;
+    NANOARROW_RETURN_NOT_OK(stream->get_next(stream, array.get()));
+    NANOARROW_RETURN_NOT_OK(
+        ArrowArrayViewSetArray(array_view.get(), array.get(), nullptr));
+    NANOARROW_RETURN_NOT_OK(WriteBatch(out, schema.get(), array_view.get()));
+
+    while (true) {
+      array.reset();
+      NANOARROW_RETURN_NOT_OK(stream->get_next(stream, array.get()));
+      if (array->release == nullptr) {
+        break;
+      }
+
+      out << ", ";
+      NANOARROW_RETURN_NOT_OK(
+          ArrowArrayViewSetArray(array_view.get(), array.get(), nullptr));
+      NANOARROW_RETURN_NOT_OK(WriteBatch(out, schema.get(), array_view.get()));
+    }
+
+    out << "]";
+
+    // Dictionaries errored at the schema stage

Review Comment:
   Ben had the same comment...hopefully this is more clear now! (I just haven't implemented dictionaries yet)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

Re: [PR] feat: Add batch reader and data file read/write to/from ArrowArrayStream [arrow-nanoarrow]

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.

paleolimbot commented on code in PR #328:
URL: https://github.com/apache/arrow-nanoarrow/pull/328#discussion_r1411172399


##########
src/nanoarrow/nanoarrow_testing.hpp:
##########
@@ -46,6 +46,53 @@ namespace testing {
 /// \brief Writer for the Arrow integration testing JSON format
 class TestingJSONWriter {
  public:
+  /// \brief Write an ArrowArrayStream as a data file to out
+  ///
+  /// Creates output like `{"schema": {...}, "batches": [...], ...}`.
+  ArrowErrorCode WriteDataFile(std::ostream& out, ArrowArrayStream* stream) {
+    if (stream == nullptr || stream->release == nullptr) {
+      return EINVAL;
+    }
+
+    out << R"({"schema": )";
+
+    nanoarrow::UniqueSchema schema;
+    NANOARROW_RETURN_NOT_OK(stream->get_schema(stream, schema.get()));
+    NANOARROW_RETURN_NOT_OK(WriteSchema(out, schema.get()));
+
+    nanoarrow::UniqueArrayView array_view;
+    NANOARROW_RETURN_NOT_OK(
+        ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), nullptr));
+
+    out << R"(, "batches": [)";
+
+    nanoarrow::UniqueArray array;
+    NANOARROW_RETURN_NOT_OK(stream->get_next(stream, array.get()));
+    NANOARROW_RETURN_NOT_OK(
+        ArrowArrayViewSetArray(array_view.get(), array.get(), nullptr));
+    NANOARROW_RETURN_NOT_OK(WriteBatch(out, schema.get(), array_view.get()));
+
+    while (true) {
+      array.reset();
+      NANOARROW_RETURN_NOT_OK(stream->get_next(stream, array.get()));
+      if (array->release == nullptr) {
+        break;
+      }
+
+      out << ", ";
+      NANOARROW_RETURN_NOT_OK(
+          ArrowArrayViewSetArray(array_view.get(), array.get(), nullptr));
+      NANOARROW_RETURN_NOT_OK(WriteBatch(out, schema.get(), array_view.get()));
+    }
+
+    out << "]";
+
+    // Dictionaries errored at the schema stage

Review Comment:
   I updated this comment to hopefully be more clear that the dictionaries are only empty right now because they're not supported by the writer yet. I'm hoping to get something end-to-end before filling out dictionaries/a few other missing types!



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org