You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/08/12 17:32:40 UTC

[GitHub] [arrow] wjones127 commented on a diff in pull request #13859: ARROW-17377: [C++][Docs] Adds tutorial for basic Arrow, file access, compute, and datasets

wjones127 commented on code in PR #13859:
URL: https://github.com/apache/arrow/pull/13859#discussion_r944643464


##########
cpp/examples/tutorial_examples/dataset_example.cc:
##########
@@ -0,0 +1,169 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <arrow/api.h>
+#include <parquet/arrow/reader.h>
+#include <parquet/arrow/writer.h>
+#include <arrow/dataset/api.h>
+
+#include <iostream>
+
+
+// Generate the 10-row, three-column int64 Table ("a", "b", "c") used by this example.
+arrow::Result<std::shared_ptr<arrow::Table>> CreateTable() {
+    // This code should look familiar from the basic Arrow example and is not the
+    // focus of this one; we just need some data to work on, and this builds it.
+    auto schema =
+            arrow::schema({arrow::field("a", arrow::int64()),
+                           arrow::field("b", arrow::int64()),
+                           arrow::field("c", arrow::int64())});
+    std::shared_ptr<arrow::Array> array_a;
+    std::shared_ptr<arrow::Array> array_b;
+    std::shared_ptr<arrow::Array> array_c;
+    arrow::NumericBuilder<arrow::Int64Type> builder;  // one builder, reused per column
+    ARROW_RETURN_NOT_OK(builder.AppendValues({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}));
+    ARROW_RETURN_NOT_OK(builder.Finish(&array_a));  // column "a": 0..9 ascending
+    builder.Reset();
+    ARROW_RETURN_NOT_OK(builder.AppendValues({9, 8, 7, 6, 5, 4, 3, 2, 1, 0}));
+    ARROW_RETURN_NOT_OK(builder.Finish(&array_b));  // column "b": 9..0 descending
+    builder.Reset();
+    ARROW_RETURN_NOT_OK(builder.AppendValues({1, 2, 1, 2, 1, 2, 1, 2, 1, 2}));
+    ARROW_RETURN_NOT_OK(builder.Finish(&array_c));  // column "c": alternating 1, 2
+    return arrow::Table::Make(schema, {array_a, array_b, array_c});  // same order as schema
+}
+
+// Set up a dataset by writing two Parquet files into root_path + "parquet_dataset".
+arrow::Result<std::string> CreateExampleParquetDataset(
+        const std::shared_ptr<arrow::fs::FileSystem>& filesystem,
+        const std::string& root_path) {
+    // Much like CreateTable(), this is a utility that gets us the dataset we'll be
+    // reading from. Don't worry, we also write a dataset in the example proper.
+    auto base_path = root_path + "parquet_dataset";  // plain concatenation, no separator added
+    ARROW_RETURN_NOT_OK(filesystem->CreateDir(base_path));
+    // Create an Arrow Table
+    ARROW_ASSIGN_OR_RAISE(auto table, CreateTable());
+    // Write it into two Parquet files, splitting the table at row 5
+    ARROW_ASSIGN_OR_RAISE(auto output,
+                          filesystem->OpenOutputStream(base_path + "/data1.parquet"));
+    ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(
+            *table->Slice(0, 5), arrow::default_memory_pool(), output, 2048));
+    ARROW_ASSIGN_OR_RAISE(output,
+                          filesystem->OpenOutputStream(base_path + "/data2.parquet"));
+    ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(
+            *table->Slice(5), arrow::default_memory_pool(), output, 2048));
+    return base_path;  // the directory that now holds both files
+}
+
+arrow::Status RunMain() {
+
+    //Get our environment prepared for reading, by setting up some quick writing.
+    ARROW_ASSIGN_OR_RAISE(auto src_table, CreateTable())
+    std::shared_ptr<arrow::fs::FileSystem> setup_fs;
+    //Note this operates in the directory the executable is built in.
+    char setup_path[256];
+    getcwd(setup_path, 256);
+    ARROW_ASSIGN_OR_RAISE(setup_fs, arrow::fs::FileSystemFromUriOrPath(setup_path));
+    ARROW_ASSIGN_OR_RAISE(auto dset_path, CreateExampleParquetDataset(setup_fs,
+                                                                      ""));
+
+    //First, we need a filesystem object, which lets us interact with our local
+    //filesystem starting at a given path. For the sake of simplicity, that'll be
+    //the current directory.
+    std::shared_ptr<arrow::fs::FileSystem> fs;
+    //This feels pretty bad, but I wasn't finding great solutions that're
+    //system-generic -- could use some advice on how to set this up.
+    char init_path[256];
+    getcwd(init_path, 256);
+    ARROW_ASSIGN_OR_RAISE(fs, arrow::fs::FileSystemFromUriOrPath(init_path));
+
+    //A file selector lets us actually traverse a multi-file dataset.
+    arrow::fs::FileSelector selector;
+    selector.base_dir = "parquet_dataset";
+    //Recursive is a safe bet if you don't know the nesting of your dataset.
+    selector.recursive = true;
+    //Making an options object lets us configure our dataset reading.
+    arrow::dataset::FileSystemFactoryOptions options;
+    // We'll use Hive-style partitioning. We'll let Arrow Datasets infer the partition
+    // schema. We won't set any other options, defaults are fine.
+    options.partitioning = arrow::dataset::HivePartitioning::MakeFactory();
+    auto read_format =
+            std::make_shared<arrow::dataset::ParquetFileFormat>();
+    //Now, we get a factory that will let us get our dataset -- we don't have the
+    //dataset yet!
+    auto factory = arrow::dataset::FileSystemDatasetFactory::Make(fs,
+                                                                  selector,
+                                                                  read_format,
+                                                                  options)
+            .ValueOrDie();
+    //Now we read into our dataset from the factory.
+    auto read_dataset = factory->Finish().ValueOrDie();
+    // Print out the fragments
+    for (const auto& fragment : read_dataset->GetFragments().ValueOrDie()) {
+        std::cout << "Found fragment: " << (*fragment)->ToString() << std::endl;
+    }
+
+    //Scan dataset into a Table -- once this is done, you can do
+    //normal table things with it, like computation and printing. However, now you're
+    //also dedicated to being in memory.
+    auto read_scan_builder = read_dataset->NewScan().ValueOrDie();
+    auto read_scanner = read_scan_builder->Finish().ValueOrDie();
+    std::shared_ptr<arrow::Table> table = read_scanner->ToTable().ValueOrDie();
+    std::cout << table->ToString();
+
+    //Now, let's get a table out as a dataset!

Review Comment:
   Maybe instead of showing how to create an `InMemoryDataset`, we should show how to get a `RecordBatchReader`, which is a more commonly used data structure?
   
   ```cpp
   arrow::RecordBatchReader reader;
   ARROW_ASSIGN_OR_RAISE(reader, read_dataset->NewScan().ValueOrDie()->ScanBatches());
   
   std::shared_ptr<arrow::dataset::Scanner> write_scanner;
   ARROW_ASSIGN_OR_RAISE(write_scanner, arrow::dataset::ScannerBuilder.FromRecordBatchReader(reader)->Finish());
   ```
   
   Also I had no idea we had to make a scanner to write in C++. It's not that way in Python / R. Maybe we should add more overloads 🤔



##########
cpp/examples/tutorial_examples/dataset_example.cc:
##########
@@ -0,0 +1,169 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <arrow/api.h>
+#include <parquet/arrow/reader.h>
+#include <parquet/arrow/writer.h>
+#include <arrow/dataset/api.h>
+
+#include <iostream>
+
+
+// Generate the 10-row, three-column int64 Table ("a", "b", "c") used by this example.
+arrow::Result<std::shared_ptr<arrow::Table>> CreateTable() {
+    // This code should look familiar from the basic Arrow example and is not the
+    // focus of this one; we just need some data to work on, and this builds it.
+    auto schema =
+            arrow::schema({arrow::field("a", arrow::int64()),
+                           arrow::field("b", arrow::int64()),
+                           arrow::field("c", arrow::int64())});
+    std::shared_ptr<arrow::Array> array_a;
+    std::shared_ptr<arrow::Array> array_b;
+    std::shared_ptr<arrow::Array> array_c;
+    arrow::NumericBuilder<arrow::Int64Type> builder;  // one builder, reused per column
+    ARROW_RETURN_NOT_OK(builder.AppendValues({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}));
+    ARROW_RETURN_NOT_OK(builder.Finish(&array_a));  // column "a": 0..9 ascending
+    builder.Reset();
+    ARROW_RETURN_NOT_OK(builder.AppendValues({9, 8, 7, 6, 5, 4, 3, 2, 1, 0}));
+    ARROW_RETURN_NOT_OK(builder.Finish(&array_b));  // column "b": 9..0 descending
+    builder.Reset();
+    ARROW_RETURN_NOT_OK(builder.AppendValues({1, 2, 1, 2, 1, 2, 1, 2, 1, 2}));
+    ARROW_RETURN_NOT_OK(builder.Finish(&array_c));  // column "c": alternating 1, 2
+    return arrow::Table::Make(schema, {array_a, array_b, array_c});  // same order as schema
+}
+
+// Set up a dataset by writing two Parquet files into root_path + "parquet_dataset".
+arrow::Result<std::string> CreateExampleParquetDataset(
+        const std::shared_ptr<arrow::fs::FileSystem>& filesystem,
+        const std::string& root_path) {
+    // Much like CreateTable(), this is a utility that gets us the dataset we'll be
+    // reading from. Don't worry, we also write a dataset in the example proper.
+    auto base_path = root_path + "parquet_dataset";  // plain concatenation, no separator added
+    ARROW_RETURN_NOT_OK(filesystem->CreateDir(base_path));
+    // Create an Arrow Table
+    ARROW_ASSIGN_OR_RAISE(auto table, CreateTable());
+    // Write it into two Parquet files, splitting the table at row 5
+    ARROW_ASSIGN_OR_RAISE(auto output,
+                          filesystem->OpenOutputStream(base_path + "/data1.parquet"));
+    ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(
+            *table->Slice(0, 5), arrow::default_memory_pool(), output, 2048));
+    ARROW_ASSIGN_OR_RAISE(output,
+                          filesystem->OpenOutputStream(base_path + "/data2.parquet"));
+    ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(
+            *table->Slice(5), arrow::default_memory_pool(), output, 2048));
+    return base_path;  // the directory that now holds both files
+}
+
+arrow::Status RunMain() {
+
+    //Get our environment prepared for reading, by setting up some quick writing.
+    ARROW_ASSIGN_OR_RAISE(auto src_table, CreateTable())
+    std::shared_ptr<arrow::fs::FileSystem> setup_fs;
+    //Note this operates in the directory the executable is built in.
+    char setup_path[256];
+    getcwd(setup_path, 256);
+    ARROW_ASSIGN_OR_RAISE(setup_fs, arrow::fs::FileSystemFromUriOrPath(setup_path));
+    ARROW_ASSIGN_OR_RAISE(auto dset_path, CreateExampleParquetDataset(setup_fs,
+                                                                      ""));
+
+    //First, we need a filesystem object, which lets us interact with our local
+    //filesystem starting at a given path. For the sake of simplicity, that'll be
+    //the current directory.
+    std::shared_ptr<arrow::fs::FileSystem> fs;
+    //This feels pretty bad, but I wasn't finding great solutions that're
+    //system-generic -- could use some advice on how to set this up.
+    char init_path[256];
+    getcwd(init_path, 256);
+    ARROW_ASSIGN_OR_RAISE(fs, arrow::fs::FileSystemFromUriOrPath(init_path));
+
+    //A file selector lets us actually traverse a multi-file dataset.
+    arrow::fs::FileSelector selector;
+    selector.base_dir = "parquet_dataset";
+    //Recursive is a safe bet if you don't know the nesting of your dataset.
+    selector.recursive = true;
+    //Making an options object lets us configure our dataset reading.
+    arrow::dataset::FileSystemFactoryOptions options;
+    // We'll use Hive-style partitioning. We'll let Arrow Datasets infer the partition
+    // schema. We won't set any other options, defaults are fine.
+    options.partitioning = arrow::dataset::HivePartitioning::MakeFactory();
+    auto read_format =
+            std::make_shared<arrow::dataset::ParquetFileFormat>();
+    //Now, we get a factory that will let us get our dataset -- we don't have the
+    //dataset yet!
+    auto factory = arrow::dataset::FileSystemDatasetFactory::Make(fs,
+                                                                  selector,
+                                                                  read_format,
+                                                                  options)
+            .ValueOrDie();
+    //Now we read into our dataset from the factory.
+    auto read_dataset = factory->Finish().ValueOrDie();
+    // Print out the fragments
+    for (const auto& fragment : read_dataset->GetFragments().ValueOrDie()) {
+        std::cout << "Found fragment: " << (*fragment)->ToString() << std::endl;
+    }
+
+    //Scan dataset into a Table -- once this is done, you can do
+    //normal table things with it, like computation and printing. However, now you're
+    //also dedicated to being in memory.
+    auto read_scan_builder = read_dataset->NewScan().ValueOrDie();
+    auto read_scanner = read_scan_builder->Finish().ValueOrDie();
+    std::shared_ptr<arrow::Table> table = read_scanner->ToTable().ValueOrDie();
+    std::cout << table->ToString();
+
+    //Now, let's get a table out as a dataset!
+    //We make a dataset from our Table, then set up a scanner, which lets us go to a file.
+    auto write_dataset = std::make_shared<arrow::dataset::InMemoryDataset>(table);
+    auto write_scanner_builder = write_dataset->NewScan().ValueOrDie();
+    auto write_scanner = write_scanner_builder->Finish().ValueOrDie();
+
+    // The partition schema determines which fields are used as keys for partitioning.
+    auto partition_schema = arrow::schema({arrow::field("a", arrow::utf8())});
+    // We'll use Hive-style partitioning, which creates directories with "key=value"
+    // pairs.
+    auto partitioning =
+            std::make_shared<arrow::dataset::HivePartitioning>(partition_schema);
+    // Now, we declare we'll be writing Parquet files.
+    auto write_format = std::make_shared<arrow::dataset::ParquetFileFormat>();
+    //This time, we make Options for writing, but do much more configuration.
+    arrow::dataset::FileSystemDatasetWriteOptions write_options;
+    //Defaults to start.
+    write_options.file_write_options = write_format->DefaultWriteOptions();
+    //Use the filesystem we already have.
+    write_options.filesystem = fs;
+    //Can only really be run once at the moment, because it'll explode upon noticing
+    // the folder has anything.

Review Comment:
   I think for this reason, it's reasonable to show use of `write_options.existing_data_behavior`.
   
   Also for brevity, maybe we can drop some of the other options, like `write_options.basename_template`?



##########
cpp/examples/tutorial_examples/file_access_example.cc:
##########
@@ -0,0 +1,174 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <arrow/api.h>
+#include <arrow/csv/api.h>
+#include <parquet/arrow/reader.h>
+#include <parquet/arrow/writer.h>
+#include <arrow/io/api.h>
+#include <arrow/ipc/api.h>
+
+#include <iostream>
+
+arrow::Status GenInitialFile(){
+  //Helper for this example: build a Day/Month/Year table from two 8-bit integer arrays
+  //and a 16-bit integer array -- just like the basic Arrow example -- and write it out.
+  arrow::Int8Builder int8builder;  // used for both int8 columns (days, then months)
+  int8_t days_raw[5] = {1, 12, 17, 23, 28};
+  ARROW_RETURN_NOT_OK(int8builder.AppendValues(days_raw, 5));
+  std::shared_ptr<arrow::Array> days;
+  ARROW_ASSIGN_OR_RAISE(days, int8builder.Finish());
+
+  int8_t months_raw[5] = {1, 3, 5, 7, 1};
+  ARROW_RETURN_NOT_OK(int8builder.AppendValues(months_raw, 5));  // same builder, no explicit Reset()
+  std::shared_ptr<arrow::Array> months;
+  ARROW_ASSIGN_OR_RAISE(months, int8builder.Finish());
+
+  arrow::Int16Builder int16builder;
+  int16_t years_raw[5] = {1990, 2000, 1995, 2000, 1995};
+  ARROW_RETURN_NOT_OK(int16builder.AppendValues(years_raw, 5));
+  std::shared_ptr<arrow::Array> years;
+  ARROW_ASSIGN_OR_RAISE(years, int16builder.Finish());
+
+  //Get a vector of our Arrays, in the same order as the schema fields below
+  std::vector<std::shared_ptr<arrow::Array>> columns = {days, months, years};
+
+  //Make a schema to initialize the Table with
+  std::shared_ptr<arrow::Field> field_day, field_month, field_year;
+  std::shared_ptr<arrow::Schema> schema;
+
+  field_day = arrow::field("Day", arrow::int8());
+  field_month = arrow::field("Month", arrow::int8());
+  field_year = arrow::field("Year", arrow::int16());
+
+  schema = arrow::schema({field_day, field_month, field_year});
+  //With the schema and data, create a Table
+  std::shared_ptr<arrow::Table> table; 
+  table = arrow::Table::Make(schema, columns);
+
+  //Write out test_in.ipc, test_in.csv, and test_in.parquet for the example to use.
+  std::shared_ptr<arrow::io::FileOutputStream> outfile;  // rebound to each file in turn
+  ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open("test_in.ipc"));
+  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::ipc::RecordBatchWriter> ipc_writer,
+                        arrow::ipc::MakeFileWriter(outfile, schema));
+  ARROW_RETURN_NOT_OK(ipc_writer->WriteTable(*table));
+  ARROW_RETURN_NOT_OK(ipc_writer->Close());
+
+  ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open("test_in.csv"));
+  ARROW_ASSIGN_OR_RAISE(auto csv_writer, arrow::csv::MakeCSVWriter(
+          outfile, table->schema()));
+  ARROW_RETURN_NOT_OK(csv_writer->WriteTable(*table));
+  ARROW_RETURN_NOT_OK(csv_writer->Close());
+
+  ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open("test_in.parquet"));
+  PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table,
+                                                  arrow::default_memory_pool(),
+                                                  outfile, 5));  // NOTE(review): throws on error, unlike the ARROW_* macros above -- confirm intended
+
+  return arrow::Status::OK();
+}
+
+arrow::Status RunMain() {
+  //Generate initial files for each format with a helper function -- don't worry,
+  //we'll also write a table in this example.
+  ARROW_RETURN_NOT_OK(GenInitialFile());
+ 
+  //Reading and writing from files
+
+  //First, we have to set up a ReadableFile object, which just lets us point our
+  //readers to the right data on disk. We'll be reusing this object, and rebinding
+  //it to multiple files throughout the example.
+  std::shared_ptr<arrow::io::ReadableFile> infile;
+  //Get "test_in.ipc" into our file pointer
+  ARROW_ASSIGN_OR_RAISE(infile,
+                          arrow::io::ReadableFile::Open("test_in.ipc",
+                                                        arrow::default_memory_pool()));
+  //Open up the file with the IPC features of the library, gives us a reader object.
+  ARROW_ASSIGN_OR_RAISE(auto ipc_reader,
+                        arrow::ipc::RecordBatchFileReader::Open(infile));
+  //Using the reader, we can read Record Batches. Note that this is specific to IPC;

Review Comment:
   Created: https://issues.apache.org/jira/browse/ARROW-17401



##########
cpp/examples/tutorial_examples/file_access_example.cc:
##########
@@ -0,0 +1,174 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <arrow/api.h>
+#include <arrow/csv/api.h>
+#include <parquet/arrow/reader.h>
+#include <parquet/arrow/writer.h>
+#include <arrow/io/api.h>
+#include <arrow/ipc/api.h>
+
+#include <iostream>
+
+arrow::Status GenInitialFile(){
+  //Helper for this example: build a Day/Month/Year table from two 8-bit integer arrays
+  //and a 16-bit integer array -- just like the basic Arrow example -- and write it out.
+  arrow::Int8Builder int8builder;  // used for both int8 columns (days, then months)
+  int8_t days_raw[5] = {1, 12, 17, 23, 28};
+  ARROW_RETURN_NOT_OK(int8builder.AppendValues(days_raw, 5));
+  std::shared_ptr<arrow::Array> days;
+  ARROW_ASSIGN_OR_RAISE(days, int8builder.Finish());
+
+  int8_t months_raw[5] = {1, 3, 5, 7, 1};
+  ARROW_RETURN_NOT_OK(int8builder.AppendValues(months_raw, 5));  // same builder, no explicit Reset()
+  std::shared_ptr<arrow::Array> months;
+  ARROW_ASSIGN_OR_RAISE(months, int8builder.Finish());
+
+  arrow::Int16Builder int16builder;
+  int16_t years_raw[5] = {1990, 2000, 1995, 2000, 1995};
+  ARROW_RETURN_NOT_OK(int16builder.AppendValues(years_raw, 5));
+  std::shared_ptr<arrow::Array> years;
+  ARROW_ASSIGN_OR_RAISE(years, int16builder.Finish());
+
+  //Get a vector of our Arrays, in the same order as the schema fields below
+  std::vector<std::shared_ptr<arrow::Array>> columns = {days, months, years};
+
+  //Make a schema to initialize the Table with
+  std::shared_ptr<arrow::Field> field_day, field_month, field_year;
+  std::shared_ptr<arrow::Schema> schema;
+
+  field_day = arrow::field("Day", arrow::int8());
+  field_month = arrow::field("Month", arrow::int8());
+  field_year = arrow::field("Year", arrow::int16());
+
+  schema = arrow::schema({field_day, field_month, field_year});
+  //With the schema and data, create a Table
+  std::shared_ptr<arrow::Table> table; 
+  table = arrow::Table::Make(schema, columns);
+
+  //Write out test_in.ipc, test_in.csv, and test_in.parquet for the example to use.
+  std::shared_ptr<arrow::io::FileOutputStream> outfile;  // rebound to each file in turn
+  ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open("test_in.ipc"));
+  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::ipc::RecordBatchWriter> ipc_writer,
+                        arrow::ipc::MakeFileWriter(outfile, schema));
+  ARROW_RETURN_NOT_OK(ipc_writer->WriteTable(*table));
+  ARROW_RETURN_NOT_OK(ipc_writer->Close());
+
+  ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open("test_in.csv"));
+  ARROW_ASSIGN_OR_RAISE(auto csv_writer, arrow::csv::MakeCSVWriter(
+          outfile, table->schema()));
+  ARROW_RETURN_NOT_OK(csv_writer->WriteTable(*table));
+  ARROW_RETURN_NOT_OK(csv_writer->Close());
+
+  ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open("test_in.parquet"));
+  PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table,
+                                                  arrow::default_memory_pool(),
+                                                  outfile, 5));  // NOTE(review): throws on error, unlike the ARROW_* macros above -- confirm intended
+
+  return arrow::Status::OK();
+}
+
+arrow::Status RunMain() {
+  //Generate initial files for each format with a helper function -- don't worry,
+  //we'll also write a table in this example.
+  ARROW_RETURN_NOT_OK(GenInitialFile());
+ 
+  //Reading and writing from files
+
+  //First, we have to set up a ReadableFile object, which just lets us point our
+  //readers to the right data on disk. We'll be reusing this object, and rebinding
+  //it to multiple files throughout the example.
+  std::shared_ptr<arrow::io::ReadableFile> infile;
+  //Get "test_in.ipc" into our file pointer
+  ARROW_ASSIGN_OR_RAISE(infile,
+                          arrow::io::ReadableFile::Open("test_in.ipc",
+                                                        arrow::default_memory_pool()));
+  //Open up the file with the IPC features of the library, gives us a reader object.
+  ARROW_ASSIGN_OR_RAISE(auto ipc_reader,
+                        arrow::ipc::RecordBatchFileReader::Open(infile));
+  //Using the reader, we can read Record Batches. Note that this is specific to IPC;
+  //for other formats, we focus on Tables, but here, RecordBatches are used.
+  std::shared_ptr<arrow::RecordBatch> rbatch;
+  ARROW_ASSIGN_OR_RAISE(rbatch, ipc_reader->ReadRecordBatch(0));
+
+  //Just like with input, we get an object for the output file.
+  std::shared_ptr<arrow::io::FileOutputStream> outfile;
+  //Bind it to "test_out.ipc"
+  ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open("test_out.ipc"));
+  //Set up a writer with the output file -- and the schema! We're defining everything
+  //here, loading to fire.
+  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::ipc::RecordBatchWriter> ipc_writer,
+                        arrow::ipc::MakeFileWriter(outfile,
+                                                   rbatch->schema()));
+  //Write the record batch.
+  ARROW_RETURN_NOT_OK(ipc_writer->WriteRecordBatch(*rbatch));
+  //Specifically for IPC, the writer needs to be explicitly closed.
+  ARROW_RETURN_NOT_OK(ipc_writer->Close());
+
+  //Bind our input file to "test_in.csv"
+  ARROW_ASSIGN_OR_RAISE(infile, arrow::io::ReadableFile::Open("test_in.csv"));
+  std::shared_ptr<arrow::Table> csv_table;
+  //The CSV reader has several objects for various options. For now, we'll use defaults.
+  ARROW_ASSIGN_OR_RAISE(auto csv_reader,
+                        arrow::csv::TableReader::Make(
+                                arrow::io::default_io_context(),
+                                infile, arrow::csv::ReadOptions::Defaults(),
+                                arrow::csv::ParseOptions::Defaults(),
+                                arrow::csv::ConvertOptions::Defaults()));
+  //Read the table.
+  ARROW_ASSIGN_OR_RAISE(csv_table, csv_reader->Read())
+
+  //Bind our output file to "test_out.csv"
+  ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::FileOutputStream::Open("test_out.csv"));
+  //The CSV writer has simpler defaults, review API documentation for more complex usage.
+  ARROW_ASSIGN_OR_RAISE(auto csv_writer, arrow::csv::MakeCSVWriter(
+          outfile, csv_table->schema()));
+  ARROW_RETURN_NOT_OK(csv_writer->WriteTable(*csv_table));
+  //Not necessary, but a safe practice.
+  ARROW_RETURN_NOT_OK(csv_writer->Close());
+
+  //Bind our input file to "test_in.parquet"
+  ARROW_ASSIGN_OR_RAISE(infile, arrow::io::ReadableFile::Open("test_in.parquet"));
+  std::unique_ptr<parquet::arrow::FileReader> reader;
+  //Note that Parquet's OpenFile() takes the reader by reference, rather than returning
+  //a reader.

Review Comment:
   Created https://issues.apache.org/jira/browse/ARROW-17400



##########
cpp/examples/tutorial_examples/dataset_example.cc:
##########
@@ -0,0 +1,169 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <arrow/api.h>
+#include <parquet/arrow/reader.h>
+#include <parquet/arrow/writer.h>
+#include <arrow/dataset/api.h>
+
+#include <iostream>
+
+
+// Generate the 10-row, three-column int64 Table ("a", "b", "c") used by this example.
+arrow::Result<std::shared_ptr<arrow::Table>> CreateTable() {
+    // This code should look familiar from the basic Arrow example and is not the
+    // focus of this one; we just need some data to work on, and this builds it.
+    auto schema =
+            arrow::schema({arrow::field("a", arrow::int64()),
+                           arrow::field("b", arrow::int64()),
+                           arrow::field("c", arrow::int64())});
+    std::shared_ptr<arrow::Array> array_a;
+    std::shared_ptr<arrow::Array> array_b;
+    std::shared_ptr<arrow::Array> array_c;
+    arrow::NumericBuilder<arrow::Int64Type> builder;  // one builder, reused per column
+    ARROW_RETURN_NOT_OK(builder.AppendValues({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}));
+    ARROW_RETURN_NOT_OK(builder.Finish(&array_a));  // column "a": 0..9 ascending
+    builder.Reset();
+    ARROW_RETURN_NOT_OK(builder.AppendValues({9, 8, 7, 6, 5, 4, 3, 2, 1, 0}));
+    ARROW_RETURN_NOT_OK(builder.Finish(&array_b));  // column "b": 9..0 descending
+    builder.Reset();
+    ARROW_RETURN_NOT_OK(builder.AppendValues({1, 2, 1, 2, 1, 2, 1, 2, 1, 2}));
+    ARROW_RETURN_NOT_OK(builder.Finish(&array_c));  // column "c": alternating 1, 2
+    return arrow::Table::Make(schema, {array_a, array_b, array_c});  // same order as schema
+}
+
+// Set up a dataset by writing two Parquet files into root_path + "parquet_dataset".
+arrow::Result<std::string> CreateExampleParquetDataset(
+        const std::shared_ptr<arrow::fs::FileSystem>& filesystem,
+        const std::string& root_path) {
+    // Much like CreateTable(), this is a utility that gets us the dataset we'll be
+    // reading from. Don't worry, we also write a dataset in the example proper.
+    auto base_path = root_path + "parquet_dataset";  // plain concatenation, no separator added
+    ARROW_RETURN_NOT_OK(filesystem->CreateDir(base_path));
+    // Create an Arrow Table
+    ARROW_ASSIGN_OR_RAISE(auto table, CreateTable());
+    // Write it into two Parquet files, splitting the table at row 5
+    ARROW_ASSIGN_OR_RAISE(auto output,
+                          filesystem->OpenOutputStream(base_path + "/data1.parquet"));
+    ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(
+            *table->Slice(0, 5), arrow::default_memory_pool(), output, 2048));
+    ARROW_ASSIGN_OR_RAISE(output,
+                          filesystem->OpenOutputStream(base_path + "/data2.parquet"));
+    ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(
+            *table->Slice(5), arrow::default_memory_pool(), output, 2048));
+    return base_path;  // the directory that now holds both files
+}
+
+arrow::Status RunMain() {
+
+    //Get our environment prepared for reading, by setting up some quick writing.
+    ARROW_ASSIGN_OR_RAISE(auto src_table, CreateTable())
+    std::shared_ptr<arrow::fs::FileSystem> setup_fs;
+    //Note this operates in the directory the executable is built in.
+    char setup_path[256];
+    getcwd(setup_path, 256);
+    ARROW_ASSIGN_OR_RAISE(setup_fs, arrow::fs::FileSystemFromUriOrPath(setup_path));
+    ARROW_ASSIGN_OR_RAISE(auto dset_path, CreateExampleParquetDataset(setup_fs,
+                                                                      ""));
+
+    //First, we need a filesystem object, which lets us interact with our local
+    //filesystem starting at a given path. For the sake of simplicity, that'll be
+    //the current directory.
+    std::shared_ptr<arrow::fs::FileSystem> fs;
+    //This feels pretty bad, but I wasn't finding great solutions that're
+    //system-generic -- could use some advice on how to set this up.
+    char init_path[256];
+    getcwd(init_path, 256);
+    ARROW_ASSIGN_OR_RAISE(fs, arrow::fs::FileSystemFromUriOrPath(init_path));
+
+    //A file selector lets us actually traverse a multi-file dataset.
+    arrow::fs::FileSelector selector;
+    selector.base_dir = "parquet_dataset";
+    //Recursive is a safe bet if you don't know the nesting of your dataset.
+    selector.recursive = true;
+    //Making an options object lets us configure our dataset reading.
+    arrow::dataset::FileSystemFactoryOptions options;
+    // We'll use Hive-style partitioning. We'll let Arrow Datasets infer the partition
+    // schema. We won't set any other options, defaults are fine.
+    options.partitioning = arrow::dataset::HivePartitioning::MakeFactory();
+    auto read_format =
+            std::make_shared<arrow::dataset::ParquetFileFormat>();
+    //Now, we get a factory that will let us get our dataset -- we don't have the
+    //dataset yet!
+    auto factory = arrow::dataset::FileSystemDatasetFactory::Make(fs,

Review Comment:
   It seems like we stopped using the error handling functions around here and started calling `ValueOrDie`? Is that intentional?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org