You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/01/26 12:16:51 UTC

[GitHub] [arrow] alamb commented on a change in pull request #9306: ARROW-10297: [Rust] Parameter for parquet-read to output data in json format

alamb commented on a change in pull request #9306:
URL: https://github.com/apache/arrow/pull/9306#discussion_r564465581



##########
File path: rust/parquet/src/bin/parquet-read.rs
##########
@@ -34,39 +34,98 @@
 //! ```
 //!
 //! # Usage
-//!
 //! ```
-//! parquet-read <file-path> [num-records]
+//!  parquet-read <file-path> [num-records]
 //! ```
-//! where `file-path` is the path to a Parquet file and `num-records` is the optional
-//! numeric option that allows to specify number of records to read from a file.
-//! When not provided, all records are read.
+//!
+//! ## Flags
+//!     -h, --help       Prints help information
+//!     -j, --json       Print parquet file in JSON lines Format
+//!     -V, --version    Prints version information
+//!
+//! ## Args
+//!     <file-path>      Path to a parquet file
+//!     <num-records>    Number of records to read. When not provided, all records are read.
 //!
 //! Note that `parquet-read` reads full file schema, no projection or filtering is
 //! applied.
 
 extern crate parquet;
 
-use std::{env, fs::File, path::Path, process};
+use std::{env, fs::File, path::Path};
+
+use clap::{crate_authors, crate_version, App, Arg};
 
 use parquet::file::reader::{FileReader, SerializedFileReader};
+use parquet::record::Row;
 
 fn main() {
-    let args: Vec<String> = env::args().collect();
-    if args.len() != 2 && args.len() != 3 {
-        println!("Usage: parquet-read <file-path> [num-records]");
-        process::exit(1);
-    }
+    #[cfg(not(feature = "json_output"))]
+    let app = App::new("parquet-read")
+        .version(crate_version!())
+        .author(crate_authors!())
+        .about("Read data from parquet file")
+        .arg(
+            Arg::with_name("file_path")
+                .value_name("file-path")
+                .required(true)
+                .index(1)
+                .help("Path to a parquet file"),
+        )
+        .arg(
+            Arg::with_name("num_records")
+                .value_name("num-records")
+                .index(2)
+                .help(
+                    "Number of records to read. When not provided, all records are read.",
+                ),
+        );
+
+    #[cfg(feature = "json_output")]
+    let app = App::new("parquet-read")
+        .version(crate_version!())
+        .author(crate_authors!())
+        .about("Read data from parquet file")
+        .arg(
+            Arg::with_name("file_path")
+                .value_name("file-path")
+                .required(true)
+                .index(1)
+                .help("Path to a parquet file"),
+        )
+        .arg(
+            Arg::with_name("num_records")
+                .value_name("num-records")
+                .index(2)
+                .help(
+                    "Number of records to read. When not provided, all records are read.",
+                ),
+        )
+        .arg(

Review comment:
       I wonder if it would make sense to try to reduce the redundancy. Maybe something like 
   ```
   let app = App::new("parquet-read")...;
   
   #[cfg(feature = "json_output")]
   let app = app.arg(Arg::with_name("json")...)
   
   ```
   
   

##########
File path: rust/parquet/Cargo.toml
##########
@@ -43,6 +43,8 @@ chrono = "0.4"
 num-bigint = "0.3"
 arrow = { path = "../arrow", version = "4.0.0-SNAPSHOT", optional = true }
 base64 = { version = "0.12", optional = true }
+clap = "2.33.3"

Review comment:
       would it be possible to make the `clap` dependency optional as well? Or maybe we can move the command line tools into their own feature?  The hope for parquet is that it becomes a widely used library at the lower levels of a software stack so I think keeping dependencies minimal is a good goal
   
   @nevi-me  what do you think?

##########
File path: rust/parquet/src/record/api.rs
##########
@@ -635,6 +648,55 @@ impl Field {
             _ => nyi!(descr, value),
         }
     }
+
+    #[cfg(feature = "json_output")]
+    pub fn to_json_value(&self) -> Value {
+        match &self {
+            Field::Null => Value::Null,
+            Field::Bool(b) => Value::Bool(*b),
+            Field::Byte(n) => Value::Number(serde_json::Number::from(*n)),
+            Field::Short(n) => Value::Number(serde_json::Number::from(*n)),
+            Field::Int(n) => Value::Number(serde_json::Number::from(*n)),
+            Field::Long(n) => Value::Number(serde_json::Number::from(*n)),
+            Field::UByte(n) => Value::Number(serde_json::Number::from(*n)),
+            Field::UShort(n) => Value::Number(serde_json::Number::from(*n)),
+            Field::UInt(n) => Value::Number(serde_json::Number::from(*n)),
+            Field::ULong(n) => Value::Number(serde_json::Number::from(*n)),
+            Field::Float(n) => serde_json::Number::from_f64(*n as f64)
+                .map(Value::Number)
+                .unwrap_or(Value::Null),
+            Field::Double(n) => serde_json::Number::from_f64(*n)
+                .map(Value::Number)
+                .unwrap_or(Value::Null),
+            Field::Decimal(n) => Value::String(convert_decimal_to_string(&n)),
+            Field::Str(s) => Value::String(s.to_owned()),
+            Field::Bytes(b) => Value::String(base64::encode(b.data())),
+            Field::Date(d) => Value::String(convert_date_to_string(*d)),
+            Field::TimestampMillis(ts) => {
+                Value::String(convert_timestamp_millis_to_string(*ts))
+            }
+            Field::TimestampMicros(ts) => {
+                Value::String(convert_timestamp_micros_to_string(*ts))
+            }
+            Field::Group(row) => row.to_json_value(),
+            Field::ListInternal(fields) => {
+                Value::Array(fields.elements.iter().map(|f| f.to_json_value()).collect())
+            }
+            Field::MapInternal(map) => Value::Object(
+                map.entries
+                    .iter()
+                    .map(|(key_field, value_field)| {
+                        let key_val = key_field.to_json_value();
+                        let key_str = key_val
+                            .as_str()
+                            .map(|s| s.to_owned())
+                            .unwrap_or_else(|| key_val.to_string());
+                        (key_str, value_field.to_json_value())
+                    })
+                    .collect(),
+            ),
+        }
+    }
 }

Review comment:
       I suggest adding some tests to show this code working as well as avoid regressions in the future (aka avoid someone breaking this code accidentally in the future but not realizing it)




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org